# This file is part of COFFEE
#
# COFFEE is Copyright (c) 2014, Imperial College London.
# Please see the AUTHORS file in the main source directory for
# a full list of copyright holders.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * The name of Imperial College London or that of other
#       contributors may not be used to endorse or promote products
#       derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTERS
# ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.

"""COFFEE's autotuning system."""

from base import *
from vectorizer import vect_roundup

import subprocess
import os
import tempfile


class Autotuner(object):

    # Skeleton of the C program emitted by tune(). It is filled in with
    # Python's dict-based "%" formatting, which is why literal percent signs
    # are written "%%" and backslashes are doubled inside the text.
    #
    # Placeholders: vect_header, vect_align, blas_header, blas_namespace,
    # resolution, rows, cols, globals, variants, externc_open, externc_close,
    # nvariants, call_variants, filename, debug_code.
    #
    # The generated main() appends a report to the file named by "filename"
    # and returns the index of the variant with the largest counter (the
    # counters are written by the code produced from _run_template).
    _code_template = """
// This file was automatically generated by COFFEE for kernels autotuning.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Timing
#include <stdint.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>

// Firedrake headers
#include "firedrake_geometry.h"

%(vect_header)s
#define VECTOR_ALIGN %(vect_align)d
%(blas_header)s
%(blas_namespace)s

#define RESOLUTION %(resolution)d
#define TOLERANCE 0.000000001

#define PRINT_ARRAY(ARR, SZ)  do { \\
  printf("ARR: "); \\
  for (int k = 0; k < SZ; ++k) \\
    printf("%%e ", ARR[k]); \\
  printf("\\n"); \\
  } while (0);

static inline long stamp()
{
  struct timespec tv;
  clock_gettime(CLOCK_MONOTONIC, &tv);
  return tv.tv_sec * 1000 * 1000 * 1000 + tv.tv_nsec;
}

#ifdef DEBUG
static int compare_1d(double A1[%(cols)s], double A2[%(cols)s], FILE* out)
{
  for(int i = 0; i < %(cols)s; i++)
  {
    if(fabs(A1[i] - A2[i]) > TOLERANCE)
    {
      fprintf(out, "i=%%d, A1[i]=%%e, A2[i]=%%e\\n", i, A1[i], A2[i]);
      return 1;
    }
  }
  return 0;
}

static int compare_2d(double A1[%(rows)s][%(cols)s], double A2[%(rows)s][%(cols)s], FILE* out)
{
  for(int i = 0; i < %(rows)s; i++)
  {
    for(int j = 0; j < %(cols)s; j++)
    {
      if(fabs(A1[i][j] - A2[i][j]) > TOLERANCE)
      {
        fprintf(out, "i=%%d, j=%%d, A1[i][j]=%%e, A2[i][j]=%%e\\n", i, j, A1[i][j], A2[i][j]);
        return 1;
      }
    }
  }
  return 0;
}
#endif

%(globals)s

%(variants)s

%(externc_open)s
int main()
{
  int i = 0, c = 0;
  int counters[%(nvariants)d] = {0};
  char* all_opts[%(nvariants)d];

  /* Call kernel variants */
  %(call_variants)s

  /* Find the fastest variant */
  int best = 0;
  for(int j = 0; j < %(nvariants)d; j++)
  {
    if(counters[j] > counters[best])
    {
      best = j;
    }
  }

  /* Output all variants */
  FILE* out = fopen("%(filename)s", "a");
  fprintf(out, "COFFEE Autotuner: cost of variants:\\n");
  for (int j = 0; j < %(nvariants)d; j++)
  {
    fprintf(out, "  Variant %%d: %%d\\n", j, counters[j]);
  }

  /* Output base, licm1, and fastest variants */
  /*
  fprintf(out, "Summary:\\n");
  fprintf(out, "Base variant: %%d \\n", counters[0]);
  fprintf(out, "Licm1 variant: %%d \\n", counters[1]);
  */

  fprintf(out, "Fastest variant ID=%%d: %%d \\n", best, counters[best]);
  fprintf(out, "***Chosen optimizations set: %%s***\\n", all_opts[best]);

#ifdef DEBUG
  %(debug_code)s
#endif

  fclose(out);
  return best;
}
%(externc_close)s
"""
    # C loop that initializes the coefficient arrays of one code variant.
    # "ndofs" is the loop bound and "init_coeffs" is the loop body, both
    # supplied by tune().
    _coeffs_template = """
    // Initialize coefficients
    for (int j = 0; j < %(ndofs)d; j++)
    {
%(init_coeffs)s
    }
"""
    # C snippet benchmarking a single code variant; tune() instantiates it
    # once per variant and concatenates the results into main().
    #
    # Placeholders: iter (variant index), used_opts (text stored in all_opts),
    # decl_params (declarations of the kernel arguments), ncoords (number of
    # coordinate entries to initialize), init_coeffs (coefficients
    # initialization code, possibly empty), call_variant (the kernel call).
    #
    # Without DEBUG the kernel is invoked repeatedly until stamp() reaches
    # start + RESOLUTION; with DEBUG it is invoked exactly once. The number
    # of completed iterations is stored in counters[] and c is reset.
    _run_template = """
  // Code variant %(iter)d call
  srand (1);
  all_opts[%(iter)d] = "%(used_opts)s";
  long start%(iter)d, end%(iter)d;
  %(decl_params)s
  start%(iter)d = stamp();
  end%(iter)d = start%(iter)d + RESOLUTION;
#ifndef DEBUG
  #pragma forceinline
  while (stamp() < end%(iter)d)
#else
  while (c < 1)
#endif
  {
    // Initialize coordinates
    for (int j = 0; j < %(ncoords)d; j++)
    {
#ifndef DEBUG
      vertex_coordinates_%(iter)d[j][0] = (double)rand();
#else
      vertex_coordinates_%(iter)d[j][0] = (double)(rand()%%10);
#endif
    }
    %(init_coeffs)s
    #pragma noinline
    %(call_variant)s
    c++;
  }
  counters[i++] = c;
  c = 0;
"""
    # C snippet (emitted only inside "#ifdef DEBUG") that compares the output
    # of variant "iter" against that of variant 0 through the comparison
    # function named by "call_debug", logging a warning when they differ.
    # NOTE(review): the array names A_0 / A_<iter> are hard-coded here, while
    # tune() derives the declared names from the kernel's first argument;
    # this presumably assumes that argument is always called "A" -- confirm.
    _debug_template = """
  if(%(call_debug)s(A_0, A_%(iter)s, out))
  {
    fprintf(out, "COFFEE Warning: code variants 0 and %(iter)s differ\\n");
  }
"""
    # Base name (without extension) of every file the autotuner generates:
    # the source, the executable, and the .log/.err/.out files.
    _filename = "autotuning_code"
    # Maps the name of a jacobian-computing function to the number of entries
    # that tune() declares for the coordinates array when that name appears
    # in the kernel's source (see _retrieve_coords_size).
    _coord_size = {
        'compute_jacobian_interval_1d': 2,
        'compute_jacobian_interval_2d': 4,
        'compute_jacobian_interval_3d': 6,
        'compute_jacobian_quad_2d': 8,
        'compute_jacobian_quad_3d': 12,
        'compute_jacobian_triangle_2d': 6,
        'compute_jacobian_triangle_3d': 9,
        'compute_jacobian_tetrahedron_3d': 12,
        'compute_jacobian_prism_3d': 18,
        'compute_jacobian_interval_int_1d': 4,
        'compute_jacobian_interval_int_2d': 8,
        'compute_jacobian_quad_int_2d': 16,
        'compute_jacobian_quad_int_3d': 24,
        'compute_jacobian_interval_int_3d': 12,
        'compute_jacobian_triangle_int_2d': 12,
        'compute_jacobian_triangle_int_3d': 18,
        'compute_jacobian_tetrahedron_int_3d': 24,
        'compute_jacobian_prism_int_3d': 36
    }

    # NOTE: the string below describes the class, but since it is not the
    # first statement of the class body it is a plain expression statement
    # and does not become Autotuner.__doc__.
    """Create and execute a C file in which multiple variants of the same kernel
    are executed to determine the fastest implementation."""

    def __init__(self, variants, include, compiler, isa, blas):
        """Initialize the autotuner.

        Besides storing the arguments, this creates (if needed) a per-user
        directory ``coffee-dump-uid<uid>`` under the system temporary
        directory, and inside it a fresh directory whose name ends with
        ``_tune_<kernel name>``; the latter is stored in ``self.coffee_dir``.

        :arg variants:     list of (ast, used_optimizations) for autotuning
        :arg include:      list of directories to be searched for header files
        :arg compiler:     backend compiler info
        :arg isa:          instruction set architecture info
        :arg blas:         COFFEE's dense linear algebra library info
        :raises OSError:   if the dump directories cannot be created
        """

        self.variants = variants
        self.include = include
        self.compiler = compiler
        self.isa = isa
        self.blas = blas

        # Set the directory in which COFFEE will dump any relevant information
        coffee_dir = os.path.join(tempfile.gettempdir(), "coffee-dump-uid%s" % os.getuid())
        try:
            os.makedirs(coffee_dir)
        except OSError:
            # The directory may already exist, possibly because a process
            # running in parallel has just created it. Anything else (e.g. a
            # permission problem) is a real error and must not be hidden.
            if not os.path.isdir(coffee_dir):
                raise

        # Set the directory where the autotuner will dump its output. The
        # parent is passed explicitly rather than by temporarily overwriting
        # the global tempfile.tempdir, which would discard any value set by
        # the user and is unsafe if mkdtemp raises.
        kernel_name = variants[0][0].children[1].name
        self.coffee_dir = tempfile.mkdtemp(suffix="_tune_%s" % kernel_name,
                                           dir=coffee_dir)

    def _retrieve_coords_size(self, kernel):
        """Look up the size of the coordinates array for a kernel.

        :arg kernel: the kernel's source code, as a string
        :returns: the value in ``Autotuner._coord_size`` of the first key
                  that occurs as a substring of ``kernel``
        :raises RuntimeError: if no known jacobian function name occurs
        """
        candidates = (size for jacobian, size in Autotuner._coord_size.items()
                      if jacobian in kernel)
        found = next(candidates, None)
        if found is None:
            raise RuntimeError("COFFEE: Autotuner does not know how to expand the jacobian")
        return found

    def _retrieve_coeff_size(self, root, coeffs):
        """Compute the size of each coefficient found under ``root``.

        :arg root:   the AST node from which the search starts
        :arg coeffs: iterable of coefficient names (strings)
        :returns: a dict mapping each name to ``vect_roundup(size)``, where
                  ``size`` is the extent of the loop indexing the first
                  dimension of the coefficient, or 1 if the coefficient is
                  indexed by ``'0'`` or is not found at all (constants case).
                  According to the original docstring, ``vect_roundup``
                  rounds up to a multiple of the vector length.
        """

        def lookup(node, name, extents):
            # The content of a FlatBlock is not inspected
            if isinstance(node, FlatBlock):
                return 0
            if isinstance(node, Symbol):
                if node.symbol != name:
                    return 0
                index = node.rank[0]
                return 1 if index == '0' else extents[index]
            if isinstance(node, For):
                # Record the extent of this loop for the symbols nested in it
                extents[node.dim] = node.size
            for child in node.children:
                found = lookup(child, name, extents)
                if found:
                    return found
            return None

        padded = {}
        for name in coeffs:
            raw = lookup(root, name, {})
            padded[name] = vect_roundup(raw or 1)
        return padded

    def _run(self, src):
        """Compile and run the generated test cases.

        The source is written to ``self.coffee_dir``, compiled with the
        command in ``self.compiler["cmd"]`` and then executed. The standard
        output and error of both steps are appended to the ``.log`` and
        ``.err`` files in the same directory.

        :arg src: the source code of the autotuning program, as a string
        :returns: the exit status of the autotuning executable (the code
                  template makes ``main`` return the index of the fastest
                  variant)
        :raises RuntimeError: if the program cannot be compiled or started
        """

        # If requested, run the autotuner in debug mode: eventually, a log file
        # is outputed reporting the result of the numerical comparison of the
        # element matrices as evaluated by the various code variants
        debug_mode = ["-DDEBUG"] if os.environ.get('COFFEE_DEBUG') else []

        fext = "c"
        cppargs = ["-std=gnu99", "-O3", self.compiler['native_opt']] + debug_mode + \
                  ["-I%s" % d for d in self.include]
        ldargs = ["-lrt", "-lm"]
        if self.compiler:
            cppargs += [self.compiler[self.isa['inst_set']]]
            cppargs += [self.compiler['ipo']]
        if self.blas:
            blas_dir = self.blas['dir']
            if blas_dir:
                cppargs += ["-I%s/include" % blas_dir]
                ldargs += ["-L%s/lib" % blas_dir]
            ldargs += self.blas['link']
            if self.blas['name'] == 'eigen':
                fext = "cpp"

        # Dump autotuning source out to a file. open() is used instead of the
        # file() builtin, which only exists in Python 2.
        filename = os.path.join(self.coffee_dir, "%s.%s" % (Autotuner._filename, fext))
        with open(filename, 'w') as f:
            f.write(src)
        objname = os.path.join(self.coffee_dir, Autotuner._filename)
        logfile = os.path.join(self.coffee_dir, "%s.log" % Autotuner._filename)
        errfile = os.path.join(self.coffee_dir, "%s.err" % Autotuner._filename)
        cc = [self.compiler["cmd"], filename] + cppargs + ['-o', objname] + ldargs
        with open(logfile, "a") as log:
            with open(errfile, "a") as err:
                log.write("Compilation command:\n")
                log.write(" ".join(cc))
                log.write("\n\n")
                # Make the header visible before the child processes append
                # their own output to the same file
                log.flush()
                # Compile the source code. Only Exception is caught, so that
                # KeyboardInterrupt and SystemExit propagate unchanged.
                try:
                    subprocess.check_call(cc, stderr=err, stdout=log)
                except Exception:
                    raise RuntimeError("""Unable to compile autotuner file
See %s for more info about the error""" % errfile)
                # Execute the autotuner
                try:
                    return subprocess.call([objname], stderr=err, stdout=log)
                except Exception:
                    raise RuntimeError("""Unable to run the autotuner
See %s for more info about the error""" % logfile)

    def tune(self, resolution):
        """Generate the autotuning program for all code variants and run it.

        :arg resolution: the time budget given to each code variant. The
                         value is emitted as the ``RESOLUTION`` macro, which
                         the generated code adds to the timestamp returned by
                         ``stamp()``; the code template computes that
                         timestamp in nanoseconds.
        :returns: the value returned by :meth:`_run`, that is the exit status
                  of the autotuning executable. The code template makes
                  ``main`` return the index of the variant that completed
                  the highest number of iterations.

        Note that the ASTs in ``self.variants`` are modified in place: kernel
        names get a ``_<index>`` suffix, the signatures are rewritten, and
        the ``static const`` declarations are removed from the bodies.
        """

        def is_global(stmt):
            # Both qualifiers must be present. (The expression
            # "'static' and 'const' in qual" only tests for 'const'.)
            return isinstance(stmt, Decl) and \
                'static' in stmt.qual and 'const' in stmt.qual

        # First, determine sizes of parameters in the non-transformed variant
        non_transf_ast = self.variants[0][0]
        fun_decl = non_transf_ast.children[1]
        # Local tensor size
        tensor_rank = fun_decl.args[0].sym.rank
        lt_rows, lt_cols = tensor_rank[0], tensor_rank[-1]
        # Coordinates size
        coords_size = self._retrieve_coords_size(str(non_transf_ast))
        # Coefficients size
        coeffs_syms = [f.sym.symbol.replace('*', '') for f in fun_decl.args[2:]]
        coeffs_size = self._retrieve_coeff_size(fun_decl, coeffs_syms)

        # Create the individual test cases
        call_variants, debug_code, global_decls = ([], [], [])
        for i, variant in enumerate(self.variants):
            ast, used_opts = variant

            # Create fictitious kernel parameters
            # Here, we follow the "standard" convention:
            # - The first parameter is the local tensor (lt)
            # - The second parameter is the coordinates field (coords)
            # - (Optional) any additional parameter is a generic field,
            #   whose size is bound to the number of dofs in the kernel
            fun_decl = ast.children[1]
            # Tolerate kernels that are not declared inline
            if 'inline' in fun_decl.pred:
                fun_decl.pred.remove('inline')

            lt_arg = fun_decl.args[0].sym
            lt_sym = lt_arg.symbol + "_%d" % i
            lt_init = "{" * len(lt_arg.rank) + "0.0" + "}" * len(lt_arg.rank)
            lt_align = self.compiler['align']("VECTOR_ALIGN")
            if lt_arg.rank[-1] % self.isa["dp_reg"]:
                lt_align = ""
            lt_decl = "double " + lt_sym + "".join(["[%d]" % r for r in lt_arg.rank]) + \
                lt_align + " = " + lt_init

            # Coordinates
            coords_sym = fun_decl.args[1].sym.symbol.replace('*', '')
            coords_decl = "double " + coords_sym + "_%d[%d][1]" % (i, coords_size)

            # Coefficients
            coeffs_syms = [f.sym.symbol.replace('*', '') for f in fun_decl.args[2:]]
            coeffs_types = [f.typ.replace('*', '') for f in fun_decl.args[2:]]
            coeffs_decl = ["%s " % t + f + "_%d[%d][1]" % (i, coeffs_size[f]) for t, f
                           in zip(coeffs_types, coeffs_syms)]

            # Adjust kernel's signature
            fun_decl.args[1].typ = "double"
            fun_decl.args[1].sym = Symbol(coords_sym, ("%d" % coords_size, 1))
            for d, f in zip(fun_decl.args[2:], coeffs_syms):
                d.typ = "double"
                d.sym = Symbol(f, ("%d" % coeffs_size[f], 1))

            # Adjust symbols names for kernel invocation
            coords_sym += "_%d" % i
            coeffs_syms = [f + "_%d" % i for f in coeffs_syms]
            # Adjust kernel name
            fun_decl.name = fun_decl.name + "_%d" % i

            # Remove any static const declaration from the kernel (they are declared
            # just once at the beginning of the file, to reduce code size).
            # Only the declarations of the last variant end up in the output,
            # since global_decls is overwritten at each iteration.
            global_decls = "\n".join([str(s) for s in fun_decl.body if is_global(s)])
            fun_decl.body = [s for s in fun_decl.body if not is_global(s)]

            # Initialize coefficients (if any)
            init_coeffs = ""
            if coeffs_syms:
                wrap_coeffs = "#ifndef DEBUG\n      %s\n#else\n      %s\n#endif"
                real_coeffs = ";\n      ".join([f + "[j][0] = (double)rand();"
                                                for f in coeffs_syms])
                debug_coeffs = ";\n      ".join([f + "[j][0] = (double)(rand()%10);"
                                                 for f in coeffs_syms])
                init_coeffs = Autotuner._coeffs_template % {
                    'ndofs': min(coeffs_size.values()),
                    'init_coeffs': wrap_coeffs % (real_coeffs, debug_coeffs)
                }

            # Instantiate code variant
            params = ", ".join([lt_sym, coords_sym] + coeffs_syms)
            call_variants.append(Autotuner._run_template % {
                'iter': i,
                'used_opts': str(used_opts),
                'decl_params': ";\n  ".join([lt_decl, coords_decl] + coeffs_decl) + ";",
                'ncoords': coords_size,
                'init_coeffs': init_coeffs,
                'call_variant': fun_decl.name + "(%s);" % params
            })

            # Create debug code, apart from the BLAS case. The comparison
            # routine must match the rank of the local tensor.
            if not used_opts.get('blas'):
                debug_code.append(Autotuner._debug_template % {
                    'iter': i,
                    'call_debug': "compare_1d" if len(lt_arg.rank) == 1 else "compare_2d"
                })

        # Instantiate the autotuner skeleton. A list comprehension is used to
        # pick the ASTs since the result of zip() is not subscriptable in
        # Python 3.
        kernels_code = "\n".join(["/* Code variant %d */" % i + str(k.children[1])
                                  for i, k in enumerate([v[0] for v in self.variants])])
        # Missing BLAS information must expand to nothing, not to "None"
        blas = self.blas or {}
        use_externc = blas.get('name') == 'eigen'
        code_template = Autotuner._code_template % {
            'filename': os.path.join(self.coffee_dir, "%s.out" % Autotuner._filename),
            'rows': lt_rows,
            'cols': lt_cols,
            'vect_header': self.compiler['vect_header'],
            'vect_align': self.isa['alignment'],
            'blas_header': blas.get('header') or "",
            'blas_namespace': blas.get('namespace') or "",
            'resolution': resolution,
            'globals': global_decls,
            'variants': kernels_code,
            'nvariants': len(self.variants),
            'call_variants': "".join(call_variants),
            'externc_open': 'extern "C" {' if use_externc else "",
            'externc_close': "}" if use_externc else "",
            'debug_code': "".join(debug_code)
        }

        # Clean code from spurious pragmas
        code_template = '\n'.join(l for l in code_template.split("\n")
                                  if not l.strip().startswith('#pragma coffee'))

        return self._run(code_template)
