/* RKL - RPN kernel Library for C and FORTRAN programming
 * Copyright (C) 2019  Recherche en Prevision Numerique
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation,
 * version 2.1 of the License.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

//****P* librkl/RPN kernel library interpolators
// DESCRIPTION
//
// Set of 3D interpolators,
// C and Fortran callable (C name and Fortran name are the same)
//
// interpolate the value at point (px, py, pz) fron a 3 Dimensional real(float) array
// Fortran : real, dimension(dimi,dimj,dimk) :: array
// C       : float array[dimk][dimj][dimi]
// (Fortran storage order is expected)
//
// the interpolation routines expect to receive the address of point
// array(i[+offseti],1[+offsetj],1)
// and will interpolate to get the value at point (px,py,pz)
// px, py, pz are expressed in "index" space, i.e.
// array(1.5,  1,  1) is halfway between array(1,1,1) and array(2,1,1)
// array(1  ,1.5,  1) is halfway between array(1,1,1) and array(1,2,1)
// array(1  ,1  ,2.5) is halfway between array(1,1,2) and array(1,1,3)
// array(2.5,3.5,4.5) is halfway between array(2,3,4) and array(3,4,5)
//
// constant spacing between array points is assumed for the first 2 dimensions.
//
// along the third dimension, non constant spacing is assumed, the positions along
// said third dimension are supplied via an opaque descriptor created with a call
// to the Vsearch_setup family of routines (returning a C pointer to said descriptor)
//
// 4 point lagrange polynomials are used for cubic interpolations. interpolation is always
// cubic along the first 2 dimensions, while along the third dimension linear interpolation
// is used in the first 2 (pz <= 2)  and the last 2 (pz >= nk-1) intervals, and cubic
// interpolation is used in the other intervals (2.0 < pz < nk - 1)
//
// there is a also monotonic interpolation option.
// in that case, 4 results are returned
// - normal interpolation result
// - tri-linear interpolation result
// - min and max values for the inner 2x2x2 set of values used for tri-linear interpolation
//
//     Name                        Purpose
//
// Vsearch_setup             : simple setup
// Vsearch_setup_plus        : Vsearch_setup + x and y direction offsets
// Tricublin_zyx1_n          : simple interpolator, source is a single variable in a 3D array
// Tricublin_zyx1_n_m        : source is a group of m  3D arrays
// Tricublin_zyx3_n          : source is a 3 variable ( dimension(3, ....) ) 3D array
// Tricublin_mono_zyx_n      : monotonic version of Tricublin_zyx1_n
// Tricublin_mono_zyx_n_m    : monotonic version of Tricublin_zyx1_n_m
//
// Tricublin_zyx1_n_m, Tricublin_zyx3_n, Tricublin_mono_zyx_n_m save computing cycles by
// reusing the computed interpolation coefficients for multiple arrays. The interpolation
// coefficients are dependent upon px, py, pz.
//
// the horizontal (first 2 dimensions) interpolation is bicubic.
// (an extra linear interpolation is performed in the monotonic case)
//
// the vertical (3rd dimension) interpolation is normally cubic, except when the target
// lies between the first 2 or the last 2 levels, in which case it is linear.
//
// the monotonic interpolators provides 4 outputs
// - the result of the centered tri-cubic or bi-cubic+linear interpolation (4x4x4 or 4x4x2)
// - the result of a  centered tri-linear interpolation (2x2x2)
// - the minimum of the 2x2x2 sub-cube used for tri-linear interpolation
// - the maximum of the 2x2x2 sub-cube used for tri-linear interpolation
//
// the interpolation routines assume that the address of the source array is the point
// px = 1.0, py = 1.0, pz = 1.0 which is why f(1,1,1) is normally passed to these routines.
// if this is not the case, offseti and offsetj are used to compensate
//
// EXAMPLES
//
//  use ISO_C_BINDING
//  include "tricublin_f90.inc"
//  type(C_PTR) :: lv, mv                            ! C pointer opaque descriptor object
//  real(kind=REAL64), dimension(NK) :: levels                  ! list of positions for the third dimension
//  real, dimension(-2:NI+4,-1:NJ+3,NK), target   :: f, f1, f2, f3   ! single valued source arrays
//  real, dimension(3,-2:NI+4,-1:NJ+3,NK), target :: f3              ! triple valued source array
//  type(C_PTR), dimension(3) :: f123
//  real, dimension(NPOINTS) :: d, l, mi, ma         ! normal, linear, min, max
//  real, dimension(3,NPOINTS) :: d3, l3, mi3, ma3   ! normal, linear, min, max
//  real, dimension(3,NPOINTS) :: pxpypz             ! (px,py,pz) triplets
//  ......                                           ! levels(1:NK) = ....
//  lv = vsearch_setup(levels, NK, NI+7, NJ+5)       ! NI+7, NJ+5 to account for halos
//  mv = vsearch_setup_plus(levels, NK, NI+7, NJ+5, 3, 2) ! passing f instead of f(1,1,1)
//  ......                                           ! pxpypz(1:3,1:NPOINTS) = ...., [1.0,1.0,1.0] points to  to f(1,1,1)
//  call tricublin_zyx1_n(d ,f(1,1,1)   ,pxpypz,lv,NPOINTS)      ! single interpolation
//  call tricublin_zyx1_n(d ,f          ,pxpypz,mv,NPOINTS)      ! with offset compensation (halos)
//  call tricublin_zyx3_n(d3,f3(1,1,1,1),pxpypz,lv,NPOINTS)      ! triple interpolation
//  f123(1) = C_LOC(f1(1,1,1))
//  f123(2) = C_LOC(f2(1,1,1))
//  f123(3) = C_LOC(f3(1,1,1))
//  call tricublin_zyx1_n_m(d3,f123,pxpypz,lv,NPOINTS,3)         ! 3 single interpolations
//  call tricublin_mono_zyx_n(d,l,mi,ma,f(1,1,1),pxpypz,lv,NPOINTS)      ! monotonic interpolation, 1 array
//  call tricublin_mono_zyx_n_m(d3,l3,mi3,ma3,f123,pxpypz,lv,NPOINTS,3)  ! monotonic interpolation, 3 arrays
//****

#define MAX(a,b) ((a) > (b)) ? (a) : (b)
#define MIN(a,b) ((a) < (b)) ? (a) : (b)

#if defined(__AVX2__) && defined(__x86_64__)
#include <immintrin.h>
#endif

#include <stdint.h>
#include <stdlib.h>

#if defined(TIMING)
#include <sys/time.h>
//uint64_t Nanocycles(void) {
// uint64_t rdtscp_(void) {   // version "in order" avec "serialization"
//   uint32_t lo, hi;
//   __asm__ volatile ("rdtscp"
//       : /* outputs */ "=a" (lo), "=d" (hi)
//       : /* no inputs */
//       : /* clobbers */ "%rcx");
// //   __asm__ volatile ("mfence");
//   return (uint64_t)lo | (((uint64_t)hi) << 32);
//  struct timeval t;
//  uint64_t i;
//  int status;
//  status = gettimeofday(&t,NULL);
//  i = t.tv_sec;
//  i = i * 1000000;
//  i = i + t.tv_usec;
//  i = i * 1000;      // nanoseconds
//  return i;
//}*/
#endif

#if defined(__AVX2__) && defined(__x86_64__)
static double cp133 =  1.0/3.0;
#endif
static double cp167 =  1.0/6.0;
static double cm167 = -1.0/6.0;
static double cm5 = -0.5;
static double cp5 = 0.5;
static double one = 1.0;
static double two = 2.0;

typedef struct{
  float px;    // position along x in index space
  float py;    // position along y in index space
  float pz;    // position along z in index space
//   float z;     // absolute position along z
} pxpypz;

typedef struct{
  double *z;       // table for levels (nk doubles)
  double *ocz;     // pointer to inverse of coefficient denominators [4*nk doubles]
  uint32_t ni;     // distance between rows
  uint32_t nj;     // number of rows
  uint32_t nk;     // number of levels (max 255)
  uint32_t nij;    // ni*nj, distance between levels
  uint32_t offi;   // offset to add to i position (e.g. global to local grid remapping)
  uint32_t offj;   // offset to add to j position (e.g. global to local grid remapping)
}ztab;

/*static void cubic_coeff_denom(double *t, double *f, int n){
  int i, j;

  t += 4;    // will store no coefficients for first interval (no point below)
  j = 0;

  double a, b, c, d, ab, ac, ad, bc, bd, cd;
    for( ; j<n-3 ; j++){     // will store no coefficients for the last 2 intervals
      a = f[0];
      b = f[1];
      c = f[2];
      d = f[3];
      ab = a - b;
      ac = a - c;
      ad = a - d;
      bc = b - c;
      bd = b - d;
      cd = c - d;
      t[0]  =  1.0/(ab*ac*ad);
      t[1]  = -1.0/(ab*bc*bd);
      t[2]  =  1.0/(ac*bc*cd);
      t[3]  = -1.0/(ad*bd*cd);
      t += 4;
      f++;
    }
}*/

// triple product used for Lagrange cubic polynomials coefficients in the non constant case
#define TRIPRD(x,a,b,c) ((x-a)*(x-b)*(x-c))

// inverse denominators r from positions a b c d
static inline void denominators(double *r, double a, double b, double c, double d){
  r[0] = 1.0 / TRIPRD(a,b,c,d);
  r[1] = 1.0 / TRIPRD(b,a,c,d);
  r[2] = 1.0 / TRIPRD(c,a,b,d);
  r[3] = 1.0 / TRIPRD(d,a,b,c);
//  int Mohammad;
//  scanf("Mohammad %d", Mohammad);
}

// allocate lookup table set and
// return pointer to filled table set
// targets are expected to be positive, and monotonically increasing
//****f* librkl/Vsearch_setup
// Synopsis
//
// ARGUMENTS
ztab *Vsearch_setup_sweep(double *targets, int nk, int ni, int nj)
//****
{
  int i;
  ztab *lv;
  double pad;
  int nkl = MAX(nk,4);

  lv = malloc(sizeof(ztab));
  if(lv == NULL) return NULL ;

  lv->z = malloc(nk * sizeof(double));
  if(NULL == lv->z){      // malloc failed
    free(lv);             // deallocate lv
    return NULL;
  }
  for(i=0 ; i<nk  ; i++) { lv->z[i] = targets[i] ; }   // z coordinate table

// for(i=0 ; i<nk  ; i++) { printf("%12.7f ",lv->z[i]) ; } ; printf("\n");
  // denominators for Lagrange cubic polynomials coefficients ( entries 1 to n-2 make sense )
  // entries 0 and n-1 are fudged
  lv->ocz = (double *) malloc(4 * nkl * sizeof(double));
  if(NULL == lv->ocz){    // malloc failed
    free(lv->z);          // deallocate z
    free(lv);             // deallocate lv
    return NULL;
  }
  if(nk >= 4){             // protect against nk < 4
    denominators( &(lv->ocz[0]) , targets[0], targets[1], targets[2], targets[3]);                     // level 0 coeffs are normally not used
    for(i=1 ; i<nk - 2   ; i++) {
      denominators( &(lv->ocz[4*i]) , lv->z[i-1], lv->z[i  ], lv->z[i+1], lv->z[i+2]);
    }
    denominators( &(lv->ocz[4*(nk-2)]) , targets[nk-4], targets[nk-3], targets[nk-2], targets[nk-1]);  // level nk-2 coeffs are normally not used
    denominators( &(lv->ocz[4*(nk-1)]) , targets[nk-4], targets[nk-3], targets[nk-2], targets[nk-1]);  // level nk-1 coeffs are normally not used
  }else{
    for(i=0 ; i<nkl*4 ; i++) lv->ocz[i] = 1.0;            // not used, set to 1.0 to avoid Floating Point errors later on
  }

  lv->ni = ni;           // nb of points along x
  lv->nj = nj;           // nb of points along y
  lv->nk = nk;           // nb of points along z
  lv->nij = ni*nj;       // ni * nj
  lv->offi = 0;
  lv->offj = 0;
  return lv;             // return pointer to filled table
}
//****f* librkl/Vsearch_setup_plus
// Synopsis
//
// ARGUMENTS
ztab *Vsearch_setup_plus_sweep(double *targets, int nk, int ni, int nj, int offseti, int offsetj)
//****
{
  ztab *lv;
  lv = Vsearch_setup_sweep(targets, nk, ni, nj);
  if(lv != NULL) {
    lv->offi = offseti;
    lv->offj = offsetj;
  }
  return lv;             // return pointer to filled table
}

static inline int Vcoef_pxyz4_inline(double *cxyz, int *offset, float px8, float py8, float pz8, ztab *lv, int Step_kount){
  int ix, iy, iz, ijk, zlinear;
  double pxy[2], *base, *pos;
  double zza, zzb, zzc, zzd, zzab, zzcd, dz, px, py, pz;

  int i, j;

    px = px8 ;            // fractional index positions along x, y, z (float to double)
    py = py8 ;
    pz = pz8 ;
// printf("px py pz = %f %f %f\n",px,py,pz);
    ix = px ;
    px = px - ix;                   // px is now deltax (fractional part of px)
    ix = ix + lv->offi;             // global to local grid remapping
    ijk = ix - 2;                   // x displacement (elements), ix assumed to always be >1 and < ni-1
// printf("ix = %d, ijk = %d\n",ix,ijk);
    cxyz[12] = 0.0;
    cxyz[13] = 1.0 - px;             // linear interpolation coefficients along x
    cxyz[14] = px;
    cxyz[15] = 0.0;

    iy = py ;
    py = py - iy;                   // py is now deltay (fractional part of py)
    iy = iy + lv->offj;             // global to local grid remapping
    ijk = ijk + (iy - 2) * lv->ni;  // add y displacement (rows), ix assumed to always be >1 and < nj-1
// printf("iy = %d, ijk = %d\n",iy,ijk);
    cxyz[16] = 1.0 - py;             // linear interpolation coefficients along y
    cxyz[17] = py;

    iz = pz ;
    if(iz<1) iz = 1;
    if(iz>lv->nk-1) iz = lv->nk-1;  // iz < 1 or iz > nk-1 will result in linear extrapolation
    dz = pz - iz;                   // dz is now "fractional" part of pz  (may be <0 or >1 if extrapolating)
    ijk = ijk + (iz -1) * lv->nij;  // add z displacement (2D planes)
    cxyz[18] = 1.0 - dz;             // linear interpolation coefficients along z
    cxyz[19] = dz;

    iz--;                           // iz needs to be in "origin 0" (C index from Fortran index)
    zlinear = (iz - 1) | (lv->nk - 3 - iz);
    zlinear >>= 31;                  // nonzero only if iz < 1 or iz > nk -3 (top and bottom intervals)
    if(! zlinear) ijk = ijk - lv->nij;  // not the linear case, go down one 2D plane to get lower left corner of 4x4x4 cube
    *offset = ijk;
// printf("iz = %d, ijk = %d\n",iz,ijk);
    // now we can compute the coefficients along z using iz and dz
    
    
    /*if(zlinear){
      cxyz[ 8] = 0.0;                    // coefficients for linear interpolation along z
      cxyz[ 9] = 1.0 - dz;
      cxyz[10] = dz;
      cxyz[11] = 0.0;
    }else{
      base  = &(lv->ocz[4*iz]);  // precomputed inverses of denominators
      pos   = &(lv->z[iz]);
      pz  = dz * pos[1] + (1.0 - dz) * pos[0];   // pz is now an absolute position
      zza = pz - pos[-1] ; zzb = pz - pos[0] ; zzc = pz - pos[1] ; zzd = pz - pos[2] ;
      zzab = zza * zzb ; zzcd = zzc * zzd;

      cxyz[ 8] = zzb * zzcd * base[0];   //   cxyz[16] = TRIPRD(pz,pos[1],pos[2],pos[3]) * base[0];
      cxyz[ 9] = zza * zzcd * base[1];   //   cxyz[17] = TRIPRD(pz,pos[0],pos[2],pos[3]) * base[1];
      cxyz[10] = zzd * zzab * base[2];   //   cxyz[18] = TRIPRD(pz,pos[0],pos[1],pos[3]) * base[2];
      cxyz[11] = zzc * zzab * base[3];   //   cxyz[19] = TRIPRD(pz,pos[0],pos[1],pos[2]) * base[3];
    }*/





    /*cxyz[ 0] = cm167*px*(px-one)*(px-two);        // coefficients for cubic interpolation along x
    cxyz[ 1] = cp5*(px+one)*(px-one)*(px-two);
    cxyz[ 2] = cm5*px*(px+one)*(px-two);
    cxyz[ 3] = cp167*px*(px+one)*(px-one);

    cxyz[ 4] = cm167*py*(py-one)*(py-two);        // coefficients for cubic interpolation along y
    cxyz[ 5] = cp5*(py+one)*(py-one)*(py-two);
    cxyz[ 6] = cm5*py*(py+one)*(py-two);
    cxyz[ 7] = cp167*py*(py+one)*(py-one);*/


  //Mohammad


    if(zlinear){
      cxyz[ 8] = 0.0;                    // coefficients for linear interpolation along z
      cxyz[ 9] = 1.0 - dz;
      cxyz[10] = dz;
      cxyz[11] = 0.0;
    }else{
      /*cxyz[ 8] = 0.0;                    // coefficients for linear interpolation along z
      cxyz[ 9] = 1.0 - dz;
      cxyz[10] = dz;
      cxyz[11] = 0.0;*/

      base  = &(lv->ocz[4*iz]);  // precomputed inverses of denominators
      pos   = &(lv->z[iz]);
      pz  = dz * pos[1] + (1.0 - dz) * pos[0];   // pz is now an absolute position
      
      if (Step_kount/2-Step_kount/2.0==0){//backward
        cxyz[ 8] = (pz - pos[0])*(pz - pos[1])/((pos[-1] - pos[0])*(pos[-1] - pos[1]));
        cxyz[ 9] = (pz - pos[-1])*(pz - pos[1])/((pos[0] - pos[-1])*(pos[0] - pos[1]));
        cxyz[10] = (pz - pos[-1])*(pz - pos[0])/((pos[1] - pos[-1])*(pos[1] - pos[0]));
        cxyz[11] = 0.0;
      }
      else{
        cxyz[ 8] = 0.0;
        cxyz[ 9] = (pz - pos[1])*(pz - pos[2])/((pos[0] - pos[1])*(pos[0] - pos[2]));
        cxyz[10] = (pz - pos[0])*(pz - pos[2])/((pos[1] - pos[0])*(pos[1] - pos[2]));
        cxyz[11] = (pz - pos[0])*(pz - pos[1])/((pos[2] - pos[0])*(pos[2] - pos[1]));
      }      
   
    }

  if (Step_kount/2-Step_kount/2.0==0){//backward
    cxyz[0] = cp5*px*(px-one);       // polynomial coefficients along i
    cxyz[1] = -1.0*(px+one)*(px-one);
    cxyz[2] = cp5*px*(px+one);
    cxyz[3] = 0.0;//cp167*x*(x+one)*(x-one);

    cxyz[4] = cp5*py*(py-one);       // polynomial coefficients along j
    cxyz[5] = -1.0*(py+one)*(py-one);
    cxyz[6] = cp5*py*(py+one);
    cxyz[7] = 0.0;//cp167*y*(y+one)*(y-one);
  }
  else{//forward
    cxyz[0] = 0.0;//cm167*x*(x-one)*(x-two);       // polynomial coefficients along i
    cxyz[1] = cp5*(px-one)*(px-two);
    cxyz[2] = -1.0*px*(px-two);
    cxyz[3] = cp5*px*(px-one);

    cxyz[4] = 0.0;//cm167*y*(y-one)*(y-two);       // polynomial coefficients along j
    cxyz[5] = cp5*(py-one)*(py-two);
    cxyz[6] = -1.0*py*(py-two);
    cxyz[7] = cp5*py*(py-one);
  }


    //linear
/*    cxyz[ 8] = 0.0;                    // coefficients for linear interpolation along z
    cxyz[ 9] = 1.0 - dz;
    cxyz[10] = dz;
    cxyz[11] = 0.0;

    cxyz[0] = 0.0;//cp5*x*(x-one);       // polynomial coefficients along i
    cxyz[1] = -1.0*(px-one);
    cxyz[2] = 1.0*px;
    cxyz[3] = 0.0;//cp167*x*(x+one)*(x-one);

    cxyz[4] = 0.0;//cp5*y*(y-one);       // polynomial coefficients along j
    cxyz[5] = -1.0*(py-one);
    cxyz[6] = 1.0*py;
    cxyz[7] = 0.0;//cp167*y*(y+one)*(y-one);
*/



  return zlinear;  // linear / cubic flag
}

// Fortran dimensions: f1(NI,NJ,NK)
// NI      : length of a line
// NINJ    : length of a plane
// zlinear : zero if cubic interpolation along z, non zero if linear interpolation along z
// f1      : address of lower left corner of the 4 x 4 x 4 or 4 x 4 x 2 interpolation box
// interpolation is done along z, then along y, then along x
// values are interpolated using the cubic/linear polynomial coefficients
// pxyz(8) and pxyz(11) both = 0.0 when linear interpolation along z
// pxyz(9) = 1.0 - dz, pxyz(10) = dz are expected in the linear case
// in the z linear case, planes 0 and 1 are the same as are planes 2 and 3
// static inline void Tricublin_zyxf_beta(float *d, float *f1, double *px, double *py, double *pz, int NI, int NINJ){
static inline void Sweep_zyxf1_inline(float *d, float *f1, double *pxyz, int NI, int NINJ, int zlinear, int Step_kount){
  int ni = NI;
  int ninj = NINJ;
  int ninjl;    // ninj (cubic along z) or 0 (linear along z)
  int ni2, ni3;
  double *px = pxyz;
  double *py = pxyz+4;
  double *pz = pxyz+8;

  double va4[4], vb4[4], vc4[4], vd4[4], dst[4];
  int i, ninj2, ninj3;

  ni2 = ni + ni;
  ni3 = ni2 + ni;
  ninjl = ninj ; // assuming cubic case. in the linear case, ninjl will be set to 0
  if(zlinear) ninjl = 0;

  ninj2 = ninj + ninjl;
  ninj3 = ninj2 + ninjl;
  // field 1
  /*for (i=0 ; i<4 ; i++){
    va4[i] = f1[i    ]*pz[0] + f1[i    +ninjl]*pz[1] +  f1[i    +ninj2]*pz[2] + f1[i    +ninj3]*pz[3];
    vb4[i] = f1[i+ni ]*pz[0] + f1[i+ni +ninjl]*pz[1] +  f1[i+ni +ninj2]*pz[2] + f1[i+ni +ninj3]*pz[3];
    vc4[i] = f1[i+ni2]*pz[0] + f1[i+ni2+ninjl]*pz[1] +  f1[i+ni2+ninj2]*pz[2] + f1[i+ni2+ninj3]*pz[3];
    vd4[i] = f1[i+ni3]*pz[0] + f1[i+ni3+ninjl]*pz[1] +  f1[i+ni3+ninj2]*pz[2] + f1[i+ni3+ninj3]*pz[3];
    dst[i] = va4[i]*py[0] + vb4[i]*py[1] + vc4[i]*py[2] + vd4[i]*py[3];
  }
  d[0] = dst[0]*px[0] + dst[1]*px[1] + dst[2]*px[2] + dst[3]*px[3];
  */
  
//SWEEP
  if (Step_kount/2-Step_kount/2.0==0){//backward
    for (i=0 ; i<3 ; i++){
      va4[i] = f1[i    ]*pz[0] + f1[i    +ninjl]*pz[1] +  f1[i    +ninj2]*pz[2];// + f1[i    +ninj3]*pz[3];
      vb4[i] = f1[i+ni ]*pz[0] + f1[i+ni +ninjl]*pz[1] +  f1[i+ni +ninj2]*pz[2];// + f1[i+ni +ninj3]*pz[3];
      vc4[i] = f1[i+ni2]*pz[0] + f1[i+ni2+ninjl]*pz[1] +  f1[i+ni2+ninj2]*pz[2];// + f1[i+ni2+ninj3]*pz[3];
      //vd4[i] = f1[i+ni3]*pz[0] + f1[i+ni3+ninjl]*pz[1] +  f1[i+ni3+ninj2]*pz[2] + f1[i+ni3+ninj3]*pz[3];
      dst[i] = va4[i]*py[0] + vb4[i]*py[1] + vc4[i]*py[2];// + vd4[i]*py[3];
    }
    d[0] = dst[0]*px[0] + dst[1]*px[1] + dst[2]*px[2];// + dst[3]*px[3];
  }
  else{//forward
    for (i=1 ; i<4 ; i++){
      //va4[i] = f1[i    ]*pz[0] + f1[i    +ninjl]*pz[1] +  f1[i    +ninj2]*pz[2] + f1[i    +ninj3]*pz[3];
      vb4[i] = /*f1[i+ni ]*pz[0] + */f1[i+ni +ninjl]*pz[1] +  f1[i+ni +ninj2]*pz[2] + f1[i+ni +ninj3]*pz[3];
      vc4[i] = /*f1[i+ni2]*pz[0] + */f1[i+ni2+ninjl]*pz[1] +  f1[i+ni2+ninj2]*pz[2] + f1[i+ni2+ninj3]*pz[3];
      vd4[i] = /*f1[i+ni3]*pz[0] + */f1[i+ni3+ninjl]*pz[1] +  f1[i+ni3+ninj2]*pz[2] + f1[i+ni3+ninj3]*pz[3];
      dst[i] = /*va4[i]*py[0] + */vb4[i]*py[1] + vc4[i]*py[2] + vd4[i]*py[3];
    }
    d[0] = /*dst[0]*px[0] + */dst[1]*px[1] + dst[2]*px[2] + dst[3]*px[3];

  }

}





// Fortran dimensions: d(n) , f1(*), pxyz(3,n), lv(*)
// d       : results
// f1      : source array (dimensions found in opaque object lv)
//           first index along each dimension assumed to be 1
// pxyz    : positions along x, y, z , in "index" space
// lv      : opaque object containing vertical description (see Vsearch_setup)
// n       : number of points


// process n points
// for each point 1 value from ixyz, 3 values from pxyz are used (1 element)
// it is ASSUMED that along X and Y interpolation will always be CUBIC
// ( 2 <= px < "ni"-1 ) ( 2 <= py < "nj"-1 )
// pz < 2 and pz >= nk - 1 will induce linear interpolation or extrapolation
//****f* librkl/Tricublin_zyx1_n
// Synopsis
//
// ARGUMENTS
void Sweep_zyx1_n(float *d, float *f1, pxpypz *pxyz,  ztab *lv, int n, int Step_kount)
//****
{
  double cxyz[24];   // interpolation coefficients 4 for each dimension (x, y, z)
  int ixyz;          // unilinear index into array f1 (collapsed dimensions)
  int zlinear;       // non zero if linear interpolation
                     // all above computed in Vcoef_pxyz4, used in Tricublin_zyxf1
  while(n--){
    zlinear = Vcoef_pxyz4_inline(cxyz, &ixyz, pxyz->px, pxyz->py, pxyz->pz, lv, Step_kount);  // compute coefficients
    Sweep_zyxf1_inline(d, f1 + ixyz, cxyz, lv->ni, lv->nij, zlinear, Step_kount);         // interpolate
    d++;         // next result
    pxyz += 1;   // next set of positions
  }
}

//****f* librkl/Tricublin_zyx1_n_m
// Synopsis
//
// ARGUMENTS
void Sweep_zyx1_n_m(float *d, float **fs, pxpypz *pxyz,  ztab *lv, int n, int m, int Step_kount)
//****
{  // multiple field version with sources pointer lists, d[n][m]  ( Fortran dimension(m,n))
  double cxyz[24];   // interpolation coefficients 4 for each dimension (x, y, z)
  int ixyz;          // unilinear index into array f1 (collapsed dimensions)
  int i;
  int zlinear;       // non zero if linear interpolation
                     // all above computed in Vcoef_pxyz4, used in Tricublin_zyxf1
  while(n--){
    zlinear = Vcoef_pxyz4_inline(cxyz, &ixyz, pxyz->px, pxyz->py, pxyz->pz, lv, Step_kount);  // compute coefficients
    for(i=0 ; i< m ; i++){
      Sweep_zyxf1_inline(d, fs[i] + ixyz, cxyz, lv->ni, lv->nij, zlinear, Step_kount);       // interpolate
      d++;   // next result
    }
    pxyz += 1;   // next set of positions
  }
}

//****f* librkl/Tricublin_zyx1_m_n
// Synopsis
//
// ARGUMENTS
void Sweep_zyx1_m_n(float *dm, float **fs, pxpypz *pxyz,  ztab *lv, int n, int m, int Step_kount)
//****
{  // multiple field version with sources pointer lists, d[m][n]  ( Fortran dimension(n,m))
  double cxyz[24];   // interpolation coefficients 4 for each dimension (x, y, z)
  int ixyz;          // unilinear index into array f1 (collapsed dimensions)
  int i, j;
  int n0 = n;
  float *d0;
  int zlinear;       // non zero if linear interpolation
                     // all above computed in Vcoef_pxyz4, used in Tricublin_zyxf1
  
 // printf("Mohammad Tricublin_zyx1_m_n \n");
  
  for(j=0 ; j<n ; j++){                                                           // loop over n points
    zlinear = Vcoef_pxyz4_inline(cxyz, &ixyz, pxyz->px, pxyz->py, pxyz->pz, lv, Step_kount);  // compute coefficients
    d0 = dm;                                                                      // base address for results
    for(i=0 ; i< m ; i++){
      Sweep_zyxf1_inline(d0, fs[i] + ixyz, cxyz, lv->ni, lv->nij, zlinear, Step_kount);       // interpolate
//       printf("DEBUG: i = %d, n= %d, offset = %d, value = %f \n",i,n,d0-dm,*d0);
      d0 = d0 + n0;                                                                   // next result
    }
    dm++;        // next set of results
    pxyz += 1;   // next set of positions
//     return;
  }
}

//****f* librkl/Tricublin_zyx1_p
// Synopsis
//
// ARGUMENTS
void Sweep_zyx1_p(float **dp, float **fs, pxpypz *pxyz,  ztab *lv, int n, int m, int Step_kount)
//****
{  // multiple field version with results and sources pointer lists
  double cxyz[24];   // interpolation coefficients 4 for each dimension (x, y, z)
  int ixyz;          // unilinear index into array f1 (collapsed dimensions)
  int i, j;
  int zlinear;       // non zero if linear interpolation
                     // all above computed in Vcoef_pxyz4, used in Tricublin_zyxf1
  for(j=0 ; j<n ; j++){                                                                  // loop over n points
    zlinear = Vcoef_pxyz4_inline(cxyz, &ixyz, pxyz->px, pxyz->py, pxyz->pz, lv, Step_kount);         // compute coefficients
    for(i=0 ; i< m ; i++){                                                               // loop over m variables
      Sweep_zyxf1_inline(dp[i] + j, fs[i] + ixyz, cxyz, lv->ni, lv->nij, zlinear, Step_kount);   // interpolate
    }
    pxyz += 1;   // next set of positions
  }
}

static inline void Sweep_zyx_mm_d_inline(float *d, float *lin, float *min, float *max, float *f1, double *cxyz, int NI, int NINJ, int zlinear, int Step_kount){
  int ni = NI;
  int ninj = NINJ;
  int ninjl;    // ninj (cubic along z) or 0 (linear along z)
  int ni2, ni3;
  double *px, *py, *pz;

  double va4[4], vb4[4], vc4[4], vd4[4], dst[4], dsl[4];
  int i, ninj2, ninj3;
  double ma, mi;

  px = cxyz;
  py = cxyz + 4;
  pz = cxyz + 8;
  ni2 = ni + ni;
  ni3 = ni2 + ni;
  ninjl = ninj ; // assuming cubic case. in the linear case, ninjl will be set to 0
  if(zlinear) ninjl = 0;


  ninj2 = ninj + ninjl;
  ninj3 = ninj2 + ninjl;
  // field 1
  // TODO: add min max code
  /*for (i=0 ; i<4 ; i++){   // tricubic or bicubic/linear interpolation
    va4[i] = f1[i    ]*pz[0] + f1[i    +ninjl]*pz[1] +  f1[i    +ninj2]*pz[2] + f1[i    +ninj3]*pz[3];
    vb4[i] = f1[i+ni ]*pz[0] + f1[i+ni +ninjl]*pz[1] +  f1[i+ni +ninj2]*pz[2] + f1[i+ni +ninj3]*pz[3];
    vc4[i] = f1[i+ni2]*pz[0] + f1[i+ni2+ninjl]*pz[1] +  f1[i+ni2+ninj2]*pz[2] + f1[i+ni2+ninj3]*pz[3];
    vd4[i] = f1[i+ni3]*pz[0] + f1[i+ni3+ninjl]*pz[1] +  f1[i+ni3+ninj2]*pz[2] + f1[i+ni3+ninj3]*pz[3];
    dst[i] = va4[i]*py[0] + vb4[i]*py[1] + vc4[i]*py[2] + vd4[i]*py[3];
  }
  d[0] = dst[0]*px[0] + dst[1]*px[1] + dst[2]*px[2] + dst[3]*px[3];
  */
  

  //SWEEP
  if (Step_kount/2-Step_kount/2.0==0){//backward
    for (i=0 ; i<3 ; i++){
      va4[i] = f1[i    ]*pz[0] + f1[i    +ninjl]*pz[1] +  f1[i    +ninj2]*pz[2];// + f1[i    +ninj3]*pz[3];
      vb4[i] = f1[i+ni ]*pz[0] + f1[i+ni +ninjl]*pz[1] +  f1[i+ni +ninj2]*pz[2];// + f1[i+ni +ninj3]*pz[3];
      vc4[i] = f1[i+ni2]*pz[0] + f1[i+ni2+ninjl]*pz[1] +  f1[i+ni2+ninj2]*pz[2];// + f1[i+ni2+ninj3]*pz[3];
      //vd4[i] = f1[i+ni3]*pz[0] + f1[i+ni3+ninjl]*pz[1] +  f1[i+ni3+ninj2]*pz[2] + f1[i+ni3+ninj3]*pz[3];
      dst[i] = va4[i]*py[0] + vb4[i]*py[1] + vc4[i]*py[2];// + vd4[i]*py[3];
    }
    d[0] = dst[0]*px[0] + dst[1]*px[1] + dst[2]*px[2];// + dst[3]*px[3];
  }
  else{//forward
    for (i=1 ; i<4 ; i++){
      //va4[i] = f1[i    ]*pz[0] + f1[i    +ninjl]*pz[1] +  f1[i    +ninj2]*pz[2] + f1[i    +ninj3]*pz[3];
      vb4[i] = /*f1[i+ni ]*pz[0] + */f1[i+ni +ninjl]*pz[1] +  f1[i+ni +ninj2]*pz[2] + f1[i+ni +ninj3]*pz[3];
      vc4[i] = /*f1[i+ni2]*pz[0] + */f1[i+ni2+ninjl]*pz[1] +  f1[i+ni2+ninj2]*pz[2] + f1[i+ni2+ninj3]*pz[3];
      vd4[i] = /*f1[i+ni3]*pz[0] + */f1[i+ni3+ninjl]*pz[1] +  f1[i+ni3+ninj2]*pz[2] + f1[i+ni3+ninj3]*pz[3];
      dst[i] = /*va4[i]*py[0] + */vb4[i]*py[1] + vc4[i]*py[2] + vd4[i]*py[3];
    }
    d[0] = /*dst[0]*px[0] + */dst[1]*px[1] + dst[2]*px[2] + dst[3]*px[3];
  }


  
  for (i=1 ; i<3 ; i++){   // linear interpolation
    vb4[i] = f1[i+ni +ninjl]*cxyz[18] + f1[i+ni +ninj2]*cxyz[19];
    vc4[i] = f1[i+ni2+ninjl]*cxyz[18] + f1[i+ni2+ninj2]*cxyz[19];
    dsl[i] = vb4[i]*cxyz[16] + vc4[i]*cxyz[17];
  }
  lin[0] = dsl[1]*cxyz[13] + dsl[2]*cxyz[14];
  ma = f1[1 + ni + ninjl] ; mi = ma;   // point [1,1,1] of 2x2x2 inner box
  for (i=1 ; i<3 ; i++){              // min max of 2x2x2 inner box
    ma = MAX(ma , f1[i + ni  + ninjl]); ma = MAX(ma , f1[i + ni  + ninj2]);
    mi = MIN(mi , f1[i + ni  + ninjl]); mi = MIN(mi , f1[i + ni  + ninj2]);

    ma = MAX(ma , f1[i + ni2 + ninjl]); ma = MAX(ma , f1[i + ni2 + ninj2]);
    mi = MIN(mi , f1[i + ni2 + ninjl]); mi = MIN(mi , f1[i + ni2 + ninj2]);
  }
  *max = ma ; *min = mi;
}

//****f* librkl/Tricublin_mono_zyx_n
// Synopsis
//
// ARGUMENTS
void Sweep_mono_zyx_n(float *d, float *l, float *mi, float *ma, float *f, pxpypz *pxyz,  ztab *lv, int n, int Step_kount)
//****
{
  double cxyz[24];   // interpolation coefficients 4 for each dimension (x, y, z)
  int ixyz;          // unilinear index into array f1 (collapsed dimensions)
  int zlinear;       // non zero if linear interpolation
                     // all above computed in Vcoef_pxyz4, used in Tricublin_zyxf1
  while(n--){
    zlinear = Vcoef_pxyz4_inline(cxyz, &ixyz, pxyz->px, pxyz->py, pxyz->pz, lv, Step_kount);  // compute coefficients
    Sweep_zyx_mm_d_inline(d, l, mi, ma, f + ixyz, cxyz, lv->ni, lv->nij, zlinear, Step_kount);         // interpolate
    d++;         // next result
    l++;
    mi++;
    ma++;
    pxyz += 1;   // next set of positions
  }
}

//****f* librkl/Tricublin_mono_zyx_n_m
// Synopsis
//
// ARGUMENTS
void Sweep_mono_zyx_n_m(float *d, float *l, float *mi, float *ma, float **fs, pxpypz *pxyz,  ztab *lv, int n, int m, int Step_kount)
//****
{
  int i;
  double cxyz[24];   // interpolation coefficients 4 for each dimension (x, y, z)
  int ixyz;          // unilinear index into array f1 (collapsed dimensions)
  int zlinear;       // non zero if linear interpolation
                     // all above computed in Vcoef_pxyz4, used in Tricublin_zyxf1
  while(n--){
    zlinear = Vcoef_pxyz4_inline(cxyz, &ixyz, pxyz->px, pxyz->py, pxyz->pz, lv, Step_kount);  // compute coefficients
    for(i=0 ; i<m ; i++){
      Sweep_zyx_mm_d_inline(d, l, mi, ma, fs[i] + ixyz, cxyz, lv->ni, lv->nij, zlinear, Step_kount);         // interpolate
      d++;         // next result
      l++;
      mi++;
      ma++;
    }
    pxyz += 1;   // next set of positions
  }
}

//****f* librkl/Tricublin_mono_zyx_m_n
// Synopsis
//
// ARGUMENTS
void Sweep_mono_zyx_m_n(float *dm, float *lm, float *mim, float *mam, float **fs, pxpypz *pxyz,  ztab *lv, int n, int m, int Step_kount)
//****
{

  int i;
  float *d0, *l0, *mi0, *ma0;
  double cxyz[24];   // interpolation coefficients 4 for each dimension (x, y, z)
  int ixyz;          // unilinear index into array f1 (collapsed dimensions)
  int n0 = n;
  int zlinear;       // non zero if linear interpolation
                     // all above computed in Vcoef_pxyz4, used in Tricublin_zyxf1
  while(n--){
    zlinear = Vcoef_pxyz4_inline(cxyz, &ixyz, pxyz->px, pxyz->py, pxyz->pz, lv, Step_kount);  // compute coefficients
    d0 = dm;
    l0 = lm;
    mi0 = mim;
    ma0 = mam;
    for(i=0 ; i<m ; i++){
      Sweep_zyx_mm_d_inline(d0, l0, mi0, ma0, fs[i] + ixyz, cxyz, lv->ni, lv->nij, zlinear, Step_kount);         // interpolate
      d0  += n0;         // next result
      l0  += n0;
      mi0 += n0;
      ma0 += n0;
    }
    dm++;         // next set of results
    lm++;
    mim++;
    mam++;
    pxyz += 1;   // next set of positions
  }
}
// Fortran dimensions: d(3) , f(3,NI,NJ,NK)
// NI   : size of a 1D line
// NINJ : size of a 2D plane
// f : address of lower left corner of the 4 x 4 x 4 or 4 x 4 x 2 box
// interpolation is done along z, then along y, then along x
// 3 values are interpolated using the same cubic/linear polynomial coefficients
// pz(2) = 1.0 - dz, pz(3) = dz are expected in the linear along z case (0.0 <= dz <= 1.0)
// in the z linear case, planes 0 and 1 are the same as are planes 2 and 3
// note for future expansion:
// should both interpolations have to be done, planes 1 and 2 can be used for the z linear case,
// and planes 0, 1, 2, 3 for the z cubic case
// in that case another mechanism will have to be used to signal the z linear case
static inline void Sweep_zyxf3_inline(float *d, float *f, double *pxyz, int NI, int NINJ, int zlinear, int Step_kount){
// static inline void Tricublin_zyx3f_beta(float *d, float *f, double *px, double *py, double *pz, int NI, int NINJ){
  int ni = 3*NI;
  int ninj = 3*NINJ;
  double *px = pxyz;
  double *py = pxyz+4;
  double *pz = pxyz+8;
  int ninjl;    // ninj (cubic along z) or 0 (linear along z)
  float *s = f;
  double dst[13];
//   int64_t *L = (int64_t *)d;
  int ni2, ni3;

  double va4[12], vb4[12], vc4[12], vd4[12];
  int ninj2, ninj3, i;

  ni2 = ni + ni;
  ni3 = ni2 + ni;
  dst[12] = 0;
  ninjl = ninj ; // assuming cubic case. in the linear case, ninjl will be set to 0
  if(zlinear) ninjl = 0;

  ninj2 = ninj + ninjl;
  ninj3 = ninj2 + ninjl;
  /*for (i=0 ; i<12 ; i++){
    va4[i] = s[i    ]*pz[0] + s[i    +ninjl]*pz[1] +  s[i    +ninj2]*pz[2] + s[i    +ninj3]*pz[3];
    vb4[i] = s[i+ni ]*pz[0] + s[i+ni +ninjl]*pz[1] +  s[i+ni +ninj2]*pz[2] + s[i+ni +ninj3]*pz[3];
    vc4[i] = s[i+ni2]*pz[0] + s[i+ni2+ninjl]*pz[1] +  s[i+ni2+ninj2]*pz[2] + s[i+ni2+ninj3]*pz[3];
    vd4[i] = s[i+ni3]*pz[0] + s[i+ni3+ninjl]*pz[1] +  s[i+ni3+ninj2]*pz[2] + s[i+ni3+ninj3]*pz[3];
    dst[i] = va4[i]*py[0] + vb4[i]*py[1] + vc4[i]*py[2] + vd4[i]*py[3];
  }
  d[0] = dst[0]*px[0] + dst[3]*px[1] + dst[6]*px[2] + dst[ 9]*px[3];
  d[1] = dst[1]*px[0] + dst[4]*px[1] + dst[7]*px[2] + dst[10]*px[3];
  d[2] = dst[2]*px[0] + dst[5]*px[1] + dst[8]*px[2] + dst[11]*px[3];*/



  //SWEEP
  if (Step_kount/2-Step_kount/2.0==0){//backward
    for (i=0 ; i<9 ; i++){
      va4[i] = s[i    ]*pz[0] + s[i    +ninjl]*pz[1] +  s[i    +ninj2]*pz[2];/* + s[i    +ninj3]*pz[3];*/
      vb4[i] = s[i+ni ]*pz[0] + s[i+ni +ninjl]*pz[1] +  s[i+ni +ninj2]*pz[2];/* + s[i+ni +ninj3]*pz[3];*/
      vc4[i] = s[i+ni2]*pz[0] + s[i+ni2+ninjl]*pz[1] +  s[i+ni2+ninj2]*pz[2];/* + s[i+ni2+ninj3]*pz[3];*/
      vd4[i] = s[i+ni3]*pz[0] + s[i+ni3+ninjl]*pz[1] +  s[i+ni3+ninj2]*pz[2];/* + s[i+ni3+ninj3]*pz[3];*/
      dst[i] = va4[i]*py[0] + vb4[i]*py[1] + vc4[i]*py[2];/* + vd4[i]*py[3];*/
    }
    d[0] = dst[0]*px[0] + dst[3]*px[1] + dst[6]*px[2];/* + dst[ 9]*px[3];*/
    d[1] = dst[1]*px[0] + dst[4]*px[1] + dst[7]*px[2];/* + dst[10]*px[3];*/
    d[2] = dst[2]*px[0] + dst[5]*px[1] + dst[8]*px[2];/* + dst[11]*px[3];*/

  }
  else{//forward
    for (i=3 ; i<12 ; i++){
      va4[i] = /*s[i    ]*pz[0] + */s[i    +ninjl]*pz[1] +  s[i    +ninj2]*pz[2] + s[i    +ninj3]*pz[3];
      vb4[i] = /*s[i+ni ]*pz[0] + */s[i+ni +ninjl]*pz[1] +  s[i+ni +ninj2]*pz[2] + s[i+ni +ninj3]*pz[3];
      vc4[i] = /*s[i+ni2]*pz[0] + */s[i+ni2+ninjl]*pz[1] +  s[i+ni2+ninj2]*pz[2] + s[i+ni2+ninj3]*pz[3];
      vd4[i] = /*s[i+ni3]*pz[0] + */s[i+ni3+ninjl]*pz[1] +  s[i+ni3+ninj2]*pz[2] + s[i+ni3+ninj3]*pz[3];
      dst[i] = /*va4[i]*py[0] + */vb4[i]*py[1] + vc4[i]*py[2] + vd4[i]*py[3];
    }
    d[0] = /*dst[0]*px[0] + */dst[3]*px[1] + dst[6]*px[2] + dst[ 9]*px[3];
    d[1] = /*dst[1]*px[0] + */dst[4]*px[1] + dst[7]*px[2] + dst[10]*px[3];
    d[2] = /*dst[2]*px[0] + */dst[5]*px[1] + dst[8]*px[2] + dst[11]*px[3];
  }











}

// process n triplets
// for each triplet 1 value from ixyz, 3 values from pxyz are used (1 element)
// it is ASSUMED that along X and Y interpolation will always be CUBIC
// ( 2 <= px < "ni"-1 ) ( 2 <= py < "nj"-1 )
// pz < 2 and pz >= nk - 1 will induce linear interpolation or extrapolation
//****f* librkl/Tricublin_zyx3_n
// Synopsis
//
// ARGUMENTS
void Sweep_zyx3_n(float *d, float *f3, pxpypz *pxyz,  ztab *lv, int n, int Step_kount)   // printf("Mohammad Tricublin_zyx3_n\n");
//****
{
  double cxyz[24];   // interpolation coefficients 4 for each dimension (x, y, z)
  int ixyz;          // unilinear index into array f1 (collapsed dimensions)
  int zlinear;       // non zero if linear interpolation
                     // all above computed in Vcoef_pxyz4, used in Tricublin_zyxf1
/*printf*/("%12.7f %12.7f %12.7f\n",pxyz->px, pxyz->py, pxyz->pz);
  while(n--){
    zlinear = Vcoef_pxyz4_inline(cxyz, &ixyz, pxyz->px, pxyz->py, pxyz->pz, lv, Step_kount);  // compute coefficients
    Sweep_zyxf3_inline(d, f3 + ixyz*3, cxyz, lv->ni, lv->nij, zlinear, Step_kount);         // interpolate
    d+=3;         // next result
    pxyz += 1;   // next set of positions
  }
}