#if ! defined(F_TEST)

#if defined(NEVER_TRUE)
 rm -f intrp_bicub_yx intrp_bicub_yx.o intrp_bicub_yx_f.F90
 ln -sf intrp_bicub_yx.c intrp_bicub_yx_f.F90

 to build (Intel compilers) :
 s.cc -O2 -DTIMING -c -march=core-avx2 intrp_bicub_yx.c
 s.f90 -no-wrap-margin -DF_TEST -o intrp_bicub_yx intrp_bicub_yx_f.F90 intrp_bicub_yx.o
 s.cc -O2 -DTIMING -c -march=core2 intrp_bicub_yx.c
 s.f90 -no-wrap-margin -DF_TEST -o intrp_bicub_yx0 intrp_bicub_yx_f.F90 intrp_bicub_yx.o

 to build (GNU compilers) :
 gcc -O3 -DTIMING -c -mavx2 -mfma intrp_bicub_yx.c
 gfortran -DF_TEST -o intrp_bicub_yx intrp_bicub_yx_f.F90 intrp_bicub_yx.o -ffree-line-length-none
 gcc -O3 -DTIMING -c -msse3 intrp_bicub_yx.c
 gfortran -DF_TEST -o intrp_bicub_yx0 intrp_bicub_yx_f.F90 intrp_bicub_yx.o -ffree-line-length-none

 ./intrp_bicub_yx
 ./intrp_bicub_yx0
#endif

static double cp167 =  0.166666666666666667E0;
static double cm167 = -0.166666666666666667E0;
static double cp5 =  .5;
static double cm5 = -.5;
static double one = 1.0;
static double two = 2.0;

#if defined(TIMING)
#include <stdio.h>
#include <stdint.h>
#pragma weak rdtsc_=rdtsc
uint64_t rdtsc_(void);
uint64_t rdtsc(void) {   // version rapide "out of order"
#if defined(__x86_64__)
  uint32_t lo, hi;
  __asm__ volatile ("rdtscp"
      : /* outputs */ "=a" (lo), "=d" (hi)
      : /* no inputs */
      : /* clobbers */ "%rcx");
  return (uint64_t)lo | (((uint64_t)hi) << 32);
#else
  static uint64_t time0;
  return time0++;

#endif
}
#endif

#if defined(__x86_64__)
#include <immintrin.h>
// use separate multiply and add instructions if fused multiply-add not available
#if defined(__AVX__)
#if ! defined(__FMA__)
#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a,b),c)
#define _mm256_fmadd_pd(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b),c)
#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a,b),c)
#define _mm_fmadd_pd(a,b,c) _mm_add_pd(_mm_mul_pd(a,b),c)
#endif
#endif
#endif


// default values mean practically "no limit"
static int qminx = -999999999;  // limits for the 'q' grid, in "origin 0"
static int qmaxx =  999999999;
static int qminy = -999999999;
static int qmaxy =  999999999;

static int uminx = -999999999;  // limits for the 'u' grid
static int umaxx =  999999999;
static int uminy = -999999999;
static int umaxy =  999999999;

static int vminx = -999999999;  // limits for the 'v' grid
static int vmaxx =  999999999;
static int vminy = -999999999;
static int vmaxy =  999999999;

static int qoffx = 0;
static int qoffy = 0;
static int uoffx = 0;
static int uoffy = 0;
static int voffx = 0;
static int voffy = 0;

// set offsets for q grid, replicate q values for u and v grids
void set_intrp_bicub_off_yx(int qoffsetx, int qoffsety){
  //call in yyg_init.F90, qoffsetx=1-l_i0   qoffsety=1-l_j0
  qoffx = qoffsetx;
  qoffy = qoffsety;
  uoffx = qoffsetx;
  uoffy = qoffsety;
  voffx = qoffsetx;
  voffy = qoffsety;
}

// set bounds for 'q' (scalars), 'u', and 'v' grids
void set_intrp_bicub_quv(int qxmin, int qxmax, int qymin, int qymax,
                         int uxmin, int uxmax, int uymin, int uymax,
                         int vxmin, int vxmax, int vymin, int vymax){
 //call in yyg_init.F90, (wb ,eb ,sb ,nb , wbu,ebu,sbu,nbu, wbv,ebv,sbv,nbv)
  
  // qxmin is in "origin 1", qminx is in "origin 0", same for other limits
  // uxmin is in "origin 1", uminx is in "origin 0", same for other limits
  // vxmin is in "origin 1", vminx is in "origin 0", same for other limits
  // bounds for scalars
  qminx = qxmin - 1 ;  // min index along x
  qmaxx = qxmax - 1 ;  // max index along x
  qminy = qymin - 1 ;  // min index along y
  qmaxy = qymax - 1 ;  // max index along y
  // bounds for u
  uminx = uxmin - 1 ;  // min index along x
  umaxx = uxmax - 1 ;  // max index along x
  uminy = uymin - 1 ;  // min index along y
  umaxy = uymax - 1 ;  // max index along y
  // bounds for v
  vminx = vxmin - 1 ;  // min index along x
  vmaxx = vxmax - 1 ;  // max index along x
  vminy = vymin - 1 ;  // min index along y
  vmaxy = vymax - 1 ;  // max index along y
}

static inline int bicub_coeffs_inline(double xx, double yy, int ni, double *wx, double *wy,
                                  int qminx, int qmaxx, int qminy, int qmaxy, int offsetx, int offsety, int Step_n){
  double x, y;
  int ix, iy;
  ix = xx ; if(ix > xx) ix = ix -1 ; ix = ix - 2;      // ix > xx if xx is negative
  iy = yy ; if(iy > yy) iy = iy -1 ; iy = iy - 2;      // iy > yy if yy is negative
  ix = (ix < qminx) ? qminx : ix;        // apply boundary limits
  ix = (ix > qmaxx) ? qmaxx : ix;
  iy = (iy < qminy) ? qminy : iy;
  iy = (iy > qmaxy) ? qmaxy : iy;
  
 // printf("Mohammad %d %d %d   \n", ix, qminx, qmaxx);


  x  = xx - 2 - ix;                      // xx and yy are in "ORIGIN 1"
  y  = yy - 2 - iy;
  
  ix = ix + offsetx;
  iy = iy + offsety;

 // printf("Mohammad %d \n", ix);
 // scanf("%d \n", ix);
//int Mohammad;
//  scanf("%d \n", Mohammad);


//4 grids for each departure point: x0=-1, x1=0, x2=1, x3=2
/*  wx[0] = cm167*x*(x-one)*(x-two);       // polynomial coefficients along i
  wx[1] = cp5*(x+one)*(x-one)*(x-two);
  wx[2] = cm5*x*(x+one)*(x-two);
  wx[3] = cp167*x*(x+one)*(x-one);

  wy[0] = cm167*y*(y-one)*(y-two);       // polynomial coefficients along j
  wy[1] = cp5*(y+one)*(y-one)*(y-two);
  wy[2] = cm5*y*(y+one)*(y-two);
  wy[3] = cp167*y*(y+one)*(y-one);
*/

  //Mohammad
  if (Step_n/2-Step_n/2.0==0){//backward
    wx[0] = cp5*x*(x-one);       // polynomial coefficients along i
    wx[1] = -1.0*(x+one)*(x-one);
    wx[2] = cp5*x*(x+one);
    wx[3] = 0.0;//cp167*x*(x+one)*(x-one);

    wy[0] = cp5*y*(y-one);       // polynomial coefficients along j
    wy[1] = -1.0*(y+one)*(y-one);
    wy[2] = cp5*y*(y+one);
    wy[3] = 0.0;//cp167*y*(y+one)*(y-one);
  }
  else{//forward
    wx[0] = 0.0;//cm167*x*(x-one)*(x-two);       // polynomial coefficients along i
    wx[1] = cp5*(x-one)*(x-two);
    wx[2] = -1.0*x*(x-two);
    wx[3] = cp5*x*(x-one);

    wy[0] = 0.0;//cm167*y*(y-one)*(y-two);       // polynomial coefficients along j
    wy[1] = cp5*(y-one)*(y-two);
    wy[2] = -1.0*y*(y-two);
    wy[3] = cp5*y*(y-one);
  }

 // printf("Mohammad 2142022\n");

    //linear
    /*wx[0] = 0.0;//cp5*x*(x-one);       // polynomial coefficients along i
    wx[1] = -1.0*(x-two);
    wx[2] = 1.0*(x-one);
    wx[3] = 0.0;//cp167*x*(x+one)*(x-one);

    wy[0] = 0.0;//cp5*y*(y-one);       // polynomial coefficients along j
    wy[1] = -1.0*(y-two);
    wy[2] = 1.0*(y-one);
    wy[3] = 0.0;//cp167*y*(y+one)*(y-one);
*/



  //printf("case1 \n");
  //int number;
  //scanf("%d", number);

  
  return ix + iy * ni;               // point to lower left corner of 4 x 4 subarray
}

// version for scalar variables
void intrp_bicub_yx_s(float *f, float *r, int ni, int ninj, int nk, double xx, double yy, int Step_kount){

  double fd0[4], fd1[4], fd2[4], fd3[4] ;

  double  wx[4], wy[4] ;
  int ni2 = ni + ni;    // + 2 rows
  int ni3 = ni2 + ni;   // + 3 rows
  int i, k;
  double x, y;
  int ix, iy;
  //int Step_n;
  //Step_n = Step_kount;

  f += bicub_coeffs_inline(xx, yy, ni, wx, wy, qminx, qmaxx, qminy, qmaxy, qoffx, qoffy, Step_kount);  // use inline function

  /*for(k=0 ; k<nk ; k++){
    for(i=0 ; i<4 ; i++){                   // easily vectorizable form
      fd0[i] = ( f[i]*wy[0] + f[i+ni]*wy[1] + f[i+ni2]*wy[2] + f[i+ni3]*wy[3] ) * wx[i];
    }
    r[0] = fd0[0] + fd0[1] + fd0[2] + fd0[3];
    f+= ninj;
    r ++;
  }*/

  //SWEEP
  if (Step_kount/2-Step_kount/2.0==0){//backward
    for(k=0 ; k<nk ; k++){
      for(i=0 ; i<3 ; i++){                   // easily vectorizable form
        fd0[i] = ( f[i]*wy[0] + f[i+ni]*wy[1] + f[i+ni2]*wy[2] ) * wx[i];
      }
      r[0] = fd0[0] + fd0[1] + fd0[2];
      f+= ninj;
      r ++;
    }
  }
  else{//forward
    for(k=0 ; k<nk ; k++){
      for(i=1 ; i<4 ; i++){                   // easily vectorizable form
        fd0[i] = ( f[i+ni]*wy[1] + f[i+ni2]*wy[2] + f[i+ni3]*wy[3] ) * wx[i];
      }
      r[0] = fd0[1] + fd0[2] + fd0[3];
      f+= ninj;
      r ++;
    }
  }




}

// interpolate u and v with rotation, u and v are on different grids
// hence the 2 sets of positions (xxu,yyu) and (xxv,yyv)
// xxu, yyu, xxv, yyv in fractional index space, same as intrp_bicub_yx_s
// ru, rv : rotated U and V components of the wind at the desired point
// for rotation matrix s, see the GEM model
void intrp_bicub_yx_vec(float *u, float *v, float *ru, float *rv, int ni, int ninj, int nk,
                        double xxu, double yyu, double xxv, double yyv, double s[4], int Step_kount){
//call in yyg_param_mod.F90

  double ud0[4];
  double vd0[4];
  
  double  wxu[4], wyu[4], wxv[4], wyv[4] ;
  double  xu, yu, xv, yv, tu, tv;
  int ni2 = ni + ni;    // + 2 rows
  int ni3 = ni2 + ni;   // + 3 rows
  int i, k;
  int ixu, iyu, ixv, iyv;
  //int Step_n;
  //Step_n = Step_kount;


  u += bicub_coeffs_inline(xxu, yyu, ni, wxu, wyu, uminx, umaxx, uminy, umaxy, uoffx, uoffy, Step_kount);  // use inline function
  v += bicub_coeffs_inline(xxv, yyv, ni, wxv, wyv, vminx, vmaxx, vminy, vmaxy, voffx, voffy, Step_kount);  // use inline function

  /*for(k=0 ; k<nk ; k++){
    for(i=0 ; i<4 ; i++){                   // easily vectorizable form
      ud0[i] = ( u[i]*wyu[0] + u[i+ni]*wyu[1] + u[i+ni2]*wyu[2] + u[i+ni3]*wyu[3] ) * wxu[i];
      vd0[i] = ( v[i]*wyv[0] + v[i+ni]*wyv[1] + v[i+ni2]*wyv[2] + v[i+ni3]*wyv[3] ) * wxv[i];
    }
    tu = ud0[0] + ud0[1] + ud0[2] + ud0[3];
    tv = vd0[0] + vd0[1] + vd0[2] + vd0[3];
    ru[k] = tu*s[0] + tv*s[1] ;              // rotate wind components
    rv[k] = tu*s[2] + tv*s[3] ;
    u+= ninj;
    v+= ninj;
  }*/

  //SWEEP
  if (Step_kount/2-Step_kount/2.0==0){//backward
    for(k=0 ; k<nk ; k++){
      for(i=0 ; i<3 ; i++){                   // easily vectorizable form
        ud0[i] = ( u[i]*wyu[0] + u[i+ni]*wyu[1] + u[i+ni2]*wyu[2] ) * wxu[i];
        vd0[i] = ( v[i]*wyv[0] + v[i+ni]*wyv[1] + v[i+ni2]*wyv[2] ) * wxv[i];
      }
      tu = ud0[0] + ud0[1] + ud0[2];
      tv = vd0[0] + vd0[1] + vd0[2];
      ru[k] = tu*s[0] + tv*s[1] ;              // rotate wind components
      rv[k] = tu*s[2] + tv*s[3] ;
      u+= ninj;
      v+= ninj;
    }
  }
  else{//forward
    for(k=0 ; k<nk ; k++){
      for(i=1 ; i<4 ; i++){                   // easily vectorizable form
        ud0[i] = ( u[i+ni]*wyu[1] + u[i+ni2]*wyu[2] + u[i+ni3]*wyu[3] ) * wxu[i];
        vd0[i] = ( v[i+ni]*wyv[1] + v[i+ni2]*wyv[2] + v[i+ni3]*wyv[3] ) * wxv[i];
      }
      tu = ud0[1] + ud0[2] + ud0[3];
      tv = vd0[1] + vd0[2] + vd0[3];
      ru[k] = tu*s[0] + tv*s[1] ;              // rotate wind components
      rv[k] = tu*s[2] + tv*s[3] ;
      u+= ninj;
      v+= ninj;
    }

  }

}

// interpolate u and v with rotation, u and v on same grid, at position (xx,yy)
// xx, yy in fractional index space, same as intrp_bicub_yx_s
// ru, rv : rotated U and V components of the wind at the desired point
// for rotation matrix s, see the GEM model
void intrp_bicub_yx_uv(float *u, float *v, float *ru, float *rv, int ni, int ninj, int nk, double xx, double yy, double s[4], int Step_kount){
// call in yyg_xchng_vec_q2q.F90
  
  double ud0[4];
  double vd0[4];
  
  double  wx[4], wy[4] ;
  double  x, y;
  int ni2 = ni + ni;    // + 2 rows
  int ni3 = ni2 + ni;   // + 3 rows
  int i, k;
  int ix, iy;
  double tu, tv;
  //int Step_n;
  //Step_n = Step_kount;


  i = bicub_coeffs_inline(xx, yy, ni, wx, wy, qminx, qmaxx, qminy, qmaxy, qoffx, qoffy, Step_kount);  // use inline function
  u += i; v+= i;

  /*for(k=0 ; k<nk ; k++){
    for(i=0 ; i<4 ; i++){                   // easily vectorizable form
      ud0[i] = ( u[i]*wy[0] + u[i+ni]*wy[1] + u[i+ni2]*wy[2] + u[i+ni3]*wy[3] ) * wx[i];
      vd0[i] = ( v[i]*wy[0] + v[i+ni]*wy[1] + v[i+ni2]*wy[2] + v[i+ni3]*wy[3] ) * wx[i];
    }
    tu = ud0[0] + ud0[1] + ud0[2] + ud0[3];
    tv = vd0[0] + vd0[1] + vd0[2] + vd0[3];
    ru[k] = tu*s[0] + tv*s[1] ;              // rotate wind components
    rv[k] = tu*s[2] + tv*s[3] ;
    u  += ninj;
    v  += ninj;
  }*/



  //SWEEP
  if (Step_kount/2-Step_kount/2.0==0){//backward
    for(k=0 ; k<nk ; k++){
      for(i=0 ; i<3 ; i++){                   // easily vectorizable form
        ud0[i] = ( u[i]*wy[0] + u[i+ni]*wy[1] + u[i+ni2]*wy[2] ) * wx[i];
        vd0[i] = ( v[i]*wy[0] + v[i+ni]*wy[1] + v[i+ni2]*wy[2] ) * wx[i];
      }
      tu = ud0[0] + ud0[1] + ud0[2];
      tv = vd0[0] + vd0[1] + vd0[2];
      ru[k] = tu*s[0] + tv*s[1] ;              // rotate wind components
      rv[k] = tu*s[2] + tv*s[3] ;
      u  += ninj;
      v  += ninj;
    }

  }
  else{//forward
    for(k=0 ; k<nk ; k++){
      for(i=1 ; i<4 ; i++){                   // easily vectorizable form
        ud0[i] = ( u[i+ni]*wy[1] + u[i+ni2]*wy[2] + u[i+ni3]*wy[3] ) * wx[i];
        vd0[i] = ( v[i+ni]*wy[1] + v[i+ni2]*wy[2] + v[i+ni3]*wy[3] ) * wx[i];
      }
      tu = ud0[0] + ud0[1] + ud0[2] + ud0[3];
      tv = vd0[0] + vd0[1] + vd0[2] + vd0[3];
      ru[k] = tu*s[0] + tv*s[1] ;              // rotate wind components
      rv[k] = tu*s[2] + tv*s[3] ;
      u  += ninj;
      v  += ninj;
    }

  }







}

// "monotonic" version, the result must be between the max and min values of the 2x2 center
// of the 4x4 subarray used to interpolate
// SIMD version now consistent with  non SIMD version
void intrp_bicub_yx_s_mono(float *f, float *r, int ni, int ninj, int nk, double xx, double yy, int Step_kount){
//call in yyg_param_mod.F90

  double fd0[4], fd1[4], fd2[4], fd3[4] ;
  
  double  wx[4], wy[4] ;
  double  x, y, tm1, tm2;
  float minval, maxval;
  int ni2 = ni + ni;    // + 2 rows
  int ni3 = ni2 + ni;   // + 3 rows
  int i, k;
  int ix, iy;
  //int Step_n;
  //Step_n = Step_kount;


  f += bicub_coeffs_inline(xx, yy, ni, wx, wy, qminx, qmaxx, qminy, qmaxy, qoffx, qoffy, Step_kount);  // use inline function

  for(k=0 ; k<nk ; k++){
    minval = f[1];
    maxval = minval;
    for(i=1 ; i<3 ; i++){   // compute monotonic limits, center 2x2 points
      minval = (f[i+ni ] < minval) ? f[i+ni ] : minval;
      minval = (f[i+ni2] < minval) ? f[i+ni2] : minval;
      maxval = (f[i+ni ] > maxval) ? f[i+ni ] : maxval;
      maxval = (f[i+ni2] > maxval) ? f[i+ni2] : maxval;
    }
    /*for(i=0 ; i<4 ; i++){                   // easily vectorizable form
      fd0[i] = ( f[i]*wy[0] + f[i+ni]*wy[1] + f[i+ni2]*wy[2] + f[i+ni3]*wy[3] ) * wx[i];
    }
    r[0] = fd0[0] + fd0[1] + fd0[2] + fd0[3];
   */ 
    
    //SWEEP
    if (Step_kount/2-Step_kount/2.0==0){//backward
      for(i=0 ; i<3 ; i++){                   // easily vectorizable form
        fd0[i] = ( f[i]*wy[0] + f[i+ni]*wy[1] + f[i+ni2]*wy[2] ) * wx[i];
      }
      r[0] = fd0[0] + fd0[1] + fd0[2];
    }
    else{//forward
      for(i=1 ; i<4 ; i++){                   // easily vectorizable form
        fd0[i] = ( f[i+ni]*wy[1] + f[i+ni2]*wy[2] + f[i+ni3]*wy[3] ) * wx[i];
      }
      r[0] = fd0[1] + fd0[2] + fd0[3];
    }

    
    r[0] = (r[0] < minval ) ? minval : r[0] ;  // apply monotonic limits
    r[0] = (r[0] > maxval ) ? maxval : r[0] ;
    f += ninj;
    r ++;  
  }

}

#endif