#if ! defined(F_TEST) #if defined(NEVER_TRUE) rm -f intrp_bicub_yx intrp_bicub_yx.o intrp_bicub_yx_f.F90 ln -sf intrp_bicub_yx.c intrp_bicub_yx_f.F90 to build (Intel compilers) : s.cc -O2 -DTIMING -c -march=core-avx2 intrp_bicub_yx.c s.f90 -no-wrap-margin -DF_TEST -o intrp_bicub_yx intrp_bicub_yx_f.F90 intrp_bicub_yx.o s.cc -O2 -DTIMING -c -march=core2 intrp_bicub_yx.c s.f90 -no-wrap-margin -DF_TEST -o intrp_bicub_yx0 intrp_bicub_yx_f.F90 intrp_bicub_yx.o to build (GNU compilers) : gcc -O3 -DTIMING -c -mavx2 -mfma intrp_bicub_yx.c gfortran -DF_TEST -o intrp_bicub_yx intrp_bicub_yx_f.F90 intrp_bicub_yx.o -ffree-line-length-none gcc -O3 -DTIMING -c -msse3 intrp_bicub_yx.c gfortran -DF_TEST -o intrp_bicub_yx0 intrp_bicub_yx_f.F90 intrp_bicub_yx.o -ffree-line-length-none ./intrp_bicub_yx ./intrp_bicub_yx0 #endif static double cp167 = 0.166666666666666667E0; static double cm167 = -0.166666666666666667E0; static double cp5 = .5; static double cm5 = -.5; static double one = 1.0; static double two = 2.0; #if defined(TIMING) #include <stdio.h> #include <stdint.h> #pragma weak rdtsc_=rdtsc uint64_t rdtsc_(void); uint64_t rdtsc(void) { // version rapide "out of order" #if defined(__x86_64__) uint32_t lo, hi; __asm__ volatile ("rdtscp" : /* outputs */ "=a" (lo), "=d" (hi) : /* no inputs */ : /* clobbers */ "%rcx"); return (uint64_t)lo | (((uint64_t)hi) << 32); #else static uint64_t time0; return time0++; #endif } #endif #if defined(__x86_64__) #include <immintrin.h> // use separate multiply and add instructions if fused multiply-add not available #if defined(__AVX__) #if ! defined(__FMA__) #define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a,b),c) #define _mm256_fmadd_pd(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b),c) #define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a,b),c) #define _mm_fmadd_pd(a,b,c) _mm_add_pd(_mm_mul_pd(a,b),c) #endif #endif #endif // default values mean practically "no limit" static int qminx = -999999999; // limits for the 'q' grid, in "origin 0" static int qmaxx = 999999999; static int qminy = -999999999; static int qmaxy = 999999999; static int uminx = -999999999; // limits for the 'u' grid static int umaxx = 999999999; static int uminy = -999999999; static int umaxy = 999999999; static int vminx = -999999999; // limits for the 'v' grid static int vmaxx = 999999999; static int vminy = -999999999; static int vmaxy = 999999999; static int qoffx = 0; static int qoffy = 0; static int uoffx = 0; static int uoffy = 0; static int voffx = 0; static int voffy = 0; // set offsets for q grid, replicate q values for u and v grids void set_intrp_bicub_off_yx(int qoffsetx, int qoffsety){ //call in yyg_init.F90, qoffsetx=1-l_i0 qoffsety=1-l_j0 qoffx = qoffsetx; qoffy = qoffsety; uoffx = qoffsetx; uoffy = qoffsety; voffx = qoffsetx; voffy = qoffsety; } // set bounds for 'q' (scalars), 'u', and 'v' grids void set_intrp_bicub_quv(int qxmin, int qxmax, int qymin, int qymax, int uxmin, int uxmax, int uymin, int uymax, int vxmin, int vxmax, int vymin, int vymax){ //call in yyg_init.F90, (wb ,eb ,sb ,nb , wbu,ebu,sbu,nbu, wbv,ebv,sbv,nbv) // qxmin is in "origin 1", qminx is in "origin 0", same for other limits // uxmin is in "origin 1", uminx is in "origin 0", same for other limits // vxmin is in "origin 1", vminx is in "origin 0", same for other limits // bounds for scalars qminx = qxmin - 1 ; // min index along x qmaxx = qxmax - 1 ; // max index along x qminy = qymin - 1 ; // min index along y qmaxy = qymax - 1 ; // max index along y // bounds for u uminx = uxmin - 1 ; // min index along x umaxx = uxmax - 1 ; // max index along x uminy = uymin - 1 ; // min index along y umaxy = uymax - 1 ; // max index along y // bounds for v vminx = vxmin - 1 ; // min index along x vmaxx = vxmax - 1 ; // max index along x vminy = vymin - 1 ; // min index along y vmaxy = vymax - 1 ; // max index along y } static inline int bicub_coeffs_inline(double xx, double yy, int ni, double *wx, double *wy, int qminx, int qmaxx, int qminy, int qmaxy, int offsetx, int offsety, int Step_n){ double x, y; int ix, iy; ix = xx ; if(ix > xx) ix = ix -1 ; ix = ix - 2; // ix > xx if xx is negative iy = yy ; if(iy > yy) iy = iy -1 ; iy = iy - 2; // iy > yy if yy is negative ix = (ix < qminx) ? qminx : ix; // apply boundary limits ix = (ix > qmaxx) ? qmaxx : ix; iy = (iy < qminy) ? qminy : iy; iy = (iy > qmaxy) ? qmaxy : iy; // printf("Mohammad %d %d %d \n", ix, qminx, qmaxx); x = xx - 2 - ix; // xx and yy are in "ORIGIN 1" y = yy - 2 - iy; ix = ix + offsetx; iy = iy + offsety; // printf("Mohammad %d \n", ix); // scanf("%d \n", ix); //int Mohammad; // scanf("%d \n", Mohammad); //4 grids for each departure point: x0=-1, x1=0, x2=1, x3=2 /* wx[0] = cm167*x*(x-one)*(x-two); // polynomial coefficients along i wx[1] = cp5*(x+one)*(x-one)*(x-two); wx[2] = cm5*x*(x+one)*(x-two); wx[3] = cp167*x*(x+one)*(x-one); wy[0] = cm167*y*(y-one)*(y-two); // polynomial coefficients along j wy[1] = cp5*(y+one)*(y-one)*(y-two); wy[2] = cm5*y*(y+one)*(y-two); wy[3] = cp167*y*(y+one)*(y-one); */ //Mohammad if (Step_n/2-Step_n/2.0==0){//backward wx[0] = cp5*x*(x-one); // polynomial coefficients along i wx[1] = -1.0*(x+one)*(x-one); wx[2] = cp5*x*(x+one); wx[3] = 0.0;//cp167*x*(x+one)*(x-one); wy[0] = cp5*y*(y-one); // polynomial coefficients along j wy[1] = -1.0*(y+one)*(y-one); wy[2] = cp5*y*(y+one); wy[3] = 0.0;//cp167*y*(y+one)*(y-one); } else{//forward wx[0] = 0.0;//cm167*x*(x-one)*(x-two); // polynomial coefficients along i wx[1] = cp5*(x-one)*(x-two); wx[2] = -1.0*x*(x-two); wx[3] = cp5*x*(x-one); wy[0] = 0.0;//cm167*y*(y-one)*(y-two); // polynomial coefficients along j wy[1] = cp5*(y-one)*(y-two); wy[2] = -1.0*y*(y-two); wy[3] = cp5*y*(y-one); } // printf("Mohammad 2142022\n"); //linear /*wx[0] = 0.0;//cp5*x*(x-one); // polynomial coefficients along i wx[1] = -1.0*(x-two); wx[2] = 1.0*(x-one); wx[3] = 0.0;//cp167*x*(x+one)*(x-one); wy[0] = 0.0;//cp5*y*(y-one); // polynomial coefficients along j wy[1] = -1.0*(y-two); wy[2] = 1.0*(y-one); wy[3] = 0.0;//cp167*y*(y+one)*(y-one); */ //printf("case1 \n"); //int number; //scanf("%d", number); return ix + iy * ni; // point to lower left corner of 4 x 4 subarray } // version for scalar variables void intrp_bicub_yx_s(float *f, float *r, int ni, int ninj, int nk, double xx, double yy, int Step_kount){ double fd0[4], fd1[4], fd2[4], fd3[4] ; double wx[4], wy[4] ; int ni2 = ni + ni; // + 2 rows int ni3 = ni2 + ni; // + 3 rows int i, k; double x, y; int ix, iy; //int Step_n; //Step_n = Step_kount; f += bicub_coeffs_inline(xx, yy, ni, wx, wy, qminx, qmaxx, qminy, qmaxy, qoffx, qoffy, Step_kount); // use inline function /*for(k=0 ; k<nk ; k++){ for(i=0 ; i<4 ; i++){ // easily vectorizable form fd0[i] = ( f[i]*wy[0] + f[i+ni]*wy[1] + f[i+ni2]*wy[2] + f[i+ni3]*wy[3] ) * wx[i]; } r[0] = fd0[0] + fd0[1] + fd0[2] + fd0[3]; f+= ninj; r ++; }*/ //SWEEP if (Step_kount/2-Step_kount/2.0==0){//backward for(k=0 ; k<nk ; k++){ for(i=0 ; i<3 ; i++){ // easily vectorizable form fd0[i] = ( f[i]*wy[0] + f[i+ni]*wy[1] + f[i+ni2]*wy[2] ) * wx[i]; } r[0] = fd0[0] + fd0[1] + fd0[2]; f+= ninj; r ++; } } else{//forward for(k=0 ; k<nk ; k++){ for(i=1 ; i<4 ; i++){ // easily vectorizable form fd0[i] = ( f[i+ni]*wy[1] + f[i+ni2]*wy[2] + f[i+ni3]*wy[3] ) * wx[i]; } r[0] = fd0[1] + fd0[2] + fd0[3]; f+= ninj; r ++; } } } // interpolate u and v with rotation, u and v are on different grids // hence the 2 sets of positions (xxu,yyu) and (xxv,yyv) // xxu, yyu, xxv, yyv in fractional index space, same as intrp_bicub_yx_s // ru, rv : rotated U and V components of the wind at the desired point // for rotation matrix s, see the GEM model void intrp_bicub_yx_vec(float *u, float *v, float *ru, float *rv, int ni, int ninj, int nk, double xxu, double yyu, double xxv, double yyv, double s[4], int Step_kount){ //call in yyg_param_mod.F90 double ud0[4]; double vd0[4]; double wxu[4], wyu[4], wxv[4], wyv[4] ; double xu, yu, xv, yv, tu, tv; int ni2 = ni + ni; // + 2 rows int ni3 = ni2 + ni; // + 3 rows int i, k; int ixu, iyu, ixv, iyv; //int Step_n; //Step_n = Step_kount; u += bicub_coeffs_inline(xxu, yyu, ni, wxu, wyu, uminx, umaxx, uminy, umaxy, uoffx, uoffy, Step_kount); // use inline function v += bicub_coeffs_inline(xxv, yyv, ni, wxv, wyv, vminx, vmaxx, vminy, vmaxy, voffx, voffy, Step_kount); // use inline function /*for(k=0 ; k<nk ; k++){ for(i=0 ; i<4 ; i++){ // easily vectorizable form ud0[i] = ( u[i]*wyu[0] + u[i+ni]*wyu[1] + u[i+ni2]*wyu[2] + u[i+ni3]*wyu[3] ) * wxu[i]; vd0[i] = ( v[i]*wyv[0] + v[i+ni]*wyv[1] + v[i+ni2]*wyv[2] + v[i+ni3]*wyv[3] ) * wxv[i]; } tu = ud0[0] + ud0[1] + ud0[2] + ud0[3]; tv = vd0[0] + vd0[1] + vd0[2] + vd0[3]; ru[k] = tu*s[0] + tv*s[1] ; // rotate wind components rv[k] = tu*s[2] + tv*s[3] ; u+= ninj; v+= ninj; }*/ //SWEEP if (Step_kount/2-Step_kount/2.0==0){//backward for(k=0 ; k<nk ; k++){ for(i=0 ; i<3 ; i++){ // easily vectorizable form ud0[i] = ( u[i]*wyu[0] + u[i+ni]*wyu[1] + u[i+ni2]*wyu[2] ) * wxu[i]; vd0[i] = ( v[i]*wyv[0] + v[i+ni]*wyv[1] + v[i+ni2]*wyv[2] ) * wxv[i]; } tu = ud0[0] + ud0[1] + ud0[2]; tv = vd0[0] + vd0[1] + vd0[2]; ru[k] = tu*s[0] + tv*s[1] ; // rotate wind components rv[k] = tu*s[2] + tv*s[3] ; u+= ninj; v+= ninj; } } else{//forward for(k=0 ; k<nk ; k++){ for(i=1 ; i<4 ; i++){ // easily vectorizable form ud0[i] = ( u[i+ni]*wyu[1] + u[i+ni2]*wyu[2] + u[i+ni3]*wyu[3] ) * wxu[i]; vd0[i] = ( v[i+ni]*wyv[1] + v[i+ni2]*wyv[2] + v[i+ni3]*wyv[3] ) * wxv[i]; } tu = ud0[1] + ud0[2] + ud0[3]; tv = vd0[1] + vd0[2] + vd0[3]; ru[k] = tu*s[0] + tv*s[1] ; // rotate wind components rv[k] = tu*s[2] + tv*s[3] ; u+= ninj; v+= ninj; } } } // interpolate u and v with rotation, u and v on same grid, at position (xx,yy) // xx, yy in fractional index space, same as intrp_bicub_yx_s // ru, rv : rotated U and V components of the wind at the desired point // for rotation matrix s, see the GEM model void intrp_bicub_yx_uv(float *u, float *v, float *ru, float *rv, int ni, int ninj, int nk, double xx, double yy, double s[4], int Step_kount){ // call in yyg_xchng_vec_q2q.F90 double ud0[4]; double vd0[4]; double wx[4], wy[4] ; double x, y; int ni2 = ni + ni; // + 2 rows int ni3 = ni2 + ni; // + 3 rows int i, k; int ix, iy; double tu, tv; //int Step_n; //Step_n = Step_kount; i = bicub_coeffs_inline(xx, yy, ni, wx, wy, qminx, qmaxx, qminy, qmaxy, qoffx, qoffy, Step_kount); // use inline function u += i; v+= i; /*for(k=0 ; k<nk ; k++){ for(i=0 ; i<4 ; i++){ // easily vectorizable form ud0[i] = ( u[i]*wy[0] + u[i+ni]*wy[1] + u[i+ni2]*wy[2] + u[i+ni3]*wy[3] ) * wx[i]; vd0[i] = ( v[i]*wy[0] + v[i+ni]*wy[1] + v[i+ni2]*wy[2] + v[i+ni3]*wy[3] ) * wx[i]; } tu = ud0[0] + ud0[1] + ud0[2] + ud0[3]; tv = vd0[0] + vd0[1] + vd0[2] + vd0[3]; ru[k] = tu*s[0] + tv*s[1] ; // rotate wind components rv[k] = tu*s[2] + tv*s[3] ; u += ninj; v += ninj; }*/ //SWEEP if (Step_kount/2-Step_kount/2.0==0){//backward for(k=0 ; k<nk ; k++){ for(i=0 ; i<3 ; i++){ // easily vectorizable form ud0[i] = ( u[i]*wy[0] + u[i+ni]*wy[1] + u[i+ni2]*wy[2] ) * wx[i]; vd0[i] = ( v[i]*wy[0] + v[i+ni]*wy[1] + v[i+ni2]*wy[2] ) * wx[i]; } tu = ud0[0] + ud0[1] + ud0[2]; tv = vd0[0] + vd0[1] + vd0[2]; ru[k] = tu*s[0] + tv*s[1] ; // rotate wind components rv[k] = tu*s[2] + tv*s[3] ; u += ninj; v += ninj; } } else{//forward for(k=0 ; k<nk ; k++){ for(i=1 ; i<4 ; i++){ // easily vectorizable form ud0[i] = ( u[i+ni]*wy[1] + u[i+ni2]*wy[2] + u[i+ni3]*wy[3] ) * wx[i]; vd0[i] = ( v[i+ni]*wy[1] + v[i+ni2]*wy[2] + v[i+ni3]*wy[3] ) * wx[i]; } tu = ud0[0] + ud0[1] + ud0[2] + ud0[3]; tv = vd0[0] + vd0[1] + vd0[2] + vd0[3]; ru[k] = tu*s[0] + tv*s[1] ; // rotate wind components rv[k] = tu*s[2] + tv*s[3] ; u += ninj; v += ninj; } } } // "monotonic" version, the result must be between the max and min values of the 2x2 center // of the 4x4 subarray used to interpolate // SIMD version now consistent with non SIMD version void intrp_bicub_yx_s_mono(float *f, float *r, int ni, int ninj, int nk, double xx, double yy, int Step_kount){ //call in yyg_param_mod.F90 double fd0[4], fd1[4], fd2[4], fd3[4] ; double wx[4], wy[4] ; double x, y, tm1, tm2; float minval, maxval; int ni2 = ni + ni; // + 2 rows int ni3 = ni2 + ni; // + 3 rows int i, k; int ix, iy; //int Step_n; //Step_n = Step_kount; f += bicub_coeffs_inline(xx, yy, ni, wx, wy, qminx, qmaxx, qminy, qmaxy, qoffx, qoffy, Step_kount); // use inline function for(k=0 ; k<nk ; k++){ minval = f[1]; maxval = minval; for(i=1 ; i<3 ; i++){ // compute monotonic limits, center 2x2 points minval = (f[i+ni ] < minval) ? f[i+ni ] : minval; minval = (f[i+ni2] < minval) ? f[i+ni2] : minval; maxval = (f[i+ni ] > maxval) ? f[i+ni ] : maxval; maxval = (f[i+ni2] > maxval) ? f[i+ni2] : maxval; } /*for(i=0 ; i<4 ; i++){ // easily vectorizable form fd0[i] = ( f[i]*wy[0] + f[i+ni]*wy[1] + f[i+ni2]*wy[2] + f[i+ni3]*wy[3] ) * wx[i]; } r[0] = fd0[0] + fd0[1] + fd0[2] + fd0[3]; */ //SWEEP if (Step_kount/2-Step_kount/2.0==0){//backward for(i=0 ; i<3 ; i++){ // easily vectorizable form fd0[i] = ( f[i]*wy[0] + f[i+ni]*wy[1] + f[i+ni2]*wy[2] ) * wx[i]; } r[0] = fd0[0] + fd0[1] + fd0[2]; } else{//forward for(i=1 ; i<4 ; i++){ // easily vectorizable form fd0[i] = ( f[i+ni]*wy[1] + f[i+ni2]*wy[2] + f[i+ni3]*wy[3] ) * wx[i]; } r[0] = fd0[1] + fd0[2] + fd0[3]; } r[0] = (r[0] < minval ) ? minval : r[0] ; // apply monotonic limits r[0] = (r[0] > maxval ) ? maxval : r[0] ; f += ninj; r ++; } } #endif