! This is a rather silly situation: the velocity components u,v need
! to be rotated, but only one of them is needed as the output. So the
! two versions are generated by CPP; they differ by having _u_ or _v_
! in the middle of the subroutine name, and by selecting one rotation
! line or the other. (Note that both lines cannot coexist because the
! first one overwrites u).


! Select the actual subroutine names for this preprocessing pass: the
! TYPE placeholder in both names becomes u when VCOMP is undefined and
! v when VCOMP is defined.
#ifndef VCOMP
# define bry_rotate_TYPE_in_place   bry_rotate_u_in_place
# define bry_rotate_TYPE_inp_thread bry_rotate_u_inp_thread
#else
# define bry_rotate_TYPE_in_place   bry_rotate_v_in_place  
# define bry_rotate_TYPE_inp_thread bry_rotate_v_inp_thread
#endif


      subroutine bry_rotate_TYPE_in_place(ncx,N, csA,snA, u,v)
!
! Driver: opens an OpenMP parallel region and lets every thread call
! the matching inp_thread worker on the same shared arguments. When
! compiled without OpenMP the directives are plain comments and the
! worker is called exactly once.
!
!   ncx      extent of the first index of u,v; length of csA,snA
!   N        extent of the second index of u,v
!   csA,snA  rotation coefficients, one pair per first-index value
!   u,v      single-precision fields handed to the worker unchanged
!
      implicit none
      integer ncx
      integer N
      real(kind=8) csA(ncx)
      real(kind=8) snA(ncx)
      real(kind=4) u(ncx,N)
      real(kind=4) v(ncx,N)

C$OMP PARALLEL SHARED(ncx,N, csA,snA, u,v)
      call bry_rotate_TYPE_inp_thread(ncx,N, csA,snA, u,v)
C$OMP END PARALLEL
      end subroutine bry_rotate_TYPE_in_place

      subroutine bry_rotate_TYPE_inp_thread(ncx,N, csA,snA, u,v)
!
! Rotate one velocity component in place over the slice of the first
! array index that belongs to the calling thread.
!
!   ncx      extent of the first index of u,v; length of csA,snA
!   N        extent of the second index of u,v
!   csA,snA  rotation coefficients, one pair per first-index value
!   u,v      velocity components; without VCOMP only u is rewritten,
!            with VCOMP only v is rewritten
!
! Without OpenMP the whole range 1:ncx is processed. With OpenMP the
! conditional lines give each thread a contiguous chunk of
! ceil(ncx/nthreads) indices; a thread whose chunk would start past
! ncx ends up with an empty range and does nothing.
!
      implicit none
      integer ncx,N
      real(kind=8) csA(ncx),snA(ncx)
      real(kind=4) u(ncx,N),v(ncx,N)
      real(kind=8) ca,sa
      integer ilo,ihi, nhalf,ipass,ibeg,ifin, i,k
      integer nthr,me,nchunk
C$    integer omp_get_num_threads, omp_get_thread_num

! Serial default: the slice is the full index range.
      ilo=1
      ihi=ncx
C$    nthr=omp_get_num_threads()
C$    me=omp_get_thread_num()
C$    nchunk=(ncx+nthr-1)/nthr
C$    ilo=1+me*nchunk
C$    ihi=min(ilo+nchunk-1, ncx)

! The slice is traversed as two consecutive halves; nhalf is the half
! length rounded up, so for an odd count the second half is one
! shorter. NOTE(review): the reason for the split is not stated here.
      nhalf=(ihi-ilo+2)/2
      do ipass=0,1
        ibeg=ilo+ipass*nhalf
        ifin=min(ibeg+nhalf-1, ihi)
        do i=ibeg,ifin
! Coefficients depend on i only, so fetch them once per i.
          ca=csA(i)
          sa=snA(i)
          do k=1,N
! Products are formed in kind=8 and stored back into kind=4.
#ifndef VCOMP
            u(i,k)=u(i,k)*ca +v(i,k)*sa
#else
            v(i,k)=v(i,k)*ca -u(i,k)*sa
#endif
          end do
        end do
      end do
      end subroutine bry_rotate_TYPE_inp_thread

! Second pass: on the first pass VCOMP is still undefined, so define
! it, drop the u-flavoured names and include the source again so that
! the v-flavoured pair of subroutines is generated as well. On that
! second pass VCOMP is defined and this section is skipped, which
! stops the recursion.
! NOTE(review): assumes r2r_bry_rotate.F is this very file - confirm.
#ifndef VCOMP
# define VCOMP
# undef bry_rotate_TYPE_in_place
# undef bry_rotate_TYPE_inp_thread
# include "r2r_bry_rotate.F"
#endif

