module minimize
  implicit none 
  save
!.....number of convergence criteria achieved
  integer:: numtol = 1
  
!.....penalty: lasso or ridge or smooth
  character(len=128):: cpena= 'none'
  character(len=128):: clinmin= 'backtrack'
  character(len=128):: cfsmode= 'grad'  ! [grad,grad0corr,df0corr]
  real(8):: pwgt = 1d-15

!.....SGD parameters
  integer:: nsgdbsize = 1
  integer:: nsgdbsnode = 1
  integer,allocatable:: ismask(:)
  character(len=128):: csgdupdate = 'normal'
  real(8):: sgd_rate_ini = 0.001d0
  real(8):: sgd_rate_fin = -0.001d0
  real(8):: sgd_eps = 1.0d-8
  real(8):: adam_b1 = 0.9d0
  real(8):: adam_b2 = 0.999d0

!.....Group FS inner loop
  integer:: ninnergfs=100
  character(len=128):: cread_fsmask = ''
  character(len=128):: cfs_xrefresh = 'random' ! [zero, random, none]
  integer:: maxfsrefresh = 2

!.....Max iteration for line minimization
  integer:: niter_linmin   = 15
!.....Decreasing factor, should be < 1.0
  real(8):: fac_dec        = 0.2d0
!.....Increasing factor, should be > 1.0
  real(8):: fac_inc        = 5.0d0
!.....Armijo parameters
  real(8):: armijo_xi      = 1.0d-4
  real(8):: armijo_tau     = 0.5d0
  integer:: armijo_maxiter = 15

!.....Simulated annealing parameters
  real(8):: sa_temp0 = 1d0
  real(8):: sa_tau   = 10d0
  real(8):: sa_xw0   = 1d-3
  real(8):: sa_fctr  = 0.5d0
  real(8),allocatable:: sa_xws(:)
!.....T control method
!.....  - linear
!.....  - exp
!.....  - best
!.....  - constant
  character(len=128):: sa_tctrl = 'best'
  real(8):: sa_div_best = 10d0

!.....Metadynamics
  real(8):: md_height = 1d0
  real(8):: md_sigma  = 1d0
!.....Max num of gaussian potentials
  integer:: md_ng = 1000
  real(8),allocatable:: md_gp(:,:)

!.....CG
  integer:: icgbtype = 1 ! 1:FR, 2:PRP, 3:HS, 4:DY

!.....L-BFGS
  integer:: mstore   = 10

!.....Genetic Algorithm variables.......................................
  real(8):: ga_rate_mutate = 0.1
  integer:: ga_nindivs = 10
  integer:: ga_nbits = 16
  integer:: ga_noffsp = 0
  real(8):: ga_temp = 1d0
  integer:: ga_ngenes
  character(len=128):: ga_fitness = 'inv'
  
  type gene  ! A Gene corresponds to a parameter/variable to be optimized
    integer(2),allocatable:: bits(:)
    real(8):: val
    real(8):: vmax,vmin
  end type gene

  type individual  ! An individual is a set of parameters
    integer:: iid  ! ID for this individual
    type(gene),allocatable:: genes(:)
    real(8):: fvalue  ! Loss function value of the individual
    real(8):: ftst    ! Loss function value for test set of the individual.
    real(8):: fitness ! Fittness value of the individual
    real(8),allocatable:: vel(:)
  end type individual

!.....Differential evolution variables..................................
  character(len=128):: de_fitness = 'inv'
  character(len=128):: de_algo = 'local_neigh'
  integer:: de_nindivs = 10
  real(8):: de_frac    = 1d0
  real(8):: de_lambda  = -1d0
  real(8):: de_cross_rate = 0.5d0
  real(8):: de_wmin    = 0.4d0
  real(8):: de_wmax    = 0.8d0
  real(8):: de_temp    = 1d0

!.....Particle swarm optimization variables.............................
  integer:: pso_nindivs = 10
  real(8):: pso_w       = 0.99d0
  real(8):: pso_c1      = 2d0
  real(8):: pso_c2      = 2d0
  real(8):: pso_vinimax = 0.1d0

  real(8):: fupper_lim = 1d+5
  real(8),allocatable:: ranges(:,:)

contains
!=======================================================================
  subroutine set_ranges(ndim,xranges)
    implicit none
    integer,intent(in):: ndim
    real(8),intent(in):: xranges(2,ndim)

    if( .not. allocated(ranges) ) allocate(ranges(2,ndim))
    if( size(ranges).ne.2*ndim ) then
      deallocate(ranges)
      allocate(ranges(2,ndim))
    endif
    
    ranges(1:2,1:ndim) = xranges(1:2,1:ndim)
    return
  end subroutine set_ranges
!=======================================================================
  subroutine wrap_ranges(ndim,x,xranges)
    implicit none
    integer,intent(in):: ndim
    real(8),intent(in):: xranges(2,ndim)
    real(8),intent(inout):: x(ndim)

    integer:: i

!!$    if( .not.allocated(ranges) ) then
!!$      print *,'ERROR: ranges is not allocated yet...'
!!$      stop
!!$    endif

    do i=1,ndim
      if( x(i).lt.xranges(1,i) ) then
        x(i) = xranges(1,i)
      else if( x(i).gt.xranges(2,i) ) then
        x(i) = xranges(2,i)
      endif
    enddo
    return
  end subroutine wrap_ranges
!=======================================================================
  subroutine write_status(ionum,myid,iprint,cpena,iter,ninner &
       ,ftrn,ftst,pval,xnorm,gnorm,dxnorm,fprev)
    integer,intent(in):: ionum,myid,iprint,iter,ninner
    character(len=128),intent(in):: cpena
    real(8),intent(in)::ftrn,ftst,pval,xnorm,gnorm,dxnorm,fprev
    
    if( myid.eq.0 ) then
      if( iprint.ge.1 ) then
        if( trim(cpena).eq.'lasso' .or. trim(cpena).eq.'glasso' &
             .or. trim(cpena).eq.'ridge' ) then
          write(6,'(a,i5,i4,2es13.5,5es12.4)') &
               ' iter,ninner,ftrn,ftst,penalty,|x|,|g|,|dx|,|df|=' &
               ,iter,ninner,ftrn-pval,ftst &
               ,pval,xnorm,gnorm,dxnorm,abs(ftrn-fprev)
        else
          write(6,'(a,i5,i4,2es13.5,4es12.4,i4)') &
               ' iter,ninner,ftrn,ftst,|x|,|g|,|dx|,|df|=' &
               ,iter,ninner,ftrn,ftst,xnorm,gnorm,dxnorm,abs(ftrn-fprev)
        endif
        call flush(6)
      endif
    endif
  end subroutine write_status
!=======================================================================
  subroutine check_converge(myid,iprint,xtol,gtol,ftol &
       ,dxnorm,gnorm,fdiff,nxtol,ngtol,nftol,iflag,lconverged)
    integer,intent(in):: myid,iprint
    real(8),intent(in):: xtol,gtol,ftol,dxnorm,gnorm,fdiff
    integer,intent(inout):: nxtol,ngtol,nftol,iflag
    logical,intent(out):: lconverged

    lconverged = .false.
    if( myid.eq.0 .and. iprint.gt.1 ) then
      print '(a,3es12.4)','  dxnorm,gnorm,fdiff=',dxnorm,gnorm,fdiff
    endif
    if( dxnorm.lt.xtol ) then
      nxtol = nxtol +1
      ngtol = 0
      nftol = 0
      if( nxtol.ge.numtol ) then
        if( myid.eq.0 .and. iprint.gt.0 ) then
          print '(a,i0,a)',' >>> QN converged because xdiff < xtol over ' &
               ,numtol,' times.'
          write(6,'(a,2es13.5)') '   dxnorm,xtol=',dxnorm,xtol
        endif
        iflag= iflag +1
        lconverged = .true.
!!$        x0(1:ndim)= x(1:ndim)
!!$        maxiter = iter
        return
      endif
    else if( gnorm.lt.gtol ) then
      ngtol = ngtol +1
      nxtol = 0
      nftol = 0
      if( ngtol.ge.numtol ) then
        if( myid.eq.0 ) then
          print '(a,i0,a)',' >>> QN converged because gdiff < gtol over ' &
               ,numtol,' times.'
          write(6,'(a,2es13.5)') '   gnorm,gtol=',gnorm,gtol
        endif
        iflag= iflag +2
        lconverged = .true.
!!$        x0(1:ndim)= x(1:ndim)
!!$        maxiter = iter
        return
      endif
    else if( fdiff.lt.ftol) then
      nftol= nftol +1
      nxtol = 0
      ngtol = 0
      if( nftol.ge.numtol ) then
        if( myid.eq.0 ) then
          print '(a,i0,a)',' >>> QN converged because fdiff < ftol over ' &
               ,numtol,' times.'
          write(6,'(a,2es13.5)') '   fdiff,ftol=',fdiff, ftol
        endif
        iflag= iflag +3
        lconverged = .true.
!!$        x0(1:ndim)= x(1:ndim)
!!$        maxiter = iter
        return
      endif
    else
      nxtol = 0
      ngtol = 0
      nftol = 0
    endif
    return
  end subroutine check_converge
!=======================================================================
  subroutine steepest_descent(ndim,x0,f,g,d,xranges,xtol,gtol,ftol,maxiter &
       ,iprint,iflag,myid,func,grad,cfmethod,niter_eval,sub_eval)
    implicit none
    integer,intent(in):: ndim,iprint,myid,niter_eval
    integer,intent(inout):: iflag,maxiter
    real(8),intent(in):: xtol,gtol,ftol,xranges(2,ndim)
    real(8),intent(inout):: f,x0(ndim),g(ndim),d(ndim)
    character(len=*),intent(in):: cfmethod
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
      subroutine grad(n,x,gtrn)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: gtrn(n)
      end subroutine grad
      subroutine sub_eval(iter)
        integer,intent(in):: iter
      end subroutine sub_eval
    end interface

    integer:: iter,niter,nxtol,ngtol,nftol
    real(8):: alpha,fp,gnorm,dxnorm,vnorm,ftst,pval
    real(8),save,allocatable:: gpena(:),dx(:),xp(:),x(:) &
         ,s(:),y(:),gprev(:)
    logical:: lconverged = .false. 

    if( myid.eq.0 ) then
      print *,''
      print *, '********************** Steepest Descent (SD) '&
           //  '**********************'
    endif

    if( .not.allocated(gpena) ) then
      allocate(gpena(ndim),dx(ndim),xp(ndim),x(ndim) &
           ,s(ndim),y(ndim),gprev(ndim))
    endif

    iter= 0
    niter = 0
    x(:) = x0(:)
    call wrap_ranges(ndim,x,xranges)
    call func(ndim,x,f,ftst)
    call grad(ndim,x,g)
    call penalty(cpena,ndim,pval,gpena,x)
    f = f +pval
    if( trim(cpena).eq.'ridge' ) g(:) = g(:) +gpena(:)
    gnorm= sqrt(sprod(ndim,g,g))
    vnorm= sqrt(sprod(ndim,x,x))
    d(1:ndim)= -g(1:ndim)
    call write_status(6,myid,iprint,cpena,iter,niter &
         ,f,ftst,pval,vnorm,gnorm,dxnorm,f)

    alpha = 1d0
    do iter=1,maxiter
      fp= f
      xp(:) = x(:)
!.....line minimization
      if( trim(clinmin).eq.'quadratic' ) then
        call quad_interpolate(ndim,x,d,f,ftst,xtol,gtol,ftol,alpha &
             ,iprint,iflag,myid,func)
!.....if quad interpolation failed, perform golden section
        if( iflag/100.ne.0 ) then
          iflag= iflag -(iflag/100)*100
          if(myid.eq.0) then
            print *,'since quad_interpolate failed, call golden_section.'
          endif
          call golden_section(ndim,x,d,f,ftst,xtol,gtol,ftol,alpha &
               ,iprint,iflag,myid,func)
        endif
      else if ( trim(clinmin).eq.'golden') then
        call golden_section(ndim,x,d,f,ftst,xtol,gtol,ftol,alpha &
             ,iprint,iflag,myid,func)
      else if( trim(clinmin).eq.'two-point' ) then
!.....Like backtrack, but once alpha is defined, not to perform line minimization
!     no matter the func value gets larger.
!.....Currently this does NOT work well...
        if( iter.le.1 ) then  ! only 1st call, perform line minimization a bit
          call backtrack(ndim,x,xranges,d,f,ftst,alpha,iprint &
               ,iflag,myid,func,niter)
        else
          alpha = sprod(ndim,s,s)/sprod(ndim,s,y)
          alpha = max(min(alpha,1d0),xtol)
        endif
      else if( trim(clinmin).eq.'armijo' ) then
!!$        alpha = min(max(alpha,xtol*2d0)*2d0, 1d0)
!!$        alpha = max(alpha,xtol/gnorm)*2d0
        alpha = alpha *fac_inc
        call armijo_search(ndim,x,xranges,d,f,ftst,g,alpha,iprint &
             ,iflag,myid,func,niter)
      else ! Default = backtrack
!.....Increase alpha a bit every step,
!.....alpha is to be decreased in subroutine backtrack to decrease func value.
!!$        alpha = min(max(alpha,xtol*2d0)*2d0, 1d0)
!!$        alpha = max(alpha,xtol/gnorm)*2d0
        alpha = alpha *fac_inc
        call backtrack(ndim,x,xranges,d,f,ftst,alpha,iprint &
             ,iflag,myid,func,niter)
      endif
      if( iflag/100.ne.0 ) then
        if( myid.eq.0 ) then
          print *,'ERROR: iflag/100.ne.0 in SD !!!'
        endif
        return
      endif
!.....evaluate statistics at every niter_eval
      if( mod(iter,niter_eval).eq.0 ) &
           call sub_eval(iter)
!.....Update x
      if( trim(cpena).eq.'lasso' .or. trim(cpena).eq.'glasso' ) then
        call soft_threshold(ndim,x,d,alpha)
      else
        x(1:ndim)= x(1:ndim) +alpha*d(1:ndim)
      endif
      s(1:ndim) = alpha*d(1:ndim)
      dx(:) = x(:) -xp(:)
      xp(:) = x(:)
      dxnorm = sqrt(sprod(ndim,dx,dx))
      call wrap_ranges(ndim,x,xranges)
      gprev(:) = g(:)
      call grad(ndim,x,g)
      call penalty(cpena,ndim,pval,gpena,x)
      if( trim(cpena).eq.'ridge' ) g(:) = g(:) +gpena(:)
      y(:) = g(:) -gprev(:)
      gnorm= sqrt(sprod(ndim,g,g))
      d(1:ndim)= -g(1:ndim)
!!$      g(1:ndim)= -g(1:ndim)/gnorm
!!$      gnorm= gnorm/ndim
      vnorm= sqrt(sprod(ndim,x,x))
      call write_status(6,myid,iprint,cpena,iter,niter &
           ,f,ftst,pval,vnorm,gnorm,dxnorm,fp)
      call check_converge(myid,iprint,xtol,gtol,ftol &
           ,dxnorm,gnorm,abs(f-fp),nxtol,ngtol,nftol,iflag,lconverged)
      if( lconverged ) then
        x0(:) = x(:)
        maxiter = iter
        return
      endif
    enddo

    x0(:) = x(:)
    return
  end subroutine steepest_descent
!=======================================================================
  subroutine sgd(ndim,x0,f,g,u,xranges,xtol,gtol,ftol,maxiter,iprint &
       ,iflag,myid,mpi_world,mynsmpl,myntrn,isid0,isid1 &
       ,func,grad,cfmethod,niter_eval,sub_eval)
!
! Stochastic gradient decent (SGD)
!
    integer,intent(in):: ndim,iprint,niter_eval,myid,mpi_world, &
         mynsmpl,myntrn,isid0,isid1
    integer,intent(inout):: iflag,maxiter
    real(8),intent(in):: xtol,gtol,ftol,xranges(2,ndim)
    real(8),intent(inout):: f,x0(ndim),g(ndim),u(ndim)
    character(len=*),intent(in):: cfmethod
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
      subroutine grad(n,x,gtrn)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: gtrn(n)
      end subroutine grad
      subroutine sub_eval(iter)
        integer,intent(in):: iter
      end subroutine sub_eval
    end interface
    include 'mpif.h'
    real(8),parameter:: tiny  = 1d-14

    integer:: i,ismpl,iter,niter,nftol,ngtol,nxtol
    integer:: ninnerstp,innerstp,ierr,itmp
    real(8):: gnorm,xnorm,dxnorm,pval,sgd_rate,sgd_ratei
    real(8):: fp,ftmp,ftst,ftsttmp
    real(8):: rate_upper, rate_lower
    real(8),allocatable:: x(:),dx(:),rm(:),rmh(:),gpena(:),gtmp(:) &
         ,gp(:),v(:),vh(:),xp(:)
    integer,allocatable:: imaskarr(:)
    logical:: lconverged

    if( .not.allocated(x) ) then
      nsgdbsize = min(nsgdbsnode,myntrn)
      if(myid.eq.0) then
        print *,''
        print *, '************************ Stochastic gradient descent (SGD) '&
             //'************************'
        print *,'   Update method: ',trim(csgdupdate)
        if( trim(csgdupdate).ne.'normal' .and. trim(csgdupdate).ne.'adam' &
             .and. trim(csgdupdate).ne.'adabound' ) then
          if( myid.eq.0 ) then
            print *,'   WARNING: update method '//trim(csgdupdate)//' is not available.'
            print *,'            Use normal sgd instead...'
          endif
        endif
        print '(a,i0)','    SGD batch size per node = ',nsgdbsize
        if( trim(csgdupdate).eq.'adam' ) then
          print '(a,es11.3)','    epsilon value = ',sgd_eps
          print '(a,es11.3)','    beta1 = ',adam_b1
          print '(a,es11.3)','    beta2 = ',adam_b2
        else if( trim(csgdupdate).eq.'adabound' ) then
          print '(a,es11.3)','    epsilon value = ',sgd_eps
          print '(a,es11.3)','    beta1 = ',adam_b1
          print '(a,es11.3)','    beta2 = ',adam_b2
        endif
        if( sgd_rate_fin.le.0d0 ) then
          print '(a,es11.3)','    learning rate = ',sgd_rate_ini
        else
          print '(a,es11.3)','    initial learning rate = ',sgd_rate_ini
          print '(a,es11.3)','    final learning rate  = ',sgd_rate_fin
        endif
        print *,''
      endif
      allocate(x(ndim),dx(ndim),rm(ndim),rmh(ndim),gpena(ndim) &
           ,gp(ndim),gtmp(ndim),v(ndim),vh(ndim),xp(ndim))
      allocate(ismask(isid0:isid1))
      sgd_rate = sgd_rate_ini
    endif

!.....Initialization
    nftol= 0
    ngtol= 0
    nxtol= 0
    rm(:) = 0d0
    v(:) = 0d0
    x(1:ndim)= x0(1:ndim)
    allocate(imaskarr(mynsmpl))
    ninnerstp = myntrn /nsgdbsize
    if( mod(myntrn,nsgdbsize) .ne. 0 ) ninnerstp = ninnerstp + 1
    itmp = ninnerstp
    call mpi_allreduce(itmp,ninnerstp,1,mpi_integer,mpi_max,mpi_world,ierr)

!.....Unset mask to compute all the samples at the first evaluation
    ismask(:) = 0
    call wrap_ranges(ndim,x0,xranges)
    call func(ndim,x0,f,ftst)
    call grad(ndim,x0,g)
    call penalty(cpena,ndim,pval,gpena,x0)
    f = f + pval
    gnorm= sqrt(sprod(ndim,g,g))
    xnorm= sqrt(sprod(ndim,x,x))
    dxnorm = 0d0

    iter= 0
    niter = 0
    call write_status(6,myid,iprint,cpena,iter,ninnerstp &
         ,f,ftst,pval,xnorm,gnorm,dxnorm,f)

    call sub_eval(0)

!.....One iteration includes evaluation of all the training data.
    do iter=1,maxiter
      fp = f
      gp(:) = g(:)
      xp(:) = x(:)
      
!.....Gradual increasing/descreasing learning rate if sgd_rate_fin is given
      if( sgd_rate_fin .gt. 0d0 ) then
        sgd_rate = sgd_rate_ini +(sgd_rate_fin -sgd_rate_ini)/maxiter *(iter-1)
      endif

!.....Make imaskarr that contains the order of samples to be computed in each innerstp
      imaskarr(:) = 0
      call get_order_iarr(myntrn,nsgdbsize,imaskarr)
!!$      print *,'myid,iter,imaskarr(:)=',myid,iter,imaskarr(:)

!.....Inner loop for batch process
      do innerstp = 1,ninnerstp
!.....Unmask only the samples whose imaskarr(i)==innerstp
        ismask(:) = 1
        do i=1,myntrn  ! All the test samples remain masked
          ismpl = isid0 + i -1
          if( imaskarr(i).eq.innerstp ) then
            ismask(ismpl) = 0
!!$            print *,'myid,ismpl= ',myid,ismpl
          endif
        enddo

        call wrap_ranges(ndim,x,xranges)
        call func(ndim,x,f,ftst)
        call grad(ndim,x,g)
        call penalty(cpena,ndim,pval,gpena,x)
        f = f + pval
        gnorm= sqrt(sprod(ndim,g,g))
!!$        print *,'myid,innerstp,gnorm=',myid,innerstp,gnorm

!.....Compute step size of x
        if( trim(csgdupdate).eq.'adam' .or. trim(csgdupdate).eq.'Adam' ) then
          rm(:) = adam_b1*rm(:) +(1d0 -adam_b1)*g(:)
          v(:) = adam_b2*v(:) +(1d0 -adam_b2)*g(:)*g(:)
          rmh(:) = rm(:)/(1d0-adam_b1**iter)
          vh(:) = v(:)/(1d0-adam_b2**iter)
          dx(:) = -sgd_rate*rmh(:)/(sqrt(vh(:)) +sgd_eps)
        else if( trim(csgdupdate).eq.'adabound' ) then
          rm(:) = adam_b1*rm(:) +(1d0 -adam_b1)*g(:)
          v(:) = adam_b2*v(:) +(1d0 -adam_b2)*g(:)*g(:)
          rmh(:) = rm(:)/(1d0-adam_b1**iter)
          vh(:) = v(:)/(1d0-adam_b2**iter)
          rate_lower = sgd_rate *(1d0 -1d0/((1d0 -adam_b2)*iter+1d0))
          rate_upper = sgd_rate *(1d0 +1d0/((1d0 -adam_b2)*iter))
          if( iprint.gt.1 ) print '(a,i6,2es12.4)','iter,lower,upper=',iter,rate_lower,rate_upper
          do i=1,ndim
            sgd_ratei = sgd_rate/(sqrt(vh(i))+sgd_eps)
            sgd_ratei = min(max(rate_lower,sgd_ratei),rate_upper) !/sqrt(dble(iter))
            dx(i) = -sgd_ratei*rmh(i)
          enddo
        else  ! normal SGD
          if( gnorm/xnorm .gt. 1d0 ) g(:) = g(:) /gnorm *xnorm
          dx(:) = -sgd_rate *g(:)
        endif

!.....Update x
        if( trim(cpena).eq.'ridge' ) dx(:) = dx(:) -gpena(:)
        if( trim(cpena).eq.'lasso' .or. trim(cpena).eq.'glasso' ) then
          call soft_threshold(ndim,x,dx,1d0)
        else
          x(1:ndim)= x(1:ndim) +dx(1:ndim)
        endif
        call wrap_ranges(ndim,x,xranges)
        call penalty(cpena,ndim,pval,gpena,x)
      enddo  ! innerstp

      dx(:) = x(:) -xp(:)
      dxnorm = sqrt(sprod(ndim,dx,dx))
      xnorm= sqrt(sprod(ndim,x,x))
      gnorm= sqrt(sprod(ndim,g,g))
!!$      print *,'myid,iter,gnorm=',myid,iter,gnorm

!.....Evaluate statistics at every niter_eval.
      if( mod(iter,niter_eval).eq.0 ) then
!.....Before the output, compute all the remaining samples
        do ismpl=isid0,isid1
          ismask(ismpl) = mod(ismask(ismpl)+1,2)
        enddo
        call func(ndim,x,f,ftst)
        call grad(ndim,x,g)
        gnorm= sqrt(sprod(ndim,g,g))
        call sub_eval(iter)
      endif
      call write_status(6,myid,iprint,cpena,iter,ninnerstp &
           ,f,ftst,pval,xnorm,gnorm,dxnorm,fp)
      call check_converge(myid,iprint,xtol,gtol,ftol &
           ,dxnorm,gnorm,abs(f-fp),nxtol,ngtol,nftol,iflag,lconverged)
      if( lconverged ) then
        x0(:) = x(:)
        maxiter = iter
        deallocate(x,dx,rm,rmh,gpena)
        return
      endif
    enddo

    x0(1:ndim)= x(1:ndim)
    deallocate(x,dx,rm,rmh,gpena,imaskarr)
    return
  end subroutine sgd
!=======================================================================
  subroutine get_order_iarr(ndim,nbatch,iarr)
!
!  Create an array of length ndim that has the order of samples to be computed.
!
    use random
    integer,intent(in):: ndim,nbatch
    integer,intent(out):: iarr(ndim)
    integer:: jarr(ndim)
    integer:: i,j,k,l,m,inc,idx

    do i=1,ndim
      jarr(i) = i
    enddo
    inc = 0
    l=ndim
    do while(.true.)
      inc = inc + 1
      if( l.le.nbatch ) then
        do k=1,l
          idx = jarr(k)
          iarr(idx) = inc
        enddo
        exit
      else  ! if not l.le.nbatch
        do j=1,nbatch
          k = l*urnd() +1
          idx = jarr(k)
          iarr(idx) = inc
          do m=k,l-1
            jarr(m) = jarr(m+1)
          enddo
          l= l-1
        enddo
      endif
    end do
    return
  end subroutine get_order_iarr
!=======================================================================
  subroutine get_uniq_iarr(n,m,iarr)
!
! Create an array with length m which includes a unique set of integers
! randomly chosen from from 1 to n.
!
    use random
    integer,intent(in):: n,m
    integer,intent(out):: iarr(m)
    integer:: jarr(n)
    integer:: i,j,k,l

    do i=1,n
      jarr(i) = i
    enddo
    l=n
    do i=1,m
      j = l*urnd()+1
      iarr(i)= jarr(j)
      do k=j,l-1
        jarr(k)= jarr(k+1)
      enddo
      l= l-1
    enddo
    return
  end subroutine get_uniq_iarr
!=======================================================================
  subroutine cg(ndim,x0,f,g,u,xranges,xtol,gtol,ftol,maxiter,iprint,iflag,myid &
       ,func,grad,cfmethod,niter_eval,sub_eval)
!
!  Conjugate gradient minimization
!
    integer,intent(in):: ndim,iprint,myid,niter_eval
    integer,intent(inout):: iflag,maxiter
    real(8),intent(in):: xtol,gtol,ftol,xranges(2,ndim)
    real(8),intent(inout):: f,x0(ndim),g(ndim),u(ndim)
    character(len=*),intent(in):: cfmethod
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
      subroutine grad(n,x,gtrn)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: gtrn(n)
      end subroutine grad
      subroutine sub_eval(iter)
        integer,intent(in):: iter
      end subroutine sub_eval
    end interface
    real(8),parameter:: xtiny  = 1d-14
    real(8),parameter:: t_DL   = 0.2d0
    logical:: ltwice = .false.

    integer:: iter,nftol,ngtol,nxtol,niter
    real(8):: alpha,fp,gnorm,gnormp,vnorm,beta,pval,sgnorm,ftst,dxnorm
    real(8),save,allocatable:: gpena(:),gp(:),y(:),xp(:),s(:),dx(:),uu(:),x(:)
    logical:: lconverged = .false. 

    if( myid.eq.0 ) then
      print *,''
      print *, '********************** Conjugate Gradient (CG) '&
           //  '**********************'
    endif

    if( .not.allocated(gpena) ) allocate(gpena(ndim),gp(ndim)&
         ,y(ndim),xp(ndim),s(ndim),dx(ndim),uu(ndim),x(ndim))

!.....Initialize alpha (line minimization factor)
    alpha = 1d0
    
    iter= 0
    niter = 0
    nftol= 0
    nxtol= 0
    gpena(1:ndim)= 0d0
    x(:) = x0(:)
    call wrap_ranges(ndim,x,xranges)
    call func(ndim,x,f,ftst)
    call grad(ndim,x,g)
!.....penalty
    call penalty(cpena,ndim,pval,gpena,x)
    f = f + pval
    if( trim(cpena).eq.'ridge' ) g(1:ndim)= g(1:ndim) +gpena(1:ndim)
    gnorm= sprod(ndim,g,g)
    sgnorm= sqrt(gnorm)
    vnorm= sqrt(sprod(ndim,x,x))
    call write_status(6,myid,iprint,cpena,iter,niter &
         ,f,ftst,pval,vnorm,sgnorm,dxnorm,f)
    u(1:ndim)= -g(1:ndim)

    call sub_eval(0)
    do iter=1,maxiter
      fp= f
      xp(1:ndim)= x(1:ndim)
!!$!.....normalize u-vector only for line search
!!$      unorm = sqrt(sprod(ndim,u,u))
!!$      uu(1:ndim) = u(1:ndim)/unorm
!.....line minimization
      if( trim(clinmin).eq.'quadratic' ) then
        call quad_interpolate(ndim,x,u,f,ftst,xtol,gtol,ftol &
             ,alpha,iprint,iflag,myid,func)
!.....if quad interpolation failed, perform golden section
        if( iflag/100.ne.0 ) then
          iflag= iflag -(iflag/100)*100
          if(myid.eq.0) then
            print *,'since quad_interpolate failed, call golden_section.'
          endif
          call golden_section(ndim,x,u,f,ftst,xtol,gtol,ftol &
               ,alpha,iprint,iflag,myid,func)
        endif
      else if ( trim(clinmin).eq.'golden') then
        call golden_section(ndim,x,u,f,ftst,xtol,gtol,ftol,alpha &
             ,iprint,iflag,myid,func)
      else if( trim(clinmin).eq.'armijo' ) then
!.....To enhance the convergence in Armijo search,
!.....use the history of previous alpha by multiplying 2
!.....avoiding constant decrease, but alpha should not be greater than 1.
!!$        alpha = min(max(alpha,xtol*2d0)*2d0, 1d0)
!!$        alpha = max(alpha,xtol/gnorm)*2d0
        alpha = alpha *fac_inc
        call armijo_search(ndim,x,xranges,u,f,ftst,g,alpha,iprint &
             ,iflag,myid,func,niter)
      else ! backtrack (default)
!!$        alpha = min(max(alpha,xtol*2d0)*2d0, 1d0)
!!$        alpha = max(alpha,xtol/gnorm)*2d0
        alpha = alpha *fac_inc
        call backtrack(ndim,x,xranges,u,f,ftst,alpha,iprint &
             ,iflag,myid,func,niter)
      endif

      if( iflag/100.ne.0 ) then
        if( ltwice ) then
          if(myid.eq.0) then
            print *,'>>> Line search failed twice continuously.'
          endif
          return
        else
          ltwice= .true.
          if(myid.eq.0) then
            print *,'>>> Initialize gg because alpha was not found.'
          endif
          alpha = 1d0
          u(1:ndim)= -g(1:ndim)
          f= fp
          iflag= iflag -100*(iflag/100)
          cycle
        endif
      else
        ltwice=.false.
      endif
!.....evaluate statistics at every niter_eval
      if( mod(iter,niter_eval).eq.0 ) &
           call sub_eval(iter)

!.....Update x
      if( trim(cpena).eq.'lasso' .or. trim(cpena).eq.'glasso' ) then
        call soft_threshold(ndim,x,u,alpha)
      else
        x(1:ndim)= x(1:ndim) +alpha*u(1:ndim)
      endif
      call wrap_ranges(ndim,x,xranges)
      call penalty(cpena,ndim,pval,gpena,x)

      dx(1:ndim)= x(1:ndim) -xp(1:ndim)
      gnormp= gnorm
      gp(1:ndim)= g(1:ndim)
      call grad(ndim,x,g)
      if( trim(cpena).eq.'ridge' ) g(1:ndim)= g(1:ndim) +gpena(1:ndim)
      gnorm= sprod(ndim,g,g)
      sgnorm= sqrt(gnorm)
      if( icgbtype.eq.2 ) then
!.....Polak-Ribiere-Polyak (PRP)
        y(1:ndim)= g(1:ndim) -gp(1:ndim)
        beta= sprod(ndim,g,y)/gnormp
      else if( icgbtype.eq.3 ) then
!.....Hestenes-Stiefel (HS)
        y(1:ndim)= g(1:ndim) -gp(1:ndim)
        beta= sprod(ndim,g,y)/sprod(ndim,u,y)
      else if( icgbtype.eq.4 ) then
!.....Dai-Yuan (DY)
        y(1:ndim)= g(1:ndim) -gp(1:ndim)
        beta= gnorm/sprod(ndim,u,y)
      else if( icgbtype.eq.5 ) then
!.....Dai-Liao (DL)
        s(1:ndim)= x(1:ndim) -xp(1:ndim)
        y(1:ndim)= g(1:ndim) -gp(1:ndim)
        beta= (sprod(ndim,g,y) -t_DL*sprod(ndim,g,s))&
             /sprod(ndim,u,y)
      else ! including icgbtype == 1
!.....Fletcher-Reeves (FR)
        beta= gnorm/gnormp
      endif
      u(1:ndim)= -g(1:ndim) +beta*u(1:ndim)
      vnorm= sqrt(sprod(ndim,x,x))
      dxnorm= sqrt(sprod(ndim,dx,dx))
      call write_status(6,myid,iprint,cpena,iter,niter &
           ,f,ftst,pval,vnorm,sgnorm,dxnorm,fp)
      call check_converge(myid,iprint,xtol,gtol,ftol &
           ,dxnorm,sgnorm,abs(f-fp),nxtol,ngtol,nftol,iflag,lconverged)
      if( lconverged ) then
        x0(:) = x(:)
        maxiter = iter
        return
      endif
    enddo

    x0(:) = x(:)
    return
  end subroutine cg
!=======================================================================
  subroutine qn(ndim,x0,f,g,u,xranges,xtol,gtol,ftol,maxiter &
       ,iprint,iflag,myid,func,grad,cfmethod,niter_eval,sub_eval)
!
!  Broyden-Fletcher-Goldfarb-Shanno type of Quasi-Newton method.
!
    integer,intent(in):: ndim,iprint,myid,niter_eval
    integer,intent(inout):: iflag,maxiter
    real(8),intent(in):: xtol,gtol,ftol,xranges(2,ndim)
    real(8),intent(inout):: f,x0(ndim),g(ndim),u(ndim)
    character(len=*),intent(in):: cfmethod
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
      subroutine grad(n,x,gtrn)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: gtrn(n)
      end subroutine grad
      subroutine sub_eval(iter)
        integer,intent(in):: iter
      end subroutine sub_eval
    end interface
    real(8),parameter:: xtiny  = 1d-14
    logical:: ltwice = .false.
    real(8),save,allocatable:: gg(:,:),x(:),s(:),y(:),gp(:) &
         ,ggy(:),gpena(:),dx(:)
    real(8):: tmp1,tmp2,b,sy,syi,fp,alpha,gnorm,ynorm,vnorm,pval &
         ,estmem,ftst,dxnorm
    integer:: i,j,iter,nftol,ngtol,nxtol,mem,niter
    logical:: lconverged = .false. 

    if( .not.allocated(gg) ) then
      if(myid.eq.0) then
        print *,''
        print *, '******************************* QN(BFGS) '&
             //'*******************************'
        estmem = (ndim*ndim +ndim*6)*8
        mem= estmem/1000/1000
        if( mem.eq.0 ) then
          mem= estmem/1000
          write(6,'(a,i6,a)') ' Memory for BFGS = ' &
               ,int(estmem/1000),' kB'
        else
          write(6,'(a,i6,a)') ' Memory for BFGS = ' &
               ,int(estmem/1000/1000),' MB'
        endif
      endif
      allocate(gg(ndim,ndim),x(ndim),dx(ndim) &
           ,s(ndim),y(ndim),gp(ndim),ggy(ndim),gpena(ndim))
    endif


!.....initialize alpha (line minimization factor)
    alpha = 1d0

    nftol= 0
    ngtol= 0
    nxtol= 0
!.....initial G = I
    gg(1:ndim,1:ndim)= 0d0
    do i=1,ndim
      gg(i,i)= 1d0
    enddo

    call wrap_ranges(ndim,x0,xranges)
    call func(ndim,x0,f,ftst)
    call grad(ndim,x0,g)
    call penalty(cpena,ndim,pval,gpena,x0)
    f = f + pval
    if( trim(cpena).eq.'ridge' ) g(1:ndim)= g(1:ndim) +gpena(1:ndim)

    gnorm= sqrt(sprod(ndim,g,g))
    x(1:ndim)= x0(1:ndim)
    vnorm= sqrt(sprod(ndim,x,x))
    dxnorm = 0d0

    iter= 0
    niter = 0
    call write_status(6,myid,iprint,cpena,iter,niter &
         ,f,ftst,pval,vnorm,gnorm,dxnorm,f)

    call sub_eval(0)
    do iter=1,maxiter
      u(1:ndim)= 0d0
      do i=1,ndim
        u(1:ndim)= u(1:ndim) -gg(1:ndim,i)*g(i)
      enddo
!.....store previous func and grad values
      fp= f
      gp(1:ndim)= g(1:ndim)
!.....line minimization
      if( trim(clinmin).eq.'quadratic' ) then
        call quad_interpolate(ndim,x,u,f,ftst,xtol,gtol,ftol,alpha &
             ,iprint,iflag,myid,func)
!.....if quad interpolation failed, perform golden section
        if( iflag/100.ne.0 ) then
          iflag= iflag -(iflag/100)*100
          if(myid.eq.0) then
            print *,'since quad_interpolate failed, call golden_section.'
          endif
          call golden_section(ndim,x,u,f,ftst,xtol,gtol,ftol,alpha &
               ,iprint,iflag,myid,func)
        endif
      else if( trim(clinmin).eq.'golden') then
        call golden_section(ndim,x,u,f,ftst,xtol,gtol,ftol,alpha &
             ,iprint,iflag,myid,func)
      else if( trim(clinmin).eq.'armijo' ) then
!.....To enhance the convergence in Armijo search,
!.....use the history of previous alpha by multiplying 2
!.....avoiding constant decrease, but alpha should not be greater than 1.
!!$        alpha = min(max(alpha,xtol*2d0)*2d0, 1d0)
!!$        alpha = max(alpha,xtol)*2d0
        alpha = alpha *fac_inc
        call armijo_search(ndim,x,xranges,u,f,ftst,g,alpha,iprint &
             ,iflag,myid,func,niter)
      else ! backtrack (default)
!!$        alpha = min(max(alpha,xtol*2d0)*2d0, 1d0)
!        alpha = max(alpha,xtol)*2d0
        alpha = alpha *fac_inc
        call backtrack(ndim,x,xranges,u,f,ftst,alpha,iprint &
             ,iflag,myid,func,niter)
      endif
!!$      if(myid.eq.0) print *,'armijo steps, alpha=',niter,alpha
      if( iflag/100.ne.0 ) then
        if( ltwice ) then
          x0(1:ndim)= x(1:ndim)
          if(myid.eq.0) then
            print *,'>>> Line_minimization failed twice continuously.'
          endif
          return
        else
          ltwice= .true.
          if(myid.eq.0) then
            print *,'>>> Initialize gg because alpha was not found.'
          endif
          alpha= 1d0  ! reset alpha to 1
          gg(1:ndim,1:ndim)= 0d0
          do i=1,ndim
            gg(i,i)= 1d0
          enddo
          f= fp
          iflag= iflag -100*(iflag/100)
          cycle
        endif
      else
        ltwice=.false.
      endif
!.....evaluate statistics at every niter_eval
      if( mod(iter,niter_eval).eq.0 ) &
           call sub_eval(iter)
!.....Update x
      if( trim(cpena).eq.'lasso' .or. trim(cpena).eq.'glasso' ) then
        call soft_threshold(ndim,x,u,alpha)
      else
        x(1:ndim)= x(1:ndim) +alpha*u(1:ndim)
      endif
      call wrap_ranges(ndim,x,xranges)
      call penalty(cpena,ndim,pval,gpena,x)
      
      dx(1:ndim)= x(1:ndim) -x0(1:ndim)
      x0(1:ndim)= x(1:ndim)
      call grad(ndim,x,g)
      if( trim(cpena).eq.'ridge' ) g(1:ndim)= g(1:ndim) +gpena(1:ndim)
      gnorm= sqrt(sprod(ndim,g,g))
      vnorm= sqrt(sprod(ndim,x,x))
      dxnorm= sqrt(sprod(ndim,dx,dx))
      call write_status(6,myid,iprint,cpena,iter,niter &
           ,f,ftst,pval,vnorm,gnorm,dxnorm,fp)
      call check_converge(myid,iprint,xtol,gtol,ftol &
           ,dxnorm,gnorm,abs(f-fp),nxtol,ngtol,nftol,iflag,lconverged)
      if( lconverged ) then
        x0(:) = x(:)
        maxiter = iter
        return
      endif

      s(1:ndim)= alpha *u(1:ndim)
      y(1:ndim)= g(1:ndim) -gp(1:ndim)
      ynorm= sprod(ndim,y,y)
      if( ynorm.lt.1d-14 .or. dxnorm.lt.xtol .or. gnorm.lt.gtol &
           .or. abs(f-fp).lt.ftol ) then
        if(myid.eq.0) then
          print *,'>>> Initialize gg'
        endif
        gg(1:ndim,1:ndim)= 0d0
        do i=1,ndim
          gg(i,i)= 1d0
        enddo
        cycle
      endif

!.....update matrix gg
      sy= sprod(ndim,s,y)
      syi= 1d0/sy
      do i=1,ndim
        tmp1= 0d0
        tmp2= 0d0
        do j=1,ndim
          tmp1= tmp1 +gg(j,i)*y(j)
        enddo
        ggy(i)= tmp1 *syi
      enddo
      b= 1d0
      do i=1,ndim
        b=b +y(i)*ggy(i)
      enddo
      b= b*syi
!.....without temporary matrix aa
      do j=1,ndim
        do i=1,ndim
          gg(i,j)=gg(i,j) +s(j)*s(i)*b &
               -(s(i)*ggy(j) +ggy(i)*s(j))
        enddo
      enddo
    enddo

    x0(1:ndim)= x(1:ndim)
    return
  end subroutine qn
!=======================================================================
  subroutine get_bracket(ndim,x0,d,a,b,c,fa,fb,fc,fta,ftb,ftc &
       ,iprint,iflag,myid,func)
    implicit none
    integer,intent(in):: ndim,iprint,myid
    integer,intent(inout):: iflag
    real(8),intent(in):: x0(ndim),d(ndim)
    real(8),intent(inout):: a,b,fa,fb,fta,ftb
    real(8),intent(out):: c,fc,ftc
    
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
    end interface

    real(8),parameter:: RATIO = 1.61803398875d0
    real(8),parameter:: RATIOI= 1d0/RATIO
    real(8),parameter:: TINY= 1d-12
    real(8),parameter:: GLIMIT= 100d0
    real(8),parameter:: MAXITER= 50
    integer:: iter

    call func(ndim,x0+a*d,fa,fta)
    call func(ndim,x0+b*d,fb,ftb)
    iter= 0
10  continue
    iter= iter +1
    if( iter.gt.MAXITER ) then
      if( myid.eq.0 ) then
        print *,'WARNING: iter.gt.MAXITER in get_bracket'
        print *,'  Search direction may not be a descent direction.'
      endif
      iflag= iflag +1000
      return
!!$      stop
    endif
    if( abs(b-a).lt.1d-12) then
      if( myid.eq.0 ) then
        print *,'WARNING: a and b is too close in get_bracket'
        print *,'  Search direction may not be a descent direction.'
      endif
      iflag= iflag +2000
      return
!!$      stop
    endif
    if( fa.lt.fb ) then
      c= a +RATIOI*(b-a)
      call func(ndim,x0+c*d,fc,ftc)
      call exchange(c,b)
      call exchange(fc,fb)
      call exchange(ftc,ftb)
      if( iprint.eq.3 .and. myid.eq.0 ) then
        write(6,'(a,2(1x,3es12.4))') ' a,b,c,fa,fb,fc=',a,b,c,fa,fb,fc
      endif
      goto 10
    else
      c= a +RATIO*(b-a)
      call func(ndim,x0+c*d,fc,ftc)
      if( fb.gt.fc ) then
        b= a +RATIO*(c-a)
        call func(ndim,x0+b*d,fb,ftb)
        call exchange(b,c)
        call exchange(fb,fc)
        call exchange(ftb,ftc)
        if( iprint.eq.3 .and. myid.eq.0 ) then
          write(6,'(a,2(1x,3es12.4))') ' a,b,c,fa,fb,fc=',a,b,c,fa,fb,fc
        endif
        goto 10
      endif
!!$      write(6,'(a,2(1x,3es12.4))') ' a,b,c,fa,fb,fc=',a,b,c,fa,fb,fc
    endif

    return
  end subroutine get_bracket
!=======================================================================
  subroutine exchange(a,b)
    implicit none
    real(8),intent(inout):: a,b
    real(8):: tmp
    tmp= a
    a= b
    b= tmp
    return
  end subroutine exchange
!=======================================================================
  subroutine quad_interpolate(ndim,x0,g,f,ftst,xtol,gtol,ftol,a,iprint &
       ,iflag,myid,func)
    implicit none
    integer,intent(in):: ndim,iprint,myid
    integer,intent(inout):: iflag

    real(8),intent(in):: x0(ndim),xtol,gtol,ftol,g(ndim)
    real(8),intent(out):: f,a,ftst

    real(8),parameter:: STP0    = 1d-1
    real(8),parameter:: STPMAX  = 1d+1
    real(8),parameter:: TINY    = 1d-15

    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
    end interface

    integer:: iter,imin,imax,ix
    real(8):: r,q,fmin,fmax,dmin,dmax,d,xmin
    real(8),save,allocatable:: xi(:),fi(:),fti(:)

    if( .not. allocated(xi) ) allocate(xi(4),fi(4),fti(4))
    
    xi(1)= 0d0
    xi(2)= xi(1) +STP0
    call get_bracket(ndim,x0,g,xi(1),xi(2),xi(3),fi(1),fi(2),fi(3) &
         ,fti(1),fti(2),fti(3) &
         ,iprint,iflag,myid,func)
    if( iflag/1000.ne.0 ) return
    
    iter= 0
10  continue
    iter= iter +1
    if( iter.gt.niter_linmin ) then
      if( myid.eq.0 ) then
        print *,'WARNING: iter.gt.NITER_LINMIN in quad_interpolate !!!'
        print *,'  iter,niter_linmin= ',iter,niter_linmin
      endif
      iflag= iflag +100
      return
!!$      stop
    endif
    !.....step 3; compute turning point
    r= (xi(2)-xi(1))*(fi(2)-fi(3))
    q= (xi(2)-xi(3))*(fi(2)-fi(1))
    xi(4)= xi(2) -((xi(2)-xi(3))*q -(xi(2)-xi(1))*r) &
         /(2d0*sign(max(abs(q-r),TINY),q-r))
    call func(ndim,x0+xi(4)*g,fi(4),fti(4))
!!$    write(6,'(a,2(2x,4f11.2))') ' xi,fi=',xi(1:4),fi(1:4)

    !.....step4
    fmin= min(fi(1),fi(2),fi(3))
    fmax= max(fi(1),fi(2),fi(3))
    dmin= min(abs(xi(4)-xi(1)),abs(xi(4)-xi(2)),abs(xi(4)-xi(3)))
    if( fi(4).lt.fmin .and. dmin.gt.STPMAX ) then
!!$      print *,' 01'
      imax= 0
      dmax= 0d0
      do ix=1,3
        d= abs(xi(4)-xi(ix))
        if( dmax.lt.d ) then
          dmax= d
          imax= ix
        endif
      enddo
      !.....eliminate max function point and set point at STPMAX
      imax= 0
      fmax= -1d+30
      do ix=1,3
        if( fmax.lt.fi(ix) ) then
          fmax= fi(ix)
          imax= ix
        endif
      enddo
      do ix=imax+1,3
        xi(ix-1)= xi(ix)
        fi(ix-1)= fi(ix)
        fti(ix-1)= fti(ix)
      enddo
      if( fi(2).gt.fi(1) ) then
        xi(3)= xi(1) +STPMAX
      else
        xi(3)= xi(2) +STPMAX
      endif
      call func(ndim,x0+xi(3)*g,fi(3),fti(3))
      goto 10
    else if( fi(4).gt.fmax ) then ! fi(4) is maximum
!!$      print *,' 02'
      imin= 0
      dmin= 1d+30
      do ix=1,3
        d= abs(xi(4)-xi(ix))
        if( dmin.gt.d ) then
          dmin= d
          xmin= xi(ix)
          imin= ix
        endif
      enddo
      !.....eliminate nearest point to xi(4) and add a new point
      do ix=imin+1,3
        xi(ix-1)= xi(ix)
        fi(ix-1)= fi(ix)
        fti(ix-1)= fti(ix)
      enddo
      xi(3)= (xmin +xi(4))*0.5
      call func(ndim,x0+xi(3)*g,fi(3),fti(3))
      goto 10
    endif

    !.....step 5: check convergence
    if( dmin.lt.xtol ) then
      imin= 0
      fmin= 1d+30
      do ix=1,4
        if( fmin.gt.fi(ix) ) then
          imin= ix
          fmin= fi(ix)
        endif
      enddo
      f= fi(imin)
      a= xi(imin)
      return
    endif

!.....step 6: discard point of highest f value and replace it by xi(4)
!!$    print *,' 03'
    imax= 0
    fmax= -1d+30
    do ix=1,3
      if( fmax.lt.fi(ix) ) then
        imax= ix
        fmax= fi(ix)
      endif
    enddo
    xi(imax)= xi(4)
    fi(imax)= fi(4)
    fti(imax)= fti(4)
    goto 10

  end subroutine quad_interpolate
!=======================================================================
  subroutine golden_section(ndim,x0,g,f,ftst,xtol,gtol,ftol,alpha,iprint &
       ,iflag,myid,func)
    implicit none
    integer,intent(in):: ndim,iprint,myid
    integer,intent(inout):: iflag
    real(8),intent(in):: xtol,gtol,ftol,x0(ndim),g(ndim)
    real(8),intent(inout):: f,alpha,ftst
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
    end interface

    real(8),parameter:: STP0 = 1d-1
    real(8),parameter:: GR   = 0.61803398875d0
    real(8),parameter:: GR2  = 1d0 -GR

    integer:: iter
    real(8):: a,b1,b2,c,fa,fb1,fb2,fc,xl
    real(8):: ftb1,ftb2,fta,ftc

    a= 0d0
    b1= STP0
    call get_bracket(ndim,x0,g,a,b1,c,fa,fb1,fc,fta,ftb1,ftc,&
         iprint,iflag,myid,func)
    if( iflag/1000.ne.0 ) return
    xl= (c-a)
    b1= a +GR2*xl
    b2= a +GR *xl
    call func(ndim,x0+b1*g,fb1,ftb1)
    call func(ndim,x0+b2*g,fb2,ftb2)

    iter= 0
10  continue
    iter= iter +1
    if( iter.gt.niter_linmin ) then
      if( myid.eq.0 ) then
        print *,'WARNING: iter.gt.NITER_LINMIN in golden_section.'
        print *,'  iter,niter_linmin = ',iter,niter_linmin
      endif
      iflag= iflag +100
      return
!!$      stop
    endif
!!$    write(6,'(a,2(2x,4es11.3))') ' a,b1,b2,c,fa,fb1,fb2,fc=' &
!!$         ,a,b1,b2,c,fa,fb1,fb2,fc
    if( fb1.gt.fb2 ) then
      a= b1
      fa= fb1
      fta= ftb1
      b1= b2
      fb1= fb2
      ftb1= ftb2
      xl= (c-a)
      b2= a +GR*xl
      call func(ndim,x0+b2*g,fb2,ftb2)
    else
      c= b2
      fc= fb2
      ftc= ftb2
      b2= b1
      fb2= fb1
      ftb2= ftb1
      xl= (c-a)
      b1= a +GR2*xl
      call func(ndim,x0+b1*g,fb1,ftb1)
    endif
!!$    print *,' xl,c,a,xtol=',xl,c,a,xtol
    if( xl.lt.xtol ) then
      if( fb1.gt.fb2 ) then
        alpha= b2
        f= fb2
        ftst= ftb2
      else
        alpha= b1
        f= fb1
        ftst= ftb1
      endif
      return
    endif
    goto 10

  end subroutine golden_section
!=======================================================================
  subroutine armijo_search(ndim,x0,xranges,d,f,ftst,g,alpha,iprint &
       ,iflag,myid,func,niter)
!  
!  1D search using Armijo rule.
!
    implicit none
    integer,intent(in):: ndim,iprint,myid
    integer,intent(inout):: iflag,niter
    real(8),intent(in):: x0(ndim),g(ndim),d(ndim),xranges(2,ndim)
    real(8),intent(inout):: f,alpha,ftst
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
    end interface

!!$  real(8),external:: sprod
    real(8),parameter:: xtiny  = 1d-14
    integer:: iter
    real(8):: alphai,xigd,f0,fi,pval,fp,pvalp,alphap,ftsti
    real(8),allocatable,dimension(:):: x1(:),gpena(:)
    logical,save:: l1st = .true.

    if( l1st ) then
      if( myid.eq.0 .and. iprint.gt.0 ) then
        write(6,'(a)') ' Armijo rule parameters:'
        write(6,'(a,es12.4)') '   c       = ',armijo_xi
        write(6,'(a,f10.4)') '   tau     = ',armijo_tau
        write(6,'(a,i5)')   '   maxiter = ',niter_linmin
      endif
      l1st = .false.
    endif

    if( .not. allocated(x1)) allocate(x1(ndim),gpena(ndim))
    xigd= sprod(ndim,g,d)*armijo_xi
    if( xigd.gt.0d0 ) then
      iflag= iflag + 100
      if( myid.eq.0 .and. iprint.gt.0 ) print *,'WARNING: g*d > 0.0'
      return
    endif
    alphai= alpha
    call penalty(cpena,ndim,pval,gpena,x0)

    f0= f
    fp= f0
    do iter=1,niter_linmin
      x1(1:ndim)= x0(1:ndim)
      if( trim(cpena).eq.'lasso' .or.trim(cpena).eq.'glasso') then
        call soft_threshold(ndim,x1,d,alphai)
      else
        x1(1:ndim)= x1(1:ndim) +alphai*d(1:ndim)
      endif
      call wrap_ranges(ndim,x1,xranges)
      call func(ndim,x1,fi,ftsti)
      call penalty(cpena,ndim,pval,gpena,x1)
      fi = fi +pval
      if( myid.eq.0 .and. iprint.gt.2 ) write(6,'(a,i5,5es12.4)') &
           ' armijo: iter,fi,fi-f0,fi-fp,xigd*alphai,alphai=',&
           iter,fi,fi-fp,xigd*alphai,alphai
      if( fi-fp.le.xigd*alphai ) then
        f= fi
        alpha= alphai
        ftst= ftsti
        niter = iter
        return
      endif
      fp= fi
      pvalp= pval
      alphap= alphai
      alphai= alphai*armijo_tau
    enddo

    iflag= iflag +100
    niter= iter
    if( myid.eq.0 .and. iprint.gt.0 ) then
      print *,'WARNING: iter.gt.NITER_LINMIN in armijo_search.'
!!$      write(6,'(a,es13.5)') '   alphai   = ',alphai
!!$      write(6,'(a,es13.5)') '   xigd    = ',xigd
!!$      write(6,'(a,es13.5)') '   norm(g) = ',sqrt(sprod(ndim,g,g))
!!$      if( trim(cpena).eq.'lasso' .or. trim(cpena).eq.'glasso' .or. &
!!$           trim(cpena).eq.'ridge' ) then
!!$        write(6,'(a,es13.5)') '   pval    = ',pval
!!$      endif
    endif
    return

  end subroutine armijo_search
!=======================================================================
  subroutine backtrack(ndim,x0,xranges,d,f,ftst,alpha,iprint &
       ,iflag,myid,func,niter)
!
!  Simply move onestep towards current direction with max length
!  that can decreases function value.
!
    implicit none
    integer,intent(in):: ndim,iprint,myid
    integer,intent(inout):: iflag,niter
    real(8),intent(in):: x0(ndim),d(ndim),xranges(2,ndim)
    real(8),intent(inout):: f,alpha,ftst
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
    end interface

!.....Precision
    real(8),parameter:: tiny = 1d-15
    integer:: iter,iterp
    real(8):: alphai,alphap,f0,fi,fp,ftsti,fpi,fti
    real(8),save,allocatable:: x1(:),gpena(:)
    logical,save:: l1st = .true.

    if( l1st ) then
      l1st = .false.
      if( myid.eq.0 .and. iprint.gt.1 ) &
           print *,'backtrack, alpha=',alpha
    endif
    if( .not.allocated(x1) ) allocate(x1(ndim),gpena(ndim))
    f0 = f
    fp = f0
    alphai = alpha
    do iter=1,niter_linmin
      x1(:) = x0(:)
      if( trim(cpena).eq.'lasso' .or. trim(cpena).eq.'glasso' ) then
        call soft_threshold(ndim,x1,d,alphai)
      else
        x1(1:ndim) = x1(1:ndim) +alphai*d(1:ndim)
      endif
      call wrap_ranges(ndim,x1,xranges)
      call func(ndim,x1,fti,ftsti)
      call penalty(cpena,ndim,fpi,gpena,x1)
      fi = fti +fpi
      if( myid.eq.0 .and. iprint.gt.2 ) then
        print '(a,i8,5es12.4)','   iter,alphai,fi,fti,fi-f0,fi-fp = ' &
             ,iter,alphai,fi,fti,fi-f0,fi-fp
      endif
      if( fi.lt.f0 ) then
        f = fi
        alpha = alphai
        ftst = ftsti
        niter = iter
        return
      else  ! if fi > f0, decrease alpha
!!$        fp = min(fi,f0)
!!$        ftstp = ftsti
        fp = fi
        alphap = alphai
        alphai = alphai *fac_dec
        iterp = iter
        if( alphai.lt.tiny ) then
          if( myid.eq.0 .and. iprint.gt.0 ) then
            print *,'WARNING: alpha < tiny in backtrack,'
!!$            print *,'         The search direction would be wrong.'
!!$            print *,'   iter,alphai,fi=',iter,alphai,fi
          endif
          iflag = iflag + 100
          niter = iter
          return
        endif
      endif
    enddo

    iflag = iflag + 100
    niter = iter
    if( myid.eq.0 .and. iprint.gt.0 ) then
      print *, 'WARNING: iter exceeds NITER_LINMIN in backtrack.'
!!$      print *, '         The search direction would be wrong.'
!!$      write(6,'(a,es13.5)') '   alphai = ',alphai
    endif
    return
  end subroutine backtrack
!=======================================================================
  function sprod(n,a,b)
    implicit none
    integer,intent(in):: n
    real(8),intent(in):: a(n),b(n)
    real(8):: sprod

    integer:: i
    sprod= 0d0
    do i=1,n
      sprod= sprod +a(i)*b(i)
    enddo
    return
  end function sprod
!=======================================================================
  subroutine soft_threshold(ndim,x,d,alpha)
!
!  Estimate next weight value using soft threshold
!
    implicit none
    integer,intent(in):: ndim
    real(8),intent(in):: d(ndim),alpha
    real(8),intent(inout):: x(ndim)

    integer:: i
    real(8):: xad,sgn,val

    do i=1,ndim
      xad= x(i) +alpha*d(i)
      sgn= sign(1d0,xad)   ! sign(a,b) = |a| * (sign of b)
      val= max(abs(xad)-alpha*pwgt,0d0)
      x(i)= sgn*val
    enddo
    return
  end subroutine soft_threshold
!=======================================================================
  subroutine fs(ndim,x,f,g,d,xtol,gtol,ftol,maxiter &
       ,iprint,iflag,myid,func,grad)
!
!  Forward Stagewise (FS) regression
!
    use descriptor,only: glval,ngl,iglid
    implicit none
    integer,intent(in):: ndim,maxiter,iprint,myid
    integer,intent(inout):: iflag
    real(8),intent(in):: xtol,gtol,ftol
    real(8),intent(inout):: f,x(ndim),g(ndim),d(ndim)
!!$    real(8):: func,grad
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
      subroutine grad(n,x,gtrn)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: gtrn(n)
      end subroutine grad
    end interface

    real(8),parameter:: eps = 1d0
    real(8),parameter:: xtiny= 1d-14
    integer:: iter,i,imax,ig
    real(8):: alpha,gnorm,gmax,absg,sgnx,xad,val,absx,pval,fp,ftst
    real(8),allocatable,dimension(:):: xt,gpena,grpg

    if( trim(cpena).ne.'lasso' .and. trim(cpena).ne.'glasso' ) then
      if(myid.eq.0) then
        print *,'>>> fs works only with lasso or glasso.'
      endif
      iflag= iflag +100
      return
    endif

    if( .not.allocated(xt) ) allocate(xt(ndim) &
         ,gpena(ndim),grpg(0:ngl))

    xt(1:ndim)= x(1:ndim)
!!$    xt(1:ndim)= 1d-6

    do iter=1,maxiter
      fp= f
!.....find maximum contribution in g
      call func(ndim,xt,f,ftst)
      call grad(ndim,xt,g)
      pval= 0d0
      gpena(1:ndim)= 0d0
      if( trim(cpena).eq.'lasso' ) then
        do i=1,ndim
          sgnx= sign(1d0,xt(i))
          absx= abs(xt(i))
          pval= pval +pwgt*absx
          if(absx.gt.xtiny) g(i)= g(i) +pwgt*sgnx
        enddo
      else if( trim(cpena).eq.'glasso' ) then
        glval(0:ngl)= 0d0
        do i=1,ndim
          ig= iglid(i)
          if( ig.gt.0 ) glval(ig)= glval(ig) +xt(i)*xt(i)
        enddo
        glval(0)= 1d0
        do ig=1,ngl
          glval(ig)= sqrt(glval(ig))
          pval= pval +pwgt*glval(ig)
        enddo
        do i=1,ndim
          ig= iglid(i)
          if( ig.eq.0 ) then ! i is not in a group
            absx= abs(xt(i))
            sgnx= sign(1d0,xt(i))
            if( absx.gt.xtiny ) gpena(i)= pwgt*sgnx
            pval= pval +pwgt*absx
          else if( ig.gt.0 ) then ! i is in a group
            if( glval(ig).gt.xtiny) gpena(i)= pwgt*xt(i)/glval(ig)
          endif
        enddo
      endif
      g(1:ndim)= g(1:ndim) +gpena(1:ndim)
      gnorm= sqrt(sprod(ndim,g,g))
      if( myid.eq.0 ) then
!!$        if( iprint.eq.1 .and. mod(iter,ndim).eq.1 ) then
        if( iprint.eq.1 ) then
          write(6,'(a,i8,4es13.5)') ' iter,f,p,gnorm,f-fp=',iter,f &
               ,pval,gnorm,f-fp
          call flush(6)
        else if( iprint.ge.2 ) then
          write(6,'(a,i8,12es13.5)') ' iter,f,p,gnorm,f-fp,x(1:5)=' &
               ,iter,f,pval,gnorm,f-fp,x(1:5)
          call flush(6)
        endif
      endif
      if( gnorm.lt.gtol ) then
        if( myid.eq.0 ) then
          print *,'>>> FS converged wrt gtol'
          write(6,'(a,2es13.5)') '   gnorm,gtol=',gnorm,gtol
        endif
        x(1:ndim)= xt(1:ndim)
        iflag= iflag +2
        return
      endif

      if( trim(cpena).eq.'lasso' ) then
!.....set 0 except the maximum contribution
        imax= 0
        gmax= 0d0
        do i=1,ndim
          absg= abs(g(i))
          if( gmax.lt.absg ) then
            gmax= absg
            imax= i
          endif
        enddo
        alpha= eps
        xad= xt(imax) -alpha*g(imax)
        sgnx= sign(1d0,xad)
        val= max(abs(xad)-alpha*pwgt,0d0)
        xt(imax)= sgnx*val
      else if( trim(cpena).eq.'glasso' ) then
        grpg(0:ngl)= 0d0
        do i=1,ndim
          ig= iglid(i)
          if( ig.gt.0 ) grpg(ig)= grpg(ig) +g(i)*g(i)
        enddo
        imax= 0
        gmax= 0d0
        do ig=1,ngl
          if( gmax.lt.grpg(ig) ) then
            gmax= grpg(ig)
            imax= ig
          endif
        enddo
!!$        print *,'myid,imax,gmax=',myid,imax,gmax
        alpha= eps
        do i=1,ndim
          ig= iglid(i)
          if( ig.eq.0 .or. ig.eq.imax ) then
            xad= xt(i) -alpha*g(i)
            sgnx= sign(1d0,xad)
            val= max(abs(xad)-alpha*pwgt,0d0)
            xt(i)= sgnx*val
          endif
        enddo
      endif
    enddo

    if( myid.eq.0 ) print *,'maxiter exceeded in fs'
    iflag= iflag +10
    x(1:ndim)= xt(1:ndim)
    return
    
  end subroutine fs
!=======================================================================
  subroutine gfs(ndim,x,f,ftst,g,d,xtol,gtol,ftol,xranges,maxiter &
       ,iprint,iflag,myid,func,grad,cfmethod,niter_eval &
       ,sub_eval)
!
!  Grouped Forward Stepwise (grouped FS) regression
!
    use descriptor,only: ngl,mskgfs,msktmp,iglid
    use variables,only: gsfcorr
    use random
    implicit none
    integer,intent(in):: ndim,iprint,myid,niter_eval
    integer,intent(inout):: iflag,maxiter
    real(8),intent(in):: xtol,gtol,ftol,xranges(2,ndim)
    real(8),intent(inout):: f,ftst,x(ndim),g(ndim),d(ndim)
    character(len=*),intent(in):: cfmethod
!!$    real(8):: func,grad
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
      subroutine grad(n,x,gtrn)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: gtrn(n)
      end subroutine grad
      subroutine sub_eval(iter)
        integer,intent(in):: iter
      end subroutine sub_eval
    end interface

    integer:: iter,i,ig,jg,j,igmm,itergfs,niter,inc,jnc&
         ,nfailinmin
    real(8):: alpha,gnorm,pval,fp,f0,gmm &
         ,ftstp,fpgfs,frefb
    real(8),allocatable,save:: xt(:),xtb(:),gmaxgl(:),u(:),gmaxgl0(:)
    real(8),save,allocatable:: gg(:,:),y(:),gp(:),rg(:) &
         ,ggy(:),ygg(:),s(:),g0(:),gpena(:)  !,aa(:,:),cc(:,:),v(:)
    logical,allocatable,save:: lexclude(:)
    integer:: nmsks,nftol,nbases,nvar,nrefresh
    real(8):: ynorm,tmp1,b,sy,syi  !,svy,svyi
    character(len=128):: cnum
    real(8),parameter:: sgm = 1.0d-1

    if( trim(cpena).eq.'lasso' .and. trim(cpena).eq.'glasso' ) then
      if(myid.eq.0) then
        print *,'>>> gfs does not work with lasso or glasso.'
        print *,'>>> so gfs neglects lasso and glasso.'
      endif
      cpena = 'none'
    endif

    if( .not. allocated(gsfcorr) ) then
      if( myid.eq.0 ) then
        print *,'ERROR@gfs: gsfcorr not allocated!'
        print *,'  normalize_input should be norm/variance.'
      endif
      return
    endif

    if( .not.allocated(xt) ) allocate(xt(ndim),xtb(ndim),u(ndim),rg(ngl),g0(ndim))
    if( .not.allocated(gg) ) allocate(gg(ndim,ndim) &
         ,y(ndim),gp(ndim),ggy(ndim),ygg(ndim) &
         ,s(ndim),gpena(ndim))  !,v(ndim),aa(ndim,ndim),cc(ndim,ndim)
    if( .not. allocated(gmaxgl) ) then
      allocate(gmaxgl(ngl),gmaxgl0(ngl),lexclude(ngl))
      lexclude(:) = .false.
    endif
    if( .not.allocated(mskgfs) ) then
      allocate(mskgfs(ngl),msktmp(ngl))
    endif

    if( trim(cread_fsmask).eq.'' ) then
      mskgfs(1:ngl)= 1
    else
      call read_fsmask(cread_fsmask)
    endif
    msktmp(1:ngl)= mskgfs(1:ngl)

    xt(1:ndim)= x(1:ndim)
    do i=1,ndim
      ig= iglid(i)
      if( ig.lt.0 ) cycle
      if( mskgfs(ig).ne.0 ) xt(i)= 0d0
    enddo
    x(1:ndim)= xt(1:ndim)

    nmsks= 0
    do ig=1,ngl
      if( mskgfs(ig).eq.0 ) cycle
      nmsks= nmsks +1
    enddo
    nbases = ngl -nmsks

!.....cfsmode==df0corr, loss-func decrease of each descriptor
    if( index(cfsmode,'df0').ne.0 ) then
      call func(ndim,xt,f,ftst)
      call grad(ndim,xt,g)
      fp = f
      gp(:) = g(:)
!!$      print *,'initial f = ',fp
      do ig=1,ngl
        u(:) = -gp(:)
        g(:) = gp(:)
        do i=1,ndim
          if( ig.eq.i ) cycle
          u(i) = 0d0
          g(i) = 0d0
        enddo
        f = fp
!!$      alpha = 1d0
!!$      call backtrack(ndim,xt,xranges,u,f,ftst,alpha,iprint &
!!$           ,iflag,myid,func,niter)
        alpha = min(max(alpha,xtol*10d0)*2d0, 1d0)
        call armijo_search(ndim,xt,xranges,u,f,ftst,g,alpha,iprint &
             ,iflag,myid,func,niter)
!!$        print *,'ig,f,f-fp=',ig,f,f-fp
        gmaxgl0(ig) = -(f-fp)
      enddo
    else if( index(cfsmode,'grad0').ne.0 ) then
      call func(ndim,xt,f,ftst)
      call grad(ndim,xt,g)
      f0 = f
      g0(:) = g(:)
    endif

    if( myid.eq.0 .and. iprint.gt.0 ) then
      print '(a,i6,2es15.6e3,i6)',' iter,ftrn,ftst,nbases=',0,f,ftst,nbases
    endif
    call sub_eval(0)

!.....Start gFS loop here
    fpgfs = f
    ftstp = ftst
    do iter=1,maxiter
      if( index(cfsmode,'grad0').ne.0 ) then
        g(:) = g0(:)
      else if( index(cfsmode,'grad').ne.0 ) then
!.....First, calc of gradient needs to be done with no masks
!     because it is used to find another new basis
        msktmp(1:ngl)= mskgfs(1:ngl)
        mskgfs(1:ngl)= 0
        call func(ndim,xt,f,ftst)
        call grad(ndim,xt,g)
!.....Restore mask
        mskgfs(1:ngl)= msktmp(1:ngl)
      endif
      gnorm= sqrt(sprod(ndim,g,g))

      if( nmsks.eq.0 ) then
        if( myid.eq.0 ) then
          print *,'ngl=',ngl
          print *,'nmsks is already 0, and finsh the WHILE loop.'
        endif
        exit
      endif
!.....Find bases with the largest gradient
      gmaxgl(1:ngl)= 0d0
      do i=1,ndim
        ig= iglid(i)
        if( ig.le.0 ) cycle
        gmaxgl(ig)= gmaxgl(ig) +abs(g(i))
      enddo
      if( trim(cfsmode).eq.'grad0corr' ) then
        if( iter.eq.1 ) then
          gmaxgl0(:) = gmaxgl(:)
        else
          gmaxgl(:) = gmaxgl0(:)
        endif
      else if( trim(cfsmode).eq.'df0corr' ) then
        gmaxgl(:) = gmaxgl0(:)
      endif
      if( index(cfsmode,'corr').ne.0  .and. nbases.gt.0 ) then
        rg(:) = 0d0
        do ig=1,ngl
          if( mskgfs(ig).ne.0 ) then
            do jg=1,ngl
              if( jg.eq.ig ) cycle
              if( mskgfs(jg).ne.0 ) cycle
!.....Adopt maximum correlation as rg
              rg(ig) = max(rg(ig),abs(gsfcorr(ig,jg)))
            enddo
          endif
          if( myid.eq.0 .and. iprint.gt.1 ) then
            print '(a,i5,3es12.3e3)','  ig,rg,gmax,gmax*(1-rg)=',ig,rg(ig),gmaxgl(ig) &
                 ,gmaxgl(ig)*(1d0 -rg(ig))
          endif
!.....Scale gmaxgl w.r.t. rg
          gmaxgl(ig) = gmaxgl(ig) *(1d0- rg(ig))
        enddo
      else
        if( myid.eq.0 .and. iprint.gt.1 ) then
          do ig=1,ngl
            print '(a,i5,es12.4)','  ig,gmaxgl=',ig,gmaxgl(ig)
          enddo
        endif
      endif
      gmm= 0d0
      igmm= 0
      do ig=1,ngl
!.....Do not take mskgfs==2 into account !
!!$        print *,'myid,ig,mskgfs,gmaxgl=',myid,ig,mskgfs(ig),gmaxgl(ig)
        if( mskgfs(ig).eq.1 .and. gmaxgl(ig).gt.gmm .and. &
             .not. lexclude(ig) ) then
          gmm= gmaxgl(ig)
          igmm= ig
        endif
      enddo
      if( igmm.eq.0 ) then
        if( myid.eq.0 .and. iprint.gt.0 ) then
          print *,'igmm.eq.0 !!!'
          print *,'Nothing to do here, and going out from FS.'
        endif
        return
      endif
!.....remove mask of bases with large variations
      mskgfs(igmm)= 0
      if( myid.eq.0 ) then
        if( iprint.gt.0 ) then
          print '(a,2i5,es12.4)',' iter,igmm,gmm= ',iter,igmm,gmm
          call flush(6)
        endif
      endif
      nmsks= 0
      do ig=1,ngl
        if( mskgfs(ig).eq.0 ) cycle
        nmsks= nmsks +1
      enddo
      nbases= ngl -nmsks
      write(cnum,'(i0)') nbases
      call write_fsmask('out.fsmask.'//trim(cnum))

      nrefresh = 0
      frefb = 1d+30
      xtb(:) = xt(:)
10    continue
      nrefresh = nrefresh + 1
      x(1:ndim)= xt(1:ndim)
!.....Reset xt before going into the BFGS.
!     Because it can easily get stuck at the local minimum by starting BFGS
!     from the minimum of the previous BFGS even if another variable is added.
      if( trim(cfs_xrefresh).eq.'zero' ) then
!.....Reset all the xt connected to bases to 0.0 
        do i=1,ndim
          ig = iglid(i)
          if( ig.le.0 ) then
            xt(i) = sgm *polarbm()
          else
            xt(i)= 0d0
          endif
!!$          print *,'i,ig,xt=',i,ig,xt(i)
        enddo
      else if( trim(cfs_xrefresh).eq.'random' ) then
!.....Reset xt randomly
        do i=1,ndim
          xt(i) = sgm *polarbm()
          ig = iglid(i)
          if( ig.le.0 ) cycle
          if( mskgfs(ig).ne.0 ) xt(i) = 0d0
        enddo
      else
!.....Do nothing, use current variable values.
      endif
      call func(ndim,xt,f,ftst)
      call grad(ndim,xt,g)
!.....Penalty
      if( trim(cpena).eq.'ridge' ) then
        call penalty(cpena,ndim,pval,gpena,xt)
        f = f +pval
        g(1:ndim) = g(1:ndim) +gpena(1:ndim)
      endif
!.....preparation for BFGS
      gg(1:ndim,1:ndim)= 0d0
      do i=1,ndim
        gg(i,i)= 1d0
      enddo
!.....Mask some g's
      do i=1,ndim
        ig= iglid(i)
        if( ig.le.0 ) cycle
        if( mskgfs(ig).ne.0 ) g(i)= 0d0
      enddo

      gnorm= sqrt(sprod(ndim,g,g))
      nftol= 0
      iflag= 0
      nfailinmin = 0
!.....BFGS loop begins
      do itergfs=1,ninnergfs
        
        u(1:ndim)= 0d0
        inc = 0
        do i=1,ndim
          ig = iglid(i)
          if( ig.gt.0 .and. mskgfs(ig).ne.0 ) cycle
          inc = inc + 1
          jnc = 0
          do j=1,ndim
            jg = iglid(j)
            if( jg.gt.0 .and. mskgfs(jg).ne.0 ) cycle
            jnc = jnc + 1
            u(i)= u(i) -gg(inc,jnc)*g(j)
          enddo
        enddo
!.....mask u
        do i=1,ndim
          ig= iglid(i)
          if( ig.le.0 ) cycle
          if( mskgfs(ig).ne.0 ) then
            g(i)= 0d0
            u(i)= 0d0
          endif
        enddo
        fp= f
        gp(1:ndim)= g(1:ndim)
!.....line minimization
        if( trim(clinmin).eq.'armijo' ) then
          alpha = min(max(alpha,xtol*2d0)*2d0, 1d0)
          call armijo_search(ndim,xt,xranges,u,f,ftst,g,alpha,iprint &
               ,iflag,myid,func,niter)
!.....if something wrong with armijo search, try opposite direction
          if( iflag/100.ne.0 ) then
            alpha= -1d0
            if(myid.eq.0) print *,'trying opposite direction...'
            call armijo_search(ndim,xt,xranges,u,f,ftst,g,alpha,iprint &
                 ,iflag,myid,func,niter)
          endif
        else   ! backtrack (default)
          alpha = min(max(alpha,xtol*2d0)*2d0, 1d0)
          call backtrack(ndim,xt,xranges,u,f,ftst,alpha,iprint &
               ,iflag,myid,func,niter)
        endif
!.....get out of bfgs loop
!!$        print '(a,3i4,2es12.4)','iter,itergfs,iflag,ftrn,fpgfs=' &
!!$             ,iter,itergfs,iflag,f,fpgfs
        if( itergfs.lt.ninnergfs .and. iflag/100.ne.0 ) then
          if( itergfs.eq.1 ) then
            if( myid.eq.0 ) then
              print *,"Line minimizaitonat failed 1st step of FS,"&
                   //" but keep going forward..."
            endif
!!$!.....Set mask as 2, which means this basis will be not included
!!$!     and not taken into consideration anymore.
!!$            mskgfs(igmm)= 2
          else ! itergsf.ne.1
            nfailinmin = nfailinmin +1
            if( nfailinmin.eq.1 ) then
              if( myid.eq.0 .and. iprint.gt.1 ) then
                print *,'>>> Initialize gg because alpha was not found.'
              endif
              gg(:,:) = 0d0
              do i=1,ndim
                gg(i,i) = 1d0
              enddo
              f = fp
              iflag = iflag -100*(iflag/100)
              alpha = 1d0
              cycle
            else ! nfailnmin > 1
              if( myid.eq.0 ) then
                print *,'>>> Line minimization failed twice consecutively.'
              endif
              if( f.gt.fpgfs .and. trim(cfs_xrefresh).eq.'random' &
                   .and. nrefresh.le.maxfsrefresh ) then
                if( myid.eq.0 .and. iprint.gt.0 ) then
                  print *,'>>> But refresh x and go back to BFGS, ' &
                       //'because current f > fpgfs.'
!!$                  print *,'f,fpgfs=',f,fpgfs
                endif
                goto 10
              else if( f.gt.fpgfs ) then
!.....If f is greater than the previous value,
!.....selected symmetry function is probably not a good one, so exclude it
!.....in the following gFS steps.
                lexclude(igmm) = .true.
                mskgfs(igmm) = 1
                if( myid.eq.0 .and. iprint.gt.0 ) &
                     print *,'>>> Exclude the basis since f > fpgfs, igmm=',igmm
              endif
              exit
            endif
          endif
        else ! iflag/100.eq.0
          nfailinmin = 0
        endif
        xt(1:ndim)= xt(1:ndim) +alpha*u(1:ndim)
        call wrap_ranges(ndim,xt,xranges)
        call grad(ndim,xt,g)
        call penalty(cpena,ndim,pval,gpena,xt)
        if( trim(cpena).eq.'ridge' ) g(1:ndim) = g(1:ndim) +gpena(1:ndim)
        do i=1,ndim
          ig= iglid(i)
          if( ig.le.0 ) cycle
          if( mskgfs(ig).ne.0 ) then
            g(i)= 0d0
          endif
        enddo

        gnorm= sqrt(sprod(ndim,g,g))
        if( myid.eq.0 ) then
          if( iprint.gt.1 ) then
            write(6,'(a,i8,2es13.5)') ' itergfs,f,gnorm=',itergfs,f,gnorm
            call flush(6)
          endif
        endif

!.....check convergence
        if( gnorm.lt.gtol ) then
          if( myid.eq.0 .and. iprint.gt.0 ) then
            print '(a,2es13.5)',' >>> QN in gFS converged; gnorm,gtol= ' &
                 ,gnorm,gtol
          endif
          if( f.gt.fpgfs .and. trim(cfs_xrefresh).eq.'random' &
               .and. nrefresh.le.maxfsrefresh ) then
            if( myid.eq.0 .and. iprint.gt.0 ) then
              print *,'>>> But refresh x and go back to BFGS, because current f > fpgfs.'
!!$              print *,'f,fpgfs=',f,fpgfs
            endif
            goto 10
          else if( f.gt.fpgfs ) then
!.....If f is greater than the previous value,
!.....selected symmetry function is probably not a good one, so exclude it
!.....in the following gFS steps.
            lexclude(igmm) = .true.
            mskgfs(igmm) = 1
            if( myid.eq.0 .and. iprint.gt.0 ) &
                 print *,'>>> Exclude the basis since f > fpgfs, igmm=',igmm
          endif
          iflag= iflag +2
          exit
        else if( abs(f-fp)/abs(fp).lt.ftol) then
          nftol= nftol +1
          if( nftol.ge.numtol ) then
            if( myid.eq.0 .and. iprint.gt.0 ) then
              print '(a,i2,a)',' >>> QN in gFS is probably converged because ' // &
                   'of ftol ',numtol,' times consecutively.'
            endif
            if( f.gt.fpgfs .and. trim(cfs_xrefresh).eq.'random' &
                 .and. nrefresh.le.maxfsrefresh ) then
              if( myid.eq.0 .and. iprint.gt.0 ) then
                print *,'>>> But refresh x and go back to BFGS, because current f > fpgfs.'
!!$                print *,'f,fpgfs=',f,fpgfs
              endif
              goto 10
            else if( f.gt.fpgfs ) then
!.....If f is greater than the previous value,
!.....selected symmetry function is probably not a good one, so exclude it
!.....in the following gFS steps.
              lexclude(igmm) = .true.
              mskgfs(igmm) = 1
              if( myid.eq.0 .and. iprint.gt.0 ) &
                   print *,'>>> Exclude the basis since f > fpgfs, igmm=',igmm
            endif
            iflag= iflag +3
            exit
          else if( itergfs.lt.ninnergfs ) then
            if( myid.eq.0 .and. iprint.gt.1 ) then
              print *,'>>> gg initialized because |f-fp|/|fp|<ftol '
            endif
            gg(1:ndim,1:ndim)= 0d0
            do i=1,ndim
              gg(i,i)= 1d0
            enddo
            cycle
          endif
        endif
        if( itergfs.ge.ninnergfs ) then
          if( myid.eq.0 .and. iprint.gt.0 ) then
            print *,'>>> itergfs exceeds fs_num_inner in QN in gFS.'
          endif
          if( f.gt.fpgfs .and. trim(cfs_xrefresh).eq.'random' &
               .and. nrefresh.le.maxfsrefresh ) then
            if( myid.eq.0 .and. iprint.gt.0 ) then
              print *,'>>> But refresh x and go back to BFGS, because current f > fpgfs.'
!!$              print *,'f,fpgfs=',f,fpgfs
            endif
            goto 10
          else if( f.gt.fpgfs ) then
!.....If f is greater than the previous value,
!.....selected symmetry function is probably not a good one, so exclude it
!.....in the following gFS steps.
            lexclude(igmm) = .true.
            mskgfs(igmm) = 1
            if( myid.eq.0 .and. iprint.gt.0 ) &
                 print *,'>>> Exclude the basis since f > fpgfs, igmm=',igmm
          endif
          exit
        endif
        nftol = 0

!.....BFGS treatment with reduced number of variables
        inc = 0
        s(:) = 0d0
        y(:) = 0d0
        do i=1,ndim
          ig = iglid(i)
          if( ig.gt.0 .and. mskgfs(ig).ne.0 ) cycle
          inc = inc + 1
          s(inc) = alpha *u(i)
          y(inc) = g(i) -gp(i)
        enddo
        nvar = inc
!!$        s(1:ndim)= alpha *u(1:ndim)
!!$        y(1:ndim)= g(1:ndim) -gp(1:ndim)
        ynorm= sprod(ndim,y,y)
        if( ynorm.lt.1d-14 ) then
          if( myid.eq.0 .and. iprint.gt.1 ) then
            print *,'>>> Initialize gg because y*y < 1d-14'
          endif
          gg(1:ndim,1:ndim)= 0d0
          do i=1,ndim
            gg(i,i)= 1d0
          enddo
          cycle
        endif

!.....update matrix gg
!!$        sy= sprod(ndim,s,y)
        sy= sprod(nvar,s,y)
        syi= 1d0/sy
!!$        do i=1,ndim
        do i=1,nvar
          tmp1= 0d0
!!$          tmp2= 0d0
!!$          do j=1,ndim
          do j=1,nvar
            tmp1= tmp1 +gg(j,i)*y(j)
          enddo
          ggy(i)= tmp1 *syi
        enddo
        b= 1d0
!!$        do i=1,ndim
        do i=1,nvar
          b=b +y(i)*ggy(i)
        enddo
        b= b*syi
!.....without temporary matrix aa
!!$        do j=1,ndim
!!$          do i=1,ndim
        do j=1,nvar
          do i=1,nvar
            gg(i,j)=gg(i,j) +s(j)*s(i)*b &
                 -(s(i)*ggy(j) +ggy(i)*s(j))
          enddo
        enddo
!!$        print *,'End of inner BFGS'
      enddo  ! End of inner BFGS

!!$      print '(a,i5,2es12.4)','iter,ftrn,fpgfs=',iter,f,fpgfs
      if( f.gt.fpgfs .and. .not.lexclude(igmm) ) then
        print *,'SOMETHING WRONG!!!'
        print *,'itergfs,ninnergfs=',itergfs,ninnergfs
      endif
      if( .not. lexclude(igmm) ) then
        fpgfs = f
        ftstp = ftst
        x(1:ndim)= xt(1:ndim)
        if( myid.eq.0 .and. iprint.gt.0 ) then
          print '(a,i6,2es15.6e3,i6)',' iter,ftrn,ftst,nbases=' &
               ,iter,f,ftst,nbases
        endif
        call sub_eval(iter)
      endif
    enddo

    if( myid.eq.0 .and. iprint.gt.0 ) &
         print *,'iter exceeds maxiter in gFS.'
    iflag= iflag +10
    x(1:ndim)= xt(1:ndim)
    return

  end subroutine gfs
!=======================================================================
  subroutine read_fsmask(cfname)
!
!  Read mask for gfs from file
!
    use descriptor,only: ngl,mskgfs
    use parallel
    implicit none
    character(len=*),intent(in):: cfname

    integer,parameter:: iomask = 40
    integer:: ndata,i,ierrcode,itmp,ig

    ierrcode = 0
    if( myid.eq.0 ) then
      print *,'Read mskgfs from '//trim(cfname)
      open(iomask,file=trim(cfname),status='old')
      read(iomask,*) ndata
      if( ndata.ne.ngl ) then
        ierrcode = 1
        close(iomask)
        goto 999
      endif
      do i=1,ndata
        read(iomask,*,err=99) ig, mskgfs(ig)
      enddo
      close(iomask)
    endif

999 continue
    itmp = 0
    call mpi_allreduce(ierrcode,itmp,1,mpi_integer,mpi_max &
         ,mpi_world,ierr)
    if( itmp.gt.0 ) then
      if( myid.eq.0 ) then
        if( itmp.eq.1 ) print *,'ERROR@read_fsmask: ndata.ne.ngl !'
        if( itmp.eq.2 ) print *,'ERROR@read_fsmask: something wrong with data !'
      endif
      call mpi_finalize(ierr)
    endif
    call mpi_bcast(mskgfs,ngl,mpi_integer,0,mpi_world,ierr)
    return

99  ierrcode = 2
    close(iomask)
    goto 999
  end subroutine read_fsmask
!=======================================================================
  subroutine write_fsmask(cfname)
    use parallel
    use descriptor,only: ngl,mskgfs
    implicit none 
    character(len=*),intent(in):: cfname

    integer,parameter:: iomask = 41
    integer:: i

    if( myid.eq.0 ) then
      open(iomask,file=trim(cfname),status='replace')
      write(iomask,'(i6)') ngl
      do i=1,ngl
        write(iomask,'(i7,i3)') i, mskgfs(i)
      enddo
      close(iomask)
    endif
    call mpi_barrier(mpi_world,ierr)
    return
  end subroutine write_fsmask
!=======================================================================
  subroutine cap_grad(ndim,g)
!
!  Set the ceiling of gradient to avoid too large gnorm.
!
    implicit none
    integer,intent(in):: ndim
    real(8),intent(inout):: g(ndim)
    real(8),parameter:: gmax= 1.0d0
    real(8):: gnorm
    
    gnorm= sqrt(sprod(ndim,g,g))

    if( gnorm.gt.gmax ) then
      g(1:ndim)= g(1:ndim) *gmax /gnorm
    endif
    return
  end subroutine cap_grad
!=======================================================================
  subroutine sa(ndim,xbest,fbest,xranges,xtol,gtol,ftol,maxiter &
       ,iprint,iflag,myid,func,cfmethod,niter_eval,sub_eval)
!
! Simulated Annealing
!
    use random
    implicit none
    integer,intent(in):: ndim,iprint,myid,maxiter,niter_eval
    integer,intent(inout):: iflag
    real(8),intent(in):: xtol,gtol,ftol,xranges(2,ndim)
    real(8),intent(inout):: fbest,xbest(ndim)
    character(len=*),intent(in):: cfmethod
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
      subroutine sub_eval(iter)
        integer,intent(in):: iter
      end subroutine sub_eval
    end interface

    integer:: iter,idim,nadpt,i
    real(8):: f,ft,temp,xw,dx,ptrans,ftst,tau
    real(8),allocatable:: x(:),xt(:)
    logical,save:: l1st = .true.

    if( l1st ) then
      if(.not.allocated(sa_xws) ) allocate(sa_xws(ndim))
      do i=1,ndim
!!$        sa_xws(i) = max(xbest(i)*sa_xw0,1d-2)
        sa_xws(i) = 1d-2
      enddo
      tau = max(sa_tau,1d0)
      if(myid.eq.0 .and. iprint.ne.0) then
        write(6,*) ''
        write(6,'(a)') '------------------------------------------------------------------------'
        write(6,'(a)') '                    Simulated annealing'
        write(6,'(a)') '------------------------------------------------------------------------'
        write(6,'(a,a)') ' Temperature control method = ',trim(sa_tctrl)
        if( sa_tctrl(1:3).eq.'exp' ) then
          write(6,'(a,f10.4)') ' Initial temperature = ',sa_temp0
          write(6,'(a,f10.4)') ' Relaxation iteration (tau) = ',tau
        else if( sa_tctrl(1:3).eq.'bes' ) then
          write(6,'(a,f8.1)') ' Division of fbest = ',sa_div_best
        else
          write(6,'(a,f10.4)') ' Initial temperature = ',sa_temp0
        endif
        print *,''
      endif
      
    endif

    if( .not.allocated(x) ) allocate(x(ndim),xt(ndim))

!.....Initialize
    x(1:ndim)= xbest(1:ndim)
    do idim=1,ndim
      if( x(idim).lt.xranges(1,idim) ) x(idim) = xranges(1,idim)
      if( x(idim).gt.xranges(2,idim) ) x(idim) = xranges(2,idim)
    enddo
    call func(ndim,x,f,ftst)
!!$    p = exp(-f/temp)
    if( f*0d0 .ne. 0d0 ) f = 1d+10
    fbest= f
    if( sa_tctrl(1:3).eq.'bes' ) then
      temp = fbest/sa_div_best
    else
      temp= max(sa_temp0,1d-8)
    endif
    xw= sa_xw0
    nadpt= 0

    iter= 0
    ft = 0d0
    idim = 0
    if( myid.eq.0 .and. iprint.ne.0 ) then
      write(6,'(a,2i10,4es13.5,2f9.5)')&
           ' iter,idim,temp,f,ft,fbest,ptrans,radpt='&
           ,iter,idim,temp,f,ft,fbest,ptrans,0d0
    endif
    
    call sub_eval(0)
!.....Main loop of random displacements
    do iter=1,maxiter

!.....Choose a parameter to be displaced
      idim= urnd()*ndim +1

!.....Compute the displacement using a uniform random number
      xt(1:ndim)= x(1:ndim)
      dx= (urnd()-0.5d0)*sa_xws(idim)
      xt(idim)= xt(idim) +dx
      if( xt(idim).lt.xranges(1,idim) ) xt(idim) = xranges(1,idim)
      if( xt(idim).gt.xranges(2,idim) ) xt(idim) = xranges(2,idim)

!.....Compute function value
      call func(ndim,xt,ft,ftst)
!.....Detect NaN and skip this trial
      if( ft*0d0 .ne. 0d0 ) then
        if( myid.eq.0 .and. iprint.ne.0 ) then
          write(6,'(a,2i10,es12.4,3es13.5,2f9.5)')&
               ' [ft.eq.NaN] iter,idim,sa_xws(idim)=' &
               ,iter,idim,sa_xws(idim)
        endif
!.....Decrease the width of deviation
        sa_xws(idim) = sa_xws(idim) *sa_fctr
        goto 10
      endif
!!$      pt = exp(-ft/temp)

!.....Compute probability of taking the displacement
      ptrans= min(1d0,exp(-(ft-f)/temp))
!!$      ptrans = min(1d0,pt/p)

!.....Store the best one
      if( ft.lt.fbest ) then
        fbest= ft
        xbest(1:ndim)= xt(1:ndim)
        if( sa_tctrl(1:3).eq.'bes' ) temp = fbest/sa_div_best
      endif

      if( mod(iter,niter_eval).eq.0 ) then
        call sub_eval(iter)
      endif

      if( myid.eq.0 .and. iprint.ne.0 ) then
        write(6,'(a,2i10,4es13.5,2f9.5)')&
             ' iter,idim,temp,f,ft,fbest,ptrans,radpt='&
             ,iter,idim,temp,f,ft,fbest,ptrans,dble(nadpt)/iter
        flush(6)
      endif
      
!.....Update the parameter if needed
      if( urnd().lt.ptrans ) then
        x(idim)= xt(idim)
        f= ft
        nadpt= nadpt +1
!.....Increase the width of deviation
        sa_xws(idim) = sa_xws(idim) /sa_fctr
      else
!.....Decrease the width of deviation
        sa_xws(idim) = sa_xws(idim) *sa_fctr
      endif

10    continue
      if( sa_tctrl(1:3).eq.'lin' ) then
!.....Update temperature (linear)
        temp= dble(maxiter-iter)/maxiter *sa_temp0
      else if( sa_tctrl(1:3).eq.'exp' ) then
!.....Update temperature (exponential)
        temp= sa_temp0 *exp(-dble(iter)/tau)
      endif
    enddo

    if( myid.eq.0 ) then
      write(6,'(a,i10,a,i10)') ' Num of adoption in SA='&
           ,nadpt,'/',maxiter
!!$      print *,'sa_xws:'
!!$      do i=1,ndim
!!$        write(6,'(i6,es15.7)') i,sa_xws(i)
!!$      enddo
    endif

!.....Finally compute the function value of the best candidate
    call func(ndim,xbest,f,ftst)

    l1st = .false.
    
  end subroutine sa
!=======================================================================
  subroutine penalty(cpena,ndim,fp,gp,x)
!
! Calculate penalty term and its derivative.
! lasso and ridge are available.
!
    use descriptor,only: glval,ngl,iglid
    implicit none
    character(len=*),intent(in):: cpena
    integer,intent(in):: ndim
    real(8),intent(in):: x(ndim)
    real(8),intent(out):: fp,gp(ndim)

    integer:: i,ig
    real(8):: absx,sgnx
    real(8),parameter:: xtiny  = 1d-14

    fp= 0d0
    gp(1:ndim)= 0d0
    if( trim(cpena).eq.'lasso' ) then
      do i=1,ndim
        absx= abs(x(i))
        fp= fp +pwgt*absx
        sgnx= sign(1d0,x(i))
        if( absx.gt.xtiny ) gp(i)= pwgt*sgnx
      enddo
    else if( trim(cpena).eq.'glasso' ) then
      glval(0:ngl)= 0d0
      do i=1,ndim
        ig= iglid(i)
        if(ig.gt.0) glval(ig)= glval(ig) +x(i)*x(i)
      enddo
      glval(0)= 1d0
      do ig=1,ngl
        glval(ig)= sqrt(glval(ig))
        fp= fp +pwgt*glval(ig)
      enddo
      do i=1,ndim
        ig= iglid(i)
        if( ig.eq.0 ) then ! i is not in a group
          absx= abs(x(i))
          sgnx= sign(1d0,x(i))
          if( absx.gt.xtiny ) gp(i)= pwgt*sgnx
          fp= fp +pwgt*absx
        else if( ig.gt.0 ) then ! i is in a group
          if( glval(ig).gt.xtiny) gp(i)= pwgt*x(i)/glval(ig)
        endif
      enddo
    else if( trim(cpena).eq.'ridge' ) then
      do i=1,ndim
        fp= fp +pwgt*x(i)*x(i)
        gp(i)= 2d0*pwgt*x(i)
      enddo
    endif

    return
  end subroutine penalty
!=======================================================================
  subroutine random_search(ndim,xbest,fbest,xranges,xtol,gtol,ftol,maxiter &
       ,iprint,iflag,myid,func,cfmethod,niter_eval,sub_eval)
!
!  Pure random search with variable range
!
    use random
    implicit none
    integer,intent(in):: ndim,iprint,myid,maxiter,niter_eval
    integer,intent(inout):: iflag
    real(8),intent(in):: xtol,gtol,ftol,xranges(2,ndim)
    real(8),intent(inout):: fbest,xbest(ndim)
    character(len=*),intent(in):: cfmethod
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
      subroutine sub_eval(iter)
        integer,intent(in):: iter
      end subroutine sub_eval
    end interface

    integer:: iter,idim
    real(8):: f,ftst,fmin,xmin,xmax,xi
    real(8),allocatable:: x(:)
    logical,save:: l1st = .true.

    if( l1st ) then
      if( .not. allocated(x) ) allocate(x(ndim))
      l1st = .false.
    endif

    x(1:ndim)= xbest(1:ndim)
    call func(ndim,x,f,ftst)
    if( fmin*0d0.ne.0d0 ) then  ! NaN
      fmin = 1.0d+10
    endif
    fmin = f
    if( myid.eq.0 .and. iprint.gt.0 ) &
         write(6,'(a,i8,2es15.7,100f7.3)') ' iter,f,fmin = ',0,f,fmin
    
    do iter=1,maxiter
      do idim=1,ndim
        xmin = xranges(1,idim)
        xmax = xranges(2,idim)
        xi = (xmax-xmin)*urnd() +xmin
        x(idim) = xi
      enddo

      call func(ndim,x,f,ftst)
      if( f*0d0.ne.0d0 ) then  !NaN
        f = 1.0d+10
      endif
      if( myid.eq.0 .and. iprint.gt.0 ) &
           write(6,'(a,i8,2es15.7,100f7.3)') ' iter,f,fmin = ',iter,f,fmin
      if( f.lt.fmin ) then
        if( myid.eq.0 .and. iprint.gt.0 ) then
          write(6,'(a,i8,2es15.7)') ' fmin is updated: iter,fmin,df= ' &
               ,iter,f,abs(f-fmin)
        endif
        fmin = f
        fbest = fmin
        xbest(1:ndim) = x(1:ndim)
        call sub_eval(iter)
      endif
    enddo

  end subroutine random_search
!=======================================================================
  subroutine metadynamics(ndim,xbest,fbest,xranges,xtol,gtol,ftol,maxiter &
       ,iprint,iflag,myid,func,cfmethod,niter_eval,sub_eval)
!
!  Metadynamics for minimum search
!  Use simulated annealing-like search for local minimum search
!  and once the local minimum is found, add a gaussian potential to
!  the minimum to make it possible to escape from the minimum.
!
    use random
    implicit none
    integer,intent(in):: ndim,iprint,myid,maxiter,niter_eval
    integer,intent(inout):: iflag
    real(8),intent(in):: xtol,gtol,ftol,xranges(2,ndim)
    real(8),intent(inout):: fbest,xbest(ndim)
    character(len=*),intent(in):: cfmethod
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
      subroutine sub_eval(iter)
        integer,intent(in):: iter
      end subroutine sub_eval
    end interface

    integer:: iter,idim,nadpt,i,ng,ig,interval
    real(8):: f,ft,temp,xw,dx,ptrans,ftst,tau,adx,&
         fg,fgt,tmp
    real(8),allocatable:: x(:),xt(:),xxt(:)
    logical,save:: l1st = .true.

    if( l1st ) then
      interval = max(maxiter/md_ng,1)
      if(.not.allocated(md_gp) ) allocate(md_gp(ndim,md_ng))
      md_gp(:,:) = 0d0
      if(.not.allocated(sa_xws) ) allocate(sa_xws(ndim))
      do i=1,ndim
!!$        sa_xws(i) = max(xbest(i)*sa_xw0,1d-2)
        sa_xws(i) = 1d-2
      enddo
      tau = max(sa_tau,1d0)
      if(myid.eq.0 .and. iprint.ne.0) then
        print *,''
        print *,'******************** Metadynamics ********************'
        print *,''
        write(6,'(a,i5)') ' Number of gaussians to be added = ',md_ng
        write(6,'(a,f10.4)') ' Gaussian height         = ',md_height
        write(6,'(a,f10.4)') ' Gaussian width (sigma)  = ',md_sigma
      endif
      l1st = .false.
    endif

    if( .not.allocated(x) ) allocate(x(ndim),xt(ndim),xxt(ndim))

!.....Initialize
    x(1:ndim)= xbest(1:ndim)
    do idim=1,ndim
      if( x(idim).lt.xranges(1,idim) ) x(idim) = xranges(1,idim)
      if( x(idim).gt.xranges(2,idim) ) x(idim) = xranges(2,idim)
    enddo
    call func(ndim,x,f,ftst)
!!$    p = exp(-f/temp)
    if( f*0d0.ne.0d0 ) f = 1.0d+10
    fbest= f
    fg = f
!!$    temp= sa_temp0
    temp= fbest/sa_div_best
    xw= sa_xw0
    nadpt= 0
    ng = 0

    call sub_eval(0)
!.....Main loop of random displacements
    do iter=1,maxiter

!.....Choose a parameter to be displaced
      idim= urnd()*ndim +1

!.....Compute the displacement using a uniform random number
      xt(1:ndim)= x(1:ndim)
      dx= (urnd()-0.5d0)*sa_xws(idim)
      xt(idim)= xt(idim) +dx
      if( xt(idim).lt.xranges(1,idim) ) xt(idim) = xranges(1,idim)
      if( xt(idim).gt.xranges(2,idim) ) xt(idim) = xranges(2,idim)

!.....Compute function value
      call func(ndim,xt,ft,ftst)
!.....Detect NaN and skip this trial
      if( ft*0d0 .ne. 0d0 ) then
        if( myid.eq.0 .and. iprint.ne.0 ) then
          write(6,'(a,2i10,es12.4,3es13.5,2f9.5)')&
               ' [ft.eq.NaN] iter,idim,sa_xws(idim)=' &
               ,iter,idim,sa_xws(idim)
        endif
!.....Decrease the width of deviation
        sa_xws(idim) = sa_xws(idim) *sa_fctr
        goto 10
      endif
!.....Add gaussian potentials to ft
      fgt = ft
      do ig=1,ng
        xxt(1:ndim) = xt(1:ndim)-md_gp(1:ndim,ig)
        adx = dot_product(xxt(1:ndim),xxt(1:ndim))
        tmp = md_height*exp(-adx/2/md_sigma**2)
        fgt = fgt +tmp
      enddo
      fg = f
      do ig=1,ng
        xxt(1:ndim) = x(1:ndim)-md_gp(1:ndim,ig)
        adx = dot_product(xxt(1:ndim),xxt(1:ndim))
        tmp = md_height*exp(-adx/2/md_sigma**2)
        fg = fg +tmp
      enddo
!!$      pt = exp(-ft/temp)

!.....Store the best one
      if( ft.lt.fbest ) then
        fbest= ft
        xbest(1:ndim)= xt(1:ndim)
        temp = fbest/sa_div_best
      endif

!.....Compute probability of taking the displacement
      ptrans= min(1d0,exp(-(fgt-fg)/temp))
!!$      ptrans = min(1d0,pt/p)

      if( mod(iter,niter_eval).eq.0 ) then
        call sub_eval(iter)
      endif

      if( myid.eq.0 .and. iprint.ne.0 ) then
        write(6,'(a,2i10,6es13.5,2f9.5,i5,es13.5)')&
             ' iter,idim,temp,f,ft,fbest,fg,fgt,ptrans,radpt,ng='&
             ,iter,idim,temp,f,ft,fbest,fg,fgt,ptrans,dble(nadpt)/iter,ng
      endif
      
!.....Update the parameter if adopted
      if( urnd().lt.ptrans ) then
        x(idim)= xt(idim)
        f= ft
        fg= fgt
        nadpt= nadpt +1
!.....Increase the width of deviation
        sa_xws(idim) = sa_xws(idim) /sa_fctr
      else
!.....Decrease the width of deviation
        sa_xws(idim) = sa_xws(idim) *sa_fctr
      endif

!.....Put a gaussian potential to the current variable position
      if( mod(iter,interval).eq.0 ) then
        ng = ng + 1
        if( ng.gt.md_ng ) then
          if( myid.eq.0 .and. iprint.gt.0 ) then
            write(6,'(a,i8)') ' Number of gaussian exceeds md_ng = ',md_ng
          endif
          exit
        endif
        md_gp(1:ndim,ng) = x(1:ndim)
      endif

10    continue
!!$!.....Update temperature (linear)
!!$      temp= dble(maxiter-iter)/maxiter *sa_temp0
!!$!.....Update temperature (exponential)
!!$      temp= sa_temp0 *exp(-dble(iter)/tau)

    enddo

    if( myid.eq.0 ) then
      write(6,'(a,i10,a,i10)') ' Num of adoption in SA in Metadynamics='&
           ,nadpt,'/',maxiter
    endif

!.....Finally compute the function value of the best candidate
    call func(ndim,xbest,f,ftst)

    
  end subroutine metadynamics
!=======================================================================
  subroutine ga(ndim,xbest,fbest,xranges,xtol,gtol,ftol,maxiter &
       ,iprint,iflag,myid,func,cfmethod,niter_eval,sub_eval,sub_ergrel)
!
! Genetic algorithm (GA) which does not use gradient information.
! GA itself is a serial code, but the function evaluation can be parallel.
!
    use random
    implicit none
    integer,intent(in):: ndim,iprint,myid,maxiter,niter_eval
    integer,intent(inout):: iflag
    real(8),intent(in):: xtol,gtol,ftol,xranges(2,ndim)
    real(8),intent(inout):: fbest,xbest(ndim)
    character(len=*),intent(in):: cfmethod
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
      subroutine sub_eval(iter)
        integer,intent(in):: iter
      end subroutine sub_eval
      subroutine sub_ergrel(cadd)
        character(len=*),intent(in):: cadd
      end subroutine sub_ergrel
    end interface

    integer:: i,j,iter,i1,i2
    real(8):: ftrn,ftst
    logical,save:: l1st = .true.
    integer:: iid,iidbest
    type(individual),allocatable:: indivs(:),offsprings(:)
    real(8),allocatable:: xtmp(:)
    character(len=128):: cadd

    integer,parameter:: io_indivs = 30
    character(len=128),parameter:: cf_indivs = 'out.ga.individuals'
    integer,parameter:: io_steps = 31
    character(len=128),parameter:: cf_steps = 'out.ga.generations'

    if( l1st ) then
!.....Initialize
      ga_ngenes = ndim
      if( ga_noffsp.le.0 ) then
        ga_noffsp = ga_nindivs
      endif
!.....Allocate necessary memory spaces
      allocate(indivs(ga_nindivs),offsprings(ga_noffsp))
      allocate(xtmp(ndim))
      do i=1,ga_noffsp
        allocate(offsprings(i)%genes(ga_ngenes))
        do j=1,ga_ngenes
          allocate(offsprings(i)%genes(j)%bits(ga_nbits))
        enddo
      enddo
      do i=1,ga_nindivs
        allocate(indivs(i)%genes(ga_ngenes))
        do j=1,ga_ngenes
          allocate(indivs(i)%genes(j)%bits(ga_nbits))
        enddo
      enddo
      if( myid.eq.0 .and. iprint.ne.0 ) then
        write(6,*) ''
        write(6,'(a)') '------------------------------------------------------------------------'
        write(6,'(a)') '                    Genetic Algorithm'
        write(6,'(a)') '------------------------------------------------------------------------'
        print '(a,i4)',' Number of individuals = ',ga_nindivs
        print '(a,i4)',' Number of genes       = ',ga_ngenes
        print '(a,i4)',' Number of bits        = ',ga_nbits
        print '(a,i4)',' Number of offsprings  = ',ga_noffsp
        print *,''
        call wrap_ranges(ndim,xbest,xranges)
      endif
      l1st = .false.
    endif

    if( myid.eq.0 ) then
      open(io_indivs,file=cf_indivs,status='replace')
      write(io_indivs,'(a)') '# iid, ftrn, ftst, vars...'
      open(io_steps,file=cf_steps,status='replace')
      write(io_steps,'(a)') '# iter, iid, ftrn, ftst'
    endif
100 format(i6,2es14.6,100es11.3)

    iter = 0
    
!.....Create population that includes some individuals
    iid = 0
    fbest = 1d+30
    do i=1,ga_nindivs
      do j=1,ga_ngenes
        indivs(i)%genes(j)%vmin = xranges(1,j)
        indivs(i)%genes(j)%vmax = xranges(2,j)
      enddo
      do j=1,ga_ngenes
        call var2gene(xbest(j),indivs(i)%genes(j))
      enddo
!.....Use a given X in case of i==1, otherwise mutate it with high rate.
      if( i.ne.1 ) then
        do j=1,ga_ngenes
          call mutate(indivs(i)%genes(j),0.25d0)
        enddo
      endif
      call indiv2vars(indivs(i),ndim,xtmp)
      call func(ndim,xtmp,ftrn,ftst)
      iid = iid + 1
      if( iprint.gt.1 ) print *,'iid,ftrn,ftst=',iid,ftrn,ftst
!.....Detect NaN and replace it with 1d+10
      if( ftrn*0d0 .ne. 0d0 ) then
        if( myid.eq.0 .and. iprint.ne.0 ) then
          write(6,'(a,2i10)')&
               ' [ftrn.eq.NaN] iter,iid = ',iter,iid
        endif
        ftrn = fupper_lim
      else if( ftrn.gt.fupper_lim ) then
        ftrn = fupper_lim
      endif
      indivs(i)%fvalue = ftrn
      indivs(i)%ftst = ftst
      indivs(i)%fitness = 1d0/ftrn
      indivs(i)%iid = iid
      if( myid.eq.0 ) write(io_indivs,100) iid,ftrn,ftst,xtmp(1:min(ndim,100))
      if( ftrn.lt.fbest ) then
        fbest = ftrn
        iidbest = iid
        xbest(1:ndim) = xtmp(1:ndim)
      endif
      if( i.eq.1 ) call sub_eval(iter)
    enddo

    if( myid.eq.0 ) then
      write(6,'(a,i8,1x,100es11.3)') &
           " iter,fbest,fvals= ",&
           iter,fbest,(indivs(i)%fvalue,i=1,min(ga_nindivs,10))
      do i=1,ga_nindivs
        write(io_steps,'(2i8,2es15.7)') iter, indivs(i)%iid, indivs(i)%fvalue &
             ,indivs(i)%ftst
      enddo
    endif

!.....GA loop starts....................................................
    do iter=1,maxiter

!.....Give birth some offsprings by crossover
!!$      print *,'giving birth...'
      do i=1,ga_noffsp
        i1 = int(urnd()*ga_nindivs) +1
10      i2 = int(urnd()*ga_nindivs) +1
        if( i1.eq.i2 ) goto 10
        call crossover(indivs(i1),indivs(i2),offsprings(i))
!.....Mutation of new-born babies
        do j=1,ga_ngenes
          call mutate(offsprings(i)%genes(j),ga_rate_mutate)
        enddo
!.....Evaluate the value of each new-born babies
        call indiv2vars(offsprings(i),ndim,xtmp)
        call func(ndim,xtmp,ftrn,ftst)
        iid = iid + 1
        if( iprint.gt.1 ) print *,'iid,ftrn,ftst=',iid,ftrn,ftst
!.....Detect NaN and replace it with 1d+10
        if( ftrn*0d0 .ne. 0d0 ) then
          if( myid.eq.0 .and. iprint.ne.0 ) then
            write(6,'(a,2i10)')&
                 ' [ftrn.eq.NaN] iter,iid = ',iter,iid
          endif
          ftrn = fupper_lim
        endif
        if( ftrn.gt.fupper_lim ) then
          ftrn = fupper_lim
        endif
        if( ftst.gt.fupper_lim ) then
          ftst = fupper_lim
        endif
        offsprings(i)%fvalue = ftrn
        offsprings(i)%ftst = ftst
        offsprings(i)%fitness = 1d0/ftrn
        offsprings(i)%iid = iid
        if( myid.eq.0 ) write(io_indivs,100) iid,ftrn,ftst,xtmp(1:min(ndim,100))
        if( ftrn.lt.fbest ) then
          fbest = ftrn
          iidbest = iid
          xbest(1:ndim) = xtmp(1:ndim)
          if( iprint.ge.2 ) then
            write(cadd,'(i0)') iid
            call sub_ergrel(cadd)
          endif
          call sub_eval(iid)
        endif
      enddo

!.....Selection
!!$      print *,'selecting...'
      call roulette_selection(ga_nindivs,indivs,ga_noffsp,offsprings,fbest)

      if( myid.eq.0 ) then
        write(6,'(a,i8,1x,100es11.3)') &
           " iter,fbest,fvals= ",&
           iter,fbest,(indivs(i)%fvalue,i=1,min(ga_nindivs,10))
        do i=1,ga_nindivs
          write(io_steps,'(2i8,2es15.7)') iter, indivs(i)%iid, indivs(i)%fvalue &
               ,indivs(i)%ftst
        enddo
        flush(io_indivs)
        flush(io_steps)
      endif
    end do
!.....END of GA loop....................................................

!.....Output information of the best
    if( myid.eq.0 ) then
      write(6,*) ''
      write(6,'(a)') ' The best one in this GA simulation run:'
      write(6,'(a,i0,2x,f0.4)') '   ID, f-value: ',iidbest,fbest
!!$      write(6,'(a,100f7.3)')  '   Variables: ',xbest(1:ndim)
      write(6,*) ''
    endif

!.....Just for outputing out.erg.fin.1 of the best one     
    call func(ndim,xbest,ftrn,ftst)
!!$    if( myid.eq.0 ) then
!!$      write(6,*) 'best one re-caluclated = ', ftrn
!!$    endif

    close(io_indivs)
    close(io_steps)
    return
  end subroutine ga
!=======================================================================
  subroutine bin2dec(nbit,bin,dec)
    integer,intent(in):: nbit
    integer(2),intent(in):: bin(nbit)
    integer,intent(out):: dec
    integer:: i

    dec = 0
    do i=1,nbit
      dec = dec +bin(i)*2**(i-1)
    end do
    return
  end subroutine bin2dec
!=======================================================================
  subroutine dec2bin(nbit,dec,bin)
    integer,intent(in):: nbit
    integer,intent(in):: dec
    integer(2),intent(out):: bin(nbit)
    integer:: i,idec

    idec = dec
    bin(1:nbit) = 0
    do i=1,nbit
      bin(i) = mod(idec,2)
      idec = idec /2
    enddo
    return
  end subroutine dec2bin
!=======================================================================
  subroutine make_pairs(num,pairs)
!
!  Make random pairs from NUM elements.
!
    use random
    integer,intent(in):: num
    integer,intent(out):: pairs(2,num/2)

    integer:: i,j,l,m,n,ival,jval
    integer:: chosen(num),navail

    chosen(1:num) = 0
    
    navail = num
    do n=1,num/2
      i = int(urnd()*navail) +1
10    j = int(urnd()*navail) +1
      if( j.eq.i ) goto 10
      l=0
      ival = 0
      jval = 0
      do m=1,num
        if( chosen(m).eq.0 ) then
          l=l+1
          if( l.eq.i ) then
            ival = m
            chosen(m) = 1
          else if( l.eq.j ) then
            jval = m
            chosen(m) = 1
          endif
          if( ival.ne.0 .and. jval.ne.0 ) then
            exit
          endif
        endif
      enddo
      pairs(1,n) = ival
      pairs(2,n) = jval
      navail = 0
      do m=1,num
        if( chosen(m).eq.0 ) then
          navail = navail + 1
        endif
      enddo
    enddo
    return
  end subroutine make_pairs
!=======================================================================
  subroutine mutate(g,rate)
    use random
    type(gene),intent(inout):: g
    real(8),intent(in):: rate
    integer:: i

    do i=1,ga_nbits
      if( urnd().lt.rate ) then
        g%bits(i) = mod(int(g%bits(i))+1,2)
      endif
    enddo
    return
  end subroutine mutate
!=======================================================================
  subroutine gene2var(g,v)
    type(gene),intent(in):: g
    real(8),intent(out):: v
    integer:: dec

    call bin2dec(ga_nbits,g%bits,dec)
    v = g%vmin +dble(dec)*(g%vmax-g%vmin)/(2**ga_nbits-1)
    return
  end subroutine gene2var
!=======================================================================
  subroutine wrap_gene(g)
!
!  Wrap gene into the given variable range.
!
    type(gene),intent(inout):: g
    real(8):: v

    call gene2var(g,v)
    v = min(v,g%vmax)
    v = max(v,g%vmin)
    g%val = v
    return
  end subroutine wrap_gene
!=======================================================================
  subroutine var2gene(v,g)
    real(8),intent(in):: v
    type(gene),intent(inout):: g
    integer:: dec
    
    dec = int((v -g%vmin)/(g%vmax-g%vmin)*(2**ga_nbits-1))
    call dec2bin(ga_nbits,dec,g%bits)
    return
  end subroutine var2gene
!=======================================================================
  subroutine indiv2vars(indiv,ndim,vars)
    type(individual),intent(in):: indiv
    integer,intent(in):: ndim
    real(8),intent(out):: vars(ndim)
    integer:: i
    
    do i=1,ndim
      call gene2var(indiv%genes(i),vars(i))
    enddo
    return
  end subroutine indiv2vars
!=======================================================================
  subroutine crossover(ind1,ind2,offspring)
!
!  Homogeneous crossover of two individuals to create an offspring
!  that has some similarities to the parents.
!
    use random
    type(individual),intent(in):: ind1,ind2
    type(individual),intent(inout):: offspring
    
    integer:: i,j
    type(gene):: g1,g2
    real(8):: v1,v2,r1
    
    do i=1,ga_ngenes
      g1 = ind1%genes(i)
      g2 = ind2%genes(i)
      v1 = ind1%fvalue
      v2 = ind2%fvalue
      r1 = log(v2+1d0)/(log(v1+1d0)+log(v2+1d0))
      do j=1,ga_nbits
        offspring%genes(i)%bits(j) = g1%bits(j)
        offspring%genes(i)%vmin = g1%vmin
        offspring%genes(i)%vmax = g1%vmax
        if( g1%bits(j).ne.g2%bits(j) .and. urnd() .gt. r1 ) then
          offspring%genes(i)%bits(j) = g2%bits(j)
        end if
      end do
      call wrap_gene(offspring%genes(i))
!!$      print '(a,i4,2x,100i1)','i,bits=',i,&
!!$           (mod(offspring%genes(i)%bits(j),10),j=1,ga_nbits)
    end do
    return
  end subroutine crossover
!=======================================================================
  subroutine roulette_selection(nindivs,indivs,noffsp,offsprings,fbest)
!
!  Select individuals that are alive in the next generation according to
!  their evaulation values.
!  Selected ones are returned as an INDIVS array.
!  The best one is always selected at first.
!
    use random
    integer,intent(in):: nindivs,noffsp
    type(individual),intent(inout):: indivs(nindivs)
    type(individual),intent(in):: offsprings(noffsp)
    real(8),intent(in):: fbest

    integer:: i,j,n,ibest
    integer:: islct(nindivs)
    real(8):: fbestl,prnd,ptot

    integer,save:: nall
    real(8),save,allocatable:: probs(:)
    logical,save:: l1st = .true.
    type(individual),save,allocatable:: tmp_indivs(:)
    real(8),parameter:: pmax = 1d+10

    if( l1st ) then
      nall = nindivs + noffsp
      allocate(probs(nall),tmp_indivs(nindivs))
      l1st = .false.
    endif

!.....Compute all the probabilities using func values and temperature
    n = 0
    ibest = 0
    fbestl = 1d+30
    do i=1,nindivs
      n = n + 1
      if( indivs(i)%fvalue.lt.fbestl ) then
        fbestl = indivs(i)%fvalue
        ibest = n
      endif
      probs(n) = indivs(i)%fitness
!!$      if( trim(ga_fitness).eq.'exp' ) then
!!$        probs(n) = exp(-(indivs(i)%fvalue-fbest)/ga_temp)
!!$      else if( trim(ga_fitness).eq.'inv' ) then
!!$        probs(n) = 1d0/indivs(i)%fvalue
!!$      endif
!!$      print *,'n,fvalue,prob=',n,indivs(i)%fvalue,probs(n)
    enddo
    do i=1,noffsp
      n = n + 1
      if( offsprings(i)%fvalue.lt.fbestl ) then
        fbestl = offsprings(i)%fvalue
        ibest = n
      endif
      probs(n) = indivs(i)%fitness
!!$      if( trim(ga_fitness).eq.'exp' ) then
!!$        probs(n) = exp(-(offsprings(i)%fvalue-fbest)/ga_temp)
!!$      else if( trim(ga_fitness).eq.'inv' ) then
!!$        probs(n) = 1d0/offsprings(i)%fvalue
!!$      endif
!!$      print *,'n,fvalue,prob=',n,offsprings(i)%fvalue,probs(n)
    enddo

!.....Select individuals
    islct(1) = ibest
    probs(ibest) = 0d0
    do i=2,nindivs
      ptot = 0d0
      do j=1,nall
        ptot = ptot + probs(j)
      enddo
      prnd = urnd()*ptot
      ptot = 0d0
      do j=1,nall
        ptot = ptot +probs(j)
        if( prnd.lt.ptot ) then
          islct(i) = j
          probs(j) = 0d0
          exit
        endif
      enddo
    enddo

!!$    print *,'islct:'
!!$    do i=1,ga_nindivs
!!$      print *,'i,islct(i)=',i,islct(i)
!!$    enddo

!.....Replace indivs elements with selected ones
    do i=1,nindivs
      j = islct(i)
      if( j.le.nindivs ) then
        tmp_indivs(i) = indivs(j)
      else
        j = j - nindivs
        tmp_indivs(i) = offsprings(j)
      endif
    enddo
    do i=1,nindivs
      indivs(i) = tmp_indivs(i)
    enddo
    return
  end subroutine roulette_selection
!=======================================================================
  subroutine de(ndim,xbest,fbest,xranges,xtol,gtol,ftol,maxiter &
       ,iprint,iflag,myid,func,cfmethod,niter_eval,sub_eval,sub_ergrel)
!
! Differential evolution (DE) which does not use gradient information.
! DE itself is a serial code, but the function evaluation can be parallel.
!
    use random
    implicit none
    integer,intent(in):: ndim,iprint,myid,maxiter,niter_eval
    integer,intent(inout):: iflag
    real(8),intent(in):: xtol,gtol,ftol,xranges(2,ndim)
    real(8),intent(inout):: fbest,xbest(ndim)
    character(len=*),intent(in):: cfmethod
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
      subroutine sub_eval(iter)
        integer,intent(in):: iter
      end subroutine sub_eval
      subroutine sub_ergrel(cadd)
        character(len=*),intent(in):: cadd
      end subroutine sub_ergrel
    end interface

    integer:: i,j,iter,ip,iq,ir,is
    real(8):: ftrn,ftst,fracl,fracg,lmdl,lmdg,w,fdiff,prob,ftbest&
         ,xtbest(ndim)
    logical,save:: l1st = .true.
    integer:: iid,iidbest,iidtbest,ibest,ibest0,iidmax
    type(individual),allocatable:: indivs(:),offsprings(:)
    real(8),allocatable,dimension(:):: xtmp,xi,xp,xq,xr,xs,xbestl,xbestg&
         ,xl,xg,xd
    real(8),allocatable:: xpbest(:,:)
    character(len=128):: cadd

    integer,parameter:: io_indivs = 30
    character(len=128),parameter:: cf_indivs = 'out.de.individuals'
    integer,parameter:: io_steps = 31
    character(len=128),parameter:: cf_steps = 'out.de.generations'

    if( l1st ) then
!.....Allocate necessary memory spaces
      allocate(indivs(de_nindivs),offsprings(de_nindivs))
      allocate(xtmp(ndim),xi(ndim),xp(ndim),xq(ndim),xr(ndim),xs(ndim)&
           ,xbestl(ndim),xbestg(ndim),xl(ndim),xg(ndim),xd(ndim))
      allocate(xpbest(ndim,de_nindivs))
      do i=1,de_nindivs
        allocate(indivs(i)%genes(ndim),offsprings(i)%genes(ndim))
      enddo
!.....Initialize
      fracg = de_frac
      fracl = fracg
      if( de_lambda.le.0d0 ) de_lambda = de_frac
      lmdg = de_lambda
      lmdl = lmdg
      if( myid.eq.0 .and. iprint.ne.0 ) then
        write(6,*) ''
        write(6,'(a)') '------------------------------------------------------------------------'
        write(6,'(a)') '                          Differential Evolution'
        write(6,'(a)') '------------------------------------------------------------------------'
        print '(a,i4)',' Number of individuals = ',de_nindivs
        print '(a,f8.4)',' Fraction              =',de_frac
        print '(a,f8.4)',' Crossover rate        =',de_cross_rate
!!$        print '(a,f8.4)',' fracg                 =',fracg
!!$        print '(a,f8.4)',' fracl                 =',fracl
        print '(a,f8.4)',' frac                  =',de_frac
        if( trim(de_algo).eq.'local_neighbor' ) then
          print '(a,f8.4)',' wmin                  =',de_wmin
          print '(a,f8.4)',' wmax                  =',de_wmax
          print '(a,f8.4)',' lmdg                  =',lmdg
          print '(a,f8.4)',' lmdl                  =',lmdl
        endif
        print '(a,es12.4)',' Pseudo Temperature    =',de_temp

        print *,''
      endif
      l1st = .false.
    endif

    if( myid.eq.0 ) then
      open(io_indivs,file=cf_indivs,status='replace')
      write(io_indivs,'(a)') '# iid, ftrn, ftst, vars...'
      open(io_steps,file=cf_steps,status='replace')
      write(io_steps,'(a)') '# iter, iid, ftrn, ftst'
    endif
10  format(i6,2es14.6,100es11.3)

    iter = 0

!.....Create population that includes some individuals
    iid = 0
    fbest = 1d+30
    ftbest = 1d+30
    do i=1,de_nindivs
      do j=1,ndim
        indivs(i)%genes(j)%vmin = xranges(1,j)
        indivs(i)%genes(j)%vmax = xranges(2,j)
        if( i.eq.1 ) then
          indivs(i)%genes(j)%val = xbest(j)
        else
          indivs(i)%genes(j)%val = xranges(1,j) + &
               (xranges(2,j)-xranges(1,j))*urnd()
        endif
      enddo
      do j=1,ndim
        xtmp(j) = indivs(i)%genes(j)%val
      enddo
      call func(ndim,xtmp,ftrn,ftst)
      iid = iid + 1
      iidmax = max(iidmax,iid)
!.....Detect NaN and replace it with 1d+10
      if( ftrn*0d0 .ne. 0d0 ) then
        if( myid.eq.0 .and. iprint.ne.0 ) then
          write(6,'(a,2i10)')&
               ' [ftrn.eq.NaN] iter,iid = ',iter,iid
        endif
        ftrn = fupper_lim
      endif
      if( ftrn.gt.fupper_lim ) then
        ftrn = fupper_lim
      endif
      indivs(i)%fvalue = ftrn
      indivs(i)%ftst = ftst
      if( ftrn*0d0.ne.0d0 ) then
        indivs(i)%fitness = 0d0
      else
        indivs(i)%fitness = 1d0/ftrn
      endif
      indivs(i)%iid = iid
      if( myid.eq.0 ) write(io_indivs,10) iid,ftrn,ftst,xtmp(1:min(ndim,100))
      if( ftrn.lt.fbest ) then
        fbest = ftrn
        iidbest = iid
        ibest = iid
        xbest(1:ndim) = xtmp(1:ndim)
      else if( ftst.lt.ftbest ) then
        ftbest = ftst
        iidtbest = iid
        xtbest(1:ndim) = xtmp(1:ndim)
      endif

      if( iprint.ge.2 ) then
        write(cadd,'(i0)') iid
        call sub_ergrel(cadd)
      endif
      if( i.eq.1 ) call sub_eval(iter)
    enddo
    w = de_wmin + (de_wmax -de_wmin)*dble(iter)/maxiter
    if( maxiter.eq.0 ) w = de_wmin

    if( myid.eq.0 ) then
      if( trim(de_algo).eq.'local_neighbor' ) then
        write(6,'(a,i8,es12.4,f5.2,1x,100es12.4)') &
             " iter,fbest,w,fvals= ",&
             iter,fbest,w,(indivs(i)%fvalue,i=1,min(de_nindivs,10))
      else  ! classical DE
        write(6,'(a,i8,es12.4,1x,100es12.4)') &
             " iter,fbest,fvals= ",&
             iter,fbest,(indivs(i)%fvalue,i=1,min(de_nindivs,10))
      endif
      do i=1,de_nindivs
        write(io_steps,'(2i8,2es15.7)') iter, indivs(i)%iid, indivs(i)%fvalue &
             ,indivs(i)%ftst
      enddo
    endif

!.....DE loop starts....................................................
    do iter=1,maxiter

      w = de_wmin + (de_wmax -de_wmin)*dble(iter)/maxiter
      call make_global_best(de_nindivs,indivs,ndim,xbestg)
      ibest0 = ibest

!.....Loop for individuals
      do i=1,de_nindivs

        do j=1,ndim
          xi(j) = indivs(i)%genes(j)%val
        enddo

        if( trim(de_algo).eq.'local_neighbor' ) then

!.....Create a local vector
          ip = i+1
          if( ip.gt.de_nindivs ) ip = ip - de_nindivs
          iq = i-1
          if( iq.le.0 ) iq = iq + de_nindivs
          do j=1,ndim
            xp(j) = indivs(ip)%genes(j)%val
            xq(j) = indivs(iq)%genes(j)%val
          enddo
          xbestl(1:ndim) = 0d0
          call make_local_best(indivs(i),indivs(ip),indivs(iq),ndim,xbestl)
!!$        xl(1:ndim) = xi(1:ndim) +lmdl*(xbestl(1:ndim)-xi(1:ndim)) &
!!$             +fracl*(xp(1:ndim)-xq(1:ndim))
          xl(1:ndim) = xi(1:ndim) +lmdl*(xbestl(1:ndim)-xi(1:ndim)) &
               +urnd()*(xp(1:ndim)-xq(1:ndim))
!!$        print '(a,8es12.4)','xi,xp,xq,xbestl,fi,fp,fq,xl=' &
!!$             ,xi(1),xp(1),xq(1),xbestl(1)&
!!$             ,indivs(i)%fvalue,indivs(ip)%fvalue,indivs(iq)%fvalue,xl(1)
!.....Create a global vector
100       ir = int(urnd() *de_nindivs) +1
          if( ir.eq.i ) goto 100
110       is = int(urnd() *de_nindivs) +1
          if( is.eq.i .or. is.eq.ir ) goto 110
          do j=1,ndim
            xr(j) = indivs(ir)%genes(j)%val
            xs(j) = indivs(is)%genes(j)%val
          enddo
!!$        xg(1:ndim) = xi(1:ndim) +lmdg*(xbestg(1:ndim)-xi(1:ndim)) &
!!$             +fracg*(xr(1:ndim)-xs(1:ndim))
          xg(1:ndim) = xi(1:ndim) +lmdg*(xbestg(1:ndim)-xi(1:ndim)) &
               +urnd()*(xr(1:ndim)-xs(1:ndim))
!!$        print '(a,100f7.3)','xg(1:ndim)=',xg(1:ndim)
!.....Make the donor vector from the local and global vectors
          xd(1:ndim) = w*xg(1:ndim) +(1d0-w)*xl(1:ndim)

!.....Classical DE
        else
200       ip = int(urnd()*de_nindivs) +1
          if( ip.eq.i ) goto 200
210       ir = int(urnd()*de_nindivs) +1
          if( ir.eq.i .or. ir.eq.ip ) goto 210
220       is = int(urnd()*de_nindivs) +1
          if( is.eq.i .or. is.eq.ip .or. is.eq.ir )  goto 220
          do j=1,ndim
            xp(j) = indivs(ip)%genes(j)%val
            xr(j) = indivs(ir)%genes(j)%val
            xs(j) = indivs(is)%genes(j)%val
          enddo
!!$        xg(1:ndim) = xi(1:ndim) +lmdg*(xbestg(1:ndim)-xi(1:ndim)) &
!!$             +fracg*(xr(1:ndim)-xs(1:ndim))
!!$          xd(1:ndim) = xp(1:ndim) +de_frac *(xr(1:ndim)-xs(1:ndim))
          xd(1:ndim) = xp(1:ndim) +urnd()*de_frac *(xr(1:ndim)-xs(1:ndim))
        endif  ! de_algo

        iid = iid + 1
!.....Make a new candidate by the crossover of xd and xi
        do j=1,ndim
          if( urnd().lt.de_cross_rate ) then
            xtmp(j) = xd(j)
          else
            xtmp(j) = xi(j)
          endif
        enddo
        do j=1,ndim
          xtmp(j) = max(xtmp(j),indivs(i)%genes(j)%vmin)
          xtmp(j) = min(xtmp(j),indivs(i)%genes(j)%vmax)
!!$          offsprings(i)%genes(j)%val = xtmp(j)
        enddo
!!$        offsprings(i)%iid = iid
        call func(ndim,xtmp,ftrn,ftst)
!!$        offsprings(i)%fvalue = ftrn
!.....Detect NaN and replace it with fupper_lim
!!$        if( ftrn*0d0.ne.0d0 ) then
!!$          offsprings(i)%fitness = 0d0
!!$        else
!!$          offsprings(i)%fitness = 1d0/ftrn
!!$        endif
!!$        print *,'i,fvalue,ftrn=',i,indivs(i)%fvalue,ftrn
        fdiff = ftrn -indivs(i)%fvalue
        prob = min(1d0,exp(-fdiff/de_temp))
!!$        if( ftrn.le.indivs(i)%fvalue .or. indivs(i)%fvalue*0d0.ne.0d0 ) then
        if( urnd().le.prob .or. indivs(i)%fvalue*0d0.ne.0d0 ) then
          if( i.eq.ibest0 .and. fdiff.gt.0d0 ) cycle
          do j=1,ndim
            indivs(i)%genes(j)%val = xtmp(j)
          enddo
          indivs(i)%iid = iid
          if( ftrn.gt.fupper_lim ) ftrn = fupper_lim
          indivs(i)%fvalue = ftrn
          indivs(i)%ftst = ftst
          indivs(i)%fitness = 1d0/ftrn
        else
          cycle
        endif
        if( myid.eq.0 ) write(io_indivs,10) iid,ftrn,ftst,xtmp(1:min(ndim,100))
        if( ftrn.lt.fbest ) then
          fbest = ftrn
          iidbest = iid
          ibest = i
          xbest(1:ndim) = xtmp(1:ndim)
          if( iprint.ge.2 ) then
            write(cadd,'(i0)') iid
            call sub_ergrel(cadd)
          endif
          call sub_eval(iid)
        else if( ftst.lt.ftbest ) then
          ftbest = ftst
          iidtbest = iid
          xtbest(1:ndim) = xtmp(1:ndim)
          xtmp(1:ndim) = xbest(1:ndim)
          xbest(1:ndim) = xtbest(1:ndim)
          call sub_eval(iid)
          xbest(1:ndim) = xtmp(1:ndim)
        endif
      enddo  ! loop over individuals

      if( myid.eq.0 ) then
        write(6,'(a,i8,es12.4,1x,100es12.4)') &
             " iter,fbest,fvals= ",&
             iter,fbest,(indivs(i)%fvalue,i=1,min(de_nindivs,10))
        do i=1,de_nindivs
          write(io_steps,'(2i8,2es15.7)') iter, indivs(i)%iid, indivs(i)%fvalue &
               ,indivs(i)%ftst
        enddo
        flush(io_indivs)
        flush(io_steps)
      endif
    enddo
!.....DE loop ends......................................................

!.....Output information of the best
    if( myid.eq.0 ) then
      write(6,*) ''
      write(6,'(a)') ' The best one in this DE simulation run:'
      write(6,'(a,i8,1x,f0.4)') '   ID, f-value: ',iidbest,fbest
!!$      write(6,'(a,100f7.3)') '   Variables: ', xbest(1:min(ndim,100))
      write(6,*) ''
    endif

!.....Just for outputing out.erg.fin.1 of the best one     
    call func(ndim,xbest,ftrn,ftst)

    close(io_indivs)
    close(io_steps)
    return
  end subroutine de
!=======================================================================
  subroutine make_local_best(ind0,ind1,ind2,ndim,xbestl)
    type(individual),intent(in):: ind0,ind1,ind2
    integer,intent(in):: ndim
    real(8),intent(out):: xbestl(ndim)

    integer:: i
    real(8):: allf
    
    xbestl(1:ndim) = 0d0
    allf = ind0%fitness +ind1%fitness +ind2%fitness
    do i=1,ndim
      xbestl(i) = xbestl(i) + (&
           ind0%genes(i)%val *ind0%fitness  &
           +ind1%genes(i)%val *ind1%fitness  &
           +ind2%genes(i)%val *ind2%fitness ) /allf
    enddo
    return
  end subroutine make_local_best
!=======================================================================
  subroutine make_global_best(nindivs,indivs,ndim,xbestg)
    integer,intent(in):: nindivs,ndim
    type(individual),intent(in):: indivs(nindivs)
    real(8),intent(out):: xbestg(ndim)

    integer:: i,j
    real(8):: allf

    xbestg(1:ndim) = 0d0
    allf = 0d0
    do i=1,nindivs
      allf = allf +indivs(i)%fitness
    enddo
    do i=1,ndim
      do j=1,nindivs
        xbestg(i) = xbestg(i) +indivs(j)%fitness *indivs(j)%genes(i)%val
      enddo
      xbestg(i) = xbestg(i) /allf
    enddo
    return
  end subroutine make_global_best
!=======================================================================
  subroutine pso(ndim,xbest,fbest,xranges,xtol,gtol,ftol,maxiter &
       ,iprint,iflag,myid,func,cfmethod,niter_eval,sub_eval)
!
! Particle Swarm Optimization (PSO).
! DE itself is a serial code, but the function evaluation can be parallel.
!
    use random
    implicit none
    integer,intent(in):: ndim,iprint,myid,maxiter,niter_eval
    integer,intent(inout):: iflag
    real(8),intent(in):: xtol,gtol,ftol,xranges(2,ndim)
    real(8),intent(inout):: fbest,xbest(ndim)
    character(len=*),intent(in):: cfmethod
    interface
      subroutine func(n,x,ftrn,ftst)
        integer,intent(in):: n
        real(8),intent(in):: x(n)
        real(8),intent(out):: ftrn,ftst
      end subroutine func
      subroutine sub_eval(iter)
        integer,intent(in):: iter
      end subroutine sub_eval
    end interface

    integer:: i,j,iter
    real(8):: ftrn,ftst,xj,vj,r1,r2,w
    logical,save:: l1st = .true.
    integer:: iid,iidbest
    type(individual),allocatable:: indivs(:)
    real(8),allocatable:: xtmp(:),xpbest(:,:),fpbest(:)

    integer,parameter:: io_indivs = 30
    character(len=128),parameter:: cf_indivs = 'out.pso.individuals'
    integer,parameter:: io_fvalues = 31
    character(len=128),parameter:: cf_fvalues = 'out.pso.fvalues'
    integer,parameter:: io_steps = 32
    character(len=128),parameter:: cf_steps = 'out.pso.generations'

    if( l1st ) then
!.....Allocate necessary memory spaces
      allocate(indivs(pso_nindivs))
      allocate(xtmp(ndim),fpbest(pso_nindivs),xpbest(ndim,pso_nindivs))
      do i=1,pso_nindivs
        allocate(indivs(i)%genes(ndim),indivs(i)%vel(ndim))
      enddo
      if( myid.eq.0 .and. iprint.ne.0 ) then
        write(6,*) ''
        write(6,'(a)') '------------------------------------------------------------------------'
        write(6,'(a)') '                   Particle Swarm Optimization (PSO)'
        write(6,'(a)') '------------------------------------------------------------------------'
        print '(a,i4)',  ' Number of individuals = ',pso_nindivs
        print '(a,f8.4)',' w                     =',pso_w
        print '(a,f8.4)',' C1                    =',pso_c1
        print '(a,f8.4)',' C2                    =',pso_c2
        print '(a,es12.4)',' fval upper limit      =',fupper_lim
        print *,''
      endif
      l1st = .false.
    endif

    if( myid.eq.0 ) then
      open(io_indivs,file=cf_indivs,status='replace')
      write(io_indivs,'(a)') '# iid, ftrn, ftst, vars...'
      open(io_fvalues,file=cf_fvalues,status='replace')
      write(io_fvalues,'(a)') '# iter, iid, fvals...'
      open(io_steps,file=cf_steps,status='replace')
      write(io_steps,'(a)') '# iter, iid, ftrn, ftst'
    endif
10  format(i6,2es14.6,100es11.3)

    iter = 0

!.....Create population that includes some individuals
    iid = 0
    fbest = 1d+30
    do i=1,pso_nindivs
      do j=1,ndim
        indivs(i)%genes(j)%vmin = xranges(1,j)
        indivs(i)%genes(j)%vmax = xranges(2,j)
        if( i.eq.1 ) then
          indivs(i)%genes(j)%val = xbest(j)
        else
          indivs(i)%genes(j)%val = xranges(1,j) + &
               (xranges(2,j)-xranges(1,j))*urnd()
        endif
        indivs(i)%vel(j) = pso_vinimax*(urnd()-0.5d0)
      enddo
      do j=1,ndim
        xtmp(j) = indivs(i)%genes(j)%val
      enddo
      call func(ndim,xtmp,ftrn,ftst)
      iid = iid + 1
!.....Detect NaN and replace it with 1d+10
      if( ftrn*0d0 .ne. 0d0 ) then
        if( myid.eq.0 .and. iprint.ne.0 ) then
          write(6,'(a,2i10)')&
               ' [ftrn.eq.NaN] iter,iid = ',iter,iid
        endif
        ftrn = fupper_lim
      else if( ftrn.gt.fupper_lim ) then
        ftrn = fupper_lim
      endif
      indivs(i)%fvalue = ftrn
      indivs(i)%ftst = ftst
      fpbest(i) = ftrn
      xpbest(1:ndim,i) = xtmp(1:ndim)
      if( ftrn*0d0.ne.0d0 ) then
        indivs(i)%fitness = 0d0
      else
        indivs(i)%fitness = 1d0/ftrn
      endif
      indivs(i)%iid = iid
      if( myid.eq.0 ) write(io_indivs,10) iid,ftrn,ftst,xtmp(1:min(ndim,100))
      if( ftrn.lt.fbest ) then
        fbest = ftrn
        iidbest = iid
        xbest(1:ndim) = xtmp(1:ndim)
      endif
      if( i.eq.1 ) call sub_eval(iter)
!!$      if( myid.eq.0 ) print *,'myid,iid,fval,pbest=',myid,iid,ftrn,fpbest(i)
    enddo
    if( myid.eq.0 ) then
      write(6,'(a,i8,es12.4,1x,100es12.4)') &
           " iter,fbest,fvals= ",&
           iter,fbest,(indivs(i)%fvalue,i=1,min(ndim,10))
      write(io_fvalues,'(2i8, 100es12.4)') iter, indivs(i)%iid, &
           (indivs(i)%fvalue,i=1,pso_nindivs)
      do i=1,pso_nindivs
        write(io_steps,'(2i8,2es15.7)') iter, indivs(i)%iid, indivs(i)%fvalue,&
             indivs(i)%ftst
      enddo
    endif
!!$    if( myid.eq.0 ) print *,'myid,iidbest,fbest=',myid,iidbest,fbest
    
!.....PSO loop starts....................................................
    do iter=1,maxiter


!.....Loop for individuals
      do i=1,pso_nindivs

!.....Update velocity and position
        r1 = urnd()
        r2 = urnd()
        do j=1,ndim
          xj = indivs(i)%genes(j)%val
          vj = indivs(i)%vel(j)
          w = indivs(i)%genes(j)%vmax -indivs(i)%genes(j)%vmin
          if( w.lt.0d0 ) then
            indivs(i)%vel(j) = 0d0
            xtmp(j) = indivs(i)%genes(j)%vmax
            indivs(i)%genes(j)%val = xtmp(j)
            cycle
          endif
          indivs(i)%vel(j) = vj &
               +pso_c1*r1*( xpbest(j,i) -xj ) &
               +pso_c2*r2*( xbest(j) -xj )
          if( abs(indivs(i)%vel(j)).gt.w/5 ) &
               indivs(i)%vel(j) = sign(w/5,indivs(i)%vel(j))
          xtmp(j) = xj +indivs(i)%vel(j)
!.....Make sure the range of xtmp and if hit the wall,
!     make its velocity oposite direction.
!!$          xtmp(j) = max(xtmp(j),indivs(i)%genes(j)%vmin)
!!$          xtmp(j) = min(xtmp(j),indivs(i)%genes(j)%vmax)
          if( xtmp(j).lt.indivs(i)%genes(j)%vmin ) then
            indivs(i)%vel(j) = -indivs(i)%vel(j)
            xtmp(j) = 2d0*indivs(i)%genes(j)%vmin -xtmp(j)
          else if( xtmp(j).gt.indivs(i)%genes(j)%vmax ) then
            indivs(i)%vel(j) = -indivs(i)%vel(j)
            xtmp(j) = 2d0*indivs(i)%genes(j)%vmax -xtmp(j)
          endif
          indivs(i)%genes(j)%val = xtmp(j)
        enddo
        call func(ndim,xtmp,ftrn,ftst)
        iid = iid + 1
!.....Detect NaN and replace it with 1d+10
        if( ftrn*0d0 .ne. 0d0 ) then
          ftrn = fupper_lim
        endif
        if( ftrn.gt.fupper_lim ) then
          ftrn = fupper_lim
        endif
        if( ftst.gt.fupper_lim ) then
          ftst = fupper_lim
        endif
        indivs(i)%fvalue = ftrn
        indivs(i)%iid = iid
        if( myid.eq.0 ) write(io_indivs,10) iid,ftrn,ftst,xtmp(1:min(ndim,100))
!.....Check the global best and update if needed
        if( ftrn.lt.fbest ) then
          fbest = ftrn
          iidbest = iid
          xbest(1:ndim) = xtmp(1:ndim)
          call sub_eval(iid)
        endif
!.....Check the particle best and update if needed
        if( ftrn.lt.fpbest(i) ) then
          fpbest(i)= ftrn
          xpbest(1:ndim,i) = xtmp(1:ndim)
        endif

!!$        if( myid.eq.0 ) print *,'myid,iid,fval,pbest=',myid,iid,ftrn,fpbest(i)
      enddo  ! loop over individuals
!!$      if( myid.eq.0 ) print *,'myid,iidbest,fbest=',myid,iidbest,fbest

      if( myid.eq.0 ) then
        write(6,'(a,i8,es12.4,1x,100es12.4)') &
             " iter,fbest,fvals= ",&
             iter,fbest,(indivs(i)%fvalue,i=1,min(ndim,10))
        write(io_fvalues,'(2i8, 100es12.4)') iter, indivs(i)%iid, &
             (indivs(i)%fvalue,i=1,pso_nindivs)
        do i=1,pso_nindivs
          write(io_steps,'(2i8,2es15.7)') iter, indivs(i)%iid, indivs(i)%fvalue &
               ,indivs(i)%ftst
        enddo
        flush(io_indivs)
        flush(io_steps)
        flush(io_fvalues)
      endif
    enddo
!.....DE loop ends......................................................

!.....Output information of the best
    if( myid.eq.0 ) then
      write(6,*) ''
      write(6,'(a)') ' The best one in this PSO simulation run:'
      write(6,'(a,i8,1x,f0.4)') '   ID, f-value: ',iidbest,fbest
!!$      write(6,'(a,100f7.3)')  '   Variables: ',xbest(1:ndim)
      write(6,*) ''

!!$      do i=1,pso_nindivs
!!$        print '(a,i4,f10.1,1x,100f9.4)', ' i,fpbest,xpbest =' ,i,fpbest(i),&
!!$             xpbest(1:ndim,i)
!!$      enddo
      print *,''
    endif

!.....Just for outputing out.erg.fin.1 of the best one     
    call func(ndim,xbest,ftrn,ftst)

    close(io_indivs)
    close(io_fvalues)
    close(io_steps)
    return
  end subroutine pso
!=======================================================================
  
end module
!-----------------------------------------------------------------------
! Local Variables:
! compile-command: "make fitpot"
! End:
