#!/usr/bin/env perl
# 
# GSIDiag
#
#  27Oct2007 Todling  Created out of analyzer
#  06Feb2009 Todling  Update to new nomenclature of gsi output files
#  14Feb2009 Todling  Generalized determination of last loop index using gsi.rc file
#  22Mar2009 Todling  Add instruments handled by NCEP-Dec2008 version of GSI
#  11Dec2009 Owens/RT Add instruments from NOAA-16/19
#  06Oct2011 Todling  add suffix and catonly to allow ensemble obsvr to do its job
#  01Feb2012 Todling  remove pe files as diag files get "created"
#  09Feb2012 Meta/JS  Change ozone file name from mlsoz_aura to o3lev_aura
#  25May2012 Todling  Add instruments: seviri,cris,atms,and those in metop-b
#  06Jan2013 Guo      Send "ls: cannot access ..." messages to /dev/null when willdo!="set".
#  25May2013 Todling  Implement parallelization 
#  16Nov2013 Todling  Check all diag-types for consistency w/ obs-sys from MERRA2
#  01Oct2014 Stassi   Unwired intrument-platform names: see gsidiags.rc
#  09Oct2014 Weir     Begin adding support for trace gases
#  09Dec2016 McCarty  Add support for nc4 diag concatenation
#  13Feb2017 Todling  Set default for GSI binary output; revised env var names
#  02Aug2018 McCarty  Integrated bin2txt processing for pyradmon usage
#  27Oct2021 Todling  Removed redundancy in handling obs types; Revised parallelization. 
#-----------------------------------------------------------------------------------------------------

use Env;                 # make env vars readily available
use File::Basename;      # for basename(), dirname()
use File::Copy "cp";     # for cp()
use Getopt::Long;        # load module with GetOptions function
use Time::Local;         # time functions
use FindBin;             # so we can find where this script resides
use List::MoreUtils qw(uniq);

# look for perl packages in the following locations
#--------------------------------------------------
use lib ( "$FindBin::Bin", "$FVROOT/bin", "$ESMADIR/$ARCH/bin" );


my $scriptname = basename($0);

# Command line options

  GetOptions ( "log=s",  => \$log,
               "jiter=i" => \$jiter,
               "ncpus=i" => \$ncpus,
	       "tag=s",  => \$tag,
	       "suffix=s",  => \$suffix,
	       "rc=s",   => \$thisrc,
	       "ods",
	       "debug",
	       "catonly",
	       "verbose",
	       "subdirs",
               "h");

  usage() if $opt_h;

# Parse command line, etc

  init();

# Rename diag files

  lndiag_files($willdo);

# Generate ods files if requested

  d2ods() if ( $doODS );

# All done

# print "gsidiag: resulting files \n";
# $rc_ignore = system('ls -lrt');
  if ($rc==0) {
     print "$0: sucessfully completed.\n\n";
     exit(0);
  } else {
     print "$0: failed to collect diag files\n\n";
     exit(1);
  }


#......................................................................

sub init {

   if ( $#ARGV  <  3 ) {
     print STDERR " Missing arguments; see usage:\n";
     usage();
   } else {              # required command line args
     $nymd = $ARGV[0];
     $nhms = sprintf("%6.6d",$ARGV[1]);
     $expid = $ARGV[2];
     $willdo = $ARGV[3];
     $yyyy = substr($nymd,0,4);
     $mm   = substr($nymd,4,2);
     $dd   = substr($nymd,6,2);
     $hh   = substr($nhms,0,2);
     $nymdhh  = "${nymd}${hh}";
   }

# process options

   $rc    = 0;

# FVROOT is where the binaries have been installed
# ------------------------------------------------
   $fvroot  = $ENV{FVROOT};
   $fvwork  = $ENV{FVWORK}; if ( "$fvwork" eq "" ) { $fvwork = "./" };
   if ( $opt_ods ) {
        $doODS = 1;
   } else {
        $doODS = $ENV{D2ODS};
   }
   
   $mylog = ""; if ( $log ) {$mylog = $log};
  
   $jiter = -1 unless ($jiter);

   $do4dvar = 0;
   $final   = 0;
   if ( $jiter >= 0 ) { $do4dvar = 1 };
   $tag = sprintf("%3.3d",$jiter) unless ( $tag );

   if ( "$willdo" ne "set" &&  "$willdo" ne "unset" ) { 
     print " Option not allowed; see usage:\n";
     usage();
   }

  $MAX = 1;
  if ( defined($ncpus) ) {
      $MAX = $ncpus;
      if ($MAX < 1) { $MAX = 1; }
      else {
          $NCPUS = $ENV{"NCPUS"};
          if ($NCPUS) {
            if($MAX > $NCPUS) {
               $MAX = $NCPUS;
               print "Redefine number of processes used to NCPUS: $MAX \n";
            }
          }
      }
  }
  
  $lsubdirs = 0;
  if ($opt_subdirs) {
    $lsubdirs = 1;
  }

  $binary_diag = 1;
  $binary_diag = $ENV{GSI_BINARY_DIAG} if(defined $ENV{'GSI_BINARY_DIAG'});
  $netcdf_diag = 0;
  $netcdf_diag = $ENV{GSI_NETCDF_DIAG} if(defined $ENV{'GSI_NETCDF_DIAG'});
  $diag2txt    = 0;
  $diag2txt    = $ENV{GSI_DIAG2TXT} if(defined $ENV{'GSI_DIAG2TXT'}); 

# Welcome message
# ---------------
   print <<"EOF" ;

   -------------------------------------------------------
   GSIDiags -  Concatenate and rename GSI-diagnostic files
   -------------------------------------------------------

      Experiment id  : $expid
      Work directory : $fvwork
      Date           : $nymd
      Time           : $nhms
      NCPUS          : $MAX


Starting...

EOF

}

#......................................................................

sub lndiag_files {

  my ( $todo ) = @_;

  my($echorc_x, $gsidiags_rc);
  my(@loops, @satlist, @ozlist, @convlist, @tgaslist);
  my($var,$var1,$var2,$miter,$totiter);

  print "Entering gsidiag todo = $todo \n";
# $rc_ignore = system('ls -lrt');

# Build a hash variable for the gsi loops
# ---------------------------------------
  %loops    = ();
  if ($do4dvar) {
      $myiter = sprintf("%2.2d",$jiter);
      ($loops{$myiter}) = ($tag);
  } else {
      $var = `grep miter gsiparm.anl`;
      ($var1, $var2) = split(/miter/,$var);
      ($var1, $var2) = split(/=/,$var2);
      ($miter,$var2) = split(/,/,$var2);
      $totiter = sprintf("%2.2d",$miter + 1);
      print "setup_internal: Total number of iterations: $totiter \n";
      ($loops{"01"},$loops{"$totiter"}) = ("ges","anl");
  }

  $echorc_x = "$FVROOT/bin/echorc.x";
  if ($thisrc) {
     $gsidiags_rc = chomp($thisrc);
  } else {
     $gsidiags_rc = "$FVROOT/etc/gsidiags.rc";
  }

  @satlist = (split /\s+/, `$echorc_x -rc $gsidiags_rc satlist`);
  shift @satlist until $satlist[0];
  pop   @satlist until $satlist[-1];

  @ozlist  = (split /\s+/, `$echorc_x -rc $gsidiags_rc ozlist`);
  shift @ozlist until $ozlist[0];
  pop   @ozlist until $ozlist[-1];

  @convlist = (split /\s+/, `$echorc_x -rc $gsidiags_rc convlist`);
  shift @convlist until $convlist[0];
  pop   @convlist until $convlist[-1];

  @tgaslist= (split /\s+/, `$echorc_x -rc $gsidiags_rc tgaslist`);
  shift @tgaslist until $tgaslist[0];
  pop   @tgaslist until $tgaslist[-1];

# The following assumes the list of file types does not change for initial and final loops
# This seems to be a good assumption, that might only break in extremely rare cases
  my @keys = keys %loops;
  my $loop = $keys[-1];
  my @instrs = instruments($loop);
  if ($opt_verbose ) {print "@instrs \n";}

  @these = actual_instruments($loop,\@instrs,\@satlist);
  @satlist = @these;
  $ntypes = scalar @satlist;

  @these = actual_instruments($loop,\@instrs,\@ozlist);
  @ozlist = @these;
  $ntypes = $ntypes + scalar @ozlist;

  @these = actual_instruments($loop,\@instrs,\@convlist);
  @convlist = @these;
  $ntypes = $ntypes + scalar @convlist;

  @these = actual_instruments($loop,\@instrs,\@tgaslist);
  @tgaslist = @these;
  $ntypes = $ntypes + scalar @tgaslist;


  if ($ntypes < $MAX) {
      $MAX = 2*$ntypes;  # 2 given that we have to 2 out loops (needs care)
      print "gsidiag: reset MAX PEs to number of obs types: $MAX \n";
  }

  if ( $todo eq "set" ) {
    print " lndiag_files: Collecting diag files ... \n";
    handle_obs(\%loops,$diagtag,\@satlist,\@convlist,\@ozlist,\@tgaslist);

  } else { # clean up diag files
      print " lndiag_files: Cleaning diag files ... \n";

       my(@newlist);
       @newlist = @satlist;
       push(@newlist,@convlist);
       push(@newlist,@ozlist);
       push(@newlist,@tgaslist);
       foreach $sat (@newlist) {
             $files_tmp = `sh -c "ls diag_${sat}.* 2>/dev/null"`;
             chomp($files_tmp);
             @files = split(/\n/,$files_tmp);
             $filecnt=@files;  # how many files are there
             if ($filecnt > 0 ) {
                if ( -e "diag_${sat}.${nymd}${hh}" ) { unlink("diag_${sat}.${nymd}${hh}") };
             }
       }

  }


}
#....................................................................................
sub handle_obs {
  my ($loops,$diagtag,$satlist,$convlist,$ozlist,$tgaslist) = @_;

  my ($files_tmp);
  my ($newpid, $pid, @pidARR);

  print "handle_obs(sat)  $_\n" for @$satlist;
  print "handle_obs(conv) $_\n" for @$convlist;
  print "handle_obs(oz)   $_\n" for @$ozlist;
  print "handle_obs(tgas) $_\n" for @$tgaslist;

  my @types = ();
  if($binary_diag) {push @types, 'bin'};
  if($netcdf_diag) {push @types, 'nc4'};

  $nsats = scalar @$satlist;

  print "Load Balance MAX Processors: $MAX \n";
  while ( ($loop,$diagtag) = each(%loops) ) {
    print "gsidiag: inside sat loop for case: $loop $diagtag \n";

     $ncount = 0;
     foreach $sat (@$satlist,@$convlist,@$ozlist,@$tgaslist) {
        $ncount = $ncount + 1;

        print "$ncount $nsats\n";
        $do2txt = 1;
        if ($ncount > $nsats) {$do2txt=0;}

        foreach $ext (@types) {

           @pidARR = load_balance($MAX, @pidARR); # do not fork more than $MAX jobs
           defined($newpid=fork) or die ">> ERROR << Cannot fork: $!";
           if ($newpid > 0) { print "ObsCat: Forking $sat at $newpid \n"; } ;
           unless ($newpid) {

              #---------------#
              # child process #
              #---------------#
              if ($ext eq "bin" ) { oneobs_binary($sat,$loop,$diagtag,$do2txt);};
              if ($ext eq "nc4" ) { oneobs_netcdf($sat,$loop,$diagtag,$do2txt);};
              exit;

           } # unless

           #---------------#
           # adult process #
           #---------------#
           push @pidARR, $newpid;

       } # <ext-type>

     } # <sat-type>

  } # while

  # wait for forked jobs to complete
  #---------------------------------
  while (@pidARR) {
     $pid = shift @pidARR;
     waitpid($pid,0);
  }

} # <handle_sat>

#......................................................................
sub oneobs_binary {
        my ($sat, $loop, $diagtag, $do2txt) = @_;

        my($files_tmp,$cmd,$rc_ignore);
        my($this_diag_file);
        my($files);

        if ($opt_debug) { 
          print "bin: $loop $sat $do2txt \n"; 
          return(0);
        }

        $rc1 = system( "zeit_ci.x -r .gsidiag_zeit ${sat}.bin");

        if ($lsubdirs) {
          $files_tmp = `ls dir.*/${sat}_$loop`;
        } else {
          $files_tmp = `ls pe*.${sat}_$loop`;
        }
        chomp($files_tmp);
        @files = split(/\n/,$files_tmp);
        $files=@files;  # how many files are there
        if ($files > 0 ) {
           if ( $suffix ) {
                $this_diag_file = "diag_${sat}_${diagtag}.${nymd}${hh}${suffix}";
           } else {
                $this_diag_file = "diag_${sat}_$loop.${nymd}${hh}";
           }
           $cmd = "cat @files > $this_diag_file";
#          if ($opt_verbose) { print " $cmd\n" };
           $rc_ignore = system($cmd);
           if ( -z "${this_diag_file}" ) {
                unlink("${this_diag_file}");
           } else {
                if ( $opt_catonly ) {
                } else {
                   rename("${this_diag_file}","$fvwork/$expid.diag_${sat}_${diagtag}.${nymd}_${hh}z.bin");
                   if( "$loop" eq "01" ) {
                      Assignfn( "$fvwork/$expid.diag_${sat}_${diagtag}.${nymd}_${hh}z.bin","diag_${sat}.${nymd}${hh}");
                   }
                   $cmd = "/bin/rm @files";
#                  if ($opt_verbose) { print " $cmd\n" };
                   if ($do2txt and $diag2txt and not $netcdf_diag) {
                      $cmd = "$diag2txt $fvwork/$expid.diag_${sat}_${diagtag}.${nymd}_${hh}z.bin";
#                     if ($opt_verbose) { print " $cmd\n" };
                      $rc_ignore = system($cmd);
                   } 
                }
           }
        } # <files>

        $rc1 = system( "zeit_co.x -r .gsidiag_zeit ${sat}.bin");

} # <oneobs_binary>

sub oneobs_netcdf {
        my ($sat, $loop, $diagtag, $do2txt) = @_;

        my($files_tmp,$cmd,$rc_ignore);
        my($this_diag_file);
        my($files);

        if ($opt_debug) {
           print "nc4 $loop $sat  $do2txt\n";
           return(0);
        }

        $rc1 = system( "zeit_ci.x -r .gsidiag_zeit ${sat}.nc4");
        if ($lsubdirs) {
          $files_tmp = `ls dir.*/${sat}_$loop.nc4`;
        } else {
          $files_tmp = `ls pe*.${sat}_$loop.nc4`;
        }
        chomp($files_tmp);
        @files = split(/\n/,$files_tmp);
        $files=@files;  # how many files are there
        if ($files > 0 ) {
           if ( $suffix ) {
                $this_diag_file = "diag_${sat}_${diagtag}.${nymd}${hh}${suffix}";
           } else {
                $this_diag_file = "diag_${sat}_$loop.${nymd}${hh}.nc4";
           }
	   if ($files > 1) {
	       $cmd = "$FVROOT/bin/nc_diag_cat.x -o $this_diag_file @files";
#              if ($opt_verbose) { print " $cmd\n" };
	       $rc_ignore = system($cmd);
	   } else {
	       $cmd = "/bin/cp @files $this_diag_file";
#              if ($opt_verbose) { print " $cmd\n" };
	       $rc_ignore = system($cmd);	       
	   }
           if ( -z "${this_diag_file}" ) {
                unlink("${this_diag_file}");
           } else {
                if ( $opt_catonly ) {
                } else {
                   rename("${this_diag_file}","$fvwork/$expid.diag_${sat}_${diagtag}.${nymd}_${hh}z.nc4");
                   if( "$loop" eq "01" ) {
                      Assignfn( "$fvwork/$expid.diag_${sat}_${diagtag}.${nymd}_${hh}z.bin","diag_${sat}.${nymd}${hh}");
                   }
                   $cmd = "/bin/rm @files";
#                  if ($opt_verbose) { print " $cmd\n" };
                   if ($do2txt and $diag2txt) {
                      $cmd = "$diag2txt -nc4 $fvwork/$expid.diag_${sat}_${diagtag}.${nymd}_${hh}z.nc4";
#                     if ($opt_verbose) { print " $cmd\n" };
                      $rc_ignore = system($cmd);
                   }   
                }
           }
        } # <files>
        $rc1 = system( "zeit_co.x -r .gsidiag_zeit ${sat}.nc4");

} # <oneobs_netcdf>

#......................................................................
sub d2ods {
  chdir("$fvwork");
  $optods = " ";
  if($opt_ods) {$optods = "-o $opt_ods"};
  $cmd = "$fvroot/bin/diag2ods $optods $nymd $nhms $expid";
  print " $cmd\n";
  $rc = System($cmd, "$mylog","diag2ods");
}
#......................................................................
#
# Tick - advance date/time by nsecs seconds
#
#

sub tick {
    my ( $nymd, $nhms, $nsecs ) = @_;

    if("$nsecs" == "0" ) {
        return ($nymd, $nhms);
    }

    $yyyy1  = substr($nymd,0,4);
    $mm1    = substr($nymd,4,2);
    $dd1    = substr($nymd,6,2);

    $hh1 = 0 unless ( $hh1 = substr($nhms,0,2));
    $mn1 = 0 unless ( $mn1 = substr($nhms,2,2));
    $ss1 = 0 unless ( $ss1 = substr($nhms,4,2));
    $time1 = timegm($ss1,$mn1,$hh1,$dd1,$mm1-1,$yyyy1) + $nsecs;
    ($ss1,$mn1,$hh1,$dd1,$mm1,$yyyy1,undef,undef,undef) = gmtime($time1);

    $nymd = (1900+$yyyy1)*10000 + ($mm1+1)*100 + $dd1;
    $nhms = sprintf("%6.6d",$hh1*10000 + $mn1*100 + $ss1);
    return ($nymd, $nhms);

}

#=======================================================================
# name: load_balance
# purpose: If the number of child processes is at MAX or above, then
#          wait here until enough child processes complete to get the
#          total number under the limit.
#=======================================================================
sub load_balance {

    my ($MAX, @pidARR);
    my ($check_counter, $pid, $status);

    # get input parameters
    #---------------------
    $MAX = shift @_;
    @pidARR = @_;

    while (scalar(@pidARR) >= $MAX) {

        # loop through child processes
        #-----------------------------
        $check_counter = 0;
        while (1) {

            # check child process
            #---------------------------------
            # status equals 0   if still alive
            # status equals pid if complete
            # status equals -1  if not found
            #---------------------------------
            $pid = shift @pidARR;
            $status = waitpid($pid, WNOHANG);
            last if $status;

            # child process not complete
            #---------------------------
            push @pidARR, $pid;
            $check_counter++;

            # take one second breather before looping through again
            #------------------------------------------------------
            if ($check_counter >= $MAX) {
                sleep 1;
                $check_counter = 0;
            }
        }
    }
    return @pidARR if @pidARR;
}

#....................................................................................
sub instruments {
  my ($loop) = @_; 

  if ($opt_verbose) {
     print "instruments: $loop\n";
  }  
  my @allinstr = qw();
  $ic = 0;

  if ($lsubdirs) {
     my @files = glob("dir.*/*_${loop} dir.*/*_${loop}.nc4"); # must include both since lists differ
     foreach $this (@files) {
         chomp($this);
         ($pfx,$sfx) = split(/\//,$this);
         $first = $sfx;
         ($pfx,$sfx) = split(/_${loop}/,$first);
         push @allinstr, $pfx;
         $ic = $ic + 1;
     }
  } else {
     my @files = glob("pe*_${loop} pe*_${loop}.nc4"); # must include both since lists differ
     foreach $this (@files) {
         chomp($this);
         ($pfx,$sfx) = split(/\./,$this);
         $first = $sfx;
         ($pfx,$sfx) = split(/_${loop}/,$first);
         push @allinstr, $pfx;
         $ic = $ic + 1;
     }
  }
  @instr = uniq @allinstr;
# if ($opt_verbose) {
#    print  "@instr \n";
# }
}

#....................................................................................
sub actual_instruments {
  my ($loop,$only,$instruments) = @_; 
 
# if ($opt_verbose) {
#   print "actual_instruments: $_\n" for @$only;
#   print "actual_instruments: $_\n" for @$instruments;
#   print "actual_instruments: $loop\n";
# }

  @keep = ();
  foreach $itype ( @$instruments ) { 
    chomp($itype);
    foreach $ob (@$only) { # there has to be a more intelligent way to do this
       if ($ob eq $itype ) {
          push @keep, $itype;
          break;
       }
    }
  }
# print "keep @keep \n";

  return(@keep);
}
#....................................................................................
sub System {

    my ( $cmd, $logfile, $xname ) = @_;
    my ( @zname );

    open SAVEOUT, ">&STDOUT";  # save stdout
    open SAVEERR, ">&STDERR";  # save stderr

    open STDOUT, ">>$logfile" or die "can't redirect stdout";
    open STDERR, ">>$logfile" or die "can't redirect stderr";

    select STDERR; $| = 1;     # make it unbuffered
    select STDOUT; $| = 1;     # make it unbuffered

    @zname = split(" ", $cmd);
    if ( "$zname[0]" eq "mpirun" || "$zname[0]" eq "prun" ) {
      $rc1 = system( "zeit_ci.x -r $fvwork/.zeit $xname");
    } else {
      $rc1 = system( "zeit_ci.x -r $fvwork/.zeit $zname[0]");
    }

    $rc = system ( $cmd );     # run the shell command

    if ( "$zname[0]" eq "mpirun" || "$zname[0]" eq "prun" ) {
      $rc2 = system( "zeit_co.x -r $fvwork/.zeit $xname");
    } else {
      $rc2 = system( "zeit_co.x -r $fvwork/.zeit $zname[0]");
    }

    # Bitwise shift returns actual UNIX return code
    $exit_code = $rc >> 8;

    close STDOUT;
    close STDERR;

    open STDOUT, ">&SAVEOUT" ;  # restore stdout
    open STDERR, ">&SAVEERR" ;  # restore stdout

    return $exit_code;

  }

#......................................................................

sub Assign {

  my ( $fname, $lu ) = @_;

  $f77name = "fort.$lu";
  unlink($f77name) if ( -e $f77name ) ;
  symlink("$fname","$f77name");

}

sub Assignfn {

# Assignfn - assigns fn to given file name fname.
# fname = old file
# fn = new file (links to old)
  my ( $fname, $fn ) = @_;
  unlink($fn) if ( -e $fn ) ;
  symlink("$fname","$fn");

}

sub fullpath {

    my ( $fname ) = @_;

    $dirn = dirname("$fname");
    chomp($dirn = `pwd`) if ( "$dirn" eq "." ) ;
    $name = basename("$fname");
    $full = "$dirn/$name";
    return ($full);

  }

#......................................................................

sub usage {

   print <<"EOF";

NAME
     gsidiags - Collect and rename all diagnostic files from GSI
          
SYNOPSIS

     gsidiags [...options...] nymd nhms expid willdo
          
DESCRIPTION


     The following parameters are required 

     nymd     Year-month-day, e.g., 2002010  for 10 Jan 2002
     nhms     Hour-month-sec, e.g., 000000   for 00Z
     expid    Experiment id
     willdo   choose either set or unset

OPTIONS

     -h            prints this usage notice
     -jiter #      iteration number diag files were produced at
     -tag          diag files identifier, such as, ges, anl, or ans
     -ods          converts diag files to ODS
     -suffix name  adds a suffix to end of diag file names (as before link)
     -catonly      only cat pe-files into diag files (no-link)
     -log logfile  name of log file for this procedure 
     -subdirs      set when gsi writes individual PE files into specific dirs
     -verbose      echo extra info

NECESSARY ENVIRONMENT

OPTIONAL ENVIRONMENT

  ARCH             machine architecture, such as, Linux, AIX, etc
  D2ODS            when set, converts GSI diag output files to ods
  FVROOT           location of build's bin
  FVHOME           location of binaries must be defined
  FVWORK           working env must be defined
  GSI_BINARY_DIAG  on/off binary output (default: 1; on) 
  GSI_NETCDF_DIAG  on/off netcdf output (default: 0; off)
  GSI_DIAG2TXT     Execution path/flags to gsidiag_bin2txt.x
                      (default: unset, do not run bin2txt)

AUTHOR

     Ricardo Todling (Ricardo.Todling\@nasa.gov), NASA/GSFC/GMAO
     Last modified: 24Oct2021      by: R. Todling


EOF

  exit(1)

}
