#!/usr/bin/env perl
#============================================================================
# name - esma_mpirun
# purpose - Wrapper for the mpirun command; filters out command-line flags
#           which are not applicable for the loaded version of mpi, and loads
#           other flags required for the operating environment.
#
# note -
# 1. Any flag which has not been identified as potentially needing 
#    to be filtered is passed directly through to the mpirun command.
# 2. Specifically, the -h (or -help) flag is passed through to the
#    mpirun command.
#
# revision history
# 02Oct2008  Stassi     Initial version of code.
# 23Jun2009  Stassi     Add $xtraflags for pleiades nodes
# 03May2010  Todling    Quick fix to work under mvapich2
# 13May2010  Stassi     Extended fix to work on NAS nodes as well as NCCS
# 27Jul2010  Stassi     Modified to work with either intel-mpi or mvapich
# 18May2011  Stassi     Capture status of cmd and die if not success
# 18May2016  Thompson   Add support for MPT
# 24Sep2018  Thompson   Add support for Open MPI and MPICH3, remove Intel MPI
#                       section at NAS, add detection for GMAO desktop, remove
#                       references to Scali MPI
# 25Sep2018  Thompson   SLURM doesn't always provide SLURM_NTASKS_PER_NODE so
#                       you can't always make Intel MPI depend on it. Add
#                       detection of needed environment variable so that
#                       perhost on Intel MPI + SLURM works as expected.
# 27Sep2018  Thompson   Rewrite the which_mpi_cmd code so that it is more
#                       specific based on MPI stack rather than on if the
#                       command exists on the system. Add a new sub to check
#                       if you are on slurm batch for NCCS MPT. Removed
#                       mpdboot bits as that is not used anymore.
# 03Oct2018  Thompson   Remove code for MPT + SLURM + Interactive thanks
#                       to Virginie Buchard who made me explore it a bit
#                       deeper. The correct fix is an environment variable
#                       which is not something to be set in esma_mpirun. But
#                       a fat comment is added for future reference.
# 03Oct2018  Thompson   Add support for HPC-X and treat as Open MPI
# 20Aug2020  Thompson   Assume Open MPI to start (for CI, etc.)
#============================================================================
use strict;
use warnings;

# global variables
#-----------------
my ($node, $mpi_type, $perhost);
my ($progx, @mpiARGs, @progARGs);
my ($mpicmd, $xtraflags);
my ($scriptname,$scriptbasename);

# main program
#-------------
{
    init();
    get_mpi_type();

    parse_command_line();
    which_mpi_cmd();
    get_xtraflags();

    run_mpicmd();
}

#============================================================================
# name - init
# purpose - get runtime inputs
#
#============================================================================
sub init {
    use FindBin;
    use Getopt::Long;
    use File::Basename;
    use Getopt::Long qw(:config no_auto_abbrev pass_through);

    # get runtime flags
    #------------------
    GetOptions("perhost=i" => \$perhost);

    $scriptname = $0;
    $scriptbasename = basename($0);
}

#============================================================================
# name - get_mpi_type
# purpose - try to determine the type of mpi library
#============================================================================
sub get_mpi_type {
    my ($MPIHOME);

    # Assume that the MPI stack will be openmpi (useful for CI, AWS, etc.)
    # This will be overridden below if a match occurs, but if not, this
    # will assume openmpi
    $mpi_type = "openmpi";

    # At NCCS, Open MPI modules can define M_MPI_ROOT, so we cannot
    # depend on that as always good. Instead, we'll use the specific
    # environment variables that seem valid. Note MPT_VERSION is defined
    # at NCCS, but not at NAS, so that should failover to MPI_ROOT

    # check MPI specific env variable
    #--------------------------------
    if    ( $ENV{"MVAPICH"}    )             { $mpi_type = "mvapich2" }
    elsif ( $ENV{"I_MPI_ROOT"} )             { $mpi_type = "intel"    }
    elsif ( $ENV{"OPENMPI"}    )             { $mpi_type = "openmpi"  }
    elsif ( $ENV{"MPICH"}      )             { $mpi_type = "mpich"    }
    elsif ( $ENV{"MPT_VERSION"})             { $mpi_type = "mpt"      }
    elsif ( $ENV{"HPCX_HOME"}  )             { $mpi_type = "openmpi"  }

    # check $MPIHOME env variable
    #----------------------------
    elsif ( $MPIHOME = $ENV{"MPIHOME"} ) {
        if    ( $MPIHOME =~ m[mvapich2] )    { $mpi_type = "mvapich2" }
        elsif ( $MPIHOME =~ m[/intel/mpi/] ) { $mpi_type = "intel"    }
        elsif ( $MPIHOME =~ m[openmpi] )     { $mpi_type = "openmpi"  }
        elsif ( $MPIHOME =~ m[mpich] )       { $mpi_type = "mpich"    }
    }

    # check $MPI_HOME env variable
    #-----------------------------
    elsif ( $MPIHOME = $ENV{"MPI_HOME"} ) {
        if    ( $MPIHOME =~ m[mvapich2] )    { $mpi_type = "mvapich2" }
        elsif ( $MPIHOME =~ m[/intel/mpi/] ) { $mpi_type = "intel"    }
        elsif ( $MPIHOME =~ m[openmpi] )     { $mpi_type = "openmpi"  }
        elsif ( $MPIHOME =~ m[mpich] )       { $mpi_type = "mpich"    }
    }

    # check $MPI_ROOT env variable
    #-----------------------------
    elsif ( $MPIHOME = $ENV{"MPI_ROOT"} ) {
        if    ( $MPIHOME =~ m[mvapich2] )    { $mpi_type = "mvapich2" }
        elsif ( $MPIHOME =~ m[/intel/mpi/] ) { $mpi_type = "intel"    }
        elsif ( $MPIHOME =~ m[/sgi/mpi/] )   { $mpi_type = "mpt"      }
        elsif ( $MPIHOME =~ m[/hpe/mpt/] )   { $mpi_type = "mpt"      }
        elsif ( $MPIHOME =~ m[openmpi] )     { $mpi_type = "openmpi"  }
        elsif ( $MPIHOME =~ m[mpich] )       { $mpi_type = "mpich"    }
    }
    print "$scriptname: mpi_type = $mpi_type\n";
    return;
}

#============================================================================
# name - parse_command_line
# purpose - extract the executable program from the argument list;
#           divide all other arguments between those which belong to the
#           mpi command and those which belong to the executable program
#
# note -
# 1. arguments which precede the prog are @mpiARGs
# 2. arguments which follow the prog are @progARGs
#============================================================================
sub parse_command_line {
    my ($found, $num, $arg);

    # extract executable program from argument list
    #----------------------------------------------
    $progx = undef;
    $found = 0;

    $num = scalar(@ARGV);
    foreach (1..$num) {
        $arg = shift @ARGV;
        unless ($found) {
            if (-x $arg) {
                $progx = $arg;
                $found = 1;
                next;
            }
            push @mpiARGs, $arg;
            next;
        }
        push @progARGs, $arg
    }
    die ">> Error << no executable program found in mpi command" unless $progx;
}

#============================================================================
# name - isSLURMbatch
# purpose - determine if SLURM allocation is batch or interactive
#============================================================================
sub isSLURMbatch {
    my ($BatchFlag);

    if ($ENV{SLURM_JOBID}) {
       $BatchFlag = `scontrol show job $ENV{SLURM_JOBID} | grep -o 'BatchFlag=[[:alnum:]]' | cut -d= -f2`;
    }

    return int($BatchFlag);
}

#============================================================================
# name - which_mpi_cmd
# purpose - determine which mpi command to use
#============================================================================
sub which_mpi_cmd {
    use FindBin qw($Bin);
    use lib ("$Bin");
    use GMAO_utils qw(get_siteID);
    my ($progname, $status, $siteID);
    $siteID = get_siteID();

    # check for existence of mpirun, mpiexec, or mpiexec_mpt command
    #---------------------------------------------------------------
    $mpicmd = "notfound";

    if ($mpi_type eq 'mpt') {

       # NOTE: If you are running mpiexec_mpt at NCCS under SLURM interactive
       #       with MPT, you will *NOT* get the same behavior as under
       #       a batch job. To get the same behaviour, you must run with:
       #          
       #          setenv SLURM_DISTRIBUTION=block
       #
       #       This is currently done with both g5das.j and gcm_run.j
       
       chomp($mpicmd = `which mpiexec_mpt`);
    } elsif ($mpi_type eq 'mpich') {
       chomp($mpicmd = `which mpiexec`);
    } elsif ($mpi_type eq 'mvapich2') {
       # For now use mpirun_rsh. At NCCS this should be
       # srun -n but as -np is *external* to esma_mpirun
       # this requires wholesale script changes
       chomp($mpicmd = `which mpiexec.mpirun_rsh`);
    } else {
       # Used by both Intel MPI and Open MPI
       chomp($mpicmd = `which mpirun`);
    }

    # This might be superfluous, but a good check
    die ">>> Error <<< Cannot find command to run mpi" unless -e $mpicmd;

    return;
}

#============================================================================
# name - get_xtraflags
# purpose - set other flags needed by the node where executing.
#
# notes
# 1. This is mainly handling -perhost. At some point it will
#    be good to add something that can handle prepend rank flags
#============================================================================
sub get_xtraflags {
    use FindBin qw($Bin);
    use lib ("$Bin");
    use GMAO_utils qw(get_siteID);
    my ($cpus_per_node, $numnodes, $siteID, $label, $ignoreslurm, $respectplacement);

    $xtraflags = "";
    $siteID = get_siteID();

    # flags needed for mvapich2 mpi
    #------------------------------
    $xtraflags .= "-export" if $mpi_type eq "mvapich2";

    # flags needed for intel mpi
    #---------------------------
    if ($mpi_type eq "intel") {
       if ($perhost) {

          # Detect if we are on slurm on SLES 11
          # ------------------------------------
          if ($ENV{SLURM_JOB_ID} && ! -e "/etc/os-release") {

             #############################################################
             # Intel MPI on SLURM handles -perhost in a possibly         #
             # unexpected manner. When in a SLURM allocation, Intel      #
             # MPI by default obeys SLURM. If you ask for 97 tasks on    #
             # Haswell nodes that have 28 cores each, you will end up    #
             # with SLURM_TASKS_PER_NODE=25,24(x3) and Intel MPI will    #
             # put 24 tasks per node on 3 nodes and 25 on the other. If  #
             # you pass in mpirun -perhost 5 -np 20, you will end up     #
             # running all 20 processes on the first node. In order      #
             # to have -perhost work "as expected" one needs to tell     #
             # Intel MPI to ignore SLURM job placement by setting:       #
             # I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=disable.              #
             #                                                           #
             # As this is somewhat unexpected behavior, esma_mpirun will #
             # die unless it detects process placement is being ignored. #
             #############################################################

             # Intel kindly provides four ways to disable process
             # placement respecting. We must test for all four
             # --------------------------------------------------
             $respectplacement = lc $ENV{I_MPI_JOB_RESPECT_PROCESS_PLACEMENT};

             if ( $respectplacement eq 0     || $respectplacement eq "no"    ||
                  $respectplacement eq "off" || $respectplacement eq "disable" ) {
                $ignoreslurm = 1;
             } else {
                $ignoreslurm = 0;
             }

             # Check if we are ignoring SLURM
             # ------------------------------
             if ( $ignoreslurm ) {
                $xtraflags .= " -perhost $perhost ";
             } else {

                # Write a good error message about Intel MPI + SLURM perhost behavior
                # -------------------------------------------------------------------
                my $message = qq {
                >> Error << Intel MPI on SLURM with -perhost works differently
                than one would expect. Intel MPI will ignore -perhost
                unless I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=disable in the
                environment. This was not detected but -perhost was passed
                in which could lead to unexpected behavior. Please either
                disable I_MPI_JOB_RESPECT_PROCESS_PLACEMENT or reconsider
                how you have called $scriptbasename
                };

                # Unindent the above
                # ------------------
                $message =~ s/^\s+//gm;

                die "$message\n";
             }
          } else {
             # We cannot assume all cases of Intel MPI are on SLURM, so
             # pass through perhost, but be aware of unknown behavior.

             $xtraflags .= " -perhost $perhost";
          }
       }
    }

    # MPT can respect perhost as well. Note that it will
    # die if the perhost number does not divide evenly
    # --------------------------------------------------
    if ($mpi_type eq "mpt") {
       if ($perhost) {
          $xtraflags .= " -perhost $perhost";
       }
    }

    if ($mpi_type eq "openmpi") {

       if ($siteID eq "gmao") {
          $xtraflags .= " -oversubscribe";
       } elsif ($perhost) {
          $xtraflags .= " -map-by ppr:$perhost:node -bind-to core";
       }

    }

}

#============================================================================
# name - run_mpicmd
# purpose - run the mpi command
#============================================================================
sub run_mpicmd {
    use File::Basename;
    my ($flags, $cmd, $status);
    my ($PBS_NODEFILE, $num);

    $flags = "";
    $flags .= " $xtraflags"     if $xtraflags;
    $flags .= " @mpiARGs"       if @mpiARGs;

    $cmd = "$mpicmd $flags $progx @progARGs";
    my_system($cmd);
}

#============================================================================
# name - my _system
# purpose - print system command, then execute it and check its status
#============================================================================
sub my_system {
    my ($cmd, $status);

    $cmd = shift @_;
    print "$cmd\n";
    $status = system $cmd;

    if ($status) {
        $status = $status>>8;
        die ">> Error << $cmd: status = $status;";
    }
}
