#!/bin/sh
###############################################################################
##                                                                           ##
##           l  p  j  s  u  b  m  i  t  _  s  l  u  r  m                     ##
##                                                                           ##
##    sh script to generate and submit parallel SLURM jobs using             ##
##    Intel MPI.                                                             ##
##                                                                           ##
##    Usage: lpjsubmit [-class c] [-group g] [-wtime time] [-blocking n]     ##
##                     [-o output] [-e error] [-q] [-nocheck] ntask          ##
##                     [LPJargs...]                                          ##
##                                                                           ##
##    written by Werner von Bloh, PIK Potsdam                                ##
##                                                                           ##
##    Last change: $Date:: 2018-05-16 15:15:17 +0200 (Wed, 16 May 2018)    $ ##
##    By         : $Author:: herzfeld                        $               ##
##                                                                           ##
###############################################################################

# At least the number of tasks has to be given on the command line.
# The messages are quoted on purpose: unquoted, words such as [-q] or
# [-nocheck] are glob patterns and would be replaced by matching
# one-character file names found in the current directory.
if [ $# -lt 1 ]
then
  echo >&2 "Error: Number of tasks missing"
  echo >&2 "Usage: $0 [-class c] [-group g] [-wtime time] [-blocking n] [-o output] [-e error] [-q] [-nocheck] ntasks [args ...]"
  exit 1
fi

# LPJROOT has to be set to a non-empty value; otherwise report how to set
# it on stderr and stop with exit code 1.
[ -n "$LPJROOT" ] || {
  echo >&2 "Error: environment variable LPJROOT is not set"
  echo >&2 "Set by export LPJROOT=<path to lpjml directory>"
  exit 1
}

# Parse the leading command line options.
#
# All option variables are given their defaults first, so that none of them
# (in particular quiet and nocheck) is inherited from the caller's
# environment. Options may be given in any order; the order shown in the
# usage line still works. Parsing stops at the first word that is not a
# known option, which is left in $1.
#
#   -class c     -> class     (default "short")
#   -group g     -> group     (default empty)
#   -wtime time  -> wtime     (default empty)
#   -blocking n  -> blocking  (default "unlimited")
#   -o output    -> output    (default "lpjml.%j.out")
#   -e error     -> error     (default "lpjml.%j.err")
#   -q           -> quiet="-Q"
#   -nocheck     -> nocheck="-nocheck"

# Abort with an error message and the usage line unless an option is
# followed by its value.
#   $1  number of words left on the command line, including the option
#   $2  description of the missing value, used in the error message
lpjsubmit_need_value()
{
  if [ "$1" -lt 2 ]
  then
    echo >&2 "Error: $2 missing"
    echo >&2 "Usage: $0 [-class c] [-group g] [-wtime time] [-blocking n] [-o output] [-e error] [-q] [-nocheck] ntasks [args ...]"
    exit 1
  fi
}

class=short
group=""
wtime=""
blocking="unlimited"
output="lpjml.%j.out"
error="lpjml.%j.err"
quiet=""
nocheck=""

while [ $# -gt 0 ]
do
  # "$1" is quoted so that an empty word cannot break the comparison
  case "$1" in
    -class)
      lpjsubmit_need_value $# "class"
      class=$2
      shift 2
      ;;
    -group)
      lpjsubmit_need_value $# "group"
      group=$2
      shift 2
      ;;
    -wtime)
      lpjsubmit_need_value $# "wall clock time"
      wtime=$2
      shift 2
      ;;
    -blocking)
      lpjsubmit_need_value $# "blocking factor"
      blocking=$2
      shift 2
      ;;
    -o)
      lpjsubmit_need_value $# "output filename"
      output=$2
      shift 2
      ;;
    -e)
      lpjsubmit_need_value $# "error filename"
      error=$2
      shift 2
      ;;
    -q)
      quiet="-Q"
      shift 1
      ;;
    -nocheck)
      nocheck="-nocheck"
      shift 1
      ;;
    *)
      # first word that is not a known option: stop parsing
      break
      ;;
  esac
done
# After the options have been consumed, the number of tasks must still be
# left on the command line.
# The messages are quoted on purpose: unquoted, words such as [-q] or
# [-nocheck] are glob patterns and would be replaced by matching
# one-character file names found in the current directory.
if [ $# -lt 1 ]
then
  echo >&2 "Error: Number of tasks missing"
  echo >&2 "Usage: $0 [-class c] [-group g] [-wtime time] [-blocking n] [-o output] [-e error] [-q] [-nocheck] ntasks [args ...]"
  exit 1
fi

# The first remaining word is the number of tasks; everything after it is
# joined into one blank-separated string of runtime arguments for lpjml.
ntask="$1"
shift
args="$*"

# Check whether the LPJ configuration is valid; only then write the SLURM
# job control file slurm.jcf into the current directory and submit it.
#
# $quiet, $nocheck and $args are left unquoted on purpose: empty values have
# to vanish and $args has to be split into its individual words again.
# Paths built from $LPJROOT are quoted so that a directory name containing
# blanks does not break the commands.

if "$LPJROOT/bin/lpjcheck" $quiet $nocheck $args
then
  # yes, create SLURM job control file; everything printed inside the braces
  # goes to slurm.jcf through one redirection so that a failure to write the
  # file can be detected before anything is submitted
  {
    cat <<EOF
#!/bin/bash 
###############################################################################
##                                                                           ##
##                     s  l  u  r  m  .  j  c  f                             ##
##                                                                           ##
##  SLURM JCF file for running an Intel MPI job on the HLRS2015 cluster      ##
##  at PIK                                                                   ##
##                                                                           ##
##  Automatically generated by lpjsubmit shell script                        ##
##                                                                           ##
##  Created: $(date +"%d.%m.%Y")                                                      ##
##                                                                           ##
###############################################################################
 
#SBATCH --ntasks=$ntask
#SBATCH --qos=$class
#SBATCH -J LPJmL50
EOF
    # version text and $args stay unquoted so that any embedded whitespace
    # collapses to single blanks and the directive stays on one line
    echo "#SBATCH --comment=\"LPJmL Version" $(cat "$LPJROOT/VERSION") "args:" $args"\""
    cat <<EOF
#SBATCH --mail-user=$(whoami)
#SBATCH --mail-type=end
EOF
    # optional directives are only written if the matching option was given
    if [ "$wtime" != "" ]
    then
      echo "#SBATCH --time="$wtime
    fi
    if [ "$group" != "" ]
    then
      echo "#SBATCH --account="$group
    fi
    if [ "$blocking" != "unlimited" ]
    then
      echo "#SBATCH --tasks-per-node="$blocking
    fi
    # \$ keeps the variable reference literal in the job file; plain $ is
    # expanded now, while the file is being generated
    cat <<EOF
#SBATCH -o  $output
#SBATCH -e  $error

module load mpi/intel/5.1.3

export LPJROOT="$LPJROOT"

ulimit -c unlimited

export I_MPI_PMI_LIBRARY=/p/system/slurm/lib/libpmi.so

srun --propagate  "\$LPJROOT/bin/lpjml" $args

rc=\$?  # save return code of srun
exit \$rc # exit with return code
EOF
  } >slurm.jcf || {
    echo >&2 "Error: cannot write slurm.jcf, job not submitted"
    exit 1
  }
  # submit job
  if sbatch $quiet slurm.jcf
  then
    # show the user's queue unless quiet mode was requested
    if [ "$quiet" != "-Q" ]
    then
      squeue -u "$(whoami)"
    fi
  else
    exit 1
  fi
else
  echo >&2 "Error in LPJ configuration, job not submitted"
  exit 1
fi
