#!/bin/bash
#SBATCH --time=00:10:00
##SBATCH --time=18:00:00
#SBATCH --nodes=9
#SBATCH --tasks-per-node=100
#SBATCH --cpus-per-task=1
#SBATCH --partition=standard
#SBATCH --qos=short
##SBATCH --qos=standard
#SBATCH --chdir=../run

# record queueing time and start run timer
# TIMER_Q_START (epoch seconds) is read from the environment; when the chain
# resubmits itself it is passed in through sbatch --export.
TIMER_Q_END="$(date +%s)"
if [[ "$TIMER_Q_START" =~ ^[0-9]+$ ]]; then
  # 10# forces base 10 so a value with leading zeros is not read as octal
  TIMER_Q_ELAPSED=$(( TIMER_Q_END - 10#$TIMER_Q_START ))
else
  # without a usable submit time there is nothing to subtract from; keep the
  # log line well-formed rather than writing a blank value
  echo "TIMER_Q_START is unset or not a number; recording queue time as 0" >&2
  TIMER_Q_ELAPSED=0
fi
echo "Queue-time seconds $TIMER_Q_ELAPSED" >> times
TIMER_R_START="$(date +%s)"

# keep threaded system libraries single-threaded
OMP_NUM_THREADS=1
export OMP_NUM_THREADS

# request the maximum core clock speed
SLURM_CPU_FREQ_REQ=2250000
export SLURM_CPU_FREQ_REQ

# per-user scratch directory
TMPDIR="/work/n02/n02/$(whoami)/SCRATCH"
export TMPDIR

# seconds to hold back at the end of the job for the run restart
TIME_TO_LEAVE_S=180

# find out how long until scheduler terminates this job and calculate how long to run job

#######################################
# Convert a time string as printed by squeue into whole seconds.
# Arguments: $1 - time as days-hh:mm:ss, hh:mm:ss, mm:ss or ss
# Outputs:   the number of seconds on stdout; 0 when $1 is empty or is not
#            in one of those forms (for example UNLIMITED or NOT_SET)
#######################################
timeleft_to_seconds() {
  awk -v t="$1" 'BEGIN {
    if (t !~ /^([0-9]+-)?[0-9]+(:[0-9]+)?(:[0-9]+)?$/) { print 0; exit }
    days = 0
    if (split(t, dh, "-") == 2) {
      # with a day count the clock part starts at hours
      days = dh[1]
      n = split(dh[2], f, ":")
      secs = f[1] * 3600 + (n > 1 ? f[2] * 60 : 0) + (n > 2 ? f[3] : 0)
    } else {
      # without days the fields are right-aligned: ss, mm:ss or hh:mm:ss
      n = split(t, f, ":")
      secs = 0
      for (i = 1; i <= n; i++) secs = secs * 60 + f[i]
    }
    print days * 86400 + secs
  }'
}

# last line of the squeue output is the value; xargs trims the padding
TIME_LEFT_HMS=$(squeue -j "$SLURM_JOB_ID" -O TimeLeft | tail -n 1 | xargs)
TIME_LEFT_S=$(timeleft_to_seconds "$TIME_LEFT_HMS")

# hold back TIME_TO_LEAVE_S seconds for the restart handling after the run
RUN_TIME_LEFT_S=$(( TIME_LEFT_S - TIME_TO_LEAVE_S ))

# run the job for that long
# timeout exits with 124 when it had to stop the command. It treats a duration
# of 0 as "no limit", which would let the scheduler kill the job with no time
# left for the restart, so the model is only started for a positive whole
# number of seconds.
if [[ "$RUN_TIME_LEFT_S" =~ ^[0-9]+$ ]] && (( 10#$RUN_TIME_LEFT_S > 0 )); then
  timeout "$RUN_TIME_LEFT_S" srun --distribution=block:block --hint=nomultithread ./mitgcmuv

  # Get the exit code
  EXIT_CODE=$?
else
  echo "no run time available (RUN_TIME_LEFT_S='$RUN_TIME_LEFT_S'); model not started" >&2
  # 125 is the code timeout itself uses when it cannot run the command
  EXIT_CODE=125
fi

# record run time
TIMER_R_END="$(date +%s)"
TIMER_R_ELAPSED=$(( TIMER_R_END - TIMER_R_START ))
echo "Run-time seconds $TIMER_R_ELAPSED" >> times

# choose how to proceed, based on EXIT_CODE from the model run:
#   0     - the model finished: save the logs and send the results back
#   124   - timeout stopped the model: restart the chain from pickup.ckptA
#   other - the run failed: save the logs and send the results back

#######################################
# Print the timestep number stored in a pickup meta file.
# Arguments: $1 - path to the meta file
# Outputs:   the digits of the first line containing timeStepNumber;
#            nothing if the file cannot be read or has no such line
#######################################
read_pickup_niter0() {
  sed -n '/timeStepNumber/{s/[^0-9]//g;p;q;}' "$1" 2>/dev/null
}

#######################################
# Zero-pad a timestep number to ten digits.
# Arguments: $1 - non-negative integer, leading zeros allowed
# Outputs:   the padded number on stdout
#######################################
format_niter0() {
  # 10# forces base 10 so a value with leading zeros is not read as octal
  printf '%010d\n' "$(( 10#$1 ))"
}

#######################################
# End the chain: save a copy of the final stdout/stderr from the master node
# and submit the job that transfers results back to BAS.
# Globals: JOBNO (read), HECACC (read)
#######################################
end_chain() {
  cp STDERR.0000 stderr_9999999999
  cp STDOUT.0000 stdout_9999999999

  sbatch -J "THWR_$JOBNO" \
    --account "$HECACC" \
    --export=ALL \
    ../scripts/rsync_a2.sh
}

case "$EXIT_CODE" in
  0)
    echo job chain finished
    end_chain
    ;;

  124)
    echo job chain ran out of time, restarting from pickup.ckptA

    # read the timestep from the pickup meta file
    NITER0=$(read_pickup_niter0 pickup.ckptA.meta)

    if [[ "$NITER0" =~ ^[0-9]+$ ]]; then

      # update the line in data which sets niter0 and is uncommented
      # (the leading "\ " keeps the leading blank in sed's replacement text)
      NITER0_LINE="\ niter0 = $NITER0,"
      sed -i "/^ niter0/c $NITER0_LINE" data

      # update the line in data containing pickupSuff, whether or not it's commented (assumes only one!)
      PICKUP_LINE="\ pickupSuff = 'ckptA',"
      sed -i "/pickupSuff/c $PICKUP_LINE" data

      # save a copy of stdout/stderr from the master node
      NITER0FORMAT=$(format_niter0 "$NITER0")
      cp STDERR.0000 "stderr_$NITER0FORMAT"
      cp STDOUT.0000 "stdout_$NITER0FORMAT"

      # record queueing start time
      TIMER_Q_START="$(date +%s)"

      # submit the next job in the chain
      if cd ../scripts; then
        sbatch -J "THW_$JOBNO" \
               --account "$HECACC" \
               --export "HECACC=$HECACC,JOBNO=$JOBNO,TIMER_Q_START=$TIMER_Q_START" \
               run_repeat_a2.sh
      else
        echo "cannot cd to ../scripts; next job in the chain not submitted" >&2
      fi

    else

      # without a timestep, rewriting data would leave "niter0 = ," behind and
      # resubmit a broken chain, so stop here and treat it as a failure
      echo "cannot read timeStepNumber from pickup.ckptA.meta; chain not restarted" >&2
      end_chain

    fi
    ;;

  *)
    echo job chain failed, exit code $EXIT_CODE
    end_chain
    ;;
esac



