#!/bin/bash

# Abort early unless ACCEL_PROF_HOME is set: the profiler libraries and the
# git version lookup further down are resolved relative to it.
# The expansion is quoted so that a value containing spaces is tested as one
# word, and the diagnostics go to stderr.
if [[ -z "${ACCEL_PROF_HOME}" ]]; then
    echo "ACCEL_PROF_HOME is not set." >&2
    echo "Please set ACCEL_PROF_HOME to the root directory of this repo." >&2
    exit 1
fi

# Print the command-line usage text on stdout, then terminate the script
# with exit status 0.
help_func() {
    # Quoted delimiter: the text below is emitted literally, no expansion.
    cat <<'USAGE'
Description: A collection of CUDA application profilers.
Usage:
    -h, --help
        Print this help message.
    -t <tool_name>
        none: Do nothing.
        mem_trace: Trace memory access of CUDA kernels.
        app_metric: Collect metrics for CUDA Applications.
        code_check: Check CUDA code for potential issues.
        hot_analysis: Analyze hot memory regions.
        app_analysis: Analyze CUDA application performance.
        app_analysis_cpu: Analyze CUDA application performance on CPU.
        time_hotness_cpu: Analyze time hotness on CPU.
        uvm_advisor: Analyze UVM memory access patterns.
        event_trace: Trace AMD GPU events.
        event_trace_mgpu: Trace multi-GPU events.
    -d <device_name>
        Specify the device vendor name.
        Valid values:
            - nvc (NVIDIA Compute Sanitizer, default)
            - nvbit (NVIDIA nvbit)
            - rocm (AMD ROCm)
    -v
        Verbose mode.
USAGE
    exit 0
}

# Report a usage error and terminate the script.
# Arguments: $1 - message describing what is wrong with the invocation
# Outputs:   the message followed by a usage tip, written to stderr
# Exits:     always, with status 1
error_hint() {
    local error_message=$1
    cat >&2 <<EOF
${error_message}
Tip: Use "accelprof -h" for more usage description.
EOF
    exit 1
}

# for debugging
# set -x

# Parse the leading accelprof options. Parsing stops at the first argument
# that is not a recognised option; that argument and everything after it stay
# in the positional parameters as the command to profile.
# OPTION_STR accumulates the options as they were given.
OPTION_STR=""
while [[ -n "${1-}" ]]; do
    arg="$1"
    shift
    case "${arg}" in
        -t)
            # Reject a trailing -t instead of silently exporting an empty tool
            [[ -n "${1-}" ]] || error_hint "Option -t requires a tool name."
            export TOOL="$1"
            shift
            OPTION_STR="${OPTION_STR} -t ${TOOL}"
            ;;
        -d)
            [[ -n "${1-}" ]] || error_hint "Option -d requires a device name."
            export DEVICE="$1"
            shift
            OPTION_STR="${OPTION_STR} -d ${DEVICE}"
            ;;
        -v)
            export VERBOSE=1
            OPTION_STR="${OPTION_STR} -v"
            ;;
        -h | --help)
            # --help is advertised in the usage text; help_func prints the
            # usage and exits, so nothing after it runs.
            help_func
            ;;
        *)
            # Not an option: put it back and stop parsing
            set -- "${arg}" "$@"
            break
            ;;
    esac
done

# The positional parameters now hold the command to profile: $1 is the
# executable and ARGS is every remaining argument joined into one string.
EXECUTABLE=$1
ARGS="${*:2}"

# Both checks quote their operand so that a multi-word value is tested as a
# single string instead of being split into several test arguments.
if [[ -z "${EXECUTABLE}" ]]; then
    error_hint "Specify the executable to run."
fi

if [[ -z "${TOOL}" ]]; then
    error_hint "Specify tool via -t <tool_name>"
fi


# Results go to RESULT_DIR when the caller provides one, otherwise to the
# current working directory. The test is quoted so that a directory whose
# path contains spaces is honoured rather than replaced by the fallback.
if [[ -n "${RESULT_DIR}" ]]; then
    echo "Result directory: ${RESULT_DIR}"
else
    RESULT_DIR=$(pwd)
    export RESULT_DIR
fi

# Translate the -d selection into YOSEMITE_DEVICE. Any value other than the
# three known device names, including no selection at all, falls back to nvc.
case "${DEVICE}" in
    nvc | nvbit | rocm)
        export YOSEMITE_DEVICE="${DEVICE}"
        ;;
    *)
        export YOSEMITE_DEVICE=nvc
        ;;
esac

# Derive the application name that labels this run.
# For the python/python3 interpreter the name is the first word of ARGS with
# its directory part and .py suffix removed, and TORCH_PROFILE_ENABLED is set
# to 1. For any other executable the name is the executable without a
# leading "./", and TORCH_PROFILE_ENABLED is set to 0.
case "${EXECUTABLE}" in
    python | python3)
        # First space-separated word of ARGS, taken without globbing
        script_path=${ARGS%% *}
        YOSEMITE_APP_NAME=$(basename -- "${script_path}" .py)
        export YOSEMITE_APP_NAME
        export TORCH_PROFILE_ENABLED=1
        ;;
    *)
        export YOSEMITE_APP_NAME="${EXECUTABLE#./}"
        export TORCH_PROFILE_ENABLED=0
        ;;
esac
echo "${EXECUTABLE} ${ARGS}"

# Choose where the run log goes: in verbose mode it is a file named after the
# application inside RESULT_DIR, otherwise it is /dev/null.
# VERBOSE is quoted so that any non-empty value enables verbose mode.
if [[ -n "${VERBOSE}" ]]; then
    export OUTPUT_REDIRECT="${RESULT_DIR}/${YOSEMITE_APP_NAME}.accelprof.log"
    echo "stdout redirect: ${OUTPUT_REDIRECT}"
else
    export OUTPUT_REDIRECT=/dev/null
fi

# Profile the application with the specified tool.
# TOOL must be one of the names listed below; it is then exported unchanged
# as YOSEMITE_TOOL_NAME. Any other value aborts through error_hint.
case "${TOOL}" in
    code_check | mem_trace | app_metric | hot_analysis | uvm_advisor | \
    app_analysis | app_analysis_cpu | time_hotness_cpu | \
    event_trace | event_trace_mgpu | none)
        export YOSEMITE_TOOL_NAME="${TOOL}"
        ;;
    *)
        error_hint "Invalid tool name."
        ;;
esac
echo "Running the ${YOSEMITE_TOOL_NAME} tool for ${YOSEMITE_APP_NAME}..."


# Check the version of accelprof: commit_hash is the HEAD commit of the
# checkout at ACCEL_PROF_HOME, and modified is 1 when `git status --porcelain`
# reports anything there, 0 otherwise.
commit_hash="$(git -C "${ACCEL_PROF_HOME}" rev-parse HEAD)"
modified=0
[[ -z "$(git -C "${ACCEL_PROF_HOME}" status --porcelain)" ]] || modified=1

# Pick the profiler library for the selected device. This happens before the
# log header is written so that the header reports the library that is
# actually used instead of always naming the Compute Sanitizer one.
case "${YOSEMITE_DEVICE}" in
    nvbit)
        LD_PRELOAD_LIB="${ACCEL_PROF_HOME}/lib/libnv-nvbit.so"
        ;;
    rocm)
        LD_PRELOAD_LIB="${ACCEL_PROF_HOME}/lib/librocm_callback.so"
        ;;
    *)
        # nvc and any unrecognised value use the Compute Sanitizer library
        LD_PRELOAD_LIB="${ACCEL_PROF_HOME}/lib/libcompute_sanitizer.so"
        ;;
esac

# run the application without any tool
if [[ "${YOSEMITE_TOOL_NAME}" == "none" ]]; then
    LD_PRELOAD_LIB=""
fi

# output metadata
# The group is redirected once with '>', which truncates any previous log.
# The target is quoted so that a log path containing spaces works.
{
    printf "%-30s: %s\n" "[ACCELPROF INFO] VERSION" \
        "${commit_hash}, modified ${modified}"
    printf "%-30s: %s\n" "[ACCELPROF INFO] LD_PRELOAD" \
        "${LD_PRELOAD_LIB}"
    printf "%-30s: %s\n" "[ACCELPROF INFO] OPTIONS" \
        "${OPTION_STR}"
    printf "%-30s: %s %s\n" "[ACCELPROF INFO] COMMAND" \
        "${EXECUTABLE}" "${ARGS}"
} > "${OUTPUT_REDIRECT}"

# Record start time in seconds since epoch
start_time=$(date +%s)
printf "%-30s: %s\n\n" "[ACCELPROF INFO] START TIME" "$(date)" >> "${OUTPUT_REDIRECT}"

# Multi-GPU profiling (e.g., distributed training with Megatron-LM) hands the
# library to the application through CUDA_INJECTION64_PATH; every other run
# injects it with LD_PRELOAD.
# The command is launched from "$@", which still holds the executable and its
# arguments exactly as they were given on the command line, so arguments that
# contain spaces or glob characters reach the application unchanged.
# The log target is quoted so that a path containing spaces works.
if [[ "${MGPU_PROFILING}" == "1" ]]; then
    export TORCH_PROFILE_ENABLED=1
    CUDA_INJECTION64_PATH="${LD_PRELOAD_LIB}" "$@" >> "${OUTPUT_REDIRECT}" 2>&1
else
    LD_PRELOAD="${LD_PRELOAD_LIB}" "$@" >> "${OUTPUT_REDIRECT}" 2>&1
fi
# Capture the status immediately, before any other command can overwrite it
run_status=$?

# check if the above execution is successful
if (( run_status != 0 )); then
    echo "error: Fail to run the application with AccelProf."
    echo "Please check ${OUTPUT_REDIRECT} for more details."
    echo ""
    exit 1
fi

# Record end time and calculate elapsed time, then append both to the log;
# the elapsed time is formatted as HH:MM:SS.
# The log target is quoted so that a path containing spaces works.
end_time=$(date +%s)
elapsed=$((end_time - start_time))
{
    printf "%-30s: %s\n" "[ACCELPROF INFO] END TIME" "$(date)"
    printf "%-30s: %02d:%02d:%02d\n" "[ACCELPROF INFO] ELAPSED TIME" \
        $((elapsed / 3600)) $((elapsed % 3600 / 60)) $((elapsed % 60))
} >> "${OUTPUT_REDIRECT}"
