#!/usr/bin/env bash

# setting colors to use
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[0;33m'
NC='\033[0m'
VERSION="v1.7.03"

printf "\n\n                                  GToTree ${VERSION}\n"
printf "                         (github.com/AstrobioMike/GToTree)\n\n"

#############################################################################
################################  HELP INFO  ################################
#############################################################################
## called by program name with no arguments or with "-h" as only positional argument ##
if [ "$#" == 0 ] || [ $1 == "-h" ] || [ $1 == "help" ]; then


    printf "\n --------------------------------  HELP INFO  --------------------------------- \n\n"
    printf "  This program takes input genomes from various sources and searches for the\n"
    printf "  specified Pfams.\n\n"

    printf "\n    Required inputs include:\n\n"
    printf "      1) Input genomes in one or any combination of the following formats:\n"
    printf "        - [-a <file>] single-column file of NCBI assembly accessions\n"
    printf "        - [-g <file>] single-column file with the paths to each GenBank file\n"
    printf "        - [-f <file>] single-column file with the paths to each fasta file\n"
    printf "        - [-A <file>] single-column file with the paths to each amino acid file,\n"
    printf "                      each file should hold the coding sequences for just one genome\n\n"

    printf "      2)  [-p <file>] single-column file of the target Pfams, e.g.:\n"
    printf "                  PF04183\n"
    printf "                  PF05400\n\n"

    printf "    Optional arguments include:\n\n"

    printf "        - [-o <str>] default: pfam_search_output\n"
    printf "                  Specify the desired output directory.\n\n"

    printf "        - [-n <int> ] default: 2\n"
    printf "                  The number of cpus you'd like to use during the HMM search.\n\n"

    printf "        - [-d ] default: false\n"
    printf "                  Provide this flag with no arguments if you'd like to keep the\n"
    printf "                  temporary directory. (mostly useful for debugging)\n\n"

    printf "        - [-P ] default: false\n"
    printf "                  Provide this flag with no arguments if your system can't use ftp,\n"
    printf "                  and you'd like to try using http.\n\n"

    printf "    Example usage:\n\n\t GToTree-pfam-search -a ncbi_accessions.txt -g genbank_files.txt -f fasta_files.txt -p target_pfams.txt\n\n"

    exit
fi


#############################################################################
##############  CHECKING FIRST FOR ALL ESSENTIAL DEPENDENCIES  ##############
#############################################################################
if ! command -v hmmsearch > /dev/null; then
    printf "\n  ${RED}HMMER3 is an essential dependency but does not seem to be in your PATH :(${NC}\n"
    printf "\n  See github.com/AstrobioMike/GToTree/wiki/installation for help if needed.\n\n"
    printf "\nExiting for now.\n\n"
    exit
fi

#############################################################################
############################  PARSING ARGUMENTS  ############################
#############################################################################
## setting some defaults
output_dir="pfam_search_output"
debug_flag='false'
http_flag='false'

while getopts :a:g:f:A:p:o:dn:P args
do
    case "${args}"
    in
        a) NCBI_acc_file=${OPTARG};;
        g) genbank_list_file=${OPTARG};;
        f) fasta_files=${OPTARG};;
        A) amino_acid_files=${OPTARG};;
        p) pfam_file=${OPTARG};;
        o) output_dir=${OPTARG};;
        d) debug_flag='true';;
        n) num_cpus=${OPTARG};;
        P) http_flag='true';;
        \?) printf "\n  ${RED}Invalid argument: -${OPTARG}${NC}\n\n    Run 'GToTree-pfam-search' with no arguments or '-h' only to see help menu.\n\n" >&2 && exit
    esac
done

# checking no duplicates in NCBI accession file
if [ -f "$NCBI_acc_file" ]; then
    num_dupes=$(uniq -d "$NCBI_acc_file" | wc -l | sed "s/^ *//" | cut -d " " -f 1)
    if [ ! $num_dupes == 0 ]; then
        printf "\n${RED}      $NCBI_acc_file has duplicate entries, check it out and provide unique accessions only.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit
    fi
fi

# checking numeric inputs and setting defaults if not provided
if [ -z $num_cpus ]; then
    num_cpus=2
else
    # checking is an integer
    if ! [[ $num_cpus =~ ^[0-9]+$ ]]; then
        printf "\n  ${RED}The value provided for cpus (\"-n\") needs to be an integer.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit
    fi

    if [ $num_cpus == 0 ]; then
        printf "\n  ${RED}The value provided for cpus (\"-n\") needs to be greater than 0.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit
    fi
fi

#############################################################################
############  MAKING SURE MINIMUM REQUIRED INPUTS WERE PROVIDED  ############
#############################################################################
if [ ! -n "$NCBI_acc_file" ] && [ ! -n "$genbank_list_file" ] && [ ! -n "$fasta_files" ] && [ ! -n "$amino_acid_files" ]; then
    printf "\n  ${RED}You need to provide at least one input-genome source!${NC}\n"
    printf "\nExiting for now.\n\n"
    exit
fi

if [ ! -n "$pfam_file" ]; then
    printf "\n  ${RED}You need to provide the target Pfams to search for!${NC}\n"
    printf "\nExiting for now.\n\n"
    exit
fi

#############################################################################
##########  ATTEMPTING TO CREATE OUTPUT DIR AND EXITING IF FAILS  ###########
#############################################################################

mkdir $output_dir 2> /dev/null

if [ $? -ne 0 ] ; then
    printf "  ${RED}Output directory \"${output_dir}\" already exists, either remove or rename\n  that one, or specify a different output directory to the \"-o\" option.${NC}\n\n"
    printf "Exiting for now.\n\n"
    exit
else
    rm -rf $output_dir # removing here so if we abort in a later step, the output_dir isn't created yet
fi

#############################################################################
########################### STARTING LOG FILE ###############################
#############################################################################
gtt_pfam_search_log=gtotree-pfam-search-runlog.txt


#############################################################################
########## CHECKING INPUT GENOME SOURCES AND SPECIFIC DEPENDENCIES ##########
#############################################################################

printf "\n\n               GToTree ${VERSION} (github.com/AstrobioMike/GToTree)\n\n" > $gtt_pfam_search_log

printf "\n ---------------------------------  RUN INFO  --------------------------------- \n\n" | tee -a $gtt_pfam_search_log

# storing the command as entered for the log file
command_call="$(printf %q "$BASH_SOURCE")$(printf ' %q' "$@")"

printf "    Command entered:\n" >> $gtt_pfam_search_log
printf "    $command_call\n\n" >> $gtt_pfam_search_log

printf "\n    Input genome sources include:\n" | tee -a $gtt_pfam_search_log

### checking/reporting those provided as NCBI accessions
if [ "$NCBI_acc_file" != "" ]; then

    if [ -f "$NCBI_acc_file" ]; then
        NCBI_input_genomes_total=$(wc -l $NCBI_acc_file | sed "s/^ *//" | cut -d " " -f 1)
        printf "      - NCBI accessions listed in $NCBI_acc_file ($NCBI_input_genomes_total genomes)\n" | tee -a $gtt_pfam_search_log
    else
        printf "\n${RED}      You specified $NCBI_acc_file as a source of NCBI genomes to use, but that file cannot be found :(${NC}\n" | tee -a $gtt_pfam_search_log
        printf "\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log
        exit
    fi

else
    NCBI_input_genomes_total=0
fi

### checking/reporting those provided as genbank files
if [ "$genbank_list_file" != "" ];then
    if [ -s "$genbank_list_file" ]; then
        # checking all the files pointed to by this file can be found
        for file in $(cat $genbank_list_file)
        do
          if [ ! -s $file ]; then
            printf "\n${RED}      Some genbank files specified in $genbank_list_file cannot be found :(${NC}\n\n" | tee -a $gtt_pfam_search_log
            printf "  Double-check the provided locations and where they should be, here's one of the problems:\n" | tee -a $gtt_pfam_search_log
            printf "            $file\n" | tee -a $gtt_pfam_search_log
            printf "\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log
            exit
          fi
        done

        genbank_genomes_total=$(wc -l $genbank_list_file | sed "s/^ *//" | cut -d " " -f 1)
        printf "      - Genbank files listed in $genbank_list_file ($genbank_genomes_total genomes)\n" | tee -a $gtt_pfam_search_log
    else
        printf "\n${RED}      You specified $genbank_list_file as a source of GenBank files to use, but that file cannot be found or is empty :(${NC}\n" | tee -a $gtt_pfam_search_log
        printf "\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log
        exit
    fi

else
    genbank_genomes_total=0

fi

### checking/reporting those provided as fasta files
if [ "$fasta_files" != "" ]; then
    if ! command -v prodigal > /dev/null; then
        printf "\n  ${RED}Prodigal is required when providing fasta files.${NC}\n" | tee -a $gtt_pfam_search_log
        printf "\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log
        exit
    fi

    if [ -s "$fasta_files" ]; then
        
        # checking all files pointed to by this file can be found
        for file in $(cat $fasta_files)
        do
            if [ ! -s $file ]; then
                printf "\n${RED}      Some fasta files specified in $fasta_files cannot be found :(${NC}\n\n" | tee -a $gtt_pfam_search_log
                printf "  Double-check the provided locations and where they should be, here's one of the problems:\n" | tee -a $gtt_pfam_search_log
                printf "            $file\n" | tee -a $gtt_pfam_search_log
                printf "\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log
                exit
            fi
        done

        fasta_genomes_total=$(wc -l $fasta_files | sed "s/^ *//" | cut -d " " -f 1)
        printf "      - Fasta files listed in $fasta_files ($fasta_genomes_total genomes)\n" | tee -a $gtt_pfam_search_log
    else
        printf "\n${RED}      You specified $fasta_files as a source of fasta files to use, but that file cannot be found or is empty :(${NC}\n" | tee -a $gtt_pfam_search_log
        printf "\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log
        exit
    fi

else
    fasta_genomes_total=0

fi

### checking/reporting those provided as amino acid files of coding sequences
if [ "$amino_acid_files" != "" ]; then

    if [ -s "$amino_acid_files" ]; then
        
        # checking all files pointed to by this file can be found
        for file in $(cat $amino_acid_files)
        do
            if [ ! -s $file ]; then
                printf "\n${RED}      Some amino acid files specified in $amino_acid_files cannot be found :(${NC}\n\n" | tee -a $gtt_pfam_search_log
                printf "  Double-check the provided locations and where they should be, here's one of the problems:\n" | tee -a $gtt_pfam_search_log
                printf "            $file\n" | tee -a $gtt_pfam_search_log
                printf "\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log
                exit
            fi
        done

        amino_acid_genomes_total=$(wc -l $amino_acid_files | sed "s/^ *//" | cut -d " " -f 1)
        printf "      - Amino acid files listed in $amino_acid_files ($amino_acid_genomes_total genomes)\n" | tee -a $gtt_pfam_search_log
    else
        printf "\n${RED}      You specified $amino_acid_files as a source of fasta files to use, but that file cannot be found or is empty :(${NC}\n" | tee -a $gtt_pfam_search_log
        printf "\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log
        exit
    fi

else
    amino_acid_genomes_total=0

fi


### reporting total number of planned genomes
total_input_genomes=$(($NCBI_input_genomes_total + $genbank_genomes_total + $fasta_genomes_total + $amino_acid_genomes_total))
printf "\n                             ${GREEN}Total input genomes: $total_input_genomes${NC}\n" | tee -a $gtt_pfam_search_log


#############################################################################
###############  CHECKING AND REPORTING SPECIFIED PFAM SOURCE  ##############
#############################################################################
printf "\n    Pfam source to be used:\n" | tee -a $gtt_pfam_search_log

if [ -f "$pfam_file" ]; then
    uniq ${pfam_file} > uniq_pfam_targets.tmp
    pfam_target_genes_total=$(wc -l uniq_pfam_targets.tmp | sed "s/^ *//" | cut -d " " -f 1)
    printf "      - $pfam_file ($pfam_target_genes_total targets)\n" | tee -a $gtt_pfam_search_log
else
    printf "\n${RED}      You specified $pfam_file as your target Pfams to sesarch, but that file cannot be found :(${NC}\n" | tee -a $gtt_pfam_search_log
    printf "\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log
    exit
fi

#############################################################################
##############  EXPLICITLY STATING IF DEFAULT BEHAVIOR CHANGED  #############
#############################################################################
if [ $output_dir != "pfam_search_output" ] || [ $debug_flag == "true" ] || [ $num_cpus != 2 ]; then

    printf "\n\n    Options set:\n" | tee -a $gtt_pfam_search_log

    if [ $output_dir != "pfam_search_output" ]; then
        printf "      - The output directory has been set to \"$output_dir\".\n" | tee -a $gtt_pfam_search_log
    fi

    if [ $num_cpus != "2" ]; then
        printf "      - Number of cpus to use during hmm search (\"-n\") has been set to $num_cpus.\n" | tee -a $gtt_pfam_search_log
    fi

    if [ $debug_flag == "true" ]; then
        printf "      - Debug mode on. Temp directory won't be removed after run.\n" | tee -a $gtt_pfam_search_log
    fi

fi

printf "\n"

sleep 3.5

#############################################################################
###############  CREATE A TEMP DIRECTORY AND EXITING IF FAILS  ##############
#############################################################################
tmp_dir=$(date +%s).gtt-pfam-search.tmpdir

mkdir $tmp_dir 2> /dev/null

if [ $? -ne 0 ] ; then
    printf "\n${RED}  Tried to make temporary directory named ${tmp_dir} but failed, this shouldn't happen :(${NC}\n\n" | tee -a $gtt_pfam_search_log
    rm uniq_hmm_names.tmp
    printf "Exiting for now.\n\n" | tee -a $gtt_pfam_search_log
    exit
fi

# remaking output directory now that things are a go
mkdir $output_dir 2> /dev/null

# moving log file now that we're a go
mv $gtt_pfam_search_log ${output_dir}/gtotree-pfam-search-runlog.txt
# adjusting value of log file variable
gtt_pfam_search_log=${output_dir}/gtotree-pfam-search-runlog.txt

# moving uniq_hmm_names.tmp into temp directory now that all is well
mv uniq_pfam_targets.tmp ${tmp_dir}/uniq_pfam_targets.tmp


#############################################################################
########################  KEEPING TRACK OF RUN TIME  ########################
#############################################################################
start_time=$(date +"%I:%M %p")
SECONDS=0


#############################################################################
######################  GETTING AND BUILDING PFAM HMM  ######################
#############################################################################

     printf "\n ############################################################################## \n" | tee -a $gtt_pfam_search_log
     printf " ####                  Downloading and building Pfam HMMs                  ####\n" | tee -a $gtt_pfam_search_log
     printf " ############################################################################## \n\n" | tee -a $gtt_pfam_search_log

base_link="https://pfam.xfam.org/family/"

for target in $(cat ${tmp_dir}/uniq_pfam_targets.tmp)
do

    # --insecure flag added on 29-Nov-2020, due to pfam certificate being invalid (https://github.com/AstrobioMike/GToTree/issues/28)
    curl --insecure --silent --retry 10 -o ${tmp_dir}/${target}.hmm "${base_link}${target}/hmm"

    if [ -s ${tmp_dir}/${target}.hmm ]; then
        # getting accession pulled (to account for current version on Pfam as compared to what was searched)
        actual_target=$(grep -m1 "^ACC" ${tmp_dir}/${target}.hmm | tr -s " " "\t" | cut -f 2)
        printf "$actual_target\n" >> ${tmp_dir}/actual_pfam_targets.tmp

        if [ $target != $actual_target ]; then
            mv ${tmp_dir}/${target}.hmm ${tmp_dir}/${actual_target}.hmm
        fi

        cat ${tmp_dir}/${actual_target}.hmm >> ${tmp_dir}/all_targets.hmm

    else # aborting if any of the pfam targets couldn't be pulled successfully
        printf "\n  ${RED}One of the target Pfams could not be successfully downloaded :(${NC}\n"
        printf "\n  The problem child was ${target}.\n\n"
        printf "\nExiting for now.\n\n"

        rm ${output_dir}
        # removing temp directory unless debug mode on
        if [ $debug_flag == 'false' ]; then
             rm -rf $tmp_dir
        fi

        exit

    fi

done

# starting the main results table which will have the following as its header:
paste <(printf "assembly_id\tdownloaded_accession\tTotal_gene_count") <(printf %s "$(cat ${tmp_dir}/actual_pfam_targets.tmp | tr "\n" "\t")") > ${output_dir}/All_genomes_Pfam_target_counts.tsv

#############################################################################
#####################  NCBI-DERIVED GENOME PROCESSING  ######################
#############################################################################
if [ -n "$NCBI_acc_file" ]; then


    printf "\n ############################################################################## \n" | tee -a $gtt_pfam_search_log
    printf " ####          Working on the genomes provided as NCBI accessions          ####\n" | tee -a $gtt_pfam_search_log
    printf " ############################################################################## \n\n" | tee -a $gtt_pfam_search_log

    curr_time=$(date +"%I:%M %p")
    duration=$SECONDS

    printf "           It is currently $curr_time; the process started at $start_time.\n" | tee -a $gtt_pfam_search_log
    printf "               Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n" | tee -a $gtt_pfam_search_log


    # storing sorted ncbi accession file
    sort $NCBI_acc_file > ${tmp_dir}/sorted_ncbi_accs.tmp

    ## checking if any have the GenBank AND RefSeq for the same genome (e.g.: GCA_ and GCF_ with same following accession numbers)
    sed 's/GC._//' ${tmp_dir}/sorted_ncbi_accs.tmp | sort > ${tmp_dir}/sorted_base_ncbi_accs.tmp
    uniq -d ${tmp_dir}/sorted_base_ncbi_accs.tmp > ${tmp_dir}/dupe_accs.tmp

    ## if there were, removing genbank one, keeping refseq one, and reporting:
    if [ -s ${tmp_dir}/dupe_accs.tmp ]; then
        num_dupe_genomes=$(wc -l ${tmp_dir}/dupe_accs.tmp | sed "s/^ *//" | cut -d " " -f 1)
        for dupe_acc in $(cat ${tmp_dir}/dupe_accs.tmp)
        do
            grep "$dupe_acc" ${tmp_dir}/sorted_ncbi_accs.tmp
        done > ${output_dir}/Redundant_input_accessions.txt

        printf "     ${YELLOW}******************************* ${NC}NOTICE ${YELLOW}*******************************${NC}  \n" | tee -a $gtt_pfam_search_log
        printf "\t$num_dupe_genomes accession(s) redundant - meaning GenBank and RefSeq accessions\n" | tee -a $gtt_pfam_search_log
        printf "\twere provided for the same genome. Only RefSeq accession used.\n\n" | tee -a $gtt_pfam_search_log
        printf "\t  Reported in \"${output_dir}/Redundant_input_accessions.txt\".\n" | tee -a $gtt_pfam_search_log
        printf "     ${YELLOW}**********************************************************************${NC}  \n\n" | tee -a $gtt_pfam_search_log

        comm -23 ${tmp_dir}/sorted_ncbi_accs.tmp <(sort ${output_dir}/Redundant_input_accessions.txt) | sort > ${tmp_dir}/sorted_building_new_input_ncbi_accs.tmp

        cat <(grep "^GCF_" ${output_dir}/Redundant_input_accessions.txt) ${tmp_dir}/sorted_building_new_input_ncbi_accs.tmp | sort > ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp

        sleep 3

    else
        mv ${tmp_dir}/sorted_ncbi_accs.tmp ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp
    fi

    ## downloading genbank files and/or refseq assembly summary files as needed

    if grep -q "^GCA" ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp; then
        printf "\t\t  ${GREEN}Downloading GenBank assembly summaries...${NC}\n\n" | tee -a $gtt_pfam_search_log
        if [ "$http_flag" == 'false' ]; then
            curl --connect-timeout 30 --retry 10 ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt -o ${tmp_dir}/ncbi_assembly_info.tmp || echo "failed" > capture_any_dl_errors.tmp
        else
            curl --connect-timeout 30 --retry 10 https://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt -o ${tmp_dir}/ncbi_assembly_info.tmp || echo "failed" > capture_any_dl_errors.tmp
        fi
    fi

    # making sure file downloaded with no errors
    if [ -s capture_any_dl_errors.tmp ]; then
        printf "\n\n  ${RED}Download of NCBI assembly summaries failed :(${NC}\n  Is the internet connection weak?\n\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log
        rm -rf ${tmp_dir} ${output_dir} capture_any_dl_errors.tmp
        exit
    else
        rm -rf capture_any_dl_errors.tmp
    fi

    if grep -q "^GCF" ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp; then
        printf "\n\t\t  ${GREEN}Downloading RefSeq assembly summaries...${NC}\n\n" | tee -a $gtt_pfam_search_log
        if [ "$http_flag" == 'false' ]; then
            curl --connect-timeout 30 --retry 10 ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt -o ${tmp_dir}/ncbi_RS_assembly_info.tmp || echo "failed" > capture_any_dl_errors.tmp
        else
            curl --connect-timeout 30 --retry 10 https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt -o ${tmp_dir}/ncbi_RS_assembly_info.tmp || echo "failed" > capture_any_dl_errors.tmp
        fi
        cat ${tmp_dir}/ncbi_RS_assembly_info.tmp >> ${tmp_dir}/ncbi_assembly_info.tmp
    fi

    # making sure file downloaded with no errors
    if [ -s capture_any_dl_errors.tmp ]; then
        printf "\n\n  ${RED}Download of NCBI assembly summaries failed :(${NC}\n  Is the internet connection weak?\n\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log
        rm -rf ${tmp_dir} ${output_dir} capture_any_dl_errors.tmp
        exit
    else
        rm -rf capture_any_dl_errors.tmp
    fi

    ## searching assembly info table for the input accessions (NOTE: will always return latest accession version as of now)
    # to keep track of what was searched and what was downloaded, both are included in the output summary table
    gtt-parse-assembly-summary-file -a ${tmp_dir}/ncbi_assembly_info.tmp -w ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp -o ${tmp_dir}/ncbi_accessions_info.tmp

    ## sorting and saving found input accs
    cut -f 1 ${tmp_dir}/ncbi_accessions_info.tmp | sort > ${tmp_dir}/sorted_got_ncbi_accs.tmp

    # checking if any accs weren't successfully found:
    comm -23 ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp ${tmp_dir}/sorted_got_ncbi_accs.tmp > ${tmp_dir}/not_found_accs.tmp

    # if any not found, removing from current list, and reporting:
    if [ -s ${tmp_dir}/not_found_accs.tmp ]; then

        num_accs_not_found=$(wc -l ${tmp_dir}/not_found_accs.tmp | sed "s/^ *//" | cut -d " " -f 1)
        printf "\n\n     ${YELLOW}******************************* ${NC}NOTICE ${YELLOW}*******************************${NC}  \n" | tee -a $gtt_pfam_search_log
        printf "\t  $num_accs_not_found accession(s) not successfully found at NCBI.\n\n" | tee -a $gtt_pfam_search_log
        printf "\t     Reported in \"${output_dir}/NCBI_accessions_not_found.txt\".\n" | tee -a $gtt_pfam_search_log
        printf "     ${YELLOW}**********************************************************************${NC}  \n\n" | tee -a $gtt_pfam_search_log

        sleep 3

        cp ${tmp_dir}/not_found_accs.tmp ${output_dir}/NCBI_accessions_not_found.txt

        mv ${tmp_dir}/sorted_got_ncbi_accs.tmp ${tmp_dir}/updated_sorted_ncbi_accs.tmp

        NCBI_remaining_genomes_total=$(wc -l ${tmp_dir}/updated_sorted_ncbi_accs.tmp | sed "s/^ *//" | cut -d " " -f 1)

    else
        mv ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp ${tmp_dir}/updated_sorted_ncbi_accs.tmp
        NCBI_remaining_genomes_total=$(wc -l ${tmp_dir}/updated_sorted_ncbi_accs.tmp | sed "s/^ *//" | cut -d " " -f 1)
        printf "\n\n\t  ${GREEN}All $NCBI_remaining_genomes_total input accessions successfully found.${NC}\n\n" | tee -a $gtt_pfam_search_log

    fi

    ### searching for targets in each genome
    while IFS=$'\t' read -r -a curr_line

    do

        assembly="${curr_line[0]}"
        downloaded_accession="${curr_line[1]}"
        num=$((num+1))

        printf "   --------------------------------------------------------------------------   \n"
        printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $NCBI_remaining_genomes_total total.\n"
        printf "   --------------------------------------------------------------------------   \n\n"

        # storing and building links
        if [ "$http_flag" == 'false' ]; then
            base_link="${curr_line[8]}"
        else
            base_link=$(echo ${curr_line[8]} | sed 's/^ftp/https/')
        fi

        end_path=$(basename $base_link)

        # attempting to download genes for assembly
        curl --silent --retry 10 -o ${tmp_dir}/${assembly}_genes.tmp.gz "${base_link}/${end_path}_protein.faa.gz"

        if [ -s ${tmp_dir}/${assembly}_genes.tmp.gz ]; then
            gunzip -f ${tmp_dir}/${assembly}_genes.tmp.gz

        else # trying to get assembly if there were no gene annotations available
            curl --silent --retry 10 -o ${tmp_dir}/${assembly}_genome.tmp.gz "${base_link}/${end_path}_genomic.fna.gz"

            if [ -s ${tmp_dir}/${assembly}_genome.tmp.gz ]; then

                gunzip -f ${tmp_dir}/${assembly}_genome.tmp.gz

                printf "  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}  \n"
                printf "   $assembly doesn't appear to have gene annotations.\n\n"
                printf "   Downloaded the genome and identifying CDSs with prodigal.\n"
                printf "  ${YELLOW}****************************************************************************${NC}  \n\n"

                printf "      Getting coding seqs...\n\n"
                prodigal -c -q -i ${tmp_dir}/${assembly}_genome.tmp -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null
                tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes.tmp
            fi
        fi

        if [ -s ${tmp_dir}/${assembly}_genes.tmp ]; then

            # # storing more info about the assembly to write out into ncbi-derived-genome summary file (for each setting to NA if not found)
            # ass_name="${curr_line[2]}"
            # if [ -z "$ass_name" ]; then ass_name="NA"; fi
            # org_name="${curr_line[4]}"
            # if [ -z "$org_name" ]; then org_name="NA"; fi
            # infraspecific_name="${curr_line[5]}"
            # if [ -z "$infraspecific_name" ]; then infraspecific_name="NA"; fi
            # taxid="${curr_line[3]}"
            # if [ -z "$taxid" ]; then taxid="NA"; fi
            # version_status="${curr_line[6]}"
            # if [ -z "$version_status" ]; then version_status="NA"; fi
            # asm_level="${curr_line[7]}"
            # if [ -z "$asm_level" ]; then asm_level="NA"; fi

            ### counting how many genes in this genome
            gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp)

            ### running hmm search ###
            hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/all_targets.hmm ${tmp_dir}/${assembly}_genes.tmp > /dev/null

            ### getting counts of each target in this genome
            for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
            do
                grep -w ${target} ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | wc -l | sed 's/^ *//' >> ${tmp_dir}/${assembly}_hit_counts.tmp
            done

            ### writing results to main output file
            paste <( printf "${assembly}\t${downloaded_accession}\t${gene_count}" ) <(printf %s "$(cat ${tmp_dir}/${assembly}_hit_counts.tmp | tr "\n" "\t") " )  >> ${output_dir}/All_genomes_Pfam_target_counts.tsv

            ### Pulling out hits for this genome ###
            esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null
                
            for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
            do
                if grep -w -q "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp; then

                    grep -w "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | cut -f 1 -d " " >> ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp

                    for gene in $(cat ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp)
                    do
                        echo $gene | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp -
                    done >> ${tmp_dir}/${assembly}_${target}_genes1.tmp

                    gtt-append-fasta-headers -i ${tmp_dir}/${assembly}_${target}_genes1.tmp -w ${assembly}_${target} -o ${tmp_dir}/${assembly}_${target}_genes.tmp
                
                    # adding to fasta of that target holding all genomes
                    cat ${tmp_dir}/${assembly}_${target}_genes.tmp >> ${output_dir}/All_${target}_hits.faa
                fi

            done

        else 
            printf "     ${YELLOW}******************************* ${NC}NOTICE ${YELLOW}*******************************${NC}  \n"
            printf "\t  $assembly's genes nor genome downloaded properly :(\n\n"
            printf "\t    Reported in \"${output_dir}/NCBI_accessions_not_downloaded.txt\"\n"
            printf "     ${YELLOW}************************************************************************ ${NC}\n"
            rm -rf ${tmp_dir}/${assembly}_report1.tmp ${tmp_dir}/${assembly}_genes.tmp.gz
            sleep 3
            echo $assembly >> ${output_dir}/NCBI_accessions_not_downloaded.txt

        fi

    done < ${tmp_dir}/ncbi_accessions_info.tmp

fi

#############################################################################
####################  GENBANK-DERIVED GENOME PROCESSING  ####################
#############################################################################
if [ -n "$genbank_list_file" ]; then

    printf "\n ############################################################################## \n" | tee -a $gtt_pfam_search_log
    printf " ####           Working on the genomes provided as GenBank files           ####\n" | tee -a $gtt_pfam_search_log
    printf " ############################################################################## \n\n" | tee -a $gtt_pfam_search_log

    curr_time=$(date +"%I:%M %p")
    duration=$SECONDS

    printf "           It is currently $curr_time; the process started at $start_time.\n" | tee -a $gtt_pfam_search_log
    printf "               Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n" | tee -a $gtt_pfam_search_log

    # setting a counter to track how far along we are
    num=0

    while IFS=$'\t' read -r -a file

    do

        ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way
        if $(file $file | grep -q "gzip"); then
            was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards
            file_location=${file%.*}
            gunzip -f -c $file > $file_location
            assembly="$(basename ${file_location%.*})"
        else
            file_location=$file
            assembly="$(basename ${file%.*})"
            was_gzipped=FALSE
        fi

        num=$((num+1)) # to track progress

        printf "   --------------------------------------------------------------------------   \n"
        printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $genbank_genomes_total total.\n"
        printf "   --------------------------------------------------------------------------   \n\n"

        # # storing more info about the assembly if it's present in the genbank file:
        # if grep -q "ORGANISM" $file_location; then 
        #     org_name=$(grep -m1 "ORGANISM" $file_location | tr -s " " | cut -f3- -d " " | tr "[ ./\\]" "_" | tr -s "_")
        # else
        #     org_name="NA"
        # fi

        # if grep -q "strain=" $file_location; then 
        #     strain=$(grep -m1 "strain=" $file_location | tr -s " " | cut -f 2 -d '"')
        # else
        #     strain="NA"
        # fi

        # if grep -q "taxon" $file_location; then
        #     taxid=$(grep -m1 "taxon" $file_location | cut -f2 -d ":" | tr -d '"')
        # else
        #     taxid="NA"
        # fi

        # extracting AA coding sequences from genbank file
        gtt-genbank-to-AA-seqs -i $file_location -o ${tmp_dir}/${assembly}_genes.tmp 2> /dev/null

        # checking that the file had CDS annotations, if not running prodigal
        if [ ! -s ${tmp_dir}/${assembly}_genes.tmp ]; then

            printf "  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}  \n"
            printf "\t  This genbank file doesn't appear to have CDS annotations,\n"
            printf "\t  so we are identifying coding sequences with prodigal.\n\n"

            printf "\t    Reported in \"${output_dir}/Genbank_files_with_no_CDSs.txt\".\n"
            printf "  ${YELLOW}****************************************************************************${NC}  \n\n"

            echo "$file" >> ${output_dir}/Genbank_files_with_no_CDSs.txt
            rm -rf ${tmp_dir}/${assembly}_genes.tmp

            # pulling out full nucleotide fasta from genbank file
            gtt-genbank-to-fasta -i $file_location -o ${tmp_dir}/${assembly}_fasta.tmp 2> /dev/null

            # running prodigal
            prodigal -c -q -i ${tmp_dir}/${assembly}_fasta.tmp -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr

            if [ -s ${file_location}_prodigal.stderr ]; then

                printf "\n\n ${RED}############################################################################## \n" | tee -a $gtt_pfam_search_log
                printf " ${RED}############################################################################## \n" | tee -a $gtt_pfam_search_log
                printf " ####${NC}             GToTree is exiting without completing :(                 ${RED}####\n" | tee -a $gtt_pfam_search_log
                printf " ##############################################################################${NC} \n" | tee -a $gtt_pfam_search_log
                printf " ${RED}############################################################################## \n\n" | tee -a $gtt_pfam_search_log

                printf "  ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC}  \n" | tee -a $gtt_pfam_search_log
                printf "    Something went wrong with prodigal trying to call genes on the fasta\n" | tee -a $gtt_pfam_search_log
                printf "    file generated from assembly ${GREEN}${assembly}${NC}.\n" | tee -a $gtt_pfam_search_log
                printf "    GToTree is not sure it should move forward when something odd is going\n" | tee -a $gtt_pfam_search_log
                printf "    on like this :(\n\n" | tee -a $gtt_pfam_search_log

                printf "    It is possible this is just because this assembly is shorter than\n" | tee -a $gtt_pfam_search_log
                printf "    100,000 bps. If you'd like to still work with it, you can run\n" | tee -a $gtt_pfam_search_log
                printf "    prodigal on it yourself and provide the amino acid sequences to\n" | tee -a $gtt_pfam_search_log
                printf "    GToTree.\n" | tee -a $gtt_pfam_search_log
                printf "  ${RED}**************************************************************************** ${NC}\n\n" | tee -a $gtt_pfam_search_log

                printf "\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log

                rm -rf ${output_dir} ${file_location}_prodigal.stderr

                # removing temp directory unless debug mode on
                if [ $debug_flag == 'false' ]; then
                     rm -rf $tmp_dir
                fi

                exit

            fi

            # getting rid of asterisks from prodigal output
            tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp
            # renaming to be cleaner since the prodigal labels are that informative anyway (in the context of accessions like with ncbi inputs)
            gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.tmp

        fi

        ## removing gunzipped genome file if it was gunzipped
        if [ $was_gzipped == "TRUE" ]; then
            rm -rf $file_location
        fi          

        ### counting how many genes in this genome
        gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp)

        ### running hmm search ###
        hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/all_targets.hmm ${tmp_dir}/${assembly}_genes.tmp > /dev/null


        ### getting counts of each target in this genome
        for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
        do
            grep -w ${target} ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | wc -l | sed 's/^ *//' >> ${tmp_dir}/${assembly}_hit_counts.tmp
        done

        ### writing results to main output file
        paste <( printf "${assembly}\tNA\t${gene_count}" ) <(printf %s "$(cat ${tmp_dir}/${assembly}_hit_counts.tmp | tr "\n" "\t") " )  >> ${output_dir}/All_genomes_Pfam_target_counts.tsv


        ### Pulling out hits for this genome ###
        esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null
            
        for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
        do
            if grep -w -q "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp; then

                grep -w "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | cut -f 1 -d " " >> ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp

                for gene in $(cat ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp)
                do
                    echo $gene | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp -
                done >> ${tmp_dir}/${assembly}_${target}_genes1.tmp

                gtt-append-fasta-headers -i ${tmp_dir}/${assembly}_${target}_genes1.tmp -w ${assembly}_${target} -o ${tmp_dir}/${assembly}_${target}_genes.tmp
            
                # adding to fasta of that target holding all genomes
                cat ${tmp_dir}/${assembly}_${target}_genes.tmp >> ${output_dir}/All_${target}_hits.faa
            fi

        done

    done < $genbank_list_file

fi


#############################################################################
#####################  FASTA-DERIVED GENOME PROCESSING  #####################
#############################################################################
if [ -n "$fasta_files" ]; then

    printf "\n ############################################################################## \n" | tee -a $gtt_pfam_search_log
    printf " ####            Working on the genomes provided as fasta files            ####\n" | tee -a $gtt_pfam_search_log
    printf " ############################################################################## \n\n" | tee -a $gtt_pfam_search_log

    curr_time=$(date +"%I:%M %p")
    duration=$SECONDS

    printf "           It is currently $curr_time; the process started at $start_time.\n" | tee -a $gtt_pfam_search_log
    printf "               Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n" | tee -a $gtt_pfam_search_log

    # setting a counter to track how far along we are
    num=0

    while IFS=$'\t' read -r -a file

    do

        ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way
        if $(file $file | grep -q "gzip"); then
            was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards
            file_location=${file%.*}
            gunzip -f -c $file > $file_location
            assembly="$(basename ${file_location%.*})"
        else
            file_location=$file
            assembly="$(basename ${file%.*})"
            was_gzipped=FALSE
        fi

        num=$((num+1)) # to track progress

        printf "   --------------------------------------------------------------------------   \n"
        printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $fasta_genomes_total total.\n"
        printf "   --------------------------------------------------------------------------   \n\n"

        # running prodigal
        prodigal -c -q -i $file_location -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr

        if [ -s ${file_location}_prodigal.stderr ]; then

            printf "\n\n ${RED}############################################################################## \n" | tee -a $gtt_pfam_search_log
            printf " ${RED}############################################################################## \n" | tee -a $gtt_pfam_search_log
            printf " ####${NC}             GToTree is exiting without completing :(                 ${RED}####\n" | tee -a $gtt_pfam_search_log
            printf " ##############################################################################${NC} \n" | tee -a $gtt_pfam_search_log
            printf " ${RED}############################################################################## \n\n" | tee -a $gtt_pfam_search_log

            printf "  ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC}  \n" | tee -a $gtt_pfam_search_log
            printf "    Something went wrong with prodigal trying to call genes on the fasta\n" | tee -a $gtt_pfam_search_log
            printf "    file generated from assembly ${GREEN}${assembly}${NC}.\n" | tee -a $gtt_pfam_search_log
            printf "    GToTree is not sure it should move forward when something odd is going\n" | tee -a $gtt_pfam_search_log
            printf "    on like this :(\n\n" | tee -a $gtt_pfam_search_log

            printf "    It is possible this is just because this assembly is shorter than\n" | tee -a $gtt_pfam_search_log
            printf "    100,000 bps. If you'd like to still work with it, you can run\n" | tee -a $gtt_pfam_search_log
            printf "    prodigal on it yourself and provide the amino acid sequences to\n" | tee -a $gtt_pfam_search_log
            printf "    GToTree.\n" | tee -a $gtt_pfam_search_log
            printf "  ${RED}**************************************************************************** ${NC}\n\n" | tee -a $gtt_pfam_search_log

            printf "\nExiting for now.\n\n" | tee -a $gtt_pfam_search_log

            rm -rf ${output_dir} ${file_location}_prodigal.stderr

            # removing temp directory unless debug mode on
            if [ $debug_flag == 'false' ]; then
                 rm -rf $tmp_dir
            fi

            exit

        fi

        # getting rid of asterisks from prodigal output
        tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp
        # renaming to be cleaner since the prodigal labels are that informative anyway (in the context of accessions like with ncbi inputs)
        gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.tmp

        ## removing gunzipped genome file if it was gunzipped
        if [ $was_gzipped == "TRUE" ]; then
            rm -rf $file_location
        fi
          
        ### counting how many genes in this genome
        gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp)
        
        ### running hmm search ###
        hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/all_targets.hmm ${tmp_dir}/${assembly}_genes.tmp > /dev/null

        ### getting counts of each target in this genome
        for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
        do
            grep -w ${target} ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | wc -l | sed 's/^ *//' >> ${tmp_dir}/${assembly}_hit_counts.tmp
        done

        ### writing results to main output file
        paste <( printf "${assembly}\tNA\t${gene_count}" ) <(printf %s "$(cat ${tmp_dir}/${assembly}_hit_counts.tmp | tr "\n" "\t") " )  >> ${output_dir}/All_genomes_Pfam_target_counts.tsv

        ### Pulling out hits for this genome ###
        esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null
            
        for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
        do
            if grep -w -q "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp; then

                grep -w "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | cut -f 1 -d " " >> ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp

                for gene in $(cat ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp)
                do
                    echo $gene | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp -
                done >> ${tmp_dir}/${assembly}_${target}_genes1.tmp

                gtt-append-fasta-headers -i ${tmp_dir}/${assembly}_${target}_genes1.tmp -w ${assembly}_${target} -o ${tmp_dir}/${assembly}_${target}_genes.tmp
            
                # adding to fasta of that target holding all genomes
                cat ${tmp_dir}/${assembly}_${target}_genes.tmp >> ${output_dir}/All_${target}_hits.faa
            fi

        done

    done < $fasta_files

fi


#############################################################################
##################  AMINO-ACID-DERIVED GENOME PROCESSING  ###################
#############################################################################
if [ -n "$amino_acid_files" ]; then

    printf "\n ############################################################################## \n" | tee -a $gtt_pfam_search_log
    printf " ####         Working on the genomes provided as amino acid files          ####\n" | tee -a $gtt_pfam_search_log
    printf " ############################################################################## \n\n" | tee -a $gtt_pfam_search_log

    curr_time=$(date +"%I:%M %p")
    duration=$SECONDS

    printf "           It is currently $curr_time; the process started at $start_time.\n" | tee -a $gtt_pfam_search_log
    printf "               Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n" | tee -a $gtt_pfam_search_log

    # setting a counter to track how far along we are
    num=0

    while IFS=$'\t' read -r -a file

    do

        ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way
        if $(file $file | grep -q "gzip"); then
            was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards
            file_location=${file%.*}
            gunzip -f -c $file > $file_location
            assembly="$(basename ${file_location%.*})"
        else
            file_location=$file
            assembly="$(basename ${file%.*})"
            was_gzipped=FALSE
        fi

        num=$((num+1)) # to track progress

        printf "   --------------------------------------------------------------------------   \n"
        printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $fasta_genomes_total total.\n"
        printf "   --------------------------------------------------------------------------   \n\n"

        cp ${file_location} ${tmp_dir}/${assembly}_genes.tmp

        ## removing gunzipped genome file if it was gunzipped
        if [ $was_gzipped == "TRUE" ]; then
            rm -rf $file_location
        fi

        ### counting how many genes in this genome
        gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp)
        
        ### running hmm search ###
        hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/all_targets.hmm ${tmp_dir}/${assembly}_genes.tmp > /dev/null

        ### getting counts of each target in this genome
        for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
        do
            grep -w ${target} ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | wc -l | sed 's/^ *//' >> ${tmp_dir}/${assembly}_hit_counts.tmp
        done

        ### writing results to main output file
        paste <( printf "${assembly}\tNA\t${gene_count}" ) <(printf %s "$(cat ${tmp_dir}/${assembly}_hit_counts.tmp | tr "\n" "\t") " )  >> ${output_dir}/All_genomes_Pfam_target_counts.tsv

        ### Pulling out hits for this genome ###
        esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null
            
        for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
        do
            if grep -w -q "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp; then

                grep -w "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | cut -f 1 -d " " >> ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp

                for gene in $(cat ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp)
                do
                    echo $gene | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp -
                done >> ${tmp_dir}/${assembly}_${target}_genes1.tmp

                gtt-append-fasta-headers -i ${tmp_dir}/${assembly}_${target}_genes1.tmp -w ${assembly}_${target} -o ${tmp_dir}/${assembly}_${target}_genes.tmp
            
                # adding to fasta of that target holding all genomes
                cat ${tmp_dir}/${assembly}_${target}_genes.tmp >> ${output_dir}/All_${target}_hits.faa
            fi

        done

    done < $amino_acid_files

fi

## writing out files for each searched gene holding the genomes with hits to that gene, and making iToL files for each
num=4
for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
do

    awk -F $'\t' -v col="$num" ' $col > 0 { print $1 } ' ${output_dir}/All_genomes_Pfam_target_counts.tsv | tail -n +2 > ${output_dir}/Genomes_with_hits_to_${target}.txt

    if [ -s ${output_dir}/Genomes_with_hits_to_${target}.txt ]; then
        printf "DATASET_STYLE\nSEPARATOR SPACE\nDATASET_LABEL $target\nCOLOR\t#0000ff\nDATA\n" > ${output_dir}/${target}-iToL.txt

        cat <(sed 's/$/ branch node #0000ff 3 normal/' ${output_dir}/Genomes_with_hits_to_${target}.txt) >> ${output_dir}/${target}-iToL.txt
    
    else
        rm ${output_dir}/Genomes_with_hits_to_${target}.txt

    fi

    num=$(($num + 1))

done

# removing temp directory unless debug mode on
if [ $debug_flag == 'false' ]; then
     rm -rf $tmp_dir
fi

#############################################################################
##########################  JOB-FINISHED REPORTING  #########################
#############################################################################

printf "\n\n#################################################################################\n" | tee -a $gtt_pfam_search_log
printf "####                                 ${GREEN}Done!!${NC}                                  ####\n" | tee -a $gtt_pfam_search_log
printf "#################################################################################\n\n" | tee -a $gtt_pfam_search_log


printf "    Main results table written to:\n" | tee -a $gtt_pfam_search_log
printf "        ${GREEN}${output_dir}/All_genomes_Pfam_target_counts.tsv${NC}\n\n" | tee -a $gtt_pfam_search_log

printf "    Fasta files of hits to each target are stored in the output directory:\n" | tee -a $gtt_pfam_search_log
printf "        ${GREEN}${output_dir}/${NC}\n\n" | tee -a $gtt_pfam_search_log

# reporting any problem files/accessions
if [ -f ${output_dir}/Redundant_input_accessions.txt ] || [ -f ${output_dir}/NCBI_accessions_not_found.txt ] || [ -f ${output_dir}/NCBI_accessions_not_downloaded.txt ] || [ -f ${output_dir}/Genbank_files_with_no_CDSs.txt ]; then
    printf "\n  Notes:\n\n" | tee -a $gtotree_log

    if [ -f ${output_dir}/Redundant_input_accessions.txt ]; then
        printf "    $num_dupe_report accession(s) redundant.\n" | tee -a $gtotree_log
        printf "        Reported in \"${output_dir}/Redundant_input_accessions.txt\".\n\n" | tee -a $gtotree_log
    fi

    if [ -f ${output_dir}/NCBI_accessions_not_found.txt ]; then
        num_accs_not_found=$(wc -l ${output_dir}/NCBI_accessions_not_found.txt | sed "s/^ *//" | cut -d " " -f 1)
        printf "    ${YELLOW}$num_accs_not_found accession(s) not successfully found at NCBI.${NC}\n" | tee -a $gtotree_log
        printf "        Reported in \"${output_dir}/NCBI_accessions_not_found.txt\".\n\n" | tee -a $gtotree_log
    fi

    if [ -f ${output_dir}/NCBI_accessions_not_downloaded.txt ]; then
        num_accs_not_downloaded=$(wc -l ${output_dir}/NCBI_accessions_not_downloaded.txt | sed "s/^ *//" | cut -d " " -f 1)
        printf "    ${YELLOW}$num_accs_not_downloaded did not download properly.${NC}\n" | tee -a $gtotree_log
        printf "        Reported in \"${output_dir}/NCBI_accessions_not_downloaded.txt\".\n\n" | tee -a $gtotree_log
    fi

    if [ -f ${output_dir}/Genbank_files_with_no_CDSs.txt ]; then
        num_genbanks_no_cds=$(wc -l ${output_dir}/Genbank_files_with_no_CDSs.txt | sed "s/^ *//" | cut -d " " -f 1)
        printf "    $num_genbanks_no_cds genbank file(s) had no identified coding sequences.\n" | tee -a $gtotree_log
        printf "        Reported in \"${output_dir}/Genbank_files_with_no_CDSs.txt\",\n" | tee -a $gtotree_log
        printf "        but processed as though submitted in fasta format.\n\n" | tee -a $gtotree_log
    fi

fi

printf "________________________________________________________________________________\n\n" | tee -a $gtt_pfam_search_log

# reporting log file output
printf "    Log file written to:\n" | tee -a $gtt_pfam_search_log
printf "        ${GREEN}${output_dir}/gtotree-pfam-search-runlog.txt${NC}\n\n" | tee -a $gtt_pfam_search_log

duration=$SECONDS

printf "                                         Total process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n" | tee -a $gtt_pfam_search_log

today=$(date +'%A')

printf "                                                      ${GREEN}Happy $today :)${NC}\n\n" | tee -a $gtt_pfam_search_log
