#!/usr/bin/env bash

# ANSI color codes, kept as literal backslash sequences; they take effect when
# expanded inside a printf format string
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[0;33m'
NC='\033[0m' # resets the color
VERSION="v1.7.03"

# reporting only the version and leaving if that is all that was requested
case "$1" in
    --version | -v)
        printf 'GToTree %s\n' "${VERSION}"
        exit
        ;;
esac

# banner shown at the start of every other invocation
printf '\n\n                                  GToTree %s\n' "${VERSION}"
printf '                         (github.com/AstrobioMike/GToTree)\n\n'

#############################################################################
################################  HELP INFO  ################################
#############################################################################
## Prints the help menu and exits (status 0) when the program is called with no
## arguments, or with "-h" or "help" as the first argument.
## "$1" is quoted so an empty or multi-word first argument can't break the test.
if [[ "$#" -eq 0 || "$1" == "-h" || "$1" == "help" ]]; then

    printf "\n ----------------------------------  ${YELLOW}HELP INFO${NC}  ---------------------------------- \n\n"
    printf "  This program takes input genomes from various sources and ultimately produces\n"
    printf "  a phylogenomic tree. You can find detailed usage information at:\n"
    printf "                                  github.com/AstrobioMike/GToTree/wiki\n\n"


    printf "\n -------------------------------  ${YELLOW}REQUIRED INPUTS${NC}  ------------------------------- \n\n"

    printf "      1) Input genomes in one or any combination of the following formats:\n"
    printf "        - [-a <file>] single-column file of NCBI assembly accessions\n"
    printf "        - [-g <file>] single-column file with the paths to each GenBank file\n"
    printf "        - [-f <file>] single-column file with the paths to each fasta file\n"
    printf "        - [-A <file>] single-column file with the paths to each amino acid file,\n"
    printf "                      each file should hold the coding sequences for just one genome\n\n"

    printf "      2)  [-H <file>] location of the uncompressed HMM file being used, or just the\n"
    printf "                      HMM name if you've set the environment variable 'GToTree_HMM_dir'\n"
    printf "                      to the appropriate location or installed via conda (run 'gtt-hmms'\n"
    printf "                      by itself to view the available gene-sets)\n\n"


    printf "\n -------------------------------  ${YELLOW}OPTIONAL INPUTS${NC}  ------------------------------- \n\n"

    printf "\n      ${YELLOW}Output directory specification:${NC}\n\n"

    printf "        - [-o <str>] default: GToTree_output\n"
    printf "                  Specify the desired output directory.\n\n"

    printf "\n      ${YELLOW}User-specified modification of genome labels:${NC}\n\n"

    printf "        - [-m <file>] specify desired genome labels\n"
    printf "                  A two- or three-column tab-delimited file where column 1 holds either\n"
    printf "                  the file name or NCBI accession of the genome to name (depending\n"
    printf "                  on the input source), column 2 holds the desired new genome label,\n"
    printf "                  and column 3 holds something to be appended to either initial or\n"
    printf "                  modified labels (e.g. useful for \"tagging\" genomes in the tree based\n"
    printf "                  on some characteristic). Columns 2 or 3 can be empty, and the file does\n"
    printf "                  not need to include all input genomes.\n\n"

    printf "\n      ${YELLOW}Options for adding taxonomy information:${NC}\n\n"

    printf "        - [-t ] default: false\n"
    printf "                  Provide this flag with no arguments if you'd like to add NCBI taxonomy\n"
    printf "                  info to the sequence headers for any genomes with NCBI taxids. This will\n"
    printf "                  largely be effective for input genomes provided as NCBI accessions\n"
    printf "                  (provided to the \`-a\` argument), but any input GenBank files will also\n"
    printf "                  be searched for an NCBI taxid. See \`-L\` argument for specifying desired\n"
    printf "                  ranks.\n\n"

    printf "        - [-D ] default: false\n"
    printf "                  Provide this flag with no arguments if you'd like to add taxonomy from the\n"
    printf "                  Genome Taxonomy Database (GTDB; gtdb.ecogenomic.org). This will only be\n"
    printf "                  effective for input genomes provided as NCBI accessions (provided to the\n"
    printf "                  \`-a\` argument). This can be used in combination with the \`-t\` flag, in\n"
    printf "                  which case any input accessions not represented in the GTDB will have NCBI\n"
    printf "                  taxonomic information added (with '_NCBI' appended). See \`-L\` argument for\n"
    printf "                  specifying desired ranks, and see helper script \`gtt-get-accessions-from-GTDB\`\n"
    printf "                  for help getting input accessions based on GTDB taxonomy searches.\n\n"

    printf "        - [-L <str>] default: Domain,Phylum,Class,Species,Strain\n"
    printf "                  A comma-separated list of the taxonomic ranks you'd like added to\n"
    printf "                  the labels if adding taxonomic information. E.g., all would be\n"
    printf "                  \"-L Domain,Phylum,Class,Order,Family,Genus,Species\". Note that\n"
    printf "                  strain-level information is available through NCBI, but not GTDB.\n\n"


    printf "\n      ${YELLOW}Filtering settings:${NC}\n\n"

    printf "        - [-c <float>] default: 0.2\n"
    printf "                  A float between 0-1 specifying the range about the median of\n"
    printf "                  sequences to be retained. For example, if the median length of a\n"
    printf "                  set of sequences is 100 AAs, those seqs longer than 120 or shorter\n"
    printf "                  than 80 will be filtered out before alignment of that gene set\n"
    printf "                  with the default 0.2 setting.\n\n"

    printf "        - [-G <float>] default: 0.5\n"
    printf "                  A float between 0-1 specifying the minimum fraction of hits a\n"
    printf "                  genome must have of the SCG-set. For example, if there are 100\n"
    printf "                  target genes in the HMM profile, and Genome X only has hits to 49\n"
    printf "                  of them, it will be removed from analysis with default value 0.5.\n\n"

    printf "        - [-B ] default: false\n"
    printf "                  Provide this flag with no arguments if you'd like to run GToTree\n"
    printf "                  in \"best-hit\" mode. By default, if a SCG has more than one hit\n"
    printf "                  in a given genome, GToTree won't include a sequence for that target\n"
    printf "                  from that genome in the final alignment. With this flag provided,\n"
    printf "                  GToTree will use the best hit. See here for more discussion:\n"
    printf "                  github.com/AstrobioMike/GToTree/wiki/things-to-consider\n\n"

    printf "\n      ${YELLOW}KO searching:${NC}\n\n"

    printf "        - [-K <file>] single-column file of KO targets to search each genome for\n"
    printf "                  Table of hit counts, fastas of hit sequences, and files compatible\n"
    printf "                  with the iToL web-based tree-viewer will be generated for each\n"
    printf "                  target. See visualization of gene presence/absence example at\n"
    printf "                  github.com/AstrobioMike/GToTree/wiki/example-usage for example.\n\n"


    printf "\n      ${YELLOW}Pfam searching:${NC}\n\n"

    printf "        - [-p <file>] single-column file of Pfam targets to search each genome for\n"
    printf "                  Table of hit counts, fastas of hit sequences, and files compatible\n"
    printf "                  with the iToL web-based tree-viewer will be generated for each\n"
    printf "                  target. See visualization of gene presence/absence example at\n"
    printf "                  github.com/AstrobioMike/GToTree/wiki/example-usage for example.\n\n"

    printf "\n      ${YELLOW}General run settings:${NC}\n\n"

    printf "        - [-N ] default: false\n"
    printf "                  No tree. Generate alignment only.\n\n"

    printf "        - [-k ] default: false\n"
    printf "                  Keep individual protein alignment files.\n\n"

    printf "        - [-T <str>] default: FastTreeMP if available, FastTree if not\n"
    printf "                  Which program to use for tree generation. Currently supported are\n"
    printf "                  \"FastTree\", \"FastTreeMP\", and \"IQ-TREE\". As of now, these run\n"
    printf "                  with default settings only (and IQ-TREE includes \"-mset WAG,LG\"). To\n"
    printf "                  run either with more specific options (and there is a lot of room for\n"
    printf "                  variation here), you can use the output alignment file from GToTree (and\n"
    printf "                  partitions file if wanted for mixed-model specification) as input into\n"
    printf "                  a dedicated treeing program.\n"
    printf "                  Note on FastTreeMP (http://www.microbesonline.org/fasttree/#OpenMP). FastTreeMP\n"
    printf "                  parallelizes some steps of the treeing step. Currently, conda installs\n"
    printf "                  FastTreeMP with FastTree on linux systems, but not on Mac OSX systems.\n"
    printf "                  So if using the conda installation, you may not have FastTreeMP if on a Mac,\n"
    printf "                  in which case FastTree will be used instead – this will be reported when the\n"
    printf "                  program starts, and be in the log file.\n\n"

    printf "        - [-n <int> ] default: 2\n"
    printf "                  The number of cpus you'd like to use during the HMM search. (Given\n"
    printf "                  these are individual small searches on single genomes, 2 is probably\n"
    printf "                  always sufficient. Keep in mind this will be multiplied by the number of jobs\n"
    printf "                  running concurrently if also modifying the \`-j\` parameter.)\n\n"

    printf "        - [-M <int> ] default: 5\n"
    printf "                  The number of threads muscle will use during alignment. (Keep in mind\n"
    printf "                  this will be multiplied by the number of jobs running concurrently\n"
    printf "                  if also modifying the \`-j\` parameter.)\n\n"

    printf "        - [-j <int> ] default: 1\n"
    printf "                  The number of jobs you'd like to run in parallel during steps\n"
    printf "                  that are parallelizable. This includes things like downloading input\n"
    printf "                  accession genomes and running parallel alignments, and portions of the\n"
    printf "                  tree step if using FastTree on a Linux system (e.g. see FastTree docs\n"
    printf "                  here: http://www.microbesonline.org/fasttree/#OpenMP).\n\n"
    printf "                  Note that I've occasionally noticed NCBI not being happy with over ~50\n"
    printf "                  downloads being attempted concurrently. So if using a \`-j\` setting around\n"
    printf "                  there or higher, and GToTree is saying a lot of input accessions were not\n"
    printf "                  successfully downloaded, consider trying with fewer.\n\n"

    printf "        - [-X ] default: false\n"
    printf "                  If working with greater than 1,000 target genomes, GToTree will by default\n"
    printf "                  use the 'super5' muscle alignment algorithm to increase the speed of the alignments (see\n"
    printf "                  github.com/AstrobioMike/GToTree/wiki/things-to-consider#working-with-many-genomes\n"
    printf "                  for more details and the note just above there on using representative genomes).\n"
    printf "                  Anyway, provide this flag with no arguments if you don't want to speed up\n"
    printf "                  the alignments.\n\n"

    printf "        - [-P ] default: false\n"
    printf "                  Provide this flag with no arguments if your system can't use ftp,\n"
    printf "                  and you'd like to try using http.\n\n"

    printf "        - [-F ] default: false\n"
    printf "                  Provide this flag with no arguments if you'd like to force\n"
    printf "                  overwriting the output directory if it exists.\n\n"

    printf "        - [-d ] default: false\n"
    printf "                  Provide this flag with no arguments if you'd like to keep the\n"
    printf "                  temporary directory. (Mostly useful for debugging.)\n\n"


    printf "\n --------------------------------  ${YELLOW}EXAMPLE USAGE${NC}  -------------------------------- \n\n"

    printf "\tGToTree -a ncbi_accessions.txt -f fasta_files.txt -H Bacteria -D -j 4\n\n"

    exit
fi


#############################################################################
##############  CHECKING FIRST FOR ALL ESSENTIAL DEPENDENCIES  ##############
#############################################################################
# Each entry is "<command looked up in PATH>:<name shown in the message>".
# They are checked in order, and the first one that is missing ends the run
# with a non-zero exit status so calling scripts can tell the run did not start.
for gtt_dep in "muscle:Muscle" "hmmsearch:HMMER3" "trimal:Trimal" "FastTree:FastTree"; do
    if ! command -v "${gtt_dep%%:*}" > /dev/null; then
        printf "\n  ${RED}%s is an essential dependency but does not seem to be in your PATH :(${NC}\n" "${gtt_dep#*:}"
        printf "\n  See github.com/AstrobioMike/GToTree/wiki/installation for help if needed.\n\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi
done
unset gtt_dep


#############################################################################
##########  SETTING VARIABLES TO REPORT WHAT SHOULD BE CITED AT END #########
#############################################################################
# one "<name>_used" flag per tool/database, each initialized to the string "false"
for gtt_cite_key in parallel prodigal hmmer muscle trimal taxonkit gtdb \
    fasttree iqtree universal_SCGs pfam_db kofamscan; do
    printf -v "${gtt_cite_key}_used" '%s' "false"
done
unset gtt_cite_key


#############################################################################
############################  PARSING ARGUMENTS  ############################
#############################################################################
## setting some defaults
output_dir="GToTree_output"
taxonkit_id_swap='false'
gtdb_id_swap='false'
debug_flag='false'
overwrite_output_dir='false'
lineage_spec="Domain,Phylum,Class,Species,Strain"
best_hit_mode='false'
align_only='false'
keep_individual_alignments='false'
tree_program='FastTreeMP'
additional_pfam_targets='false'
ko_targets='false'
override_faster_alignment='false'
http_flag='false'

#######################################
# Parses the command-line options into the script-level setting variables.
# Options that are not given leave their variable untouched (so the defaults
# above stay in place, and the input-file/numeric variables stay unset).
# Arguments: the options to parse (normally "$@")
# Outputs:   an error message on stderr for an unknown option or for an
#            option that is missing its value
# Returns:   does not return on a parsing error, the script exits with status 1
#######################################
gtt_parse_args() {
    local opt

    # starting from the first argument every time this is called
    OPTIND=1

    # the leading ":" puts getopts in silent mode, so problems are reported
    # through the ":" and "?" cases below rather than by getopts itself
    while getopts :a:g:f:A:H:o:m:tDL:K:p:NkT:c:G:BFdn:j:M:XP opt; do
        case "${opt}" in
            a) NCBI_acc_file=${OPTARG} ;;
            g) genbank_list_file=${OPTARG} ;;
            f) fasta_files=${OPTARG} ;;
            A) amino_acid_files=${OPTARG} ;;
            H) hmm_file=${OPTARG} ;;
            o) output_dir=${OPTARG} ;;
            m) file_to_genome_id_map=${OPTARG} ;;
            t) taxonkit_id_swap='true' ;;
            D) gtdb_id_swap='true' ;;
            L) lineage_spec=${OPTARG} ;;
            K) target_KOs=${OPTARG} ;;
            p) target_pfams=${OPTARG} ;;
            N) align_only='true' ;;
            k) keep_individual_alignments='true' ;;
            T) tree_program=${OPTARG} ;;
            c) len_cutoff=${OPTARG} ;;
            G) gen_cutoff=${OPTARG} ;;
            B) best_hit_mode='true' ;;
            F) overwrite_output_dir='true' ;;
            d) debug_flag='true' ;;
            n) num_cpus=${OPTARG} ;;
            j) num_jobs=${OPTARG} ;;
            M) num_muscle_threads=${OPTARG} ;;
            X) override_faster_alignment='true' ;;
            P) http_flag='true' ;;
            :)
                # an option that needs a value was given as the last argument without one
                printf "\n  ${RED}Argument -%s requires a value, but none was provided.${NC}\n\n    Run 'GToTree' with no arguments or '-h' only to see help menu.\n\n" "${OPTARG}" >&2
                exit 1
                ;;
            \?)
                printf "\n  ${RED}Invalid argument: -%s${NC}\n\n    Run 'GToTree' with no arguments or '-h' only to see help menu.\n\n" "${OPTARG}" >&2
                exit 1
                ;;
        esac
    done
}

gtt_parse_args "$@"

##################################
##### SOME PRE-FLIGHT CHECKS #####
##################################


# checking if `-L` argument was set that also `-t` or `-D` were provided, otherwise no tax info is added
# (a `-L` value identical to the default string can't be told apart from `-L` not being given)
# all variables are quoted in these checks so empty or multi-word values are compared
# as they are rather than breaking the test and skipping the check
if [[ "${lineage_spec}" != "Domain,Phylum,Class,Species,Strain" ]]; then
    if [[ "${taxonkit_id_swap}" != "true" && "${gtdb_id_swap}" != "true" ]]; then
        printf "\n  ${RED}You specified to add lineage info to headers with the \`-L ...\` argument, but\n"
        printf "  neither the \`-t\` nor \`-D\` flags were provided to indicate which taxonomy to use :(${NC}\n\n"
        printf "  See \`GToTree -h\` for more info.\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi
fi

# checking if taxonkit is available if it was specified
if [[ "${taxonkit_id_swap}" != "false" ]]; then
    if ! command -v taxonkit > /dev/null; then
        printf "\n  ${RED}You specified to add lineages to headers, but 'taxonkit' not found in your PATH :(${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi
fi


# checking specified tree program is one of the expected ones
case "${tree_program}" in
    FastTreeMP | FastTree | IQ-TREE) ;;
    *)
        printf "\n  ${RED}You specified to use %s as the treeing program, but that is not one of the options :(${NC}\n" "${tree_program}"
        printf "\n  Currently available options are 'FastTreeMP' (the default), 'FastTree', or 'IQ-TREE'.\n\n"

        printf "  You can also run GToTree in alignment-only mode by adding the \"-N\" flag,\n"
        printf "  and then take your concatenated alignment to another tree program :)\n"
        printf "\nExiting for now.\n\n"
        exit 1
        ;;
esac

# checking FastTreeMP is available if specified, and falling back to FastTree if it isn't
wanting_but_missing_FastTreeMP="null"
if [[ "${tree_program}" == "FastTreeMP" ]]; then
    if ! command -v FastTreeMP > /dev/null; then
        # keeping track so can report in log that regular FastTree was used
        wanting_but_missing_FastTreeMP="true"
        tree_program="FastTree"
    fi
fi

# checking iqtree is available if it was specified
if [[ "${tree_program}" == "IQ-TREE" ]]; then
    if ! command -v iqtree > /dev/null; then
        printf "\n  ${RED}You specified to use IQ-TREE, but 'iqtree' not found in your PATH :(${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi
fi


### if KO targets were provided, checking that KO data is present already, and downloading if it isn't
# NOTE(review): the exit status of the helper is not checked here - confirm it stops the run itself on failure
if [[ -n "${target_KOs}" ]]; then
    gtt-get-kofamscan-data
fi

### if ncbi taxonomy info wanted, checking that ncbi tax data is present already, and downloading if it isn't
if [[ "${taxonkit_id_swap}" != "false" ]]; then
    gtt-get-ncbi-tax-data
fi


##########################################################################
##### CHECKING INPUT FILES, INCLUDING IF THEY HAVE CRLF LINE ENDINGS #####
##########################################################################

#######################################
# Switches an input file over to a dos2unix-converted copy when dos2unix
# would change its contents (e.g. Windows CRLF line endings).
# The copy is written next to the original as "<file>-unix", and the variable
# holding the file name is updated to point at it.
# Arguments: $1 - NAME of the script-level variable holding the file path
# Outputs:   a notice on stdout when the file was converted
# Returns:   0 when done; exits the script with status 1 if the conversion fails
#######################################
gtt_use_unix_line_endings() {
    local var_name=$1
    local in_file=${!var_name}
    local unix_file="${in_file}-unix"

    # nothing to do when dos2unix leaves the contents unchanged
    if dos2unix < "${in_file}" 2> /dev/null | cmp -s - "${in_file}"; then
        return 0
    fi

    # not switching over to a converted file that could not actually be written
    if ! dos2unix -n "${in_file}" "${unix_file}" 2> /dev/null; then
        printf "\n${RED}  Input file \"%s\" could not be checked and converted with \`dos2unix\`.${NC}\n" "${in_file}"
        printf "\nExiting for now.\n\n"
        exit 1
    fi

    printf "\n${YELLOW}    Input file \"%s\" had some Windows-formatting that likely\n" "${in_file}"
    printf "    would have caused problems. It has been converted with \`dos2unix\` into file\n"
    printf "    \"%s\", and we will be using that.${NC}\n\n" "${unix_file}"

    sleep 2

    printf -v "${var_name}" '%s' "${unix_file}"
}

#######################################
# Checks a single-column input list: line endings, duplicate lines, spaces
# and tabs. Nothing is checked if the variable is empty or is not a file.
# Arguments: $1 - NAME of the script-level variable holding the list's path
#            $2 - noun used in the duplicates message ("accessions"/"entries")
# Globals:   num_dupes (written; it was a script-level variable before these
#            checks were moved into a function, so it is left that way)
# Returns:   0 if the list is fine; exits the script with status 1 otherwise
#######################################
gtt_check_input_list() {
    local var_name=$1
    local unique_noun=$2
    local list_file=${!var_name}

    [[ -f "${list_file}" ]] || return 0

    # checking if file has windows CRLF line-terminators, and running dos2unix if so
    gtt_use_unix_line_endings "${var_name}"
    list_file=${!var_name}

    # checking no duplicates in there
    num_dupes=$(sort -- "${list_file}" | uniq -d | wc -l | tr -d '[:space:]')
    if [[ "${num_dupes}" != 0 ]]; then
        printf "\n${RED}  %s has duplicate entries, check it out and provide unique %s only.${NC}\n" "${list_file}" "${unique_noun}"
        printf "\nExiting for now.\n\n"
        exit 1
    fi

    # checking no whitespace in there
    if grep -q " " -- "${list_file}"; then
        printf "\n${RED}  %s has spaces in it, maybe they are at the end of all of the entries?${NC}\n" "${list_file}"
        printf "  They might cause problems. Check it out and provide one with no spaces.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi

    # checking no tabs in there
    if grep -q $'\t' -- "${list_file}"; then
        printf "\n${RED}  %s has tabs in it, maybe they are at the end of all of the entries?${NC}\n" "${list_file}"
        printf "  They might cause problems. Check it out and provide one with no tabs.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi
}

#######################################
# Checks the genome-label mapping file: line endings and duplicate labels.
# Nothing is checked if no mapping file was given or it is not a file.
# Globals:   file_to_genome_id_map (read, may be updated), num_dupes (written)
# Returns:   0 if the file is fine; exits the script with status 1 otherwise
#######################################
gtt_check_mapping_file() {
    [[ -f "${file_to_genome_id_map}" ]] || return 0

    # checking if file has windows CRLF line-terminators, and running dos2unix if so
    gtt_use_unix_line_endings file_to_genome_id_map

    # looking for duplicate desired labels in mapping file (columns 2 and 3 joined by "_")
    # NOTE(review): rows with an empty column 2 that share the same column-3 tag
    # count as duplicates here - confirm that is intended
    num_dupes=$(cut -f 2,3 -- "${file_to_genome_id_map}" | tr "\t" "_" | sort | uniq -d | wc -l | tr -d '[:space:]')
    if [[ "${num_dupes}" != 0 ]]; then
        printf "\n${RED}  %s appears to have duplicate labels in there.${NC}\n" "${file_to_genome_id_map}"
        printf "${RED}Check it out and provide unique labels only.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi
}

# checking out NCBI accessions file
gtt_check_input_list NCBI_acc_file accessions

# checking out input fasta files file
gtt_check_input_list fasta_files entries

# checking out input amino-acid files file
gtt_check_input_list amino_acid_files entries

# checking out input genbank files file
gtt_check_input_list genbank_list_file entries

### checking on mapping file
gtt_check_mapping_file


# checking numeric inputs and setting defaults if not provided
# (any invalid value ends the run with a non-zero exit status)

# "-c": gene-length cutoff proportion
if [[ -z "${len_cutoff}" ]]; then
    len_cutoff="0.2"
    mult_len_cut=$(echo "$len_cutoff * 100" | bc | cut -f 1 -d ".") # just storing for later use
else
    # has to look like a plain non-negative number, as bc would evaluate
    # other text (e.g. "abc") to 0 and let it through the range check
    if ! [[ "${len_cutoff}" =~ ^([0-9]+([.][0-9]*)?|[.][0-9]+)$ ]]; then
        printf "\n  ${RED}The gene-length cutoff proportion (\"-c\") needs to be between 0-1.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi

    # checking is in 0-1 range (bash can't compare floats so multiplying first)
    mult_len_cut=$(echo "$len_cutoff * 100" | bc | cut -f 1 -d ".")
    # bc prints results below 1 without a leading zero (e.g. ".500"), which leaves nothing before the "."
    mult_len_cut=${mult_len_cut:-0}

    if ! { [ "${mult_len_cut}" -ge 0 ] && [ "${mult_len_cut}" -le 100 ]; }; then
        printf "\n  ${RED}The gene-length cutoff proportion (\"-c\") needs to be between 0-1.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi
fi

# "-G": minimum fraction of target genes a genome needs hits to
if [[ -z "${gen_cutoff}" ]]; then
    gen_cutoff="0.5"
    mult_gen_cut=$(echo "$gen_cutoff * 100" | bc | cut -f 1 -d ".") # just storing for later use
else
    # same validation as for "-c" above
    if ! [[ "${gen_cutoff}" =~ ^([0-9]+([.][0-9]*)?|[.][0-9]+)$ ]]; then
        printf "\n  ${RED}The minimum genome gene-copy proportion (\"-G\") needs to be between 0-1.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi

    # checking is in 0-1 range (bash can't compare floats so multiplying first)
    mult_gen_cut=$(echo "$gen_cutoff * 100" | bc | cut -f 1 -d ".")
    mult_gen_cut=${mult_gen_cut:-0}

    if ! { [ "${mult_gen_cut}" -ge 0 ] && [ "${mult_gen_cut}" -le 100 ]; }; then
        printf "\n  ${RED}The minimum genome gene-copy proportion (\"-G\") needs to be between 0-1.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi
fi

# "-n": cpus for the HMM search
if [[ -z "${num_cpus}" ]]; then
    num_cpus=2
else
    # checking is an integer
    if ! [[ "${num_cpus}" =~ ^[0-9]+$ ]]; then
        printf "\n  ${RED}The value provided for cpus (\"-n\") needs to be an integer.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi

    # numeric comparison, so values such as "00" are caught too
    if [ "${num_cpus}" -eq 0 ]; then
        printf "\n  ${RED}The value provided for cpus (\"-n\") needs to be greater than 0.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi
fi


# "-M": muscle threads
if [[ -z "${num_muscle_threads}" ]]; then
    num_muscle_threads=5
else
    # checking is an integer
    if ! [[ "${num_muscle_threads}" =~ ^[0-9]+$ ]]; then
        printf "\n  ${RED}The value provided for muscle threads (\"-M\") needs to be an integer.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi

    # this has to look at the muscle-threads value itself (not the cpus value)
    if [ "${num_muscle_threads}" -eq 0 ]; then
        printf "\n  ${RED}The value provided for muscle threads (\"-M\") needs to be greater than 0.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi
fi


# "-j": jobs to run in parallel
if [[ -z "${num_jobs}" ]]; then
    num_jobs=1
else
    # checking is an integer
    if ! [[ "${num_jobs}" =~ ^[0-9]+$ ]]; then
        printf "\n  ${RED}The value provided for number of jobs to run in parallel (\"-j\") needs\n"
        printf "    to be an integer.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi

    if [ "${num_jobs}" -eq 0 ]; then
        printf "\n  ${RED}The value provided for number of jobs to run in parallel (\"-j\") needs\n"
        printf "    to be greater than 0.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit 1
    fi

fi


#############################################################################
############  MAKING SURE MINIMUM REQUIRED INPUTS WERE PROVIDED  ############
#############################################################################
# at least one of the four genome sources (-a, -g, -f, -A) has to be given;
# a missing required input ends the run with a non-zero exit status
if [[ -z "${NCBI_acc_file}" && -z "${genbank_list_file}" && -z "${fasta_files}" && -z "${amino_acid_files}" ]]; then
    printf "\n  ${RED}You need to provide at least one input-genome source!${NC}\n"
    printf "\nExiting for now.\n\n"
    exit 1
fi

# the HMM file/name (-H) is always required
if [[ -z "${hmm_file}" ]]; then
    printf "\n  ${RED}You need to provide the HMM file of the target genes you want to tree!${NC}\n"
    printf "  You can view the available gene-sets packaged with GToTree by running 'gtt-hmms' by\n"
    printf "  itself with no arguments.\n"

    printf "\nExiting for now.\n\n"
    exit 1
fi



#############################################################################
####  CHECKING SPECIFIED LINEAGE INFO TO ADD TO LABELS IS INTERPRETABLE  ####
#############################################################################
# only needed when something other than the default ranks was requested
if [[ "${lineage_spec}" != "Domain,Phylum,Class,Species,Strain" ]]; then

    # writing the requested ranks one per line for the checking helper
    # (printf with a quoted value, so the text is passed along exactly as given)
    printf '%s\n' "${lineage_spec}" | tr "," "\n" > gtotree.specified-lineage-info.tmp
    gtt-check-wanted-lineage-info -w gtotree.specified-lineage-info.tmp

    # presumably written by the helper above with any ranks it could not interpret - verify against the helper
    if [[ -s gtotree.uninterpretable_ranks.tmp ]]; then
        printf "    ${RED}One or more of the specified lineage ranks to add (passed to \"-L\") were uninterpretable:${NC}\n"
        sed 's/^/        /' gtotree.uninterpretable_ranks.tmp
        printf "\n\n    Available options are:\n        Domain\n        Phylum\n        Class\n        Order\n        Family\n        Genus\n        Species\n        Strain\n"
        printf "\nExiting for now.\n\n"
        rm -f gtotree.specified-lineage-info.tmp gtotree.uninterpretable_ranks.tmp
        exit 1
    fi

    # -f so a report file that was never created does not cause an error message here
    rm -f gtotree.specified-lineage-info.tmp gtotree.uninterpretable_ranks.tmp
fi


#############################################################################
####################  ATTEMPTING TO CREATE OUTPUT DIR  ######################
#############################################################################

## will not overwrite unless `-F` flag was provided
# Paths are quoted (and preceded by "--") throughout so that an output directory
# name holding spaces, glob characters, or a leading dash is handled as one literal path.
if [ "$overwrite_output_dir" == "true" ]; then

    rm -rf -- "$output_dir"
    mkdir -- "$output_dir"

else

    # using mkdir itself as the test: it fails if the directory can't be created
    if ! mkdir -- "$output_dir" 2> /dev/null; then
        printf "  ${RED}Output directory \"%s\" already exists, either remove or rename\n  that one, specify a different output directory to the \"-o\" option,\n  or add the \"-F\" flag to force overwriting it.${NC}\n\n" "${output_dir}"
        printf "Exiting for now.\n\n"
        exit
    else
        rm -rf -- "$output_dir" # removing here so if we abort in a later step before getting down to business (like the gene-filtering warning), the output_dir isn't created yet
    fi

fi

#############################################################################
########################### STARTING LOG FILE ###############################
#############################################################################
# the run log sits next to the output directory and is named after it
gtotree_log="${output_dir}-gtotree-runlog.txt"


#############################################################################
########## CHECKING INPUT GENOME SOURCES AND SPECIFIC DEPENDENCIES ##########
#############################################################################

# ">" starts the log fresh; everything after this appends to it
printf "\n\n               GToTree ${VERSION} (github.com/AstrobioMike/GToTree)\n\n" > "$gtotree_log"

# what goes to the terminal is also sent to the log, with color codes stripped by sed
printf "\n ---------------------------------  RUN INFO  --------------------------------- \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

# storing the command as entered for the log file (%q shell-quotes each piece)
command_call="$(printf %q "$BASH_SOURCE")$(printf ' %q' "$@")"

# removing the backslash in front of any commas; the text is passed as data ('%s')
# and never as the printf format, so a "%" or backslash in an argument is kept as typed
command_call=$(printf '%s' "$command_call" | sed 's/\\,/,/g')

printf "    Command entered:\n" >> "$gtotree_log"
printf "    %s\n\n" "$command_call" >> "$gtotree_log"

printf "${YELLOW}\n    Input genome sources include:\n${NC}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

### checking/reporting those provided as NCBI accessions
# Sets NCBI_input_genomes_total to the number of lines in the accessions file (0 if none given).
if [ "$NCBI_acc_file" != "" ]; then

    if [ -f "$NCBI_acc_file" ]; then
        # reading through stdin so wc reports the count only; tr strips any padding around it
        NCBI_input_genomes_total=$(wc -l < "$NCBI_acc_file" | tr -d '[:space:]')
        # file name passed as a printf argument so any "%" or backslash in it prints as-is
        printf "      - NCBI accessions listed in %s (%s genomes)\n" "$NCBI_acc_file" "$NCBI_input_genomes_total" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    else
        printf "\n${RED}      You specified %s as a source of NCBI genomes to use, but that file cannot be found :(${NC}\n" "$NCBI_acc_file" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        exit
    fi

else
    NCBI_input_genomes_total=0
fi

### checking/reporting those provided as genbank files
# Sets genbank_genomes_total to the number of lines in the GenBank list file (0 if none given),
# after making sure every file it lists exists and is not empty.
if [ "$genbank_list_file" != "" ]; then
    if [ -s "$genbank_list_file" ]; then
        # checking all the files pointed to by this file can be found
        # (entries are split on whitespace here, so a listed path can't itself hold spaces)
        for file in $(cat "$genbank_list_file")
        do
            if [ ! -s "$file" ]; then
                printf "\n${RED}      Some genbank files specified in %s cannot be found :(${NC}\n\n" "$genbank_list_file" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                printf "  Double-check the provided locations and where they should be, here's one of the problems:\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                printf "            %s\n" "$file" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                exit
            fi
        done

        # reading through stdin so wc reports the count only; tr strips any padding around it
        genbank_genomes_total=$(wc -l < "$genbank_list_file" | tr -d '[:space:]')
        printf "      - Genbank files listed in %s (%s genomes)\n" "$genbank_list_file" "$genbank_genomes_total" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    else
        printf "\n${RED}      You specified %s as a source of GenBank files to use, but that file cannot be found or is empty :(${NC}\n" "$genbank_list_file" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        exit
    fi

else
    genbank_genomes_total=0

fi

### checking/reporting those provided as fasta files
# Sets fasta_genomes_total to the number of lines in the fasta list file (0 if none given),
# after making sure prodigal is available and every listed file exists and is not empty.
if [ "$fasta_files" != "" ]; then
    if ! command -v prodigal > /dev/null; then
        printf "\n  ${RED}Prodigal is required when providing fasta files.${NC}\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        exit
    fi

    if [ -s "$fasta_files" ]; then

        # checking all files pointed to by this file can be found
        # (entries are split on whitespace here, so a listed path can't itself hold spaces)
        for file in $(cat "$fasta_files")
        do
            if [ ! -s "$file" ]; then
                printf "\n${RED}      Some fasta files specified in %s cannot be found :(${NC}\n\n" "$fasta_files" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                printf "  Double-check the provided locations and where they should be, here's one of the problems:\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                printf "            %s\n" "$file" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                exit
            fi
        done

        # reading through stdin so wc reports the count only; tr strips any padding around it
        fasta_genomes_total=$(wc -l < "$fasta_files" | tr -d '[:space:]')
        printf "      - Fasta files listed in %s (%s genomes)\n" "$fasta_files" "$fasta_genomes_total" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    else
        printf "\n${RED}      You specified %s as a source of fasta files to use, but that file cannot be found or is empty :(${NC}\n" "$fasta_files" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        exit
    fi

else
    fasta_genomes_total=0

fi

### checking/reporting those provided as amino acid files of coding sequences
# Sets amino_acid_genomes_total to the number of lines in the amino-acid list file (0 if none
# given), after making sure every file it lists exists and is not empty.
if [ "$amino_acid_files" != "" ]; then

    if [ -s "$amino_acid_files" ]; then

        # checking all files pointed to by this file can be found
        # (entries are split on whitespace here, so a listed path can't itself hold spaces)
        for file in $(cat "$amino_acid_files")
        do
            if [ ! -s "$file" ]; then
                printf "\n${RED}      Some amino acid files specified in %s cannot be found :(${NC}\n\n" "$amino_acid_files" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                printf "  Double-check the provided locations and where they should be, here's one of the problems:\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                printf "            %s\n" "$file" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                exit
            fi
        done

        # reading through stdin so wc reports the count only; tr strips any padding around it
        amino_acid_genomes_total=$(wc -l < "$amino_acid_files" | tr -d '[:space:]')
        printf "      - Amino-acid files listed in %s (%s genomes)\n" "$amino_acid_files" "$amino_acid_genomes_total" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    else
        # message names amino acid files (it previously said "fasta files", copied from the block above)
        printf "\n${RED}      You specified %s as a source of amino acid files to use, but that file cannot be found or is empty :(${NC}\n" "$amino_acid_files" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        exit
    fi

else
    amino_acid_genomes_total=0

fi


### reporting total number of planned genomes
# summing the four per-source counts; they are referenced by name inside the
# arithmetic expansion, and the log path is quoted so a path holding spaces
# doesn't break the redirect
total_input_genomes=$(( NCBI_input_genomes_total + genbank_genomes_total + fasta_genomes_total + amino_acid_genomes_total ))
printf "\n                             ${GREEN}Total input genomes: %s${NC}\n" "$total_input_genomes" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )


#############################################################################
###########  ADDING NOTICE ABOUT SPEED AND THOUSANDS OF GENOMES  ############
#############################################################################
# Shown when there are 1000 or more input genomes and override_faster_alignment is 'false'.
# The lines of the notice are grouped and sent through a single tee/sed, so they reach the
# log in the order written (a separate background sed per line gives no such guarantee).
if [ "$total_input_genomes" -ge 1000 ] && [ "$override_faster_alignment" == 'false' ]; then
    {
        printf "\n  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}  \n"
        printf "    We seem to be aiming to work with %s total genomes. This is quite a\n" "$total_input_genomes"
        printf "    bit, and the time the individual gene alignments take can quickly become\n"
        printf "    prohibitive with many thousands of genomes like this. By default, GToTree\n"
        printf "    is going to use the 'super5' muscle algorithm to help speed up the alignments\n"
        printf "    for this run (since we have so many here). If you don't want this to\n"
        printf "    happen, you should cancel this run now with 'ctrl + c', and add the '-X' \n"
        printf "    flag to the GToTree call.\n\n"

        printf "    More info can be found here:\n"
        printf "      github.com/AstrobioMike/GToTree/wiki/things-to-consider#working-with-many-genomes\n\n"

        printf "    And while we're chatting, you may also want to consider\n"
        printf "    using \"representative\" genomes, if you're not already. More info on that can be found here:\n"
        printf "      github.com/AstrobioMike/GToTree/wiki/things-to-consider#consider-using-representative-genomes\n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # this line goes to the terminal only, it is not written to the log
    printf "    We will wait 30 seconds before continuing with our regularly scheduled program :)\n"

    printf "  ${YELLOW}****************************************************************************${NC}  \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # giving time to cancel with ctrl + c, as the notice says
    sleep 30

fi


#############################################################################
###############  CHECKING AND REPORTING SPECIFIED HMM SOURCE  ###############
#############################################################################
printf "${YELLOW}\n    HMM source to be used:\n${NC}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

# the shorthand "Universal" is mapped to the "Universal_Hug_et_al" set
if [ "$hmm_file" == "Universal" ]; then
    hmm_file="Universal_Hug_et_al"
    universal_SCGs_used="true"
fi

# adding ".hmm" suffix if not present
if [[ "$hmm_file" != *.hmm ]]; then
    hmm_file="${hmm_file}.hmm"
fi

if [ -f "$hmm_file" ]; then
    # this is if the user provided a path to the HMM file directly
    # (one target per "NAME" line in the HMM file)
    grep "^NAME" "$hmm_file" | tr -s " " | cut -f 2 -d " " > uniq_hmm_names.tmp
    hmm_target_genes_total=$(wc -l < uniq_hmm_names.tmp | tr -d '[:space:]')
    printf "      - %s (%s targets)\n" "$hmm_file" "$hmm_target_genes_total" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

else
    # if not a direct path by user, we are expecting it to be in the GToTree stored HMMs directory.
    # The checks below run one after another (not as alternatives), so that a directory
    # that had to be created still goes on to have the HMM file downloaded into it.

    # making sure the variable holds something
    if [ -z "${GToTree_HMM_dir}" ]; then
        printf "\n${RED}      The 'GToTree_HMM_dir' variable is not set :( Use \`gtt-data-locations\` to check and configure.\n${NC}\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        exit
    fi

    # making sure directory exists, or that we can create it if not
    if [ ! -d "${GToTree_HMM_dir}" ]; then
        if ! mkdir -p "${GToTree_HMM_dir}" > /dev/null; then
            printf "\n${RED}      The 'GToTree_HMM_dir' location does not exist and can't be created :( Use \`gtt-data-locations\` to check and configure.\n${NC}\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
            printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
            exit
        fi
    fi

    # making sure it is writable
    if [ ! -w "${GToTree_HMM_dir}" ]; then
        printf "\n${RED}      The 'GToTree_HMM_dir' location is not writable for you :( Use \`gtt-data-locations\` to check and configure.\n${NC}\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        exit
    fi

    # building every path the same way, whether or not the variable ends with a "/"
    stored_hmm_dir="${GToTree_HMM_dir%/}"
    hmm_info_table="${stored_hmm_dir}/hmm-sources-and-info.tsv"

    # now moving on to setting or getting the hmm file
    # if it is not currently present there, we will download it and store it there
    if [ -f "${stored_hmm_dir}/${hmm_file}" ]; then

        hmm_file="${stored_hmm_dir}/${hmm_file}"

        grep "^NAME" "$hmm_file" | tr -s " " | cut -f 2 -d " " > uniq_hmm_names.tmp
        hmm_target_genes_total=$(wc -l < uniq_hmm_names.tmp | tr -d '[:space:]')
        printf "      - %s (%s targets)\n" "$(basename "${hmm_file}" | sed 's/\.hmm//')" "$hmm_target_genes_total" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    else

        # checking info table is there, downloading if not
        if [ ! -f "${hmm_info_table}" ]; then
            curl --silent --retry 10 -L -o "${hmm_info_table}" https://figshare.com/ndownloader/files/37703646
        fi

        # searching for what was requested; the download link is read from column 5
        # of the matching line (starting empty so only a match found here is used)
        wanted_hmm_link=""
        if grep -q -w "^${hmm_file}" "${hmm_info_table}"; then
            wanted_hmm_link=$(grep -w "^${hmm_file}" "${hmm_info_table}" | cut -f 5)
        fi

        # if it was found, moving forward with download
        if [ -n "${wanted_hmm_link}" ]; then

            base_hmm_file_name="${hmm_file}"
            hmm_file="${stored_hmm_dir}/${hmm_file}"

            # downloading
            curl --silent --retry 10 -L -o "${hmm_file}" "${wanted_hmm_link}"

            # making sure download was completed successfully by checking md5
            # (expected value is read from column 8 of the info table)
            expected_md5=$(grep -w "^${base_hmm_file_name}" "${hmm_info_table}" | cut -f 8)

            downloaded_md5=$(md5sum "${hmm_file}" | cut -f 1 -d " ")

            if [ "${expected_md5}" != "${downloaded_md5}" ]; then

                printf "\n${RED}      There seems to have been a problem downloading the %s file :(${NC}\n" "${base_hmm_file_name}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                printf "      Please try again, and if problems continue, please report an issue here:\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                printf "              https://github.com/AstrobioMike/GToTree/issues\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
                # not leaving a partial or corrupt download behind to be picked up next run
                rm -rf -- "${hmm_file}"
                exit

            fi

            # reporting info
            grep "^NAME" "${hmm_file}" | tr -s " " | cut -f 2 -d " " > uniq_hmm_names.tmp
            hmm_target_genes_total=$(wc -l < uniq_hmm_names.tmp | tr -d '[:space:]')

            printf "      - %s (%s targets)\n" "$(basename "${hmm_file}")" "$hmm_target_genes_total" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

        else

            printf "\n${RED}      You specified %s as the HMM file to use, but that file cannot be found :(${NC}\n" "$hmm_file" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
            printf "      You can see the available gene-sets packaged with GToTree by running \`gtt-hmms\`\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
            printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
            exit

        fi

    fi

fi


#############################################################################
##############  EXPLICITLY STATING IF DEFAULT BEHAVIOR CHANGED  #############
#############################################################################
# Entered when any option differs from its default. The "-http" style flag (http_flag) is
# part of the condition too, so that it still gets reported when it is the only thing changed.
if [[ "$output_dir" != "GToTree_output" || "$file_to_genome_id_map" != "" || \
      "$taxonkit_id_swap" != "false" || "$gtdb_id_swap" != "false" || "$len_cutoff" != "0.2" || \
      "$gen_cutoff" != "0.5" || "$debug_flag" == "true" || "$best_hit_mode" == "true" || \
      "$num_jobs" != "1" || "$num_cpus" != "2" || "$lineage_spec" != "Domain,Phylum,Class,Species,Strain" || \
      "$tree_program" != "FastTreeMP" || "$wanting_but_missing_FastTreeMP" == "true" || \
      "$align_only" == 'true' || "$keep_individual_alignments" == 'true' || \
      "$http_flag" == "true" || \
      -n "$target_pfams" || -n "$target_KOs" ]]; then

    # a label-mapping file, if given, has to exist and hold something
    if [ "$file_to_genome_id_map" != "" ]; then
        if [ ! -s "$file_to_genome_id_map" ]; then
            printf "\n${RED}      You specified %s, but that file cannot be found or is empty :(${NC}\n" "$file_to_genome_id_map" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
            printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
            exit
        fi
    fi

    printf "${YELLOW}\n    Options set:\n${NC}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    if [ "$output_dir" != "GToTree_output" ]; then
        printf "      - The output directory has been set to \"%s/\"\n" "$output_dir" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    fi

    if [ "$file_to_genome_id_map" != "" ]; then
        printf "      - The file \"%s\" will be used to modify labels of the specified genomes\n" "$file_to_genome_id_map" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    fi

    # taxonomy-label sources: taxonkit only, GTDB only, or both
    if [ "$taxonkit_id_swap" != "false" ] && [ "$gtdb_id_swap" == "false" ]; then
        printf "      - Taxonkit will be used to add NCBI taxonomy info to labels where possible\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        taxonkit_used="true"
    fi

    if [ "$gtdb_id_swap" != "false" ] && [ "$taxonkit_id_swap" == "false" ]; then
        printf "      - GTDB taxonomic info will be added to labels where possible\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        gtdb_used="true"
    fi

    if [ "$taxonkit_id_swap" != "false" ] && [ "$gtdb_id_swap" != "false" ]; then
        printf "      - GTDB taxonomic info will be added to labels where possible\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        printf "      - NCBI taxonomic info will be added where possible when GTDB is not\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        taxonkit_used="true"
        gtdb_used="true"
    fi

    if [ "$lineage_spec" != "Domain,Phylum,Class,Species,Strain" ]; then
        # adding a space after each comma for display
        lineage_mod_report=$(printf '%s\n' "$lineage_spec" | sed 's/,/, /g')
        printf "      - Lineage information added to labels will be %s\n" "$lineage_mod_report" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    fi

    if [ "$len_cutoff" != "0.2" ]; then
        printf "      - Gene-length filtering cutoff threshold (\"-c\") has been set to %s\n" "$len_cutoff" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    fi

    if [ "$align_only" == 'true' ]; then
        printf "      - Only generating alignment, no tree, as \"-N\" option has been provided\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    fi

    if [ "$align_only" == 'false' ]; then

        # note this switches tree_program as well as reporting it; written to the log
        # like the other option lines
        if [ "$wanting_but_missing_FastTreeMP" == "true" ]; then
            printf "      - FastTreeMP not available on system, so using non-parallel FastTree instead\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
            tree_program="FastTree"
        fi

        if [ "$tree_program" != "FastTreeMP" ] && [ "$wanting_but_missing_FastTreeMP" != "true" ]; then
            printf "      - Tree generation program (\"-T\") has been set to %s\n" "$tree_program" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        fi

    fi

    if [ "$keep_individual_alignments" == 'true' ]; then
        printf "      - Individual protein alignment files will be retained, due to the \"-k\" flag being provided\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    fi

    if [ "$gen_cutoff" != "0.5" ]; then
        printf "      - Genome minimum gene-copy threshold (\"-G\") has been set to %s\n" "$gen_cutoff" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    fi

    if [ "$num_cpus" != "2" ]; then
        printf "      - Number of cpus to use during hmm search (\"-n\") has been set to %s\n" "$num_cpus" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    fi

    if [ "$num_jobs" != "1" ]; then
        printf "      - Number of jobs to run during parallelizable steps has been set to %s\n" "$num_jobs" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        parallel_used="true"
    fi

    if [ "$best_hit_mode" == "true" ]; then
        printf "      - Running in \"best-hit\" mode\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    fi

    if [ "$debug_flag" == "true" ]; then
        printf "      - Debug mode on. Temp directory won't be removed after run\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    fi

    if [ "$http_flag" == "true" ]; then
        printf "      - Attempting to use http instead of ftp\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
    fi

fi

#### checking and reporting if KOs targeted ####
# Validates the file of target KOs (if one was given): it has to exist and be non-empty,
# use unix line endings (converted if not), and hold no duplicates, spaces, or tabs.
if [ "$target_KOs" != "" ]; then

    if [ -s "$target_KOs" ]; then
        uniq "${target_KOs}" > uniq_ko_targets.tmp
        ko_target_genes_total=$(wc -l < uniq_ko_targets.tmp | tr -d '[:space:]')
        printf "      - KOs will be searched from: %s (%s targets)\n" "$target_KOs" "$ko_target_genes_total" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        ko_targets="true"
        # setting variable to track citation
        kofamscan_used="true"

    else
        # message names KOs (it previously said "Pfams", copied from the Pfam check)
        printf "\n${RED}      You specified %s as your target KOs to search, but that file cannot be found :(${NC}\n" "$target_KOs" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        exit
    fi

    # checking if file has windows CRLF line-terminators, and running dos2unix if so
    # (the file is compared against what dos2unix would turn it into)
    if ! dos2unix < "${target_KOs}" | cmp - "${target_KOs}" > /dev/null; then
        dos2unix -n "${target_KOs}" "${target_KOs}-unix" 2> /dev/null

        printf "\n${YELLOW}    Input file \"%s\" had some Windows-formatting that likely\n" "${target_KOs}"
        printf "    would have caused problems. It has been converted with \`dos2unix\` into file\n"
        printf "    \"%s-unix\", and we will be using that.${NC}\n\n" "${target_KOs}"

        sleep 2

        target_KOs="${target_KOs}-unix"

        # rebuilding the list made above from the converted file, so it doesn't
        # keep the carriage returns of the original
        uniq "${target_KOs}" > uniq_ko_targets.tmp

    fi

    # checking no duplicates in there
    num_dupes=$(sort "$target_KOs" | uniq -d | wc -l | tr -d '[:space:]')
    if [ "$num_dupes" != 0 ]; then
        printf "\n${RED}  %s has duplicate entries, check it out and provide unique accessions only.${NC}\n" "$target_KOs"
        printf "\nExiting for now.\n\n"
        exit
    fi

    # checking no whitespace in there
    if grep -q " " "${target_KOs}"; then
        printf "\n${RED}  %s has spaces in it, maybe they are at the end of all of the entries?${NC}\n" "$target_KOs"
        printf "  They might cause problems. Check it out and provide one with no spaces.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit
    fi

    # checking no tabs in there
    if grep -q $'\t' "${target_KOs}"; then
        printf "\n${RED}  %s has tabs in it, maybe they are at the end of all of the entries?${NC}\n" "$target_KOs"
        printf "  They might cause problems. Check it out and provide one with no tabs.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit
    fi

fi


#### checking and reporting if pfams targeted ####
# Validates the file of target Pfams (if one was given): it has to exist and be non-empty,
# use unix line endings (converted if not), and hold no duplicates, spaces, or tabs.
if [ "$target_pfams" != "" ]; then

    if [ -s "$target_pfams" ]; then
        uniq "${target_pfams}" > uniq_pfam_targets.tmp
        pfam_target_genes_total=$(wc -l < uniq_pfam_targets.tmp | tr -d '[:space:]')
        printf "      - Pfams will be searched from: %s (%s targets)\n" "$target_pfams" "$pfam_target_genes_total" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        additional_pfam_targets="true"
        # setting variable to track citation
        pfam_db_used="true"
    else
        printf "\n${RED}      You specified %s as your target Pfams to search, but that file cannot be found :(${NC}\n" "$target_pfams" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        exit
    fi

    # checking if file has windows CRLF line-terminators, and running dos2unix if so
    # (the file is compared against what dos2unix would turn it into)
    if ! dos2unix < "${target_pfams}" | cmp - "${target_pfams}" > /dev/null; then
        dos2unix -n "${target_pfams}" "${target_pfams}-unix" 2> /dev/null

        printf "\n${YELLOW}    Input file \"%s\" had some Windows-formatting that likely\n" "${target_pfams}"
        printf "    would have caused problems. It has been converted with \`dos2unix\` into file\n"
        printf "    \"%s-unix\", and we will be using that.${NC}\n\n" "${target_pfams}"

        sleep 2

        target_pfams="${target_pfams}-unix"

        # rebuilding the list made above from the converted file, so it doesn't
        # keep the carriage returns of the original
        uniq "${target_pfams}" > uniq_pfam_targets.tmp

    fi

    # checking no duplicates in there
    num_dupes=$(sort "$target_pfams" | uniq -d | wc -l | tr -d '[:space:]')
    if [ "$num_dupes" != 0 ]; then
        printf "\n${RED}  %s has duplicate entries, check it out and provide unique accessions only.${NC}\n" "$target_pfams"
        printf "\nExiting for now.\n\n"
        exit
    fi

    # checking no whitespace in there
    if grep -q " " "${target_pfams}"; then
        printf "\n${RED}  %s has spaces in it, maybe they are at the end of all of the entries?${NC}\n" "$target_pfams"
        printf "  They might cause problems. Check it out and provide one with no spaces.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit
    fi

    # checking no tabs in there
    if grep -q $'\t' "${target_pfams}"; then
        printf "\n${RED}  %s has tabs in it, maybe they are at the end of all of the entries?${NC}\n" "$target_pfams"
        printf "  They might cause problems. Check it out and provide one with no tabs.${NC}\n"
        printf "\nExiting for now.\n\n"
        exit
    fi

fi

# blank line to the terminal after the run-info/options listing (not written to the log)
printf "\n"

# short pause before carrying on -- presumably to leave time to read what was just printed
sleep 3.5


#############################################################################
######  ADDING NOTICE ABOUT FILTERING BY GENE-LENGTH WITH FEW GENOMES  ######
#############################################################################
# Shown when there are 20 or fewer input genomes. The lines of the notice are grouped and
# sent through a single tee/sed, so they reach the log in the order written (a separate
# background sed per line gives no such guarantee).
if [ "$total_input_genomes" -le 20 ]; then
    {
        printf "\n  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}  \n"
        printf "    Filtering by gene-length using the median length of a gene set (set with\n"
        printf "    the \`-c\` flag) becomes less reliable with fewer genomes. With %s total\n" "$total_input_genomes"
        printf "    input genomes, if a lot of sequences are dropped, consider increasing\n"
        printf "    the parameter and/or visually inspecting the alignments.\n\n"
        printf "    More info can be found here:\n"
        printf "      github.com/AstrobioMike/GToTree/wiki/Things-to-consider\n\n"

        printf "\t        Moving forward with \"-c\" set to %s this run.\n" "$len_cutoff"

        printf "  ${YELLOW}****************************************************************************${NC}  \n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    sleep 3
fi

#############################################################################
########  ADDING NOTICE ABOUT COMPUTATIONAL LOAD WITH MANY GENOMES  #########
#############################################################################
# Shown when there are 12500 or more input genomes; waits for a single key press.
# The return key carries on with the run, any other key cancels it (the prompt says so).
# The lines of the notice are grouped and sent through a single tee/sed, so they reach
# the log in the order written.
if [ "$total_input_genomes" -ge 12500 ]; then
    {
        printf "\n  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}  \n"
        printf "    The alignment and treeing steps, particularly the alignments, can become\n"
        printf "    prohibitively memory-intensive with many input genomes. With %s total\n" "$total_input_genomes"
        printf "    input genomes, this job may not be feasible :( \n\n"

        printf "    Often it is useful to slim down how many genomes of closely related\n"
        printf "    organisms we are including when looking across a broad level of\n"
        printf "    diversity, as having many closely related organisms usually isn't helpful or\n"
        printf "    informative at broad levels.\n\n"

        printf "    Have you considered using \"representative\" genomes only (either from\n"
        printf "    NCBI or GTDB)? Those both provide helpful systems for reducing some\n"
        printf "    redundancy when working at a broad level with many organisms. See\n"
        printf "    github.com/AstrobioMike/GToTree/wiki/things-to-consider#consider-using-representative-genomes\n"

        printf "  ${YELLOW}****************************************************************************${NC}  \n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    printf "      ${YELLOW}Press the return key to continue, or any other key (or \`ctrl + c\`) to cancel this run...${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # IFS is emptied for this one read only, so that a space bar press comes through
    # as a space rather than as an empty value (which is what the return key gives)
    IFS= read -r -s -n 1 key_var

    if [[ "$key_var" == "" ]]; then
        printf "\n                         You're the boss! Movin' on :)\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        sleep 3
    else
        printf "\n                Exiting for now, hope to see you again soon :)\n\n"
        sleep 3
        # removing the log and the temp files written so far
        rm -rf -- "$gtotree_log" uniq_hmm_names.tmp uniq_ko_targets.tmp uniq_pfam_targets.tmp
        exit
    fi

fi


#############################################################################
######    CHECKING FOR AND SETTING UP GTDB REFERENCE FILES IF NEEDED    #####
#############################################################################

# Running the GTDB check/setup program for any value of gtdb_id_swap other than 'false';
# if it fails, the script stops there with that program's exit status.
case "$gtdb_id_swap" in
    false)
        ;;
    *)
        gtt-check-or-setup-GTDB-files || exit
        ;;
esac


#############################################################################
###############  CREATE A TEMP DIRECTORY AND EXITING IF FAILS  ##############
#############################################################################
# the temp directory is named after the current epoch second and is created in
# the current working directory
tmp_dir="$(date +%s).gtotree.tmpdir"

# if it can't be created: report (to the terminal and the run log), remove the
# intermediate target lists sitting in the working directory, and stop
if ! mkdir "${tmp_dir}" 2> /dev/null; then
    printf "\n${RED}  Tried to make temporary directory named ${tmp_dir} but failed, this shouldn't happen :(${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    rm -rf uniq_hmm_names.tmp uniq_pfam_targets.tmp
    printf "Exiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    exit
fi

### making sure each id provided in the mapping file (if given) is found in one
### of the genome input files

# Prints the basename of every path listed (one per line) in the file given as $1.
# Each line is read whole, so a path holding spaces or glob characters is neither
# split into several words nor expanded against the file system. Leading/trailing
# whitespace is trimmed, blank lines are skipped, and a last line that lacks a
# trailing newline is still processed.
gtt_print_listed_basenames() {
    local listed_path

    while read -r listed_path || [ -n "$listed_path" ]; do
        [ -n "$listed_path" ] || continue
        basename "$listed_path"
    done < "$1"
}

if [ "$file_to_genome_id_map" != "" ]; then

    # for the file-based input types, the IDs compared against the mapping file
    # are the basenames of the listed files
    if [ "$genbank_list_file" != "" ]; then
        genbank_list_file_basenames=${tmp_dir}/genbank_list_file_basenames.tmp
        gtt_print_listed_basenames "$genbank_list_file" > ${genbank_list_file_basenames}
    fi

    if [ "$fasta_files" != "" ]; then
        fasta_files_basenames=${tmp_dir}/fasta_files_basenames.tmp
        gtt_print_listed_basenames "$fasta_files" > ${fasta_files_basenames}
    fi

    if [ "$amino_acid_files" != "" ]; then
        amino_acid_files_basenames=${tmp_dir}/amino_acid_files_basenames.tmp
        gtt_print_listed_basenames "$amino_acid_files" > ${amino_acid_files_basenames}
    fi

    # NOTE: the variables handed to cat are deliberately left unquoted -- any input
    # type that was not provided expands to nothing and so drops out of the argument
    # list instead of being passed to cat as an empty file name
    cat $NCBI_acc_file $genbank_list_file_basenames $fasta_files_basenames $amino_acid_files_basenames | sort > ${tmp_dir}/sorted_all_input_genome_ids.tmp

    ## adding in properly handling mapping file if the user provides paths rather than just file names, re: https://github.com/AstrobioMike/GToTree/issues/14
    # (column 1 is reduced to whatever follows its last '/', the remaining columns are kept as they are)
    paste <( cut -f 1 "$file_to_genome_id_map" | awk -F '/' ' { print $NF } ' ) <( cut -f 2- "$file_to_genome_id_map" ) > ${tmp_dir}/new_id_map.tmp
    new_file_to_genome_id_map=${tmp_dir}/new_id_map.tmp

    # IDs present in the mapping file but absent from every input source
    comm -23 <(cut -f 1 ${tmp_dir}/new_id_map.tmp | sort) ${tmp_dir}/sorted_all_input_genome_ids.tmp > ${tmp_dir}/spurious_ids.tmp

    if [ -s ${tmp_dir}/spurious_ids.tmp ]; then
        spurious_ids_count=$(wc -l ${tmp_dir}/spurious_ids.tmp | sed "s/^ *//" | cut -d " " -f 1)

        # the offending IDs are left in the working directory for the user
        cp ${tmp_dir}/spurious_ids.tmp Missing_IDs.txt

        # intermediate target lists go into the temp directory, so they are removed
        # along with it below (or kept with it when debugging)
        mv uniq_hmm_names.tmp ${tmp_dir}/

        if [ -s "$target_pfams" ]; then
            mv uniq_pfam_targets.tmp ${tmp_dir}/
        fi

        if [ "$debug_flag" == 'false' ]; then
            rm -rf $tmp_dir
        fi

        printf "\n${RED}  $spurious_ids_count genome ID(s) listed in the mapping file (passed to \"-m\") not found in${NC}\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "  ${RED}any of the input genomes :(${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "  Reported in the file \"Missing_IDs.txt\" for you to investigate.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "  If things look the same at first glance, double-check there are no extra\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "  empty spaces or anything at the ends of the entries in that file.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

        rm -rf $output_dir
        printf "Exiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

        # this is an error exit, so report a non-zero status to whatever launched the run
        exit 1
    fi
fi

# remaking output directory now that things are a go
# (re)creating the output directory (any error message from this first mkdir is
# discarded) along with its run_files subdirectory
mkdir ${output_dir} 2> /dev/null
mkdir ${output_dir}/run_files/

# the target-list files sitting in the working directory now move into the temp
# directory; the KO and Pfam ones only when those options are not 'false'
mv uniq_hmm_names.tmp ${tmp_dir}/uniq_hmm_names.tmp
[ "$ko_targets" == "false" ] || mv uniq_ko_targets.tmp ${tmp_dir}/uniq_ko_targets.tmp
[ "$additional_pfam_targets" == 'false' ] || mv uniq_pfam_targets.tmp ${tmp_dir}/uniq_pfam_targets.tmp

# empty starting point for the list of genomes from all sources, which later
# sections append to
touch ${tmp_dir}/genomes_from_all_sources.tmp

# header row of the SCG-counts table: "assembly_id" followed by the HMM names, all
# tab-separated (newlines become tabs, then the final character is dropped)
scg_header_names=$(cat ${tmp_dir}/uniq_hmm_names.tmp | tr "\n" "\t" | sed 's/.$//')
printf "assembly_id\t%s\n" "${scg_header_names}" > ${output_dir}/SCG_hit_counts.tsv

#############################################################################
########################  KEEPING TRACK OF RUN TIME  ########################
#############################################################################
# zero bash's built-in SECONDS counter (read later to report elapsed run time) and
# note the wall-clock start in 12-hour "HH:MM AM/PM" form
SECONDS=0
start_time="$(date '+%I:%M %p')"


#############################################################################
######  GETTING AND BUILDING NEEDED KO STUFF IF USER WANTS KO SCANNING  #####
#############################################################################
# KO scanning set-up, skipped only when "$ko_targets" is 'false'
if [[ "$ko_targets" != 'false' ]]; then

    # section banner, sent to the terminal and (with color codes stripped) to the run log
    {
        printf "\n ############################################################################## \n"
        printf " ####                     Setting up HMMs for KO targets                   ####\n"
        printf " ############################################################################## \n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    # one results subdirectory per kind of KO output
    for ko_results_subdir in target_KO_profiles KO_hit_seqs iToL_files individual_genome_results; do
        mkdir -p ${output_dir}/KO_search_results/${ko_results_subdir}/
    done

    gtt-parse-kofamscan-targets.sh ${target_KOs} ${output_dir}

fi


#############################################################################
##########  GETTING AND BUILDING ADDITIONAL TARGET PFAM HMM IF SET  #########
#############################################################################
# additional-Pfam-target set-up, skipped only when "$additional_pfam_targets" is 'false'
if [[ "$additional_pfam_targets" != 'false' ]]; then

    # section banner, sent to the terminal and (with color codes stripped) to the run log
    {
        printf "\n ############################################################################## \n"
        printf " ####              Downloading HMMs for additional Pfam targets            ####\n"
        printf " ############################################################################## \n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    # one results subdirectory per kind of Pfam output
    for pfam_results_subdir in target_Pfam_profiles Pfam_hit_seqs iToL_files individual_genome_results info; do
        mkdir -p ${output_dir}/Pfam_search_results/${pfam_results_subdir}/
    done

    gtt-get-additional-pfam-targets.sh ${tmp_dir} ${output_dir}

fi

#############################################################################
#####################  NCBI-DERIVED GENOME PROCESSING  ######################
#############################################################################
# Works through the genomes given as NCBI assembly accessions (only when an accession
# file was provided): collapses GenBank/RefSeq pairs for the same genome, downloads the
# NCBI assembly summary table(s), reports accessions that were not found, calls the
# serial or parallel helper script, and then summarizes which accessions were retained.
if [ -n "$NCBI_acc_file" ]; then


    printf "\n ############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    printf " ####          Working on the genomes provided as NCBI accessions          ####\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    printf " ############################################################################## \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    # storing sorted ncbi accession file
    sort "$NCBI_acc_file" > ${tmp_dir}/sorted_ncbi_accs.tmp

    ## checking if any have the GenBank AND RefSeq for the same genome (e.g.: GCA_ and GCF_ with same following accession numbers)
    sed 's/GC._//' ${tmp_dir}/sorted_ncbi_accs.tmp | sort > ${tmp_dir}/sorted_base_ncbi_accs.tmp
    uniq -d ${tmp_dir}/sorted_base_ncbi_accs.tmp > ${tmp_dir}/dupe_accs.tmp

    ## if there were, removing genbank one, keeping refseq one, and reporting:
    if [ -s ${tmp_dir}/dupe_accs.tmp ]; then
        num_dupe_genomes=$(wc -l ${tmp_dir}/dupe_accs.tmp | sed "s/^ *//" | cut -d " " -f 1)

        # collecting every input accession that belongs to a duplicated pair
        for dupe_acc in $(cat ${tmp_dir}/dupe_accs.tmp)
        do
            grep "$dupe_acc" ${tmp_dir}/sorted_ncbi_accs.tmp
        done > ${output_dir}/Redundant_input_accessions.txt

        printf "     ${YELLOW}******************************* ${NC}NOTICE ${YELLOW}*******************************${NC}  \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "\t$num_dupe_genomes accession(s) redundant - meaning GenBank and RefSeq accessions\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "\twere provided for the same genome. Only RefSeq accession used.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "\t  Reported in \"${output_dir}/run_files/Redundant_input_accessions.txt\".\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "     ${YELLOW}**********************************************************************${NC}  \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

        # input accessions with both members of every redundant pair taken out ...
        comm -23 ${tmp_dir}/sorted_ncbi_accs.tmp <(sort ${output_dir}/Redundant_input_accessions.txt) | sort > ${tmp_dir}/sorted_building_new_input_ncbi_accs.tmp

        # ... then only the RefSeq (GCF_) member of each pair added back in
        cat <(grep "^GCF_" ${output_dir}/Redundant_input_accessions.txt) ${tmp_dir}/sorted_building_new_input_ncbi_accs.tmp | sort > ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp

        sleep 3

    else
        mv ${tmp_dir}/sorted_ncbi_accs.tmp ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp
    fi

    # downloading genbank files and/or refseq assembly summary files as needed

    # ftp is used when "$http_flag" is 'false', https otherwise
    if [ "$http_flag" == 'false' ]; then
        ncbi_genomes_base_url="ftp://ftp.ncbi.nlm.nih.gov/genomes"
    else
        ncbi_genomes_base_url="https://ftp.ncbi.nlm.nih.gov/genomes"
    fi

    if grep -q "^GCA" ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp; then
        printf "\t\t  ${GREEN}Downloading GenBank assembly summaries...${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

        # --fail makes curl return a non-zero status on an HTTP error response, rather
        # than saving the server's error page as if it were the summary table
        curl --fail --connect-timeout 30 --retry 10 "${ncbi_genomes_base_url}/genbank/assembly_summary_genbank.txt" -o ${tmp_dir}/ncbi_assembly_info.tmp || echo "failed" > capture_any_dl_errors.tmp
    fi

    # making sure file downloaded with no errors
    if [ -s capture_any_dl_errors.tmp ]; then
        printf "\n\n  ${RED}Download of NCBI assembly summaries failed :(${NC}\n  Is the internet connection weak?\n\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        rm -rf ${tmp_dir} ${output_dir} capture_any_dl_errors.tmp
        # error exit, so a non-zero status is reported
        exit 1
    else
        rm -rf capture_any_dl_errors.tmp
    fi

    if grep -q "^GCF" ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp; then
        printf "\n\t\t  ${GREEN}Downloading RefSeq assembly summaries...${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

        curl --fail --connect-timeout 30 --retry 10 "${ncbi_genomes_base_url}/refseq/assembly_summary_refseq.txt" -o ${tmp_dir}/ncbi_RS_assembly_info.tmp || echo "failed" > capture_any_dl_errors.tmp
    fi

    # making sure file downloaded with no errors
    if [ -s capture_any_dl_errors.tmp ]; then
        printf "\n\n  ${RED}Download of NCBI assembly summaries failed :(${NC}\n  Is the internet connection weak?\n\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        rm -rf ${tmp_dir} ${output_dir} capture_any_dl_errors.tmp
        # error exit, so a non-zero status is reported
        exit 1
    else
        rm -rf capture_any_dl_errors.tmp
    fi

    # adding the RefSeq table to the combined one only now that its download has been
    # checked (the file only exists if it was downloaded just above)
    if [ -s ${tmp_dir}/ncbi_RS_assembly_info.tmp ]; then
        cat ${tmp_dir}/ncbi_RS_assembly_info.tmp >> ${tmp_dir}/ncbi_assembly_info.tmp
    fi


    ## searching assembly info table for the input accessions (NOTE: will always return latest accession version as of now)
    # to keep track of what was searched and what was downloaded, both are included in the output summary table
    gtt-parse-assembly-summary-file -a ${tmp_dir}/ncbi_assembly_info.tmp -w ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp -o ${tmp_dir}/ncbi_accessions_info.tmp

    ## sorting and saving found input accs
    cut -f 1 ${tmp_dir}/ncbi_accessions_info.tmp | sort > ${tmp_dir}/sorted_got_ncbi_accs.tmp

    # checking if any accs weren't successfully found (including removing any potential whitespace which users may have accidentally left in, eventually should put a check for this at the start):
    comm -23 <(sed 's/ *$//' ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp) <(sed 's/ *$//' ${tmp_dir}/sorted_got_ncbi_accs.tmp) > ${tmp_dir}/not_found_accs.tmp

    # if any not found, removing from current list, and reporting:
    if [ -s ${tmp_dir}/not_found_accs.tmp ]; then

        num_accs_not_found=$(wc -l ${tmp_dir}/not_found_accs.tmp | sed "s/^ *//" | cut -d " " -f 1)
        printf "\n  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}  \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "    $num_accs_not_found accession(s) not successfully found at NCBI.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "    Reported in \"${output_dir}/run_files/NCBI_accessions_not_found.txt\".\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "  ${YELLOW}****************************************************************************${NC}  \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

        sleep 3

        cp ${tmp_dir}/not_found_accs.tmp ${output_dir}/NCBI_accessions_not_found.txt

        # carrying forward only the accessions that were found
        mv ${tmp_dir}/sorted_got_ncbi_accs.tmp ${tmp_dir}/updated_sorted_ncbi_accs.tmp

        NCBI_remaining_genomes_total=$(wc -l ${tmp_dir}/updated_sorted_ncbi_accs.tmp | sed "s/^ *//" | cut -d " " -f 1)

    else
        mv ${tmp_dir}/updated_sorted_input_ncbi_accs.tmp ${tmp_dir}/updated_sorted_ncbi_accs.tmp
        NCBI_remaining_genomes_total=$(wc -l ${tmp_dir}/updated_sorted_ncbi_accs.tmp | sed "s/^ *//" | cut -d " " -f 1)
        printf "\n\n\t        ${GREEN}All $NCBI_remaining_genomes_total input accessions successfully found.${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    fi

    # creating the ncbi-derived-genome output table file which will have the following as its header:
    printf "input_accession\tdownloaded_accession\tassembly_name\ttaxid\torganism_name\tinfraspecific_name\tversion_status\tassembly_level\tnum_SCG_hits\tuniq_SCG_hits\tperc_comp\tperc_redund\n" > ${output_dir}/NCBI_genomes_summary_info.tsv

    ### running in parallel if set, otherwise running in serial ###
    # (the helper scripts take positional arguments, so their argument lists are passed exactly as before)
    if [ "$num_jobs" == "1" ]; then
        gtt-ncbi-serial.sh ${tmp_dir}/ncbi_accessions_info.tmp $tmp_dir $hmm_file $NCBI_remaining_genomes_total $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets $http_flag ${ko_targets} ${target_KOs}
    else
        cat ${tmp_dir}/ncbi_accessions_info.tmp | parallel -j $num_jobs gtt-ncbi-parallel.sh {} $tmp_dir $hmm_file $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets $http_flag ${ko_targets} ${target_KOs}
    fi

    ## checking if prodigal was used to add to citations list being reported at end
    if [ -s ${tmp_dir}/prodigal_used ]; then
        prodigal_used="true"
    fi

    printf "________________________________________________________________________________\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )


    ## reporting any accessions not successfully found or downloaded, and updating files
    printf "\n     ******************************* ${GREEN}UPDATE${NC} *******************************  \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    printf "        Of the input genomes provided by NCBI accession:\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    if [ -f ${output_dir}/Redundant_input_accessions.txt ]; then
        num_dupe_genomes=$(wc -l ${output_dir}/Redundant_input_accessions.txt | sed "s/^ *//" | cut -d " " -f 1)
        # the line count is halved to get the number reported
        num_dupe_report=$(($num_dupe_genomes / 2))
        printf "          ${YELLOW}$num_dupe_report accession(s) redundant.${NC}\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "          Reported in \"${output_dir}/run_files/Redundant_input_accessions.txt\".\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    fi

    if [ -f ${output_dir}/NCBI_accessions_not_found.txt ]; then
        num_accs_not_found=$(wc -l ${output_dir}/NCBI_accessions_not_found.txt | sed "s/^ *//" | cut -d " " -f 1)
        printf "          ${YELLOW}$num_accs_not_found accession(s) not found at NCBI.${NC}\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "          Reported in \"${output_dir}/run_files/NCBI_accessions_not_found.txt\".\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    fi

    if [ -f ${output_dir}/NCBI_accessions_not_downloaded.txt ]; then
        num_accs_not_downloaded=$(wc -l ${output_dir}/NCBI_accessions_not_downloaded.txt | sed "s/^ *//" | cut -d " " -f 1)

        # dropping the accessions that failed to download from the retained list
        comm -23 ${tmp_dir}/updated_sorted_ncbi_accs.tmp <(sort ${output_dir}/NCBI_accessions_not_downloaded.txt) | sort > ${tmp_dir}/updated_sorted_ncbi_accs2.tmp
        mv ${tmp_dir}/updated_sorted_ncbi_accs2.tmp ${tmp_dir}/final_included_NCBI_accs.tmp

        printf "          ${YELLOW}$num_accs_not_downloaded NCBI accession(s) did not download successfully.${NC}\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "          Reported in \"${output_dir}/run_files/NCBI_accessions_not_downloaded.txt\".\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    else
        mv ${tmp_dir}/updated_sorted_ncbi_accs.tmp ${tmp_dir}/final_included_NCBI_accs.tmp
    fi

    num_retained_ncbi_input_genomes=$(wc -l ${tmp_dir}/final_included_NCBI_accs.tmp | sed "s/^ *//" | cut -d " " -f 1)

    # the "x of total" wording is used whenever any accession was reported as
    # redundant, not found, or not downloaded
    if [ -f ${output_dir}/NCBI_accessions_not_found.txt ] || [ -f ${output_dir}/NCBI_accessions_not_downloaded.txt ] || [ -f ${output_dir}/Redundant_input_accessions.txt ]; then

        printf "        $num_retained_ncbi_input_genomes of the total $NCBI_input_genomes_total input accessions had their genomes successfully\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "        downloaded and searched.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    else
        printf "                ${GREEN}All $num_retained_ncbi_input_genomes successfully downloaded and searched :)${NC}\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    fi

    printf "     ********************************************************************** \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    # adding to genomes from all sources file
    cat ${tmp_dir}/final_included_NCBI_accs.tmp >> ${tmp_dir}/genomes_from_all_sources.tmp

fi



#############################################################################
####################  GENBANK-DERIVED GENOME PROCESSING  ####################
#############################################################################
# Works through the genomes given as GenBank files (only when a list file was provided):
# calls the serial or parallel helper script, stops the whole run if that helper left a
# non-empty prodigal "kill" marker file in the temp directory, and otherwise adds the
# processed genomes to the list of genomes from all sources.
if [ -n "$genbank_list_file" ]; then

    # section banner, sent to the terminal and (with color codes stripped) to the run log
    {
        printf "\n ############################################################################## \n"
        printf " ####           Working on the genomes provided as GenBank files           ####\n"
        printf " ############################################################################## \n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    # reporting the current time and how long the run has been going (SECONDS is bash's elapsed-time counter)
    curr_time=$(date +"%I:%M %p")
    duration=$SECONDS

    printf "           It is currently $curr_time; the process started at $start_time.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    printf "               Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    # setting a counter to track how far along we are
    num=0

    # creating the genbank-genome-derived output table which will have the following as its header:
    printf "assembly_name\toriginal_file_name\ttaxid\torganism_name\tstrain\tnum_SCG_hits\tuniq_SCG_hits\tperc_comp\tperc_redund\n" > ${output_dir}/Genbank_genomes_summary_info.tsv

    ### running in parallel if set, otherwise running in serial ###
    # whichever helper runs, the marker file checked afterwards is noted so that one
    # backstop below serves both modes
    if [ $num_jobs == "1" ]; then
        gtt-genbank-serial.sh $genbank_list_file $tmp_dir $hmm_file $genbank_genomes_total $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets ${ko_targets} ${target_KOs}
        genbank_kill_file=${tmp_dir}/kill_genbank_serial.prodigal
    else
        cat $genbank_list_file | parallel -j $num_jobs gtt-genbank-parallel.sh {} $tmp_dir $hmm_file $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets ${ko_targets} ${target_KOs}
        genbank_kill_file=${tmp_dir}/kill_genbank_parallel.prodigal
    fi

    ### kill backstop ###
    # if there was a problem with the genbank genome processing (serial or parallel), killing main program here and reporting
    if [ -s ${genbank_kill_file} ]; then

        # the first line of the marker file is reported as the problem assembly
        problem_assembly=$(head -n 1 ${genbank_kill_file})

        printf "\n\n ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf " ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf " ####${NC}             GToTree is exiting without completing :(                 ${RED}####\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf " ##############################################################################${NC} \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf " ${RED}############################################################################## \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

        printf "  ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC}  \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "    Something went wrong with prodigal trying to call genes on the fasta\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "    file generated from assembly ${GREEN}${problem_assembly}${NC}.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "    GToTree is not sure it should move forward when something odd is going\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "    on like this :(\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

        printf "    It is possible this is just because this assembly is shorter than\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "    100,000 bps. That's unlikely to be sufficient for incorporation\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "    into a phylogenomic tree, but if you'd like to still try with it,\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "    you can run prodigal on it yourself and provide the amino acid\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "    sequences to GToTree. Though it is still likely to get filtered\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "    out due to not having enough of the target genes (set with '-G').\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "  ${RED}**************************************************************************** ${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

        printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

        # removing tmp directory unless debug set
        if [ $debug_flag == 'false' ]; then
            rm -rf $tmp_dir
        fi

        # the run log is kept with the output directory
        mv $gtotree_log ${output_dir}/gtotree-runlog.txt
        exit
    fi

    mv ${tmp_dir}/genbank_genomes_list.tmp ${tmp_dir}/final_included_genbank_genomes.tmp

    # adding retained genomes to genomes from all sources file
    cat ${tmp_dir}/final_included_genbank_genomes.tmp >> ${tmp_dir}/genomes_from_all_sources.tmp

    ## checking if prodigal was used to add to citations list being reported at end
    if [ -s ${tmp_dir}/prodigal_used ]; then
        prodigal_used="true"
    fi

    printf "________________________________________________________________________________\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

fi


#############################################################################
#####################  FASTA-DERIVED GENOME PROCESSING  #####################
#############################################################################
if [ -n "$fasta_files" ]; then

    printf "\n ############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    printf " ####            Working on the genomes provided as fasta files            ####\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    printf " ############################################################################## \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    curr_time=$(date +"%I:%M %p")
    duration=$SECONDS

    printf "           It is currently $curr_time; the process started at $start_time.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    printf "               Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    # setting a counter to track how far along we are
    num=0

    # creating the fasta-genome-derived output table which will have the following as its header:
    printf "Assembly_name\tOriginal_file_name\ttaxid\tnum_SCG_hits\tuniq_SCG_hits\tperc_comp\tperc_redund\n" > ${output_dir}/Fasta_genomes_summary_info.tsv


    ### running in parallel if set, otherwise running in serial ###
    if [ $num_jobs == "1" ]; then
        gtt-fasta-serial.sh $fasta_files $tmp_dir $hmm_file $fasta_genomes_total $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets ${ko_targets} ${target_KOs}

        ### kill backstop ###
        # if there was a problem with the serial fasta genome processing, killing main program here and reporting
        if [ -s ${tmp_dir}/kill_fasta_serial.prodigal ]; then

            problem_assembly=$(head -n 1 ${tmp_dir}/kill_fasta_serial.prodigal)

            printf "\n\n ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf " ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf " ####${NC}             GToTree is exiting without completing :(                 ${RED}####\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf " ##############################################################################${NC} \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf " ${RED}############################################################################## \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

            printf "  ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC}  \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    Something went wrong with prodigal trying to call genes on assembly\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    ${GREEN}${problem_assembly}${NC}. GToTree is not sure it should\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    move forward when something odd is going on like this :(\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

            printf "    It is possible this is just because this assembly is shorter than\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    100,000 bps. That's unlikely to be sufficient for incorporation\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    into a phylogenomic tree, but if you'd like to still try with it,\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    you can run prodigal on it yourself and provide the amino acid\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    sequences to GToTree. Though it is still likely to get filtered\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    out due to not having enough of the target genes (set with '-G').\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "  ${RED}**************************************************************************** ${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

            printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

            # removing tmp directory unless debug set
            if [ $debug_flag == 'false' ]; then
                rm -rf $tmp_dir
            fi

            mv $gtotree_log ${output_dir}/gtotree-runlog.txt
            exit
        fi

    else
        cat $fasta_files | parallel -j $num_jobs gtt-fasta-parallel.sh {} $tmp_dir $hmm_file $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets ${ko_targets} ${target_KOs}

        ### kill backstop ###
        # if there was a problem with the parallel fasta genome processing, killing main program here and reporting
        if [ -s ${tmp_dir}/kill_fasta_parallel.prodigal ]; then

            problem_assembly=$(head -n 1 ${tmp_dir}/kill_fasta_parallel.prodigal)

            printf "\n\n ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf " ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf " ####${NC}             GToTree is exiting without completing :(                 ${RED}####\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf " ##############################################################################${NC} \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf " ${RED}############################################################################## \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

            printf "  ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC}  \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    Something went wrong with prodigal trying to call genes on assembly\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    ${GREEN}${problem_assembly}${NC}. GToTree is not sure it should\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    move forward when something odd is going on like this :(\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

            printf "    It is possible this is just because this assembly is shorter than\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    100,000 bps. That's unlikely to be sufficient for incorporation\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    into a phylogenomic tree, but if you'd like to still try with it,\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    you can run prodigal on it yourself and provide the amino acid\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    sequences to GToTree. Though it is still likely to get filtered\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    out due to not having enough of the target genes (set with '-G').\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "  ${RED}**************************************************************************** ${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

            printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

            # removing tmp directory unless debug set
            if [ $debug_flag == 'false' ]; then
                rm -rf $tmp_dir
            fi

            mv $gtotree_log ${output_dir}/gtotree-runlog.txt
            exit
        fi

    fi

    # adding retained genomes to genomes from all sources file
    cat ${tmp_dir}/fasta_genomes_list.tmp >> ${tmp_dir}/genomes_from_all_sources.tmp

    ## prodigal must have been used with input fastas, setting to add to citations list being reported at end
    prodigal_used="true"


    printf "_______________________________________________________________________________\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

fi


#############################################################################
##################  AMINO-ACID-DERIVED GENOME PROCESSING  ###################
#############################################################################
# Processes genomes supplied as amino-acid fasta files (only when
# $amino_acid_files is non-empty): writes the summary-table header, hands the
# work to the serial or parallel helper script depending on $num_jobs, aborts
# the whole run if the helper left a non-empty "kill" file behind, and
# otherwise appends the retained genomes to the all-sources genome list.
if [ -n "$amino_acid_files" ]; then

    # Each message is sent through a single tee/sed pair: the sed strips the
    # colour escape codes before appending to the run log, and using one writer
    # per message keeps its lines in order in the log.
    {
        printf "\n ############################################################################## \n"
        printf " ####         Working on the genomes provided as amino acid files          ####\n"
        printf " ############################################################################## \n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    curr_time=$(date +"%I:%M %p")
    duration=$SECONDS

    {
        printf "           It is currently $curr_time; the process started at $start_time.\n"
        printf "               Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # setting a counter to track how far along we are
    # (not referenced again within this section)
    num=0

    # creating the amino-acid-genome-derived output table which will have the following as its header:
    printf "Assembly_name\tOriginal_file_name\ttaxid\tnum_SCG_hits\tuniq_SCG_hits\tperc_comp\tperc_redund\n" > "${output_dir}/Amino_acid_genomes_summary_info.tsv"

    ### running in parallel if set, otherwise running in serial ###
    # The helper argument lists are passed exactly as before so the positional
    # layout the helper scripts receive is unchanged. Each mode reports problems
    # through its own kill file, which is checked right after the helper returns.
    if [ "$num_jobs" == "1" ]; then
        gtt-amino-acid-serial.sh $amino_acid_files $tmp_dir $hmm_file $amino_acid_genomes_total $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets ${ko_targets} ${target_KOs}
        aa_kill_file="${tmp_dir}/kill_amino_acid_serial.problem"
    else
        cat $amino_acid_files | parallel -j $num_jobs gtt-amino-acid-parallel.sh {} $tmp_dir $hmm_file $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets ${ko_targets} ${target_KOs}
        aa_kill_file="${tmp_dir}/kill_amino_acid_parallel.problem"
    fi

    ### kill backstop ###
    # if there was a problem with the amino-acid genome processing (serial or
    # parallel), killing main program here and reporting
    if [ -s "$aa_kill_file" ]; then

        # the first line of the kill file is reported as the offending input
        problem_assembly=$(head -n 1 "$aa_kill_file")

        {
            printf "\n\n ${RED}############################################################################## \n"
            printf " ${RED}############################################################################## \n"
            printf " ####${NC}             GToTree is exiting without completing :(                 ${RED}####\n"
            printf " ##############################################################################${NC} \n"
            printf " ${RED}############################################################################## \n\n"

            printf "  ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC}  \n"
            printf "    Something went wrong with amino acid input file ${GREEN}${problem_assembly}${NC}.\n"
            printf "    Is it in standard fasta format? GToTree is not sure it should move forward\n"
            printf "    when something odd is going on like this :(\n"
            printf "  ${RED}**************************************************************************** ${NC}\n\n"

            printf "\nExiting for now.\n\n"
        } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

        # removing tmp directory unless debug set
        if [ "$debug_flag" == 'false' ]; then
            rm -rf "$tmp_dir"
        fi

        mv "$gtotree_log" "${output_dir}/gtotree-runlog.txt"
        exit
    fi

    # adding retained genomes to genomes from all sources file
    cat "${tmp_dir}/amino_acid_genomes_list.tmp" >> "${tmp_dir}/genomes_from_all_sources.tmp"

    printf "_______________________________________________________________________________\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

fi


#############################################################################
#############################  FILTERING GENES  #############################
#############################################################################
# Announces the gene length-filtering stage on the terminal and in the run log
# (colour escape codes are stripped from the log copy), followed by the current
# time and elapsed runtime. Each message goes through a single tee/sed pair so
# its lines are appended to the log in order.
{
    printf "\n ############################################################################## \n"
    printf " ####                     Filtering genes by length                        ####\n"
    printf " ############################################################################## \n\n"

    printf "     Keeping those with lengths within ${mult_len_cut}%% of the median for the gene set.\n\n"
} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

curr_time=$(date +"%I:%M %p")
duration=$SECONDS

{
    printf "           It is currently $curr_time; the process started at $start_time.\n"
    printf "               Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n"
} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

### pass 1: collecting target genes whose hits file is missing or empty ###
for SCG in $(< ${tmp_dir}/uniq_hmm_names.tmp)
do
    [ -s ${tmp_dir}/${SCG}_hits.faa ] || echo "$SCG"
done > ${tmp_dir}/SCG_targets_with_no_retained_seqs.tmp

# gene list minus the ones collected above (comm needs both inputs sorted)
comm -23 \
    <(sort ${tmp_dir}/uniq_hmm_names.tmp) \
    <(sort ${tmp_dir}/SCG_targets_with_no_retained_seqs.tmp) \
    > ${tmp_dir}/updated_genes_list_a.tmp

### pass 2: collecting genes that have fewer than 2 sequences in their hits file ###
for SCG in $(< ${tmp_dir}/updated_genes_list_a.tmp)
do
    # number of fasta header lines in this gene's hits file
    curr_count=$(grep -c ">" ${tmp_dir}/${SCG}_hits.faa)

    if [ ${curr_count} -lt 2 ]; then
        echo "$SCG"
    fi
done > ${tmp_dir}/SCG_targets_with_only_one_seq.tmp

# gene list minus those as well
comm -23 \
    <(sort ${tmp_dir}/updated_genes_list_a.tmp) \
    <(sort ${tmp_dir}/SCG_targets_with_only_one_seq.tmp) \
    > ${tmp_dir}/updated_genes_list.tmp

# one combined list of everything dropped in the two passes, plus how many that was
cat ${tmp_dir}/SCG_targets_with_no_retained_seqs.tmp ${tmp_dir}/SCG_targets_with_only_one_seq.tmp > ${tmp_dir}/removed_SCG_targets.tmp
removed_genes=$(wc -l < ${tmp_dir}/removed_SCG_targets.tmp | tr -d ' ')

# reporting
# If any target genes were dropped, prints a NOTICE listing them (wording
# depends on whether best-hit mode is on), copies the list into the run_files
# directory, and pauses briefly. Each notice goes through a single tee/sed
# pair so its lines are appended to the run log in order, with colour escape
# codes stripped from the log copy.
if [ "$removed_genes" != "0" ]; then

    ## reporting varies depending on if run in best-hit mode (-B) or not
    if [ "$best_hit_mode" != "true" ]; then

        {
            printf "  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}  \n"
            printf "    $removed_genes gene(s) either had no hits in any genome, a hit in only one genome, or\n"
            printf "    only multiple hits per genome... Just so ya know!!\n\n"
            printf "        These included:\n\n"
            sed 's/^/                     /' "${tmp_dir}/removed_SCG_targets.tmp"
            printf "\n    Reported in \"${output_dir}/run_files/Target-genes-not-found-or-retained.txt\".\n\n"
            printf "   If interested, you can figure out which of those scenarios was the cause by\n"
            printf "   checking out \"${output_dir}/SCG_hit_counts.tsv\".\n"
            printf "  ${YELLOW}****************************************************************************${NC}  \n\n"
        } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    else

        {
            printf "  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}  \n"
            printf "    $removed_genes gene(s) had no hits in any genome, or a hit in only one genome... Just so ya know!!\n\n"
            printf "        These included:\n\n"
            sed 's/^/                     /' "${tmp_dir}/removed_SCG_targets.tmp"
            printf "\n    Reported in \"${output_dir}/run_files/Target-genes-not-found-or-retained.txt\".\n"
            printf "  ${YELLOW}****************************************************************************${NC}  \n\n"
        } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    fi

    # same follow-up in either mode: keep a copy of the list, then give the
    # user a moment to see the notice
    cp "${tmp_dir}/removed_SCG_targets.tmp" "${output_dir}/run_files/Target-genes-not-found-or-retained.txt"

    sleep 3

fi


### making sure there are some genes left after removing those with 0 hits in any genome or only multiple hits per genome (this shouldn't be a problem except under odd circumstances like when testing with VERY few genes and/or very few genomes)
# Counts the genes left in updated_genes_list.tmp and, if none remain, prints a
# termination report (the explanation differs between best-hit mode and normal
# mode), removes the tmp directory unless debugging, moves the run log into the
# output directory, and exits.
len_updated_genes_list=$(wc -l < "${tmp_dir}/updated_genes_list.tmp" | tr -d ' ')


if [ "$len_updated_genes_list" == "0" ]; then

    # The banner and footer are the same in both modes; only the explanation in
    # the middle differs. The whole report goes through a single tee/sed pair so
    # its lines are appended to the run log in order, with colour codes stripped.
    {
        printf "\n\n ${RED}############################################################################## \n"
        printf " ${RED}############################################################################## \n"
        printf " ####${NC}             GToTree is exiting without completing :(                 ${RED}####\n"
        printf " ##############################################################################${NC} \n"
        printf " ${RED}############################################################################## \n\n"

        printf "  ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC}  \n"

        ## reporting varies depending on if run in best-hit mode (-B) or not
        if [ "$best_hit_mode" != "true" ]; then

            printf "    After filtering out genes that had either 0 hits in any genome OR only\n"
            printf "    multiple hits, no genes remained. This typically shouldn't happen unless\n"
            printf "    maybe there were very few genes being targeted, or very few genomes. You\n"
            printf "    can consider running GToTree in \"best-hit\" mode (by providing the '-B'\n"
            printf "    flag with no arguments), which will retain genes from genomes with multiple\n"
            printf "    hits - but keep in mind that is less conservative. This is also a good time\n"
            printf "    to make sure nothing weird is going on.\n\n"

            printf "    You can see the number of target-gene hits per genome by checking out \n"
            printf "    \"${output_dir}/SCG_hit_counts.tsv\".\n"

        else

            printf "    After filtering out genes that had 0 hits in any genome, no genes remained.\n"
            printf "    This typically shouldn't happen unless maybe there were very few genes being targeted,\n"
            printf "    or very few genomes being run. If it is not clear what is causing the problem,\n"
            printf "    please feel free to post an issue at the GToTree github:\n"
            printf "    https://github.com/AstrobioMike/GToTree/issues\n"

        fi

        printf "  ${RED}**************************************************************************** ${NC}\n\n"

        printf "\nExiting for now.\n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # removing tmp directory unless debug set
    if [ "$debug_flag" == 'false' ]; then
        rm -rf "$tmp_dir"
    fi

    mv "$gtotree_log" "${output_dir}/gtotree-runlog.txt"

    exit

fi


### filtering out sequences that are too long or too short ###
# running in parallel if set, otherwise running in serial
# Length-filters the hits of every gene in updated_genes_list.tmp. With a
# single job the work is done inline, gene by gene; otherwise each gene is
# handed to gtt-filter-parallel.sh through GNU parallel.
if [ "$num_jobs" == "1" ]; then

    for SCG in $(cat "${tmp_dir}/updated_genes_list.tmp")
    do

        # one tee/sed pair per message keeps its lines in order in the run log
        # (the sed strips colour escape codes from the log copy)
        {
            printf "   --------------------------------------------------------------------------   \n"
            printf "\n    Filtering ${GREEN}$SCG${NC} sequences by length...\n"
        } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

        # per-sequence lengths are taken from the second column of the helper's table
        gtt-count-bases-per-seq -i "${tmp_dir}/${SCG}_hits.faa" -o "${tmp_dir}/${SCG}-num_bps.tmp"
        cut -f 2 "${tmp_dir}/${SCG}-num_bps.tmp" > "${tmp_dir}/${SCG}-lengths.tmp"
        median=$(gtt-get-median.sh "${tmp_dir}/${SCG}-lengths.tmp")

        # accepted window is median +/- (median * len_cutoff), with both bounds
        # rounded to whole numbers before being passed to the filter
        buff=$(echo "$median * $len_cutoff" | bc)
        min_len=$(echo "$median - $buff" | bc)
        min_len_rnd=$(printf "%.0f\n" $min_len)
        max_len=$(echo "$median + $buff" | bc)
        max_len_rnd=$(printf "%.0f\n" $max_len)

        gtt-filter-seqs-by-length -i "${tmp_dir}/${SCG}_hits.faa" -m "$min_len_rnd" -M "$max_len_rnd" -o "${tmp_dir}/${SCG}_hits_filtered.tmp"

    done

else
    cat "${tmp_dir}/updated_genes_list.tmp" | parallel -j $num_jobs gtt-filter-parallel.sh {} $tmp_dir $len_cutoff
fi


### removing genes again if they had all filtered, which really shouldn't happen unless there are very few sequences (like when testing) ###
# collecting genes whose length-filtered file is missing or empty
for SCG in $(< ${tmp_dir}/updated_genes_list.tmp)
do
    [ -s ${tmp_dir}/${SCG}_hits_filtered.tmp ] || echo "$SCG"
done > ${tmp_dir}/genes_to_remove2.tmp

# final gene list = current list minus the ones collected above (comm needs both inputs sorted)
comm -23 \
    <(sort ${tmp_dir}/updated_genes_list.tmp) \
    <(sort ${tmp_dir}/genes_to_remove2.tmp) \
    > ${tmp_dir}/final_genes_list.tmp

### counting what is left after filtering by length; zero should only happen under odd circumstances (e.g., testing with VERY few genes and/or very few genomes)
len_final_genes_list=$(wc -l < ${tmp_dir}/final_genes_list.tmp | tr -d ' ')

# If the length filter left no genes at all, prints a termination report,
# removes the tmp directory unless debugging, moves the run log into the output
# directory, and exits.
if [ "$len_final_genes_list" == "0" ]; then

    # the whole report goes through a single tee/sed pair so its lines are
    # appended to the run log in order, with colour escape codes stripped
    {
        printf "\n\n ${RED}############################################################################## \n"
        printf " ${RED}############################################################################## \n"
        printf " ####${NC}             GToTree is exiting without completing :(                 ${RED}####\n"
        printf " ##############################################################################${NC} \n"
        printf " ${RED}############################################################################## \n\n"

        printf "  ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC}  \n"
        printf "    After filtering genes by length (set with the '-c' flag to $len_cutoff this\n"
        printf "    run), no genes remained. This typically shouldn't happen unless maybe there\n"
        printf "    were very few genes being targeted. You can consider increasing the '-c'\n"
        printf "    cutoff threshold, but this is also a good time to make sure nothing weird\n"
        printf "    is going on.\n"
        printf "  ${RED}**************************************************************************** ${NC}\n\n"

        printf "\nExiting for now.\n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # removing tmp directory unless debug set
    if [ "$debug_flag" == 'false' ]; then
        rm -rf "$tmp_dir"
    fi

    mv "$gtotree_log" "${output_dir}/gtotree-runlog.txt"

    exit
fi


# Reports any genes that lost all of their sequences to the length filter,
# keeps a copy of that list in the output directory, and records how many
# target genes remain for the genome-filtering step.
removed_genes2=$(wc -l < "${tmp_dir}/genes_to_remove2.tmp" | tr -d ' ')

# reporting
if [ "$removed_genes2" != "0" ]; then

    # the whole notice goes through a single tee/sed pair so its lines are
    # appended to the run log in order, with colour escape codes stripped
    {
        printf "     ${YELLOW}******************************* ${NC}NOTICE ${YELLOW}*******************************${NC}  \n"
        printf "\t  $removed_genes2 gene(s) had no hits in any genomes after filtering by length! Just so ya know...\n\n"
        printf "\t    Length-filtered out of existence:\n\n"
        sed 's/^/          /' "${tmp_dir}/genes_to_remove2.tmp"
        printf "\n     Reported in \"${output_dir}/run_files/Genes_with_no_hits_after_length_filter.txt\".\n"
        printf "     ${YELLOW}************************************************************************ ${NC}\n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # NOTE(review): the notice above points at run_files/, but the copy is
    # written to the top of the output directory - presumably it is moved into
    # run_files/ later in the script; confirm, otherwise the message is wrong.
    cp "${tmp_dir}/genes_to_remove2.tmp" "${output_dir}/Genes_with_no_hits_after_length_filter.txt"
fi

# number of target genes still in play after both gene-filtering passes
new_hmm_target_genes_total=$(wc -l < "${tmp_dir}/final_genes_list.tmp" | tr -d ' ')

printf "________________________________________________________________________________ \n\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )


#############################################################################
############################  FILTERING GENOMES  ############################
#############################################################################
### removing genomes with hits to fewer than 50% (by default) of the total SCGs targeted ###
# Announces the genome-filtering stage on the terminal and in the run log
# (colour escape codes are stripped from the log copy), followed by the current
# time and elapsed runtime. Each message goes through a single tee/sed pair so
# its lines are appended to the log in order.
{
    printf "\n ############################################################################## \n"
    printf " ####                Filtering genomes with too few hits                   ####\n"
    printf " ############################################################################## \n\n"

    printf "     Removing those with fewer than ${mult_gen_cut}%% of the total SCGs targeted.\n\n"
} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

curr_time=$(date +"%I:%M %p")
duration=$SECONDS

{
    printf "           It is currently $curr_time; the process started at $start_time.\n"
    printf "               Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n"
} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )


# minimum number of hits a genome must have to be kept:
# gen_cutoff * (number of genes remaining), rounded to a whole number
cutoff=$(bc <<< "${gen_cutoff}*${new_hmm_target_genes_total}")
cutoff=$(printf "%0.f\n" $cutoff)

## a cutoff of 0 (possible when only one gene was found) would let genomes
## with zero hits through, so it is bumped up to 1
case ${cutoff} in
    0) cutoff=1 ;;
esac

# pooling every length-filtered gene file into one fasta
cat ${tmp_dir}/*_hits_filtered.tmp > ${tmp_dir}/all_seqs.tmp

# one count per genome, written in the same order the genomes are listed
for i in $(< ${tmp_dir}/genomes_from_all_sources.tmp)
do
    grep -c -w "^>$i" ${tmp_dir}/all_seqs.tmp
done > ${tmp_dir}/counts.tmp

# genome <tab> count table, sorted by genome
paste ${tmp_dir}/genomes_from_all_sources.tmp ${tmp_dir}/counts.tmp \
    | sort -k 1,1 \
    > ${tmp_dir}/counts_tab.tmp

# rows below the cutoff are saved in full, and their genome IDs alone are kept as a sorted list
awk -F "\t" -v cutoff="$cutoff" '$2 < cutoff' ${tmp_dir}/counts_tab.tmp \
    | tee ${tmp_dir}/Genomes_removed_for_too_few_hits.tmp \
    | cut -f1 \
    | sort \
    > ${tmp_dir}/sorted_genomes_to_remove.tmp

# removed-genomes table for the output directory: header line, then the saved rows
{
    printf "accession\tuniq_SCG_hits\n"
    cat ${tmp_dir}/Genomes_removed_for_too_few_hits.tmp
} > ${output_dir}/Genomes_removed_for_too_few_hits.tsv

# final genome list = all genomes minus the removed ones
comm -23 \
    <(sort ${tmp_dir}/genomes_from_all_sources.tmp) \
    ${tmp_dir}/sorted_genomes_to_remove.tmp \
    > ${tmp_dir}/final_genomes_from_all_sources.tmp

# reporting if any genomes removed for too few hits
# Reports how many genomes were dropped for having too few hits. When none were
# dropped, the header-only removed-genomes table is deleted again. Either way
# the script pauses briefly so the message can be read.
removed_genomes=$(wc -l < "${tmp_dir}/sorted_genomes_to_remove.tmp" | tr -d ' ')

if [ "$removed_genomes" != "0" ]; then

    # the whole notice goes through a single tee/sed pair so its lines are
    # appended to the run log in order, with colour escape codes stripped
    # NOTE(review): the notice points at run_files/, but the table was written
    # to the top of the output directory - presumably it is moved into
    # run_files/ later in the script; confirm.
    {
        printf "  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}  \n"
        printf "    $removed_genomes genome(s) removed from analysis due to having too few hits.\n\n"
        printf "    Reported in \"${output_dir}/run_files/Genomes_removed_for_too_few_hits.tsv\".\n"
        printf "  ${YELLOW}****************************************************************************${NC}  \n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

else
    printf "             ${GREEN}No genomes were removed for having too few hits :)${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # nothing was removed, so the table would only contain its header line
    rm "${output_dir}/Genomes_removed_for_too_few_hits.tsv"
fi

sleep 3

printf "________________________________________________________________________________\n\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )


### check there are at least 3 genomes remaining, and exiting if not ###
# When fewer than 3 remain: prints a termination report, removes the tmp
# directory unless debugging, moves the run log into the output directory,
# and exits.
num_final_genomes_from_all_sources=$(wc -l < "${tmp_dir}/final_genomes_from_all_sources.tmp" | tr -d ' ')

if [ "$num_final_genomes_from_all_sources" -lt 3 ]; then

    # the whole report goes through a single tee/sed pair so its lines are
    # appended to the run log in order, with colour escape codes stripped
    {
        printf "\n\n ${RED}############################################################################## \n"
        printf " ${RED}############################################################################## \n"
        printf " ####${NC}             GToTree is exiting without completing :(                 ${RED}####\n"
        printf " ##############################################################################${NC} \n"
        printf " ${RED}############################################################################## \n\n"

        printf "  ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC}  \n"
        printf "    After filtering out genomes with too few hits to the target genes (set\n"
        printf "    with the '-G' flag), fewer than 3 genomes remained. This typically\n"
        printf "    shouldn't happen unless maybe there were very few genomes at the start,\n"
        printf "    and/or very few target genes being searched. If it is not clear what is\n"
        printf "    causing the problem, please feel free to post an issue at the GToTree github:\n"
        printf "    https://github.com/AstrobioMike/GToTree/issues\n"
        printf "  ${RED}**************************************************************************** ${NC}\n\n"

        printf "\nExiting for now.\n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # removing tmp directory unless debug set
    if [ "$debug_flag" == 'false' ]; then
        rm -rf "$tmp_dir"
    fi

    mv "$gtotree_log" "${output_dir}/gtotree-runlog.txt"

    exit

fi



#############################################################################
#############  ALIGNING, TRIMMING, AND INSERTING GAP-SEQUENCES  #############
#############################################################################
# section banner, written to the terminal and (with color codes stripped) to the run log
{
    printf "\n ############################################################################## \n"
    printf " ####          Aligning, trimming, and inserting gap-sequences             ####\n"
    printf " ############################################################################## \n\n"
} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

# wall-clock time now, and seconds elapsed since this shell started
curr_time=$(date +"%I:%M %p")
duration=$SECONDS

# progress report: current time, start time, and elapsed whole hours / leftover minutes
{
    printf "           It is currently $curr_time; the process started at $start_time.\n"
    printf "               Current process runtime: $(( duration / 3600 )) hours and $(( (duration / 60) % 60 )) minutes.\n"
} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )


### running in parallel if set, otherwise running in serial ###
# When num_jobs is "1", every gene listed in final_genes_list.tmp is handled in this shell, one at a
# time: genomes flagged for removal are dropped from the gene's hits, the hits are aligned (muscle),
# trimmed (trimal), unwrapped, padded with all-gap sequences for genomes lacking the gene, and
# finally reordered so every per-gene alignment lists genomes in the same order.
# Otherwise the gene list is piped to `parallel`, which runs gtt-align-and-trim-parallel.sh per gene
# (that script is not visible here -- presumably it performs the same per-gene steps; confirm there).
if [ $num_jobs == "1" ]; then

    # setting new counter (only used for the "Number X of Y" progress message)
    num=0

    for SCG in $(cat ${tmp_dir}/final_genes_list.tmp)

    do

        num=$((num+1))

        # per-gene progress header, to the terminal and (color codes stripped) to the run log
        printf "\n\n\n   --------------------------------------------------------------------------   \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "\t    Aligning and cleaning gene ${GREEN}$SCG${NC}; Number $num of $new_hmm_target_genes_total.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
        printf "   --------------------------------------------------------------------------   \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

        # removing those genomes that need to be removed based on not having enough hits to the target genes
        # (--inverse: the headers listed in sorted_genomes_to_remove.tmp are the ones left out)
        gtt-parse-fasta-by-headers -i ${tmp_dir}/${SCG}_hits_filtered.tmp -w ${tmp_dir}/sorted_genomes_to_remove.tmp -o ${tmp_dir}/${SCG}_hits_filtered.faa --inverse

        # aligning
        # muscle's -super5 mode is used when there are 1000 or more input genomes, unless
        # override_faster_alignment is something other than 'false'; otherwise the -align mode is used
        if [ $total_input_genomes -ge 1000 ] && [ $override_faster_alignment == 'false' ]; then
            muscle -super5 ${tmp_dir}/${SCG}_hits_filtered.faa -output ${tmp_dir}/aligned.tmp -threads ${num_muscle_threads}
        else
            muscle -align ${tmp_dir}/${SCG}_hits_filtered.faa -output ${tmp_dir}/aligned.tmp -threads ${num_muscle_threads}
        fi

        # trimming
        trimal -in ${tmp_dir}/aligned.tmp -out ${tmp_dir}/trimmed.faa.tmp -automated1

        # removing linewraps:
        # sed drops everything from the first space onward on each line (e.g. text after the ID in headers);
        # awk then joins the sequence lines so that each record is one header line plus one sequence line
        sed 's/ .*$//' ${tmp_dir}/trimmed.faa.tmp | awk '!/^>/ { printf "%s", $0; n="\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' > ${tmp_dir}/formatted.faa.tmp

        ## adding gap-sequences for genomes missing the current gene ##
        # finding here which ones have it
        grep ">" ${tmp_dir}/formatted.faa.tmp | tr -d ">" | sort > ${tmp_dir}/genomes_with_gene.tmp

        # now getting which ones don't have it
        # (comm needs sorted inputs: genomes_with_gene.tmp is sorted just above, and
        #  final_genomes_from_all_sources.tmp is expected to already be sorted)
        comm -23 ${tmp_dir}/final_genomes_from_all_sources.tmp ${tmp_dir}/genomes_with_gene.tmp | sort > ${tmp_dir}/needed_gappers.tmp

        # creating gap-sequences if needed
        if [ -s ${tmp_dir}/needed_gappers.tmp ]; then

            # making a headers file for when making fasta in a few steps:
            sed 's/^/>/' ${tmp_dir}/needed_gappers.tmp > ${tmp_dir}/needed_headers.tmp

            # getting length of the alignment for the current gene:
            # (line 2 is the first sequence of the unwrapped alignment; wc -c counts its bytes, newline included)
            aln_length_tmp=$(sed -n '2p' ${tmp_dir}/formatted.faa.tmp | wc -c | tr -s " " | cut -f2 -d " ")
            # subtracting 1 for newline characters
            aln_length_tmp=$(echo "$aln_length_tmp"-1 | bc)
            # making a string of gaps the length of the alignment for those missing it:
            # ("%0.s-" prints one "-" per argument while printing nothing of the argument itself,
            #  and seq supplies one argument per alignment column)
            gap_seq=$(printf "%0.s-" $(seq 1 1 $aln_length_tmp))
            # making as many gap sequences as there are genomes missing the current gene:
            # NOTE(review): num_genomes_to_add is assigned here but not read again within this section;
            # the loop below iterates over the list itself
            num_genomes_to_add=$(wc -l ${tmp_dir}/needed_gappers.tmp | tr -s " " "\t" | cut -f2)
            for i in $(cat ${tmp_dir}/needed_gappers.tmp)
            do
                echo "$gap_seq"
            done > ${tmp_dir}/gaps.tmp

            # making fasta of those genomes missing the current gene:
            # (paste with a newline delimiter interleaves the files: header, gap line, header, gap line, ...)
            paste -d "\n" ${tmp_dir}/needed_headers.tmp ${tmp_dir}/gaps.tmp > ${tmp_dir}/missing_genomes.faa.tmp
            # catting the genomes missing the current gene together with those that have it
            cat ${tmp_dir}/formatted.faa.tmp ${tmp_dir}/missing_genomes.faa.tmp > ${tmp_dir}/${SCG}.faa.tmp
        else
            # every retained genome has this gene, so the formatted alignment is used as is
            mv ${tmp_dir}/formatted.faa.tmp ${tmp_dir}/${SCG}.faa.tmp
        fi

        ## reordering the final fasta of this gene so that all gene sets can be pasted together at end ##
        gtt-reorder-fasta -i ${tmp_dir}/${SCG}.faa.tmp -w ${tmp_dir}/final_genomes_from_all_sources.tmp -o ${tmp_dir}/${SCG}_all_aligned.faa

    done

else

    # running in parallel
    # same rule as the serial branch for choosing the faster alignment mode, passed on as 'true'/'false'
    if [ $total_input_genomes -ge 1000 ] && [ $override_faster_alignment == 'false' ]; then
        faster_alignment='true'    
    else
        faster_alignment='false'
    fi

    # one job per gene name ({} is replaced by the gene), with at most num_jobs running at once
    cat ${tmp_dir}/final_genes_list.tmp | parallel -j $num_jobs gtt-align-and-trim-parallel.sh {} $tmp_dir $faster_alignment $num_muscle_threads

fi

# section separator, to the terminal and the run log
printf "\n\n\n________________________________________________________________________________\n\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )


#############################################################################
######################  MAKING SUMMARY TABLE WITH ALL  ######################
#############################################################################
# Each entry pairs a per-source summary table in the output directory with the columns taken from it
# ("<table>:<cut column list>"). For every table that exists, those columns (minus the table's first,
# header line) are appended to the table being built, in the order listed here.
for summary_source in \
    "NCBI_genomes_summary_info.tsv:1,4,9,10,11,12" \
    "Genbank_genomes_summary_info.tsv:1,3,6,7,8,9" \
    "Fasta_genomes_summary_info.tsv:1,3,4,5,6,7" \
    "Amino_acid_genomes_summary_info.tsv:1,3,4,5,6,7"
do
    summary_source_tab=${output_dir}/${summary_source%%:*}

    # sources that were not part of this run have no table, and are skipped
    [ -f ${summary_source_tab} ] || continue

    cut -f ${summary_source##*:} ${summary_source_tab} | tail -n +2 >> ${tmp_dir}/building_genomes_tab.tmp
done

## getting current ordered list of all genomes together:
cut -f 1 "${tmp_dir}/building_genomes_tab.tmp" | sort > "${tmp_dir}/sorted_all_output_genome_ids.tmp"

## adding columns stating how many SCG-hits each genome had after length filtering, and whether it made it into the final tree

# helper: for every ID listed in file $1 (one per line, order kept), prints the first row of the
# tab-delimited table $2 whose first column is EXACTLY that ID. IDs are compared as plain strings,
# so an ID such as "g1" can never pick up the row of "g1.v2", and characters like "." are not
# treated as regex wildcards. IDs with no row in the table print nothing.
gtt_first_rows_for_ids() {
    awk 'BEGIN { FS = "\t" }
         FILENAME == ARGV[1] { if (!($1 in first_row)) first_row[$1] = $0; next }
         $0 != "" && ($1 in first_row) { print first_row[$1] }' "$2" "$1"
}

# helper: builds one part of the summary table for the IDs listed in file $1
#   $2 = value for the in_final_tree column ("Yes" or "No")
#   $3 = output: rows pulled from building_genomes_tab.tmp
#   $4 = output: column 2 of the matching rows of counts_tab.tmp
#   $5 = output: one line holding $2 per ID
#   $6 = output: the three files above pasted side by side
# The output files are written fresh (not appended to). paste lines the columns up by position,
# so it relies on every ID having a row in both tables.
gtt_build_genomes_tab_part() {
    local ids_file=$1 in_tree_label=$2 rows_out=$3 counts_out=$4 flag_out=$5 combined_out=$6

    gtt_first_rows_for_ids "$ids_file" "${tmp_dir}/building_genomes_tab.tmp" > "$rows_out"
    gtt_first_rows_for_ids "$ids_file" "${tmp_dir}/counts_tab.tmp" | cut -f 2 > "$counts_out"
    awk -v label="$in_tree_label" '$0 != "" { print label }' "$ids_file" > "$flag_out"

    # rows_out is complete before this runs, so combined_out may safely be building_genomes_tab.tmp itself
    paste "$rows_out" "$counts_out" "$flag_out" > "$combined_out"
}

# an empty/unset count is treated the same as "0" (nothing removed)
if [ "${removed_genomes:-0}" != "0" ]; then

    # first getting which were removed from analysis due to too few hits:
    tail -n +2 "${output_dir}/Genomes_removed_for_too_few_hits.tsv" | cut -f 1 | sort > "${tmp_dir}/sorted_genomes_removed_for_too_few_hits.tmp"

    # getting a list of those that weren't removed
    comm -23 "${tmp_dir}/sorted_all_output_genome_ids.tmp" "${tmp_dir}/sorted_genomes_removed_for_too_few_hits.tmp" | sort > "${tmp_dir}/sorted_genomes_not_removed.tmp"

    ## getting table of those that were removed
    gtt_build_genomes_tab_part "${tmp_dir}/sorted_genomes_removed_for_too_few_hits.tmp" "No" \
        "${tmp_dir}/building_genomes_tab_for_removed_genomes.tmp" \
        "${tmp_dir}/reporting_SCG_counts_after_len_filt_for_genomes_removed.tmp" \
        "${tmp_dir}/reporting_that_not_in_final_tree_column.tmp" \
        "${tmp_dir}/genomes_tab_for_removed_genomes.tmp"

    ## getting table of those that weren't removed
    gtt_build_genomes_tab_part "${tmp_dir}/sorted_genomes_not_removed.tmp" "Yes" \
        "${tmp_dir}/building_genomes_tab_for_retained_genomes.tmp" \
        "${tmp_dir}/reporting_SCG_counts_after_len_filt_for_genomes_retained.tmp" \
        "${tmp_dir}/reporting_that_in_final_tree_column.tmp" \
        "${tmp_dir}/genomes_tab_for_retained_genomes.tmp"

    ## combining both together (retained genomes first, then removed ones)
    cat "${tmp_dir}/genomes_tab_for_retained_genomes.tmp" "${tmp_dir}/genomes_tab_for_removed_genomes.tmp" > "${tmp_dir}/building_genomes_tab.tmp"

else

    # if there are no genomes to remove, then generating the two new columns for all
    gtt_build_genomes_tab_part "${tmp_dir}/sorted_all_output_genome_ids.tmp" "Yes" \
        "${tmp_dir}/building_genomes_tab_for_retained_genomes.tmp" \
        "${tmp_dir}/reporting_SCG_counts_after_len_filt_for_genomes_retained.tmp" \
        "${tmp_dir}/reporting_that_in_final_tree_column.tmp" \
        "${tmp_dir}/building_genomes_tab.tmp"

fi


# putting the column header on top of the built table and writing it to the output directory
{
    printf "assembly_id\ttaxid\tnum_SCG_hits\tuniq_SCG_hits\tperc_comp\tperc_redund\tnum_SCG_hits_after_len_filt\tin_final_tree\n"
    cat ${tmp_dir}/building_genomes_tab.tmp
} > ${output_dir}/Genomes_summary_info.tsv

#############################################################################
############  FORMATTING TABLE FOR THOSE WITH >= 10% REDUNDANCY  ############
#############################################################################

# only written when the tmp table of questionable redundancy estimates exists; it gets a header line too
if [ -f ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp ]; then
    {
        printf "assembly_id\tnum_SCG_hits\tuniq_SCG_hits\tperc_comp\tperc_redund\n"
        cat ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp
    } > ${output_dir}/Genomes_with_questionable_redundancy_estimates.tsv
fi

#############################################################################
#####################  CATTING ALL ALIGNMENTS TOGETHER  #####################
#############################################################################
# section banner, written to the terminal and (with color codes stripped) to the run log
{
    printf "\n ############################################################################## \n"
    printf " ####                  Catting all alignments together                     ####\n"
    printf " ############################################################################## \n\n"
} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

# wall-clock time now, and seconds elapsed since this shell started
curr_time=$(date +"%I:%M %p")
duration=$SECONDS

# progress report: current time, start time, and elapsed whole hours / leftover minutes
{
    printf "           It is currently $curr_time; the process started at $start_time.\n"
    printf "               Current process runtime: $(( duration / 3600 )) hours and $(( (duration / 60) % 60 )) minutes.\n\n"
} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

# this concatenates the alignments and generates the partitions file for mixed model alignments
gtt-cat-alignments -t $tmp_dir -o $output_dir

# number of genomes that made it through the workflow (line count of the final genome list), reported at end
genomes_retained=$(wc -l ${tmp_dir}/final_genomes_from_all_sources.tmp | awk '{ print $1 }')

printf "\n________________________________________________________________________________\n\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )


#############################################################################
##############  ADDING MORE INFORMATIVE HEADERS IF SPECIFIED  ###############
#############################################################################
## making copy of alignment file to preserve original
if [ -n "$new_file_to_genome_id_map" ] || [ $taxonkit_id_swap != "false" ] || [ $gtdb_id_swap != "false" ] ; then

    echo $lineage_spec | tr "," "\n" > ${tmp_dir}/wanted_ranks.tmp

    cp ${output_dir}/Aligned_SCGs.faa ${tmp_dir}/aligned_SCGs_mod_names.tmp

    printf "\n ############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    printf " ####                   Adding more informative headers                    ####\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
    printf " ############################################################################## \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    ## building id swap file starting with user-provided Genome-to-ID mapping file if given
    if [ -n "$new_file_to_genome_id_map" ]; then

        sort -k 1,1 $new_file_to_genome_id_map | tee ${tmp_dir}/sorted_id_swap_map.tmp | cut -f1 > ${tmp_dir}/sorted_target_ids_to_swap.tmp

        ## separating out which ones are to be swapped that are ncbi accessions (because the others need to have their extensions removed)
        if [ -f ${output_dir}/NCBI_genomes_summary_info.tsv ]; then

        for acc in $(cut -f 1 ${output_dir}/NCBI_genomes_summary_info.tsv | tail -n +2)
        do
            grep -w -m1 "$acc" ${tmp_dir}/sorted_id_swap_map.tmp
        done | sort -k 1,1 | tee ${tmp_dir}/ncbi_id_swap_map.tmp | cut -f 1 > ${tmp_dir}/sorted_ncbi_ids_to_swap.tmp

        fi

        ## if there are ncbi accessions to change, getting which are NOT ncbi accessions, and removing extensions
        ## otherwise moving all to non-ncbi-ids-to-swap and removing extensions

        # helper: prints file name $1 with a trailing ".gz" (if any) and then its last extension removed.
        # Only a literal ".gz" at the very end is dropped, so names that merely contain "gz" somewhere
        # (e.g. "bigzoo.fasta") are left intact apart from losing their extension.
        gtt_strip_gz_and_extension() {
            local name=${1%.gz}
            printf '%s\n' "${name%.*}"
        }

        # helper: for each file name listed in file $1 (one per line), pairs the extension-stripped name
        # with columns 2 onward of that file's row in sorted_id_swap_map.tmp, and writes the result,
        # sorted on column 1, to sorted_non_ncbi_swap_map.tmp
        gtt_build_non_ncbi_swap_map() {
            local ids_file=$1 file

            # making sure files being appended to are starting empty
            : > "${tmp_dir}/non_ncbi_swap_map_p1.tmp"
            : > "${tmp_dir}/non_ncbi_swap_map_p2.tmp"

            while IFS= read -r file || [ -n "$file" ]
            do
                [ -n "$file" ] || continue

                gtt_strip_gz_and_extension "$file" >> "${tmp_dir}/non_ncbi_swap_map_p1.tmp"

                # exact, plain-string match on column 1 (first matching row only); the name is handed
                # to awk through the environment so no character in it is interpreted specially
                gtt_target_id="$file" awk 'BEGIN { FS = "\t" } $1 == ENVIRON["gtt_target_id"] { print; exit }' "${tmp_dir}/sorted_id_swap_map.tmp" | cut -f 2- >> "${tmp_dir}/non_ncbi_swap_map_p2.tmp"
            done < "$ids_file"

            paste "${tmp_dir}/non_ncbi_swap_map_p1.tmp" "${tmp_dir}/non_ncbi_swap_map_p2.tmp" | sort -k 1,1 > "${tmp_dir}/sorted_non_ncbi_swap_map.tmp"
        }

        if [ -s ${tmp_dir}/sorted_ncbi_ids_to_swap.tmp ]; then

            comm -23 ${tmp_dir}/sorted_target_ids_to_swap.tmp ${tmp_dir}/sorted_ncbi_ids_to_swap.tmp | sort > ${tmp_dir}/sorted_non_ncbi_ids_to_swap.tmp

            # the non-ncbi map is only built when something other than ncbi accessions is left
            if [ -s ${tmp_dir}/sorted_non_ncbi_ids_to_swap.tmp ]; then
                gtt_build_non_ncbi_swap_map ${tmp_dir}/sorted_non_ncbi_ids_to_swap.tmp
            fi

        else

            # no ncbi accessions among the targets, so all of them are handled as file names
            gtt_build_non_ncbi_swap_map ${tmp_dir}/sorted_target_ids_to_swap.tmp

        fi

        ## updating id_swap_map
        if [ -f ${tmp_dir}/ncbi_id_swap_map.tmp ] && [ -f ${tmp_dir}/sorted_non_ncbi_swap_map.tmp ]; then
            cat ${tmp_dir}/ncbi_id_swap_map.tmp ${tmp_dir}/sorted_non_ncbi_swap_map.tmp > ${tmp_dir}/id_swap_map.tmp
        elif [ -f ${tmp_dir}/ncbi_id_swap_map.tmp ] && [ ! -f ${tmp_dir}/sorted_non_ncbi_swap_map.tmp ]; then
            cp ${tmp_dir}/ncbi_id_swap_map.tmp ${tmp_dir}/id_swap_map.tmp
        elif [ ! -f ${tmp_dir}/ncbi_id_swap_map.tmp ] && [ -f ${tmp_dir}/sorted_non_ncbi_swap_map.tmp ]; then
            cp ${tmp_dir}/sorted_non_ncbi_swap_map.tmp ${tmp_dir}/id_swap_map.tmp
        fi

        ## splitting mapping file into those that have specific labels to swap and those that need appending
        awk ' BEGIN { FS=OFS="\t" } $2 != "" { print $1,$2 } ' ${tmp_dir}/id_swap_map.tmp > ${tmp_dir}/base_label_swap.tmp

        ## now splitting the file down to those that have labels to append (getting to this part after taxonomy)
        awk ' BEGIN { FS=OFS="\t" } $3 != "" { print $1,$3 } ' ${tmp_dir}/id_swap_map.tmp > ${tmp_dir}/base_labels_need_append.tmp

        ## getting file of those that solely want appending
        awk ' BEGIN { FS=OFS="\t" } $3 != "" && $2 == "" { print $1,$3 } ' ${tmp_dir}/id_swap_map.tmp > ${tmp_dir}/base_labels_that_only_need_append.tmp

    fi

    ## now if taxonkit was specified, adding those to swap file too, but only
    ## renaming genomes with a taxid that are NOT given a new name in the user-provided mapping file (those solely appended-to still have lineages added first)
    if [ $taxonkit_id_swap != "false" ]; then

        ### checking there are actually some taxids to work with, if not, then printing message at end that no lineage info added
        taxid_counts=$(cut -f 2 ${output_dir}/Genomes_summary_info.tsv | sed '/^NA$/d' | wc -l | sed 's/^ *//')

        if [ $taxid_counts -gt 1 ]; then 

            ## running taxonkit and adding to summary table ##
            cut -f 2 ${output_dir}/Genomes_summary_info.tsv | tail -n +2 | taxonkit lineage | taxonkit reformat -r NA | cut -f3 | tr ";" "\t" > ${tmp_dir}/lineages.tmp

            cat <(printf "NCBI_domain\tNCBI_phylum\tNCBI_class\tNCBI_order\tNCBI_family\tNCBI_genus\tNCBI_species\n") ${tmp_dir}/lineages.tmp > ${tmp_dir}/lineages2.tmp

            paste ${output_dir}/Genomes_summary_info.tsv ${tmp_dir}/lineages2.tmp > ${output_dir}/All_genomes_summary_info2.tsv
            mv ${output_dir}/All_genomes_summary_info2.tsv ${output_dir}/Genomes_summary_info.tsv

            ## getting genomes that have a taxid
            awk ' BEGIN { FS=OFS="\t" } $2 != "NA" ' ${output_dir}/Genomes_summary_info.tsv | tail -n +2 | cut -f1 > ${tmp_dir}/potential_ids_for_swap.tmp
            ## removing those that the user already provided new labels for, if they did
            if [ -f ${tmp_dir}/base_label_swap.tmp ]; then
                comm -23 <(sort ${tmp_dir}/potential_ids_for_swap.tmp) <(cut -f1 ${tmp_dir}/base_label_swap.tmp | sort) > ${tmp_dir}/ids_for_taxonkit_swap.tmp
            else
                sort ${tmp_dir}/potential_ids_for_swap.tmp > ${tmp_dir}/ids_for_taxonkit_swap.tmp
            fi

            ## creating new labels for them based on lineage info
            # Pulling, for each ID in ids_for_taxonkit_swap.tmp (order kept), the row of the summary table
            # whose first column is exactly that ID, then keeping the ID plus columns 9-15.
            # The IDs are compared as plain strings against column 1 only, so an ID can't match text
            # elsewhere on a line or another genome's longer ID; the rows therefore stay in step with
            # the ID list they are later pasted next to.
            awk 'BEGIN { FS = "\t" }
                 FILENAME == ARGV[1] { if (!($1 in summary_row)) summary_row[$1] = $0; next }
                 $0 != "" && ($1 in summary_row) { print summary_row[$1] }' \
                "${output_dir}/Genomes_summary_info.tsv" "${tmp_dir}/ids_for_taxonkit_swap.tmp" \
                | cut -f 1,9-15 > "${tmp_dir}/working_taxonkit_lineages.tmp"

            ## building new taxonkit-swapped labels based upon user-desired ranks (provided by -L flag)
            # starting from the IDs alone; each wanted rank is then appended with "_" as the joiner
            cut -f1 ${tmp_dir}/working_taxonkit_lineages.tmp > ${tmp_dir}/building_new_ids_for_taxonkit_swap.tmp

            # ranks listed in the column order of working_taxonkit_lineages.tmp (Domain is column 2, Species is column 8)
            lineage_col=1
            for lineage_rank in Domain Phylum Class Order Family Genus Species
            do
                lineage_col=$((lineage_col + 1))

                # ranks that were not asked for are left out of the label
                grep -q "$lineage_rank" ${tmp_dir}/wanted_ranks.tmp || continue

                # cleaning the rank's names for use in labels: spaces, parentheses and slashes become "_"
                # (as do periods, for Species only), doubled and trailing underscores are reduced, and
                # quotes, semicolons and square brackets are dropped
                paste -d "_" ${tmp_dir}/building_new_ids_for_taxonkit_swap.tmp <(cut -f $lineage_col ${tmp_dir}/working_taxonkit_lineages.tmp | if [ "$lineage_rank" == "Species" ]; then tr "." "_"; else cat; fi | tr " ()/" "____" | sed 's/__/_/g' | sed 's/_$//' | tr -d "[';\[\]]") > ${tmp_dir}/building_new_ids_for_taxonkit_swap2.tmp
                mv ${tmp_dir}/building_new_ids_for_taxonkit_swap2.tmp ${tmp_dir}/building_new_ids_for_taxonkit_swap.tmp
            done

            mv ${tmp_dir}/building_new_ids_for_taxonkit_swap.tmp ${tmp_dir}/new_ids_for_taxonkit_swap.tmp

            ## making map for IDs changed with taxonkit
            paste ${tmp_dir}/ids_for_taxonkit_swap.tmp ${tmp_dir}/new_ids_for_taxonkit_swap.tmp > ${tmp_dir}/taxonkit_id_swap_map.tmp

            #### adding on infraspecific names if they exist and if Strain-level ranks desired (specified in -L flag)
            if grep -q "Strain" ${tmp_dir}/wanted_ranks.tmp; then

                # these only exist if NCBI accessions were given or if they were found in provided genbank files, so using the ${output_dir}/NCBI_genomes_summary_info.tsv and ${output_dir}/Genbank_genomes_summary_info.tsv files
                if [ -f ${output_dir}/NCBI_genomes_summary_info.tsv ] || [ -f ${output_dir}/Genbank_genomes_summary_info.tsv ]; then

                    # input-ncbi accessions to add strain info to:
                    if [ -f ${output_dir}/NCBI_genomes_summary_info.tsv ]; then

                        for id in $(cat ${tmp_dir}/ids_for_taxonkit_swap.tmp)
                        do
                            grep -m1 -w "$id" ${output_dir}/NCBI_genomes_summary_info.tsv
                        done > ${tmp_dir}/ncbi_genomes_summary_subset_for_id_change.tmp

                        # getting the ids and strains of those that have infraspecific names
                        grep "strain=" ${tmp_dir}/ncbi_genomes_summary_subset_for_id_change.tmp | cut -f 1,6 | sed 's/strain=//' | tr " " "_" | tr "(" "_" | tr ")" "_" | tr "/" "_" | sed 's/__/_/g' | sed 's/_$//' | tr -d "[';\[\]]" > ${tmp_dir}/strain_info_tab.tmp
              
                    fi

                    # input-genbank labels to add strain info to:
                    if [ -f ${output_dir}/Genbank_genomes_summary_info.tsv ]; then

                        for id in $(cat ${tmp_dir}/ids_for_taxonkit_swap.tmp)
                        do
                            grep -m1 -w "$id" ${output_dir}/Genbank_genomes_summary_info.tsv
                        done > ${tmp_dir}/genbank_genomes_summary_subset_for_id_change.tmp
              
                        # getting the ids and strains of those that have strains IDs:
                        awk ' BEGIN { FS=OFS="\t" } $5 != "NA" { print $1,$5 } ' ${tmp_dir}/genbank_genomes_summary_subset_for_id_change.tmp | tr " " "_" | tr "(" "_" | tr ")" "_" | tr "/" "_" | sed 's/__/_/g' | sed 's/_$//' | tr -d "[';\[\]]" >> ${tmp_dir}/strain_info_tab.tmp

                    fi

                    sort -k 1,1 ${tmp_dir}/strain_info_tab.tmp > ${tmp_dir}/sorted_strain_info_tab.tmp
                    cut -f 1 ${tmp_dir}/sorted_strain_info_tab.tmp > ${tmp_dir}/sorted_ids_with_strain_info.tmp
                    cut -f 2 ${tmp_dir}/sorted_strain_info_tab.tmp > ${tmp_dir}/sorted_strain_info.tmp

                    # finding which IDs don't get this change, so we can separate them
                    comm -23 ${tmp_dir}/ids_for_taxonkit_swap.tmp ${tmp_dir}/sorted_ids_with_strain_info.tmp > ${tmp_dir}/taxonkit_ids_to_not_change.tmp
                    # pulling out the ones we're not changing only, to use them to create a file of those we are changing
                    for i in $(cat ${tmp_dir}/taxonkit_ids_to_not_change.tmp)
                    do
                        grep -w -m1 "$i" ${tmp_dir}/taxonkit_id_swap_map.tmp
                    done > ${tmp_dir}/building_final_taxonkit_id_swap_map.tmp

                    # now pulling out only those we are adding to
                    for i in $(cat ${tmp_dir}/sorted_ids_with_strain_info.tmp)
                    do
                        grep -w -m1 "$i" ${tmp_dir}/taxonkit_id_swap_map.tmp
                    done > ${tmp_dir}/building_final_taxonkit_id_swap_map_to_paste_onto.tmp

                    # now adding strain labels
                    paste -d "_" ${tmp_dir}/building_final_taxonkit_id_swap_map_to_paste_onto.tmp ${tmp_dir}/sorted_strain_info.tmp > ${tmp_dir}/building_final_taxonkit_id_swap_map2.tmp

                    cat ${tmp_dir}/building_final_taxonkit_id_swap_map.tmp ${tmp_dir}/building_final_taxonkit_id_swap_map2.tmp > ${tmp_dir}/final_taxonkit_id_swap_map.tmp

                fi

            else
                cp ${tmp_dir}/taxonkit_id_swap_map.tmp ${tmp_dir}/final_taxonkit_id_swap_map.tmp
            fi

            ## pasting together and adding to the ${tmp_dir}/base_label_swap.tmp if other user-specified IDs were given (with -m flag)
            if [ -f ${tmp_dir}/base_label_swap.tmp ]; then
                cat ${tmp_dir}/base_label_swap.tmp ${tmp_dir}/final_taxonkit_id_swap_map.tmp > ${tmp_dir}/full_id_swap_map.tmp
            else
                cp ${tmp_dir}/final_taxonkit_id_swap_map.tmp ${tmp_dir}/full_id_swap_map.tmp
            fi

        else # this is if there are no taxids to use for taxonkit
            printf "  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    You specified to use Taxonkit to add lineage info (set with the '-t' flag),\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    but there were no taxids available (which typically come from inputs provided\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    as NCBI accessions or GenBank files that might hold that information).\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    In this case no lineage information could be added.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "  ${YELLOW}***************************************************************************** ${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

            # if base label swap map was provided by the user (via the -m flag) then making that the final swap map since no taxonkit lineage info to add
            if [ -f ${tmp_dir}/base_label_swap.tmp ]; then
                cp ${tmp_dir}/base_label_swap.tmp ${tmp_dir}/full_id_swap_map.tmp
            fi
        fi
    
    else
        ## if no taxonkit swapping specified, moving user-provided map to final file if it exists
        if [ -f ${tmp_dir}/base_label_swap.tmp ]; then

            cp ${tmp_dir}/base_label_swap.tmp ${tmp_dir}/full_id_swap_map.tmp

        fi
    fi

    ## now if using gtdb lineage info was requested, adding those to the swap file too (superseding taxonkit if that was also used), and
    ## re-labeling genomes that were NOT given a new name in the user-provided mapping file (those solely appended-to will still have lineages added first)
    if [ $gtdb_id_swap != "false" ]; then

        # ref files were checked for and downloaded if needed above, before main program started

        # getting genomes that have a taxid (possibly found from a genbank file) and if its entry starts like an NCBI accession (which we may be able to find in GTDB)
        awk ' BEGIN { FS=OFS="\t" } $2 != "NA" ' ${output_dir}/Genomes_summary_info.tsv | tail -n +2 | cut -f 1 | grep "^GC._" > ${tmp_dir}/potential_ids_for_gtdb_swap.tmp

        # creating subset gtdb table holding all target accessions (in case the user provided a mapping file that supersedes re-labeling any of these, but still wants the gtdb taxonomy and other information for the entry)
        gtt-parse-gtdb-assembly-summary-file -o ${tmp_dir}/target-gtdb.tsv -f ${tmp_dir}/gtdb_found_accs.tsv -n ${tmp_dir}/gtdb-not-found-accs.tsv -t ${tmp_dir}/target-gtdb-tax.tsv -a ${GTDB_dir}/GTDB-arc-and-bac-metadata.tsv -w ${tmp_dir}/potential_ids_for_gtdb_swap.tmp
        
        ## there are no strain level designations in GTDB, so those will be absent on any labels where GTDB taxonomy was added if requested by the user
            # notifying user here if this is the case
            # only searching the wanted-ranks file when it is present and non-empty ('-s'); testing the path
            # with '-n' only asks whether the path string is non-empty, which is always true
        if [ -s "${tmp_dir}/wanted_ranks.tmp" ]; then
            if grep -q "Strain" "${tmp_dir}/wanted_ranks.tmp"; then
                printf "  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}  \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
                printf "    Strain-level labels were requested in addition to using GTDB taxonomy\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
                printf "    where possible. This is just a note that there will be no strain-level\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
                printf "    labels added for those that had GTDB lineage info added.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
                printf "  ${YELLOW}****************************************************************************${NC}  \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            fi
        fi

        ## reporting to user how many entries were tried and found at GTDB
            # only done when the not-found table produced above has content
        if [ -s ${tmp_dir}/gtdb-not-found-accs.tsv ]; then

            # counting data rows (first line of each table is skipped as the header); the arithmetic expansion
            # also strips any padding that some 'wc' implementations put around the number
            num_entries_found=$(( $(tail -n +2 ${tmp_dir}/gtdb_found_accs.tsv | wc -l) ))
            num_entries_not_found=$(( $(tail -n +2 ${tmp_dir}/gtdb-not-found-accs.tsv | wc -l) ))
            num_entries_searched=$(( num_entries_found + num_entries_not_found ))

            # counts and the output path are passed as printf arguments rather than being placed in the format
            # string, so a '%' or backslash in the user-provided output directory can't garble the message
            printf "\n  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}  \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    %s accession(s) of the searched %s were not successfully found in GTDB.\n\n" "${num_entries_not_found}" "${num_entries_searched}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "    Reported in \"%s/run_files/GTDB_accessions_not_found.tsv\".\n" "${output_dir}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
            printf "  ${YELLOW}****************************************************************************${NC}  \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

            # brief pause so the notice can be seen before output continues
            sleep 3

            # keeping the not-found table with the run files, at the location reported above
            cp ${tmp_dir}/gtdb-not-found-accs.tsv ${output_dir}/run_files/GTDB_accessions_not_found.tsv

        fi

        # recording the IDs that had a GTDB entry found (first column of the found table, header line dropped)
        tail -n +2 ${tmp_dir}/gtdb_found_accs.tsv | cut -f 1 > ${tmp_dir}/found_ids_for_gtdb_swap.tmp

            # these two tables are kept with the run files in the output directory
        cp ${tmp_dir}/target-gtdb.tsv ${output_dir}/run_files/GTDB_genomes_summary_info.tsv
        cp ${tmp_dir}/gtdb_found_accs.tsv ${output_dir}/run_files/GTDB_to_input_accession_map.tsv

        # settling on the (sorted) IDs that will be re-labeled with GTDB info
        if [ ! -f ${tmp_dir}/base_label_swap.tmp ]; then
            # no user-provided labels, so all found IDs are used
            sort ${tmp_dir}/found_ids_for_gtdb_swap.tmp > ${tmp_dir}/ids_for_gtdb_swap.tmp
        else
            # the user provided labels, so leaving out any IDs they already gave a new label to
            comm -23 <(sort ${tmp_dir}/found_ids_for_gtdb_swap.tmp) <(cut -f 1 ${tmp_dir}/base_label_swap.tmp | sort) > ${tmp_dir}/ids_for_gtdb_swap.tmp
        fi

        ## adding GTDB taxonomy columns to the genome summary table
            # the table is rebuilt as: expanded header, then rows with GTDB info, then rows padded with NAs

            # --- rows that have GTDB info ---
            # pulling their current rows from the summary table, in the order of the ID list
        for id in $(< ${tmp_dir}/ids_for_gtdb_swap.tmp)
        do
            grep -w -m 1 ^$id ${output_dir}/Genomes_summary_info.tsv
        done > ${tmp_dir}/gtdb_rows_that_need_tax_info_added.tmp

            # looking up the second column of the accession map for each of those IDs (same order)
        for id in $(< ${tmp_dir}/ids_for_gtdb_swap.tmp)
        do
            grep -w -m 1 ^$id ${output_dir}/run_files/GTDB_to_input_accession_map.tsv
        done | cut -f 2 > ${tmp_dir}/base_ids_for_gtdb_swap.tmp

            # and using those to pull the columns to add (everything after column 1 of the GTDB tax table)
        for id in $(< ${tmp_dir}/base_ids_for_gtdb_swap.tmp)
        do
            grep -w -m 1 ^$id ${tmp_dir}/target-gtdb-tax.tsv
        done | cut -f 2- > ${tmp_dir}/gtdb_tax_to_add_to_rows.tmp

        paste ${tmp_dir}/gtdb_rows_that_need_tax_info_added.tmp ${tmp_dir}/gtdb_tax_to_add_to_rows.tmp > ${tmp_dir}/rows_for_genome_summary_table_with_gtdb_tax_info.tmp

            # --- rows that do not have GTDB info ---
            # these are all IDs in the summary table (header skipped) that are not in the GTDB ID list
        comm -23 <(cut -f 1 ${output_dir}/Genomes_summary_info.tsv | tail -n +2 | sort) <(sort ${tmp_dir}/ids_for_gtdb_swap.tmp) > ${tmp_dir}/ids_that_need_NAs_added_for_gtdb_info.tmp

        for id in $(< ${tmp_dir}/ids_that_need_NAs_added_for_gtdb_info.tmp)
        do
            grep -w -m 1 ^$id ${output_dir}/Genomes_summary_info.tsv
        done > ${tmp_dir}/rows_that_need_NAs_added_for_gtdb_info.tmp

            # one line of 8 NAs per row, matching the 8 GTDB columns named in the header below
        for id in $(< ${tmp_dir}/ids_that_need_NAs_added_for_gtdb_info.tmp)
        do
            printf "NA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\n"
        done > ${tmp_dir}/NAs_for_gtdb_tax_to_add_to_rows.tmp

        paste ${tmp_dir}/rows_that_need_NAs_added_for_gtdb_info.tmp ${tmp_dir}/NAs_for_gtdb_tax_to_add_to_rows.tmp > ${tmp_dir}/rows_for_genome_summary_table_without_gtdb_tax_info.tmp

            # --- expanded header ---
        head -n 1 ${output_dir}/Genomes_summary_info.tsv > ${tmp_dir}/curr_genome_summary_header.tmp
        paste ${tmp_dir}/curr_genome_summary_header.tmp <(printf "GTDB_accession\tGTDB_domain\tGTDB_phylum\tGTDB_class\tGTDB_order\tGTDB_family\tGTDB_genus\tGTDB_species\n") > ${tmp_dir}/new_genome_summary_header.tmp

            # overwriting the summary table happens last, since everything above reads from it
        cat ${tmp_dir}/new_genome_summary_header.tmp ${tmp_dir}/rows_for_genome_summary_table_with_gtdb_tax_info.tmp ${tmp_dir}/rows_for_genome_summary_table_without_gtdb_tax_info.tmp > ${output_dir}/Genomes_summary_info.tsv

        ## creating new labels based on GTDB lineage info
            # for each ID being re-labeled, taking the second column of its entry in the found-accessions table
        for id in $(< ${tmp_dir}/ids_for_gtdb_swap.tmp)
        do
            grep -w -m 1 "$id" ${tmp_dir}/gtdb_found_accs.tsv
        done | cut -f 2 > ${tmp_dir}/gtdb_found_accs_to_pull_lineage_info_for.tmp

            # then pulling columns 1 and 3-9 of the matching row in the GTDB tax table (same order as the IDs)
        for id in $(< ${tmp_dir}/gtdb_found_accs_to_pull_lineage_info_for.tmp)
        do
            grep -w -m 1 "^$id" ${tmp_dir}/target-gtdb-tax.tsv
        done | cut -f 1,3-9 > ${tmp_dir}/working_gtdb_lineages.tmp

        ## building new gtdb-swapped labels based upon user-desired ranks (provided by -L flag)
            # every label starts as column 1 of the working lineage table, and then each requested rank is
            # tacked on with an underscore, going in order from Domain down to Species
        cut -f 1 ${tmp_dir}/working_gtdb_lineages.tmp > ${tmp_dir}/building_new_ids_for_gtdb_swap.tmp

            # each entry is "<rank>:<column holding that rank in the working lineage table>"
        for gtdb_rank_entry in Domain:2 Phylum:3 Class:4 Order:5 Family:6 Genus:7 Species:8
        do
            gtdb_rank_name=${gtdb_rank_entry%%:*}
            gtdb_rank_column=${gtdb_rank_entry##*:}

            # moving on if this rank is not listed in the wanted-ranks file
            if ! grep -q "${gtdb_rank_name}" ${tmp_dir}/wanted_ranks.tmp; then
                continue
            fi

            # cleaning the rank's values before adding them: spaces, parentheses, and slashes become underscores,
            # double underscores are reduced, a trailing underscore is dropped, and quotes/semi-colons/square
            # brackets are removed (for Species only, periods are swapped for underscores first)
            paste -d "_" ${tmp_dir}/building_new_ids_for_gtdb_swap.tmp <(
                cut -f ${gtdb_rank_column} ${tmp_dir}/working_gtdb_lineages.tmp \
                    | if [ ${gtdb_rank_name} == "Species" ]; then tr "." "_"; else cat; fi \
                    | tr " " "_" | tr "(" "_" | tr ")" "_" | tr "/" "_" | sed 's/__/_/g' | sed 's/_$//' | tr -d "[';\[\]]"
            ) > ${tmp_dir}/building_new_ids_for_gtdb_swap2.tmp

            mv ${tmp_dir}/building_new_ids_for_gtdb_swap2.tmp ${tmp_dir}/building_new_ids_for_gtdb_swap.tmp
        done

        mv ${tmp_dir}/building_new_ids_for_gtdb_swap.tmp ${tmp_dir}/new_ids_for_gtdb_swap.tmp

        ## making map for IDs changed with gtdb (original ID, then new GTDB-based label)
        paste ${tmp_dir}/ids_for_gtdb_swap.tmp ${tmp_dir}/new_ids_for_gtdb_swap.tmp > ${tmp_dir}/final_gtdb_id_swap_map.tmp

        ## now putting together the lineage-based swap map, with GTDB labels taking the place of taxonkit ones where both exist
        if [ ! -f ${tmp_dir}/final_taxonkit_id_swap_map.tmp ]; then

            # no taxonkit swap map exists, so the GTDB map is the whole lineage map
            cp ${tmp_dir}/final_gtdb_id_swap_map.tmp ${tmp_dir}/final_lineage_id_swap_map.tmp

        else

            # sorted IDs that have taxonkit info, and sorted IDs that have GTDB info
            cut -f 1 ${tmp_dir}/final_taxonkit_id_swap_map.tmp | sort > ${tmp_dir}/current_taxonkit_ids_to_swap.tmp
            cut -f 1 ${tmp_dir}/final_gtdb_id_swap_map.tmp | sort > ${tmp_dir}/current_gtdb_ids_to_swap.tmp

            # those with taxonkit (NCBI) lineage info but no GTDB info (will happen if the current release of GTDB doesn't include this particular accession)
            comm -23 ${tmp_dir}/current_taxonkit_ids_to_swap.tmp ${tmp_dir}/current_gtdb_ids_to_swap.tmp > ${tmp_dir}/taxonkit_ids_to_keep_because_no_GTDB_info.tmp

            # pulling their current entries out of the full swap map and marking them as coming from NCBI (since both sources will be mixed)
            for id in $(< ${tmp_dir}/taxonkit_ids_to_keep_because_no_GTDB_info.tmp)
            do
                grep -w -m 1 "$id" ${tmp_dir}/full_id_swap_map.tmp
            done | sed 's/$/_NCBI/' > ${tmp_dir}/building_final_lineage_id_swap_map.tmp

            # NCBI-marked entries first, then the GTDB ones
            cat ${tmp_dir}/building_final_lineage_id_swap_map.tmp ${tmp_dir}/final_gtdb_id_swap_map.tmp > ${tmp_dir}/final_lineage_id_swap_map.tmp

        fi

        ## (re)writing the full swap map: user-specified labels (given with the -m flag), if any, go ahead of the lineage-based ones
        if [ ! -f ${tmp_dir}/base_label_swap.tmp ]; then
            cp ${tmp_dir}/final_lineage_id_swap_map.tmp ${tmp_dir}/full_id_swap_map.tmp
        else
            cat ${tmp_dir}/base_label_swap.tmp ${tmp_dir}/final_lineage_id_swap_map.tmp > ${tmp_dir}/full_id_swap_map.tmp
        fi


    fi


    ### producing final_id_swap_map.tmp, appending to labels first if any were requested
    if [ ! -s ${tmp_dir}/base_labels_need_append.tmp ]; then

        # nothing to append: clearing out the (empty or absent) append file, and carrying the full swap map forward if it has content
        rm -f ${tmp_dir}/base_labels_need_append.tmp

        if [ -s ${tmp_dir}/full_id_swap_map.tmp ]; then
            cp ${tmp_dir}/full_id_swap_map.tmp ${tmp_dir}/final_id_swap_map.tmp
        fi

    else

        # sorted table of entries that need appending, and just their ids (column 1):
        sort -k 1 ${tmp_dir}/base_labels_need_append.tmp > ${tmp_dir}/sorted_base_labels_need_append.tmp
        cut -f 1 ${tmp_dir}/sorted_base_labels_need_append.tmp > ${tmp_dir}/sorted_base_ids_need_append.tmp

        # ids in the full swap map that do NOT need appending, so the two sets can be handled separately:
        comm -23 <(cut -f 1 ${tmp_dir}/full_id_swap_map.tmp | sort) ${tmp_dir}/sorted_base_ids_need_append.tmp | sort > ${tmp_dir}/sorted_base_ids_no_append.tmp

        # part 1 of the final swap table, entries taken as they are:
        for non_append_label in $(< ${tmp_dir}/sorted_base_ids_no_append.tmp)
        do
            grep -m 1 -w "^$non_append_label" ${tmp_dir}/full_id_swap_map.tmp
        done > ${tmp_dir}/p1_final_id_swap_map.tmp

        # part 2 of the final swap table, starting from the current entries of those that need appending:
        for append_label in $(< ${tmp_dir}/sorted_base_ids_need_append.tmp)
        do
            grep -m 1 -w "^$append_label" ${tmp_dir}/full_id_swap_map.tmp
        done > ${tmp_dir}/p2a_final_id_swap_map.tmp

        # and joining column 2 of the sorted append table onto them with an underscore
        paste -d "_" ${tmp_dir}/p2a_final_id_swap_map.tmp <(cut -f 2 ${tmp_dir}/sorted_base_labels_need_append.tmp) > ${tmp_dir}/p2_final_id_swap_map.tmp

        # all parts together make the final label swap map:
        cat ${tmp_dir}/p1_final_id_swap_map.tmp ${tmp_dir}/p2_final_id_swap_map.tmp ${tmp_dir}/base_labels_that_only_need_append.tmp > ${tmp_dir}/final_id_swap_map.tmp

    fi

    ## if final_id_swap_map.tmp exists here, moving forward with swap
    ## if it does not exist here (which may happen if Taxonkit or GTDB was specified, but no input sources had any taxids, for example), then not performing id swapping and reporting to user

    if [ -s ${tmp_dir}/final_id_swap_map.tmp ]; then
          ## final safety screen on the new labels (columns 2 and 3 of the swap map): colons, commas, semi-colons, and spaces become underscores, then runs of underscores are collapsed to just 1
        paste <(cut -f 1 ${tmp_dir}/final_id_swap_map.tmp) <(cut -f 2,3 ${tmp_dir}/final_id_swap_map.tmp | tr "[:,; ]" "_" | tr -s "_") > ${tmp_dir}/final_id_swap_map_colonless.tmp

        ### swapping ids in the alignment, using the screened map
        gtt-swap-ids -i ${tmp_dir}/aligned_SCGs_mod_names.tmp -s ${tmp_dir}/final_id_swap_map_colonless.tmp -o ${output_dir}/Aligned_SCGs_mod_names.faa

        ### adding new ids to summary table as column "label"
        # everything here reads the current summary table, which is only overwritten by the last command

        # header: "label" goes in as the second column
        head -n 1 ${output_dir}/Genomes_summary_info.tsv > ${tmp_dir}/old_header.tmp
        paste <(cut -f 1 ${tmp_dir}/old_header.tmp) <(printf "label") <(cut -f 2- ${tmp_dir}/old_header.tmp) > ${tmp_dir}/new_header.tmp

        # ids in the summary table (header skipped) that are not in the swap map
        comm -23 <(cut -f1 ${output_dir}/Genomes_summary_info.tsv | tail -n +2 | sort) <(cut -f1 ${tmp_dir}/final_id_swap_map_colonless.tmp | sort) > ${tmp_dir}/unchanged_ids.tmp

        # changed ones: the swap map line (id, new label) followed by the rest of the genome's current row
        for changed_id in $(cut -f1 ${tmp_dir}/final_id_swap_map_colonless.tmp)
        do
            grep -w -m1 "$changed_id" ${output_dir}/Genomes_summary_info.tsv
        done | cut -f 2- > ${tmp_dir}/building_new_genomes_tab1.tmp

        paste ${tmp_dir}/final_id_swap_map_colonless.tmp ${tmp_dir}/building_new_genomes_tab1.tmp > ${tmp_dir}/building_new_genomes_tab2.tmp

        # unchanged ones: the id is put in front of the genome's full current row, so it also fills the "label" column
        for unchanged_id in $(< ${tmp_dir}/unchanged_ids.tmp)
        do
            grep -w -m1 "$unchanged_id" ${output_dir}/Genomes_summary_info.tsv
        done > ${tmp_dir}/building_new_genomes_tab3.tmp

        paste ${tmp_dir}/unchanged_ids.tmp ${tmp_dir}/building_new_genomes_tab3.tmp > ${tmp_dir}/building_new_genomes_tab4.tmp

        # new summary table: header, changed rows, unchanged rows
        cat ${tmp_dir}/new_header.tmp ${tmp_dir}/building_new_genomes_tab2.tmp ${tmp_dir}/building_new_genomes_tab4.tmp > ${output_dir}/Genomes_summary_info.tsv

        #### adding a "label_source" column to final output genome summary tab (NCBI, GTDB, user_map, user_map_append, input)
            # this first part only collects a sorted id list for each way a label may have been adjusted

            # ids adjusted by the user input map, if any
        if [ -s ${tmp_dir}/base_label_swap.tmp ]; then
            cut -f 1 ${tmp_dir}/base_label_swap.tmp | sort > ${tmp_dir}/user_label_swapped_ids.tmp
        fi

            # ids the user asked to have appended to, if any
        if [ -s ${tmp_dir}/base_labels_need_append.tmp ]; then
            cut -f 1 ${tmp_dir}/base_labels_need_append.tmp | sort > ${tmp_dir}/user_label_appended_ids.tmp
        fi

            # ids adjusted by GTDB, if any (done ahead of taxonkit because GTDB is given precedence below)
        if [ -s ${tmp_dir}/final_gtdb_id_swap_map.tmp ]; then
            cut -f 1 ${tmp_dir}/final_gtdb_id_swap_map.tmp | sort > ${tmp_dir}/gtdb_label_swapped_ids.tmp
        fi

            # ids adjusted by taxonkit/ncbi, if any
        if [ -s ${tmp_dir}/final_taxonkit_id_swap_map.tmp ]; then

            cut -f 1 ${tmp_dir}/final_taxonkit_id_swap_map.tmp | sort > ${tmp_dir}/initial_taxonkit_label_swapped_ids.tmp

            if [ -s ${tmp_dir}/final_gtdb_id_swap_map.tmp ]; then
                    # subtracting the GTDB ones, leaving those that still have the taxonkit label (no GTDB data on that accession)
                comm -23 ${tmp_dir}/initial_taxonkit_label_swapped_ids.tmp ${tmp_dir}/gtdb_label_swapped_ids.tmp | sort > ${tmp_dir}/taxonkit_label_swapped_ids.tmp
            else
                    # no GTDB labels, so the taxonkit list is used as it is
                cp ${tmp_dir}/initial_taxonkit_label_swapped_ids.tmp ${tmp_dir}/taxonkit_label_swapped_ids.tmp
            fi

        fi

        ### now making new column for each specifying the "label_source" (e.g. NCBI, GTDB, user_map, input) and gathering rows for re-making the final output genome summary tab
            # ids are handled in this order: user_map, NCBI, GTDB, and lastly those that were not re-labeled by any of these ("input")
            # if an id is also in the user-appended list (3rd column of the mapping file passed to '-m'), "_and_appended" is added to its source
            # membership in an id list is tested as an exact, whole-line, fixed-string match (grep -x -F), so one id
            # can't match merely because it is part of a longer id (e.g. "bin_1" and "bin_10"), and periods in accessions aren't treated as wildcards
            # a genome's summary row and its label source are always written as a pair (and only when the row was found),
            # which keeps the two files being built here in sync line-for-line
        label_source_appended_ids_file="${tmp_dir}/user_label_appended_ids.tmp"

        for label_source_name in user_map NCBI GTDB
        do

            case "${label_source_name}" in
                user_map) label_source_ids_file="${tmp_dir}/user_label_swapped_ids.tmp" ;;
                NCBI)     label_source_ids_file="${tmp_dir}/taxonkit_label_swapped_ids.tmp" ;;
                GTDB)     label_source_ids_file="${tmp_dir}/gtdb_label_swapped_ids.tmp" ;;
            esac

            # nothing to do for a source that has no ids
            if [ ! -s "${label_source_ids_file}" ]; then
                continue
            fi

            # adding to file holding all types so at end we can use that to find those that aren't re-labeled by any method (and just have their input labels)
            cat "${label_source_ids_file}" >> "${tmp_dir}/all_relabeled_ids.tmp"

            for id in $(cat "${label_source_ids_file}")
            do

                label_source_row=$(grep -w -m 1 "^${id}" "${output_dir}/Genomes_summary_info.tsv")

                # skipping ids with no row in the summary table, as writing a label source without a row would shift every following entry
                if [ -z "${label_source_row}" ]; then
                    continue
                fi

                label_source_value="${label_source_name}"

                # if label also in user-appended list
                if [ -s "${label_source_appended_ids_file}" ] && grep -q -x -F -- "${id}" "${label_source_appended_ids_file}"; then
                    label_source_value="${label_source_name}_and_appended"
                fi

                printf '%s\n' "${label_source_row}" >> "${tmp_dir}/building_new_genome_summary_tab_with_label_source.tmp"
                printf '%s\n' "${label_source_value}" >> "${tmp_dir}/building_label_source_column.tmp"

            done

        done

            # now getting those that were not re-labeled in any way (user-specified directly, or with any tax info - though may have been appended to)
            # getting file of those not in any of the above
        comm -23 <(cut -f 1 ${output_dir}/Genomes_summary_info.tsv | tail -n +2 | sort) <(sort ${tmp_dir}/all_relabeled_ids.tmp) > ${tmp_dir}/all_NON_relabeled_ids.tmp

        for id in $(cat "${tmp_dir}/all_NON_relabeled_ids.tmp")
        do

            # if label also in user-appended list
            if [ -s "${label_source_appended_ids_file}" ] && grep -q -x -F -- "${id}" "${label_source_appended_ids_file}"; then

                # making sure wasn't already found and added from gtdb_label_swapped_ids.tmp
                if [ -s "${tmp_dir}/gtdb_label_swapped_ids.tmp" ] && grep -q -x -F -- "${id}" "${tmp_dir}/gtdb_label_swapped_ids.tmp"; then
                    continue
                fi

                label_source_value="input_and_appended"

            # if label NOT in user-appended list
            else
                label_source_value="input"
            fi

            label_source_row=$(grep -w -m 1 "^${id}" "${output_dir}/Genomes_summary_info.tsv")

            # same pairing rule as above: no row found, nothing written
            if [ -z "${label_source_row}" ]; then
                continue
            fi

            printf '%s\n' "${label_source_row}" >> "${tmp_dir}/building_new_genome_summary_tab_with_label_source.tmp"
            printf '%s\n' "${label_source_value}" >> "${tmp_dir}/building_label_source_column.tmp"

        done

        ## finally adding the "label_source" column to the output summary table, as the third column
            # rows: splitting the gathered rows after column 2 and putting the label source column in between
        cut -f 1,2 ${tmp_dir}/building_new_genome_summary_tab_with_label_source.tmp > ${tmp_dir}/building_new_genome_summary_tab_with_label_source_begin.tmp
        cut -f 3- ${tmp_dir}/building_new_genome_summary_tab_with_label_source.tmp > ${tmp_dir}/building_new_genome_summary_tab_with_label_source_end.tmp

        paste ${tmp_dir}/building_new_genome_summary_tab_with_label_source_begin.tmp ${tmp_dir}/building_label_source_column.tmp ${tmp_dir}/building_new_genome_summary_tab_with_label_source_end.tmp > ${tmp_dir}/new_genome_summary_tab.tmp

            # header: the current one gets split at the same spot, with the new column name placed in between
        head -n 1 ${output_dir}/Genomes_summary_info.tsv > ${tmp_dir}/old_genome_summary_tab_header.tmp
        cut -f 1,2 ${tmp_dir}/old_genome_summary_tab_header.tmp > ${tmp_dir}/new_genome_summary_tab_header_begin.tmp
        cut -f 3- ${tmp_dir}/old_genome_summary_tab_header.tmp > ${tmp_dir}/new_genome_summary_tab_header_end.tmp

        paste ${tmp_dir}/new_genome_summary_tab_header_begin.tmp <(printf "label_source") ${tmp_dir}/new_genome_summary_tab_header_end.tmp > ${tmp_dir}/new_genome_summary_tab_header.tmp

            # header plus rows replaces the summary table
        cat ${tmp_dir}/new_genome_summary_tab_header.tmp ${tmp_dir}/new_genome_summary_tab.tmp > ${output_dir}/Genomes_summary_info.tsv

    else

        # no usable swap map, so letting the user know; the whole notice goes to the terminal and, with color codes stripped, to the log
        {
            printf "  ${YELLOW}********************************** ${NC}NOTICE ${YELLOW}**********************************${NC}\n"
            printf "    You specified to swap labels either with a specific mapping file (passed\n"
            printf "    to the '-m' flag) and/or by specifying to add NCBI or GTDB taxonomy info,\n"
            printf "    but none of these worked out :(\n\n"
            printf "    Genomes from your mapping file may have been filtered out, and there may\n"
            printf "    have been no accessible taxonomy information to retrieve (which typically\n"
            printf "    would come from inputs provided as NCBI accessions or GenBank files that\n"
            printf "    might hold that information).\n\n"

            printf "    In this case no labels could be swapped.\n"
            printf "  ${YELLOW}***************************************************************************** ${NC}\n\n"
        } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

        # removing the alignment copy that was set aside for the id swap
        rm -rf ${tmp_dir}/aligned_SCGs_mod_names.tmp
    fi

    printf "\n________________________________________________________________________________\n\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

fi


#############################################################################
################  COMBINING KO SEARCHING RESULTS IF NEEDED  #################
#############################################################################
if [ ${ko_targets} != 'false' ]; then

    # section banner, sent to the terminal and (with any color codes stripped) to the log
    {
        printf "\n ############################################################################## \n"
        printf " ##########            Combining KO searching results               ###########\n"
        printf " ############################################################################## \n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

    # helper scripts are handed the list of all genomes, the tmp and output directories, and the target KOs
    # (going by their names, the first combines the per-genome results and the second makes iToL files from them)
    gtt-combine-kofamscan-results.sh ${tmp_dir}/genomes_from_all_sources.tmp ${tmp_dir} ${output_dir} ${target_KOs}

    gtt-gen-KO-iToL-files.sh ${tmp_dir} ${output_dir}

fi


#############################################################################
##########  WRITING OUT ADDITIONAL TARGET PFAM RESULTS IF NEEDED  ###########
#############################################################################

# Runs the Pfam iToL helper script unless additional_pfam_targets is 'false'
# (an unset/empty additional_pfam_targets is treated as 'false').
if [ "${additional_pfam_targets:-false}" != 'false' ]; then

    # the banner goes through a single pipeline, so one writer appends all of its lines to the log
    {
        printf "\n ############################################################################## \n"
        printf " ####            Parsing results of additional Pfam searches               ####\n"
        printf " ############################################################################## \n\n"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # paths are quoted so that directories containing spaces reach the helper as single arguments
    gtt-gen-pfam-iToL-files.sh "${tmp_dir}" "${output_dir}"

fi


#############################################################################
#############  CLEARING OUT TMP FILES UNLESS -d FLAG PROVIDED  ##############
#############################################################################
# keeping individual alignment files if -k was provided:
# each <name>_all_aligned.faa in the tmp dir is moved to run_files/individual_alignments/<name>_aln.faa
if [ "${keep_individual_alignments}" == 'true' ]; then
    indiv_aln_dir="${output_dir}/run_files/individual_alignments"

    # -p so an already-existing directory is not an error
    mkdir -p "${indiv_aln_dir}"

    for file in "${tmp_dir}"/*_all_aligned.faa
    do
        # an unmatched glob is left as the literal pattern; skipping it rather than trying to mv a non-existent file
        [ -e "${file}" ] || continue

        aln_name="${file##*/}"
        new_path="${indiv_aln_dir}/${aln_name%_all_aligned.faa}_aln.faa"
        mv "${file}" "${new_path}"
    done

fi

# removing the tmp dir unless the debug flag was set; the non-empty check keeps rm from ever being handed an empty path
if [ "${debug_flag}" == 'false' ] && [ -n "${tmp_dir}" ]; then
    rm -rf -- "${tmp_dir}"
fi


#############################################################################
##############################   MAKING TREE  ###############################
#############################################################################

# Builds the tree with the program named in tree_program (FastTreeMP, FastTree, or IQ-TREE)
# unless align_only is set to something other than 'false'. Any other tree_program value runs nothing.
if [ "${align_only}" == 'false' ]; then

    # the tree is built from the alignment with modified headers when one was written (non-empty file),
    # otherwise from the original alignment; the tree file takes the same base name as its alignment
    if [ -s "${output_dir}/Aligned_SCGs_mod_names.faa" ]; then
        tree_base="${output_dir}/Aligned_SCGs_mod_names"
    else
        tree_base="${output_dir}/Aligned_SCGs"
    fi

    # banner line for the selected program; left empty when the program is not one of the three handled here
    case "${tree_program}" in
        'FastTreeMP') tree_banner=" ####                         Running FastTreeMP                           ####" ;;
        'FastTree')   tree_banner=" ####                          Running FastTree                            ####" ;;
        'IQ-TREE')    tree_banner=" ####                          Running IQ-TREE                            ####" ;;
        *)            tree_banner="" ;;
    esac

    if [ -n "${tree_banner}" ]; then

        curr_time=$(date +"%I:%M %p")
        duration=$SECONDS

        # banner plus time report, printed to the terminal and appended (color codes stripped) to the log
        {
            printf "\n ############################################################################## \n"
            printf "%s\n" "${tree_banner}"
            printf " ############################################################################## \n\n"
            printf "           It is currently %s; the process started at %s.\n" "${curr_time}" "${start_time}"
            printf "               Current process runtime: %s hours and %s minutes.\n\n" "$(( duration / 60 / 60 ))" "$(( (duration / 60) % 60 ))"
        } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

        case "${tree_program}" in

            'FastTreeMP')
                # setting env variable for FastTreeMP
                export OMP_NUM_THREADS="${num_jobs}"

                # tree goes to the .tre file; stderr is shown and also appended to the log
                FastTreeMP "${tree_base}.faa" > "${tree_base}.tre" 2> >(tee -a "${gtotree_log}" >&2)

                fasttree_used="true" # setting to report citations at end
                ;;

            'FastTree')
                # tree goes to the .tre file; stderr is shown and also appended to the log
                FastTree "${tree_base}.faa" > "${tree_base}.tre" 2> >(tee -a "${gtotree_log}" >&2)

                fasttree_used="true" # setting to report citations at end
                ;;

            'IQ-TREE')
                iqtree -s "${tree_base}.faa" -nt "${num_jobs}" -mset WAG,LG -bb 1000 -pre iqtree_out | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

                # iqtree writes its "iqtree_out"-prefixed files to the working directory; collecting them in the output dir
                mkdir -p "${output_dir}/iqtree_out/"
                mv iqtree_out* "${output_dir}/iqtree_out/"

                # the tree copy is named after the alignment it was built from (previously it was always written
                # as Aligned_SCGs_mod_names.tre, which the final reporting step does not look for when headers
                # were not modified)
                cp "${output_dir}/iqtree_out/iqtree_out.treefile" "${tree_base}.tre"

                iqtree_used="true" # setting to report citations at end
                ;;

        esac

    fi

fi


#############################################################################
##########################  ORGANIZING OUTPUT DIR  ##########################
#############################################################################
# Moving whichever per-input-type summary tables exist from the top of the output dir into run_files/
for summary_table in \
    NCBI_genomes_summary_info.tsv \
    Genbank_genomes_summary_info.tsv \
    Fasta_genomes_summary_info.tsv \
    Amino_acid_genomes_summary_info.tsv
do
    if [ -f "${output_dir}/${summary_table}" ]; then
        mv "${output_dir}/${summary_table}" "${output_dir}/run_files/${summary_table}"
    fi
done


#############################################################################
##########################  JOB-FINISHED REPORTING  #########################
#############################################################################
# "Done" banner, printed to the terminal and appended (color codes stripped) to the log
{
    printf "\n\n#################################################################################\n"
    printf "####                                 ${GREEN}Done!!${NC}                                  ####\n"
    printf "#################################################################################\n\n"
} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

# genome report (counts are passed as printf arguments rather than being part of the format string)
if [ "${genomes_retained}" == "${total_input_genomes}" ]; then
    printf "            ${GREEN}All %s input genomes were retained through the workflow!${NC}\n\n" "${total_input_genomes}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
else
    printf "  Overall, %s genomes of the input %s were retained (see notes below).\n\n" "${genomes_retained}" "${total_input_genomes}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
fi

# reporting primary output files
# (paths are passed to printf as arguments, never as part of the format string, so a '%' or
#  backslash in the user-supplied output directory cannot corrupt the message)
output_file_prefix=$(basename "${output_dir}")

# a non-empty alignment with modified headers decides which tree and alignment files are reported below
if [ -s "${output_dir}/Aligned_SCGs_mod_names.faa" ]; then
    headers_modified='true'
else
    headers_modified='false'
fi

if [ "${align_only}" == 'false' ]; then

    # the tree is looked for under the same base name as the alignment being reported
    if [ "${headers_modified}" == 'true' ]; then
        built_tree="${output_dir}/Aligned_SCGs_mod_names.tre"
    else
        built_tree="${output_dir}/Aligned_SCGs.tre"
    fi

    {
        printf "    Tree written to:\n"
        printf "        ${GREEN}%s${NC}\n\n" "${output_dir}/${output_file_prefix}.tre"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # the final tree is named after the output directory
    mv "${built_tree}" "${output_dir}/${output_file_prefix}.tre"

fi

if [ "${headers_modified}" == 'true' ]; then
    {
        printf "    Alignment written to:\n"
        printf "        ${GREEN}%s${NC}\n\n" "${output_dir}/Aligned_SCGs_mod_names.faa"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # the alignment with the original headers is tucked away in run_files/
    mv "${output_dir}/Aligned_SCGs.faa" "${output_dir}/run_files/Aligned_SCGs.faa"
else
    {
        printf "    Alignment written to:\n"
        printf "        ${GREEN}%s${NC}\n\n" "${output_dir}/Aligned_SCGs.faa"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
fi

if [ "${keep_individual_alignments}" == 'true' ]; then
    {
        printf "    Individual protein alignments stored in:\n"
        printf "        ${GREEN}%s${NC}\n\n" "${output_dir}/run_files/individual_alignments/"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
fi

{
    printf "    Main genomes summary table written to:\n"
    printf "        ${GREEN}%s${NC}\n\n" "${output_dir}/Genomes_summary_info.tsv"

    printf "    Summary table with hits per target gene per genome written to:\n"
    printf "        ${GREEN}%s${NC}\n\n" "${output_dir}/SCG_hit_counts.tsv"
} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )


if [ "${ko_targets}" == 'true' ]; then
    {
        printf "    Outputs from KO searching written to:\n"
        printf "        ${GREEN}%s${NC}\n\n" "${output_dir}/KO_search_results/"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
fi

if [ "${additional_pfam_targets}" == 'true' ]; then
    {
        printf "    Outputs from Pfam searching written to:\n"
        printf "        ${GREEN}%s${NC}\n\n" "${output_dir}/Pfam_search_results/"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
fi

{
    printf "    Partitions file (for downstream use with mixed-model treeing) written to:\n"
    printf "        ${GREEN}%s${NC}\n\n" "${output_dir}/run_files/Partitions.txt"
} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
mv "${output_dir}/Partitions.txt" "${output_dir}/run_files/Partitions.txt"

# reporting any problem files/accessions and mentioning run_files/ directory
printf " _______________________________________________________________________________\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

# which file flags unsuccessful target genes depends on if run in best-hit mode (-B) or not
if [ "${best_hit_mode}" != "true" ]; then
    dropped_targets_file="${output_dir}/run_files/Target-genes-not-found-or-retained.txt"
else
    dropped_targets_file="${output_dir}/Target_genes_not_found.txt"
fi

# the "Notes" section is printed when at least one of these files exists; the unsuccessful-targets file
# is part of the check so a run whose only issue is missing target genes still gets that note reported
have_notes='false'
for note_file in \
    "${output_dir}/Redundant_input_accessions.txt" \
    "${output_dir}/NCBI_accessions_not_found.txt" \
    "${output_dir}/NCBI_accessions_not_downloaded.txt" \
    "${output_dir}/Genomes_removed_for_too_few_hits.tsv" \
    "${output_dir}/Genes_with_no_hits_to_any_genomes.txt" \
    "${output_dir}/Genes_with_no_hits_after_length_filter.txt" \
    "${output_dir}/Genomes_with_questionable_redundancy_estimates.tsv" \
    "${dropped_targets_file}"
do
    if [ -f "${note_file}" ]; then
        have_notes='true'
        break
    fi
done

if [ "${have_notes}" == 'true' ]; then

    printf "  Notes:\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

    # each note below is one summary line, plus a copy of the underlying file into run_files/ where applicable

    if [ -f "${output_dir}/Redundant_input_accessions.txt" ]; then
        printf "        %s accession(s) redundant.\n" "${num_dupe_report}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        cp "${output_dir}/Redundant_input_accessions.txt" "${output_dir}/run_files/Redundant_input_accessions.txt"
    fi

    if [ -f "${output_dir}/NCBI_accessions_not_found.txt" ]; then
        # reading from stdin keeps the filename out of wc's output; tr drops any padding wc adds
        num_accs_not_found=$(wc -l < "${output_dir}/NCBI_accessions_not_found.txt" | tr -d '[:space:]')
        printf "        %s accession(s) not successfully found at NCBI.\n" "${num_accs_not_found}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        cp "${output_dir}/NCBI_accessions_not_found.txt" "${output_dir}/run_files/NCBI_accessions_not_found.txt"
    fi

    if [ -f "${output_dir}/NCBI_accessions_not_downloaded.txt" ]; then
        num_accs_not_downloaded=$(wc -l < "${output_dir}/NCBI_accessions_not_downloaded.txt" | tr -d '[:space:]')
        printf "        %s accession(s) did not download properly.\n" "${num_accs_not_downloaded}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        cp "${output_dir}/NCBI_accessions_not_downloaded.txt" "${output_dir}/run_files/NCBI_accessions_not_downloaded.txt"
    fi

    if [ -f "${output_dir}/Genomes_removed_for_too_few_hits.tsv" ]; then
        printf "        %s genome(s) removed due to having too few hits to the targeted SCGs.\n" "${removed_genomes}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        cp "${output_dir}/Genomes_removed_for_too_few_hits.tsv" "${output_dir}/run_files/Genomes_removed_for_too_few_hits.tsv"
    fi

    # reporting of unsuccessful targets depends on if run in best-hit mode (-B) or not
    if [ -f "${dropped_targets_file}" ]; then

        if [ "${best_hit_mode}" != "true" ]; then
            # this file is already in run_files/, so there is nothing to copy
            printf "        %s gene(s) either had no hits or only multiple hits in each genome.\n" "${removed_genes}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        else
            printf "        %s gene(s) had no hits in any genome.\n" "${removed_genes}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
            cp "${output_dir}/Target_genes_not_found.txt" "${output_dir}/run_files/Target_genes_not_found.txt"
        fi

    fi


    if [ -f "${output_dir}/Genes_with_no_hits_after_length_filter.txt" ]; then
        printf "        %s gene(s) had no hits after filtering by length.\n" "${removed_genes2}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        cp "${output_dir}/Genes_with_no_hits_after_length_filter.txt" "${output_dir}/run_files/Genes_with_no_hits_after_length_filter.txt"
    fi

    if [ -f "${output_dir}/Genomes_with_questionable_redundancy_estimates.tsv" ]; then
        # first line of the table is skipped when counting
        num_genomes_high_redund=$(tail -n +2 "${output_dir}/Genomes_with_questionable_redundancy_estimates.tsv" | wc -l | tr -d '[:space:]')
        printf "        %s genome(s) had an estimated redundancy of >= 10%%.\n" "${num_genomes_high_redund}" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )
        cp "${output_dir}/Genomes_with_questionable_redundancy_estimates.tsv" "${output_dir}/run_files/Genomes_with_questionable_redundancy_estimates.tsv"
    fi

    {
        printf "\n    Reported along with additional informative run files in:\n"
        printf "        ${GREEN}%s${NC}\n\n" "${output_dir}/run_files/"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

else

    {
        printf "    Additional informative run files can be found in:\n"
        printf "        ${GREEN}%s${NC}\n" "${output_dir}/run_files/"
    } | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

fi

# Pointing the user to the log file and the citations file; everything goes through one pipeline so a
# single writer appends these lines (color codes stripped) to the log. Paths are passed as printf
# arguments rather than being part of the format string.
{
    printf " _______________________________________________________________________________\n\n"

    # reporting log file output
    printf "    Log file written to:\n"
    printf "        ${GREEN}%s${NC}\n" "${output_dir}/gtotree-runlog.txt"

    printf " _______________________________________________________________________________\n\n"

    ### checking programs used and reporting a citations file ###
    printf "    ${YELLOW}Programs used and their citations have been written to:\n${NC}"
    printf "        ${GREEN}%s${NC}\n\n" "${output_dir}/citations.txt"

    printf " _______________________________________________________________________________\n\n"
} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}" )

# Appending the citations to ${output_dir}/citations.txt: GToTree, HMMER3, Muscle and TrimAl are always
# listed; the remaining entries are added only when their *_used flag is "true". The whole group is
# redirected once; version lookups run inside command substitutions, so only the printf output reaches
# the file. Version strings are passed as printf arguments rather than being part of the format string.
{
    printf "GToTree %s\nLee MD. GToTree: a user-friendly workflow for phylogenomics. Bioinformatics. 2019; (March):1-3. doi.org/10.1093/bioinformatics/btz188\n\n" "${VERSION}"

    hmm_version=$(hmmsearch -h | head -n 2 | tail -n 1 | tr -s " " "\t" | cut -f 3)
    printf "HMMER3 v%s\nEddy SR. Accelerated profile HMM searches. PLoS Comput. Biol. 2011; (7)10. doi.org/10.1371/journal.pcbi.1002195\n\n" "${hmm_version}"

    muscle_version=$(muscle -version | tr -s " " "\t" | cut -f 2 | head -n 1)
    printf "Muscle %s\nEdgar RC. MUSCLE v5 enables improved estimates of phylogenetic tree confidence by ensemble bootstrapping. bioRxiv. 2021.06.20.449169. doi.org/10.1101/2021.06.20.449169\n\n" "${muscle_version}"

    trimal_version=$(trimal --version | grep "trim" | tr -s " " "\t" | cut -f 2)
    printf "TrimAl %s\nGutierrez SC. et al. TrimAl: a Tool for automatic alignment trimming. Bioinformatics. 2009; 25, 1972–1973. doi.org/10.1093/bioinformatics/btp348\n\n" "${trimal_version}"

    # the flags are quoted so an unset flag simply reads as "not used" instead of breaking the test
    if [ "${prodigal_used}" == "true" ]; then
        prodigal_version=$(prodigal -v 2>&1 | grep Prodigal | tr -s " " "\t" | cut -f 2 | tr -d ":" | sed 's/V/v/')
        printf "Prodigal %s\nHyatt, D. et al. Gene and translation initiation site prediction in metagenomic sequences. Bioinformatics. 2010; 28, 2223–2230. doi.org/10.1186/1471-2105-11-119\n\n" "${prodigal_version}"
    fi

    if [ "${taxonkit_used}" == "true" ]; then
        taxonkit_version=$(taxonkit -h | grep Version | tr -s " " "\t" | cut -f 2)
        printf "TaxonKit v%s\nShen W and Ren H. TaxonKit: a practical and efficient NCBI Taxonomy toolkit. Journal of Genetics and Genomics. 2021. doi.org/10.1016/j.jgg.2021.03.006\n\n" "${taxonkit_version}"
    fi

    if [ "${kofamscan_used}" == "true" ]; then
        kofamscan_version=$(exec_annotation -v | cut -f 2 -d " ")
        printf "KOfamScan %s\nAramaki, T et al. KofamKOALA: KEGG Ortholog assignment based on profile HMM and adaptive score threshold. Bioinformatics. 2020. doi.org/10.1093/bioinformatics/btz859\n\n" "${kofamscan_version}"
    fi

    if [ "${pfam_db_used}" == "true" ]; then
        printf "Pfam database\nMistry J et al. Pfam: the protein families database in 2021. Nucleic Acids Research. 2021. doi.org/10.1093/nar/gkaa913\n\n"
    fi

    if [ "${gtdb_used}" == "true" ]; then
        printf "Genome Taxonomy Database (GTDB) %s; %s\nParks DH et al. A complete domain-to-species taxonomy for Bacteria and Archaea. Nat. Biotech. 2020. doi.org/10.1038/s41587-020-0501-8\n\n" "${gtdb_version}" "${gtdb_release_date}"
    fi

    if [ "${fasttree_used}" == "true" ]; then
        fasttree_version=$(fasttree -expert 2>&1 | head -n 1 | tr -s " " "\t" | cut -f 5)
        printf "FastTree 2 v%s\nPrice MN et al. FastTree 2 - approximately maximum-likelihood trees for large alignments. PLoS One. 2010; 5. doi.org/10.1371/journal.pone.0009490\n\n" "${fasttree_version}"
    fi

    if [ "${iqtree_used}" == "true" ]; then
        iqtree_version=$(iqtree -V | head -n 1 | tr -s " " "\t" | cut -f 4)
        printf "IQ-TREE v%s\nNguyen L-T et al. IQ-TREE: a fast and effective stochastic algorithm for estimating maximum likelihood phylogenies. Mol. Biol. Evol. 2015; 32, 268–274. doi.org/10.1093/molbev/msu300\n\n" "${iqtree_version}"
    fi

    if [ "${parallel_used}" == "true" ]; then
        parallel_version=$(parallel --version | head -n 1 | tr -s " " "\t" | cut -f 3)
        printf "GNU Parallel v%s\nTange O. GNU Parallel 2018. doi.org/10.5281/zenodo.1146014\n\n" "${parallel_version}"
    fi

    if [ "${universal_SCGs_used}" == "true" ]; then
        printf "Universal SCG-set\nHug LA et al. A new view of the tree of life. Nat. Microbiol. 2016; 1, 1–6. doi.org/10.1038/NMICROBIOL.2016.48\n\n"
    fi
} >> "${output_dir}/citations.txt"

# Closing message (total runtime and sign-off), then the log is moved into the output directory.
duration=$SECONDS
today=$(date +'%A')

# building the message with printf -v keeps its trailing newlines intact
printf -v closing_msg "\n                                         Total process runtime: %s hours and %s minutes.\n" "$(( duration / 60 / 60 ))" "$(( (duration / 60) % 60 ))"
printf -v signoff_msg "                                                      ${GREEN}Happy %s :)${NC}\n\n" "${today}"
closing_msg="${closing_msg}${signoff_msg}"

printf '%s' "${closing_msg}"

# appended to the log with a plain (synchronous) pipeline rather than a background process substitution,
# so these last lines are written before the log file is moved just below
printf '%s' "${closing_msg}" | sed 's/\x1b\[[0-9;]*m//g' >> "${gtotree_log}"

mv "${gtotree_log}" "${output_dir}/gtotree-runlog.txt"
