###################################################################################################
## Snakefile for the "bit" assembly-based metagenomics processing workflow                       ##
## Version 1.0.3                                                                                 ##
## bit: https://github.com/AstrobioMike/bit                                                      ##
##                                                                                               ##
## If you use this workflow in a publication, please consider citing :)                          ##
##   Lee M. bit: a multipurpose collection of bioinformatics tools. F1000Research 2022, 11:122.  ##
##   https://doi.org/10.12688/f1000research.79530.1                                              ##
###################################################################################################


import os
configfile: "config.yaml"


########################################
############# General Info #############
########################################

"""
See the corresponding 'config.yaml' file for general use information.
Variables that may need to be adjusted should usually be changed there, not here.
"""

########################################
######## Some colors and helpers #######
########################################

tty_colors = {
    'green' : '\033[0;32m%s\033[0m',
    'yellow' : '\033[0;33m%s\033[0m',
    'red' : '\033[0;31m%s\033[0m'
}

def color_text(text, color='green'):
    if sys.stdout.isatty():
        return(tty_colors[color] % text)
    else:
        return(text)


########################################
#### Reading samples file into list ####
########################################

sample_IDs_file = config["sample_info_file"]
sample_ID_list = [line.strip() for line in open(sample_IDs_file)]

# making sure there are all unique names
if len(set(sample_ID_list)) != len(sample_ID_list):

    print(color_text("\n    Not all sample IDs in the " + str(config["sample_info_file"]) + " file are unique :(\n", "yellow"))
    print("    Exiting for now.\n")
    exit(1)

########################################
######## Setting up directories ########
########################################

fastqc_out_dir = config["outputs_dir"] + "fastqc-outputs/"
filtered_reads_dir = config["outputs_dir"] + "filtered-reads/"
assemblies_dir = config["outputs_dir"] + "assemblies/"
genes_dir = config["outputs_dir"] + "predicted-genes/"
annotations_and_tax_dir = config["outputs_dir"] + "annotations-and-taxonomy/"
mapping_dir = config["outputs_dir"] + "read-mapping/"
combined_outputs_dir = config["outputs_dir"] + "combined-outputs/"
bins_dir = config["outputs_dir"] + "bins/"
MAGs_dir = config["outputs_dir"] + "MAGs/"
logs_dir = "logs/"
benchmarks_dir = "benchmarks/"

if config["perform_binning_and_MAG_recovery"] == "TRUE":

    dirs_to_create = [config["outputs_dir"], fastqc_out_dir, filtered_reads_dir,
                        genes_dir, annotations_and_tax_dir, mapping_dir,
                        combined_outputs_dir, bins_dir, MAGs_dir,
                        logs_dir, config["REF_DB_ROOT_DIR"], benchmarks_dir]

else:

    dirs_to_create = [config["outputs_dir"], fastqc_out_dir, filtered_reads_dir,
                        genes_dir, annotations_and_tax_dir, mapping_dir,
                        combined_outputs_dir, logs_dir, config["REF_DB_ROOT_DIR"], 
                        benchmarks_dir]


for dir in dirs_to_create:
    try:
        os.mkdir(dir)
    except:
        pass

########################################
############# Rules start ##############
########################################

if config["perform_binning_and_MAG_recovery"] == "TRUE":

    rule all:
        input:
            combined_outputs_dir + config["additional_filename_prefix"] + f"Combined-gene-level-KO-function-coverages.tsv",
            combined_outputs_dir + config["additional_filename_prefix"] + f"Combined-gene-level-taxonomy-coverages.tsv",
            combined_outputs_dir + config["additional_filename_prefix"] + f"Combined-contig-level-taxonomy-coverages.tsv",
            MAGs_dir + config["additional_filename_prefix"] + f"MAGs-overview.tsv",
            MAGs_dir + config["additional_filename_prefix"] + f"MAG-KEGG-Decoder-out.tsv",
            bins_dir + config["additional_filename_prefix"] + f"bins-overview.tsv",
            assemblies_dir + config["additional_filename_prefix"] + f"assembly-summaries.tsv",
            config["outputs_dir"] + config["additional_filename_prefix"] + f"processing-overview.tsv",
            fastqc_out_dir + config["additional_filename_prefix"] + f"raw_multiqc_report.zip",
            fastqc_out_dir + config["additional_filename_prefix"] + f"filtered_multiqc_report.zip"
        shell:
            """
            bash scripts/combine-benchmarks.sh
            """

else:

    rule all:
        input:
            combined_outputs_dir + config["additional_filename_prefix"] + f"Combined-gene-level-KO-function-coverages.tsv",
            combined_outputs_dir + config["additional_filename_prefix"] + f"Combined-gene-level-taxonomy-coverages.tsv",
            combined_outputs_dir + config["additional_filename_prefix"] + f"Combined-contig-level-taxonomy-coverages.tsv",
            assemblies_dir + config["additional_filename_prefix"] + f"assembly-summaries.tsv",
            fastqc_out_dir + config["additional_filename_prefix"] + f"raw_multiqc_report.zip",
            fastqc_out_dir + config["additional_filename_prefix"] + f"filtered_multiqc_report.zip"
        shell:
            """
            bash scripts/combine-benchmarks.sh
            """

rule summarize_MAG_KO_annots_with_KEGG_Decoder:
    conda:
        "envs/keggdecoder.yaml"
    input:
        MAGs_dir + config["additional_filename_prefix"] + f"MAG-level-KO-annotations.tsv"
    params:
        MAGs_dir = MAGs_dir,
        mod_input_annotations = MAGs_dir + "mod-MAG-level-KO-annotations.tmp",
        temp_output = MAGs_dir + "MAG-KEGG-Decoder-out.tmp",
        mapping_file = MAGs_dir + "MAG-ID-map.tmp",
        orig_html_output = MAGs_dir + "MAG-KEGG-Decoder-out.html",
        final_html_output = MAGs_dir + config["additional_filename_prefix"] + f"MAG-KEGG-Decoder-out.html"
    output:
        MAGs_dir + config["additional_filename_prefix"] + f"MAG-KEGG-Decoder-out.tsv"
    benchmark:
        benchmarks_dir + "summarize_MAG_KO_annots_with_KEGG_Decoder-benchmarks.tsv"
    shell:
        """
        # getting number of MAGs recovered
        num_mags_recovered=$(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//')
        # only running if any MAGs were recovered
        if [ ${{num_mags_recovered}} -gt 0 ]; then

            # KEGGDecoder splits on the first underscore to identify unique genome/MAG IDs
                # this can be problematic with how things are named, so we are swapping them all to not have
                # any "_" first, then afterwards we are changing the output table back to the original names so 
                # they match elsewhere (they will still be slightly different in the html output, but that is
                # only manually explored anyway)

            # making version of input for KEGGDecoder with no underscores
            tr "_" "-" < {input} > {params.mod_input_annotations}

            # making mapping file
            paste <( cut -f 1 {input} ) <( cut -f 1 {params.mod_input_annotations} ) > {params.mapping_file}


            # running KEGGDecoder
                # can only create html output if there are more than 1
            if [ ${{num_mags_recovered}} -gt 1 ]; then
                KEGG-decoder -v interactive -i {params.mod_input_annotations} -o {params.temp_output}
                ## adding additional prefix to html output if there is one
                if [ {params.orig_html_output} != {params.final_html_output} ]; then
                    mv {params.orig_html_output} {params.final_html_output}
                fi
            else
                KEGG-decoder -i {params.mod_input_annotations} -o {params.temp_output}
            fi


            # swapping MAG IDs back in output tsv from KEGGDecoder
            python scripts/swap-MAG-IDs.py -i {params.temp_output} -m {params.mapping_file} -o {output}

            # removing intermediate files
            rm {params.mod_input_annotations} {params.mapping_file} {params.temp_output}

        else

            printf "There were no MAGs recovered.\n" > {output}

        fi
        """


rule summarize_MAG_level_KO_annotations:
    input:
        MAG_overview = MAGs_dir + config["additional_filename_prefix"] + f"MAGs-overview.tsv",
        trigger = expand(annotations_and_tax_dir + "{ID}-gene-coverage-annotation-and-tax.tsv", ID = sample_ID_list)
    params:
        MAGs_dir = MAGs_dir,
        annot_and_tax_dir = annotations_and_tax_dir,
        tmp_contig_IDs = "curr-contig-ids.tmp"
    output:
        MAGs_dir + config["additional_filename_prefix"] + f"MAG-level-KO-annotations.tsv"
    benchmark:
        benchmarks_dir + "summarize_MAG_level_KO_annotations-benchmarks.tsv"
    shell:
        """
        # only running if any MAGs were recovered
        if [ $(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then

            for MAG in $(cut -f 1 {input.MAG_overview} | tail -n +2)
            do

                sample_ID=$(echo $MAG | sed 's/-MAG-[0-9]*$//')
                grep "^>" {params.MAGs_dir}${{MAG}}.fasta | tr -d ">" > {params.tmp_contig_IDs}

                python scripts/parse-MAG-annots.py -i {params.annot_and_tax_dir}${{sample_ID}}-gene-coverage-annotation-and-tax.tsv -w {params.tmp_contig_IDs} -M ${{MAG}} -o {output}
                rm {params.tmp_contig_IDs}

            done

        else

            printf "There were no MAGs recovered.\n" > {output}

        fi
        """


rule generate_assembly_processing_overview_table:
    input:
        trigger = MAGs_dir + config["additional_filename_prefix"] + f"MAGs-overview.tsv"
    params:
        sample_IDs_file = config["sample_info_file"],
        assemblies_dir = assemblies_dir,
        genes_dir = genes_dir,
        mapping_dir = mapping_dir,
        bins_dir = bins_dir,
        MAGs_dir = MAGs_dir,
        assembly_summaries = bins_dir + config["additional_filename_prefix"] + "bin-assembly-summaries.tsv",
        checkm_results = bins_dir + config["additional_filename_prefix"] + "bins-checkm-out.tsv"
    output:
        config["outputs_dir"] + config["additional_filename_prefix"] + f"processing-overview.tsv"
    benchmark:
        benchmarks_dir + "generate_assembly_processing_overview_table-benchmarks.tsv"
    shell:
        """
        bash scripts/generate-assembly-based-overview-table.sh {params.sample_IDs_file} {params.assemblies_dir} {params.genes_dir} {params.mapping_dir} {params.bins_dir} {params.MAGs_dir} {output}
        # removing intermediate files from assembly-based process
        rm -rf {params.assembly_summaries} {params.checkm_results}
        """


rule generate_MAGs_overview_table:
    input:
        assembly_summaries = MAGs_dir + config["additional_filename_prefix"] + "MAG-assembly-summaries.tsv",
        checkm_results = MAGs_dir + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv",
        gtdb_done_trigger = MAGs_dir + "gtdbtk-out"
    params:
        gtdb_results = MAGs_dir + "gtdbtk-out/gtdbtk.*.summary.tsv",
        checkm_tmp = MAGs_dir + "checkm-estimates.tmp",
        gtdb_tmp = MAGs_dir + "gtdb-taxonomies.tmp",
        checkm_w_header_tmp = MAGs_dir + "checkm-estimates-with-headers.tmp",
        gtdb_w_header_tmp = MAGs_dir + "gtdb-taxonomies-with-headers.tmp",
        overview_tmp = MAGs_dir + "MAGs-overview.tmp",
        overview_header_tmp = MAGs_dir + "MAGs-overview-header.tmp",
        overview_sorted_tmp = MAGs_dir + "MAGs-overview-sorted.tmp",
        MAGs_dir = MAGs_dir
    output:
        MAGs_dir + config["additional_filename_prefix"] + f"MAGs-overview.tsv"
    benchmark:
        benchmarks_dir + "generate_MAGs_overview_table-benchmarks.tsv"
    shell:
        """
        # only running if any MAGs were recovered
        if [ $(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then

            # making sure none of the intermediate files exist already
            rm -rf {params.checkm_tmp} {params.gtdb_tmp} {params.checkm_w_header_tmp} {params.gtdb_w_header_tmp} {params.overview_tmp} {params.overview_header_tmp} {params.overview_sorted_tmp}

            for MAG in $(cut -f 1 {input.assembly_summaries} | tail -n +2)
            do

                grep -w -m 1 "^${{MAG}}" {input.checkm_results} | cut -f 2,3 >> {params.checkm_tmp}
                grep -w "^${{MAG}}" {params.gtdb_results} | cut -f 2 | sed 's/^.__//' | sed $'s/;.__/\t/g' | awk -F $'\\t' ' BEGIN {{ OFS=FS }} {{ for (i=1; i<=NF; i++) if ( $i ~/^ *$/) $i = "NA" }}; 1 ' >> {params.gtdb_tmp}

            done

            # adding headers
            cat <(printf "est. completeness\\test. redundancy\\n") {params.checkm_tmp} > {params.checkm_w_header_tmp}
            cat <(printf "domain\\tphylum\\tclass\\torder\\tfamily\\tgenus\\tspecies\\n") {params.gtdb_tmp} > {params.gtdb_w_header_tmp}

            paste {input.assembly_summaries} {params.checkm_w_header_tmp} {params.gtdb_w_header_tmp} > {params.overview_tmp}

            # ordering by taxonomy
            head -n 1 {params.overview_tmp} > {params.overview_header_tmp}
            tail -n +2 {params.overview_tmp} | sort -t $'\\t' -k 13,19 > {params.overview_sorted_tmp}

            cat {params.overview_header_tmp} {params.overview_sorted_tmp} > {output}

            rm -rf {params.checkm_tmp} {params.gtdb_tmp} {params.checkm_w_header_tmp} {params.gtdb_w_header_tmp} {params.overview_tmp} {params.overview_header_tmp} {params.overview_sorted_tmp} {input}

        else

            rm -rf {params.MAGs_dir}*

            printf "There were no MAGs recovered.\n" > {output}

        fi
        """


rule summarize_MAG_assemblies:
   """ summarize MAG assemblies """

    conda:
        "envs/bit.yaml"
    input:
        trigger = MAGs_dir + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv"
    params:
        intermediate_file = MAGs_dir + "MAG-summaries.tmp",
        MAGs_dir = MAGs_dir
    output:
        MAGs_dir + config["additional_filename_prefix"] + "MAG-assembly-summaries.tsv"
    benchmark:
        benchmarks_dir + "summarize_MAG_assemblies-benchmarks.tsv"
    shell:
        """
        # only running if any MAGs were recovered
        if [ $(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then

            bit-summarize-assembly {params.MAGs_dir}*.fasta -o {params.intermediate_file} -t

            # slimming down the output
            cut -f 1,2,3,5,6,8,11,18,19,20 {params.intermediate_file} > {output}
            rm {params.intermediate_file}

        else

            printf "There were no MAGs recovered.\n" > {output}

        fi
        """


rule gtdbtk_on_MAGs:
    """ assign taxonomy to MAGs with gtdb-tk """

    conda:
        "envs/gtdb-tk.yaml"
    input:
        trigger = MAGs_dir + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv",
        gtdbtk_db_trigger = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"] + "/" + config["GTDB_TRIGGER_FILE"]
    params:
        MAGs_dir = MAGs_dir,
        gtdbtk_db_dir = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"],
        pplacer_cpus = config["gtdb_tk_pplacer_cpus"],
        gtdb_tk_scratch_location = config["gtdb_tk_scratch_location"],
        gtdb_mash_db = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"] + "/" + config["GTDB_MASH_REF_PATH"]
    output:
        directory(MAGs_dir + "gtdbtk-out")
    resources:
        cpus = config["gtdb_tk_num_cpus"],
        mem_mb = config["gtdbtk_memory_resources"]
    log:
        logs_dir + "gtdbtk-run.log"
    benchmark:
        benchmarks_dir + "run_gtdbtk_on_MAGs-with-1-pplacer-cpu-benchmarks.tsv"
    shell:
        """
        # setting db dir variable if not set in the environment for some reason
        # snakemake uses bash strict mode, so need this so it won't fail if the variable isn't set
        set +u
        if [ -z "${{GTDBTK_DATA_PATH}}" ] || [ "${{GTDBTK_DATA_PATH}}" != "{params.gtdbtk_db_dir}" ]; then

            export GTDBTK_DATA_PATH={params.gtdbtk_db_dir}

        fi
        set -u

        # only running if any MAGs were recovered
        if [ $(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then

            if [ "{params.gtdb_tk_scratch_location}" != "" ]; then

                gtdbtk classify_wf --scratch_dir {params.gtdb_tk_scratch_location} --genome_dir {params.MAGs_dir} -x fasta --out_dir {output} --cpus {resources.cpus} --pplacer_cpus {params.pplacer_cpus} --mash_db {params.gtdb_mash_db}  > {log} 2>&1

            else

                gtdbtk classify_wf --genome_dir {params.MAGs_dir} -x fasta --out_dir {output} --cpus {resources.cpus} --pplacer_cpus {params.pplacer_cpus} --mash_db {params.gtdb_mash_db} > {log} 2>&1

            fi

        else

            mkdir -p {output}
            printf "There were no MAGs recovered.\n" > {params.MAGs_dir}No-MAGs-recovered.txt
            printf "\n\nThere were no MAGs recovered, so GTDB-tk was not run.\n\n" > {log}

        fi
        """



rule filter_checkm_results_and_copy_MAGs:
    """ 
    Filters checkm results based on est. completion, redundancy, and strain heterogeneity set in 'config.yaml'
    Defaults are conservatively 90, 10, and 50
    """

    input:
        bins_dir + config["additional_filename_prefix"] + "bins-checkm-out.tsv"
    output:
        MAGs_dir + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv"
    params:
        bins_dir = bins_dir,
        MAGs_dir = MAGs_dir,
        tmp_file = MAGs_dir + "MAGs-checkm-out.tmp",
        min_est_comp = config["minimum_estimated_completion"],
        max_est_redund = config["maximum_estimated_redundancy"],
    benchmark:
        benchmarks_dir + "filtering_checkm_results_and_copying_MAGs-benchmarks.tsv"
    shell:
        """        
        # only running if there were bins recovered
        if [ $(find {params.bins_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then

            cat <( printf "Name\tCompleteness\tContamination\tCompleteness_Model_Used\tTranslation_Table_Used\tCoding_Density\tContig_N50\tAverage_Gene_Length\tGenome_Size\tGC_Content\tTotal_Coding_Sequences\tAdditional_Notes\n" ) \
                <( awk -F $'\\t' ' $2 >= {params.min_est_comp} && $3 <= {params.max_est_redund} ' {input} ) > {params.tmp_file}

            sed 's/-bin\./-MAG-/' {params.tmp_file} > {output}

            for MAG in $(cut -f 1 {params.tmp_file} | tail -n +2)
            do
                new_ID=$(echo $MAG | sed 's/-bin\./-MAG-/')
                cp {params.bins_dir}${{MAG}}.fasta {params.MAGs_dir}${{new_ID}}.fasta
            done

            rm {params.tmp_file}

        else

            printf "There were no MAGs recovered.\n" > {output}

        fi
        """


rule generate_bins_overview_table:
    input:
        assembly_summaries = bins_dir + config["additional_filename_prefix"] + "bin-assembly-summaries.tsv",
        checkm_results = bins_dir + config["additional_filename_prefix"] + "bins-checkm-out.tsv",
        timing_trigger = MAGs_dir + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv"
    params:
        checkm_tmp = bins_dir + "checkm-estimates.tmp",
        checkm_w_header_tmp = bins_dir + "checkm-estimates-with-headers.tmp",
        bins_dir = bins_dir
    output:
        bins_dir + config["additional_filename_prefix"] + f"bins-overview.tsv"
    benchmark:
        benchmarks_dir + "generate_bins_overview_table-benchmarks.tsv"
    shell:
        """
        # only running if there were bins recovered
        if [ $(find {params.bins_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then

            # making sure none of the intermediate files exist already
            rm -rf {params.checkm_tmp} {params.checkm_w_header_tmp}

            for bin in $(cut -f 1 {input.assembly_summaries} | tail -n +2)
            do

                grep -w -m 1 "^${{bin}}" {input.checkm_results} | cut -f 2,3 >> {params.checkm_tmp}

            done

            # adding header
            cat <(printf "est. completeness\\test. redundancy\\n") {params.checkm_tmp} > {params.checkm_w_header_tmp}

            # combining
            paste {input.assembly_summaries} {params.checkm_w_header_tmp} > {output}

            rm -rf {params.checkm_tmp} {params.checkm_w_header_tmp}

        else

            rm -rf {params.bins_dir}*
            printf "There were no bins recovered.\n" > {output}

        fi
        """


rule run_checkm2_on_bins:
    conda:
        "envs/checkm2.yaml"
    input:
        trigger = expand(mapping_dir + "{ID}-metabat-assembly-depth.tsv", ID = sample_ID_list),
        checkm2_db_trigger = config["REF_DB_ROOT_DIR"] + config["CHECKM2_DATA_PATH"] + "/" + config["CHECKM2_TRIGGER_FILE"]
    params:
        bins_dir = bins_dir,
        output_dir = "checkm2-output-dir",
        checkm2_db_dir = config["REF_DB_ROOT_DIR"] + config["CHECKM2_DATA_PATH"],
        checkm2_db_filename = config["CHECKM2_DB_FILENAME"]
    resources:
        cpus = config["num_cpus"],
        mem_mb = config["checkm_memory_resources"]
    output:
        checkm2_results_tab = bins_dir + config["additional_filename_prefix"] + "bins-checkm-out.tsv"
    log:
        logs_dir + "checkm2.log"
    benchmark:
        benchmarks_dir + "run_checkm2_on_bins-benchmarks.tsv"
    shell:
        """
        # setting db dir variable if not set in the environment for some reason
            # snakemake uses bash strict mode, so need this so it won't fail if the variable isn't set
        set +u
        if [ -z "${{CHECKM2DB}}" ] || [ "${{CHECKM2DB}}" != "{params.checkm2_db_dir}/{params.checkm2_db_filename}" ]; then

            export CHECKM2DB={params.checkm2_db_dir}/{params.checkm2_db_filename}

        fi
        set -u

        checkm2 predict -i {params.bins_dir} -x fasta -t {resources.cpus} --output-directory {params.output_dir} --force > {log} 2>&1

        # making copy of primary output file
        cp {params.output_dir}/quality_report.tsv {output.checkm2_results_tab}

        # removing full checkm2-output-dir
        rm -rf {params.output_dir}
        """


rule summarize_bin_assemblies:
   """ summarize bin assemblies """

    conda:
        "envs/bit.yaml"
    input:
        trigger = expand(mapping_dir + "{ID}-metabat-assembly-depth.tsv", ID = sample_ID_list)
    params:
        intermediate_file = bins_dir + "bin-summaries.tmp",
        bins_dir = bins_dir
    output:
        bins_dir + config["additional_filename_prefix"] + "bin-assembly-summaries.tsv"
    benchmark:
        benchmarks_dir + "summarize_bin_assemblies-benchmarks.tsv"
    shell:
        """
        # only running if any bins were recovered
        if [ $(find {params.bins_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then

            bit-summarize-assembly {params.bins_dir}*.fasta -o {params.intermediate_file} -t

            # slimming down the output
            cut -f 1,2,3,5,6,8,11,18,19,20 {params.intermediate_file} > {output}
            rm {params.intermediate_file}

        else

            printf "There were no bins recovered.\n" > {output}

        fi
        """


rule metabat_binning:
    """
    This rule runs metabat2 for binning contigs.
    """

    conda:
        "envs/metabat.yaml"
    input:
        assembly = assemblies_dir + "{ID}-assembly.fasta",
        bam = mapping_dir + "{ID}.bam"
    params:
        bins_dir = bins_dir,
        prefix = bins_dir + "{ID}-bin",
        tmp_bins_file = "{ID}-bin-files.tmp",
        tmp_rename_script = "{ID}-rename.tmp"
    resources:
        cpus = config["num_threads"]
    output:
        depth_file = mapping_dir + "{ID}-metabat-assembly-depth.tsv"
    log:
        logs_dir + "{ID}-bam-summarize-and-metabat.log"
    benchmark:
        benchmarks_dir + "metabat_binning-{ID}-benchmarks.tsv"
    shell:
        """
        # only running if the assembly produced anything
        if [ -s {input.assembly} ]; then

            jgi_summarize_bam_contig_depths --outputDepth {output.depth_file} --percentIdentity 97 --minContigLength 1000 --minContigDepth 1.0  --referenceFasta {input.assembly} {input.bam} > {log} 2>&1

            # only running if there are contigs with coverage information in the coverage file we just generated
            if [ $(wc -l {output.depth_file} | sed 's/^ *//' | cut -f 1 -d " ") -gt 1 ]; then 
                metabat2  --inFile {input.assembly} --outFile {params.prefix} --abdFile {output.depth_file} -t {resources.cpus} >> {log} 2>&1
            else
                printf "\n\nThere was no coverage info generated in {output.depth_file}, so no binning with metabat was performed.\n\n" >> {log}
            fi

            # changing extensions from .fa to .fasta to match nt fasta extension elsewhere in GeneLab
            find {params.bins_dir} -name {wildcards.ID}*.fa > {params.tmp_bins_file}
            
            if [ -s {params.tmp_bins_file} ]; then
                paste -d " " <( sed 's/^/mv /' {params.tmp_bins_file} ) <( sed 's/.fa/.fasta/' {params.tmp_bins_file} ) > {params.tmp_rename_script}
                bash {params.tmp_rename_script}
            fi

            rm -rf {params.tmp_bins_file} {params.tmp_rename_script}

        else

            touch {output}
            printf "Binning not performed because the assembly didn't produce anything.\n" > {log}

        fi
        """


rule make_combined_contig_tax_tables:
    conda:
        "envs/bit.yaml"
    input:
        expand(annotations_and_tax_dir + "{ID}-contig-coverage-and-tax.tsv", ID = sample_ID_list)
    params:
        out_prefix = combined_outputs_dir + config["additional_filename_prefix"] + "Combined",
    output:
        combined_tax = combined_outputs_dir + config["additional_filename_prefix"] + "Combined-contig-level-taxonomy-coverages.tsv",
        norm_combined_tax = combined_outputs_dir + config["additional_filename_prefix"] + "Combined-contig-level-taxonomy-coverages-CPM.tsv"
    benchmark:
        benchmarks_dir + "make_combined_contig_tax_tables-benchmarks.tsv"
    shell:
        """
        python scripts/combine-contig-tax-tables.py {input} -o {params.out_prefix}
        """


rule make_combined_gene_level_tables:
    conda:
        "envs/bit.yaml"
    input:
        expand(annotations_and_tax_dir + "{ID}-gene-coverage-annotation-and-tax.tsv", ID = sample_ID_list)
    params:
        out_prefix = combined_outputs_dir + config["additional_filename_prefix"] + "Combined",
    output:
        combined_annots = combined_outputs_dir + config["additional_filename_prefix"] + "Combined-gene-level-KO-function-coverages.tsv",
        norm_combined_annots = combined_outputs_dir + config["additional_filename_prefix"] + "Combined-gene-level-KO-function-coverages-CPM.tsv",
        combined_tax = combined_outputs_dir + config["additional_filename_prefix"] + "Combined-gene-level-taxonomy-coverages.tsv",
        norm_combined_tax = combined_outputs_dir + config["additional_filename_prefix"] + "Combined-gene-level-taxonomy-coverages-CPM.tsv"
    benchmark:
        benchmarks_dir + "make_combined_gene_level_tables-benchmarks.tsv"
    shell:
        """
        python scripts/combine-KO-and-tax-tables.py {input} -o {params.out_prefix}
        """


rule combine_contig_tax_and_coverage:
    """
    This rule combines the contig-level taxonomic and coverage information for each individual sample. 
    """
    input:
        cov = mapping_dir + "{ID}-contig-coverages.tsv",
        tax = annotations_and_tax_dir + "{ID}-contig-tax.tsv"
    params:
        assembly = assemblies_dir + "{ID}-assembly.fasta",
        AAs = genes_dir + "{ID}-genes.faa",
        contig_tmp = annotations_and_tax_dir + "{ID}-contig.tmp",
        header_tmp = annotations_and_tax_dir + "{ID}-contig-header.tmp",
        contig_p1_tmp = annotations_and_tax_dir + "{ID}-contig-p1.tmp",
        tax_col_tmp = annotations_and_tax_dir + "{ID}-tax-col.tmp"
    output:
        annotations_and_tax_dir + "{ID}-contig-coverage-and-tax.tsv"
    benchmark:
        benchmarks_dir + "combine_contig_tax_and_coverage-{ID}-benchmarks.tsv"
    shell:
        """
        # only running if the assembly produced anything
        if [ -s {params.assembly} ]; then

            # if there were no genes called, there is no contig-level taxonomy, so dealing with that here
            if [ -s {params.AAs} ]; then
                paste <( tail -n +2 {input.cov} | sort -V -k 1 ) <( tail -n +2 {input.tax} | sort -V -k 1 | cut -f 2- ) > {params.contig_tmp}
                paste <( head -n 1 {input.cov} ) <( head -n 1 {input.tax} | cut -f 2- ) > {params.header_tmp}
                cat {params.header_tmp} {params.contig_tmp} > {output}
                rm -rf {params.contig_tmp} {params.header_tmp}
                rm -rf {input}

            else

                paste <( tail -n +2 {input.cov} | sort -V -k 1 ) > {params.contig_p1_tmp}
                sed 's/.*/NA/g' {params.contig_p1_tmp} > {params.tax_col_tmp}
                paste {params.contig_p1_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} > {params.contig_tmp}
                cat <( printf "contig_ID\tcoverage\ttaxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n" ) {params.contig_tmp} > {output}
                rm -rf {params.contig_p1_tmp} {params.tax_col_tmp} {params.contig_tmp}
                rm -rf {input}

            fi

        else

            printf "contig_ID\tcoverage\ttaxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n" > {output}
            rm -rf {input}

        fi
        """


rule combine_gene_annots_tax_and_coverage:
    """
    This rule combines the gene-level functional annotations, taxonomic classifications, and coverage information for each individual sample.
    """
    input:
        cov = mapping_dir + "{ID}-gene-coverages.tsv",
        annots = annotations_and_tax_dir + "{ID}-annotations.tsv",
        tax = annotations_and_tax_dir + "{ID}-gene-tax.tsv"
    params:
        assembly = assemblies_dir + "{ID}-assembly.fasta",
        AAs = genes_dir + "{ID}-genes.faa",
        gene_tmp = annotations_and_tax_dir + "{ID}-gene.tmp",
        header_tmp = annotations_and_tax_dir + "{ID}-gene-header.tmp"
    output:
        annotations_and_tax_dir + "{ID}-gene-coverage-annotation-and-tax.tsv"
    benchmark:
        benchmarks_dir + "combine_gene_annots_tax_and_coverage-{ID}-benchmarks.tsv"
    shell:
        """
        # only running if the assembly produced anything and genes were identified (they are required for this)
        if [ -s {params.assembly} ] && [ -s {params.AAs} ]; then

            paste <( tail -n +2 {input.cov} | sort -V -k 1 ) <( tail -n +2 {input.annots} | sort -V -k 1 | cut -f 2- ) <( tail -n +2 {input.tax} | sort -V -k 1 | cut -f 2- ) > {params.gene_tmp}
            paste <( head -n 1 {input.cov} ) <( head -n 1 {input.annots} | cut -f 2- ) <( head -n 1 {input.tax} | cut -f 2- ) > {params.header_tmp}

            cat {params.header_tmp} {params.gene_tmp} > {output}

            rm -rf {params.gene_tmp} {params.header_tmp}
            rm -rf {input}

        else

            printf "gene_ID\tcoverage\tKO_ID\tKO_function\ttaxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n" > {output}
            rm -rf {input}

        fi
        """


rule get_cov_and_det:
    """
    This rule pulls out coverage and detection information for each sample, gene-level and contig-level,
    and filters the gene-level coverage information based on requiring at least 50% detection.
    """

    conda:
        "envs/mapping.yaml"
    input:
        bam = mapping_dir + "{ID}.bam",
        nt = genes_dir + "{ID}-genes.fasta"
    params:
        assembly = assemblies_dir + "{ID}-assembly.fasta",
        gene_cov_and_det_tmp = mapping_dir + "{ID}-gene-cov-and-det.tmp",
        contig_cov_and_det_tmp = mapping_dir + "{ID}-contig-cov-and-det.tmp",
        gene_cov_tmp = mapping_dir + "{ID}-gene-cov.tmp",
        contig_cov_tmp = mapping_dir + "{ID}-contig-cov.tmp",
        pileup_mem = config["pileup_mem"]
    output:
        gene_covs = mapping_dir + "{ID}-gene-coverages.tsv",
        contig_covs = mapping_dir + "{ID}-contig-coverages.tsv"
    resources:
        mem_mb = config["pileup_memory_resources"]
    log:
        logs_dir + "{ID}-pileup.log"
    benchmark:
        benchmarks_dir + "get_cov_and_det-{ID}-benchmarks.tsv"
    shell:
        """
        # only running if the assembly produced anything
        if [ -s {params.assembly} ]; then

            # only running on genes also if genes were identified
            if [ -s {input.nt} ]; then

                pileup.sh -Xmx{params.pileup_mem} -in {input.bam} fastaorf={input.nt} outorf={params.gene_cov_and_det_tmp} out={params.contig_cov_and_det_tmp} > {log} 2>&1

                # filtering coverages based on detection
                    # genes
                grep -v "#" {params.gene_cov_and_det_tmp} | awk -F $'\\t' ' BEGIN {{OFS=FS}} {{ if ( $10 <= 0.5 ) $4 = 0 }} {{ print $1,$4 }} ' > {params.gene_cov_tmp}
                cat <( printf "gene_ID\tcoverage\n" ) {params.gene_cov_tmp} > {output.gene_covs}

                    # contigs
                grep -v "#" {params.contig_cov_and_det_tmp} | awk -F $'\\t' ' BEGIN {{OFS=FS}} {{ if ( $5 <= 50 ) $2 = 0 }} {{ print $1,$2 }} ' > {params.contig_cov_tmp}
                cat <( printf "contig_ID\tcoverage\n" ) {params.contig_cov_tmp} > {output.contig_covs}

                # removing intermediate files
                rm {params.gene_cov_and_det_tmp} {params.contig_cov_and_det_tmp} {params.gene_cov_tmp} {params.contig_cov_tmp}

            else

                pileup.sh -in {input.bam} out={params.contig_cov_and_det_tmp} > {log} 2>&1

                # filtering coverages based on detection
                    # contigs
                grep -v "#" {params.contig_cov_and_det_tmp} | awk -F $'\\t' ' BEGIN {{OFS=FS}} {{ if ( $5 <= 50 ) $2 = 0 }} {{ print $1,$2 }} ' > {params.contig_cov_tmp}
                cat <( printf "contig_ID\tcoverage\n" ) {params.contig_cov_tmp} > {output.contig_covs}

                # writing out empty genes coverage file
                printf "gene_ID\tcoverage\n" > {output.gene_covs}
                printf "\n\nGene-level coverage info not recovered because the assembly didn't have any genes identified.\n" >> {log}

                # removing intermediate files
                rm {params.contig_cov_and_det_tmp} {params.contig_cov_tmp}

            fi

        else

            printf "gene_ID\tcoverage\n" > {output.gene_covs}
            printf "contig_ID\tcoverage\n" > {output.contig_covs}
            printf "Coverage info not recovered because the assembly didn't produce anything.\n" > {log}

        fi
        """


if config["single_end_data"] != "TRUE":
    # mapping rule if paired-end data

    rule mapping_PE:
        """
        This rule builds the bowtie2 index and runs the mapping for each sample.
        """
        conda:
            "envs/mapping.yaml"
        input:
            assembly = assemblies_dir + "{ID}-assembly.fasta",
            R1 = filtered_reads_dir + "{ID}" + config["filtered_R1_suffix"],
            R2 = filtered_reads_dir + "{ID}" + config["filtered_R2_suffix"]
        params:
            index = mapping_dir + "{ID}-index",
            mapping_info = mapping_dir + "{ID}-mapping-info.txt",
            num_threads = config["num_threads"]
        resources:
            cpus = config["num_threads"],
            mem_mb = config["mapping_memory_resources"]
        output:
            mapping_dir + "{ID}.bam"
        log:
            logs_dir + "{ID}-bowtie2-build.log"
        benchmark:
            benchmarks_dir + "run_mapping-{ID}-benchmarks.tsv"
        shell:
            """
            # only running if the assembly produced anything
            if [ -s {input.assembly} ]; then

                bowtie2-build {input.assembly} {params.index} > {log} 2>&1
                bowtie2 --mm -q --threads {params.num_threads} -x {params.index} -1 {input.R1} -2 {input.R2} --no-unal 2> {params.mapping_info} | samtools view -b | samtools sort -@ {params.num_threads} > {output} 2> /dev/null
                rm {params.index}*

            else

                touch {output}
                printf "Mapping not performed because the assembly didn't produce anything.\n" > {log}

            fi
            """

else:
    # mapping rule if single-end data

    rule mapping_SE:
        """
        This rule builds the bowtie2 index and runs the mapping for each sample.
        """
        conda:
            "envs/mapping.yaml"
        input:
            assembly = assemblies_dir + "{ID}-assembly.fasta",
            R1 = filtered_reads_dir + "{ID}" + config["filtered_suffix"]
        params:
            index = mapping_dir + "{ID}-index",
            mapping_info = mapping_dir + "{ID}-mapping-info.txt",
            num_threads = config["num_threads"]
        resources:
            cpus = config["num_threads"],
            mem_mb = config["mapping_memory_resources"]
        output:
            mapping_dir + "{ID}.bam"
        log:
            logs_dir + "{ID}-bowtie2-build.log"
        benchmark:
            benchmarks_dir + "run_mapping-{ID}-benchmarks.tsv"
        shell:
            """
            # only running if the assembly produced anything
            if [ -s {input.assembly} ]; then

                bowtie2-build {input.assembly} {params.index} > {log} 2>&1
                bowtie2 --mm -q --threads {params.num_threads} -x {params.index} -r {input.R1} --no-unal 2> {params.mapping_info} | samtools view -b | samtools sort -@ {params.num_threads} > {output} 2> /dev/null
                rm {params.index}*

            else

                touch {output}
                printf "Mapping not performed because the assembly didn't produce anything.\n" > {log}

            fi
            """


rule tax_classification:
    """
    This rule runs the gene- and contig-level taxonomic classifications for each assembly.
    """

    conda:
        "envs/cat.yaml"
    input:
        assembly = assemblies_dir + "{ID}-assembly.fasta",
        AA = genes_dir + "{ID}-genes.faa",
        cat_db_trigger = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + "/" + config["CAT_TRIGGER_FILE"]
    output:
        gene_tax_out = annotations_and_tax_dir + "{ID}-gene-tax.tsv",
        contig_tax_out = annotations_and_tax_dir + "{ID}-contig-tax.tsv"
    params:
        tmp_out_prefix = annotations_and_tax_dir + "{ID}-tax-out.tmp",
        tmp_genes = annotations_and_tax_dir + "{ID}-gene-tax.tmp",
        tmp_contigs = annotations_and_tax_dir + "{ID}-contig-tax.tmp",
        cat_db = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + "/" + config["CAT_DB"],
        cat_tax = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + "/" + config["CAT_TAX"],
        block_size = config["block_size"]
    resources:
        cpus = config["num_cpus"],
        mem_mb = config["CAT_memory_resources"]
    log:
        logs_dir + "{ID}-CAT.log"
    benchmark:
        benchmarks_dir + "run_tax_classification-{ID}-benchmarks.tsv"
    shell:
        """
        # only running if assembly produced any contigs and genes were identified (they are required for this)
        if [ -s {input.assembly} ] && [ -s {input.AA} ]; then

            CAT contigs -d {params.cat_db} -t {params.cat_tax} -n {resources.cpus} -r 3 --top 4 --I_know_what_Im_doing -c {input.assembly} -p {input.AA} -o {params.tmp_out_prefix} --no_stars --block_size {params.block_size} --index_chunks 2 --force > {log} 2>&1

            # adding names to gene classifications
            CAT add_names -i {params.tmp_out_prefix}.ORF2LCA.txt -o {params.tmp_genes} -t {params.cat_tax} --only_official --exclude_scores >> {log} 2>&1

            # formatting gene classifications
            bash scripts/format-gene-tax-classifications.sh {params.tmp_genes} {output.gene_tax_out}

            # adding names to contig classifications
            CAT add_names -i {params.tmp_out_prefix}.contig2classification.txt -o {params.tmp_contigs} -t {params.cat_tax} --only_official --exclude_scores >> {log} 2>&1

            # formatting contig classifications
            bash scripts/format-contig-tax-classifications.sh {params.tmp_contigs} {output.contig_tax_out}

            rm -rf {params.tmp_out_prefix}* {params.tmp_genes} {params.tmp_contigs}

        else

            touch {output}
            printf "Assembly-based taxonomic classification not performed because the assembly didn't produce anything and/or no genes were identified.\n" > {log}

        fi
        """


rule KO_annotation:
    """
    This rule runs the gene-level (KO) functional annotation for each sample.
    """
    conda:
        "envs/kofamscan.yaml"
    input:
        AAs = genes_dir + "{ID}-genes.faa",
        kofamscan_db_trigger = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"] + "/" + config["KOFAMSCAN_TRIGGER_FILE"]
    output:
        annotations_and_tax_dir + "{ID}-annotations.tsv"
    params:
        assembly = assemblies_dir + "{ID}-assembly.fasta",
        ko_db_dir = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"],
        tmp_out = annotations_and_tax_dir + "{ID}-KO-tab.tmp",
        tmp_dir = annotations_and_tax_dir + "{ID}-tmp-KO-dir"
    resources:
        cpus = config["num_cpus"],
        mem_mb = config["KOFamScan_memory_resources"]
    log:
        logs_dir + "{ID}-kofamscan.log"
    benchmark:
        benchmarks_dir + "run_KO_annotation-{ID}-benchmarks.tsv"
    shell:
        """
        # only running if assembly produced any contigs and genes were identified (they are required for this)
        if [ -s {params.assembly} ] && [ -s {input.AAs} ]; then

            exec_annotation -p {params.ko_db_dir}/profiles/ -k {params.ko_db_dir}/ko_list --cpu {resources.cpus} -f detail-tsv -o {params.tmp_out} --tmp-dir {params.tmp_dir} --report-unannotated {input.AAs} > {log} 2>&1

            bit-filter-KOFamScan-results -i {params.tmp_out} -o {output}

            rm -rf {params.tmp_out} {params.tmp_dir}

        else

            touch {output}
            printf "Functional annotations not performed because the assembly didn't produce anything and/or no genes were identified.\n" > {log}

        fi
        """


rule call_genes:
    """
    This rule calls genes on each assembly file.
    """

    conda:
        "envs/prodigal.yaml"
    input:
        assembly = assemblies_dir + "{ID}-assembly.fasta"
    output:
        AA = genes_dir + "{ID}-genes.faa",
        nt = genes_dir + "{ID}-genes.fasta",
        gff = genes_dir + "{ID}-genes.gff"
    log:
        logs_dir + "{ID}-prodigal.log"
    benchmark:
        benchmarks_dir + "call_genes-{ID}-benchmarks.tsv"
    shell:
        """
        # only running if assembly produced any contigs
        if [ -s {input.assembly} ]; then

            prodigal -q -c -p meta -a {output.AA} -d {output.nt} -f gff -o {output.gff} -i {input.assembly} > {log} 2>&1

            # removing line-wraps
            bit-remove-wraps {output.AA} > {output.AA}.tmp 2> /dev/null && mv {output.AA}.tmp {output.AA}
            bit-remove-wraps {output.nt} > {output.nt}.tmp 2> /dev/null && mv {output.nt}.tmp {output.nt}

        else

            touch {output}
            printf "Gene-calling not performed because the assembly didn't produce anything.\n" > {log}

        fi
        """


rule summarize_assemblies:
    """
    This rule summarizes and reports general stats for all individual sample assemblies in one table.
    """
    conda:
        "envs/bit.yaml"
    input:
        expand(assemblies_dir + "{ID}-assembly.fasta", ID = sample_ID_list)        
    output:
        assemblies_dir + config["additional_filename_prefix"] + f"assembly-summaries.tsv"
    benchmark:
        benchmarks_dir + "summarize_assemblies-benchmarks.tsv"
    shell:
        """
        bit-summarize-assembly -o {output} {input}
        """


if config["single_end_data"] != "TRUE":
    # assembly rule if paired-end data
    rule assemble_PE:
        """
        This rule handles running the assembly for each individual sample.
        """
        conda:
            "envs/megahit.yaml"
        input:
            R1 = filtered_reads_dir + "{ID}" + config["filtered_R1_suffix"],
            R2 = filtered_reads_dir + "{ID}" + config["filtered_R2_suffix"]
        params:
            assemblies_dir = assemblies_dir,
            max_mem = config["max_mem_megahit"],
            failed_assemblies_file = assemblies_dir + config["additional_filename_prefix"] + f"Failed-assemblies.tsv"
        resources:
            cpus = config["num_threads"],
            mem_mb = config["megahit_memory_resources"]
        output:
            assemblies_dir + "{ID}-assembly.fasta"
        log:
            logs_dir + "{ID}-assembly.log"
        benchmark:
            benchmarks_dir + "assemble-{ID}-benchmarks.tsv"
        shell:
            """
            # removing output directory if exists already but rule still needs to be run (because there is no --force option to megahit i dont't think):
            rm -rf {params.assemblies_dir}{wildcards.ID}-megahit-out/

            megahit -1 {input.R1} -2 {input.R2} -m {params.max_mem} -t {resources.cpus} --min-contig-len 500 -o {params.assemblies_dir}{wildcards.ID}-megahit-out > {log} 2>&1
            bit-rename-fasta-headers -i {params.assemblies_dir}{wildcards.ID}-megahit-out/final.contigs.fa -w c_{wildcards.ID} -o {output}

            rm -rf {params.assemblies_dir}{wildcards.ID}-megahit-out/

            # checking the assembly produced anything (megahit can run, produce the output fasta, but it will be empty if no contigs were assembled)
            if [ ! -s {output} ]; then
                printf "{wildcards.ID}\tNo contigs assembled\n" >> {params.failed_assemblies_file}
            fi
            """

else:
    # assembly rule if single-end data
    rule assemble_SE:
        """
        This rule handles running the assembly for each individual sample.
        """
        conda:
            "envs/megahit.yaml"
        input:
            R1 = filtered_reads_dir + "{ID}" + config["filtered_suffix"]
        params:
            assemblies_dir = assemblies_dir,
            max_mem = config["max_mem_megahit"],
            failed_assemblies_file = assemblies_dir + config["additional_filename_prefix"] + f"Failed-assemblies.tsv"
        resources:
            cpus = config["num_threads"],
            mem_mb = config["megahit_memory_resources"]
        output:
            assemblies_dir + "{ID}-assembly.fasta"
        log:
            logs_dir + "{ID}-assembly.log"
        benchmark:
            benchmarks_dir + "assemble-{ID}-benchmarks.tsv"
        shell:
            """
            # removing output directory if exists already but rule still needs to be run (because there is no --force option to megahit i dont't think):
            rm -rf {params.assemblies_dir}{wildcards.ID}-megahit-out/

            megahit -r {input.R1} -m {params.max_mem} -t {resources.cpus} --min-contig-len 500 -o {params.assemblies_dir}{wildcards.ID}-megahit-out > {log} 2>&1
            bit-rename-fasta-headers -i {params.assemblies_dir}{wildcards.ID}-megahit-out/final.contigs.fa -w c_{wildcards.ID} -o {output}

            rm -rf {params.assemblies_dir}{wildcards.ID}-megahit-out/

            # checking the assembly produced anything (megahit can run, produce the output fasta, but it will be empty if no contigs were assembled)
            if [ ! -s {output} ]; then
                printf "{wildcards.ID}\tNo contigs assembled\n" >> {params.failed_assemblies_file}
            fi
            """


if config["single_end_data"] != "TRUE":
    # quality-trimming/filtering rule if this is paired-end data
    # quality-trimming/filtering rule run slightly different if data are generated with Swift 1S library prep
    if config["swift_1S"] == "TRUE":

        rule bbduk_PE:
            """
            This rule runs quality filtering/trimming on raw input fastq files for each individual sample.
            """

            conda:
                "envs/qc.yaml"
            input:
                in1 = config["input_reads_dir"] + "{ID}" + config["input_R1_suffix"],
                in2 = config["input_reads_dir"] + "{ID}" + config["input_R2_suffix"]
            output:
                out1 = filtered_reads_dir + "{ID}" + config["filtered_R1_suffix"],
                out2 = filtered_reads_dir + "{ID}" + config["filtered_R2_suffix"]
            log:
                logs_dir + "{ID}-bbduk.log"
            benchmark:
                benchmarks_dir + "bbduk-{ID}-benchmarks.tsv"
            shell:
                """
                bbduk.sh in={input.in1} in2={input.in2} out1={output.out1} out2={output.out2} \
                        ref=${{CONDA_PREFIX}}/opt/bbmap-38.86-0/resources/adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \
                        trimq=10 mlf=0.5 maxns=0 swift=t > {log} 2>&1
                """

    else:

        rule bbduk_PE:
            """
            This rule runs quality filtering/trimming on raw input fastq files for each individual sample.
            """

            conda:
                "envs/qc.yaml"
            input:
                in1 = config["input_reads_dir"] + "{ID}" + config["input_R1_suffix"],
                in2 = config["input_reads_dir"] + "{ID}" + config["input_R2_suffix"]
            output:
                out1 = filtered_reads_dir + "{ID}" + config["filtered_R1_suffix"],
                out2 = filtered_reads_dir + "{ID}" + config["filtered_R2_suffix"]
            log:
                logs_dir + "{ID}-bbduk.log"
            benchmark:
                benchmarks_dir + "bbduk-{ID}-benchmarks.tsv"
            shell:
                """
                bbduk.sh in={input.in1} in2={input.in2} out1={output.out1} out2={output.out2} \
                        ref=${{CONDA_PREFIX}}/opt/bbmap-38.86-0/resources/adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \
                        trimq=10 mlf=0.5 maxns=0 > {log} 2>&1
                """

else:
    # quality-trimming/filtering rule if this is single-end data
    # quality-trimming/filtering rule run slightly different if data are generated with Swift 1S library prep
    if config["swift_1S"] == "TRUE":

        rule bbduk_SE:
            """
            This rule runs quality filtering/trimming on input fastq files for each individual sample.
            """

            conda:
                "envs/qc.yaml"
            input:
                in1 = config["input_reads_dir"] + "{ID}" + config["input_suffix"]
            output:
                out1 = filtered_reads_dir + "{ID}" + config["filtered_suffix"]
            log:
                logs_dir + "{ID}-bbduk.log"
            benchmark:
                benchmarks_dir + "bbduk-{ID}-benchmarks.tsv"
            shell:
                """
                bbduk.sh in={input.in1} out1={output.out1} \
                        ref=${{CONDA_PREFIX}}/opt/bbmap-38.86-0/resources/adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \
                        trimq=10 mlf=0.5 maxns=0 swift=t > {log} 2>&1
                """

    else:

        rule bbduk_SE:
            """
            This rule runs quality filtering/trimming on input fastq files for each individual sample.
            """

            conda:
                "envs/qc.yaml"
            input:
                in1 = config["input_reads_dir"] + "{ID}" + config["input_suffix"]
            output:
                out1 = filtered_reads_dir + "{ID}" + config["filtered_suffix"]
            log:
                logs_dir + "{ID}-bbduk.log"
            benchmark:
                benchmarks_dir + "bbduk-{ID}-benchmarks.tsv"
            shell:
                """
                bbduk.sh in={input.in1} out1={output.out1} \
                        ref=${{CONDA_PREFIX}}/opt/bbmap-38.86-0/resources/adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \
                        trimq=10 mlf=0.5 maxns=0 > {log} 2>&1
                """


if config["single_end_data"] != "TRUE":

    # QC rules if this is paired-end data
    rule raw_multiqc_PE:
        """
        This rule collates all raw fastqc outputs.
        """

        conda:
            "envs/qc.yaml"
        input:
            expand(config["input_reads_dir"] + "{ID}" + config["input_R1_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list),
            expand(config["input_reads_dir"] + "{ID}" + config["input_R2_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list)
        params:
            reads_dir = config["input_reads_dir"],
            int_out_dir = config["additional_filename_prefix"] + "raw_multiqc_report",
            out_filename_prefix = config["additional_filename_prefix"] + "raw_multiqc",
            int_out_data_dir = config["additional_filename_prefix"] + "raw_multiqc_data",
            int_html_file = config["additional_filename_prefix"] + "raw_multiqc.html",
            int_zip = config["additional_filename_prefix"] + "raw_multiqc_report.zip",
            config_file = "config/multiqc.config"
        output:
            final_out_zip = fastqc_out_dir + config["additional_filename_prefix"] + f"raw_multiqc_report.zip"
        benchmark:
            benchmarks_dir + "raw_multiqc-benchmarks.tsv"
        shell:
            """
            multiqc -q -n {params.out_filename_prefix} --force --cl-config 'max_table_rows: 99999999' --interactive --config {params.config_file} {input} > /dev/null 2>&1
            
            # removing the individual fastqc files
            rm -rf {params.reads_dir}*fastqc*

            # making an output report directory and moving things into it
            mkdir -p {params.int_out_dir}
            mv {params.int_html_file} {params.int_out_data_dir} {params.int_out_dir}
            
            # zipping and removing unzipped dir
            zip -q -r {params.int_zip} {params.int_out_dir} && rm -rf {params.int_out_dir}

            # moving to final wanted location
            mv {params.int_zip} {output.final_out_zip}
            """


    rule raw_fastqc_PE:
        """
        This rule runs fastqc on all raw input fastq files.
        """

        conda:
            "envs/qc.yaml"
        input:
            config["input_reads_dir"] + "{ID}" + config["input_R1_suffix"],
            config["input_reads_dir"] + "{ID}" + config["input_R2_suffix"]
        output:
            config["input_reads_dir"] + "{ID}" + config["input_R1_suffix"].rsplit(".", 2)[0] + "_fastqc.zip",
            config["input_reads_dir"] + "{ID}" + config["input_R2_suffix"].rsplit(".", 2)[0] + "_fastqc.zip"
        benchmark:
            benchmarks_dir + "raw_fastqc-{ID}-benchmarks.tsv"
        shell:
            """
            fastqc {input} -t 2 -q
            """


    use rule raw_multiqc_PE as filtered_multiqc_PE with:
        input:
            expand(filtered_reads_dir + "{ID}" + config["filtered_R1_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list),
            expand(filtered_reads_dir + "{ID}" + config["filtered_R2_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list)
        params:
            reads_dir = filtered_reads_dir,
            int_out_dir = config["additional_filename_prefix"] + "filtered_multiqc_report",
            out_filename_prefix = config["additional_filename_prefix"] + "filtered_multiqc",
            int_out_data_dir = config["additional_filename_prefix"] + "filtered_multiqc_data",
            int_html_file = config["additional_filename_prefix"] + "filtered_multiqc.html",
            int_zip = config["additional_filename_prefix"] + "filtered_multiqc_report.zip",
            config_file = "config/multiqc.config"
        output:
            final_out_zip = fastqc_out_dir + config["additional_filename_prefix"] + f"filtered_multiqc_report.zip"
        benchmark:
            benchmarks_dir + "filtered_multiqc-benchmarks.tsv"


    use rule raw_fastqc_PE as filtered_fastqc_PE with:
        input:
            filtered_reads_dir + "{ID}" + config["filtered_R1_suffix"],
            filtered_reads_dir + "{ID}" + config["filtered_R2_suffix"]
        output:
            filtered_reads_dir + "{ID}" + config["filtered_R1_suffix"].rsplit(".", 2)[0] + "_fastqc.zip",
            filtered_reads_dir + "{ID}" + config["filtered_R2_suffix"].rsplit(".", 2)[0] + "_fastqc.zip"
        benchmark:
            benchmarks_dir + "filtered_fastqc-{ID}-benchmarks.tsv"



else:
    # QC rules if this is single-end data
    rule raw_multiqc_SE:
        """
        This rule collates all raw fastqc outputs.
        """

        conda:
            "envs/qc.yaml"
        input:
            expand(config["input_reads_dir"] + "{ID}" + config["input_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list)
        params:
            reads_dir = config["input_reads_dir"],
            int_out_dir = config["additional_filename_prefix"] + "raw_multiqc_report",
            out_filename_prefix = config["additional_filename_prefix"] + "raw_multiqc",
            int_out_data_dir = config["additional_filename_prefix"] + "raw_multiqc_data",
            int_html_file = config["additional_filename_prefix"] + "raw_multiqc.html",
            int_zip = config["additional_filename_prefix"] + "raw_multiqc_report.zip",
            config_file = "config/multiqc.config"
        output:
            final_out_zip = fastqc_out_dir + config["additional_filename_prefix"] + f"raw_multiqc_report.zip"
        benchmark:
            benchmarks_dir + "raw_multiqc-benchmarks.tsv"
        shell:
            """
            multiqc -q -n {params.out_filename_prefix} --force --cl-config 'max_table_rows: 99999999' --interactive --config {params.config_file} {input} > /dev/null 2>&1
            
            # removing the individual fastqc files
            rm -rf {params.reads_dir}*fastqc*

            # making an output report directory and moving things into it
            mkdir -p {params.int_out_dir}
            mv {params.int_html_file} {params.int_out_data_dir} {params.int_out_dir}
            
            # zipping and removing unzipped dir
            zip -q -r {params.int_zip} {params.int_out_dir} && rm -rf {params.int_out_dir}

            # moving to final wanted location
            mv {params.int_zip} {output.final_out_zip}
            """


    rule raw_fastqc_SE:
        """
        This rule runs fastqc on all raw input fastq files.
        """

        conda:
            "envs/qc.yaml"
        input:
            config["input_reads_dir"] + "{ID}" + config["input_suffix"]
        output:
            config["input_reads_dir"] + "{ID}" + config["input_suffix"].rsplit(".", 2)[0] + "_fastqc.zip"
        benchmark:
            benchmarks_dir + "raw_fastqc-{ID}-benchmarks.tsv"
        shell:
            """
            fastqc {input} -t 2 -q
            """


    use rule raw_multiqc_SE as filtered_multiqc_SE with:
        input:
            expand(filtered_reads_dir + "{ID}" + config["filtered_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list)
        params:
            reads_dir = filtered_reads_dir,
            int_out_dir = config["additional_filename_prefix"] + "filtered_multiqc_report",
            out_filename_prefix = config["additional_filename_prefix"] + "filtered_multiqc",
            int_out_data_dir = config["additional_filename_prefix"] + "filtered_multiqc_data",
            int_html_file = config["additional_filename_prefix"] + "filtered_multiqc.html",
            int_zip = config["additional_filename_prefix"] + "filtered_multiqc_report.zip",
            config_file = "config/multiqc.config"
        output:
            final_out_zip = fastqc_out_dir + config["additional_filename_prefix"] + f"filtered_multiqc_report.zip"
        benchmark:
            benchmarks_dir + "filtered_multiqc-benchmarks.tsv"


    use rule raw_fastqc_SE as filtered_fastqc_SE with:
        input:
            filtered_reads_dir + "{ID}" + config["filtered_suffix"]
        output:
            filtered_reads_dir + "{ID}" + config["filtered_suffix"].rsplit(".", 2)[0] + "_fastqc.zip"
        benchmark:
            benchmarks_dir + "filtered_fastqc-{ID}-benchmarks.tsv"


### database checking and setup rules ###
rule setup_checkm2_db:
    """
    This rule checks for the checkm2 db and downloads if needed.
    """

    conda:
        "envs/checkm2.yaml"

    output:
        checkm2_db_trigger = config["REF_DB_ROOT_DIR"] + config["CHECKM2_DATA_PATH"] + "/" + config["CHECKM2_TRIGGER_FILE"]
    params:
        checkm2_db_dir = config["REF_DB_ROOT_DIR"] + config["CHECKM2_DATA_PATH"],
        checkm2_db_filename = config["CHECKM2_DB_FILENAME"]
    log:
        logs_dir + "setup-checkm2-db.log"
    shell:
        """
        # adding wanted location to this conda env PATH (checkm2 looks in the CHECKM2DB variable),
            # so will be set when the conda environment is started from now on
        mkdir -p ${{CONDA_PREFIX}}/etc/conda/activate.d/
        echo 'export CHECKM2DB={params.checkm2_db_dir}/{params.checkm2_db_filename}' >> ${{CONDA_PREFIX}}/etc/conda/activate.d/set_env_vars.sh
        
        # downloading ref db (will move to where we want it next)
        checkm2 database --download > {log} 2>&1

        # moving from default download location to wanted location
        mkdir -p {params.checkm2_db_dir}
        mv ~/databases/CheckM2_database/* {params.checkm2_db_dir}
        rm -rf ~/databases/CheckM2_database

        # and setting checkm db location
        checkm2 database --setdblocation {params.checkm2_db_dir}/{params.checkm2_db_filename}

        # placing file so that the workflow knows in the future this is done already
        touch {output.checkm2_db_trigger}
        """


rule setup_KOFamScan_db:
    """
    This rule checks for the KOFamScan db (minimally currently) and downloads if needed.
    """

    conda:
        "envs/kofamscan.yaml"
    output:
        kofamscan_db_trigger = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"] + "/" + config["KOFAMSCAN_TRIGGER_FILE"]
    params:
        ko_db_dir = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"],
        compressed_ko_list = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"] + "/ko_list.gz",
        compressed_profiles = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"] + "/profiles.tar.gz"
    log:
        logs_dir + "setup-kofamscan-db.log"
    benchmark:
        benchmarks_dir + "setup_KOFamScan_db-benchmarks.tsv"
    shell:
        """
        mkdir -p {params.ko_db_dir}

        printf "### Setting up KOFamScan reference database ###\n\n" > {log} 2>&1
        
        # using https instead of ftp for those whose systems that don't have access to the ftp servers

        printf "\n  Downloading ko_list file:\n\n" >> {log} 2>&1

        if ! curl -L -C - --connect-timeout 15 -o {params.compressed_ko_list} ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz >> {log} 2>&1
        then
            printf "\n\n  Downloading via http since ftp seemed to fail making the connection:\n\n"
            curl -L -C - -o {params.compressed_ko_list} https://www.genome.jp/ftp/db/kofam/ko_list.gz >> {log} 2>&1
        fi

        printf "\n\n  Downloading profiles.tar.gz file:\n\n" >> {log} 2>&1


        if ! curl -L -C - --connect-timeout 15 -o {params.compressed_profiles} ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz >> {log} 2>&1
        then
            printf "\n\n  Downloading via http since ftp seemed to fail making the connection:\n\n"
            curl -L -C - -o {params.compressed_profiles} https://www.genome.jp/ftp/db/kofam/profiles.tar.gz >> {log} 2>&1
        fi

        printf "\n\n  Decompressing profiles.tar.gz file:\n\n" >> {log} 2>&1
        tar -xzf {params.compressed_profiles} -C {params.ko_db_dir} >> {log} 2>&1
        rm {params.compressed_profiles}

        gunzip {params.compressed_ko_list}

        touch {output.kofamscan_db_trigger}
        """


rule setup_gtdbtk_db:
    """
    This rule checks for the gtdb-tk db (minimally currently) and downloads if needed.
    """

    conda:
        "envs/gtdb-tk.yaml"
    output:
        gtdbtk_db_trigger = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"] + "/" + config["GTDB_TRIGGER_FILE"]
    params:
        gtdbtk_db_dir = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"]
    log:
        logs_dir + "setup-gtdbtk-db.log"
    benchmark:
        benchmarks_dir + "setup_gtdbtk_db-benchmarks.tsv"
    shell:
        """
        mkdir -p {params.gtdbtk_db_dir}

        # adding wanted location to this conda env PATH (gtdb-tk looks in the GTDBTK_DATA_PATH variable),
            # so will be set when the conda environment is started from now on
        mkdir -p ${{CONDA_PREFIX}}/etc/conda/activate.d/
        echo 'export GTDBTK_DATA_PATH={params.gtdbtk_db_dir}' >> ${{CONDA_PREFIX}}/etc/conda/activate.d/set_env_vars.sh

        # but still needs to be set for this particular session that is downloading and setting up the db
        GTDBTK_DATA_PATH={params.gtdbtk_db_dir}

        # now downloading (see note in scripts/download-gtdbtk-refs.sh as to why using this instead of download-db.sh)
        bash scripts/download-gtdbtk-refs.sh {params.gtdbtk_db_dir} > {log} 2>&1

        touch {output.gtdbtk_db_trigger}
        """


rule setup_CAT_db:
    """
    This rule checks for the CAT reference database, and downloads if needed.
    """

    conda:
        "envs/cat.yaml"
    output:
        cat_db_trigger = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + "/" + config["CAT_TRIGGER_FILE"]
    params:
        cat_db_dir = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"],
        compressed_cat = config["REF_DB_ROOT_DIR"] + config["CAT_DL_FILE"],
        compressed_nr_faa = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + "/" + config["CAT_DB"] + "/" + config["CAT_NR_GZ"],
        cat_dl_link = config["CAT_DL_LINK"],
        REF_DB_ROOT_DIR = config["REF_DB_ROOT_DIR"]
    log:
        logs_dir + "setup-CAT-db.log"
    benchmark:
        benchmarks_dir + "setup_CAT_db-benchmarks.tsv"
    shell:
        """
        mkdir -p {params.REF_DB_ROOT_DIR}

        printf "### Setting up CAT reference database ###\n\n" > {log} 2>&1

        printf "  Downloading reference db:\n\n" >> {log} 2>&1
        curl -L --insecure -C - -o {params.compressed_cat} {params.cat_dl_link} >> {log} 2>&1

        printf "\n\n  Extracting reference db:\n\n" >> {log} 2>&1
        tar -xvzf {params.compressed_cat} -C {params.REF_DB_ROOT_DIR} >> {log} 2>&1

        rm {params.compressed_cat} {params.compressed_nr_faa}

        touch {output.cat_db_trigger}
        """


rule clean_all:
    shell:
        """
        rm -rf {dirs_to_create}
        """
