###################################################################################################
## Snakefile for the "bit" genome summarize workflow.                                            ##
## Version 1.0.4                                                                                 ##
## bit: https://github.com/AstrobioMike/bit                                                      ##
##                                                                                               ##
## If you use this workflow in a publication, please consider citing :)                          ##
##   Lee M. bit: a multipurpose collection of bioinformatics tools. F1000Research 2022, 11:122.  ##
##   https://doi.org/10.12688/f1000research.79530.1                                              ##
###################################################################################################

configfile: "config.yaml"

genome_IDs, = glob_wildcards(config["genomes_dir"] + "/{id}" + config["assembly_extension"])

# checking if any input fastas were found
if len(genome_IDs) == 0:
    print("\n    No assembly fasta files were found, is the config.yaml file set appropriately?\n")
    exit(1)


# making directory for log files
try:
    os.mkdir(config["logs_dir"])
except:
    pass



rule all:
    input:
        str(config["output_prefix"]) + "-genome-summaries.tsv"
    params:
        checkm2_output = config["checkm2_output_dir"],
        gtdbtk_output = config["gtdbtk_output_dir"],
        keep_all_files = config["keep_all_files"]
    shell:
        """
        if [ {params.keep_all_files} != "YES" ]; then

            rm -rf {params.checkm2_output} {params.gtdbtk_output} checkm2-results.tsv gtdb-taxonomy.tsv summary-stats.tsv

        fi
        """


rule gen_summary_stats:
    conda:
        "envs/bit.yaml"
    input:
        expand(config["genomes_dir"] + "/{genome_ID}"  + config["assembly_extension"], genome_ID = genome_IDs)
    output:
        "summary-stats.tsv"
    shell:
        """
        bit-summarize-assembly {input} -o {output}
        """


## processing if eukarya
if config["is_euk"]:

    rule run_eukcc:
        conda:
            "envs/eukcc.yaml"
        input:
            genome = config["genomes_dir"] + "/{genome_ID}" + config["assembly_extension"],
            eukcc_db_trigger = config["DIR_HOLDING_eukcc_DIR"] + "/" + config["eukcc_db_dir"] + "/" + config["eukcc_TRIGGER_FILE"]
        params:
            output_dir = "{genome_ID}-eukcc-out",
            eukcc_db_dir = config["DIR_HOLDING_eukcc_DIR"] + "/" + config["eukcc_db_dir"]
        resources:
            cpus = config["threads"],
            mem_mb = config["eukcc_memory_resources"]
        log:
            config["logs_dir"] + "{genome_ID}-eukcc.log"
        output:
            est_tab = "{genome_ID}-eukcc-estimates.tsv",
            AA_seqs = "{genome_ID}-prots.faa"
        shell:
            """
            eukcc single --db {params.eukcc_db_dir} --threads {resources.cpus} --out {params.output_dir} --keep {input.genome} > {log} 2>&1

            # getting comp./redund. estimates
            paste <( echo "{wildcards.genome_ID}" ) <( tail -n +2 {params.output_dir}/eukcc.csv | head -n 1 | cut -f 2,3 ) > {output.est_tab}

            # getting protein seqs to pass to CAT formatted in a way it likes so they can be tracked back to the contigs
            sed 's/metaeuk_//' {params.output_dir}/workdir/metaeuk/*_metaeuk_cleaned.faa > {output.AA_seqs}

            rm -rf {params.output_dir}
            """


    rule combine_eukcc_estimates:
        input:
            expand("{genome_ID}-eukcc-estimates.tsv", genome_ID = genome_IDs)
        output:
            "combined-eukcc-estimates.tsv"
        shell:
            """
            printf "Assembly\\tEst. Comp.\\tEst. Redund.\\n" > {output}
            cat {input} >> {output}
            rm {input}
            """


    rule run_CAT:
        conda:
            "envs/cat.yaml"
        input:
            genome = config["genomes_dir"] + "/{genome_ID}" + config["assembly_extension"],
            AA_seqs = "{genome_ID}-prots.faa",
            cat_db_trigger = config["DIR_HOLDING_CAT_DIR"] + "/" + config["CAT_DIR"] + "/" + config["CAT_TRIGGER_FILE"]
        params:
            tmp_out_prefix = "{genome_ID}-tax-dir.tmp",
            tmp_tax = "{genome_ID}-tax.tmp",
            cat_db = config["DIR_HOLDING_CAT_DIR"] + "/" + config["CAT_DIR"] + "/" + config["CAT_DB"],
            cat_tax = config["DIR_HOLDING_CAT_DIR"] + "/" + config["CAT_DIR"] + "/" + config["CAT_TAX"],
            num_threads = config["threads"],
            assembly_extension = config["assembly_extension"]
        resources:
            cpus = config["threads"],
            mem_mb = config["CAT_memory_resources"]        
        log:
            config["logs_dir"] + "{genome_ID}-CAT.log"
        output:
            "{genome_ID}-tax.tsv"
        shell:
            """
            CAT bin -b {input.genome} -p {input.AA_seqs} -d {params.cat_db} -t {params.cat_tax} -n {params.num_threads} -o {params.tmp_out_prefix} -f 0.5 -r 3 --top 4 --I_know_what_Im_doing > {log} 2>&1

            # adding names
            CAT add_names -i {params.tmp_out_prefix}.bin2classification.txt -o {params.tmp_tax} -t {params.cat_tax} --only_official > {log} 2>&1

            # formatting classification
            grep -v "^#" {params.tmp_tax} | awk -F $'\\t' ' BEGIN {{ OFS=FS }} {{ if ( $2 == "taxid assigned" ) {{ print $1,$6,$7,$8,$9,$10,$11,$12 }} \
            else {{ print $1,"NA","NA","NA","NA","NA","NA","NA" }} }} ' | head -n 1 | \
            sed 's/: [0-9\.]*//g' | sed 's/not classified/NA/g' | sed 's/no support/NA/g' | sed 's/{params.assembly_extension}//' > {output}

            rm -rf {wildcards.genome_ID}*tmp* {input.AA_seqs}
            """


    rule combine_euk_tax_outputs:
        input:
            expand("{genome_ID}-tax.tsv", genome_ID = genome_IDs)
        output:
            "CAT-taxonomies.tsv"
        shell:
            """
            printf "Assembly\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n" > {output}
            cat {input} >> {output}
            rm {input}
            """


    rule combine_euk_outputs:
        input:
            assembly_stats_tab = "summary-stats.tsv",
            eukcc_tab = "combined-eukcc-estimates.tsv",
            tax_tab = "CAT-taxonomies.tsv"
        output:
            str(config["output_prefix"]) + "-genome-summaries.tsv"
        shell:
            """
            python scripts/combine-euk-outputs.py -s {input.assembly_stats_tab} -c {input.eukcc_tab} -t {input.tax_tab} -o {output}
            rm {input}
            """

# processing if not eukarya
if not config["is_euk"]:

    rule run_checkm2:
        conda:
            "envs/checkm2.yaml"
        input:
            checkm2_db_trigger = config["CHECKM2_DATA_PATH"] + "/" + config["CHECKM2_TRIGGER_FILE"]
        params:
            pplacer_cpus = config["gtdb_tk_pplacer_cpus"],
            genomes_dir = config["genomes_dir"],
            extension = config["assembly_extension"],
            output_dir = config["checkm2_output_dir"],
            checkm2_db_dir = config["CHECKM2_DATA_PATH"],
            checkm2_db_filename = config["CHECKM2_DB_FILENAME"]
        resources:
            cpus = config["threads"],
            mem_mb = config["checkm2_memory_resources"]
        output:
            out_dir = directory(config["checkm2_output_dir"]),
            checkm2_results_tab = "checkm2-results.tsv"
        log:
            config["logs_dir"] + "checkm2.log"
        shell:
            """
            # setting db dir variable if not set in the environment for some reason
                # snakemake uses bash strict mode, so need this so it won't fail if the variable isn't set
            set +u
            if [ -z "${{CHECKM2DB}}" ] || [ "${{CHECKM2DB}}" != "{params.checkm2_db_dir}/{params.checkm2_db_filename}" ]; then

                export CHECKM2DB={params.checkm2_db_dir}/{params.checkm2_db_filename}

            fi
            set -u

            checkm2 predict -i {params.genomes_dir} -x {params.extension} -t {resources.cpus} --output-directory {output.out_dir} > {log} 2>&1

            # making copy of primary output file
            cp {output.out_dir}/quality_report.tsv {output.checkm2_results_tab}
            """


    rule gtdbtk_classify:
        conda:
            "envs/gtdb-tk.yaml"
        input:
            gtdbtk_db_trigger = config["GTDB_DATA_PATH"] + "/" + config["GTDB_TRIGGER_FILE"]
        params:
            pplacer_cpus = config["gtdb_tk_pplacer_cpus"],
            genomes_dir = config["genomes_dir"],
            extension = config["assembly_extension"],
            gtdbtk_db_dir = config["GTDB_DATA_PATH"]
        resources:
            cpus = config["threads"],
            mem_mb = config["gtdbtk_memory_resources"]
        output:
            directory(config["gtdbtk_output_dir"])
        log:
            config["logs_dir"] + "gtdb.log"
        shell:
            """
            # setting db dir variable if not set in the environment for some reason
                # snakemake uses bash strict mode, so need this so it won't fail if the variable isn't set
            set +u
            if [ -z "${{GTDBTK_DATA_PATH}}" ] || [ "${{GTDBTK_DATA_PATH}}" != "{params.gtdbtk_db_dir}" ]; then

                export GTDBTK_DATA_PATH={params.gtdbtk_db_dir}

            fi
            set -u

            gtdbtk classify_wf -x {params.extension} --genome_dir {params.genomes_dir} --out_dir {output} --cpus {resources.cpus} --pplacer_cpus {params.pplacer_cpus} --skip_ani_screen > {log} 2>&1
            """


    rule combine_gtdb_classifications:
        input:
            config["gtdbtk_output_dir"]
        output:
            gtdb_tax_tab = "gtdb-taxonomy.tsv"
        shell:
            "cut -f 1,2 {input}/classify/*summary.tsv > {output}"


    rule combine_outputs:
        input:
            assembly_stats_tab = "summary-stats.tsv",
            checkm2_results_tab = "checkm2-results.tsv",
            gtdb_tax_tab = "gtdb-taxonomy.tsv"
        output:
            str(config["output_prefix"]) + "-genome-summaries.tsv"
        shell:
            "python scripts/combine-outputs.py -s {input.assembly_stats_tab} -c {input.checkm2_results_tab} -t {input.gtdb_tax_tab} -o {output}"


rule clean_all:
    shell:
        """
        rm -rf checkm2-output/ logs/ checkm2-results.tsv gtdb-taxonomy.tsv gtdb-tk-output/ summary-stats.tsv *-genome-summaries.tsv
        """


### database checking and setup rules if needed ###

rule setup_checkm2_db:
    """
    This rule checks for the checkm2 db and downloads if needed.
    """

    conda:
        "envs/checkm2.yaml"

    output:
        checkm2_db_trigger = config["CHECKM2_DATA_PATH"] + "/" + config["CHECKM2_TRIGGER_FILE"]
    params:
        checkm2_db_dir = config["CHECKM2_DATA_PATH"],
        checkm2_db_filename = config["CHECKM2_DB_FILENAME"]
    log:
        config["logs_dir"] + "setup-checkm2-db.log"
    shell:
        """
        # adding wanted location to this conda env PATH (checkm2 looks in the CHECKM2DB variable),
            # so will be set when the conda environment is started from now on
        mkdir -p ${{CONDA_PREFIX}}/etc/conda/activate.d/
        echo 'export CHECKM2DB={params.checkm2_db_dir}/{params.checkm2_db_filename}' >> ${{CONDA_PREFIX}}/etc/conda/activate.d/set_env_vars.sh
        
        # downloading ref db (will move to where we want it next)
        checkm2 database --download > {log} 2>&1

        # moving from default download location to wanted location
        mkdir -p {params.checkm2_db_dir}
        mv ~/databases/CheckM2_database/* {params.checkm2_db_dir}

        # placing file so that the workflow knows in the future this is done already
        touch {output.checkm2_db_trigger}
        """


rule setup_gtdbtk_db:
    """
    This rule checks for the gtdb-tk db (minimally currently) and downloads if needed.
    """

    conda:
        "envs/gtdb-tk.yaml"
    output:
        gtdbtk_db_trigger = config["GTDB_DATA_PATH"] + "/" + config["GTDB_TRIGGER_FILE"]
    params:
        gtdbtk_db_dir = config["GTDB_DATA_PATH"]
    log:
        config["logs_dir"] + "setup-gtdbtk-db.log"
    shell:
        """
        mkdir -p {params.gtdbtk_db_dir}

        # adding wanted location to this conda env PATH (gtdb-tk looks in the GTDBTK_DATA_PATH variable),
            # so will be set when the conda environment is started from now on
        mkdir -p ${{CONDA_PREFIX}}/etc/conda/activate.d/
        echo 'export GTDBTK_DATA_PATH={params.gtdbtk_db_dir}' >> ${{CONDA_PREFIX}}/etc/conda/activate.d/set_env_vars.sh
        # but still needs to be set for this particular session that is downloading and setting up the db
        export GTDBTK_DATA_PATH={params.gtdbtk_db_dir}

        # now downloading
        download-db.sh {params.gtdbtk_db_dir} > {log} 2>&1
        # placing file so that the workflow knows in the future this is done
        touch {output.gtdbtk_db_trigger}
        """


rule setup_CAT_db:
    """
    This rule checks for the CAT reference database, and downloads if needed.
    """

    conda:
        "envs/cat.yaml"
    output:
        cat_db_trigger = config["DIR_HOLDING_CAT_DIR"] + "/" + config["CAT_DIR"] + "/" + config["CAT_TRIGGER_FILE"]
    params:
        cat_db_dir = config["DIR_HOLDING_CAT_DIR"] + "/" + config["CAT_DIR"],
        compressed_cat = config["DIR_HOLDING_CAT_DIR"] + "/" + config["CAT_DL_FILE"],
        compressed_nr_faa = config["CAT_DIR"] + "/" + config["CAT_DB"] + "/" + config["CAT_COMPRESSED_NR_FAA"],
        cat_dl_link = config["CAT_DL_LINK"],
        DIR_HOLDING_CAT_DIR = config["DIR_HOLDING_CAT_DIR"]
    log:
        config["logs_dir"] + "setup-CAT-db.log"
    shell:
        """
        mkdir -p {params.DIR_HOLDING_CAT_DIR}
        printf "### Setting up CAT reference database ###\n\n" > {log} 2>&1
        printf "  Downloading reference db:\n\n" >> {log} 2>&1
        curl -L -C - -o {params.compressed_cat} {params.cat_dl_link} >> {log} 2>&1
        printf "\n\n  Extracting reference db:\n\n" >> {log} 2>&1
        tar -xvzf {params.compressed_cat} -C {params.DIR_HOLDING_CAT_DIR} >> {log} 2>&1
        rm {params.compressed_cat} {params.compressed_nr_faa}
        touch {output.cat_db_trigger}
        """


rule setup_eukcc_db:
    """
    This rule checks for the eukcc database, and downloads if needed.
    """

    conda:
        "envs/eukcc.yaml"
    output:
        eukcc_db_trigger = config["DIR_HOLDING_eukcc_DIR"] + "/" + config["eukcc_db_dir"] + "/" + config["eukcc_TRIGGER_FILE"]
    params:
        compressed_eukcc = config["DIR_HOLDING_eukcc_DIR"] + "/" + config["eukcc_DL_FILE"],
        eukcc_dl_link = config["eukcc_DL_LINK"],
        DIR_HOLDING_eukcc_DIR = config["DIR_HOLDING_eukcc_DIR"]
    log:
        config["logs_dir"] + "setup-eukcc-db.log"
    shell:
        """
        mkdir -p {params.DIR_HOLDING_eukcc_DIR}
        printf "### Setting up eukcc reference database ###\n\n" > {log} 2>&1
        printf "  Downloading reference db:\n\n" >> {log} 2>&1
        curl -L -C - -o {params.compressed_eukcc} {params.eukcc_dl_link} >> {log} 2>&1
        printf "\n\n  Extracting reference db:\n\n" >> {log} 2>&1
        tar -xvzf {params.compressed_eukcc} -C {params.DIR_HOLDING_eukcc_DIR} >> {log} 2>&1
        rm {params.compressed_eukcc}
        touch {output.eukcc_db_trigger}
        """
