print(r'''
________             _______________              
___  __ )_____ _________  ____/__  /___  _____  __
__  __  |  __ `/  ___/_  /_   __  /_  / / /_  |/_/
_  /_/ // /_/ // /__ _  __/   _  / / /_/ /__>  <  
/_____/ \__,_/ \___/ /_/      /_/  \__,_/ /_/|_|  
                                                  
BacFlux v1.2.2 — Genomic Analysis of Bacterial Illumina Reads


Livio Antonielli, April 2026
''')

# Modules:
import os
import sys
from urllib.parse import urlparse

# Configuration file:
configfile: 'config/config.yaml'

# Path to DBs and input/output directories in config file:
workdir: config['directories']['output_dir']
INPUTDIR = config['directories']['input_dir']
BAKTADB = config['directories']['bakta_db']
BLASTDB = config['directories']['blast_db']
DMNDDB = config['directories']['eggnog_db']
GTDBTKDB = config['directories']['gtdbtk_db']
PLATONDB = config['directories']['platon_db']

# AMR DBs (ABRicate):
DATABASES = ["argannot", "card", "ecoh", "ecoli_vf", "megares", "ncbi", "resfinder", "vfdb"]

# Hardware resources in config file:
CPUS = config['resources']['threads']
RAM = config['resources']['ram_gb']

# Define not allowed characters
not_allowed_chars = set("_*#@%^/! ?&:;|<>")

def bakta_locus_tag(sample):
    tag = "".join(char if char.isalnum() or char in "_.-" else "_" for char in str(sample))
    tag = tag[:24]
    if not tag:
        raise ValueError(f"Unable to derive a valid Bakta locus tag from sample name '{sample}'.")
    return tag

# Sanity check of genus parameter:
if config['parameters'] is not None and "genus" in config['parameters']:
    genus = config['parameters']["genus"]
    if genus is not None and len(genus) > 0:
        print(f"The 'genus' parameter is specified in the config file with value: '{genus}'.")
    else:
        print("The 'genus' parameter value is not specified in the config file and will be inferred automatically.")
else:
    print("The 'genus' parameter is not present in the config file and contigs won't be filtered accordingly.")

# Sanity check of checkv db link parameter:
global id
id = "checkv-db-v1.5"

links = config.get("links") or {}

if "checkv_link" not in links or links["checkv_link"] is None or str(links["checkv_link"]).strip() == "":
    # Optional parameter: fall back to automatic download
    print(
        "The link to the CheckV database is not specified (or empty). "
        "CheckV will download the database automatically."
    )
else:
    link = str(links["checkv_link"]).strip()
    path = urlparse(link).path
    db = os.path.basename(path)

    # Hard failure only for truly invalid input
    if not db.endswith(".tar.gz"):
        raise ValueError(
            f"Invalid checkv_link: expected a .tar.gz archive, got '{db}'."
        )

    id = os.path.splitext(os.path.splitext(db)[0])[0]
    print(f"Using CheckV database from link: '{link}' (db_id='{id}').")

# Import FASTQ files from input dir:
SAMPLES, EXTENSIONS = glob_wildcards(os.path.join(INPUTDIR, '{sample}_R1.{extn}'))

if len(SAMPLES) == 0:
    sys.stderr.write(f"No files in {INPUTDIR}. Please, check directory.\n")
    sys.exit(0)
else:
    for sample in sorted(SAMPLES):
        if any(char in sample for char in not_allowed_chars):
            sys.stderr.write(f"Sample name '{sample}' contains not allowed characters.\n")
            sys.exit(0)
        print(f"Sample {sample} in process...")

# Check if files have nonunique extension:
for extension in EXTENSIONS:
  if extension.endswith(("fastq", "fq", "fastq.gz", "fq.gz")):
    if len(set(EXTENSIONS)) != 1:
      sys.stderr.write("More than one type of file extension detected\n\t")
      sys.stderr.write("\n\t".join(set(EXTENSIONS)))
      sys.exit(0)
  else:
    sys.stderr.write("\nFile format not recognized.\n")
    sys.exit(0)

# Create sample objects:
EXTN = EXTENSIONS[0]
R1 = '{sample}_R1.' + EXTN
R2 = '{sample}_R2.' + EXTN

rule all:
  input:
    fastp_json = expand("01.pre-processing/{sample}_fastp.json", sample = SAMPLES),
    qualimap_dir = expand("03.post-processing/mapping_evaluation/{sample}", sample = SAMPLES),
    quast_dir = expand("03.post-processing/assembly_evaluation/{sample}", sample = SAMPLES),
    checkm_dir = expand("03.post-processing/completeness_evaluation/{sample}", sample = SAMPLES),
    checkm_stats = expand("03.post-processing/completeness_evaluation/{sample}/checkm_stats.tsv", sample = SAMPLES),
    checkm_lineage = expand("03.post-processing/completeness_evaluation/{sample}/lineage.ms", sample = SAMPLES),
    gtdbtk_dir = expand("04.taxonomy/{sample}", sample = SAMPLES),
    bakta_dir = expand("05.annotation/bakta/{sample}", sample = SAMPLES),
    eggnog_dir = expand("05.annotation/eggnog/{sample}", sample = SAMPLES),
    antismash_dir = expand("05.annotation/antismash/{sample}", sample = SAMPLES),
    dbcan_db = "05.annotation/dbcan/dbcan_db_v5.1.2",
    dbcan_dir = expand("05.annotation/dbcan/{sample}", sample = SAMPLES),
    amr_tab = expand("06.AMR/ABRicate/{sample}/{db}.tsv", sample = SAMPLES, db = DATABASES),
    amr_summary = expand("06.AMR/ABRicate/{sample}/AMR_summary.txt", sample = SAMPLES),
    covstats = expand("06.AMR/AMR_mapping/{sample}/{sample}_covstats.tsv", sample = SAMPLES),
    amr_legend = expand("06.AMR/AMR_mapping/{sample}/{sample}_AMR_legend.tsv", sample = SAMPLES),
    plasmid_dir = expand("07.plasmids/{sample}", sample = SAMPLES),
    plasmids = expand("07.plasmids/{sample}/verified_plasmids.txt", sample = SAMPLES),
    vs2_dir = expand("08.phages/virsorter/{sample}", sample = SAMPLES),
    checkv_dir = expand("08.phages/checkv/{sample}", sample = SAMPLES),
    multiqc_dir = "09.report"

rule download_phix:
  output:
    phix = temp("01.pre-processing/phix.fna.gz")
  params:
    link = config['links']['phix_link']
  message:
    "--- Download PhiX genome from NCBI. ---"
  log:
    "logs/download_phix.log"  
  shell:
    """
    wget {params.link} -O {output.phix} > {log} 2>&1
    """

rule build_phix:
  input:
    phix = "01.pre-processing/phix.fna.gz"
  output:
    idx1 = temp("01.pre-processing/phix.1.bt2"),
    idx2 = temp("01.pre-processing/phix.2.bt2"),
    idx3 = temp("01.pre-processing/phix.3.bt2"),
    idx4 = temp("01.pre-processing/phix.4.bt2"),
    ridx1 = temp("01.pre-processing/phix.rev.1.bt2"),
    ridx2 = temp("01.pre-processing/phix.rev.2.bt2")
  params:
    basename = "01.pre-processing/phix"
  conda:
    "envs/bowtie.yaml"
  message:
    "--- Bowtie2: Build PhiX genome db. ---"
  log:
    "logs/build_phix.log"
  shell:
    """
    bowtie2-build \
    {input.phix} \
    {params.basename} \
    > {log} 2>&1
    """

rule map_phix:
  input:
    idx1 = "01.pre-processing/phix.1.bt2",
    idx2 = "01.pre-processing/phix.2.bt2",
    idx3 = "01.pre-processing/phix.3.bt2",
    idx4 = "01.pre-processing/phix.4.bt2",
    ridx1 = "01.pre-processing/phix.rev.1.bt2",
    ridx2 = "01.pre-processing/phix.rev.2.bt2",
    r1 = os.path.join(INPUTDIR, R1),
    r2 = os.path.join(INPUTDIR, R2)
  output:
    sam = temp("01.pre-processing/{sample}_contam.sam"),
    r1 = temp("01.pre-processing/{sample}.1.fastq"),
    r2 = temp("01.pre-processing/{sample}.2.fastq")
  params:
    db = temp("01.pre-processing/phix"),
    basename = "01.pre-processing/{sample}.fastq"
  resources:
    cpus = CPUS
  conda:
    "envs/bowtie.yaml"
  message:
    "--- Bowtie2: Map reads against PhiX genome db. ---"
  log:
    "logs/map_phix_{sample}.log"
  shell:
    """
    bowtie2 \
      -x {params.db} \
      -1 {input.r1} -2 {input.r2} \
      --threads {resources.cpus} \
      --un-conc {params.basename} \
      -S {output.sam} \
      --local \
      > {log} 2>&1
    """

rule trim_adapters:
  input:
    r1 = "01.pre-processing/{sample}.1.fastq",
    r2 = "01.pre-processing/{sample}.2.fastq"
  output:
    r1 = temp("01.pre-processing/{sample}_trim_R1.fastq"),
    r2 = temp("01.pre-processing/{sample}_trim_R2.fastq"),
    html = "01.pre-processing/{sample}_fastp.html",
    json = "01.pre-processing/{sample}_fastp.json"
  resources:
    cpus = 16
  conda:
    "envs/fastp.yaml"
  message:
    "--- Fastp: Remove adapters and quality filter. ---"
  log:
    "logs/trim_adapters_{sample}.log"
  shell:
    """
    fastp \
      --detect_adapter_for_pe \
      --length_required 100 \
      --cut_front \
      --cut_right \
      --thread {resources.cpus} \
      --verbose \
      -i {input.r1} -I {input.r2} \
      -o {output.r1} -O {output.r2} \
      -j {output.json} \
      -h {output.html} > {log} 2>&1
    """

rule genome_assembly:
  input:
    r1 = "01.pre-processing/{sample}_trim_R1.fastq",
    r2 = "01.pre-processing/{sample}_trim_R2.fastq"
  output:
    dir = directory("02.assembly/{sample}"),
    contigs = "02.assembly/{sample}/contigs.fasta"
  resources:
    cpus = CPUS,
    ram = RAM
  conda:
    "envs/spades.yaml"
  message:
    "--- SPAdes: genome assembly. ---"
  log:
    "logs/genome_assembly_{sample}.log"
  priority: 10
  shell:
    """
    OMP_NUM_THREADS={resources.cpus} \
    spades.py -k 21,33,55,77,99,127 --isolate \
      --pe1-1 {input.r1} \
      --pe1-2 {input.r2} \
      -o {output.dir} \
      -t {resources.cpus} \
      -m {resources.ram} > {log} 2>&1
    """

FASTA_LIN_CMD = r"""{if(NR==1) {printf "%s\n", $0} else {if(/^>/) {printf "\n%s\n", $0} else {printf $0}}}"""
FASTA_SEL_CMD = r"""{if(/^>/ && $6>=2.0 && $4>=500) {printf "%s\n", $0; getline; print}}"""

rule filter_contigs:
  input:
    contigs = "02.assembly/{sample}/contigs.fasta"    
  output:
    contigs = "02.assembly/{sample}/contigs_filt.fasta"
  message:
    "Remove short contigs <500 bp and low coverage <2x."
  priority: 9
  shell:
    """
    cat {input.contigs} | \
    awk {FASTA_LIN_CMD:q} | \
    awk -F"_" {FASTA_SEL_CMD:q} > {output.contigs}
    """

rule index_contigs:
  input:
    contigs = "02.assembly/{sample}/contigs_filt.fasta"
  output:
    idx1 = temp("03.post-processing/{sample}_contigs.1.bt2"),
    idx2 = temp("03.post-processing/{sample}_contigs.2.bt2"),
    idx3 = temp("03.post-processing/{sample}_contigs.3.bt2"),
    idx4 = temp("03.post-processing/{sample}_contigs.4.bt2"),
    ridx1 = temp("03.post-processing/{sample}_contigs.rev.1.bt2"),
    ridx2 = temp("03.post-processing/{sample}_contigs.rev.2.bt2")
  params:
    basename = "03.post-processing/{sample}_contigs"
  conda:
    "envs/bowtie.yaml"
  message:
    "--- Bowtie2: Build contig db. ---"
  log:
    "logs/index_contigs_{sample}.log"
  priority: 8
  shell:
    """
    bowtie2-build \
    -f {input.contigs} \
    {params.basename} > {log} 2>&1
    """

rule map_contigs:
  input:
    idx1 = "03.post-processing/{sample}_contigs.1.bt2",
    idx2 = "03.post-processing/{sample}_contigs.2.bt2",
    idx3 = "03.post-processing/{sample}_contigs.3.bt2",
    idx4 = "03.post-processing/{sample}_contigs.4.bt2",
    ridx1 = "03.post-processing/{sample}_contigs.rev.1.bt2",
    ridx2 = "03.post-processing/{sample}_contigs.rev.2.bt2",
    r1 = "01.pre-processing/{sample}_trim_R1.fastq",
    r2 = "01.pre-processing/{sample}_trim_R2.fastq"
  output:
    bam = temp("03.post-processing/{sample}_map.bam"),
    bai = temp("03.post-processing/{sample}_map.bam.bai"),
    csi = temp("03.post-processing/{sample}_map.bam.csi")
  params:
    db = temp("03.post-processing/{sample}_contigs")
  resources:
    cpus = CPUS
  conda:
    "envs/bowtie.yaml"
  message:
    "--- Bowtie2: Map reads against contigs. ---"
  log:
    "logs/map_contigs_{sample}.log"
  priority: 8
  shell:
    """
    bowtie2 \
      -x {params.db} \
      -1 {input.r1} \
      -2 {input.r2} \
      -p {resources.cpus} \
      -t 2> {log} | \
    samtools view \
      -@ {resources.cpus} \
      -hbS - | \
    samtools sort \
      -@ {resources.cpus} \
      --write-index \
      -o {output.bam} - >> {log} 2>&1

    samtools index \
      -@ {resources.cpus} \
      -b {output.bam} >> {log} 2>&1
    """

rule map_evaluation:
  input:
    bam = "03.post-processing/{sample}_map.bam"
  output:
    qualimap_dir = directory("03.post-processing/mapping_evaluation/{sample}")
  conda:
    "envs/qualimap.yaml"
  message:
    "--- Qualimap: Mapping evaluation. ---"
  log:
    "logs/map_evaluation_{sample}.log"
  shell:
    """
    qualimap bamqc \
      -bam {input.bam} \
      -outdir {output.qualimap_dir} \
      -outformat html > {log} 2>&1
    """

rule blast_contigs:
  input:
    contigs = "02.assembly/{sample}/contigs_filt.fasta"
  output:
    blast = "03.post-processing/contaminants/{sample}/blastout"
  params:
    dir = BLASTDB,
    db = os.path.join(BLASTDB, config['parameters']['nt_version'])
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/blast.yaml"
  message:
    "--- BLAST: Contigs against NCBI nt db. ---"
  log:
    "logs/blast_contigs_{sample}.log"
  priority: 7
  shell:
    """
    BLASTDB={params.dir} \
    blastn \
      -task megablast \
      -query {input.contigs} \
      -db {params.db} \
      -outfmt '6 qseqid staxids bitscore pident evalue length qlen slen qcovs qcovhsp sskingdoms scomnames sscinames sblastnames stitle' \
      -num_threads {resources.cpus} \
      -evalue 1e-5 -max_target_seqs 50 -max_hsps 5 \
      -out {output.blast} > {log} 2>&1
    """

rule blob_json:
  input:
    contigs = "02.assembly/{sample}/contigs_filt.fasta",
    bam = "03.post-processing/{sample}_map.bam",
    bai = "03.post-processing/{sample}_map.bam.bai",
    blast = "03.post-processing/contaminants/{sample}/blastout",
    nodes = os.path.join(BLASTDB, "nodes.dmp"),
    names = os.path.join(BLASTDB, "names.dmp")
  output:
    json = temp("03.post-processing/contaminants/{sample}/blob.blobDB.json"),
    cov = temp("03.post-processing/contaminants/{sample}/blob.{sample}_map.bam.cov")
  params:
    basename = "03.post-processing/contaminants/{sample}/blob"
  conda:
    "envs/blobtools.yaml"
  message:
    "--- BlobTools: Screen BLAST hits for contaminants. ---"
  log:
    "logs/blob_table_{sample}.log"
  priority: 7
  shell:
    """
    blobtools create \
      -i {input.contigs} \
      -b {input.bam} \
      -t {input.blast} \
      --nodes {input.nodes} \
      --names {input.names} \
      -o {params.basename} > {log} 2>&1
    """

rule blob_table:
  input:
    json = "03.post-processing/contaminants/{sample}/blob.blobDB.json"
  output:
    bestscore = "03.post-processing/contaminants/{sample}/bestscore.blob.blobDB.table.txt"
  params:
    basename = "03.post-processing/contaminants/{sample}/bestscore"
  conda:
    "envs/blobtools.yaml"
  message:
    "--- BlobTools: Collapse taxonomic assignment of BLAST hits according to sum of best scores. ---"
  log:
    "logs/blob_table_{sample}.log"
  priority: 7
  shell:
    """
    blobtools view \
      --input {input.json} \
      --out {params.basename} \
      --taxrule bestsum \
      --rank all \
      --hits >> {log} 2>&1
    """

# Execute either one rule or another according to presence/absence of 'genus' parameter
if config.get('parameters') is not None and "genus" in config['parameters'] and config['parameters']['genus'] is not None and len(config['parameters']['genus']) > 0:
  rule:
    input:
      bestscore = "03.post-processing/contaminants/{sample}/bestscore.blob.blobDB.table.txt",
      contigs = "02.assembly/{sample}/contigs_filt.fasta"
    output:
      abund = "03.post-processing/contaminants/{sample}/{sample}_composition.txt",
      list = "03.post-processing/contaminants/{sample}/contigs.list",
      contigs = "02.assembly/{sample}/contigs_sel.fasta"
    params:
      genus = config['parameters']['genus']    
    priority: 6
    shell:
      """
      for i in $(cat {input.bestscore} | sed '1,11d' | cut -f 22 | sort -u); do \
        cat {input.bestscore} | sed '1,11d' | awk -v var=$i 'BEGIN {{printf "%s%s", var, ": "}} $22 == var {{count++}} END {{printf "%.2f\\n", count/NR}}'; \
      done > {output.abund}
      
      echo "Sample {wildcards.sample} composition:"
      cat {output.abund}
      
      awk -v var="{params.genus}" 'tolower($22) ~ tolower("[:alpha:]*"var) {{print $1}}' {input.bestscore} > {output.list}
      
      grep -A1 -f {output.list} {input.contigs} | sed '/--/d' > {output.contigs}
      """
elif config.get('parameters') is not None and "genus" in config['parameters'] and (config['parameters']['genus'] is None or len(config['parameters']['genus']) == 0):
  rule:
    input:
      bestscore = "03.post-processing/contaminants/{sample}/bestscore.blob.blobDB.table.txt",
      contigs = "02.assembly/{sample}/contigs_filt.fasta"
    output:
      abund = "03.post-processing/contaminants/{sample}/{sample}_composition.txt",
      list = "03.post-processing/contaminants/{sample}/contigs.list",
      contigs = "02.assembly/{sample}/contigs_sel.fasta"
    priority: 6
    shell:
      """
      for i in $(cat {input.bestscore} | sed '1,11d' | cut -f 22 | sort -u); do \
        cat {input.bestscore} | sed '1,11d' | awk -v var=$i 'BEGIN {{printf "%s%s", var, ": "}} $22 == var {{count++}} END {{printf "%.2f\\n", count/NR}}'; \
      done > {output.abund}
      
      echo "Sample {wildcards.sample} composition:"
      cat {output.abund}
      for i in $(cat {output.abund} | sort -t':' -k2 -nr | cut -d':' -f1 | sed -n '1p' | sed -e 's/Para//;s/Pseudo//;s/Paen//;s/Paeni//;s/Brady//;s/Meso//;s/Neo//;s/Sino//;s/Aeri//;s/Caldi//;s/Geo//' | tr '[:upper:]' '[:lower:]'); do \
        awk -v var="$i" 'tolower($22) ~ tolower("[:alpha:]*"var) {{print $1}}' {input.bestscore}; \
      done > {output.list}
      
      grep -A1 -f {output.list} {input.contigs} | sed '/--/d' > {output.contigs}
      """
else:
  rule:
    input:
      bestscore = "03.post-processing/contaminants/{sample}/bestscore.blob.blobDB.table.txt",
      contigs = "02.assembly/{sample}/contigs_filt.fasta"
    output:
      abund = "03.post-processing/contaminants/{sample}/{sample}_composition.txt",
      list = "03.post-processing/contaminants/{sample}/contigs.list",
      contigs = "02.assembly/{sample}/contigs_sel.fasta"
    params:
      nohit = "no-hit"   
    priority: 6
    shell:
      """
      for i in $(cat {input.bestscore} | sed '1,11d' | cut -f 22 | sort -u); do \
        cat {input.bestscore} | sed '1,11d' | awk -v var=$i 'BEGIN {{printf "%s%s", var, ": "}} $22 == var {{count++}} END {{printf "%.2f\\n", count/NR}}'; \
      done > {output.abund}
      
      echo "Sample {wildcards.sample} composition:"
      cat {output.abund}
      
      awk -v var="{params.nohit}" 'tolower($22) !~ tolower("[:alpha:]*"var) {{print $1}}' {input.bestscore} > {output.list}
      
      grep -A1 -f {output.list} {input.contigs} | sed '/--/d' > {output.contigs}
      """

rule genome_assembly_evaluation:
  input:
    contigs = "02.assembly/{sample}/contigs_sel.fasta"
  output:
    quast_dir = directory("03.post-processing/assembly_evaluation/{sample}")
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/quast.yaml"
  message:
    "--- QUAST: Genome assembly evaluation. ---"
  log:
    "logs/assembly_evaluation_{sample}.log"
  shell:
    """
    quast \
      {input.contigs} \
      -o {output.quast_dir} \
      --no-icarus \
      -t {resources.cpus} > {log} 2>&1 
    """

rule completeness_and_contamination:
  input:
    contigs_sel = "02.assembly/{sample}/contigs_sel.fasta"
  output:
    checkm_dir = directory("03.post-processing/completeness_evaluation/{sample}"),
    checkm_stats = "03.post-processing/completeness_evaluation/{sample}/checkm_stats.tsv",
    checkm_lineage = "03.post-processing/completeness_evaluation/{sample}/lineage.ms"
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/checkm.yaml"
  message:
    "--- CheckM: Assessment of genome completeness and contamination. ---"
  log:
    "logs/completeness_and_contamination_{sample}.log"
  priority: 5
  shell:
    """
    cp {input.contigs_sel} {output.checkm_dir}/{wildcards.sample}.fasta
    
    checkm lineage_wf \
      -t {resources.cpus} \
      -x fasta {output.checkm_dir} \
      {output.checkm_dir} > {log} 2>&1
    
    checkm qa \
      -o 2 \
      -t {resources.cpus} \
      --tab_table \
      -f {output.checkm_stats} {output.checkm_lineage} {output.checkm_dir} >> {log} 2>&1
    """

rule taxonomic_assignment:
  input:
    checkm_dir = "03.post-processing/completeness_evaluation/{sample}",
  output:
    gtdbtk_dir = directory("04.taxonomy/{sample}")
  params:
    gtdbtk_db = GTDBTKDB,
    skani_sketch_dir = f"{GTDBTKDB}/skani_sketches_r226_skani0.3.1"
  resources:
    cpus = min(CPUS, 24),
    cpus_p = min(CPUS, 64)
  conda:
    "envs/gtdbtk.yaml"
  message:
    "--- GTDB-Tk: Taxonomic assignment. ---"
  log:
    "logs/taxonomic_assignment_{sample}.log"
  shell:
    """
    mkdir -p {output.gtdbtk_dir} {params.skani_sketch_dir}

    GTDBTK_DATA_PATH={params.gtdbtk_db:q} \
    gtdbtk classify_wf \
      -x fasta \
      --genome_dir {input.checkm_dir} \
      --out_dir {output.gtdbtk_dir} \
      --skani_sketch_dir {params.skani_sketch_dir:q} \
      --cpus {resources.cpus} \
      --pplacer_cpus {resources.cpus_p} > {log} 2>&1

    rm -rf {input.checkm_dir}/*.fasta
    """

rule annotation:
  input:
    contigs = "02.assembly/{sample}/contigs_sel.fasta",
    abund = "03.post-processing/contaminants/{sample}/{sample}_composition.txt"
  output:
    bakta_dir = directory("05.annotation/bakta/{sample}")
  params:
    bakta_db = BAKTADB,
    locus_tag = lambda wc: bakta_locus_tag(wc.sample)
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/bakta.yaml"
  message:
    "--- Bakta: Genome annotation. ---"
  log:
    "logs/annotation_{sample}.log"
  priority: 5
  shell:
    """
    for i in $(cat {input.abund} | sort -t':' -k2 -nr | cut -d':' -f1 | sed -n '1p'); do \
    bakta \
      --db {params.bakta_db} \
      --verbose \
      --genus $i \
      --species sp. \
      --strain {wildcards.sample} \
      --translation-table 11 \
      --min-contig-length 500 \
      --locus-tag {params.locus_tag} \
      --prefix {wildcards.sample} \
      --keep-contig-headers \
      --output {output.bakta_dir} \
      --threads {resources.cpus} \
      --force {input.contigs}; \
    done > {log} 2>&1
    """    

rule functional_annotation:
  input:
    bakta_dir = "05.annotation/bakta/{sample}"
  output:
    temp_dir = temp(directory("05.annotation/eggnog/{sample}/eggnog_tmp")),
    eggnog_dir = directory("05.annotation/eggnog/{sample}")
  params:
    dmnd_db = DMNDDB
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/eggnog-mapper.yaml"
  message:
    "--- EggNOG: Functional annotation. ---"
  log:
    "logs/functional_annotation_{sample}.log"
  priority: 4
  shell:
    """
    mkdir -p {output.temp_dir} {output.eggnog_dir}

    emapper.py \
      -i {input.bakta_dir}/{wildcards.sample}.faa \
      --output_dir {output.eggnog_dir} \
      --cpu {resources.cpus} \
      -m diamond \
      --data_dir {params.dmnd_db} \
      --output {wildcards.sample} \
      --temp_dir {output.temp_dir} \
      --override > {log} 2>&1
    """

rule secondary_metabolites_db:
  output:
    antismash_db = directory("05.annotation/antismash/databases")
  conda:
    "envs/antismash.yaml"
  message:
    "--- antiSMASH: database download. ---"
  log:
    "logs/secondary_metabolites_database.log"
  priority: 4
  shell:
    """
    download-antismash-databases \
      --database-dir {output.antismash_db} > {log} 2>&1
    """    

rule secondary_metabolites_analysis:
  input:
    antismash_db = "05.annotation/antismash/databases",
    bakta_dir = "05.annotation/bakta/{sample}"
  output:
    antismash_dir = directory("05.annotation/antismash/{sample}")
  params:
    taxon = 'bacteria',
    genefinding_tool = 'none'
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/antismash.yaml"
  message:
    "--- antiSMASH: secondary metabolite annotation. ---"
  log:
    "logs/secondary_metabolites_{sample}.log"
  priority: 4
  shell:
    """
    antismash \
      --output-dir {output.antismash_dir} \
      --output-basename {wildcards.sample} \
      --databases {input.antismash_db} \
      --taxon {params.taxon} \
      --genefinding-tool {params.genefinding_tool} \
      --cpus {resources.cpus} \
      {input.bakta_dir}/{wildcards.sample}.gbff > {log} 2>&1
    """    

# Specify directory where to place the dbCAN database
DBCAN_DB_DIR = "05.annotation/dbcan/dbcan_db_v5.1.2"

# Sentinel file for triggering the cazyme annotation rule afterwards
DBCAN_SENTINEL = DBCAN_DB_DIR + "/.verified.sha256"

rule cazyme_db_download:
  output:
    dbcan_db = directory(DBCAN_DB_DIR),
    dbcan_verified = DBCAN_SENTINEL
  params:
    dbcan_db_url = config["links"]["dbcan_link"],
    sha_url = lambda wc: config["links"]["dbcan_link"].replace(".tar.gz", ".sha256")
  log:
    "logs/cazyme_db_download.log"
  priority: 4
  shell:
    """
    mkdir -p "{output.dbcan_db}"

    TAR="{output.dbcan_db}/$(basename "{params.dbcan_db_url}")"
    SHA="{output.dbcan_db}/$(basename "{params.sha_url}")"

    wget -O "$TAR" "{params.dbcan_db_url}" > "{log}" 2>&1
    wget -O "$SHA" "{params.sha_url}" >> "{log}" 2>&1

    # verify checksum
    expected="$(awk 'NR==1{{print $1}}' "$SHA")"
    actual="$(sha256sum "$TAR" | awk '{{print $1}}')"
    test "$expected" = "$actual"

    # extract (flatten top-level directory)
    tar -xzf "$TAR" -C "{output.dbcan_db}" --strip-components=1 >> "{log}" 2>&1

    # create sentinel containing verified checksum
    echo "$actual" > "{output.dbcan_verified}"
    """

rule cazyme_gene_cluster:
  input:
    bakta_dir = "05.annotation/bakta/{sample}",
    dbcan_verified = DBCAN_SENTINEL
  output:
    dbcan_dir = directory("05.annotation/dbcan/{sample}")
  params:
    dbcan_db = DBCAN_DB_DIR
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/dbcan.yaml"
  message:
    "--- dbCAN: Finding CAZyme gene clusters. ---"
  log:
    "logs/cazyme_{sample}.log"
  priority: 4
  shell:
    """
    # CAZyme annotation of protein sequences
    run_dbcan CAZyme_annotation \
      --input_raw_data {input.bakta_dir}/{wildcards.sample}.faa \
      --output_dir {output.dbcan_dir} \
      --db_dir {params.dbcan_db} \
      --mode protein \
      --threads {resources.cpus} \
      --methods hmm \
      --methods diamond \
      --methods dbCANsub > {log} 2>&1

    # CAZyme Gene Cluster (CGC) Annotation
    run_dbcan gff_process \
      --input_gff {input.bakta_dir}/{wildcards.sample}.gff3 \
      --output_dir {output.dbcan_dir} \
      --db_dir {params.dbcan_db} \
      --gff_type prodigal \
      --threads {resources.cpus} >> {log} 2>&1

    # CAZyme Gene Cluster (CGC) Identification
    run_dbcan cgc_finder \
      --output_dir {output.dbcan_dir} >> {log} 2>&1 

    # CGC Substrate Prediction
    run_dbcan substrate_prediction \
      --output_dir {output.dbcan_dir} \
      --db_dir {params.dbcan_db} >> {log} 2>&1
    """    

for sample in SAMPLES:
  for db in DATABASES:
    rule:
      input:
        contigs = expand("02.assembly/{sample}/contigs_sel.fasta", sample = sample)
      output:
        amr_tab = expand("06.AMR/ABRicate/{sample}/{db}.tsv", sample = sample, db = db)
      params:
        db = db,
        minid = 80,
        mincov = 70
      conda:
        "envs/abricate.yaml"
      message:
        "--- ABRicate: AMR detection. ---"
      log:
        expand("logs/amr_{db}_in_{sample}_contigs.log", sample = sample, db = db)
      priority: 4
      shell:
        """
        abricate \
          --db {params.db} \
          {input.contigs} \
          --minid {params.minid} \
          --mincov {params.mincov} \
          --nopath \
          --quiet > {output.amr_tab} 2> {log}
        """

rule AMR_summary:
  input:
    argannot = "06.AMR/ABRicate/{sample}/argannot.tsv",
    card = "06.AMR/ABRicate/{sample}/card.tsv",
    ecoh = "06.AMR/ABRicate/{sample}/ecoh.tsv",
    ecoli_vf = "06.AMR/ABRicate/{sample}/ecoli_vf.tsv",
    megares = "06.AMR/ABRicate/{sample}/megares.tsv",
    ncbi = "06.AMR/ABRicate/{sample}/ncbi.tsv",
    resfinder = "06.AMR/ABRicate/{sample}/resfinder.tsv",
    vfdb = "06.AMR/ABRicate/{sample}/vfdb.tsv"
  output:
    amr_summary = "06.AMR/ABRicate/{sample}/AMR_summary.txt"
  conda:
    "envs/abricate.yaml"
  priority: 3
  shell:
    """
    abricate --summary \
      {input.argannot} \
      {input.card} \
      {input.ecoh} \
      {input.ecoli_vf} \
      {input.megares} \
      {input.ncbi} \
      {input.resfinder} \
      {input.vfdb} > {output.amr_summary}
    """

rule download_amr_db:
  output:
    card_db = temp("06.AMR/AMR_db/card.tar.bz2"),
    card_dir = temp(directory("06.AMR/AMR_db"))
  params:
    link = config['links']['card_link'],
  message:
    "--- Download AMR features from CARD repository. ---"
  log:
    "logs/download_amr.log"
  priority: 10
  shell:
    """
    wget {params.link} -O {output.card_db} > {log} 2>&1
    tar -xjvf {output.card_db} -C {output.card_dir} >> {log} 2>&1
    """

rule map_amr_db:
  input:
    card_dir = "06.AMR/AMR_db",
    r1 = "01.pre-processing/{sample}_trim_R1.fastq",
    r2 = "01.pre-processing/{sample}_trim_R2.fastq"
  output:
    bbmap_temp = temp(directory("06.AMR/AMR_mapping/{sample}/ref")),  
    covstats_temp = temp("06.AMR/AMR_mapping/{sample}/{sample}_covstats_temp.tsv"),
    covstats = "06.AMR/AMR_mapping/{sample}/{sample}_covstats.tsv",
    amr_legend = "06.AMR/AMR_mapping/{sample}/{sample}_AMR_legend.tsv"
  params:
    card_target = "nucleotide_fasta_protein_homolog_model.fasta",
    min_id = 0.99
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/bbmap.yaml"
  log:
    "logs/map_amr_{sample}.log"  
  priority: 9
  message:
    "--- Map trimmed reads against CARD db. ---"
  shell:
    """
    bbmap.sh \
      -in={input.r1} \
      -in2={input.r2} \
      ref={input.card_dir}/{params.card_target} \
      path={output.bbmap_temp} \
      idfilter={params.min_id} \
      idtag \
      -Xmx32g \
      threads={resources.cpus} \
      ambiguous=best \
      secondary=f \
      covstats={output.covstats_temp} > {log} 2>&1

    (head -n 1 {output.covstats_temp} > {output.covstats}) && \
    tail -n +2 {output.covstats_temp} | awk -F'\t' '{{print $5 "\t" $0}}' | sort -t$'\t' -k1,1nr | cut -f2- >> {output.covstats}

    echo "#AMR features with a covered length of at least 70%" > {output.amr_legend}
    (head -n 1 {input.card_dir}/aro_index.tsv >> {output.amr_legend}) && \
    for i in $(tail -n +2 {output.covstats} | awk -F'\t' '$5 >=70' | cut -f1 | awk -F'|' '{{print $5}}'); do \
        grep $i {input.card_dir}/aro_index.tsv
    done >> {output.amr_legend}
    """

rule plasmid_search:
  input:
    contigs = "02.assembly/{sample}/contigs_sel.fasta",
    blast = "03.post-processing/contaminants/{sample}/blastout"
  output:
    plasmid_dir = directory("07.plasmids/{sample}"),
    plasmids = "07.plasmids/{sample}/verified_plasmids.txt"
  params:
    platon_db = PLATONDB
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/platon.yaml"
  message:
    "--- Platon: Plasmid identification. ---"
  log:
    "logs/plasmid_search_{sample}.log"
  priority: 4
  shell:
    """
   platon \
    --db {params.platon_db} \
    --output {output.plasmid_dir} \
    --verbose \
    --threads {resources.cpus} \
    {input.contigs} > {log} 2>&1

    if [[ -s {output.plasmid_dir}/contigs_sel.plasmid.fasta ]] && grep -q ">" {output.plasmid_dir}/contigs_sel.plasmid.fasta; then
        while IFS= read -r i; do
            if grep -m 1 "$i" {input.blast} | grep -q "plasmid"; then
                echo "{wildcards.sample}: $i is a plasmid." >> {output.plasmids}
            else
               echo "{wildcards.sample}: $i was not verified by BLAST search." >> {output.plasmids}
            fi
        done < <(grep ">" {output.plasmid_dir}/contigs_sel.plasmid.fasta | sed 's/^>//g')
    else
        echo "Platon found no plasmid in sample {wildcards.sample}." > {output.plasmids}
    fi
    """

rule viral_db:
  output:
    vs2_db = directory("08.phages/vs2_db"),
    checkv_db = directory("08.phages/checkv_db")
  resources:
    cpus = 4
  conda:
    "envs/virsorter.yaml"
  params:
    checkv_link = config['links']['checkv_link'] if 'checkv_link' in config['links'] else None,
    tries = 5,
    db_id = id
  message:
    """
    --- Download VirSort2 database ---
    --- Download CheckV database ---
    """
  log:
    "logs/viral_databases.log"
  priority: 9
  shell:
    """
    virsorter setup \
      -d {output.vs2_db} \
      -j {resources.cpus} > {log} 2>&1
    
    if [ -z "{params.checkv_link}" ]; then
        checkv download_database \
          {output.checkv_db} >> {log} 2>&1
    else
        wget --tries={params.tries} -c {params.checkv_link} -P {output.checkv_db} >> {log} 2>&1
        tar -xzvf {output.checkv_db}/{params.db_id}.tar.gz -C {output.checkv_db} >> {log} 2>&1
        diamond makedb \
          --in {output.checkv_db}/{params.db_id}/genome_db/checkv_reps.faa \
          --db {output.checkv_db}/{params.db_id}/genome_db/checkv_reps >> {log} 2>&1
    fi
    """

rule viral_identification:
  input:
    contigs = "02.assembly/{sample}/contigs_filt.fasta",
    vs2_db = "08.phages/vs2_db",
    checkv_db = "08.phages/checkv_db"
  output:
    vs2_dir = directory("08.phages/virsorter/{sample}"),
    checkv_dir = directory("08.phages/checkv/{sample}")
  params:
    viral_groups = "dsDNAphage,NCLDV,RNA,ssDNA,lavidaviridae",
    min_score = 0.5
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/virsorter.yaml"
  message:
    """
    --- VirSorter2: Identification of phages and prophages. ---
    --- CheckV: Quality assessment of viral genomes. ---
    """
  log:
    "logs/viral_identification_{sample}.log"
  priority: 8
  shell:
    """
    virsorter run  \
      -i {input.contigs} \
      -w {output.vs2_dir} \
      -d {input.vs2_db} \
      --keep-original-seq \
      --include-groups {params.viral_groups} \
      --min-score {params.min_score} \
      -j {resources.cpus} \
      all > {log} 2>&1

    checkv_rep_files=$(find {input.checkv_db} -type f -path "*/genome_db/checkv_reps.faa" | sort)
    checkv_rep_file=$(printf "%s\n" "$checkv_rep_files" | head -n 1)
    checkv_rep_count=$(printf "%s\n" "$checkv_rep_files" | sed '/^$/d' | wc -l)
    if [ "$checkv_rep_count" -ne 1 ]; then
        echo "Expected exactly one CheckV database in {input.checkv_db}, found $checkv_rep_count candidate(s)." >> {log}
        exit 1
    fi
    checkv_db_dir=$(dirname "$(dirname "$checkv_rep_file")")

    checkv end_to_end \
      {output.vs2_dir}/final-viral-combined.fa \
      {output.checkv_dir} \
      -t {resources.cpus} \
      -d "$checkv_db_dir" >> {log} 2>&1
    """

rule multiqc:
  input:
    fastp_json = expand("01.pre-processing/{sample}_fastp.json", sample = SAMPLES),
    qualimap_dir = expand("03.post-processing/mapping_evaluation/{sample}", sample = SAMPLES),
    quast_dir = expand("03.post-processing/assembly_evaluation/{sample}", sample = SAMPLES),
    checkm_dir = expand("03.post-processing/completeness_evaluation/{sample}", sample = SAMPLES),
    gtdbtk_dir = expand("04.taxonomy/{sample}", sample = SAMPLES),
    bakta_dir = expand("05.annotation/bakta/{sample}", sample = SAMPLES)
  output:
    report_html = "09.report/multiqc_report.html",
    multiqc_dir = directory("09.report"),
    multiqc_yaml = "09.report/multiqc_config.yaml"
  conda:
    "envs/multiqc.yaml"
  message:
    "--- MultiQC: Aggregate results. ---"
  log:
    "logs/multiqc.log"
  shell:
    r"""
    set -euo pipefail

    mkdir -p {output.multiqc_dir}

    printf "%s\n" "show_analysis_paths: False" "show_analysis_time: False" > {output.multiqc_yaml}

  
    cat >> "{output.multiqc_yaml}" << 'EOF'
sample_names_replace_regex: true
sample_names_replace_exact: true

sample_names_replace:
  # read QC (fastp)
  '^01\.pre-processing \| ([^|]+)$': 'read QC | \1'

  # mapping QC (QualiMap)
  '^03\.post-processing \| mapping_evaluation \| ([^|]+) \| \1_map$': 'map QC pt.1 | \1'
  '^03\.post-processing \| mapping_evaluation \| ([^|]+) \| raw_data$': 'map QC pt.2 | \1'

  # assembly QC (Quast)
  '^03\.post-processing \| assembly_evaluation \| ([^|]+) \| contigs_sel$':  'assembly QC | \1'

  # annotation (Bakta)
  '^05\.annotation \| bakta \| ([^|]+) \| \1$': 'annotation | \1'
EOF

    multiqc \
      --config "{output.multiqc_yaml}" \
      -d \
      {input.fastp_json} \
      {input.qualimap_dir} \
      {input.quast_dir} \
      {input.checkm_dir} \
      {input.gtdbtk_dir} \
      {input.bakta_dir} \
      --outdir "{output.multiqc_dir}" \
      --filename "multiqc_report.html" > "{log}" 2>&1
    """
