# Startup banner: printed once when the Snakefile is parsed.
print(r'''
__________             _____       _______________              
___  ____/_____ _________  /______ ___  ____/__  /___  _____  __
__  /_   _  __ `/_  ___/  __/  __ `/_  /_   __  /_  / / /_  |/_/
_  __/   / /_/ /_(__  )/ /_ / /_/ /_  __/   _  / / /_/ /__>  <  
/_/      \__,_/ /____/ \__/ \__,_/ /_/      /_/  \__,_/ /_/|_|  
                                                                                                            
FastaFlux v1.2.2 – Downstream Analysis of Bacterial Whole-Genome Assemblies


Livio Antonielli, April 2026
''')

# Modules:
import os
import sys
from urllib.parse import urlparse

# Configuration file:
configfile: 'config/config.yaml'

# Path to DBs and input/output directories in config file:
# All relative rule paths below resolve against this working directory.
workdir: config['directories']['output_dir']
INPUTDIR = config['directories']['input_dir']
BAKTADB = config['directories']['bakta_db']
BLASTDB = config['directories']['blast_db']
DMNDDB = config['directories']['eggnog_db']
GTDBTKDB = config['directories']['gtdbtk_db']
PLATONDB = config['directories']['platon_db']

# AMR DBs (ABRicate):
# One ABRicate run (and output TSV) is generated per database per sample.
DATABASES = ["argannot", "card", "ecoh", "ecoli_vf", "megares", "ncbi", "resfinder", "vfdb"]

# Hardware resources in config file:
CPUS = config['resources']['threads']

# Define not allowed characters
# Sample names containing any of these abort the workflow during discovery.
not_allowed_chars = set("_*#@%^/! ?&:;|<>")

def bakta_locus_tag(sample):
    """Build a Bakta-safe locus tag from a sample name.

    Every character outside the alphanumerics and '_', '.', '-' is replaced
    with '_', and the result is capped at 24 characters. Raises ValueError
    when nothing usable remains (e.g. an empty sample name).
    """
    sanitized = []
    for ch in str(sample):
        sanitized.append(ch if ch.isalnum() or ch in "_.-" else "_")
    tag = "".join(sanitized)[:24]
    if not tag:
        raise ValueError(f"Unable to derive a valid Bakta locus tag from sample name '{sample}'.")
    return tag

# Sanity check of genus parameter:
# Use .get() so a config file without a 'parameters' section falls through to
# the "not present" branch instead of raising a KeyError (this mirrors the
# config.get('parameters') guard used for the contig-selection rules below).
parameters_cfg = config.get('parameters') or {}
if "genus" in parameters_cfg:
    genus = parameters_cfg["genus"]
    if genus is not None and len(genus) > 0:
        print(f"The 'genus' parameter is specified in the config file with value: '{genus}'.")
    else:
        print("The 'genus' parameter value is not specified in the config file and will be inferred automatically.")
else:
    print("The 'genus' parameter is not present in the config file and contigs won't be filtered accordingly.")

# Sanity check of checkv and dbCAN link parameters:
links = config.get("links") or {}
CHECKV_LINK = str(links.get("checkv_link") or "").strip()
DBCAN_LINK = str(links.get("dbcan_link") or "").strip()
CHECKV_DB_ID = None

if not CHECKV_LINK:
    # Optional parameter: fall back to automatic download
    print(
        "The link to the CheckV database is not specified (or empty). "
        "The latest version will be downloaded automatically."
    )
else:
    path = urlparse(CHECKV_LINK).path
    db = os.path.basename(path)

    # Hard failure only for truly invalid input
    if not db.endswith(".tar.gz"):
        raise ValueError(
            f"Invalid checkv_link: expected a .tar.gz archive, got '{db}'."
        )

    # Two splitext calls strip the two-part '.tar.gz' suffix, leaving the
    # database identifier used by the viral_db rule.
    CHECKV_DB_ID = os.path.splitext(os.path.splitext(db)[0])[0]
    print(f"Using CheckV database from link: '{CHECKV_LINK}' (db_id='{CHECKV_DB_ID}').")

if not DBCAN_LINK:
    raise ValueError("Missing required 'links.dbcan_link' value in the config file.")
if not DBCAN_LINK.endswith(".tar.gz"):
    raise ValueError(
        f"Invalid dbcan_link: expected a .tar.gz archive, got '{os.path.basename(DBCAN_LINK)}'."
    )

# Derive the checksum URL by replacing only the trailing '.tar.gz'
# (str.replace would substitute the first occurrence anywhere in the URL,
# which is not necessarily the suffix). The endswith check above guarantees
# the suffix is present.
DBCAN_SHA_URL = DBCAN_LINK[:-len(".tar.gz")] + ".sha256"

# Import FASTQ files from input dir:
# glob_wildcards matches every '<sample>.<extn>' file in INPUTDIR.
SAMPLES, EXTENSIONS = glob_wildcards(os.path.join(INPUTDIR, '{sample}.{extn}'))

if len(SAMPLES) == 0:
    sys.stderr.write(f"No files in {INPUTDIR}. Please, check directory.\n")
    # Exit non-zero: an empty input directory is an error, and exiting 0
    # here would make callers/CI treat the failed run as successful.
    sys.exit(1)
else:
    for sample in sorted(SAMPLES):
        if any(char in sample for char in not_allowed_chars):
            sys.stderr.write(f"Sample name '{sample}' contains not allowed characters.\n")
            sys.exit(1)
        print(f"Sample {sample} in process...")

# Check if files have nonunique extension:
# Every file must carry a recognized FASTA extension, and all files must
# share a single extension so CONTIGS below is well-defined.
for extension in EXTENSIONS:
    if extension.endswith(("fasta", "fa", "fna")):
        if len(set(EXTENSIONS)) != 1:
            sys.stderr.write("More than one type of file extension detected\n\t")
            sys.stderr.write("\n\t".join(set(EXTENSIONS)))
            sys.exit(1)
    else:
        sys.stderr.write("\nFile format not recognized.\n")
        sys.exit(1)

# Create sample objects:
EXTN = EXTENSIONS[0]
CONTIGS = '{sample}.' + EXTN

# Pseudo-target collecting every final output of the workflow; running
# snakemake with no explicit target builds everything listed here.
rule all:
  input:
    contigs_filt = expand("01.pre-processing/{sample}/contigs_filt.fasta", sample = SAMPLES),
    contigs_sel = expand("02.post-processing/contaminants/{sample}/contigs_sel.fasta", sample = SAMPLES),
    quast_dir = expand("02.post-processing/assembly_evaluation/{sample}", sample = SAMPLES),
    checkm_dir = expand("02.post-processing/completeness_evaluation/{sample}", sample = SAMPLES),
    checkm_stats = expand("02.post-processing/completeness_evaluation/{sample}/checkm_stats.tsv", sample = SAMPLES),
    checkm_lineage = expand("02.post-processing/completeness_evaluation/{sample}/lineage.ms", sample = SAMPLES),
    gtdbtk_dir = expand("03.taxonomy/{sample}", sample = SAMPLES),
    bakta_dir = expand("04.annotation/bakta/{sample}", sample = SAMPLES),
    eggnog_dir = expand("04.annotation/eggnog/{sample}", sample = SAMPLES),
    antismash_dir = expand("04.annotation/antismash/{sample}", sample = SAMPLES),
    # NOTE: this literal path must stay in sync with DBCAN_DB_DIR defined below.
    dbcan_db = "04.annotation/dbcan/dbcan_db_v5.1.2",
    dbcan_dir = expand("04.annotation/dbcan/{sample}", sample = SAMPLES),
    amr_tab = expand("05.AMR/ABRicate/{sample}/{db}.tsv", sample = SAMPLES, db = DATABASES),
    amr_summary = expand("05.AMR/ABRicate/{sample}/AMR_summary.txt", sample = SAMPLES),
    plasmid_dir = expand("06.plasmids/{sample}", sample = SAMPLES),
    plasmids = expand("06.plasmids/{sample}/verified_plasmids.txt", sample = SAMPLES),
    vs2_dir = expand("07.phages/virsorter/{sample}", sample = SAMPLES),
    checkv_dir = expand("07.phages/checkv/{sample}", sample = SAMPLES),
    multiqc_dir = "08.report"

# Clean the raw assembly. If the first header looks SPAdes-like (contains
# 'length_<n>' or 'cov_<x>'), the FASTA is linearized and contigs are kept
# only when awk fields $4 (length) >= 500 and $6 (coverage) >= 2.0 of the
# '_'-split header. Otherwise headers are trimmed to their first token and
# no length/coverage filtering is applied.
# NOTE(review): the $4/$6 positions assume the standard
# 'NODE_<i>_length_<n>_cov_<x>' header layout — confirm for other assemblers.
rule filter_contigs:
  input:
    contigs = os.path.join(INPUTDIR, CONTIGS)
  output:
    contigs_filt = "01.pre-processing/{sample}/contigs_filt.fasta"
  message:
    "Contig filtering."
  priority: 9
  shell:
    """
    # Check the format of the FASTA header
    header_format=$(head -n 1 {input.contigs} | grep -E 'length_[0-9]+|cov_[0-9.]+' || true)

    if [ -n "$header_format" ]; then
        echo "SPAdes format detected: removing short contigs <500 bp and low coverage <2x."
        # SPAdes format: Headers contain 'length_' and 'cov_'
        awk '{{
          if(NR==1) {{
            printf "%s\\n", $0
          }} else {{
            if(/^>/) {{
              printf "\\n%s\\n", $0
            }} else {{
              printf $0
            }}
          }}
        }}' {input.contigs} | \
        awk -F"_" '{{
          if(/^>/ && $6 >= 2.0 && $4 >= 500) {{
            printf "%s\\n", $0
            getline
            print
          }}
        }}' > {output.contigs_filt}
    else
        echo "Fixing FASTA headers."
        # NCBI format: Simplify headers and no filtering based on length/coverage
        awk '/^>/ {{print $1; next}} {{print}}' {input.contigs} > {output.contigs_filt}
    fi
    """

# Produce a sorted, indexed BAM used by BlobTools for coverage estimation.
# NOTE(review): both minimap2 positional arguments are the filtered contigs,
# so the contigs are aligned against themselves even though the message says
# "map reads against contigs" — confirm this self-mapping is intended.
rule map_contigs:
  input:
    contigs_filt = "01.pre-processing/{sample}/contigs_filt.fasta"
  output:
    # temp(): both files are deleted once downstream consumers finish.
    bam = temp("02.post-processing/contaminants/{sample}_map.bam"),
    bai = temp("02.post-processing/contaminants/{sample}_map.bam.bai")
  resources:
    cpus = CPUS
  conda:
    "envs/minimap.yaml"
  message:
    "--- Minimap2: Map reads against contigs. ---"
  log:
    "logs/map_contigs_{sample}.log"
  priority: 0
  shell:
    """
    minimap2 \
      -a {input.contigs_filt} \
      {input.contigs_filt} 2> {log} | \
    samtools view \
      -S \
      -b \
      -u \
      -@ {resources.cpus} | \
    samtools sort \
      -o {output.bam} \
      -@ {resources.cpus} 2>> {log}
    
    samtools index \
      {output.bam} \
      -@ {resources.cpus} 2>> {log}
    """

# Megablast of filtered contigs against the local NCBI nt database. The
# custom outfmt 6 columns include taxids and taxonomy names consumed by
# BlobTools (blob_json) and by the plasmid verification step.
rule blast_contigs:
  input:
    contigs_filt = "01.pre-processing/{sample}/contigs_filt.fasta"
  output:
    blast = "02.post-processing/contaminants/{sample}/blastout"
  params:
    # BLASTDB env var points blastn at the taxonomy files in the db directory.
    dir = BLASTDB,
    db = os.path.join(BLASTDB, config['parameters']['nt_version'])
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/blast.yaml"
  message:
    "--- BLAST: Align contigs against NCBI nt db. ---"
  log:
    "logs/blast_contigs_{sample}.log"
  priority: 0
  shell:
    """
    BLASTDB={params.dir} \
    blastn \
      -task megablast \
      -query {input.contigs_filt} \
      -db {params.db} \
      -outfmt '6 qseqid staxids bitscore pident evalue length qlen slen qcovs qcovhsp sskingdoms scomnames sscinames sblastnames stitle' \
      -num_threads {resources.cpus} \
      -evalue 1e-5 \
      -max_target_seqs 50 \
      -max_hsps 5 \
      -out {output.blast} > {log} 2>&1
    """

# Build the BlobTools database (JSON + coverage file) from contigs, the
# self-mapping BAM, the BLAST hits, and the NCBI taxdump (nodes/names).
rule blob_json:
  input:
    contigs_filt = "01.pre-processing/{sample}/contigs_filt.fasta",
    bam = "02.post-processing/contaminants/{sample}_map.bam",
    bai = "02.post-processing/contaminants/{sample}_map.bam.bai",
    blast = "02.post-processing/contaminants/{sample}/blastout",
    nodes = os.path.join(BLASTDB, "nodes.dmp"),
    names = os.path.join(BLASTDB, "names.dmp")
  output:
    json = temp("02.post-processing/contaminants/{sample}/blob.blobDB.json"),
    cov = temp("02.post-processing/contaminants/{sample}/blob.{sample}_map.bam.cov")
  params:
    basename = "02.post-processing/contaminants/{sample}/blob"
  conda:
    "envs/blobtools.yaml"
  message:
    "--- BlobTools: Screen BLAST hits for contaminants. ---"
  log:
    "logs/blob_json_{sample}.log"
  priority: 0
  shell:
    """
    blobtools create \
      -i {input.contigs_filt} \
      -b {input.bam} \
      -t {input.blast} \
      --nodes {input.nodes} \
      --names {input.names} \
      -o {params.basename} > {log} 2>&1
    """

# Export the BlobTools table with per-contig taxonomy collapsed by the
# 'bestsum' rule at all ranks; the table is parsed downstream (column 22,
# after 11 header lines) to estimate sample composition.
rule blob_table:
  input:
    json = "02.post-processing/contaminants/{sample}/blob.blobDB.json"
  output:
    bestscore = "02.post-processing/contaminants/{sample}/bestscore.blob.blobDB.table.txt"
  params:
    basename = "02.post-processing/contaminants/{sample}/bestscore"
  conda:
    "envs/blobtools.yaml"
  message:
    "--- BlobTools: Collapse taxonomic assignment of BLAST hits according to sum of best scores. ---"
  log:
    "logs/blob_table_{sample}.log"
  priority: 0
  shell:
    """
    blobtools view \
      --input {input.json} \
      --out {params.basename} \
      --taxrule bestsum \
      --rank all \
      --hits > {log} 2>&1
    """

# awk program that linearizes a FASTA file (each record becomes one header
# line followed by exactly one sequence line); interpolated into the shell
# commands of the contig-selection rules below via Snakemake's {VAR:q} quoting.
FASTA_LIN_CMD = r"""{if(NR==1) {printf "%s\n", $0} else {if(/^>/) {printf "\n%s\n", $0} else {printf $0}}}"""

# Execute either one rule or another according to presence/absence of 'genus' parameter
# Both variants: (1) linearize the filtered FASTA, (2) compute per-taxon
# contig fractions from column 22 of the BlobTools table (the first 11 lines
# are header), (3) build a contig list and extract matching records.
# The first variant keeps contigs matching the configured genus; the second
# infers the dominant taxon from the composition table (stripping common
# 'Para'/'Pseudo'/'Paen' prefixes before matching).
if config.get('parameters') is not None and "genus" in config['parameters'] and config['parameters']['genus'] is not None and len(config['parameters']['genus']) > 0:
  rule:
    input:
      contigs_filt = "01.pre-processing/{sample}/contigs_filt.fasta",
      bestscore = "02.post-processing/contaminants/{sample}/bestscore.blob.blobDB.table.txt"
    output:
      contigs_filt_lin = "01.pre-processing/{sample}/contigs_filt_lin.fasta",
      abund = "02.post-processing/contaminants/{sample}/{sample}_composition.txt",
      list = "02.post-processing/contaminants/{sample}/contigs.list",
      contigs_sel = "02.post-processing/contaminants/{sample}/contigs_sel.fasta"
    params:
      genus = config['parameters']['genus']    
    priority: 0
    shell:
      """
      cat {input.contigs_filt} | awk {FASTA_LIN_CMD:q} > {output.contigs_filt_lin}

      for i in $(cat {input.bestscore} | sed '1,11d' | cut -f 22 | sort -u); do \
          cat {input.bestscore} | sed '1,11d' | awk -v var=$i 'BEGIN {{printf "%s%s", var, ": "}} $22 == var {{count++}} END {{printf "%.2f\\n", count/NR}}'; \
      done > {output.abund}

      echo "Sample {wildcards.sample} composition:"
      cat {output.abund}
      
      awk -v var="{params.genus}" 'tolower($22) ~ tolower("[:alpha:]*"var) {{print $1}}' {input.bestscore} > {output.list}
      
      grep -A1 -f {output.list} {output.contigs_filt_lin} | sed '/--/d' > {output.contigs_sel}
      """
else:
  rule:
    input:
      contigs_filt = "01.pre-processing/{sample}/contigs_filt.fasta",
      bestscore = "02.post-processing/contaminants/{sample}/bestscore.blob.blobDB.table.txt"
    output:
      contigs_filt_lin = "01.pre-processing/{sample}/contigs_filt_lin.fasta",
      abund = "02.post-processing/contaminants/{sample}/{sample}_composition.txt",
      list = "02.post-processing/contaminants/{sample}/contigs.list",
      contigs_sel = "02.post-processing/contaminants/{sample}/contigs_sel.fasta"
    priority: 0
    shell:
      """
      cat {input.contigs_filt} | awk {FASTA_LIN_CMD:q} > {output.contigs_filt_lin}

      for i in $(cat {input.bestscore} | sed '1,11d' | cut -f 22 | sort -u); do \
          cat {input.bestscore} | sed '1,11d' | awk -v var=$i 'BEGIN {{printf "%s%s", var, ": "}} $22 == var {{count++}} END {{printf "%.2f\\n", count/NR}}'; \
      done > {output.abund}

      echo "Sample {wildcards.sample} composition:"
      cat {output.abund}

      for i in $(cat {output.abund} | sort -t':' -k2 -nr | cut -d':' -f1 | sed -n '1p' | sed -e 's/Para//;s/Pseudo//;s/Paen//' | tr '[:upper:]' '[:lower:]'); do \
          awk -v var="$i" 'tolower($22) ~ tolower("[:alpha:]*"var) {{print $1}}' {input.bestscore}; \
      done > {output.list}

      grep -A1 -f {output.list} {output.contigs_filt_lin} | sed '/--/d' > {output.contigs_sel}
      """

# Assembly metrics (N50, contig counts, etc.) for the decontaminated contigs.
rule genome_assembly_evaluation:
  input:
    contigs_sel = "02.post-processing/contaminants/{sample}/contigs_sel.fasta"
  output:
    quast_dir = directory("02.post-processing/assembly_evaluation/{sample}")
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/quast.yaml"
  message:
    "--- QUAST: Genome assembly evaluation. ---"
  log:
    "logs/assembly_evaluation_{sample}.log"
  shell:
    """
    quast \
      {input.contigs_sel} \
      -o {output.quast_dir} \
      --no-icarus \
      -t {resources.cpus} > {log} 2>&1 
    """

# CheckM completeness/contamination. The selected contigs are copied into the
# output directory (CheckM takes a bin directory) before lineage_wf + qa.
# The copied fasta is later removed by the taxonomic_assignment rule.
rule completeness_and_contamination:
  input:
    contigs_sel = "02.post-processing/contaminants/{sample}/contigs_sel.fasta"
  output:
    checkm_dir = directory("02.post-processing/completeness_evaluation/{sample}"),
    checkm_stats = "02.post-processing/completeness_evaluation/{sample}/checkm_stats.tsv",
    checkm_lineage = "02.post-processing/completeness_evaluation/{sample}/lineage.ms"
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/checkm.yaml"
  message:
    "--- CheckM: Assessment of genome completenness and contamination. ---"
  log:
    "logs/completenness_and_contamination_{sample}.log"
  priority: 5
  shell:
    """
    cp {input.contigs_sel} {output.checkm_dir}/{wildcards.sample}.fasta
    
    checkm lineage_wf \
      -t {resources.cpus} \
      -x fasta {output.checkm_dir} \
      {output.checkm_dir} > {log} 2>&1
    
    checkm qa \
      -o 2 \
      -t {resources.cpus} \
      --tab_table \
      -f {output.checkm_stats} \
      {output.checkm_lineage} \
      {output.checkm_dir} >> {log} 2>&1
    """

# GTDB-Tk classification using the fasta copied into the CheckM directory by
# the previous rule; that copy is deleted at the end of this rule. A shared
# skani sketch directory is reused across samples to avoid re-sketching.
rule taxonomic_assignment:
  input:
    checkm_dir = "02.post-processing/completeness_evaluation/{sample}",
  output:
    gtdbtk_dir = directory("03.taxonomy/{sample}")
  params:
    gtdbtk_db = GTDBTKDB,
    skani_sketch_dir = f"{GTDBTKDB}/skani_sketches_r226_skani0.3.1"
  resources:
    cpus = min(CPUS, 24),
    # pplacer can use a larger thread pool than the classify step.
    cpus_p = min(CPUS, 64)
  conda:
    "envs/gtdbtk.yaml"
  message:
    "--- GTDB-Tk: Taxonomic assignment. ---"
  log:
    "logs/taxonomic_assignment_{sample}.log"
  priority: 4
  shell:
    """
    mkdir -p {output.gtdbtk_dir} {params.skani_sketch_dir}

    GTDBTK_DATA_PATH={params.gtdbtk_db:q} \
    gtdbtk classify_wf \
      -x fasta \
      --genome_dir {input.checkm_dir} \
      --out_dir {output.gtdbtk_dir} \
      --skani_sketch_dir {params.skani_sketch_dir:q} \
      --cpus {resources.cpus} \
      --pplacer_cpus {resources.cpus_p} > {log} 2>&1

    rm -rf {input.checkm_dir}/*.fasta
    """

# Bakta annotation. The for-loop runs exactly once: it reads the single most
# abundant taxon from the composition file (sorted by fraction, first line)
# and passes it to Bakta as --genus.
rule accurate_annotation:
  input:
    contigs_sel = "02.post-processing/contaminants/{sample}/contigs_sel.fasta",
    abund = "02.post-processing/contaminants/{sample}/{sample}_composition.txt"
  output:
    bakta_dir = directory("04.annotation/bakta/{sample}")
  params:
    bakta_db = BAKTADB,
    # Sanitized sample name (see bakta_locus_tag above).
    locus_tag = lambda wc: bakta_locus_tag(wc.sample)
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/bakta.yaml"
  message:
    "--- Bakta: Genome annotation. ---"
  log:
    "logs/accurate_annotation_{sample}.log"
  priority: 5
  shell:
    """
    for i in $(cat {input.abund} | sort -t':' -k2 -nr | cut -d':' -f1 | sed -n '1p'); do \
      bakta \
        --db {params.bakta_db} \
        --verbose \
        --genus $i \
        --species sp. \
        --strain {wildcards.sample} \
        --translation-table 11 \
        --min-contig-length 500 \
        --locus-tag {params.locus_tag} \
        --prefix {wildcards.sample} \
        --keep-contig-headers \
        --output {output.bakta_dir} \
        --threads {resources.cpus} \
        --force {input.contigs_sel}; \
    done > {log} 2>&1
    """    

# eggNOG-mapper functional annotation of the Bakta protein predictions
# ({sample}.faa) via DIAMOND; the scratch dir is declared temp() so Snakemake
# removes it after the rule completes.
rule functional_annotation:
  input:
    bakta_dir = "04.annotation/bakta/{sample}"
  output:
    temp_dir = temp(directory("04.annotation/eggnog/{sample}/eggnog_tmp")),
    eggnog_dir = directory("04.annotation/eggnog/{sample}")
  params:
    dmnd_db = DMNDDB
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/eggnog-mapper.yaml"
  message:
    "--- EggNOG: Functional annotation. ---"
  log:
    "logs/functional_annotation_{sample}.log"
  priority: 4
  shell:
    """
    mkdir -p {output.temp_dir} {output.eggnog_dir}

    emapper.py \
      -i {input.bakta_dir}/{wildcards.sample}.faa \
      --output_dir {output.eggnog_dir} \
      --cpu {resources.cpus} \
      -m diamond \
      --data_dir {params.dmnd_db} \
      --output {wildcards.sample} \
      --temp_dir {output.temp_dir} \
      --override > {log} 2>&1
    """

# One-time download of the antiSMASH reference databases (shared by all samples).
rule secondary_metabolites_db:
  output:
    antismash_db = directory("04.annotation/antismash/databases")
  conda:
    "envs/antismash.yaml"
  message:
    "--- antiSMASH: database download. ---"
  log:
    "logs/secondary_metabolites_database.log"
  priority: 4
  shell:
    """
    download-antismash-databases \
      --database-dir {output.antismash_db} > {log} 2>&1
    """    

# antiSMASH secondary-metabolite cluster detection on the annotated Bakta
# GBFF; gene finding is disabled since Bakta already provides gene calls.
rule secondary_metabolites_analysis:
  input:
    antismash_db = "04.annotation/antismash/databases",
    bakta_dir = "04.annotation/bakta/{sample}"
  output:
    antismash_dir = directory("04.annotation/antismash/{sample}")
  params:
    taxon = 'bacteria',
    genefinding_tool = 'none'
  resources:
    cpus = min(CPUS, 24)  
  conda:
    "envs/antismash.yaml"
  message:
    "--- antiSMASH: secondary metabolite annotation. ---"
  log:
    "logs/secondary_metabolites_{sample}.log"
  priority: 4
  shell:
    """
    antismash \
      --output-dir {output.antismash_dir} \
      --output-basename {wildcards.sample} \
      --databases {input.antismash_db} \
      --taxon {params.taxon} \
      --genefinding-tool {params.genefinding_tool} \
      --cpus {resources.cpus} \
      {input.bakta_dir}/{wildcards.sample}.gbff > {log} 2>&1
    """

# Specify directory where to place the dbCAN database
# NOTE: must match the literal 'dbcan_db' path listed in rule all.
DBCAN_DB_DIR = "04.annotation/dbcan/dbcan_db_v5.1.2"

# Sentinel file for triggering the cazyme annotation rule afterwards
# (written only after the archive checksum has been verified).
DBCAN_SENTINEL = DBCAN_DB_DIR + "/.verified.sha256"

# Download the dbCAN database archive plus its .sha256 file, verify the
# checksum (the `test` aborts the rule under Snakemake's strict bash mode on
# mismatch), extract it flattening the top-level directory, and write the
# sentinel file that gates the cazyme_gene_cluster rule.
rule cazyme_db_download:
  output:
    dbcan_db = directory(DBCAN_DB_DIR),
    dbcan_verified = DBCAN_SENTINEL
  params:
    dbcan_db_url = DBCAN_LINK,
    sha_url = DBCAN_SHA_URL
  log:
    "logs/cazyme_db_download.log"
  priority: 4
  shell:
    """
    mkdir -p "{output.dbcan_db}"

    TAR="{output.dbcan_db}/$(basename "{params.dbcan_db_url}")"
    SHA="{output.dbcan_db}/$(basename "{params.sha_url}")"

    wget -O "$TAR" "{params.dbcan_db_url}" > "{log}" 2>&1
    wget -O "$SHA" "{params.sha_url}" >> "{log}" 2>&1

    # verify checksum
    expected="$(awk 'NR==1{{print $1}}' "$SHA")"
    actual="$(sha256sum "$TAR" | awk '{{print $1}}')"
    test "$expected" = "$actual"

    # extract (flatten top-level directory)
    tar -xzf "$TAR" -C "{output.dbcan_db}" --strip-components=1 >> "{log}" 2>&1

    # create sentinel containing verified checksum
    echo "$actual" > "{output.dbcan_verified}"
    """

# Four-stage dbCAN run on the Bakta proteins/GFF: CAZyme annotation, GFF
# processing, CAZyme gene cluster (CGC) identification, and CGC substrate
# prediction. Depends on the checksum sentinel from cazyme_db_download.
rule cazyme_gene_cluster:
  input:
    bakta_dir = "04.annotation/bakta/{sample}",
    dbcan_verified = DBCAN_SENTINEL
  output:
    dbcan_dir = directory("04.annotation/dbcan/{sample}")
  params:
    dbcan_db = DBCAN_DB_DIR
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/dbcan.yaml"
  message:
    "--- dbCAN: Finding CAZyme gene clusters. ---"
  log:
    "logs/cazyme_{sample}.log"
  priority: 4
  shell:
    """
    # CAZyme annotation of protein sequences
    run_dbcan CAZyme_annotation \
      --input_raw_data {input.bakta_dir}/{wildcards.sample}.faa \
      --output_dir {output.dbcan_dir} \
      --db_dir {params.dbcan_db} \
      --mode protein \
      --threads {resources.cpus} \
      --methods hmm \
      --methods diamond \
      --methods dbCANsub > {log} 2>&1

    # CAZyme Gene Cluster (CGC) Annotation
    run_dbcan gff_process \
      --input_gff {input.bakta_dir}/{wildcards.sample}.gff3 \
      --output_dir {output.dbcan_dir} \
      --db_dir {params.dbcan_db} \
      --gff_type prodigal \
      --threads {resources.cpus} >> {log} 2>&1

    # CAZyme Gene Cluster (CGC) Identification
    run_dbcan cgc_finder \
      --output_dir {output.dbcan_dir} >> {log} 2>&1

    # CGC Substrate Prediction
    run_dbcan substrate_prediction \
      --output_dir {output.dbcan_dir} \
      --db_dir {params.dbcan_db} >> {log} 2>&1
    """    

# Generate one anonymous ABRicate rule per (sample, database) pair so each
# screening can run (and fail/retry) independently; outputs match the
# amr_tab targets declared in rule all.
for sample in SAMPLES:
  for db in DATABASES:
    rule:
      input:
        contigs_sel = expand("02.post-processing/contaminants/{sample}/contigs_sel.fasta", sample = sample)
      output:
        amr_tab = expand("05.AMR/ABRicate/{sample}/{db}.tsv", sample = sample, db = db)
      params:
        db = db,
        # Minimum identity / coverage thresholds for reported hits.
        minid = 80,
        mincov = 70  
      conda:
        "envs/abricate.yaml"
      message:
        "--- ABRicate: AMR detection. ---"
      log:
        expand("logs/amr_{db}_in_{sample}_contigs.log", sample = sample, db = db)
      priority: 4
      shell:
        """
        abricate \
          --db {params.db} \
          {input.contigs_sel} \
          --minid {params.minid} \
          --mincov {params.mincov} \
          --nopath \
          --quiet > {output.amr_tab} 2> {log}
        """

# Aggregate the per-database ABRicate tables into one presence/absence
# summary per sample.
rule AMR_summary:
  input:
    argannot = "05.AMR/ABRicate/{sample}/argannot.tsv",
    card = "05.AMR/ABRicate/{sample}/card.tsv",
    ecoh = "05.AMR/ABRicate/{sample}/ecoh.tsv",
    ecoli_vf = "05.AMR/ABRicate/{sample}/ecoli_vf.tsv",
    megares = "05.AMR/ABRicate/{sample}/megares.tsv",
    ncbi = "05.AMR/ABRicate/{sample}/ncbi.tsv",
    resfinder = "05.AMR/ABRicate/{sample}/resfinder.tsv",
    vfdb = "05.AMR/ABRicate/{sample}/vfdb.tsv"
  output:
    amr_summary = "05.AMR/ABRicate/{sample}/AMR_summary.txt"
  conda:
    "envs/abricate.yaml"
  priority: 3
  shell:
    """
     abricate --summary \
      {input.argannot} \
      {input.card} \
      {input.ecoh} \
      {input.ecoli_vf} \
      {input.megares} \
      {input.ncbi} \
      {input.resfinder} \
      {input.vfdb} > {output.amr_summary}
    """

# Platon plasmid prediction, then a cross-check: each predicted plasmid
# contig is "verified" if its first BLAST hit line mentions "plasmid".
rule plasmid_search:
  input:
    contigs_sel = "02.post-processing/contaminants/{sample}/contigs_sel.fasta",
    blast = "02.post-processing/contaminants/{sample}/blastout"
  output:
    plasmid_dir = directory("06.plasmids/{sample}"),
    plasmids = "06.plasmids/{sample}/verified_plasmids.txt"
  params:
    platon_db = PLATONDB
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/platon.yaml"
  message:
    "--- Platon: Plasmid identification. ---"
  log:
    "logs/plasmid_search_{sample}.log"
  priority: 4
  shell:
    """
    platon \
      --db {params.platon_db} \
      --output {output.plasmid_dir} \
      --verbose \
      --threads {resources.cpus} \
      {input.contigs_sel} > {log} 2>&1
    
    if [[ -s {output.plasmid_dir}/contigs_sel.plasmid.fasta ]] && grep -q ">" {output.plasmid_dir}/contigs_sel.plasmid.fasta; then
        while IFS= read -r i; do
            if grep -m 1 "$i" {input.blast} | grep -q "plasmid"; then
                echo "{wildcards.sample}: $i is a plasmid." >> {output.plasmids}
            else
               echo "{wildcards.sample}: $i was not verified by BLAST search." >> {output.plasmids}
            fi
        done < <(grep ">" {output.plasmid_dir}/contigs_sel.plasmid.fasta | sed 's/^>//g')
    else
        echo "Platon found no plasmid in sample {wildcards.sample}." > {output.plasmids}
    fi
    """

# Set up the VirSorter2 database, then obtain the CheckV database either via
# `checkv download_database` (no link configured) or by downloading the
# configured archive, extracting it, and rebuilding the DIAMOND index.
rule viral_db:
  output:
    vs2_db = directory("07.phages/vs2_db"),
    checkv_db = directory("07.phages/checkv_db")
  resources:
    cpus = 4
  conda:
    "envs/virsorter.yaml"
  params:
    # Empty string selects the automatic-download branch in the shell below.
    checkv_link = CHECKV_LINK,
    tries = 5,
    # Archive basename without '.tar.gz' (set during the sanity checks above).
    db_id = CHECKV_DB_ID
  message:
    """
    --- Download VirSort2 database ---
    --- Download CheckV database ---
    """
  log:
    "logs/viral_databases.log"
  priority: 9
  shell:
    """
    virsorter setup \
      -d {output.vs2_db} \
      -j {resources.cpus} > {log} 2>&1

    if [ -z "{params.checkv_link}" ]; then
        checkv download_database \
        {output.checkv_db} >> {log} 2>&1
    else
        wget --tries={params.tries} -c {params.checkv_link} -P {output.checkv_db} >> {log} 2>&1
        tar -xzvf {output.checkv_db}/{params.db_id}.tar.gz -C {output.checkv_db} >> {log} 2>&1
        diamond makedb \
          --in {output.checkv_db}/{params.db_id}/genome_db/checkv_reps.faa \
          --db {output.checkv_db}/{params.db_id}/genome_db/checkv_reps >> {log} 2>&1
    fi
    """

# VirSorter2 phage/prophage identification followed by CheckV quality
# assessment. The CheckV database directory name varies by download method,
# so it is located by finding the unique */genome_db/checkv_reps.faa file;
# anything other than exactly one candidate aborts the rule.
rule viral_identification:
  input:
    contigs_sel = "02.post-processing/contaminants/{sample}/contigs_sel.fasta",
    vs2_db = "07.phages/vs2_db",
    checkv_db = "07.phages/checkv_db"
  output:
    vs2_dir = directory("07.phages/virsorter/{sample}"),
    checkv_dir = directory("07.phages/checkv/{sample}")
  params:
    viral_groups = "dsDNAphage,NCLDV,RNA,ssDNA,lavidaviridae",
    min_score = 0.5
  resources:
    cpus = min(CPUS, 24)
  conda:
    "envs/virsorter.yaml"
  message:
    """
    --- VirSorter2: Identification of phages and prophages. ---
    --- CheckV: Quality assessment of viral genomes. ---
    """
  log:
    "logs/viral_identification_{sample}.log"
  priority: 8
  shell:
    """
    virsorter run  \
      -i {input.contigs_sel} \
      -w {output.vs2_dir} \
      -d {input.vs2_db} \
      --keep-original-seq \
      --include-groups {params.viral_groups} \
      --min-score {params.min_score} \
      -j {resources.cpus} \
      all > {log} 2>&1

    checkv_rep_files=$(find {input.checkv_db} -type f -path "*/genome_db/checkv_reps.faa" | sort)
    checkv_rep_file=$(printf "%s\n" "$checkv_rep_files" | head -n 1)
    checkv_rep_count=$(printf "%s\n" "$checkv_rep_files" | sed '/^$/d' | wc -l)
    if [ "$checkv_rep_count" -ne 1 ]; then
        echo "Expected exactly one CheckV database in {input.checkv_db}, found $checkv_rep_count candidate(s)." >> {log}
        exit 1
    fi
    checkv_db_dir=$(dirname "$(dirname "$checkv_rep_file")")

    checkv end_to_end \
      {output.vs2_dir}/final-viral-combined.fa \
      {output.checkv_dir} \
      -t {resources.cpus} \
      -d "$checkv_db_dir" >> {log} 2>&1
    """

# Aggregate QUAST/CheckM/GTDB-Tk/Bakta results into a single MultiQC report.
# A MultiQC config is generated on the fly: the heredoc adds regex rules that
# rewrite the directory-derived sample names into readable labels.
rule multiqc:
  input:
    quast_dir = expand("02.post-processing/assembly_evaluation/{sample}", sample = SAMPLES),
    checkm_dir = expand("02.post-processing/completeness_evaluation/{sample}", sample = SAMPLES),
    gtdbtk_dir = expand("03.taxonomy/{sample}", sample = SAMPLES),
    bakta_dir = expand("04.annotation/bakta/{sample}", sample = SAMPLES)
  output:
    report_html = "08.report/multiqc_report.html",
    multiqc_dir = directory("08.report"),
    multiqc_yaml = "08.report/multiqc_config.yaml"
  conda:
    "envs/multiqc.yaml"
  message:
    "--- MultiQC: Aggregate results. ---"
  log:
    "logs/multiqc.log"
  shell:
    r"""
    set -euo pipefail

    mkdir -p {output.multiqc_dir}

    printf "%s\n" "show_analysis_paths: False" "show_analysis_time: False" > {output.multiqc_yaml}

  
    cat >> "{output.multiqc_yaml}" << 'EOF'
sample_names_replace_regex: true
sample_names_replace_exact: true

sample_names_replace:
  # assembly QC (Quast)
  '^02\.post-processing \| assembly_evaluation \| ([^|]+) \| contigs_sel$':  'assembly QC | \1'

  # annotation (Bakta)
  '^04\.annotation \| bakta \| ([^|]+) \| \1$': 'annotation | \1'
EOF

    multiqc \
      --config "{output.multiqc_yaml}" \
      -d \
      {input.quast_dir} \
      {input.checkm_dir} \
      {input.gtdbtk_dir} \
      {input.bakta_dir} \
      --outdir "{output.multiqc_dir}" \
      --filename "multiqc_report.html" > "{log}" 2>&1
    """
