print(r'''
________             _______________              
___  __ )_____ _________  ____/__  /___  _____  __
__  __  |  __ `/  ___/_  /_   __  /_  / / /_  |/_/
_  /_/ // /_/ // /__ _  __/   _  / / /_/ /__>  <  
/_____/ \__,_/ \___/ /_/      /_/  \__,_/ /_/|_|  
                                                  
BacFlux v1.1


Livio Antonielli, April 2024 
''')

# Modules:
import os
import sys
from urllib.parse import urlparse

# Configuration file (same directory as snakefile script):
configfile: 'config/config.yaml'

# Path to DBs and input/output directories in config file:
workdir: config['directories']['out_dir']
FASTQDIR = config['directories']['fastq_dir']
BLASTDB = config['directories']['blast_db']
DMNDDB = config['directories']['eggnog_db']
GTDBTKDB = config['directories']['gtdbtk_db']
BAKTADB = config['directories']['bakta_db']
PLATONDB = config['directories']['platon_db']

# AMR DBs (ABRicate):
DATABASES = ["argannot", "card", "ecoh", "ecoli_vf", "megares", "ncbi", "resfinder", "vfdb"]

# Hardware resources in config file:
CPUS = config['resources']['threads']
RAM = config['resources']['ram_gb']

# Sanity check of genus parameter:
if config['parameters'] is not None and "genus" in config['parameters']:
    genus = config['parameters']["genus"]
    if genus is not None and len(genus) > 0:
        print(f"The 'genus' parameter is specified in the config file with value: '{genus}'.")
    else:
        print("The 'genus' parameter value is not specified in the config file and will be inferred automatically.")
else:
    print("The 'genus' parameter is not present in the config file and contigs won't be filtered accordingly.")

# Sanity check of checkv db link parameter:
global id
id = None

if config['links'] is not None and "checkv_link" in config['links']:
    link = config['links']["checkv_link"]
    if link is not None and len(link) > 0:
        path = urlparse(link).path
        db = os.path.basename(path)
        id = os.path.splitext(os.path.splitext(db)[0])[0]
    else:
        print("The link to CheckV database is not specified and the latest version will be automatically downloaded.")
else:
    sys.stderr.write(f"The checkv_link parameter is not present. Please, check the config file.\n")
    sys.exit(0)

# Import FASTQ files from input dir:
SAMPLES, EXTENSIONS = glob_wildcards(os.path.join(FASTQDIR, '{sample}_R1.{extn}'))

if len(SAMPLES) == 0:
    sys.stderr.write(f"No files in {FASTQDIR}. Please, check directory.\n")
    sys.exit(0)
else:
    for sample in sorted(SAMPLES):
        print(f"Sample {sample} in process...")

# Check if files have nonunique extension:
for extension in EXTENSIONS:
  if extension.endswith(("fastq", "fq", "fastq.gz", "fq.gz")):
    if len(set(EXTENSIONS)) != 1:
      sys.stderr.write("More than one type of file extension detected\n\t")
      sys.stderr.write("\n\t".join(set(EXTENSIONS)))
      sys.exit(0)
  else:
    sys.stderr.write("\nFile format not recognized.\n")
    sys.exit(0)

# Create sample objects:
EXTN = EXTENSIONS[0]
R1 = '{sample}_R1.' + EXTN
R2 = '{sample}_R2.' + EXTN

rule all:
  input:
    fastp_json = expand("01.pre-processing/{sample}_fastp.json", sample = SAMPLES),
    qualimap_dir = expand("03.post-processing/mapping_evaluation/{sample}", sample = SAMPLES),
    quast_dir = expand("03.post-processing/assembly_evaluation/{sample}", sample = SAMPLES),
    checkm_dir = expand("03.post-processing/completeness_evaluation/{sample}", sample = SAMPLES),
    checkm_stats = expand("03.post-processing/completeness_evaluation/{sample}/checkm_stats.tsv", sample = SAMPLES),
    checkm_lineage = expand("03.post-processing/completeness_evaluation/{sample}/lineage.ms", sample = SAMPLES),
    gtdbtk_dir = expand("04.taxonomy/{sample}", sample = SAMPLES),
    prokka_dir = expand("05.annotation/prokka/{sample}", sample = SAMPLES),
    bakta_dir = expand("05.annotation/bakta/{sample}", sample = SAMPLES),
    eggnog_dir = expand("05.annotation/eggnog/{sample}", sample = SAMPLES),
    antismash_dir = expand("05.annotation/antismash/{sample}", sample = SAMPLES),
    amr_tab = expand("06.AMR/ABRicate/{sample}/{db}.tsv", sample = SAMPLES, db = DATABASES),
    amr_summary = expand("06.AMR/ABRicate/{sample}/AMR_summary.txt", sample = SAMPLES),
    covstats = expand("06.AMR/AMR_mapping/{sample}/{sample}_covstats.tsv", sample = SAMPLES),
    amr_legend = expand("06.AMR/AMR_mapping/{sample}/{sample}_AMR_legend.tsv", sample = SAMPLES),
    plasmid_dir = expand("07.plasmids/{sample}", sample = SAMPLES),
    plasmids = expand("07.plasmids/{sample}/verified_plasmids.txt", sample = SAMPLES),
    vs2_dir = expand("08.phages/virsorter/{sample}", sample = SAMPLES),
    checkv_dir = expand("08.phages/checkv/{sample}", sample = SAMPLES),
    multiqc_dir = "09.report"

rule download_phix:
  output:
    phix = temp("01.pre-processing/phix.fna.gz")
  params:
    link = config['links']['phix_link']
  message:
    "--- Download PhiX genome from NCBI. ---"
  log:
    "logs/download_phix.log"  
  shell:
    """
    wget {params.link} -O {output.phix} > {log} 2>&1
    """

rule build_phix:
  input:
    phix = "01.pre-processing/phix.fna.gz"
  output:
    idx1 = temp("01.pre-processing/phix.1.bt2"),
    idx2 = temp("01.pre-processing/phix.2.bt2"),
    idx3 = temp("01.pre-processing/phix.3.bt2"),
    idx4 = temp("01.pre-processing/phix.4.bt2"),
    ridx1 = temp("01.pre-processing/phix.rev.1.bt2"),
    ridx2 = temp("01.pre-processing/phix.rev.2.bt2")
  params:
    basename = "01.pre-processing/phix"
  conda:
    "envs/bowtie.yaml"
  message:
    "--- Bowtie2: Build PhiX genome db. ---"
  log:
    "logs/build_phix.log"
  shell:
    """
    bowtie2-build {input.phix} {params.basename} > {log} 2>&1
    """

rule map_phix:
  input:
    idx1 = "01.pre-processing/phix.1.bt2",
    idx2 = "01.pre-processing/phix.2.bt2",
    idx3 = "01.pre-processing/phix.3.bt2",
    idx4 = "01.pre-processing/phix.4.bt2",
    ridx1 = "01.pre-processing/phix.rev.1.bt2",
    ridx2 = "01.pre-processing/phix.rev.2.bt2",
    r1 = os.path.join(FASTQDIR, R1),
    r2 = os.path.join(FASTQDIR, R2)
  output:
    sam = temp("01.pre-processing/{sample}_contam.sam"),
    r1 = temp("01.pre-processing/{sample}.1.fastq"),
    r2 = temp("01.pre-processing/{sample}.2.fastq")
  params:
    db = temp("01.pre-processing/phix"),
    basename = "01.pre-processing/{sample}.fastq"
  resources:
    cpus = CPUS
  conda:
    "envs/bowtie.yaml"
  message:
    "--- Bowtie2: Map reads against PhiX genome db. ---"
  log:
    "logs/map_phix_{sample}.log"
  shell:
    """
    bowtie2 -x {params.db} -1 {input.r1} -2 {input.r2} \
    --threads {resources.cpus} --un-conc {params.basename} -S {output.sam} --local > {log} 2>&1
    """

rule trim_adapters:
  input:
    r1 = "01.pre-processing/{sample}.1.fastq",
    r2 = "01.pre-processing/{sample}.2.fastq"
  output:
    r1 = temp("01.pre-processing/{sample}_trim_R1.fastq"),
    r2 = temp("01.pre-processing/{sample}_trim_R2.fastq"),
    html = "01.pre-processing/{sample}_fastp.html",
    json = "01.pre-processing/{sample}_fastp.json"
  resources:
    cpus = 16
  conda:
    "envs/fastp.yaml"
  message:
    "--- Fastp: Remove adapters and quality filter. ---"
  log:
    "logs/trim_adapters_{sample}.log"
  shell:
    """
    fastp --detect_adapter_for_pe --length_required 100 --cut_front --cut_right --thread {resources.cpus} --verbose \
    -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} -j {output.json} -h {output.html} > {log} 2>&1
    """

rule genome_assembly:
  input:
    r1 = "01.pre-processing/{sample}_trim_R1.fastq",
    r2 = "01.pre-processing/{sample}_trim_R2.fastq"
  output:
    dir = directory("02.assembly/{sample}"),
    contigs = "02.assembly/{sample}/contigs.fasta"
  resources:
    cpus = CPUS,
    ram = RAM
  conda:
    "envs/spades.yaml"
  message:
    "--- SPAdes: genome assembly. ---"
  log:
    "logs/genome_assembly_{sample}.log"
  priority: 10
  shell:
    """
    OMP_NUM_THREADS={resources.cpus} spades.py -k 21,33,55,77,99,127 --isolate \
    --pe1-1 {input.r1} --pe1-2 {input.r2} -o {output.dir} -t {resources.cpus} -m {resources.ram} > {log} 2>&1
    """

FASTA_LIN_CMD = r"""{if(NR==1) {printf "%s\n", $0} else {if(/^>/) {printf "\n%s\n", $0} else {printf $0}}}"""
FASTA_SEL_CMD = r"""{if(/^>/ && $6>=2.0 && $4>=500) {printf "%s\n", $0; getline; print}}"""

rule filter_contigs:
  input:
    contigs = "02.assembly/{sample}/contigs.fasta"    
  output:
    contigs = "02.assembly/{sample}/contigs_filt.fasta"
  message:
    "Remove short contigs <500 bp and low coverage <2x."
  priority: 9
  shell:
    """
    cat {input.contigs} | \
    awk {FASTA_LIN_CMD:q} | \
    awk -F"_" {FASTA_SEL_CMD:q} > {output.contigs}
    """

rule index_contigs:
  input:
    contigs = "02.assembly/{sample}/contigs_filt.fasta"
  output:
    idx1 = temp("03.post-processing/{sample}_contigs.1.bt2"),
    idx2 = temp("03.post-processing/{sample}_contigs.2.bt2"),
    idx3 = temp("03.post-processing/{sample}_contigs.3.bt2"),
    idx4 = temp("03.post-processing/{sample}_contigs.4.bt2"),
    ridx1 = temp("03.post-processing/{sample}_contigs.rev.1.bt2"),
    ridx2 = temp("03.post-processing/{sample}_contigs.rev.2.bt2")
  params:
    basename = "03.post-processing/{sample}_contigs"
  conda:
    "envs/bowtie.yaml"
  message:
    "--- Bowtie2: Build contig db. ---"
  log:
    "logs/index_contigs_{sample}.log"
  priority: 8
  shell:
    """
    bowtie2-build -f {input.contigs} {params.basename} > {log} 2>&1
    """

rule map_contigs:
  input:
    idx1 = "03.post-processing/{sample}_contigs.1.bt2",
    idx2 = "03.post-processing/{sample}_contigs.2.bt2",
    idx3 = "03.post-processing/{sample}_contigs.3.bt2",
    idx4 = "03.post-processing/{sample}_contigs.4.bt2",
    ridx1 = "03.post-processing/{sample}_contigs.rev.1.bt2",
    ridx2 = "03.post-processing/{sample}_contigs.rev.2.bt2",
    r1 = "01.pre-processing/{sample}_trim_R1.fastq",
    r2 = "01.pre-processing/{sample}_trim_R2.fastq"
  output:
    bam = temp("03.post-processing/{sample}_map.bam"),
    bai = temp("03.post-processing/{sample}_map.bam.bai"),
    csi = temp("03.post-processing/{sample}_map.bam.csi")
  params:
    db = temp("03.post-processing/{sample}_contigs")
  resources:
    cpus = CPUS
  conda:
    "envs/bowtie.yaml"
  message:
    "--- Bowtie2: Map reads against contigs. ---"
  log:
    "logs/map_contigs_{sample}.log"
  priority: 8
  shell:
    """
    bowtie2 -x {params.db} -1 {input.r1} -2 {input.r2} -p {resources.cpus} -t 2> {log} | \
    samtools view -@ {resources.cpus} -hbS - | \
    samtools sort -@ {resources.cpus} --write-index -o {output.bam} - >> {log} 2>&1
    samtools index -b {output.bam} -@ {resources.cpus} >> {log} 2>&1
    """

rule map_evaluation:
  input:
    bam = "03.post-processing/{sample}_map.bam"
  output:
    qualimap_dir = directory("03.post-processing/mapping_evaluation/{sample}")
  conda:
    "envs/qualimap.yaml"
  message:
    "--- Qualimap: Mapping evaluation. ---"
  log:
    "logs/map_evaluation_{sample}.log"
  shell:
    """
    qualimap bamqc -bam {input.bam} -outdir {output.qualimap_dir} -outformat html > {log} 2>&1
    """

rule blast_contigs:
  input:
    contigs = "02.assembly/{sample}/contigs_filt.fasta"
  output:
    blast = "03.post-processing/contaminants/{sample}/blastout"
  params:
    dir = BLASTDB,
    db = os.path.join(BLASTDB, "nt")
  resources:
    cpus = CPUS
  conda:
    "envs/blast.yaml"
  message:
    "--- BLAST: Contigs against NCBI nt db. ---"
  log:
    "logs/blast_contigs_{sample}.log"
  priority: 7
  shell:
    """
    BLASTDB={params.dir} blastn -task megablast -query {input.contigs} -db {params.db} -outfmt \
    '6 qseqid staxids bitscore pident evalue length qlen slen qcovs qcovhsp sskingdoms scomnames sscinames sblastnames stitle' \
    -num_threads {resources.cpus} -evalue 1e-5 -max_target_seqs 100 -max_hsps 10 \
    -out {output.blast} > {log} 2>&1
    """

rule blob_json:
  input:
    contigs = "02.assembly/{sample}/contigs_filt.fasta",
    bam = "03.post-processing/{sample}_map.bam",
    bai = "03.post-processing/{sample}_map.bam.bai",
    blast = "03.post-processing/contaminants/{sample}/blastout",
    nodes = os.path.join(BLASTDB, "nodes.dmp"),
    names = os.path.join(BLASTDB, "names.dmp")
  output:
    json = temp("03.post-processing/contaminants/{sample}/blob.blobDB.json"),
    cov = temp("03.post-processing/contaminants/{sample}/blob.{sample}_map.bam.cov")
  params:
    basename = "03.post-processing/contaminants/{sample}/blob"
  conda:
    "envs/blobtools.yaml"
  message:
    "--- BlobTools: Screen BLAST hits for contaminants. ---"
  log:
    "logs/blob_table_{sample}.log"
  priority: 7
  shell:
    """
    blobtools create -i {input.contigs} -b {input.bam} -t {input.blast} --nodes {input.nodes} --names {input.names} \
    -o {params.basename} > {log} 2>&1
    """

rule blob_table:
  input:
    json = "03.post-processing/contaminants/{sample}/blob.blobDB.json"
  output:
    bestscore = "03.post-processing/contaminants/{sample}/bestscore.blob.blobDB.table.txt"
  params:
    basename = "03.post-processing/contaminants/{sample}/bestscore"
  conda:
    "envs/blobtools.yaml"
  message:
    "--- BlobTools: Collapse taxonomic assignment of BLAST hits according to sum of best scores. ---"
  log:
    "logs/blob_table_{sample}.log"
  priority: 7
  shell:
    """
    blobtools view --input {input.json} --out {params.basename} --taxrule bestsum --rank all --hits >> {log} 2>&1
    """

# Execute either one rule or another according to presence/absence of 'genus' parameter
if config.get('parameters') is not None and "genus" in config['parameters'] and config['parameters']['genus'] is not None and len(config['parameters']['genus']) > 0:
  rule:
    input:
      bestscore = "03.post-processing/contaminants/{sample}/bestscore.blob.blobDB.table.txt",
      contigs = "02.assembly/{sample}/contigs_filt.fasta"
    output:
      abund = "03.post-processing/contaminants/{sample}/{sample}_composition.txt",
      list = "03.post-processing/contaminants/{sample}/contigs.list",
      contigs = "02.assembly/{sample}/contigs_sel.fasta"
    params:
      genus = config['parameters']['genus']    
    priority: 6
    shell:
      """
      for i in $(cat {input.bestscore} | sed '1,11d' | cut -f 22 | sort -u); do \
      cat {input.bestscore} | sed '1,11d' | awk -v var=$i 'BEGIN {{printf "%s%s", var, ": "}} $22 == var {{count++}} END {{printf "%.2f\\n", count/NR}}'; \
      done > {output.abund}
      echo "Sample {wildcards.sample} composition:"
      cat {output.abund}
      awk -v var="{params.genus}" 'tolower($22) ~ tolower("[:alpha:]*"var) {{print $1}}' {input.bestscore} > {output.list}
      grep -A1 -f {output.list} {input.contigs} | sed '/--/d' > {output.contigs}
      """
elif config.get('parameters') is not None and "genus" in config['parameters'] and (config['parameters']['genus'] is None or len(config['parameters']['genus']) == 0):
  rule:
    input:
      bestscore = "03.post-processing/contaminants/{sample}/bestscore.blob.blobDB.table.txt",
      contigs = "02.assembly/{sample}/contigs_filt.fasta"
    output:
      abund = "03.post-processing/contaminants/{sample}/{sample}_composition.txt",
      list = "03.post-processing/contaminants/{sample}/contigs.list",
      contigs = "02.assembly/{sample}/contigs_sel.fasta"
    priority: 6
    shell:
      """
      for i in $(cat {input.bestscore} | sed '1,11d' | cut -f 22 | sort -u); do \
      cat {input.bestscore} | sed '1,11d' | awk -v var=$i 'BEGIN {{printf "%s%s", var, ": "}} $22 == var {{count++}} END {{printf "%.2f\\n", count/NR}}'; \
      done > {output.abund}
      echo "Sample {wildcards.sample} composition:"
      cat {output.abund}
      for i in $(cat {output.abund} | sort -t':' -k2 -nr | cut -d':' -f1 | sed -n '1p' | sed -e 's/Para//;s/Pseudo//;s/Paen//;s/Paeni//;s/Brady//;s/Meso//;s/Neo//;s/Sino//;s/Aeri//;s/Caldi//;s/Geo//' | tr '[:upper:]' '[:lower:]'); do \
      awk -v var="$i" 'tolower($22) ~ tolower("[:alpha:]*"var) {{print $1}}' {input.bestscore}; \
      done > {output.list}
      grep -A1 -f {output.list} {input.contigs} | sed '/--/d' > {output.contigs}
      """
else:
  rule:
    input:
      bestscore = "03.post-processing/contaminants/{sample}/bestscore.blob.blobDB.table.txt",
      contigs = "02.assembly/{sample}/contigs_filt.fasta"
    output:
      abund = "03.post-processing/contaminants/{sample}/{sample}_composition.txt",
      list = "03.post-processing/contaminants/{sample}/contigs.list",
      contigs = "02.assembly/{sample}/contigs_sel.fasta"
    params:
      nohit = "no-hit"   
    priority: 6
    shell:
      """
      for i in $(cat {input.bestscore} | sed '1,11d' | cut -f 22 | sort -u); do \
      cat {input.bestscore} | sed '1,11d' | awk -v var=$i 'BEGIN {{printf "%s%s", var, ": "}} $22 == var {{count++}} END {{printf "%.2f\\n", count/NR}}'; \
      done > {output.abund}
      echo "Sample {wildcards.sample} composition:"
      cat {output.abund}
      awk -v var="{params.nohit}" 'tolower($22) !~ tolower("[:alpha:]*"var) {{print $1}}' {input.bestscore} > {output.list}
      grep -A1 -f {output.list} {input.contigs} | sed '/--/d' > {output.contigs}
      """

rule genome_assembly_evaluation:
  input:
    contigs = "02.assembly/{sample}/contigs_sel.fasta"
  output:
    quast_dir = directory("03.post-processing/assembly_evaluation/{sample}")
  resources:
    cpus = CPUS
  conda:
    "envs/quast.yaml"
  message:
    "--- QUAST: Genome assembly evaluation. ---"
  log:
    "logs/assembly_evaluation_{sample}.log"
  shell:
    """
    quast {input.contigs} -o {output.quast_dir} --no-icarus -t {resources.cpus} > {log} 2>&1 
    """

rule completeness_and_contamination:
  input:
    contigs = "02.assembly/{sample}/contigs.fasta",
    contigs_filt = "02.assembly/{sample}/contigs_filt.fasta",
    contigs_sel = "02.assembly/{sample}/contigs_sel.fasta"
  output:
    checkm_dir = directory("03.post-processing/completeness_evaluation/{sample}"),
    checkm_stats = "03.post-processing/completeness_evaluation/{sample}/checkm_stats.tsv",
    checkm_lineage = "03.post-processing/completeness_evaluation/{sample}/lineage.ms"
  resources:
    cpus = CPUS
  conda:
    "envs/checkm.yaml"
  message:
    "--- CheckM: Assessment of genome completenness and contamination. ---"
  log:
    "logs/completenness_and_contamination_{sample}.log"
  priority: 5
  shell:
    """
    cp {input.contigs} {input.contigs_filt} {input.contigs_sel} {output.checkm_dir}
    checkm lineage_wf -t {resources.cpus} -x fasta {output.checkm_dir} {output.checkm_dir} > {log} 2>&1
    checkm qa -o 2 -t {resources.cpus} --tab_table -f {output.checkm_stats} {output.checkm_lineage} {output.checkm_dir} >> {log} 2>&1
    """

rule taxonomic_assignment:
  input:
    checkm_dir = "03.post-processing/completeness_evaluation/{sample}",
  output:
    gtdbtk_dir = directory("04.taxonomy/{sample}")
  params:
    gtdbtk_db = GTDBTKDB
  resources:
    cpus = CPUS,
    cpus_p = min(CPUS, 64)
  conda:
    "envs/gtdbtk.yaml"
  message:
    "--- GTDB-Tk: Taxonomic assignment. ---"
  log:
    "logs/taxonomic_assignment_{sample}.log"
  priority: 4
  shell:
    """
    GTDBTK_DATA_PATH={params.gtdbtk_db:q} \
    gtdbtk classify_wf -x fasta --genome_dir {input.checkm_dir} --cpus {resources.cpus} --pplacer_cpus {resources.cpus_p} --mash_db {params.gtdbtk_db:q}/mash/gtdb-tk_r214.msh \
    --out_dir {output.gtdbtk_dir} > {log} 2>&1
    rm -rf {input.checkm_dir}/contigs*.fasta
    """

rule legacy_annotation:
  input:
    contigs = "02.assembly/{sample}/contigs_sel.fasta",
    abund = "03.post-processing/contaminants/{sample}/{sample}_composition.txt"
  output:
    prokka_dir = directory("05.annotation/prokka/{sample}")
  resources:
    cpus = CPUS
  conda:
    "envs/prokka.yaml"
  message:
    "--- PROKKA: Genome annotation. ---"
  log:
    "logs/legacy_annotation_{sample}.log"
  priority: 5
  shell:
    """
    for i in $(cat {input.abund} | sort -t':' -k2 -nr | cut -d':' -f1 | sed -n '1p'); do \
    prokka --kingdom Bacteria --genus $i --species sp. --strain {wildcards.sample} \
    --usegenus --gcode 11 --rfam --compliant --addgenes --mincontiglen 500 \
    --centre AIT --locustag {wildcards.sample} --prefix {wildcards.sample} \
    --outdir {output.prokka_dir} --cpus {resources.cpus} --force {input.contigs}; \
    done > {log} 2>&1
    """

rule accurate_annotation:
  input:
    contigs = "02.assembly/{sample}/contigs_sel.fasta",
    abund = "03.post-processing/contaminants/{sample}/{sample}_composition.txt"
  output:
    bakta_dir = directory("05.annotation/bakta/{sample}")
  params:
    bakta_db = BAKTADB
  resources:
    cpus = CPUS
  conda:
    "envs/bakta.yaml"
  message:
    "--- Bakta: Genome annotation. ---"
  log:
    "logs/accurate_annotation_{sample}.log"
  priority: 5
  shell:
    """
    for i in $(cat {input.abund} | sort -t':' -k2 -nr | cut -d':' -f1 | sed -n '1p'); do \
    bakta --db {params.bakta_db} --verbose --genus $i --species sp. --strain {wildcards.sample} \
    --translation-table 11 --min-contig-length 500 \
    --locus-tag {wildcards.sample} --prefix {wildcards.sample} \
    --output {output.bakta_dir} --threads {resources.cpus} --force {input.contigs}; \
    done > {log} 2>&1
    """    

rule functional_annotation:
  input:
    prokka_dir = "05.annotation/prokka/{sample}"
  output:
    temp_dir = temp(directory("05.annotation/eggnog/{sample}/eggnog_tmp")),
    eggnog_dir = directory("05.annotation/eggnog/{sample}")
  params:
    dmnd_db = DMNDDB
  resources:
    cpus = CPUS
  conda:
    "envs/eggnog-mapper.yaml"
  message:
    "--- EggNOG: Functional annotation. ---"
  log:
    "logs/functional_annotation_{sample}.log"
  priority: 4
  shell:
    """
    mkdir -p {output.temp_dir} {output.eggnog_dir}
    emapper.py -i {input.prokka_dir}/{wildcards.sample}.faa --output_dir {output.eggnog_dir} \
    --cpu {resources.cpus} -m diamond --data_dir {params.dmnd_db} \
    --output {wildcards.sample} --temp_dir {output.temp_dir} --override > {log} 2>&1
    """

rule secondary_metabolites:
  input:
    bakta_dir = "05.annotation/bakta/{sample}"
  output:
    antismash_dir = directory("05.annotation/antismash/{sample}")
  params:
    taxon = 'bacteria',
    genefinding_tool = 'none'
  conda:
    "envs/antismash.yaml"
  message:
    "--- antiSMASH: secondary metabolite annotation. ---"
  log:
    "logs/secondary_metabolites_{sample}.log"
  priority: 4
  shell:
    """
    download-antismash-databases > {log} 2>&1
    antismash --taxon {params.taxon} --output-dir {output.antismash_dir} --output-basename {wildcards.sample} \
    --genefinding-tool {params.genefinding_tool} --genefinding-gff3 {input.bakta_dir}/{wildcards.sample}.gff3 {input.bakta_dir}/{wildcards.sample}.gbff >> {log} 2>&1
    """    

for sample in SAMPLES:
  for db in DATABASES:
    rule:
      input:
        contigs = expand("02.assembly/{sample}/contigs_sel.fasta", sample = sample)
      output:
        amr_tab = expand("06.AMR/ABRicate/{sample}/{db}.tsv", sample = sample, db = db)
      params:
        db = db
      conda:
        "envs/abricate.yaml"
      message:
        "--- ABRicate: AMR detection. ---"
      log:
        expand("logs/amr_{db}_in_{sample}_contigs.log", sample = sample, db = db)
      priority: 4
      shell:
        """
        abricate --db {params.db} {input.contigs} --nopath --quiet > {output.amr_tab} 2> {log}
        """

rule AMR_summary:
  input:
    argannot = "06.AMR/ABRicate/{sample}/argannot.tsv",
    card = "06.AMR/ABRicate/{sample}/card.tsv",
    ecoh = "06.AMR/ABRicate/{sample}/ecoh.tsv",
    ecoli_vf = "06.AMR/ABRicate/{sample}/ecoli_vf.tsv",
    megares = "06.AMR/ABRicate/{sample}/megares.tsv",
    ncbi = "06.AMR/ABRicate/{sample}/ncbi.tsv",
    resfinder = "06.AMR/ABRicate/{sample}/resfinder.tsv",
    vfdb = "06.AMR/ABRicate/{sample}/vfdb.tsv"
  output:
    amr_summary = "06.AMR/ABRicate/{sample}/AMR_summary.txt"
  conda:
    "envs/abricate.yaml"
  priority: 3
  shell:
    """
    abricate --summary \
    {input.argannot} {input.card} {input.ecoh} \
    {input.ecoli_vf} {input.megares} {input.ncbi} \
    {input.resfinder} {input.vfdb} > {output.amr_summary}
    """

rule download_amr_db:
  output:
    card_db = temp("06.AMR/AMR_db/card.tar.bz2"),
    card_dir = temp(directory("06.AMR/AMR_db"))
  params:
    link = config['links']['card_link'],
  message:
    "--- Download AMR features from CARD repository. ---"
  log:
    "logs/download_amr.log"
  priority: 10
  shell:
    """
    wget {params.link} -O {output.card_db} > {log} 2>&1
    tar -xjvf {output.card_db} -C {output.card_dir} >> {log} 2>&1
    """

rule map_amr_db:
  input:
    card_dir = "06.AMR/AMR_db",
    r1 = "01.pre-processing/{sample}_trim_R1.fastq",
    r2 = "01.pre-processing/{sample}_trim_R2.fastq"
  output:
    bbmap_temp = temp(directory("06.AMR/AMR_mapping/{sample}/ref")),  
    covstats_temp = temp("06.AMR/AMR_mapping/{sample}/{sample}_covstats_temp.tsv"),
    covstats = "06.AMR/AMR_mapping/{sample}/{sample}_covstats.tsv",
    amr_legend = "06.AMR/AMR_mapping/{sample}/{sample}_AMR_legend.tsv"
  params:
    card_target = "nucleotide_fasta_protein_homolog_model.fasta",
    min_id = 0.99
  resources:
    cpus = CPUS
  conda:
    "envs/bbmap.yaml"
  log:
    "logs/map_amr_{sample}.log"  
  priority: 9
  message:
    "--- Map trimmed reads against CARD db. ---"
  shell:
    """
    bbmap.sh -in={input.r1} -in2={input.r2} ref={input.card_dir}/{params.card_target} path={output.bbmap_temp} \
    idfilter={params.min_id} idtag -Xmx64g threads={resources.cpus} ambiguous=best secondary=f \
    covstats={output.covstats_temp} > {log} 2>&1

    (head -n 1 {output.covstats_temp} > {output.covstats}) && \
    tail -n +2 {output.covstats_temp} | awk -F'\t' '{{print $5 "\t" $0}}' | sort -t$'\t' -k1,1nr | cut -f2- >> \
    {output.covstats}

    echo "#AMR features with a covered length of at least 70%" > {output.amr_legend}
    (head -n 1 {input.card_dir}/aro_index.tsv >> {output.amr_legend}) && \
    for i in $(tail -n +2 {output.covstats} | awk -F'\t' '$5 >=70' | cut -f1 | awk -F'|' '{{print $5}}'); do \
        grep $i {input.card_dir}/aro_index.tsv
    done >> {output.amr_legend}
    """

rule plasmid_search:
  input:
    contigs = "02.assembly/{sample}/contigs_sel.fasta",
    blast = "03.post-processing/contaminants/{sample}/blastout"
  output:
    plasmid_dir = directory("07.plasmids/{sample}"),
    plasmids = "07.plasmids/{sample}/verified_plasmids.txt"
  params:
    platon_db = PLATONDB
  resources:
    cpus = CPUS
  conda:
    "envs/platon.yaml"
  message:
    "--- Platon: Plasmid identification. ---"
  log:
    "logs/plasmid_search_{sample}.log"
  priority: 4
  shell:
    """
   platon --db {params.platon_db} --output {output.plasmid_dir} --verbose --threads {resources.cpus} {input.contigs} > {log} 2>&1
    
    if [[ -s {output.plasmid_dir}/contigs_sel.plasmid.fasta ]] && grep -q ">" {output.plasmid_dir}/contigs_sel.plasmid.fasta; then
        while IFS= read -r i; do
            if grep -m 1 "$i" {input.blast} | grep -q "plasmid"; then
                echo "$i" >> {output.plasmids}
            else
                echo "Platon's hints for {wildcards.sample} sample were not verified by BLAST search." >> {output.plasmids}
            fi
        done < <(grep ">" {output.plasmid_dir}/contigs_sel.plasmid.fasta | sed 's/^>//g')
    else
        echo "Platon found no plasmid in {wildcards.sample} sample." > {output.plasmids}
    fi
    """

rule viral_db:
  output:
    vs2_db = temp(directory("08.phages/vs2_db")),
    checkv_db = temp(directory("08.phages/checkv_db"))
  resources:
    cpus = 4
  conda:
    "envs/virsorter.yaml"
  params:
    checkv_link = config['links']['checkv_link'] if 'checkv_link' in config['links'] else None,
    tries = 5,
    db_id = id
  message:
    """
    --- Download VirSort2 database ---
    --- Download CheckV database ---
    """
  log:
    "logs/viral_databases.log"
  priority: 9
  shell:
    """
    virsorter setup -d {output.vs2_db} -j {resources.cpus} > {log} 2>&1
    if [ -z "{params.checkv_link}" ]; then
        checkv download_database {output.checkv_db} >> {log} 2>&1
    else
        wget --tries={params.tries} -c {params.checkv_link} -P {output.checkv_db} >> {log} 2>&1
        tar -xzvf {output.checkv_db}/{params.db_id}.tar.gz -C {output.checkv_db} >> {log} 2>&1
        diamond makedb --in {output.checkv_db}/{params.db_id}/genome_db/checkv_reps.faa \
        --db {output.checkv_db}/{params.db_id}/genome_db/checkv_reps >> {log} 2>&1
    fi
    """

rule viral_identification:
  input:
    contigs = "02.assembly/{sample}/contigs_filt.fasta",
    vs2_db = "08.phages/vs2_db",
    checkv_db = "08.phages/checkv_db"
  output:
    vs2_dir = directory("08.phages/virsorter/{sample}"),
    checkv_dir = directory("08.phages/checkv/{sample}")
  params:
    viral_groups = "dsDNAphage,NCLDV,RNA,ssDNA,lavidaviridae",
    min_score = 0.5,
    checkv_link = config['links']['checkv_link'] if 'checkv_link' in config['links'] else None,
    db_id = id
  resources:
    cpus = CPUS
  conda:
    "envs/virsorter.yaml"
  message:
    """
    --- VirSorter2: Identification of phages and prophages. ---
    --- CheckV: Quality assessment of viral genomes. ---
    """
  log:
    "logs/viral_identification_{sample}.log"
  priority: 8
  shell:
    """
    virsorter run  -i {input.contigs} -w {output.vs2_dir} -d {input.vs2_db} \
    --keep-original-seq --include-groups {params.viral_groups} --min-score {params.min_score} -j {resources.cpus} all > {log} 2>&1

    if [ -z "{params.checkv_link}" ]; then
        checkv end_to_end {output.vs2_dir}/final-viral-combined.fa {output.checkv_dir} \
      -t {resources.cpus} -d {input.checkv_db}/{params.db_id} >> {log} 2>&1 
    else
        checkv end_to_end {output.vs2_dir}/final-viral-combined.fa {output.checkv_dir} \
      -t {resources.cpus} -d {input.checkv_db}/checkv-db-v1.5 >> {log} 2>&1    
    fi
    """

rule multiqc:
  input:
    fastp_json = expand("01.pre-processing/{sample}_fastp.json", sample = SAMPLES),
    qualimap_dir = expand("03.post-processing/mapping_evaluation/{sample}", sample = SAMPLES),
    quast_dir = expand("03.post-processing/assembly_evaluation/{sample}", sample = SAMPLES),
    prokka_dir = expand("05.annotation/prokka/{sample}", sample = SAMPLES),
    bakta_dir = expand("05.annotation/bakta/{sample}", sample = SAMPLES)
  output:
    multiqc_dir = directory("09.report"),
    multiqc_yaml = temp("09.report/multiqc_config.yaml")
  conda:
    "envs/multiqc.yaml"
  message:
    "--- MultiQC: Aggregate results. ---"
  log:
    "logs/multiqc.log"
  shell:
    """
    printf "%s\n" "show_analysis_paths: False" "show_analysis_time: False" > {output.multiqc_yaml}
    multiqc --config {output.multiqc_yaml} -d -dd 1 {input.fastp_json} {input.qualimap_dir} {input.quast_dir} {input.prokka_dir} {input.bakta_dir} \
    --outdir {output.multiqc_dir} > {log} 2>&1
    """
