# Workflow configuration. The rules in this file read the following keys:
#   data, pairing_quality, mismatch_primer, length_min, length_max,
#   mismatch_obiclean, pourcentage_obiclean, database, taxo
configfile: "config_metabarcoding"

# Default target: requests, for every entry of config["data"], the two final
# products of the workflow -- "<data>_monoreplicat.fasta" and
# "<data>_assign.csv".
rule all:
	input:
		# `suffix` is listed first so the expansion yields every
		# monoreplicat file before the assign files.
		expand(
			"{data}_{suffix}",
			suffix=["monoreplicat.fasta", "assign.csv"],
			data=config["data"]
		)

# Runs `illuminapairedend` on the R1/R2 FASTQ pair of a dataset and writes its
# standard output to "<data>_paired_all.fastq". The minimum score passed via
# --score-min comes from config["pairing_quality"].
#
# The inputs are named r1/r2 rather than forward/reverse: Snakemake exposes
# named inputs as attributes of a list subclass and refuses names that shadow
# list methods (such as `reverse`) as "reserved for internal use".
rule Paired_reads_assembly:
	input:
		r1="{data}_R1.fastq",
		r2="{data}_R2.fastq"
	output:
		"{data}_paired_all.fastq"
	params:
		score_min=config["pairing_quality"]
	shell:
		"illuminapairedend {input.r1} -r {input.r2} "
		"--sanger --fastq-output --uppercase "
		"--score-min={params.score_min} > {output}"

# Filters "<data>_paired_all.fastq" with `obigrep`, keeping only the records
# that satisfy the predicate mode!="joined", and writes the result to
# "<data>_paired_trie.fastq".
rule Take_out_well_assembled_sequences:
	input:
		assembled="{data}_paired_all.fastq"
	output:
		kept="{data}_paired_trie.fastq"
	shell:
		"obigrep -p 'mode!=\"joined\"' "
		"--sanger --nuc --uppercase "
		"{input.assembled} > {output.kept}"

# Runs `ngsfilter` on the filtered reads using the tag list
# "<data>_tags_liste.txt". Standard output goes to "<data>_primerless.fastq";
# the file given to -u is "<data>_unidentified.fastq". The value passed to -e
# comes from config["mismatch_primer"].
rule Taking_out_primers:
	input:
		reads="{data}_paired_trie.fastq",
		tag_list="{data}_tags_liste.txt"
	output:
		assigned="{data}_primerless.fastq",
		unassigned="{data}_unidentified.fastq"
	params:
		max_errors=config["mismatch_primer"]
	shell:
		"ngsfilter -e {params.max_errors} "
		"--sanger --nuc --fastq-output --uppercase "
		"-t {input.tag_list} -u {output.unassigned} "
		"{input.reads} > {output.assigned}"

# Runs `obiuniq -m sample` on "<data>_primerless.fastq" and stores its
# standard output as "<data>_dereplic.fasta".
rule Dereplication:
	input:
		reads="{data}_primerless.fastq"
	output:
		uniques="{data}_dereplic.fasta"
	shell:
		"obiuniq -m sample --sanger --nuc "
		"{input.reads} > {output.uniques}"

# Filters "<data>_dereplic.fasta" with `obigrep`, passing
# config["length_min"] to -l and config["length_max"] to -L, and writes the
# result to "<data>_length.fasta".
rule Length_verifying:
	input:
		sequences="{data}_dereplic.fasta"
	output:
		in_range="{data}_length.fasta"
	params:
		shortest=config["length_min"],
		longest=config["length_max"]
	shell:
		"obigrep -l {params.shortest} -L {params.longest} "
		"--fasta --nuc --uppercase "
		"{input.sequences} > {output.in_range}"

# Keeps, from "<data>_length.fasta", the records for which
# len(sequence["merged_sample"]) is greater than 1 and writes them to
# "<data>_replicatCheck.fasta".
rule Taking_out_monoreplicats:
	input:
		sequences="{data}_length.fasta"
	output:
		replicated="{data}_replicatCheck.fasta"
	shell:
		"obigrep -p 'len(sequence[\"merged_sample\"])>1' "
		"--fasta --nuc --uppercase "
		"{input.sequences} > {output.replicated}"

# Keeps, from "<data>_length.fasta", the records for which
# len(sequence["merged_sample"]) equals 1 and writes them to
# "<data>_monoreplicat.fasta" (one of the targets of rule `all`).
rule File_with_monoreplicats:
	input:
		sequences="{data}_length.fasta"
	output:
		singletons="{data}_monoreplicat.fasta"
	shell:
		"obigrep -p 'len(sequence[\"merged_sample\"])==1' "
		"--fasta --nuc --uppercase "
		"{input.sequences} > {output.singletons}"

# Runs `obiclean` on "<data>_replicatCheck.fasta" and writes its standard
# output to "<data>_obiclean_annote.fasta". The -d value comes from
# config["mismatch_obiclean"] and the -r value from
# config["pourcentage_obiclean"].
rule Sequencing_errors_detection:
	input:
		sequences="{data}_replicatCheck.fasta"
	output:
		annotated="{data}_obiclean_annote.fasta"
	params:
		distance=config["mismatch_obiclean"],
		ratio=config["pourcentage_obiclean"]
	shell:
		"obiclean -d{params.distance} -r{params.ratio} "
		"--fasta --nuc --uppercase "
		"{input.sequences} > {output.annotated}"

# Filters "<data>_obiclean_annote.fasta" with `obigrep -a`, keeping the
# records matching 'obiclean_head:True', and writes them to
# "<data>_clean.fasta".
rule Taking_out_sequencing_errors:
	input:
		annotated="{data}_obiclean_annote.fasta"
	output:
		heads="{data}_clean.fasta"
	shell:
		"obigrep -a 'obiclean_head:True' "
		"--fasta --nuc --uppercase "
		"{input.annotated} > {output.heads}"

# Calls the script `Tableau_info_seq.py` with "<data>_clean.fasta" as first
# argument and "<data>_sample_info.csv" as second argument.
# NOTE(review): the script is invoked by bare name, so it presumably has to
# be executable and reachable through PATH -- confirm.
rule Sample_info_file_creation:
	input:
		sequences="{data}_clean.fasta"
	output:
		table="{data}_sample_info.csv"
	shell:
		"Tableau_info_seq.py "
		"{input.sequences} {output.table}"

# Runs `blastn` with "<data>_clean.fasta" as query against the database named
# by config["database"], using word size 4. Results are written to
# "<data>_blast_result.txt" in output format 6 with the columns
# qseqid, qlen, sseqid, slen, length, nident, mismatch, gaps.
rule Taxonomical_assignment:
	input:
		query="{data}_clean.fasta"
	output:
		hits="{data}_blast_result.txt"
	params:
		db=config["database"]
	shell:
		"blastn -query {input.query} -db {params.db} -out {output.hits} "
		"-outfmt \"6 qseqid qlen sseqid slen length nident mismatch gaps\" "
		"-word_size 4"

# Runs `Rscript Assign_Metab.R` with four positional arguments, in order:
# the BLAST result file, config["taxo"], the sample info table, and the
# output path "<data>_assign.csv" (one of the targets of rule `all`).
# NOTE(review): Assign_Metab.R is given as a relative path, so it is
# presumably resolved against the working directory -- confirm.
rule Blast_results_parsing_and_analyzing:
	input:
		blast_hits="{data}_blast_result.txt",
		sample_table="{data}_sample_info.csv"
	output:
		assignment="{data}_assign.csv"
	params:
		taxonomy=config["taxo"]
	shell:
		"Rscript Assign_Metab.R "
		"{input.blast_hits} {params.taxonomy} "
		"{input.sample_table} {output.assignment}"

# End of workflow

