#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import glob
import os

import pandas as pd

##### Load config and sample sheets #####
onsuccess:
    # Handler executed by Snakemake after the workflow finishes without errors.
    print("splitFASTQ completed successfully!")

configfile: "config/config.yaml"

## Read in the samplesheet and convert all columns to strings
samples = pd.read_table(config["samplesheet"]).astype(str)

## groupBy may be given as a single column name or as a list of column names
group_cols = config['groupBy']
if isinstance(group_cols, str):
    group_cols = [group_cols]
else:
    group_cols = list(group_cols)

## Fail early and name the missing columns instead of raising a bare KeyError later
required_cols = ['Sequencing_Directory', 'Read1', 'Read2'] + group_cols
missing_cols = [col for col in required_cols if col not in samples.columns]
if missing_cols:
    raise ValueError(
        "Samplesheet " + str(config["samplesheet"])
        + " is missing column(s): " + ", ".join(map(str, missing_cols))
    )

## Concatenate the sequencing directory to Read1 and Read2 for full paths
for read_col in ('Read1', 'Read2'):
    samples[read_col] = [
        os.path.join(seq_dir, fastq)
        for seq_dir, fastq in zip(samples['Sequencing_Directory'], samples[read_col])
    ]

## Concatenate columns to identify which groups to run (i.e. Seq_Rep will be run together)
samples['id'] = samples[group_cols].agg('_'.join, axis=1)

## Group by id and extract Read1 & Read2 (dict: id -> list of FASTQ paths)
samples_by_id = samples.groupby('id')
read1 = samples_by_id['Read1'].apply(list).to_dict()
read2 = samples_by_id['Read2'].apply(list).to_dict()

##### Define rules #####
## Target rule: request one "done" marker per group id and per read (R1, R2)
rule all:
    input:
        expand(
            "output/{group}/{group}_split{R}_done.txt",
            group=list(read1),
            R=['R1', 'R2']
        )

## Decompress all Read1 FASTQs of a group into one stream and split it into
## pigz-compressed chunks of `splitsize` lines each under
## output/<group>/splitsR1/; the output file lists the chunks that were written.
## - The input dict is indexed (not .get) so an unknown group raises a KeyError
##   instead of silently handing the shell command an empty input.
## - The splits directory is removed first so chunks left over from an earlier
##   run (e.g. with a different splitsize) are not listed in the done file.
## - The whole gunzip | split pipeline is redirected to the log, so gunzip
##   errors are captured as well.
## NOTE(review): splitsize presumably has to be a multiple of 4 so that FASTQ
## records are not cut across chunks -- confirm in config.
rule splitR1:
    input:
        lambda wildcards: read1[wildcards.group]
    output:
        'output/{group}/{group}_splitR1_done.txt'
    params:
        splitsize = config['splitsize']
    threads: 8
    benchmark:
        'output/{group}/benchmarks/{group}_splitR1.tsv'
    log:
        err = 'output/{group}/logs/{group}_splitR1.err'
    shell:
        """
        module load pigz;
        rm -rf output/{wildcards.group}/splitsR1;
        mkdir -p output/{wildcards.group}/splitsR1;
        (gunzip -c {input} | split -a 3 -l {params.splitsize} -d --additional-suffix=_R1.fastq --filter='pigz -p {threads} > output/{wildcards.group}/splitsR1/$FILE.gz') 2> {log.err};
        ls -1 output/{wildcards.group}/splitsR1/ > {output} 2>> {log.err};
        """

## Decompress all Read2 FASTQs of a group into one stream and split it into
## pigz-compressed chunks of `splitsize` lines each under
## output/<group>/splitsR2/; the output file lists the chunks that were written.
## - The input dict is indexed (not .get) so an unknown group raises a KeyError
##   instead of silently handing the shell command an empty input.
## - The splits directory is removed first so chunks left over from an earlier
##   run (e.g. with a different splitsize) are not listed in the done file.
## - The whole gunzip | split pipeline is redirected to the log, so gunzip
##   errors are captured as well.
## NOTE(review): splitsize presumably has to be a multiple of 4 so that FASTQ
## records are not cut across chunks -- confirm in config.
rule splitR2:
    input:
        lambda wildcards: read2[wildcards.group]
    output:
        'output/{group}/{group}_splitR2_done.txt'
    params:
        splitsize = config['splitsize']
    threads: 8
    benchmark:
        'output/{group}/benchmarks/{group}_splitR2.tsv'
    log:
        err = 'output/{group}/logs/{group}_splitR2.err'
    shell:
        """
        module load pigz;
        rm -rf output/{wildcards.group}/splitsR2;
        mkdir -p output/{wildcards.group}/splitsR2;
        (gunzip -c {input} | split -a 3 -l {params.splitsize} -d --additional-suffix=_R2.fastq --filter='pigz -p {threads} > output/{wildcards.group}/splitsR2/$FILE.gz') 2> {log.err};
        ls -1 output/{wildcards.group}/splitsR2/ > {output} 2>> {log.err};
        """