"""
Convert the FactorNet repository to Kipoi models

author: Ziga Avsec
"""
# Script to go from: https://github.com/uci-cbcl/FactorNet/ to kipoi models
# Use snakemake

# rule git_clone - 0. git clone the permalink to the repo

# 1. copy over the files
# 2. bigwig.txt - check the bigwigs
# - DGF (DNase) - always present
# - Unique35 (mappability track)
# 3. chip.txt (tasks)
# - [x] save to column names
# 4. feature.txt (input features)
# - softlink different dataloaders for it
# 5. [x] meta.txt assert it's always the same

# [x] Assert DGF is always one of the bigwigs

# Every model handled by this workflow, written as "<TF>/<subtype>".
# Each entry is a sub-directory of FactorNet/models/ and becomes the
# output directory of the converted model.
MODELS = [
    'CEBPB/meta_Unique35_DGF',
    'CEBPB/onePeak_1_DGF',
    'CEBPB/onePeak_2_Unique35_DGF',
    'CTCF/metaGENCODE_RNAseq_Unique35_DGF',
    'CTCF/meta_RNAseq_Unique35_DGF',
    'E2F1/GENCODE_Unique35_DGF',
    'E2F1/onePeak_Unique35_DGF',
    'EGR1/meta_RNAseq_Unique35_DGF',
    'EGR1/onePeak_DGF',
    'FOXA1/multiTask_DGF',
    'FOXA1/onePeak_DGF',
    'FOXA2/multiTask_DGF',
    'FOXA2/onePeak_Unique35_DGF',
    'GABPA/metaGENCODE_RNAseq_Unique35_DGF',
    'GABPA/meta_RNAseq_Unique35_DGF',
    'HNF4A/multiTask_DGF',
    'HNF4A/onePeak_Unique35_DGF',
    'JUND/meta_Unique35_DGF_2',
    'JUND/meta_Unique35_DGF',
    'MAFK/meta_1_Unique35_DGF',
    'MAFK/onePeak_1_DGF',
    'MAFK/onePeak_2_Unique35_DGF',
    'MAX/onePeak_Unique35_DGF_2',
    'MAX/onePeak_Unique35_DGF',
    'NANOG/GENCODE_Unique35_DGF',
    'NANOG/onePeak_Unique35_DGF',
    'REST/GENCODE_Unique35_DGF_2',
    'REST/GENCODE_Unique35_DGF',
    'TAF1/GENCODE_Unique35_DGF',
    'TAF1/onePeak_Unique35_DGF',
]

# Every cell line whose FactorNet/data/<cell_line>/meta.txt is copied into
# the shared RNAseq_features directory.
CELL_LINES = [
    'A549',
    'GM12878',
    'H1-hESC',
    'HCT116',
    'HeLa-S3',
    'HepG2',
    'IMR-90',
    'induced_pluripotent_stem_cell',
    'K562',
    'liver',
    'MCF-7',
    'Panc1',
    'PC-3',
]


# Annotation files taken from FactorNet/resources/ and shared by all models
# through template/dataloader_files/gencode_features/.
GENCODE_FEATURES = ['cpgisland.bed.gz'] + [
    'wgEncodeGencodeBasicV19.cds.merged.bed.gz',
    'wgEncodeGencodeBasicV19.intron.merged.bed.gz',
    'wgEncodeGencodeBasicV19.promoter.merged.bed.gz',
    'wgEncodeGencodeBasicV19.txStart.bed.gz',
    'wgEncodeGencodeBasicV19.utr3.merged.bed.gz',
    'wgEncodeGencodeBasicV19.utr5.merged.bed.gz',
]

# Files under template/example_files/ that are linked into every model's
# example_files/ directory.
EXAMPLE_FILES = [
    "hg38_chr22.fa",
    "intervals.bed",
    "dnase_synth.chr22.bw",
]

import template
from template import GENERATE_FILES

# Default target (first rule in the file): requests every per-model output
# for all entries of MODELS, plus the shared gencode feature files.
rule all:
    input:
        # model_files
        expand("{model}/model_files/model.json", model=MODELS),
        expand("{model}/model_files/best_model.hdf5", model=MODELS),
        # dataloader_files
        expand("{model}/dataloader_files/RNAseq_features", model=MODELS),
        expand("template/dataloader_files/gencode_features/{gcfeat}",
               gcfeat=GENCODE_FEATURES),
        expand("{model}/dataloader_files/gencode_features",
               model=MODELS),
        # The per-model mappability-track link is currently not requested:
        # expand("{model}/dataloader_files/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
        #        model=MODELS),
        # example files
        expand("{model}/example_files/{xf}",
               model=MODELS, xf=EXAMPLE_FILES),
        # model/dataloader files
        # (GENERATE_FILES is imported from the local `template` module)
        expand("{model}/{gen_file}",
               model=MODELS, gen_file=GENERATE_FILES),
        expand("{model}/tasks.txt", model=MODELS)

# Convenience target (inputs only, nothing is executed): requests
# dataloader.py for every model in MODELS.
# NOTE(review): presumably produced by render_templates, which requires
# "dataloader.py" to be listed in GENERATE_FILES (not visible here) - confirm.
rule dataloader_py:
    input:
        expand("{model}/dataloader.py", model=MODELS),

# Convenience target (inputs only, nothing is executed): requests
# dataloader.yaml for every model in MODELS.
# NOTE(review): presumably produced by render_templates, which requires
# "dataloader.yaml" to be listed in GENERATE_FILES (not visible here) - confirm.
rule dataloader_yaml:
    input:
        expand("{model}/dataloader.yaml", model=MODELS),

# Convenience target (inputs only, nothing is executed): requests
# model.yaml for every model in MODELS.
# NOTE(review): presumably produced by render_templates, which requires
# "model.yaml" to be listed in GENERATE_FILES (not visible here) - confirm.
rule model_yaml:
    input:
        expand("{model}/model.yaml", model=MODELS),

rule git_clone:
    """Clone the FactorNet repo
    """
    # TODO - setup the git clone with a permalink
    # Only README.md is declared as output; it acts as the marker that the
    # clone exists (render_templates lists it as an input).
    output:
        "FactorNet/README.md",
    # --depth=1 makes a shallow clone of the current default branch, so the
    # result is not pinned to a specific commit (see the TODO above).
    shell:
        'git clone --depth=1 https://github.com/uci-cbcl/FactorNet.git'

rule model_files:
    """Convert one model's architecture and weights into {model}/model_files/.

    The model is rebuilt from FactorNet's model.json, its weights are loaded,
    run through keras' convert_all_kernels_in_model and saved again. The
    architecture is re-serialised with model.to_json() instead of being
    copied verbatim.
    """
    input:
        arch = "FactorNet/models/{model}/model.json",
        w = "FactorNet/models/{model}/best_model.hdf5",
    output:
        arch = "{model}/model_files/model.json",
        w = "{model}/model_files/best_model.hdf5",
    run:
        from keras.models import model_from_json
        from keras.utils.layer_utils import convert_all_kernels_in_model

        # NOTE(review): forcing the backend image dim ordering
        # (keras.backend.set_image_dim_ordering("tf")) was left disabled in
        # the original version of this rule.

        # Read the architecture inside a context manager so the file handle
        # is closed deterministically.
        with open(input.arch, "r") as f:
            model = model_from_json(f.read())
        model.load_weights(input.w)
        # presumably converts the convolution kernels between the Theano and
        # TensorFlow conventions - confirm against the installed keras version
        convert_all_kernels_in_model(model)
        model.save_weights(output.w)
        with open(output.arch, "w") as f:
            f.write(model.to_json())

rule RNAseq_features:
    """Setup an RNAseq_features directory
    """
    # Copies the meta.txt of one cell line from the cloned FactorNet repo into
    # the shared template directory; {cell_line} is matched from the requested
    # output path.
    input:
        m = "FactorNet/data/{cell_line}/meta.txt"
    output:
        m = "template/dataloader_files/RNAseq_features/{cell_line}/meta.txt",
    shell:
        "cp {input.m} {output.m}"

rule softlink_RNAseq_features:
    """Symlink the shared RNAseq_features directory into a model directory.

    The inputs are the meta.txt files of every cell line in CELL_LINES, so
    the link is only created once the shared directory is fully populated.
    """
    input:
        expand("template/dataloader_files/RNAseq_features/{cell_line}/meta.txt",
               cell_line=CELL_LINES)
    output:
        sl = "{model}/dataloader_files/RNAseq_features"
    # -s symbolic, -r relative to the link location, -f replace an existing
    # link, -n do not follow an existing link to a directory (without -n a
    # re-run would create a nested link inside the shared template directory).
    shell:
        "ln -srfn template/dataloader_files/RNAseq_features {output.sl}"


rule gencode_features:
    """Setup gencode_features
    """
    # Copies one annotation file from FactorNet/resources/ into the shared
    # template directory; {gencode_feature} is matched from the requested
    # output path (rule all requests every entry of GENCODE_FEATURES).
    input:
        f = "FactorNet/resources/{gencode_feature}"
    output:
        f = "template/dataloader_files/gencode_features/{gencode_feature}"
    shell:
        "cp {input.f} {output.f}"

# NOTE: the rule name is misspelled but kept unchanged, since rule names can
# be used as targets on the command line.
rule softlinke_gencode_featurs:
    """Symlink the shared gencode_features directory into a model directory.

    The inputs are all files in GENCODE_FEATURES, so the link is only created
    once the shared directory is fully populated.
    """
    input:
        expand("template/dataloader_files/gencode_features/{gcf}",
               gcf=GENCODE_FEATURES)
    output:
        sl = "{model}/dataloader_files/gencode_features"
    # -s symbolic, -r relative to the link location, -f replace an existing
    # link, -n do not follow an existing link to a directory (without -n a
    # re-run would create a nested link inside the shared template directory).
    shell:
        "ln -srfn template/dataloader_files/gencode_features {output.sl}"

rule download_mappability:
    """Download the ENCODE Duke 35bp uniqueness (mappability) track for hg19.

    The file is written straight to the declared output path. Downloading
    into the working directory and moving it afterwards can install a stale
    file: if a file of that name is already present, wget saves the new
    download under a ".1" suffix and the old file gets moved instead.
    """
    output:
        track = "template/dataloader_files/wgEncodeDukeMapabilityUniqueness35bp.bigWig"
    shell:
        "wget -O {output.track} http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig"

# Links the shared mappability track into one model's dataloader_files/.
# NOTE: rule all has the corresponding target commented out, so this rule
# only runs when its output is requested explicitly.
rule softlink_mappability:
    input:
        track = "template/dataloader_files/wgEncodeDukeMapabilityUniqueness35bp.bigWig"
    output:
        track = "{model}/dataloader_files/wgEncodeDukeMapabilityUniqueness35bp.bigWig"
    # -s symbolic, -r relative to the link location, -f replace an existing file
    shell:
        "ln -srf {input.track} {output.track}"

rule softlink_example_files:
    """Softlinks all the example files
    """
    # TODO - create a small bigwig file
    # The three inputs are the files named in EXAMPLE_FILES; they must
    # already exist under template/example_files/ (no rule here creates them).
    input:
        fa = "template/example_files/hg38_chr22.fa",
        intervals = "template/example_files/intervals.bed",
        bw = "template/example_files/dnase_synth.chr22.bw"
    output:
        fa = "{model}/example_files/hg38_chr22.fa",
        intervals = "{model}/example_files/intervals.bed",
        bw = "{model}/example_files/dnase_synth.chr22.bw"
    # -s symbolic, -r relative to the link location, -f replace an existing file
    shell:
        """
        ln -srf {input.fa} {output.fa}
        ln -srf {input.intervals} {output.intervals}
        ln -srf {input.bw} {output.bw}
        """

# Copies a model's chip.txt from the cloned FactorNet repo to
# {model}/tasks.txt (the header notes describe chip.txt as the task list).
rule copy_tasks:
    input:
        tasks = "FactorNet/models/{model}/chip.txt"
    output:
        tasks = "{model}/tasks.txt"
    shell:
        "cp {input.tasks} {output.tasks}"

rule render_templates:
    """Render the files listed in GENERATE_FILES for one model.

    The row of models.tsv whose `model` column equals "<tf>/<subtype>" is
    looked up and all of its columns are forwarded as keyword arguments to
    template.write_templates, together with `tf` and `model_subtype`.
    write_templates presumably writes the declared outputs (its
    implementation is not visible here).

    models.tsv is declared as an input so that a missing table is reported
    before anything runs and an updated table triggers re-rendering.
    """
    input:
        expand("template/template_{gen_file}",
               gen_file=GENERATE_FILES),
        # marker that the FactorNet repo has been cloned (see git_clone)
        "FactorNet/README.md",
        # feat = "FactorNet/models/{tf}/{subtype}/feature.txt",
        # bw = "FactorNet/models/{tf}/{subtype}/bigwig.txt"
        models_tsv = "models.tsv",
    output:
        ["{tf}/{subtype}/" + gen_file for gen_file in GENERATE_FILES]
    run:
        from template import write_templates
        import pandas as pd

        # Model names in models.tsv always use "/" (see MODELS); building the
        # key directly also avoids relying on an `os` module that this file
        # never imports.
        model_name = "{}/{}".format(wildcards.tf, wildcards.subtype)

        df = pd.read_csv(input.models_tsv, sep="\t")
        rows = df[df.model == model_name]
        if rows.empty:
            raise ValueError("No row for model '{}' in {}".format(
                model_name, input.models_tsv))

        write_templates(tf=wildcards.tf,
                        model_subtype=wildcards.subtype,
                        **dict(rows.iloc[0]))


def write_models_tsv():
    """Write models.tsv with one row per entry of MODELS.

    For each model, the description returned by
    kipoi.get_model_descr("FactorNet/<model>") is queried and the url and md5
    of its 'arch' and 'weights' arguments are recorded. The table is written
    tab-separated, without an index column, to "models.tsv" in the current
    working directory.
    """
    from collections import OrderedDict
    import kipoi
    import pandas as pd

    def describe(model_name):
        # One table row: the model name plus url/md5 of both model files.
        descr_args = kipoi.get_model_descr("FactorNet/" + model_name).args
        arch = descr_args['arch']
        weights = descr_args['weights']
        return OrderedDict([
            ("model", model_name),
            ("arch_url", arch.url),
            ("arch_md5", arch.md5),
            ("weights_url", weights.url),
            ("weights_md5", weights.md5),
        ])

    records = [describe(model_name) for model_name in MODELS]
    pd.DataFrame(records).to_csv("models.tsv", sep='\t', index=False)
