#!/usr/bin/env python3

### Program: prepTG
### Author: Rauf Salamzade
### Kalan Lab
### UW Madison, Department of Medical Microbiology and Immunology

# BSD 3-Clause License
#
# Copyright (c) 2021, Kalan-Lab
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import shutil
import sys
import argparse
from Bio import SeqIO
from zol import util
import subprocess
import multiprocessing
def create_parser():
	""" Parse arguments """
	parser = argparse.ArgumentParser(description="""
	Program: prepTG
	Author: Rauf Salamzade
	Affiliation: Kalan Lab, UW Madison, Department of Medical Microbiology and Immunology

	Prepares additional genomes for being searched for GCFs using fai.
	
	Target genomes can either be in GenBank (with CDS features!!!) or FASTA format.
	
	Considerations
	--------------------------------------------------------------------------------------
	If FASTA format is provided, assumption is that genomes are prokaryotic and 
	pyrodigal/prodigal will be used to perform gene-calling. Eukaryotic genomes can 
	be provided as FASTA format but the --reference_proteome file should be used in 
	such case to map proteins from a reference proteome (from the same species ideally) 
	on to the target genomes. This will prevent detection of new genes in gene-clusters 
	detected by fai but synchronize gene-calling and allow for better similarity 
	assessment between orthologous genes.
	""", formatter_class=argparse.RawTextHelpFormatter)

	parser.add_argument('-i', '--input_dir', help='Directory with target genomes (either featuring GenBanks or FASTAs).', required=True)
	parser.add_argument('-o', '--output_dir', help='Output directory to stored input for fai.', required=True)
	parser.add_argument('-l', '--locus_tag_length', type=int, help='Length of locus tags to set. Default is 3, allows for <~18k genomes.', required=False, default=3)
	parser.add_argument('-r', '--rename_locus_tags', action='store_true', help='Whether to rename locus tags if provided for CDS features in GenBanks.', required=False, default=False)
	parser.add_argument('-p', '--use_prodigal', action='store_true', help='Flag to use prodigal instead of pyrodigal.', required=False, default=False)
	parser.add_argument('-m', '--meta_mode', action='store_true', help='Flag to use meta mode instead of single for pyrodigal/prodigal.', default=False, required=False)
	parser.add_argument('-rp', '--reference_proteome', help='Provide path to a reference proteome to use for protein/gene-calling\nin target genomes - which should be in FASTA format.', default=None, required=False)
	parser.add_argument('-c', '--cpus', type=int, help="Total number of cpus/threads to use for running OrthoFinder2/prodigal.\n[Default is 1].", required=False, default=1)
	parser.add_argument('-mm', '--max_memory', type=int, help='Uses resource module to set soft memory limit. Provide in Giga-bytes.\nGenerally memory shouldn\'t be a major concern unless working\nwith hundreds of large metagenomes. [currently\nexperimental; default is None].', default=None, required=False)
	parser.add_argument('-v', '--version', action='store_true', help="Get version and exit.", required=False, default=False)

	args = parser.parse_args()
	return args

def prepTG():
	"""
	Void function which runs primary workflow for program.
	"""

	# get version
	version = util.getVersion()

	if len(sys.argv)>1 and ('-v' in set(sys.argv) or '--version' in set(sys.argv)):
		sys.stdout.write(version + '\n')
		sys.exit(0)

	"""
	PARSE ARGUMENTS
	"""
	myargs = create_parser()

	outdir = os.path.abspath(myargs.output_dir) + '/'
	target_genomes_dir = myargs.input_dir
	locus_tag_length = myargs.locus_tag_length
	rename_locus_tags = myargs.rename_locus_tags
	cpus = myargs.cpus
	max_memory = myargs.max_memory
	use_prodigal = myargs.use_prodigal
	meta_mode = myargs.meta_mode
	reference_proteome = myargs.reference_proteome

	if os.path.isdir(outdir):
		sys.stderr.write("Output directory exists! Exiting\n ")
		sys.exit(1)
	else:
		os.system('mkdir %s' % outdir)

	try:
		assert(os.path.isdir(target_genomes_dir))
		target_genomes_dir = os.path.abspath(target_genomes_dir) + '/'
	except:
		raise RuntimeError('Issue with reading genome listing file for samples with additional genomic assemblies.')

	"""
	START WORKFLOW
	"""

	sys.stdout.write('Input command:\n%s\n' % ' '.join(sys.argv))

	# create logging object
	log_file = outdir + 'Progress.log'
	logObject = util.createLoggerObject(log_file)
	logObject.info("Saving parameters for future records.")
	parameters_file = outdir + 'Parameter_Inputs.txt'
	parameter_values = [target_genomes_dir, outdir, use_prodigal, meta_mode, cpus, max_memory]
	parameter_names = ["Target Genomes Directory", "Output Directory", "Use Prodigal?", "Use Meta Mode?",
					   "Number of Cpus", "Maximum Memory in GB"]
	util.logParametersToFile(parameters_file, parameter_names, parameter_values)
	logObject.info("Done saving parameters!")

	logObject.info('Running prepTG version %s' % version)
	sys.stdout.write('Running prepTG version %s\n' % version)

	# set max memory limit
	if max_memory != None:
		logObject.info("Setting maximum memory usage to: %dGB" % max_memory)
		try:
			util.memory_limit(max_memory)
		except Exception as e:
			logObject.info("Error setting memory limit")
			sys.stdout.write("Error setting memory limit\n")

	# Step 1: List genomes in directory
	sys.stdout.write('--------------------\nStep 1\n--------------------\nProcessing genomes provided in input directory.\n')
	target_listing_file = outdir + 'Target_Genomes.txt'
	uncompress_dir = outdir + 'Uncompressed_Genomes/'
	list_cmd = ['listAllGenomesInDirectory.py', '-i', target_genomes_dir, '--uncompress_dir', uncompress_dir, '>',
				target_listing_file]
	try:
		subprocess.call(' '.join(list_cmd), shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
						executable='/bin/bash')
		logObject.info('Successfully ran: %s' % ' '.join(list_cmd))
	except Exception as e:
		logObject.error('Had an issue running: %s' % ' '.join(list_cmd))
		sys.stderr.write('Had an issue running: %s' % ' '.join(list_cmd))
		logObject.error(e)
		sys.exit(1)

	# Step 2: Setup files needed for fai
	sys.stdout.write('--------------------\nStep 2\n--------------------\nRe-formatting genomes for usage in fai downstream.\n')
	format_assess_dir = outdir + 'Format_Assessment/'
	util.setupReadyDirectory([format_assess_dir])
	format_predictions_file = outdir + 'All_Format_Inferences.txt'
	target_genome_annotation_listing_file = outdir + 'Target_Genome_Annotation_Files.txt'
	additional_sample_genomes, additional_format_prediction = util.parseSampleGenomes(target_listing_file, format_assess_dir, format_predictions_file, logObject, cpus=cpus)
	if additional_format_prediction == 'mixed':
		logObject.error(
			'Format of additional genomes provided is not consistently FASTA or Genbank, please check input.')
		raise RuntimeError(
			'Format of additional genomes provided is not consistently FASTA or Genbank, please check input.')

	additional_proteomes_directory = outdir + 'Predicted_Proteomes_Additional/'
	additional_genbanks_directory = outdir + 'Genomic_Genbanks_Additional/'
	additional_genbanks_directory_local = 'Genomic_Genbanks_Additional/'
	util.setupReadyDirectory([additional_proteomes_directory, additional_genbanks_directory])
	if additional_format_prediction == 'fasta':
		if reference_proteome == None:
			additional_prodigal_outdir = outdir + 'Prodigal_Gene_Calling_Additional/'
			util.setupReadyDirectory([additional_prodigal_outdir])
			util.processGenomesUsingProdigal(additional_sample_genomes, additional_prodigal_outdir,
											 additional_proteomes_directory, additional_genbanks_directory, logObject,
											 cpus=cpus, locus_tag_length=locus_tag_length, use_prodigal=use_prodigal,
											 meta_mode=meta_mode)
			shutil.rmtree(additional_prodigal_outdir)
		else:
			additional_miniprot_outdir = outdir + 'Miniprot_Gene_Calling_Additional/'
			util.setupReadyDirectory([additional_miniprot_outdir])
			util.processGenomesUsingMiniprot(reference_proteome, additional_sample_genomes, additional_miniprot_outdir,
											 additional_proteomes_directory, additional_genbanks_directory, logObject,
											 cpus=cpus, locus_tag_length=locus_tag_length)
			shutil.rmtree(additional_miniprot_outdir)
	else:
		# genomes are provided as Genbanks with CDS features
		gene_name_mapping_outdir = outdir + 'Mapping_of_New_Gene_Names_to_Original/'
		util.setupReadyDirectory([gene_name_mapping_outdir])
		util.processGenomesAsGenbanks(additional_sample_genomes, additional_proteomes_directory,
									  additional_genbanks_directory, gene_name_mapping_outdir, logObject,
									  cpus=cpus, locus_tag_length=locus_tag_length, rename_locus_tags=rename_locus_tags)

	target_genome_annotation_listing_handle = open(target_genome_annotation_listing_file, 'w')
	target_genomes_concat_db_faa = outdir + 'Target_Genomes_DB.faa'
	target_genomes_concat_db_handle = open(target_genomes_concat_db_faa, 'w')
	for f in os.listdir(additional_proteomes_directory):
		sample = f.split('.faa')[0]
		target_genome_annotation_listing_handle.write(sample + '\t' + additional_genbanks_directory_local + sample + '.gbk\n')
		try:
			with open(additional_proteomes_directory + sample + '.faa') as ospf:
				for rec in SeqIO.parse(ospf, 'fasta'):
					target_genomes_concat_db_handle.write('>' + sample + '|' + rec.id + '\n' + str(rec.seq) + '\n')
		except:
			pass
	target_genome_annotation_listing_handle.close()
	target_genomes_concat_db_handle.close()

	# remove individual proteome files
	sys.stdout.write('--------------------\nStep 4\n--------------------\nRemoving intermediate files to save space.\n')
	shutil.rmtree(additional_proteomes_directory)

	target_genomes_concat_db_dmnd = outdir + 'Target_Genomes_DB.dmnd'
	diamond_makedb_cmd = ['diamond', 'makedb', '--ignore-warnings', '--in', target_genomes_concat_db_faa, '-d',
						  target_genomes_concat_db_dmnd]
	try:
		subprocess.call(' '.join(diamond_makedb_cmd), shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
						executable='/bin/bash')
		logObject.info('Successfully ran: %s' % ' '.join(diamond_makedb_cmd))
	except Exception as e:
		logObject.error('Had an issue running: %s' % ' '.join(diamond_makedb_cmd))
		sys.stderr.write('Had an issue running: %s\n' % ' '.join(diamond_makedb_cmd))
		logObject.error(e)
		sys.exit(1)

	target_genome_annotation_data = util.readInAnnotationFilesForExpandedSampleSet(target_genome_annotation_listing_file, additional_genbanks_directory)

	# TODO: switch to sqllite inplace of pickle for dynamic loading of each samples data to avoid memory spikes

	pickle_output_dir = outdir + 'Target_Genomes_Info_Pickled/'
	util.setupReadyDirectory([pickle_output_dir])
	try:
		sample_genbank_info = []
		for sample in target_genome_annotation_data:
			sample_genbank = target_genome_annotation_data[sample]['genbank']
			result_pkl_file = pickle_output_dir + sample + '.pkl'
			sample_genbank_info.append([sample, sample_genbank, result_pkl_file])

		p = multiprocessing.Pool(cpus)
		p.map(util.parseGenbankAndFindBoundaryGenes, sample_genbank_info)
		p.close()
	except Exception as e:
		sys.stderr.write('Issues creating pickle file needed for fai.')
		sys.stderr.write(e + '\n')
		logObject.error(e)
		sys.exit(1)

	# Close logging object and exit
	util.closeLoggerObject(logObject)
	sys.exit(0)

if __name__ == '__main__':
	prepTG()
