#!/usr/bin/env python3

### Program: fai
### Author: Rauf Salamzade
### Kalan Lab
### UW Madison, Department of Medical Microbiology and Immunology

# BSD 3-Clause License
#
# Copyright (c) 2023, Kalan-Lab
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import sys
import argparse
from time import sleep
from zol import util, fai
import subprocess
import pickle
import shutil
import multiprocessing
import traceback

def create_parser():
	"""Construct the fai command-line interface and return the parsed arguments."""
	arg_parser = argparse.ArgumentParser(description="""
	Program: zol
	Author: Rauf Salamzade
	Affiliation: Kalan Lab, UW Madison, Department of Medical Microbiology and Immunology

		 .o88o.            o8o  
		 888 `"            `"'  
		o888oo   .oooo.   oooo  
		 888    `P  )88b  `888  
		 888     .oP"888   888  
		 888    d8(  888   888  
		o888o   `Y888""8o o888o 
                                               
    MODES OF INPUT:
    *********************************************************************************

    Type 1: Directory of Homologous Gene-Cluster GenBanks 
    (GenBanks must have CDS features with locus_tag or protein_id names)
	-----------------------------------------------------	    
    $ fai -i Known_GeneCluster.gbk -tg prepTG_Database/ -o fai_Results/
    
    Type 2: Reference Genome with Gene-Cluster/Locus Coordinates 
    (proteins are collapsed for high-similarity using cd-hit)
    -----------------------------------------------------
    $ fai -r Reference.fasta -rc scaffold01 -rs 40201 -re 45043 \\
          -tg prepTG_Database/ -o fai_Results/

    Type 3: Set of Query Proteins (not compatible with the syntenic similarity cutoff) 
    (proteins are collapsed for high-similarity using cd-hit)
    Similar to input for cblaster
    -----------------------------------------------------
    $ fai -pq Gene-Cluster_Query_Proteins.faa -tg prepTG_Database/ -o fai_Results/	    	    
	""", formatter_class=argparse.RawTextHelpFormatter)

	# --- input modes (one of the three; validated downstream in faiMain) ---
	arg_parser.add_argument('-i', '--query_inputs', nargs='+',
							help='Paths to locus-specific GenBank(s) [could be multiple] to use as\nqueries for searching for homologous/orthologous instances in target genomes.\nFiles must end with ".gbk", ".gbff", or ".genbank".')
	arg_parser.add_argument('-r', '--reference_genome',
							help='Path to reference genome in FASTA or GenBank format.')
	arg_parser.add_argument('-rc', '--reference_contig',
							help='Scaffold name (up to first space) which features region\nof interest.')
	arg_parser.add_argument('-rs', '--reference_start', type=int,
							help='Start position of gene-cluster on scaffold.')
	arg_parser.add_argument('-re', '--reference_end', type=int,
							help='End position of gene-cluster on scaffold.')
	arg_parser.add_argument('-pq', '--protein_queries',
							help="Path to protein multi-FASTA file containing to use as queries.")
	# --- required arguments ---
	arg_parser.add_argument('-tg', '--target_genomes_db', required=True,
							help='Result directory from running prepTG for target genomes of interest.')
	arg_parser.add_argument('-o', '--output_dir', required=True,
							help='Parent output/workspace directory.')
	# --- behavior toggles and thresholds ---
	arg_parser.add_argument('-dm', '--draft_mode', action='store_true',
							help='Run in draft-mode to also report segments on scaffold edges which in\naggregate (with other such segments) they meet criteria\nfor reporting.')
	arg_parser.add_argument('-fp', '--filter_paralogs', action='store_true',
							help='Filter paralogous instances of gene-cluster identified in a\nsingle target genome.')
	arg_parser.add_argument('-e', '--evalue_cutoff', type=float, default=1e-10,
							help="E-value cutoff for DIAMOND blastp to consider a gene in a\ntarget genome a hit to a query protein. [Default\nis 1e-10].")
	arg_parser.add_argument('-m', '--min_prop', type=float, default=0.5,
							help="The minimum proportion of distinct proteins/ortholog groups\nneeded to report discrete segments of the gene-cluster.\nNote, that a minimum of 3 distinct query proteins/homolog\ngroups are needed per segment reported.")
	arg_parser.add_argument('-kpq', '--key_protein_queries',
							help="Path to protein multi-FASTA file containing key query sequences\nwhich, if present within a potential segment identified by the HMM,\nwill automatically report the segment without further consideration.\nCompatible with all three input modes.")
	arg_parser.add_argument('-kpe', '--key_protein_evalue_cutoff', type=float, default=1e-20,
							help='E-value cutoff for finding key query sequences in putative\ngene-cluster homolog segment. [Default\nis 1e-20]. Disregarded if less strict than the\ngeneral --evalue cutoff.')
	arg_parser.add_argument('-kpm', '--key_protein_min_prop', type=float, default=0.0,
							help='The minimum proportion of distinct ortholog groups matching\nkey proteins needed to report a homologous gene-cluster in\na genome. [Default is 0.0].')
	arg_parser.add_argument('-sct', '--syntenic_correlation_threshold', type=float, default=0.6,
							help="The minimum syntenic correlation needed to at least one known\nGCF instance to report segment. [Default is 0.6]")
	arg_parser.add_argument('-gdm', '--gc_delineation_mode', default="Gene-Clumper",
							help='Method/mode for delineation of gene-cluster boundaries. Options are\n"Gene-Clumper" or "HMM". Default is Gene-Clumper.')
	arg_parser.add_argument('-f', '--flanking_context', type=int, default=1000,
							help='Number of bases to append to the beginning/end of the gene-cluster\nsegments identified. [Default is 1000].')
	arg_parser.add_argument('-mgd', '--max_genes_disconnect', type=int, default=5,
							help="Maximum number of genes between gene-cluster segments detected by HMM to\nmerge segments together. Alternatively the number of genes separating\nhits if Gene-Clumper mode is used. Allows for more inclusivity of novel\nauxiliary genes. [Default is 5].")
	# --- HMM transition/emission probabilities ---
	arg_parser.add_argument('-gt', '--gc_transition', type=float, default=0.9,
							help="Probability for gene-cluster to gene-cluster transition in HMM. Should be between\n0.0 and 1.0. [Default is 0.9].")
	arg_parser.add_argument('-bt', '--bg_transition', type=float, default=0.9,
							help="Probability for background to background transition in HMM. Should be between\n0.0 and 1.0. [Default is 0.9].")
	arg_parser.add_argument('-ge', '--gc_emission', type=float, default=0.95,
							help="Emission probability of gene being in gene-cluster state assuming a orthologis\nfound at the e-value cutoff. [Default is 0.95].")
	arg_parser.add_argument('-be', '--bg_emission', type=float, default=0.2,
							help="Emission probability of gene being in gene-cluster state assuming no homolog is\nfound at the e-value cutoff. [Default is 0.2].")
	# --- miscellaneous ---
	arg_parser.add_argument('-gp', '--generate_plots', action='store_true',
							help='Generate plots for assessing gene-cluster segments identified.')
	arg_parser.add_argument('-ds', '--diamond_sensitivity', default="very-sensitive",
							help='DIAMOND alignment sensitivity. Options include: fast, mid-sensitive,\nsensitive, more-sensitive, very-sensitive, and ultra-sensitive.\n[Default is very-sensitive].')
	arg_parser.add_argument('-c', '--cpus', type=int, default=1,
							help="The number of cpus to use. [Default is 1].")
	arg_parser.add_argument('-mm', '--max_memory', type=int,
							help='Uses resource module to set soft memory limit. Provide\nin Giga-bytes. Generally memory shouldn\'t be a major concern unless\nworking with hundreds of large metagenomes. [currently\nexperimental; default is None].')
	arg_parser.add_argument('-v', '--version', action='store_true',
							help="Get version and exit.")

	return arg_parser.parse_args()

def faiMain():
	"""
	Run the primary fai workflow (void function; exits via sys.exit).

	Steps (each guarded by a checkpoint file under <outdir>/Checkpoint_Files/
	so interrupted runs can resume where they left off):
	  1. Process the query input - gene-cluster GenBank(s), reference genome
	     coordinates, or a protein FASTA - into a non-redundant set of query
	     proteins / ortholog groups (pickled to Query_Information.pkl).
	  2. Run DIAMOND blastp of the queries against the prepTG target genomes
	     database and process the alignment results.
	  3. Load per-gene information for target genomes prepared by prepTG.
	  4. Identify homologous gene-cluster segments in the target genomes.
	  5. Optionally plot an overview of the identified segments.
	"""

	def _touch(file_path):
		# Create/refresh an empty checkpoint file. Replaces os.system('touch ...'),
		# which would break on paths containing spaces or shell metacharacters.
		with open(file_path, 'w'):
			pass

	def _parse_target_annotation_listing(preptg_dir, listing_file):
		# Parse prepTG's two-column (sample<TAB>relative GenBank path) listing.
		# Returns ({sample: {'genbank': path}}, set(valid samples)); entries
		# whose GenBank file is missing or empty are skipped.
		annotation_information = {}
		valid_samples = set([])
		with open(listing_file) as olf:
			for line in olf:
				line = line.strip()
				sample, gbk = line.split('\t')
				gbk = preptg_dir + gbk
				if os.path.isfile(gbk) and os.path.getsize(gbk) > 0:
					annotation_information[sample] = {'genbank': gbk}
					valid_samples.add(sample)
		return annotation_information, valid_samples

	# get version
	version = util.getVersion()

	# Handle -v/--version before full argument parsing so the otherwise
	# required arguments (-tg/-o) are not needed just to query the version.
	if len(sys.argv) > 1 and ('-v' in set(sys.argv) or '--version' in set(sys.argv)):
		sys.stdout.write(version + '\n')
		sys.exit(0)

	"""
	PARSE INPUTS
	"""
	myargs = create_parser()

	query_inputs = myargs.query_inputs
	reference_genome = myargs.reference_genome
	reference_contig = myargs.reference_contig
	reference_start = myargs.reference_start
	reference_end = myargs.reference_end
	protein_queries_fasta = myargs.protein_queries
	target_genomes = myargs.target_genomes_db
	outdir = os.path.abspath(myargs.output_dir) + '/'
	evalue_cutoff = myargs.evalue_cutoff
	min_prop = myargs.min_prop
	syntenic_correlation_threshold = myargs.syntenic_correlation_threshold
	max_genes_disconnect = myargs.max_genes_disconnect
	flanking_context = myargs.flanking_context
	gc_transition = myargs.gc_transition
	bg_transition = myargs.bg_transition
	ge_emission = myargs.gc_emission
	be_emission = myargs.bg_emission
	key_protein_queries_fasta = myargs.key_protein_queries
	key_protein_evalue_cutoff = myargs.key_protein_evalue_cutoff
	key_protein_min_prop = myargs.key_protein_min_prop
	draft_mode = myargs.draft_mode
	filter_paralogs = myargs.filter_paralogs
	plotting_flag = myargs.generate_plots
	diamond_sensitivity = myargs.diamond_sensitivity.lower()
	delineation_mode = myargs.gc_delineation_mode.upper()
	cpus = myargs.cpus
	max_memory = myargs.max_memory

	if delineation_mode not in set(['GENE-CLUMPER', 'HMM']):
		sys.stderr.write("Delineation mode is not valid. Must be either 'Gene-Clumper' or 'HMM'.\n")
		sys.exit(1)

	# create output directory if needed, or warn of over-writing
	if os.path.isdir(outdir):
		sys.stderr.write("Output directory exists. Overwriting in 5 seconds ...\n ")
		sleep(5)
	else:
		os.mkdir(outdir)

	check_dir = outdir + 'Checkpoint_Files/'
	if not os.path.isdir(check_dir):
		util.setupReadyDirectory([check_dir])

	# Bug fix: 'sensitive' is advertised in the --diamond_sensitivity help text
	# but was missing from the validity set, and an invalid value previously
	# only produced a warning (DIAMOND would then fail much later in step 2) -
	# now exit immediately on an invalid value.
	if diamond_sensitivity not in set(['fast', 'mid-sensitive', 'sensitive', 'more-sensitive', 'very-sensitive', 'ultra-sensitive']):
		sys.stderr.write('Invalid value provided for --diamond_sensitivity option.\n')
		sys.exit(1)

	# create logging object
	log_file = outdir + 'Progress.log'
	logObject = util.createLoggerObject(log_file)
	version_string = util.getVersion()
	logObject.info('Running fai version %s' % version_string)
	sys.stdout.write('Running fai version %s\n' % version_string)

	logObject.info("Saving parameters for future records.")
	parameters_file = outdir + 'Parameter_Inputs.txt'
	parameter_values = [query_inputs, reference_genome, reference_contig, reference_start, reference_end,
						protein_queries_fasta, target_genomes, outdir, draft_mode, filter_paralogs, evalue_cutoff,
						min_prop, key_protein_queries_fasta, key_protein_evalue_cutoff, key_protein_min_prop,
						syntenic_correlation_threshold, max_genes_disconnect, flanking_context, ge_emission,
						be_emission, gc_transition, bg_transition, plotting_flag, diamond_sensitivity,
						delineation_mode, cpus, max_memory]
	parameter_names = ["Query gene-cluster GenBanks", "Reference genome for gene cluster",
					   "Reference scaffold for gene cluster", "Reference start coordinate of gene cluster",
					   "Reference end coordinate of gene cluster", "protein queries",
					   "Target genomes prepared directory by prepTG", "Output directory",
					   "Run in draft-assembly mode?", "Filter for paralogous/gene-content overlapping segments",
					   "General E-value cutoff for detection of protein homologs in genome",
					   "Minimum proportion of hits for gene presence in genome", "FASTA file with key proteins to consider",
					   "E-values for key proteins to be considered as high-supporting evidence for gene cluster presence",
					   "Minimum proportion of key protein hits required for gene cluster presence in genome",
					   "Syntenic correlation to known instance threshold required for gene cluster presence",
					   "Maximum distance in between candidate gene-cluster segments to perform merging",
					   "Base pair for flanking context to extract",
					   "Emission probability of gene being in gene-cluster state with homologous hit to gene-cluster",
					   "Emission probability of gene being in background state with homologous hit to gene-cluster",
					   "Probability for gene-cluster to gene-cluster transition in HMM",
					   "Probability for background to background transition in HMM",
					   "Perform plotting?", "DIAMOND sensitivity", "Delineation mode",
					   "Number of CPUs requested", "Maximum memory in GB"]
	util.logParametersToFile(parameters_file, parameter_names, parameter_values)
	logObject.info("Done saving parameters!")

	# set max memory limit (experimental; util.memory_limit uses the resource module)
	if max_memory is not None:
		logObject.info("Setting maximum memory usage to: %dGB" % max_memory)
		sys.stdout.write("Setting maximum memory usage to: %dGB\n" % max_memory)
		try:
			util.memory_limit(max_memory)
		except Exception as e:
			logObject.info("Error setting memory limit")
			sys.stdout.write("Error setting memory limit\n")

	# Determine which of the three (mutually exclusive) input modes was used.
	# Bug fix: the original tested the truthiness of a list comprehension of
	# booleans, which is True for ANY non-empty list - now every provided
	# query GenBank path must actually exist for input mode 1 to register.
	input_type_1_observed = False
	input_type_2_observed = False
	input_type_3_observed = False
	if query_inputs is not None and all(os.path.isfile(qif) for qif in query_inputs):
		input_type_1_observed = True
	if reference_genome is not None and os.path.isfile(reference_genome) and reference_contig is not None and reference_start is not None and reference_end is not None:
		input_type_2_observed = True
	if protein_queries_fasta is not None and os.path.isfile(protein_queries_fasta):
		input_type_3_observed = True

	if input_type_1_observed and not (input_type_2_observed or input_type_3_observed):
		sys.stdout.write('Beginning fai searches using the following GenBank(s) as queries: %s\n' % ' '.join(query_inputs))
		logObject.info('Beginning fai searches using the following GenBank(s) as queries: %s' % ' '.join(query_inputs))
	elif input_type_2_observed and not (input_type_1_observed or input_type_3_observed):
		sys.stdout.write('Beginning fai searches using coordinates %s:%d-%d in reference genome %s\n' % (reference_contig,
																									   reference_start,
																									   reference_end,
																									   reference_genome))
		logObject.info('Beginning fai searches using coordinates %s:%d-%d in reference genome %s' % (reference_contig,
																							reference_start,
																							reference_end,
																							reference_genome))
	elif input_type_3_observed and not (input_type_1_observed or input_type_2_observed):
		sys.stdout.write('Beginning fai searches using protein queries FASTA file at: %s\n' % protein_queries_fasta)
		logObject.info('Beginning fai searches using protein queries FASTA file at: %s' % protein_queries_fasta)
	elif input_type_1_observed or input_type_2_observed or input_type_3_observed:
		# Bug fix: this fatal condition previously logged the error but kept running.
		sys.stderr.write('Only one type of input mode can currently be accepted!\n')
		logObject.error('Only one type of input mode can currently be accepted!')
		sys.exit(1)
	else:
		# Bug fix: this fatal condition previously logged the error but kept running.
		sys.stderr.write('Required input(s) not provided!\n')
		logObject.error('Required input(s) not provided!')
		sys.exit(1)

	logObject.info('--------------------\nStep 1\n--------------------\nGetting protein sequences of known gene-cluster instance(s).')
	sys.stdout.write('--------------------\nStep 1\n--------------------\nGetting protein sequences of known gene-cluster instance(s).\n')

	query_information = None
	work_dir = outdir + 'Processing_Queries/'
	query_info_pickle_file = outdir + 'Query_Information.pkl'
	step1_check_file = check_dir + 'step1.txt'
	if not os.path.isfile(step1_check_file):
		util.setupReadyDirectory([work_dir])
		query_fasta = None
		comp_gene_info = None
		protein_to_hg = None
		if input_type_1_observed:
			# construct consensus sequences of ortholog groups to use for downstream searching
			# get paths to individual genbank files
			genbanks = set([])
			try:
				for filename in query_inputs:
					filename = os.path.abspath(filename)
					if os.path.isfile(filename) and (filename.endswith(".gbff") or filename.endswith(".gbk") or filename.endswith(".genbank")):
						if util.checkValidGenBank(filename, use_either_lt_or_pi=True):
							genbanks.add(filename)
						else:
							sys.stderr.write('Ignoring file %s because it doesn\'t have CDS features or a comma found in a CDS locus tag.' % filename)
							logObject.warning('Ignoring file %s because it doesn\'t have CDS features or a comma found in a CDS locus tag.' % filename)
			except Exception as e:
				sys.stderr.write('Issues with parsing query GenBanks!\n')
				logObject.error('Issues with parsing query GenBanks!')
				sys.stderr.write(traceback.format_exc())
				sys.exit(1)

			num_gbk = len(genbanks)
			if num_gbk == 0:
				sys.stderr.write('Issues with parsing query GenBanks! No query GenBanks found ...\n')
				logObject.error('Issues with parsing query GenBanks! No query GenBanks found ...')
				sys.exit(1)
			else:
				sys.stdout.write('Found %d GenBanks to use as a joint query.\n' % num_gbk)
				logObject.info('Found %d GenBanks to use as a query.' % num_gbk)

			if num_gbk == 1:
				# single query GenBank - just collapse its proteins with CD-HIT
				query_fasta = work_dir + 'NR_Reference_Query.faa'
				protein_queries_fasta = work_dir + '.'.join(list(genbanks)[0].split('/')[-1].split('.')[:-1]) + '.faa'
				util.convertGenbankToCDSProtsFasta(list(genbanks)[0], protein_queries_fasta, logObject, use_either_lt_or_pi=True)
				protein_to_hg = fai.collapseProteinsUsingCDHit(protein_queries_fasta, query_fasta, logObject)
			else:
				# multiple query GenBanks - determine ortholog groups and use consensus sequences
				ortho_matrix_file, query_fasta = fai.genConsensusSequences(genbanks, work_dir, logObject, cpus=cpus)
				protein_to_hg = fai.parseHomologGroupMatrix(ortho_matrix_file, logObject)
			comp_gene_info = fai.parseCoordsFromGenbank(genbanks, logObject)

		elif input_type_2_observed:
			reference_genome = os.path.abspath(reference_genome)

			reference_genome_gbk = reference_genome
			if not util.is_genbank(reference_genome) and util.is_fasta(reference_genome):
				# FASTA input - run gene calling first to produce a proper GenBank
				reference_genome_gbk = work_dir + 'reference.gbk'
				gene_call_cmd = ['runProdigalAndMakeProperGenbank.py', '-i', reference_genome, '-s', 'reference', '-l',
								 'OG', '-o', work_dir]
				try:
					subprocess.call(' '.join(gene_call_cmd), shell=True, stdout=subprocess.DEVNULL,
									stderr=subprocess.DEVNULL,
									executable='/bin/bash')
					assert(os.path.isfile(reference_genome_gbk))
					logObject.info('Successfully ran: %s' % ' '.join(gene_call_cmd))
				except Exception as e:
					# typo fix: message previously printed "runProdigalAndMakePRoperGenbank.py"
					logObject.error('Had an issue running runProdigalAndMakeProperGenbank.py: %s' % ' '.join(gene_call_cmd))
					sys.stderr.write('Had an issue running runProdigalAndMakeProperGenbank.py: %s\n' % ' '.join(gene_call_cmd))
					logObject.error(e)
					sys.exit(1)
			locus_genbank = work_dir + 'Reference_Query.gbk'
			locus_proteome = work_dir + 'Reference_Query.faa'
			fai.subsetGenBankForQueryLocus(reference_genome_gbk, locus_genbank, locus_proteome, reference_contig,
										   reference_start, reference_end, logObject)
			comp_gene_info = fai.parseCoordsFromGenbank([locus_genbank], logObject)
			query_fasta = work_dir + 'NR_Reference_Query.faa'
			protein_to_hg = fai.collapseProteinsUsingCDHit(locus_proteome, query_fasta, logObject)
		elif input_type_3_observed:
			# simplest case - just take the protein queries provided by the user. Unfortunately, syntenic information is
			# not present for such inputs.
			protein_queries_fasta = os.path.abspath(protein_queries_fasta)
			query_fasta = work_dir + 'NR_Reference_Query.faa'
			protein_to_hg = fai.collapseProteinsUsingCDHit(protein_queries_fasta, query_fasta, logObject)

		key_hgs = set([])
		if key_protein_queries_fasta is not None:
			key_hgs = fai.mapKeyProteinsToHomologGroups(query_fasta, key_protein_queries_fasta, work_dir, logObject, cpus=cpus)
		query_information = {'protein_to_hg': protein_to_hg, 'query_fasta': query_fasta,
							 'comp_gene_info': comp_gene_info, 'key_hgs': key_hgs}
		with open(query_info_pickle_file, 'wb') as pickle_file:
			pickle.dump(query_information, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
		_touch(step1_check_file)

	if query_information is None:
		# checkpoint existed - load the step 1 results from the pickle instead
		try:
			with open(query_info_pickle_file, 'rb') as handle:
				query_information = pickle.load(handle)
			assert(os.path.isfile(query_information['query_fasta']))
		except Exception as e:
			sys.stderr.write('Issues with reading in query information from pickle file (might not exist). Please rerun annotations after deleting checkpoint file Step1.txt.!\n')
			logObject.error('Issues with reading in query information from pickle file (might not exist). Please rerun annotations after deleting checkpoint file Step1.txt.!\n')
			sys.stderr.write(str(e) + '\n')
			sys.exit(1)

	# Quickly read target genomes information (don't load the database yet!)
	# Robustness fix: previously a missing prepTG directory left the DIAMOND
	# database path as None and caused an obscure failure in step 2.
	if not os.path.isdir(target_genomes):
		sys.stderr.write('prepTG directory provided to --target_genomes_db does not exist.\n')
		logObject.error('prepTG directory provided to --target_genomes_db does not exist.')
		sys.exit(1)
	preptg_dir = os.path.abspath(target_genomes) + '/'
	target_annot_listing_file = preptg_dir + 'Target_Genome_Annotation_Files.txt'
	target_genome_dmnd_db = preptg_dir + 'Target_Genomes_DB.dmnd'

	try:
		assert(os.path.isfile(target_annot_listing_file) and os.path.isfile(target_genome_dmnd_db))
	except Exception as e:
		sys.stderr.write('Issues finding expected files within prepTG output directory provided to --target_genomes argument.\n')
		logObject.error('Issues finding expected files within prepTG output directory provided to --target_genomes argument.\n')
		sys.stderr.write(str(e) + '\n')
		sys.exit(1)

	target_annotation_information, valid_tg_samples = _parse_target_annotation_listing(preptg_dir, target_annot_listing_file)

	logObject.info('--------------------\nStep 2\n--------------------\nRunning DIAMOND BLASTp and Processing Results.')
	sys.stdout.write('--------------------\nStep 2\n--------------------\nRunning DIAMOND BLASTp and Processing Results.\n')

	step2_check_file = check_dir + 'step2.txt'
	diamond_work_dir = outdir + 'DIAMOND_Analysis/'
	diamond_results = None
	diamond_results_pickle_file = outdir + 'DIAMOND_Results.pkl'
	if not os.path.isfile(step2_check_file):
		util.setupReadyDirectory([diamond_work_dir])
		diamond_results_file = fai.runDiamondBlastp(target_genome_dmnd_db, query_information['query_fasta'], diamond_work_dir, logObject, diamond_sensitivity=diamond_sensitivity, evalue_cutoff=evalue_cutoff, cpus=cpus)
		diamond_results = fai.processDiamondBlastp(target_annotation_information, diamond_results_file, diamond_work_dir, logObject)
		with open(diamond_results_pickle_file, 'wb') as pickle_file:
			pickle.dump(diamond_results, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
		_touch(step2_check_file)

	if diamond_results is None:
		# checkpoint existed - load the step 2 results from the pickle instead
		try:
			with open(diamond_results_pickle_file, 'rb') as handle:
				diamond_results = pickle.load(handle)
		except Exception as e:
			sys.stderr.write('Issues with reading in diamond results from pickle file (might not exist). Please rerun annotations after deleting checkpoint file Step2.txt.!\n')
			logObject.error('Issues with reading in diamond results from pickle file (might not exist). Please rerun annotations after deleting checkpoint file Step2.txt.!\n')
			sys.stderr.write(str(e) + '\n')
			sys.exit(1)

	logObject.info('--------------------\nStep 3\n--------------------\nLoading target genomes database prepared by prepTG.')
	sys.stdout.write('--------------------\nStep 3\n--------------------\nLoading target genomes database prepared by prepTG.\n')

	step3_check_file = check_dir + 'step3.txt'
	target_information = None
	target_info_pickle_file = outdir + 'Target_Information.pkl'
	if not os.path.isfile(step3_check_file):
		target_genomes_pkl_dir = preptg_dir + 'Target_Genomes_Info_Pickled/'
		try:
			assert(os.path.isdir(target_genomes_pkl_dir))
		except Exception as e:
			sys.stderr.write('Issues finding expected files within prepTG output directory provided to --target_genomes argument.\n')
			logObject.error('Issues finding expected files within prepTG output directory provided to --target_genomes argument.\n')
			sys.stderr.write(str(e) + '\n')
			sys.exit(1)

		target_genome_gene_info = fai.loadTargetGenomeInfo(target_annotation_information, target_genomes_pkl_dir, diamond_results, valid_tg_samples, logObject)

		target_information = {'target_annotation_information': target_annotation_information,
							  'target_genome_gene_info': target_genome_gene_info,
							  'target_concat_genome_db': target_genome_dmnd_db}
		with open(target_info_pickle_file, 'wb') as pickle_file:
			pickle.dump(target_information, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
		# Bug fix: the checkpoint is now only written after the pickle has
		# been saved (previously it was touched even if nothing was pickled).
		_touch(step3_check_file)

	if target_information is None:
		# checkpoint existed - load the step 3 results from the pickle instead
		try:
			with open(target_info_pickle_file, 'rb') as handle:
				target_information = pickle.load(handle)
		except Exception as e:
			# typo fix: message previously referenced checkpoint file Step2.txt
			sys.stderr.write('Issues with reading in target information from pickle file (might not exist). Please rerun annotations after deleting checkpoint file Step3.txt.!\n')
			logObject.error('Issues with reading in target information from pickle file (might not exist). Please rerun annotations after deleting checkpoint file Step3.txt.!\n')
			sys.stderr.write(str(e) + '\n')
			sys.exit(1)

	logObject.info('--------------------\nStep 4\n--------------------\nIdentifying homologous gene-cluster segments.')
	sys.stdout.write('--------------------\nStep 4\n--------------------\nIdentifying homologous gene-cluster segments.\n')

	step4_check_file = check_dir + 'step4.txt'
	hmm_work_dir = outdir + 'GC_Segment_Processing/'
	homologous_gbk_dir = outdir + 'Homologous_GenBanks_Directory/'
	if not os.path.isfile(step4_check_file):
		util.setupReadyDirectory([hmm_work_dir, homologous_gbk_dir])
		# translate user-specified proportions into absolute hit-count thresholds
		min_hits = min_prop*len(set(query_information['protein_to_hg'].values()))
		key_protein_min_hits = key_protein_min_prop*len(query_information['key_hgs'])

		fai.identifyGCInstances(query_information, target_information, diamond_results, hmm_work_dir, logObject,
								 min_hits=min_hits, min_key_hits=key_protein_min_hits, draft_mode=draft_mode,
								 gc_to_gc_transition_prob=gc_transition, bg_to_bg_transition_prob=bg_transition,
								 gc_emission_prob_with_hit=ge_emission, gc_emission_prob_without_hit=be_emission,
								 syntenic_correlation_threshold=syntenic_correlation_threshold,
								 kq_evalue_threshold=key_protein_evalue_cutoff,
								 max_int_genes_for_merge=max_genes_disconnect, flanking_context=flanking_context,
								 cpus=cpus, block_size=1000, gc_delineation_mode=delineation_mode)
		if draft_mode or filter_paralogs:
			fai.filterParalogousSegmentsAndConcatenateIntoMultiRecordGenBanks(hmm_work_dir, homologous_gbk_dir, logObject)
		else:
			for f in os.listdir(hmm_work_dir + 'GeneCluster_Genbanks/'):
				shutil.move(hmm_work_dir + 'GeneCluster_Genbanks/' + f, homologous_gbk_dir)

		_touch(step4_check_file)

	step5_check_file = check_dir + 'step5.txt'
	plot_dir = outdir + 'Plotting_Files/'
	plot_result_pdf = outdir + 'Homologous_Gene-Cluster_Segments.pdf'
	plot_naming_pdf = outdir + 'Name_Mapping_of_Queries_to_Original_LocusTags.pdf'
	if not os.path.isfile(step5_check_file) and plotting_flag:
		logObject.info('--------------------\nStep 5\n--------------------\nPlotting overview of gene-cluster segments identified.')
		sys.stdout.write('--------------------\nStep 5\n--------------------\nPlotting overview of gene-cluster segments identified.\n')
		util.setupReadyDirectory([plot_dir])
		fai.plotOverviews(target_information['target_annotation_information'], hmm_work_dir,
						  query_information['protein_to_hg'], plot_dir, plot_result_pdf, plot_naming_pdf, logObject, height=5, width=15)
		_touch(step5_check_file)

	# Close logging object and exit
	util.closeLoggerObject(logObject)
	sys.exit(0)

if __name__ == '__main__':
	# Explicitly select the 'fork' start method so that worker processes
	# created by downstream multiprocessing-based steps inherit the parent's
	# loaded state (macOS defaults to 'spawn' since Python 3.8, which would
	# re-import this module in each child). NOTE(review): set_start_method
	# raises RuntimeError if a start method was already chosen - assumed to
	# be the first setter here; confirm no imported module sets it earlier.
	multiprocessing.set_start_method('fork')
	faiMain()
