#!/usr/bin/env python3

### Program: prepTG
### Author: Rauf Salamzade
### Kalan Lab
### UW Madison, Department of Medical Microbiology and Immunology

# BSD 3-Clause License
#
# Copyright (c) 2021, Kalan-Lab
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import shutil
import sys
import argparse
from Bio import SeqIO
from zol import util
import subprocess
import multiprocessing
import shutil
import gzip
import traceback

valid_gtdb_releases = set(['R214', 'R220'])
premade_db_set = set(['Acinetobacter', 'Bacillales', 'Corynebacterium', 'Enterobacter', 'Enterococcus', 'Escherichia', 
                      'Klebsiella', 'Lactobacillus', 'Listeria', 'Micromonospora', 'Mycobacterium', 'Pseudomonas', 
					  'Salmonella', 'Staphylococcus', 'Streptococcus', 'Neisseria', 'Streptomyces', 'Cutibacterium'])
premade_download_links = {'Acinetobacter': 'https://zenodo.org/record/10042148/files/Acinetobacter.tar.gz?download=1',
			  'Bacillales': 'https://zenodo.org/record/10044725/files/Bacillales.tar.gz?download=1',
			  'Corynebacterium': 'https://zenodo.org/record/10032209/files/Corynebacterium.tar.gz?download=1',
			  'Enterobacter': 'https://zenodo.org/record/10042148/files/Enterobacter.tar.gz?download=1',
			  'Enterococcus': 'https://zenodo.org/record/10042148/files/Enterococcus.tar.gz?download=1',
			  'Escherichia': 'https://zenodo.org/record/10032209/files/Escherichia.tar.gz?download=1', 
			  'Klebsiella': 'https://zenodo.org/record/10042148/files/Klebsiella.tar.gz?download=1',
			  'Cutibacterium': 'https://zenodo.org/record/10032209/files/Cutibacterium.tar.gz?download=1',
			  'Listeria': 'https://zenodo.org/record/10032209/files/Listeria.tar.gz?download=1',
			  'Lactobacillus': 'https://zenodo.org/record/10032209/files/Lactobacillus.tar.gz?download=1',
			  'Micromonospora': 'https://zenodo.org/record/10044725/files/Micromonospora.tar.gz?download=1',
			  'Mycobacterium': 'https://zenodo.org/record/10032209/files/Mycobacterium.tar.gz?download=1',
			  'Pseudomonas': 'https://zenodo.org/record/10042148/files/Pseudomonas.tar.gz?download=1',
			  'Salmonella': 'https://zenodo.org/record/10032209/files/Salmonella.tar.gz?download=1',
			  'Staphylococcus': 'https://zenodo.org/record/10042148/files/Staphylococcus.tar.gz?download=1',
			  'Streptomyces': 'https://zenodo.org/record/10044725/files/Streptomyces.tar.gz?download=1',
              'Neisseria': 'https://zenodo.org/record/10032209/files/Neisseria.tar.gz?download=1',
              'Streptococcus': 'https://zenodo.org/record/10032209/files/Streptococcus.tar.gz?download=1'}

def create_parser():
	""" Parse arguments """
	parser = argparse.ArgumentParser(description="""
	Program: prepTG
	Author: Rauf Salamzade
	Affiliation: Kalan Lab, UW Madison, Department of Medical Microbiology and Immunology

	Prepares a directory of target genomes for being searched for query gene clusters using fai.
	
	Premade databases of representative genomes are available for the following genera:

	Acinetobacter (n=1,643), Bacillales (n=3,150), Corynebacterium (n=726), Enterobacter (n=878), 
	Enterococcus (n=937), Escherichia (n=2,436), Klebsiella (n=1,022), Listeria (n=353), 
	Mycobacterium (n=744), Pseudomonas (n=2,666), Salmonella (n=308), Staphylococus (n=496), 
	Streptomyces (n=1,555), Streptococcus (n=2,452), Cutibacterium (n=27), Neisseria (n=414),
	Lactobacillus (n=541), and Micromonospora (n=211).

	In addition, users can simply request all genomes belonging to a specific species/genus 
	in GTDB R214 to be downloaded.
	
        ----------------------------------------------------------------------------------------
	> Example commands:

	1. Setup a prepTG database which includes some local genomes in FASTA format

	    prepTG -i User_Genomes_Directory/ 

	2. Setup a prepTG database which includes some local genomes and all Cutibacterium granulosum
	   genomes in GTDB R214: 
	
	    prepTG -i User_Genomes_Directory/ -g "Cutibacterium granulosum" -o prepTG_Database/
	
	3. Setup local prepTG database by downloading a premade one of representative 
	   Cutibacterium genomes:

	    prepTG -d Cutibacterium -o prepTG_Database/

	----------------------------------------------------------------------------------------
	> Considerations
	If FASTA format is provided, assumption is that genomes are prokaryotic and 
	pyrodigal/prodigal will be used to perform gene-calling. Eukaryotic genomes can 
	be provided as FASTA format but the --reference_proteome file should be used in 
	such case to map proteins from a reference proteome (from the same species ideally) 
	on to the target genomes. This will prevent detection of new genes in gene-clusters 
	detected by fai but synchronize gene-calling and allow for better similarity 
	assessment between orthologous genes.
								  
	If you are interested in inferring horizontal gene transfer and working with bacterial
	genomes - consider issuing the "--mge_annotation" flag to annotate phage, plasmid and 
	IS element associated proteins. 
	""", formatter_class=argparse.RawTextHelpFormatter)

	parser.add_argument('-d', '--download_premade', help='Download and setup pre-made databases of representative genomes for specific taxon/genus.\nProvide name of the taxon, e.g. "Escherichia"', required=False, default=None)
	parser.add_argument('-i', '--input_dir', help='Directory with target genomes (either featuring GenBanks or FASTAs).', required=False, default=None)
	parser.add_argument('-g', '--gtdb_taxon', help='Name of a GTDB valid genus or species to incorporate genomes from.\nShould be surrounded by\nquotes (e.g. "Escherichia coli").', required=False, default=None)
	parser.add_argument('-gr', '--gtdb_release', help='GTDB release to use. [Current default is R220].', default='R220', required=False)
	parser.add_argument('-o', '--output_dir', help='Output directory, which can then be provided as input for the\n"-tg" argument in fai.', required=True)
	parser.add_argument('-l', '--locus_tag_length', type=int, help='Length of locus tags to set. Default is 3, allows for <~18k genomes.', required=False, default=3)
	parser.add_argument('-r', '--rename_locus_tags', action='store_true', help='Whether to rename locus tags if provided for CDS features in GenBanks.', required=False, default=False)
	parser.add_argument('-gcm', '--gene_calling_method', help='Method to use for gene calling. Options are: pyrodigal, prodigal,\nor prodigal-gv. [Default is pyrodigal].', required=False, default='pyrodigal')
	parser.add_argument('-m', '--meta_mode', action='store_true', help='Flag to use meta mode instead of single for pyrodigal/prodigal.', default=False, required=False)
	parser.add_argument('-rp', '--reference_proteome', help='Provide path to a reference proteome to use for protein/gene-calling\nin target genomes - which should be in FASTA format.', default=None, required=False)
	parser.add_argument('-cst', '--create_species_tree', action='store_true', help='Use skani to infer a neighbor-joing based species tree for the genomes.', required=False, default=False)
	parser.add_argument('-ma', '--mge_annotation', action='store_true', help="Perform MGE annotation of proteins - for bacterial genomes only.", required=False, default=False)
	parser.add_argument('-c', '--cpus', type=int, help="Total number of cpus/threads to use for running OrthoFinder2/prodigal.\n[Default is 1].", required=False, default=1)
	parser.add_argument('-mm', '--max_memory', type=int, help='Uses resource module to set soft memory limit. Provide in Giga-bytes.\nGenerally memory shouldn\'t be a major concern unless working\nwith hundreds of large metagenomes. [currently\nexperimental; default is None].', default=None, required=False)
	parser.add_argument('-v', '--version', action='store_true', help="Get version and exit.", required=False, default=False)

	args = parser.parse_args()
	return args

def prepTG():
	"""
	Void function which runs primary workflow for program.
	"""

	# get version
	version = util.getVersion()

	if len(sys.argv)>1 and ('-v' in set(sys.argv) or '--version' in set(sys.argv)):
		sys.stdout.write(version + '\n')
		sys.exit(0)

	"""
	PARSE ARGUMENTS
	"""
	myargs = create_parser()

	target_genomes_dir = myargs.input_dir
	gtdb_taxon = myargs.gtdb_taxon
	download_premade = myargs.download_premade
	outdir = os.path.abspath(myargs.output_dir) + '/'
	locus_tag_length = myargs.locus_tag_length
	rename_locus_tags = myargs.rename_locus_tags
	gtdb_release = myargs.gtdb_release
	cpus = myargs.cpus
	max_memory = myargs.max_memory
	gene_calling_method = myargs.gene_calling_method
	mge_annotation_flag = myargs.mge_annotation
	meta_mode = myargs.meta_mode
	create_species_tree = myargs.create_species_tree
	reference_proteome = myargs.reference_proteome
	if reference_proteome != None:
		gene_calling_method = 'miniprot reference proteome mapping'
		reference_proteome = os.path.abspath(myargs.reference_proteome)


	if os.path.isdir(outdir):
		sys.stderr.write("Output directory exists! Exiting\n ")
		sys.exit(1)
	else:
		os.system('mkdir %s' % outdir)

	if target_genomes_dir == None and download_premade == None and gtdb_taxon == None:
		sys.stderr.write('No input genomes provided or requested!\n')
		sys.exit(1)

	if download_premade != None:
		try:
			assert(download_premade in premade_db_set)
		except:
			sys.stderr.write('Non-valid genus specified for downloading a premade database.\n')
			sys.stderr.write('Valid options include:\n' + ', '.join(sorted(premade_db_set)) + '\n')
			sys.exit(1)

	if gtdb_taxon != None and download_premade != None:
		sys.stderr.write('Currently requesting genomes for a specific GTDB genus/species and also requesting using a premade database is not supported.\n')
		sys.exit(1)

	if target_genomes_dir != None and download_premade != None:
		sys.stderr.write('Currently providing genomes in a directory and also requesting a premade database is not supported.\n')
		sys.exit(1)

	if target_genomes_dir != None:
		try:
			assert(os.path.isdir(target_genomes_dir))
			target_genomes_dir = os.path.abspath(target_genomes_dir) + '/'
		except:
			sys.stderr.write('Issue with validating directory with target genomes exists.\n')
			sys.exit(1)

	"""	
	START WORKFLOW
	"""
	
	sys.stdout.write('Input command:\n%s\n' % ' '.join(sys.argv))

        # create logging object
	log_file = outdir + 'Progress.log'
	logObject = util.createLoggerObject(log_file)
	logObject.info("Saving parameters for future records.")
	parameters_file = outdir + 'Parameter_Inputs.txt'
	parameter_values = [download_premade, target_genomes_dir, gtdb_taxon, outdir, reference_proteome, gene_calling_method, 
					    meta_mode, create_species_tree, cpus, max_memory]
	parameter_names = ["Download Premade Database", "Target Genomes Directory", "GTDB species/genus", "Output Directory", 
					   "Reference Proteome Provided", "Gene Calling Method", "Use Meta Mode?", "Create Species Tree?", 
					   "Number of Cpus", "Maximum Memory in GB"]
	util.logParametersToFile(parameters_file, parameter_names, parameter_values)
	logObject.info("Done saving parameters!")

	logObject.info('Running prepTG version %s' % version)
	sys.stdout.write('Running prepTG version %s\n' % version)

	gtdb_genomes_dir = outdir + 'GTDB_Genomes/'
	if gtdb_taxon != None:
		# Step 0: Download GTDB listing file from lsaBGC git repo, parse GTDB information 
		# file, get list of Genbank accessions, and perform dry-run with ncbi-genome-download 
		# if requested.
		gtdb_release = gtdb_release.upper()
		try:
			assert(gtdb_release in valid_gtdb_releases)
		except:
			sys.stderr.write('GTDB release specified is not valid, options include: ' + ', '.join(valid_gtdb_releases))
			sys.exit(1)

		sys.stdout.write("Using axel to download the GTDB " + gtdb_release + " listing file.\n")
		gtdb_listing_file = outdir + "GTDB_" + gtdb_release + "_Information.txt.gz"	
		wget_cmd = ['axel', 'https://github.com/raufs/gtdb_gca_to_taxa_mappings/raw/main/GTDB_' + gtdb_release + '_Information.txt.gz', '-o', gtdb_listing_file]

		try:
			subprocess.call(' '.join(wget_cmd), shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, executable='/bin/bash')
			assert (os.path.isfile(gtdb_listing_file))
		except:
			sys.stderr.write('Had an issue running: %s\n' % ' '.join(wget_cmd))
			sys.stderr.write(traceback.format_exc())
			sys.exit(1)
	
		genbank_accession_listing_file = outdir + 'NCBI_Genbank_Accession_Listing.txt'
		sys.stdout.write("--------------------\nStep 0\n--------------------\nBeginning by assessing which genomic assemblies are available for the taxa %s in GTDB %s and NCBI's Genbank db\n" % (gtdb_taxon, gtdb_release))

		if not os.path.isfile(genbank_accession_listing_file):
			genbank_accession_listing_handle = open(genbank_accession_listing_file, 'w')
			with gzip.open(gtdb_listing_file, 'rt') as ogtdb:
				for line in ogtdb:
					line = line.strip('\n')
					ls = line.split('\t')
					if ls[0] == 'none': continue
					if len(gtdb_taxon.split()) == 1:
						if ls[1] == gtdb_taxon:
							genbank_accession_listing_handle.write(ls[0] + '\n')
					elif len(gtdb_taxon.split()) == 2:
						if ls[2] == gtdb_taxon:
							genbank_accession_listing_handle.write(ls[0] + '\n')
			genbank_accession_listing_handle.close()

		ogalf = open(genbank_accession_listing_file)
		accession_count = len(ogalf.readlines())
		ogalf.close()

		if accession_count == 0:
			sys.stderr.write('Warning: no genomes found to belong the genus or species specified in GTDB.\n')
		else:
			genome_listing_file = outdir + 'GTDB_Genomes_from_NCBIGenbank_for_Taxa.txt'
			if not os.path.isfile(genome_listing_file):
				if accession_count != 0:
					ngd_dry_cmd = ['ncbi-genome-download', '--dry-run', '--section', 'genbank', '-A', genbank_accession_listing_file, 'bacteria', '>', genome_listing_file]
					try:
						subprocess.call(' '.join(ngd_dry_cmd), shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, executable='/bin/bash')
						assert (os.path.isfile(genome_listing_file))
					except:
						sys.stderr.write('Had an issue running: %s\n' % ' '.join(cmd))
						sys.stderr.write(traceback.format_exc())
				else:
					of = open(genome_listing_file, 'w')
					of.close()

			oglf = open(genome_listing_file)
			genome_count = len(oglf.readlines())
			oglf.close()
			if genome_count == 0:
				sys.stderr.write('Warning: no genomes could be downloaded with ncbi-genome-download.\n')
			else:
				if genome_count != 0:
					ngd_real_cmd = ['ncbi-genome-download', '--formats', 'fasta', '--retries', '2', '--section',
							'genbank', '-A', genbank_accession_listing_file, '-o', gtdb_genomes_dir,
							'--flat-output',  'bacteria']
					try:
						subprocess.call(' '.join(ngd_real_cmd), shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, executable='/bin/bash')
						assert (os.path.isdir(gtdb_genomes_dir))
					except:
						sys.stderr.write('Had an issue running: %s\n' % ' '.join(cmd))
						sys.stderr.write(traceback.format_exc())

					uncompress_cmds = []
					for f in os.listdir(gtdb_genomes_dir):
						uncompress_cmds.append(['gunzip', gtdb_genomes_dir + f, logObject])
					p = multiprocessing.Pool(cpus)
					p.map(util.multiProcess, uncompress_cmds)
					p.close()

					gca_to_species_name = {}
					with gzip.open(gtdb_listing_file, 'rt') as ogtdb:
						for line in ogtdb:
							line = line.strip('\n')
							ls = line.split('\t')
							if ls[0] == 'none': continue
							gca = ls[0]
							sp_name = '_'.join(ls[2].split())
							gca_to_species_name[gca] = sp_name
									
					for gf in os.listdir(gtdb_genomes_dir):
						gfile = gtdb_genomes_dir + gf  
						if not gfile.endswith('.fasta') and not gfile.endswith('.fa') and not gfile.endswith('.fna'): continue
						assert (util.is_fasta(gfile))
						gca = '_'.join(gf.split('_')[:2])
						species_name = gca_to_species_name[gca]
						renamed_gfile = gtdb_genomes_dir + species_name + '_' + gca + '.fasta'
						os.rename(gfile, renamed_gfile)

	if target_genomes_dir != None:
		try:
			assert(os.path.isdir(target_genomes_dir))
			target_genomes_dir = os.path.abspath(target_genomes_dir) + '/'
		except:
			sys.stderr.write('Issue with validating directory with target genomes exists.\n')
			sys.exit(1)


	# set max memory limit
	if max_memory != None:
		logObject.info("Setting maximum memory usage to: %dGB" % max_memory)
		try:
			util.memory_limit(max_memory)
		except Exception as e:
			logObject.info("Error setting memory limit")
			sys.stdout.write("Error setting memory limit\n")

	if download_premade != None:
		# Step 1: Download and setup premade prepTG database for genus
		curr_workspace_dir =  os.getcwd()
		dl = premade_download_links[download_premade]
		dl_base = dl.split('/')[-1].split('?')[0]
		axel_download_dbs_cmd = ['axel', '-a', '-n', str(cpus), dl]
		extract_cmd = ['tar', '-zxvf', dl_base]
		reorganize_cmd = ['mv', download_premade + '/*', '.']
		rm_cmd = ['rm', '-f', dl_base]
		try:
			os.chdir(outdir)
			os.system(' '.join(axel_download_dbs_cmd))
			os.system(' '.join(extract_cmd))
			assert(os.path.isdir(download_premade))
			os.system(' '.join(reorganize_cmd))
			shutil.rmtree(download_premade)
			assert(os.path.isdir(outdir) and os.path.isfile(outdir + 'Target_Genomes_DB.dmnd'))
			os.system(' '.join(rm_cmd))
			os.chdir(curr_workspace_dir)
		except Exception as e:
			logObject.error('Had an issue downloading/setting-up the premade database requested.')
			sys.stderr.write('Had an issue downloading/setting-up the premade database requested.')
			logObject.error(e)
			sys.exit(1)
	else:
		# Step 1: List genomes in directory
		sys.stdout.write('--------------------\nStep 1\n--------------------\nProcessing genomes provided in input or requested for download from GTDB/NCBI.\n')
		target_listing_file = outdir + 'Target_Genomes.txt'
		uncompress_dir = outdir + 'Uncompressed_Genomes/'

		if target_genomes_dir != None:
			list_cmd = ['listAllGenomesInDirectory.py', '-i', target_genomes_dir, '--uncompress_dir', uncompress_dir, '>',
						target_listing_file]
			try:
				subprocess.call(' '.join(list_cmd), shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, executable='/bin/bash')
				logObject.info('Successfully ran: %s' % ' '.join(list_cmd))
			except Exception as e:
				logObject.error('Had an issue running: %s' % ' '.join(list_cmd))
				sys.stderr.write('Had an issue running: %s' % ' '.join(list_cmd))
				logObject.error(e)
				sys.exit(1)
		if gtdb_taxon != None:
			list_cmd = ['listAllGenomesInDirectory.py', '-i', gtdb_genomes_dir, '--uncompress_dir', uncompress_dir, '>>', target_listing_file]
			try:
				subprocess.call(' '.join(list_cmd), shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, executable='/bin/bash')
				logObject.info('Successfully ran: %s' % ' '.join(list_cmd))
			except Exception as e:
				logObject.error('Had an issue running: %s' % ' '.join(list_cmd))
				sys.stderr.write('Had an issue running: %s' % ' '.join(list_cmd))
				logObject.error(e)
				sys.exit(1)

		genome_count = 0
		with open(target_listing_file) as otlf:
			for line in otlf:
				genome_count += 1
		if genome_count == 0:
			logObject.error('No genomes found and/or could be downloaded!!! Exiting!')
			sys.stderr.write('No genomes found and/or could be downloaded!!! Exiting!\n')
			sys.exit(1)

		# Step 2: Setup files needed for fai
		sys.stdout.write('--------------------\nStep 2\n--------------------\nRe-formatting genomes for usage in fai downstream.\n')
		format_assess_dir = outdir + 'Format_Assessment/'
		util.setupReadyDirectory([format_assess_dir])
		format_predictions_file = outdir + 'All_Format_Inferences.txt'
		target_genome_annotation_listing_file = outdir + 'Target_Genome_Annotation_Files.txt'
		additional_sample_genomes, additional_format_prediction = util.parseSampleGenomes(target_listing_file, format_assess_dir, format_predictions_file, logObject, cpus=cpus)
		if additional_format_prediction == 'mixed':
			logObject.error(
				'Format of additional genomes provided is not consistently FASTA or Genbank, please check input.')
			raise RuntimeError(
				'Format of additional genomes provided is not consistently FASTA or Genbank, please check input.')

		additional_proteomes_directory = outdir + 'Predicted_Proteomes_Additional/'
		additional_genbanks_directory = outdir + 'Genomic_Genbanks_Additional/'
		additional_genbanks_directory_local = 'Genomic_Genbanks_Additional/'
		util.setupReadyDirectory([additional_proteomes_directory, additional_genbanks_directory])
		if additional_format_prediction == 'fasta':
			if reference_proteome == None:
				additional_prodigal_outdir = outdir + 'Prodigal_Gene_Calling_Additional/'
				util.setupReadyDirectory([additional_prodigal_outdir])
				util.processGenomesUsingProdigal(additional_sample_genomes, additional_prodigal_outdir,
												additional_proteomes_directory, additional_genbanks_directory, logObject,
												cpus=cpus, locus_tag_length=locus_tag_length, gene_calling_method=gene_calling_method,
												meta_mode=meta_mode)
				shutil.rmtree(additional_prodigal_outdir)
			else:
				additional_miniprot_outdir = outdir + 'Miniprot_Gene_Calling_Additional/'
				util.setupReadyDirectory([additional_miniprot_outdir])
				util.processGenomesUsingMiniprot(reference_proteome, additional_sample_genomes, additional_miniprot_outdir,
												additional_proteomes_directory, additional_genbanks_directory, logObject,
												cpus=cpus, locus_tag_length=locus_tag_length)
				shutil.rmtree(additional_miniprot_outdir)
		else:
			# genomes are provided as Genbanks with CDS features
			gene_name_mapping_outdir = outdir + 'Mapping_of_New_Gene_Names_to_Original/'
			util.setupReadyDirectory([gene_name_mapping_outdir])
			util.processGenomesAsGenbanks(additional_sample_genomes, additional_proteomes_directory,
										additional_genbanks_directory, gene_name_mapping_outdir, logObject,
										cpus=cpus, locus_tag_length=locus_tag_length, rename_locus_tags=rename_locus_tags)

		target_genome_annotation_listing_handle = open(target_genome_annotation_listing_file, 'w')
		target_genomes_concat_db_faa = outdir + 'Target_Genomes_DB.faa'
		target_genomes_concat_db_handle = open(target_genomes_concat_db_faa, 'w')
		for f in os.listdir(additional_proteomes_directory):
			sample = f.split('.faa')[0]
			target_genome_annotation_listing_handle.write(sample + '\t' + additional_genbanks_directory_local + sample + '.gbk\n')
			try:
				with open(additional_proteomes_directory + sample + '.faa') as ospf:
					for rec in SeqIO.parse(ospf, 'fasta'):
						target_genomes_concat_db_handle.write('>' + sample + '|' + rec.id + '\n' + str(rec.seq) + '\n')
			except:
				pass
		target_genome_annotation_listing_handle.close()
		target_genomes_concat_db_handle.close()

		# If requested, run MGE annotations
		if mge_annotation_flag:
			mge_annot_dir = outdir + 'MGE_Annotations/'
			mge_annot_search_dir = mge_annot_dir + 'Search_Results/'
			mge_annot_summary_dir = mge_annot_dir + 'Summary_Results/'
			util.setupReadyDirectory([mge_annot_dir, mge_annot_search_dir, mge_annot_summary_dir])
			search_mge_annot_inputs = []
			process_mge_annot_inputs = []
			for f in os.listdir(additional_proteomes_directory):
				sample = f.split('.faa')[0]
				faa_file = additional_proteomes_directory + f
				gbk_file = additional_genbanks_directory + sample + '.gbk'
				vog_annot_file = mge_annot_search_dir + sample + '.vog.txt'
				mobsuite_annot_file = mge_annot_search_dir + sample + '.mobsuite.txt'
				is_annot_file = mge_annot_search_dir + sample + '.isfinder.txt'
				summary_file = mge_annot_summary_dir + sample + '.txt'
				search_mge_annot_inputs.append([sample, faa_file, vog_annot_file, mobsuite_annot_file, is_annot_file, logObject])
				process_mge_annot_inputs.append([sample, gbk_file, summary_file, vog_annot_file, mobsuite_annot_file, is_annot_file, logObject])
	
			p = multiprocessing.Pool(cpus)
			p.map(util.annotateMGEs, search_mge_annot_inputs)
			p.close()

			p = multiprocessing.Pool(cpus)
			p.map(util.processMGEAnnotations, process_mge_annot_inputs)
			p.close()	

		# remove individual proteome files
		sys.stdout.write('--------------------\nStep 4\n--------------------\nRemoving intermediate files to save space.\n')
		shutil.rmtree(additional_proteomes_directory)

		target_genomes_concat_db_dmnd = outdir + 'Target_Genomes_DB.dmnd'
		diamond_makedb_cmd = ['diamond', 'makedb', '--ignore-warnings', '--in', target_genomes_concat_db_faa, '-d',
							target_genomes_concat_db_dmnd]
		try:
			subprocess.call(' '.join(diamond_makedb_cmd), shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
							executable='/bin/bash')
			logObject.info('Successfully ran: %s' % ' '.join(diamond_makedb_cmd))
		except Exception as e:
			logObject.error('Had an issue running: %s' % ' '.join(diamond_makedb_cmd))
			sys.stderr.write('Had an issue running: %s\n' % ' '.join(diamond_makedb_cmd))
			logObject.error(e)
			sys.exit(1)

		target_genome_annotation_data = util.readInAnnotationFilesForExpandedSampleSet(target_genome_annotation_listing_file, additional_genbanks_directory)

		pickle_output_dir = outdir + 'Target_Genomes_Info_Pickled/'
		util.setupReadyDirectory([pickle_output_dir])
		try:
			sample_genbank_info = []
			for sample in target_genome_annotation_data:
				sample_genbank = target_genome_annotation_data[sample]['genbank']
				result_pkl_file = pickle_output_dir + sample + '.pkl'
				sample_genbank_info.append([sample, sample_genbank, result_pkl_file])

			p = multiprocessing.Pool(cpus)
			p.map(util.parseGenbankAndFindBoundaryGenes, sample_genbank_info)
			p.close()
		except Exception as e:
			sys.stderr.write('Issues creating pickle file needed for fai.')
			sys.stderr.write(e + '\n')
			logObject.error(e)
			sys.exit(1)
		
		# If requested - create a species tree based on skani ANI estimates and neighbor-joining (using the ape library in R)		
		species_tree_dir = outdir + 'Species_Tree_Workspace/'
		species_tree = outdir + 'Species_Tree.nwk'
		if create_species_tree:
				util.setupReadyDirectory([species_tree_dir])
				sys.stdout.write('--------------------\nStep 5\n--------------------\nInferring speices tree based on skani ANI estimates + neighbor-joining.\n')
				util.createNJTree(additional_genbanks_directory, species_tree, species_tree_dir, logObject, cpus=cpus)

	# Close logging object and exit
	util.closeLoggerObject(logObject)
	sys.exit(0)

if __name__ == '__main__':
	prepTG()