#!/usr/bin/env python

import sys
import argparse

parser = argparse.ArgumentParser(description='This script is for parsing NCBI\'s assembly summary file down to the provided accessions.')

required = parser.add_argument_group('required arguments')

required.add_argument("-a", "--assembly_summary", help="NCBI's assembly summary file", action="store", dest="all_assemblies", required=True)
required.add_argument("-w", "--wanted_accessions", help="Single-column file with wanted accessions", action="store", dest="wanted_accs", required=True)
parser.add_argument("-o", "--output_file", help='Wanted summary info only (default: "Wanted.tsv")', action="store", dest="output_file", default="Wanted.tsv")

if len(sys.argv)==1:
    parser.print_help(sys.stderr)
    sys.exit(0)

args = parser.parse_args()

wanted_dict = {}

with open(args.wanted_accs, "r") as wanted_accs:
    for line in wanted_accs:
        root_acc = line.strip().split(".")[0]
        wanted_dict[str(root_acc)] = line.strip()

out_file = open(args.output_file, "w")

# out_file.write("input_accession\tdownloaded_accession\tassembly_name\ttaxid\torganism_name\tinfraspecific_name\tversion_status\tassembly_level\tftp_path\n")

with open(args.all_assemblies) as assemblies:
    for line in assemblies:
        line = line.split("\t")
        if line[0].split(".")[0] in wanted_dict:
            
            dl_acc = str(line[0])
            if not dl_acc:
                dl_acc = "NA"
            
            ass_name = str(line[15])
            if not ass_name:
                ass_name = "NA"
            
            taxid = str(line[5])
            if not taxid:
                taxid = "NA"
            
            org_name = str(line[7])
            if not org_name:
                org_name = "NA"
            
            infra_name = str(line[8])
            if not infra_name:
                infra_name = "NA"
            
            version_status = str(line[10])
            if not version_status:
                version_status = "NA"
            
            ass_level = str(line[11])
            if not ass_level:
                ass_level = "NA"
            
            ftp_path = str(line[19])
            if not ftp_path:
                ftp_path = "NA"

            out_file.write(str(wanted_dict[str(line[0].split(".")[0])]) + "\t" + str(dl_acc) + "\t"  + str(ass_name) + "\t" + str(taxid) + "\t" + str(org_name) + "\t" + str(infra_name) + "\t" + str(version_status) + "\t" + str(ass_level) + "\t" + str(ftp_path) + "\n")
