#!/bin/bash

######################################################### summarize assembly statistics #########################################################
#prerequisite: a text file named species_list.txt listing target species (one per line); these are the "Organism Name" column from the NCBI insecta genome assemblies spreadsheet (1480 entries)

#first exclude entries that do not consist of exactly two fields (genus and species), and exclude entries listed as "unclassified"
#keep a record of the entries that have been excluded (these can be processed manually)

#######################################
# Split a species list into usable entries and entries set aside for manual processing.
# Arguments:
#   $1 - input list, one organism name per line
#   $2 - output file for accepted entries, written as "Genus species" (single space)
#   $3 - output file for rejected entries, written unchanged and in input order
# Returns: 0 on success, 1 if the input cannot be read or an output cannot be created
#######################################
split_species_list() {
	local input=$1 clean=$2 removed=$3

	if [[ ! -r "$input" ]]; then
		printf 'split_species_list: cannot read %s\n' "$input" >&2
		return 1
	fi

	#create (or empty) both outputs first so that they exist even when one category has no entries
	: > "$clean" || return 1
	: > "$removed" || return 1

	#a single pass sends every non-blank line to exactly one output file, so an entry that is
	#both "unclassified" and not two fields long is recorded only once in the removed list
	awk -v clean="$clean" -v removed="$removed" '
		NF == 0 { next }
		NF == 2 && $0 !~ /unclassified/ { print $1, $2 >> clean; next }
		{ print $0 >> removed }
	' "$input"
}

split_species_list species_list.txt species_list.clean.txt species_list.removed.txt

#make a new folder where the genome assembly stats will be summarized (-p: not an error if it already exists)
mkdir -p genome_assembly_stats

#download the genome assembly stats files (to be searched for additional data) for every entry of the clean species list
#this uses the "ncbi-genome-download" utility (can be used to download the full genomes as well)
#the utility is called directly for each entry instead of writing the commands to a temporary shell script and running it;
#each name is passed as one quoted argument, so it is never re-parsed by a shell
while IFS= read -r species || [[ -n "$species" ]]
	do
		#skip blank lines
		[[ -n "$species" ]] || continue
		#stdin is taken from /dev/null so the download command cannot consume the remaining lines of the species list
		ncbi-genome-download -s genbank --genera "$species" invertebrate -o genome_assembly_stats --flat-output -F assembly-stats < /dev/null
	done < species_list.clean.txt

#subset the information needed from assembly stats files, and compile those data in one output file
#stop if the folder cannot be entered; otherwise every later step (including the 'rm' commands) would run in the wrong directory
cd genome_assembly_stats || exit 1

#put each assembly stats file in exactly one of three lists:
#  missing_coverage.txt                - no line containing "coverage" (excluded from all further steps)
#  missing_expected_final_version.txt  - has coverage, but no line containing "Expected final version" (processed separately)
#  genome_stats.txt                    - has both
#the lists are emptied first so that all three always exist and a re-run does not append to old results
: > genome_stats.txt
: > missing_coverage.txt
: > missing_expected_final_version.txt

for genome in *assembly_stats.txt
	do
		#when no file matches, the pattern itself is returned; skip it
		[[ -e "$genome" ]] || continue
		#grep -q only tests for a match; the matching lines are not printed
		if ! grep -q coverage "$genome"; then
			printf '%s\n' "$genome" >> missing_coverage.txt
		elif ! grep -q "Expected final version" "$genome"; then
			printf '%s\n' "$genome" >> missing_expected_final_version.txt
		else
			printf '%s\n' "$genome" >> genome_stats.txt
		fi
	done

#get results for assemblies with coverage and "Expected final version" information (listed in the updated "genome_stats.txt" file), and for those without "Expected final version" information (listed in the "missing_expected_final_version.txt" file)

# Copy the first line of stdin (carriage returns removed) to stdout; print an empty line when stdin is empty.
# Every field therefore contributes exactly one line per assembly, which keeps the column files aligned for 'paste'.
_first_line_or_blank() {
	awk 'NR == 1 { gsub(/\r/, ""); print; found = 1; exit } END { if (!found) print "" }'
}

#######################################
# Extract one value per assembly and per field into column files.
# Arguments:
#   $1 - prefix; "$1.txt" lists the assembly stats files to read (one per line)
# Outputs (one line per listed assembly, in list order):
#   $1.species             organism text after the first ':' (raw, intermediate)
#   $1.species.txt         organism name without the "(...)" part, spaces replaced by "_"
#   $1.coverage.txt        coverage with every "x"/"X" removed (needs to be numeric for plotting)
#   $1.level.txt, $1.expected_final_version.txt, $1.accession.txt
#                          text after the first ':' of the first matching line
#   $1.total_length.bp.txt, $1.contig_N50.txt
#                          6th tab-separated field of the first matching statistics line
#   $1.total_length.Mb.txt total length divided by 1000000, or "NA" when no length was found
# Values taken from header lines keep the space that follows the ':'; later steps
# split lines on that space, so it must not be trimmed here.
# Returns: 1 if "$1.txt" cannot be read
#######################################
extract_assembly_fields() {
	local prefix=$1
	local list="$prefix.txt"
	local genome

	if [[ ! -r "$list" ]]; then
		printf 'extract_assembly_fields: cannot read %s\n' "$list" >&2
		return 1
	fi

	#start from empty column files so that a re-run cannot append to old data
	: > "$prefix.species"
	: > "$prefix.coverage.txt"
	: > "$prefix.level.txt"
	: > "$prefix.total_length.bp.txt"
	: > "$prefix.expected_final_version.txt"
	: > "$prefix.contig_N50.txt"
	: > "$prefix.accession.txt"

	while IFS= read -r genome || [[ -n "$genome" ]]
		do
			[[ -n "$genome" ]] || continue
			grep Organism "$genome" | cut -d ':' -f 2 | _first_line_or_blank >> "$prefix.species"
			grep coverage "$genome" | cut -d ':' -f 2 | _first_line_or_blank | tr -d 'xX' >> "$prefix.coverage.txt"
			grep "Assembly level" "$genome" | cut -d ':' -f 2 | _first_line_or_blank >> "$prefix.level.txt"
			grep "total-length" "$genome" | grep -v "#" | grep "Primary" | head -n 1 | cut -f 6 | _first_line_or_blank >> "$prefix.total_length.bp.txt"
			grep "Expected final version" "$genome" | cut -d ':' -f 2 | _first_line_or_blank >> "$prefix.expected_final_version.txt"
			grep "contig-N50" "$genome" | grep -v "#" | head -n 1 | cut -f 6 | _first_line_or_blank >> "$prefix.contig_N50.txt"
			grep "assembly accession" "$genome" | head -n 1 | cut -d ':' -f 2 | _first_line_or_blank >> "$prefix.accession.txt"
		done < "$list"

	#remove leading space characters in the species column, keep the text before "(", remove trailing spaces, and change the space character to underscore (used for grep steps)
	sed -e 's/^ *//' -e 's/(.*$//' -e 's/ *$//' -e 's/ /_/g' "$prefix.species" > "$prefix.species.txt"

	#convert assembly length to Mb; an assembly without a length gets "NA" instead of shifting the column
	awk '{ if ($1 == "") print "NA"; else print $1 / 1000000 }' "$prefix.total_length.bp.txt" > "$prefix.total_length.Mb.txt"
}

for file in genome_stats missing_expected_final_version
	do
		extract_assembly_fields "$file"
	done
	
#merge columns for the results files, remove carriage return symbol, and delete intermediate files
#column order: species, accession, coverage, contig N50, assembly level, total length (Mb), expected final version
columns=(species accession coverage contig_N50 level total_length.Mb)
with_version=()
without_version=()
for column in "${columns[@]}"
	do
		with_version+=("genome_stats.$column.txt")
		without_version+=("missing_expected_final_version.$column.txt")
	done

#for assemblies lacking "Expected final version" information no such column is read; "NA" is appended as that data field instead
#awk writes the tab separator itself, so no other text in the line is modified
#the two sets of results are merged and sorted alphabetically based on species ID (column #1)
{
	paste "${with_version[@]}" genome_stats.expected_final_version.txt
	paste "${without_version[@]}" | awk 'BEGIN { OFS = "\t" } { print $0, "NA" }'
} | tr -d '\r' | sort -k 1 > results.txt

#remove intermediate files, including the accession columns (-f: a file that was never created is not an error)
for prefix in genome_stats missing_expected_final_version
	do
		rm -f "$prefix.txt" "$prefix.total_length.bp.txt" "$prefix.expected_final_version.txt"
		for column in "${columns[@]}"
			do
				rm -f "$prefix.$column.txt"
			done
	done
rm -f missing_coverage.txt *.species

#this section of the script picks, for species with multiple assemblies available, the assembly at the highest level (e.g. chromosome-level) that has the highest coverage reported
#overview: three passes over a working copy of the species list (Chromosome, then Scaffold, then Contig)
#after the first two passes the species IDs of the selected assemblies are removed from the working list, so a lower level is only tried for entries still on the list
#each pass appends at most one line per list entry; identical lines are removed when the three outputs are merged at the end
#NOTE(review): all output files in this section are appended to (>>), so leftovers from an earlier run would be carried over

#working copy of the clean species list with spaces replaced by underscores (the form used for species IDs in results.txt)
gsed 's/ /_/g' ../species_list.clean.txt > ../species_list.clean
cat ../species_list.clean | while read line
	do
		target=$line
#collect all results lines that match the species ID; the ID is used as an unanchored grep pattern, so any line containing it is collected
		grep $target results.txt > $target.results.txt
#first pick the assembly at the chromosome level that has the highest coverage
#(sort -n -r -k 3: numeric, descending, on the key that starts at the third blank-separated field; head keeps the first line)
		grep Chromosome $target.results.txt | sort -n -r -k 3 | head -n 1 >> unique_assemblies.chromosome-level.txt
		rm $target.results.txt
	done

#exclude species for which an assembly has already been selected, and repeat the selection step
#this is done for assemblies available only at the contig or scaffold level (no chromosome-level assemblies)
#column 1 of each selected line is used as a grep pattern; every working-list entry that contains it is dropped
#NOTE(review): if column 1 is longer than the list entry (presumably e.g. a subspecies name - confirm), the entry stays on the list and is processed again in the next pass
cut -f 1 unique_assemblies.chromosome-level.txt > species_to_exclude.txt
cat species_to_exclude.txt | while read line
	do
		exclude=$line
		grep -v $exclude ../species_list.clean > ../species_list.temp
		mv ../species_list.temp ../species_list.clean
	done
rm species_to_exclude.txt

cat ../species_list.clean | while read line
	do
		target=$line
		grep $target results.txt > $target.results.txt
#pick the assembly at the scaffold level that has the highest coverage
		grep Scaffold $target.results.txt | sort -n -r -k 3 | head -n 1 >> unique_assemblies.scaffold-level.txt
		rm $target.results.txt
	done

#exclude species for which an assembly has already been selected, and repeat the selection step
#this last step is to keep the highest-coverage assembly that is only available at the contig level

cut -f 1 unique_assemblies.scaffold-level.txt > species_to_exclude.txt

cat species_to_exclude.txt | while read line
	do
		exclude=$line
		grep -v $exclude ../species_list.clean > ../species_list.temp
		mv ../species_list.temp ../species_list.clean
	done

cat ../species_list.clean | while read line
	do
		target=$line
		grep $target results.txt > $target.results.txt
#pick the assembly at the contig level that has the highest coverage
		grep Contig $target.results.txt | sort -n -r -k 3 | head -n 1 >> unique_assemblies.contig-level.txt
		rm $target.results.txt
	done

#merge results files (chromosome-, scaffold-, and contig-level), sort records alphabetically by species, and drop identical lines
cat unique_assemblies.chromosome-level.txt unique_assemblies.scaffold-level.txt unique_assemblies.contig-level.txt | sort -k 1 | uniq > unique_assemblies.txt
#remove the per-level files, the merged results table, and the working copy of the species list
rm unique_assemblies.chromosome-level.txt unique_assemblies.scaffold-level.txt unique_assemblies.contig-level.txt results.txt species_to_exclude.txt ../species_list.clean

#identify and exclude repeated assemblies for the same species (e.g. because separate assemblies are available for multiple subspecies); these will be processed manually

#######################################
# List the species that occur on exactly one line of an assemblies table.
# The species key is the first two "_"-separated parts of column 1 ("Genus_species"),
# so "Genus_species_subspecies" counts towards "Genus_species".
# Counting is done in awk, so the result does not depend on the column layout of
# 'uniq -c' output or on repeated species being on adjacent lines.
# Arguments:
#   $1 - tab-separated assemblies table with the species ID in column 1
# Outputs: one species key per line on stdout, in order of first appearance
#######################################
list_single_assembly_species() {
	awk -F '\t' '
		$1 == "" { next }
		{
			n = split($1, part, "_")
			key = (n >= 2) ? part[1] "_" part[2] : $1
			if (!(key in count)) order[++total] = key
			count[key]++
		}
		END {
			for (i = 1; i <= total; i++)
				if (count[order[i]] == 1) print order[i]
		}
	' "$1"
}

list_single_assembly_species unique_assemblies.txt > species_to_keep.txt

#######################################
# Keep the assemblies of the species listed in a keep-list and format the species column.
# A line is kept when the first two "_"-separated parts of its column 1 ("Genus_species")
# equal an entry of the keep-list (exact comparison, no pattern matching).
# Lines containing "Heliconius_erato" are dropped: that entry is listed as a hybrid
# (Heliconius erato x Heliconius himera).
# Underscores are changed to spaces in column 1 only, so assembly accessions keep their "_".
# Arguments:
#   $1 - keep-list, one "Genus_species" key per line
#   $2 - tab-separated assemblies table with the species ID in column 1
# Outputs: the selected lines on stdout, in the order of the assemblies table
#######################################
filter_unique_assemblies() {
	awk -F '\t' -v OFS='\t' '
		FILENAME == ARGV[1] { if ($0 != "") keep[$0] = 1; next }
		{
			n = split($1, part, "_")
			key = (n >= 2) ? part[1] "_" part[2] : $1
			if (!(key in keep)) next
			if ($0 ~ /Heliconius_erato/) next
			gsub(/_/, " ", $1)
			print
		}
	' "$1" "$2"
}

#keep the assemblies of species with a single assembly, remove the entry for Heliconius erato, and write species names with spaces
filter_unique_assemblies species_to_keep.txt unique_assemblies.txt > unique_assemblies.filtered.txt

#find insect species that have genome assemblies but no data in the results file produced by the script so far
#the clean species list is moved into this folder and written with underscores, the form used in species_to_keep.txt
mv ../species_list.clean.txt ./species_list.txt
gsed -i 's/ /_/g' species_list.txt

#drop every species that has a selected assembly
#names are compared as whole, literal lines (-x -F), so a kept species cannot remove another species whose name merely contains it
#the step is skipped when nothing was kept, because an empty pattern file is not handled the same way by every grep
if [[ -s species_to_keep.txt ]]; then
	grep -v -x -F -f species_to_keep.txt species_list.txt > species_list.temp
	mv species_list.temp species_list.txt
fi

#combine records of species with missing data (to be searched for data manually), written with spaces instead of underscores
cat ../species_list.removed.txt species_list.txt | tr '_' ' ' > ../species_for_manual_processing.txt
mv unique_assemblies.filtered.txt ../results_all_species_no_taxonomy.txt
rm unique_assemblies.txt ../species_list.removed.txt species_to_keep.txt species_list.txt
cd ../

######################################################### add taxonomical information (order and family) to species with assembly stats #########################################################
#prerequisite 1: a text file listing insect orders (one per line), named "insecta_orders.txt"
#prerequisite 2: the "harvest_insecta_family_IDs.R" script, which uses the 'taxize' package to get insect families from each order
#prerequisite 3: a text file listing insect families (one per line), named "insecta_families.txt"
#prerequisite 4: the "harvest_insecta_genera_IDs.R" script, which uses the 'taxize' package to get insect genera from each family

#assign family to each insect order with data in NCBI
#stop if the folder cannot be entered, so that the files below are never written to (or removed from) the wrong directory
mkdir -p taxonomy
cd taxonomy/ || exit 1
while IFS= read -r order || [[ -n "$order" ]]
	do
		#skip blank lines
		[[ -n "$order" ]] || continue
		#fill in the R script template: every "input" becomes the quoted order name, then every "output" becomes the order name
		gsed -e "s/input/\"$order\"/g" -e "s/output/$order/g" ../harvest_insecta_family_IDs.R > harvest_insecta_family_IDs.final.R
		Rscript harvest_insecta_family_IDs.final.R
		#remove column header, and print order name next to the corresponding family name (i.e. each line contains "family", "order")
		tail -n +2 "${order}_families.txt" | awk -v order="$order" '{print $0, order}' >> insecta_families_with_order.txt
		rm "${order}_families.txt"
	done < ../insecta_orders.txt

#repeat the step above but only for the 'Plecoptera' order; this order was processed separately using the taxon unique identifier (UID; 50622 for Plecoptera) since this also occurs as a genus of moths
#step 1: replace every "input" in the template with the quoted UID
gsed "s/input/\"50622\"/g" ../harvest_insecta_family_IDs.R > harvest_insecta_family_IDs.final.R
#step 2: rewrite the text taxa$output$childtaxa_name so that the UID (in single quotes) takes the place of "output"
#this must run before step 3, which changes the remaining "output" text
#NOTE(review): presumably the R result is indexed by the UID instead of a name here - verify against the R template
gsed -i "s/taxa\$output\$childtaxa_name/taxa\$\'50622\'\$childtaxa_name/g" harvest_insecta_family_IDs.final.R
#step 3: replace "output_families" with "Plecoptera_families" (the file read below is Plecoptera_families.txt)
gsed -i "s/output_families/Plecoptera_families/g" harvest_insecta_family_IDs.final.R
Rscript harvest_insecta_family_IDs.final.R
#remove column header, and print order name next to the corresponding family name (i.e. each line contains "family", "order")
tail -n +2 Plecoptera_families.txt > Plecoptera_families
rm Plecoptera_families.txt
awk '{print $0, "Plecoptera"}' Plecoptera_families >> insecta_families_with_order.txt
#remove the header-less copy and the filled-in R script
rm *_families harvest_insecta_family_IDs.final.R

#assign family to each insect genus with data in NCBI
while IFS= read -r family || [[ -n "$family" ]]
	do
		#skip blank lines
		[[ -n "$family" ]] || continue
		#fill in the R script template: every "input" becomes the quoted family name, then every "output" becomes the family name
		gsed -e "s/input/\"$family\"/g" -e "s/output/$family/g" ../harvest_insecta_genera_IDs.R > harvest_insecta_genera_IDs.final.R
		Rscript harvest_insecta_genera_IDs.final.R
		#remove column header, print family name next to genus name
		tail -n +2 "${family}_genera.txt" | awk -v family="$family" '{print $0, family}' >> insecta_genera_with_family.txt
		rm "${family}_genera.txt"
	done < ../insecta_families.txt

#Repeat the step above for the Xylophagaidae and Heterocheilidae families using the taxon unique identifiers for these families
#NOTE: the 'Xylophagaidae' family also occurs as a family of bivalves
#NOTE: the 'Heterocheilidae' family also occurs as a family of nematodes
#for each family the template is filled in with three substitutions, in this order:
#  1. every "input" becomes the quoted UID
#  2. the text taxa$output$childtaxa_name gets the UID (in single quotes) in place of "output"
#  3. "output_genera" becomes "<family>_genera" (the files read below are <family>_genera.txt)
#NOTE(review): presumably step 2 makes the R code index its result by UID - verify against the R template
#Xylophagaidae (UID 92613)
gsed "s/input/\"92613\"/g" ../harvest_insecta_genera_IDs.R > harvest_insecta_genera_IDs.final.R
gsed -i "s/taxa\$output\$childtaxa_name/taxa\$\'92613\'\$childtaxa_name/g" harvest_insecta_genera_IDs.final.R
gsed -i "s/output_genera/Xylophagaidae_genera/g" harvest_insecta_genera_IDs.final.R
Rscript harvest_insecta_genera_IDs.final.R
#Heterocheilidae (UID 169478); the filled-in script from the previous family is overwritten
gsed "s/input/\"169478\"/g" ../harvest_insecta_genera_IDs.R > harvest_insecta_genera_IDs.final.R
gsed -i "s/taxa\$output\$childtaxa_name/taxa\$\'169478\'\$childtaxa_name/g" harvest_insecta_genera_IDs.final.R
gsed -i "s/output_genera/Heterocheilidae_genera/g" harvest_insecta_genera_IDs.final.R
Rscript harvest_insecta_genera_IDs.final.R
#remove column header, print family name next to genus name
for family in {"Heterocheilidae","Xylophagaidae"}
	do
		tail -n +2 ${family}_genera.txt > ${family}_genera
		rm ${family}_genera.txt
		awk -v family="$family" '{print $0, family}' ${family}_genera >> insecta_genera_with_family.txt
	done
#remove the header-less copies
rm *_genera

#drop two entries from the genus/family table: one has an uncertain taxonomic ID (Trigonalinae gen. trigJanzen01 Trigonalidae) and the other is an error (Bacteria Latreille et al. 1825 Diapheromeridae)
#a line is kept only if it contains neither "Latreille" nor "trigJanzen"
grep -v -e "Latreille" -e "trigJanzen" insecta_genera_with_family.txt > insecta_genera_with_family.clean.txt
mv insecta_genera_with_family.clean.txt insecta_genera_with_family.txt

#add order information for the families that have genera assigned

#######################################
# Look up the order of each family in a list.
# Arguments:
#   $1 - family names, one per line
#   $2 - family/order table, lines of the form "family order" (single space);
#        when a family is listed more than once, the first line is used
# Outputs: exactly one line per line of $1 on stdout: the order, or an empty line
#          when the family is not in the table. Printing the empty line keeps the
#          result aligned with the genus/family table when the two are pasted.
# Note: the family must equal the first field of a table line (exact comparison).
#######################################
lookup_orders() {
	awk -F '[ ]' '
		FILENAME == ARGV[1] { if (!($1 in order_of)) order_of[$1] = $2; next }
		{ print (($0 in order_of) ? order_of[$0] : "") }
	' "$2" "$1"
}

#the family is the second space-separated field of each genus/family line
cut -d ' ' -f 2 insecta_genera_with_family.txt > target_families.txt
lookup_orders target_families.txt insecta_families_with_order.txt > orders.txt

#each line is now "genus family<TAB>order"
paste insecta_genera_with_family.txt orders.txt > insecta_genera_with_family_and_order.txt

#list of genera that have taxonomic information (first space-separated field)
cut -d ' ' -f 1 insecta_genera_with_family_and_order.txt > insecta_genera_with_taxonomy.txt

#make a list of genera with genome stats that also have taxonomic information
#the genus is taken as the text before the first space of each results line; the list is sorted and duplicates are removed
cut -d ' ' -f 1 ../results_all_species_no_taxonomy.txt | sort | uniq > genera_in_spreadsheet.txt
#for every genus in the results, collect the taxonomy entries that contain it as a whole word (grep -w)
cat genera_in_spreadsheet.txt | while read line
	do
		target=$line
		grep -w $target insecta_genera_with_taxonomy.txt >> insecta_genera_with_taxonomy_and_assembly_information.txt
	done
	
#set aside genome assemblies for manual taxonomy entry (for genera with no taxonomic information)
#every genus found above is removed from genera_in_spreadsheet.txt, which is rewritten once per entry; what remains are the genera without taxonomy
#NOTE(review): this removal uses grep -v without -w, so a genus whose name merely contains a matched name is removed as well
cat insecta_genera_with_taxonomy_and_assembly_information.txt | while read line
	do
		target=$line
		grep -v $target genera_in_spreadsheet.txt > genera_in_spreadsheet.temp
		mv genera_in_spreadsheet.temp genera_in_spreadsheet.txt
	done
	
#copy the results lines of the remaining genera (whole-word match anywhere in the line) to the file for manual taxonomy entry
cat genera_in_spreadsheet.txt | while read line
	do
		target=$line
		grep -w $target ../results_all_species_no_taxonomy.txt >> ../results_species_for_manual_taxonomy_entry.txt
	done

#add taxonomy information for species for which it is available
#collect the results lines that contain a genus with taxonomy as a whole word (grep -w, anywhere in the line)
cat insecta_genera_with_taxonomy_and_assembly_information.txt | while read line
	do
		target=$line
		grep -w $target ../results_all_species_no_taxonomy.txt >> species_with_all_info
	done

#sort the collected lines and remove lines that were collected more than once
sort -k 1 species_with_all_info | uniq > species_with_all_info.txt

#genus of each remaining results line (text before the first space), in the same order as the lines
cut -d ' ' -f 1 species_with_all_info.txt > genera_for_final_spreadsheet.txt

#the order was joined to "genus family" with a tab by paste; change it to a space so that genus, family, and order are space-separated fields 1-3
gsed -i 's/\t/ /g' insecta_genera_with_family_and_order.txt
#for each results line, take family (field 2) and order (field 3) from the first taxonomy line that contains the genus as a whole word
#NOTE(review): a genus with no matching line adds nothing to either file, which would shift the columns pasted below
cat genera_for_final_spreadsheet.txt | while read line
	do
		target=$line
		grep -w -m 1 $target insecta_genera_with_family_and_order.txt | cut -d ' ' -f 2 >> family_for_assemblies.txt
		grep -w -m 1 $target insecta_genera_with_family_and_order.txt | cut -d ' ' -f 3 >> order_for_assemblies.txt
	done
#put order and family in front of each results line (tab-separated)
paste order_for_assemblies.txt family_for_assemblies.txt species_with_all_info.txt > results_all_species_with_taxonomy

#add the column header line and write the final table one folder up
echo "Order	Family	Species	Assembly_accession	Coverage	ContigN50	Assembly_level	Assembly_length_Mb	Expected_final_version" > header.txt
cat header.txt results_all_species_with_taxonomy > ../results_all_species_with_taxonomy.txt
#remove intermediate files
rm header.txt results_all_species_with_taxonomy order_for_assemblies.txt family_for_assemblies.txt species_with_all_info.txt *.final.R species_with_all_info genera_in_spreadsheet.txt orders.txt target_families.txt insecta_genera_with_taxonomy.txt insecta_genera_with_taxonomy_and_assembly_information.txt genera_for_final_spreadsheet.txt