#!/bin/bash
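#prerequisite 1: a tab-delimited text file with genome size estimates for insect species (named insecta_genome_sizes.txt), obtained from https://www.genomesize.com/
#prerequisite 2: a list of species names, one per line (named species_list.txt), located in the parent directory of the working directory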

#use only FCM-based genome size estimates (keeps every input line that contains the string "FCM")
grep "FCM" insecta_genome_sizes.txt > insecta_FCM_genome_sizes.txt

#annotate the genome size file with a leading column of species names (field 4 of the tab-delimited input, with spaces replaced by underscores); this column is used for the species look-ups later on
#tr is used instead of an in-place gsed edit, so this step no longer needs GNU sed to be installed under the name gsed
cut -f 4 insecta_FCM_genome_sizes.txt | tr ' ' '_' > FCM_species.txt
paste FCM_species.txt insecta_FCM_genome_sizes.txt > insecta_FCM_genome_sizes.annotated.txt

#find species with genome assembly stats that also have FCM genome size estimates; keep records separate for each taxon (genome sizes are averaged when multiple estimates are available for a species)
#make a local working copy of ../species_list.txt with spaces replaced by underscores, so the names have the same form as the species column of the annotated genome size file; the list in the parent directory is left untouched
#tr is used instead of cp followed by an in-place gsed edit, so this step no longer needs GNU sed to be installed under the name gsed
tr ' ' '_' < ../species_list.txt > species_list.txt

#start from an empty result file, so averages left behind by an interrupted earlier run are never appended to
: > genome_size_averages.FCM.txt

#for every species in species_list.txt, write one line "<species> <mean of its FCM estimates * 978>" to genome_size_averages.FCM.txt
#the estimates are taken from column 7 of the annotated genome size file; species without any matching record produce no line at all
#NOTE(review): the factor 978 presumably converts pg to Mb - confirm
#the "|| [[ -n ... ]]" part makes sure a last line without a trailing newline is processed as well
while read -r target || [[ -n "$target" ]]
	do
		#a blank line is not a species name
		[[ -n "$target" ]] || continue
		#grep -F takes the name literally (a "." in a name is not a wildcard), -w only accepts whole-word matches
		#average estimates obtained for the same species; the NR check prints nothing when there are no estimates, instead of relying on awk to abort on a division by zero
		grep -wF -e "$target" insecta_FCM_genome_sizes.annotated.txt \
			| cut -f 7 \
			| awk -v target="$target" '{ total += $1 } END { if (NR > 0) print target, total/NR*978 }' \
			>> genome_size_averages.FCM.txt
	done < species_list.txt

#add empty fields for species with assembly information but no FCM genome size estimate available
#step 1: list the species that did get an average (first space-separated field of genome_size_averages.FCM.txt)
cut -d ' ' -f 1 genome_size_averages.FCM.txt > insecta_genomes_FCM_species.txt
#step 2: remove exactly those species from species_list.txt in a single pass, so that only species without an estimate remain
#names are compared as whole strings: regex characters in a name, or a name that merely starts with another species' name followed by e.g. "-", can no longer remove the wrong species
#FILENAME == ARGV[1] (rather than NR == FNR) keeps the filter correct when the first file is empty
#species_list.txt is only replaced when awk finished successfully
awk 'FILENAME == ARGV[1] { have_estimate[$1] = 1; next } !($1 in have_estimate)' insecta_genomes_FCM_species.txt species_list.txt > species_list.temp \
	&& mv species_list.temp species_list.txt
	
#every line still present in species_list.txt gets "NA" appended as its genome size value
awk '{ printf "%s NA\n", $0 }' species_list.txt > species_missing_FCM_estimate.txt

#pool the averaged records and the NA records, then sort the lines (sort key starts at the first field)
cat genome_size_averages.FCM.txt species_missing_FCM_estimate.txt \
	| sort -k 1 \
	> all_species_FCM_data.temp

#make header line and put it on top of the sorted records
printf '%s\n' "Species FCM_genome_size" > header.txt
cat header.txt all_species_FCM_data.temp > all_species_FCM_data.txt

#remove the intermediate files created by this script; all_species_FCM_data.txt is the result that stays
rm header.txt \
	all_species_FCM_data.temp \
	genome_size_averages.FCM.txt \
	species_missing_FCM_estimate.txt \
	insecta_genomes_FCM_species.txt \
	species_list.txt \
	FCM_species.txt \
	insecta_FCM_genome_sizes.*