This archive contains scripts and related files used in Hebert et al. 2022 (Interrogating 1,000 Insect Genomes for NUMTs: A Risk Assessment for Species Scans)

####################Bioinformatic scripts

1. get_information_on_nuclear_genomes.sh
#used to obtain information on insect nuclear genome assemblies in NCBI, to select a high-quality assembly for each species, and to record taxonomy for each species 
#the script needs as input a list of species names (i.e. the species_list.txt file provided below)
#the taxonomic information is added using the R scripts provided below ("harvest_insecta_family_IDs.R" and "harvest_insecta_genera_IDs.R")

2. harvest_insecta_family_IDs.R and harvest_insecta_genera_IDs.R
#these R scripts are required for the "get_information_on_nuclear_genomes.sh" script above, and used to add taxonomic information for species with nuclear genome assemblies

3. add_FCM_genome_size_estimates.sh
#used to add FCM genome size estimates and to obtain an average genome size when multiple estimates were available for a given species
#requires as input the "insecta_genome_sizes.txt" file, which was obtained from https://www.genomesize.com/, and is also provided below  

4. get_nuclear_assemblies.sh
#used to download nuclear genome assemblies from NCBI using the 'ncbi-genome-download' package, based on a list of target species
#requires the "nuclear_genome_stats_Dec2021.csv" file, which is provided below

5. get_mitogenomes_fasta.sh
#used to download mitogenome sequences in .fasta and .gbk format using the 'ncbi-acc-download' package
#uses the "nuclear_genome_stats_Dec2021.csv" file below to obtain a mitogenome accession ID for each species 

6. check_annotated_mitogenomes.sh
#used to quality-check NCBI mitogenomes that have an annotation available; filters based on the presence of the 13 PCGs, and the order of those genes
#requires the "annotated_mitogenomes.txt" and "mitogenome_genes.txt" files provided below  

7. rename_and_filter_bed_files.sh
#used to rename (replace accession ID with the corresponding species ID) bed files obtained from MITOS after the annotation of NCBI-derived mitogenomes that lacked an annotation
#used to filter the mitogenome annotations to keep only those with one copy of COX1, with 13 or more PCGs, with the PCGs in the expected order, and based on mitogenome length
#requires the "mitogenome_genes.txt" file provided below

8. find_mitogenomes_in_nuclear_assemblies.sh
#used to identify full-length mitogenomes present in nuclear genome assemblies
#requires the "unique_barcoded_families.txt" file provided below, which lists (on each line) families with COX1 barcode sequence publicly available
#requires the "mtCOI_Bait_Families.fas" file provided below, containing the COX1 bait sequences for each family 
#requires the "target_families_species.txt" listing family and species ID for all 1446 species to be considered for the mitogenome search

9. rename_and_filter_singleton_annotations.sh
#used to rename (replace accession ID with the corresponding species ID) bed files obtained from MITOS after the annotation of "singleton" candidate mitogenomes identified using the "find_mitogenomes_in_nuclear_assemblies.sh" script above
#used to filter the mitogenome annotations to keep only those with one copy of COX1, with 13 or more PCGs, with the PCGs in the expected order, and based on mitogenome length 
#requires the "mitogenome_genes.txt" file provided below

10. rename_and_filter_double_annotations.sh
#used to rename (replace accession ID with the corresponding species ID) bed files obtained from MITOS after the annotation of "doubles" candidate mitogenomes identified using the "find_mitogenomes_in_nuclear_assemblies.sh" script above
#used to filter the mitogenome annotations to keep only those with one copy of COX1, with 13 or more PCGs, with the PCGs in the expected order, and based on mitogenome length
#requires the "mitogenome_genes.txt" file provided below

11. COX1_harvest_from_annotated_public_mitogenomes.sh
#used to extract the full-length COX1 gene from the filtered annotated mitogenomes available on NCBI, using the BEDTools "getfasta" utility
#requires the "annotated_mitogenomes.filtered.txt" file provided below

12. COX1_harvest_from_nonannotated_public_mitogenomes.sh
#used to extract the full-length COX1 gene from the filtered nonannotated mitogenomes available on NCBI, using the BEDTools "getfasta" utility
#the same approach was used to obtain full-length COX1 sequences from the mitogenomes mined from nuclear genomes (which were also annotated using MITOS)

13. filter_nuclear_assemblies_for_mtDNA.sh
#used to remove scaffolds from nuclear genome assemblies that contain 'mitochondrion' in the header
#used to remove scaffolds identified as mitogenomes using the "find_mitogenomes_in_nuclear_assemblies.sh" script

14. full_NUMT_blast.loop.based_on_COX1.sh
#used to perform the BLASTn searches for the identification of NUMTs, based on the COX1 barcode sequence and the filtered (i.e. post mtDNA removal) nuclear genomes
#excludes BLASTn hits that are >= 99% identical to the query sequence and that cover the entire query sequence (658 bp)

15. get_coding_regions_NCBI_annotated_genomes.sh
#used to isolate the coding regions (13 PCGs and two rRNA genes) from the NCBI annotated mitogenomes that passed filtering, and that had a high-quality nuclear genome to be used in NUMT searches
#requires a list of species to be included (i.e. the "NCBI_annotated_mitogenomes_with_nuclear_genome.txt" file provided below) 

16. get_coding_regions_NCBI_non_annotated_genomes.sh
#used to isolate the coding regions (13 PCGs and two rRNA genes) based on the MITOS annotations of the NCBI (previously nonannotated) mitogenomes; only species that passed mitogenome filtering, and that had a high-quality nuclear genome were used
#requires a list of species to be included (i.e. the "NCBI_nonannotated_mitogenomes_with_nuclear_genome.txt" file provided below)
#the same approach was used to obtain the coding regions based on the MITOS annotations of mitogenomes mined from nuclear genomes (requiring the "singleton_annotated_mitogenomes_with_nuclear_genome.txt" and "doubles_annotated_mitogenomes_with_nuclear_genome.txt" files provided below)

17. NUMT_count_along_mitogenome.sh
#used to partition each mitogenome (coding regions only) in non-overlapping windows of 658bp (with the fasta_windows_v1.1.sh script)
#used to perform BLASTn searches using each 658 bp mitogenome window as the query
#used to exclude BLASTn hits that are >= 99% identical to the query sequence and that cover the entire query sequence (658 bp)
#used to calculate the mitogenome-wide NUMT average and SD

18. plot_genome_size_and_coverage.R
#used to plot Figures S1, S2, and S3, and to perform analyses described in these figures
#requires as input the file "Insecta_genomes.csv", which contains nuclear assembly statistics and is provided below

19. get_unique_genera_with_FCM_data.sh
#used to subset one species per genus, for taxa with high coverage genomes and FCM genome size data available; the output produced by this script is used in the "plot_genome_size_and_coverage.R" script above to make the inset in figure S3 

20. NUMT_count_barplot.R
#used to make Figure 1 and to perform the sign test based on average NUMT counts for the six Lepidoptera families with data in both the high coverage and low coverage assembly categories
#requires as input the "NUMTs_by_count.csv" and "Lepidoptera_mean_NUMTs_shared_families.csv" files provided below

21. plot_mitogenome-wide_NUMT_results.R
#used to make Figure 4 and to perform analyses summarized in this figure
#requires as input the files "results_mitogenome_coding_only.csv" and "results_mitogenome_coding_only.one_sp_per_genus.csv", provided below

22. comparisons_congeneric_species.R
#used to make Figure 6 and to perform analyses summarized in this figure
#requires as input the files "one_species_pair_per_genus_NUMTs.csv" and "one_species_pair_per_genus_genome_size.csv", provided below

23. comparisons_among_major_orders.R
#used to make Figure 7 and to compare NUMT counts among the major orders
#requires as input the "NUMTs_major_orders.csv" file provided below

24. blast_results_summary_and_indel_scan.R
#used to summarize BLAST results, filter hits for those considered to be reliable NUMTs, scan for frame-shift causing INDELS, and output a NUMT FASTA file for STOP CODON screening
#requires, as input, output of script #14 ("full_NUMT_blast.loop.based_on_COX1.sh") provided above

25. bivariate_plots.R
#used to generate bivariate plots of sequence length compared to %ID for Figure 2 (all 8,423 NUMTs), Figure 9 (ten individual species), the two components of Figure S5 (all individual species), and Figure S6 (genome size vs. # of C5* NUMTs)
#requires, as input, "NUMT_Reliable_Hits_Results.xlsx" provided below

26. numt_length_histo.R
#used to generate Figure 3 (NUMT length histogram)
#requires, as input, output of script #24 (blast_results_summary_and_indel_scan.R): file "NUMT_Reliable_Hits_Results_1500bp.csv" provided below

27. numt_count_vs_assembly_length.R
#used to make Figure 5 (NUMT count vs. assembly length)
#requires, as input, "Table S2.xlsx" and "results_mitogenome_coding_only.csv" provided below

28. circular_cladograms_numtcount_gensize.R
#used to make components of Figure 8 (circular cladograms with NUMT count and genome size histogram overlays) as well as Figure S4
#requires, as input, "COITree_5major.tre", "COITree_Others.tre", and "TreeData.txt" provided below

29. C5-star_plot.R
#used to make Figure 10, NUMT count compared to %ID for all C5* NUMTs
#requires, as input, "NUMT_Reliable_Hits_Results.xlsx" provided below

30. C5-star_MLtrees.R
#used to make the three components of Figure 11, linear maximum-likelihood trees of all C5* NUMTs with mtCOI included
#requires, as input, "NUMT_w_COI_Tree1.tre", "NUMT_w_COI_Tree2.tre", "NUMT_w_COI_Tree3.tre", and "NUMT_w_COI_TreeData.txt" provided below


####################other analysis-related files
1. species_list.txt
#these are entries from the "Organism Name" column from the NCBI insecta genome assemblies spreadsheet

2. insecta_orders.txt and insecta_families.txt
#text files listing insect orders and families (one per line), used by the "get_information_on_nuclear_genomes.sh" script above (at the step that adds taxonomy information)

3. insecta_genome_sizes.txt
#genome size estimates for insect species, obtained from https://www.genomesize.com/, and required for the "add_FCM_genome_size_estimates.sh" script above

4. nuclear_genome_stats_Dec2021.csv
#file that compiles information on nuclear genome assemblies including accession, coverage, contig N50, or assembly length
#these data were obtained using the "get_information_on_nuclear_genomes.sh" script above
#this file is required for the "get_nuclear_assemblies.sh" and the "get_mitogenomes_fasta.sh" scripts above

5. annotated_mitogenomes.txt
#this file lists the species with annotated mitogenomes that will be used for the quality check; it is required for the "check_annotated_mitogenomes.sh" script above

6. mitogenome_genes.txt
#this file lists the 13 protein-coding genes of the mitogenome
#it is required for the "check_annotated_mitogenomes.sh" and "rename_and_filter_bed_files.sh" scripts included above

7. unique_barcoded_families.txt
#this file lists families with a COX1 barcode sequence publicly available
#it is required for the "find_mitogenomes_in_nuclear_assemblies.sh" script above

8. mtCOI_Bait_Families.fas
#this file contains the COX1 bait sequences for each family, used in the "find_mitogenomes_in_nuclear_assemblies.sh" script above

9. annotated_mitogenomes.filtered.txt
#this file contains the list of 215 species with annotated mitogenomes available on NCBI, that passed mitogenome filtering (done using the "check_annotated_mitogenomes.sh" script above)
#it is required for the "COX1_harvest_from_annotated_public_mitogenomes.sh" script above

10. NCBI_annotated_mitogenomes_with_nuclear_genome.txt
#this file lists species with quality-filtered mitogenomes (from the NCBI annotated set) that also have a high-quality nuclear genome, and thus could be used for the mitogenome-wide NUMT screens

11. NCBI_nonannotated_mitogenomes_with_nuclear_genome.txt
#this file lists species with quality-filtered mitogenomes (from the NCBI nonannotated set; annotations were performed in MITOS) that also have a high-quality nuclear genome, and thus could be used for the mitogenome-wide NUMT screens

12. singleton_annotated_mitogenomes_with_nuclear_genome.txt and doubles_annotated_mitogenomes_with_nuclear_genome.txt
#these files list species with quality-filtered mitogenomes mined from nuclear genomes (annotations were performed in MITOS) that also have a high-quality nuclear genome, and thus could be used for the mitogenome-wide NUMT screens

13. Insecta_genomes.csv
#this file provides information on nuclear genome assemblies summarized in figures S1, S2, and S3
#it is required for the "plot_genome_size_and_coverage.R" script provided above

14. NUMTs_by_count.csv
#this file summarizes the number of species with NUMT counts in a particular category, estimated separately for the low coverage and high coverage assemblies
#it is used to plot Figure 1, using the "NUMT_count_barplot.R" script provided above

15. Lepidoptera_mean_NUMTs_shared_families.csv
#this file lists average NUMT counts for the six Lep families with data in both the low coverage and high coverage assembly categories
#it is used in the "NUMT_count_barplot.R" to perform a sign test

16. results_mitogenome_coding_only.csv and results_mitogenome_coding_only.one_sp_per_genus.csv
#these files contain average mitogenome-wide NUMT counts and COI barcode NUMT counts used for mitogenome-wide analyses (summarized in Figure 4)

17. one_species_pair_per_genus_NUMTs.csv and one_species_pair_per_genus_genome_size.csv
#these files contain NUMT counts and genome size estimates for congeneric pairs of species, used in Figure 6

18. NUMTs_major_orders.csv
#this file summarizes NUMT counts and metamorphosis type for the major orders represented in our dataset
#it is required for the "comparisons_among_major_orders.R" script provided above

19. NUMT_Reliable_Hits_Results.xlsx
#this file summarizes all putative NUMT hits for 668 species with high coverage genomes
#it is used for the scripts "bivariate_plots.R" and "C5-star_plot.R" provided above

20. NUMT_Reliable_Hits_Results_1500bp.csv
#this file summarizes all putative NUMT hits for 283 species with high coverage genomes and available full-length (~1500 bp) COI sequences
#it is used for the script "numt_length_histo.R" provided above

21. Table S2.xlsx
#this file summarizes the NUMT composition of all 668 species with high coverage genomes
#it is used for the script "numt_count_vs_assembly_length.R" provided above

22. results_mitogenome_coding_only.csv
#this file summarizes NUMT counts based on a sliding window approach of the entire mitogenome
#it is used for the script "numt_count_vs_assembly_length.R" provided above

23. COITree_5major.tre
#this is a Newick tree file based on the 658 bp COI barcode sequences from the five main insect orders
#it is used for the script "circular_cladograms_numtcount_gensize.R" provided above

24. COITree_Others.tre
#this is a Newick tree file based on the 658 bp COI barcode sequences from all other insect orders
#it is used for the script "circular_cladograms_numtcount_gensize.R" provided above

25. TreeData.txt
#this is a metadata file (for all 668 insect species with high coverage genomes) used to overlay data plots on trees
#it is used for the script "circular_cladograms_numtcount_gensize.R" provided above

26. NUMT_w_COI_Tree1.tre
#this is a Newick tree file composed of all C5* NUMTs and their mtCOI counterparts (part 1 of 3)
#it is used for the script "C5-star_MLtrees.R" provided above

27. NUMT_w_COI_Tree2.tre
#this is a Newick tree file composed of all C5* NUMTs and their mtCOI counterparts (part 2 of 3)
#it is used for the script "C5-star_MLtrees.R" provided above

28. NUMT_w_COI_Tree3.tre
#this is a Newick tree file composed of all C5* NUMTs and their mtCOI counterparts (part 3 of 3)
#it is used for the script "C5-star_MLtrees.R" provided above

29. NUMT_w_COI_TreeData.txt
#this is a metadata file used to overlay data plots on trees
#it is used for the script "C5-star_MLtrees.R" provided above

30. BLAST_Data.zip
#this folder contains BLAST results for all 1002 species that returned any hits
#it is used for the script "blast_results_summary_and_indel_scan.R" provided above








