#load the required library 
library(ggplot2)

#read in data file (this summarizes number of species with NUMT counts split by category, as in Figure 1)
#the comma delimiter is given explicitly instead of being left to read.csv's default
NUMT_species_counts <- read.csv(
  file = "NUMTs_by_count.csv",
  header = TRUE,
  sep = ","
)

#reorder NUMT bins: levels are listed from the smallest bin to the largest so the
#bins keep this order (not alphabetical order) wherever the factor is used
#note: any NUMT_bin value that is not listed below is turned into NA by factor()
NUMT_species_counts$NUMT_bin <- factor(
  NUMT_species_counts$NUMT_bin,
  levels = c(
    "0", "1_2", "3_4", "5_8", "9_16", "17_32", "33_64",
    "65_128", "129_256", "257_512", "513_1024"
  )
)

#separate all counts and Lep-only counts
#%in% is used instead of == so that a missing Group/Coverage value drops the row
#rather than producing an all-NA row in the subset
all_assemblies <- NUMT_species_counts[NUMT_species_counts$Group %in% "all", ]
Lep_assemblies <- NUMT_species_counts[NUMT_species_counts$Group %in% "Leps", ]

#separate the assemblies based on coverage category (low & high)
low_coverage_all_assemblies <- all_assemblies[all_assemblies$Coverage %in% "Low", ]
high_coverage_all_assemblies <- all_assemblies[all_assemblies$Coverage %in% "High", ]

low_coverage_Lep_assemblies <- Lep_assemblies[Lep_assemblies$Coverage %in% "Low", ]
high_coverage_Lep_assemblies <- Lep_assemblies[Lep_assemblies$Coverage %in% "High", ]

#plot low coverage assemblies (left panel in Figure 1)
#bars for all assemblies are drawn first, then the Lep-only counts are drawn on top
#as a second bar layer with the same outline and fill; y axis is fixed to 0-200
ggplot(low_coverage_all_assemblies, aes(x = NUMT_bin, y = Species)) +
  geom_col(colour = "black", fill = "lightgrey") +
  #the overlay layer inherits the x/y mapping declared in ggplot() above
  geom_col(
    data = low_coverage_Lep_assemblies,
    colour = "black",
    fill = "lightgrey"
  ) +
  ylim(0, 200) +
  theme_bw() +
  theme(
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 12),
    #remove both sets of grid lines
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#plot high coverage assemblies (right panel in Figure 1)
#bars for all assemblies are drawn first, then the Lep-only counts are drawn on top
#as a second bar layer with the same outline and fill; y axis is fixed to 0-200
ggplot(high_coverage_all_assemblies, aes(x = NUMT_bin, y = Species)) +
  geom_col(colour = "black", fill = "#28005fff") +
  #the overlay layer inherits the x/y mapping declared in ggplot() above
  geom_col(
    data = high_coverage_Lep_assemblies,
    colour = "black",
    fill = "#28005fff"
  ) +
  ylim(0, 200) +
  theme_bw() +
  theme(
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 12),
    #remove both sets of grid lines
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )


#compare average NUMT count among six families of Lepidoptera with data in both the low coverage and high coverage categories
#load the required libraries
library(rstatix)
library(ggpubr)
library(tidyverse)

#read in data file of average NUMT counts for each of the six Lep families represented in the low as well as high coverage categories
#the comma delimiter is given explicitly instead of being left to read.csv's default
average_NUMTs_six_Lep_families <- read.csv(
  file = "Lepidoptera_mean_NUMTs_shared_families.csv",
  header = TRUE,
  sep = ","
)

#sign test of mean_NUMTs between the Coverage categories
#NOTE(review): presumably rows are paired by their order within each Coverage
#group - confirm the families are listed in the same order in both categories
sign_test(average_NUMTs_six_Lep_families, mean_NUMTs ~ Coverage)

#summary stats (median and IQR of mean_NUMTs) by coverage category
average_NUMTs_six_Lep_families %>%
  group_by(Coverage) %>%
  get_summary_stats(mean_NUMTs, type = "median_iqr")

#paired plot of mean NUMT counts, one x position per coverage category
#NOTE(review): the labels passed to `order` are lower-case, whereas the species
#count table in this script is filtered on 'Low'/'High' - confirm that
#"low"/"high" match the Coverage values stored in this CSV
ggpaired(
  average_NUMTs_six_Lep_families,
  x = "Coverage",
  y = "mean_NUMTs",
  order = c("low", "high"),
  ylab = "Mean NUMT counts",
  xlab = "Coverage category"
)
