#load the required library 
library(ggplot2)

# Read the table of nuclear assembly statistics.
# stringsAsFactors = FALSE keeps text columns as character on every R version:
# before R 4.0 read.csv() converted them to factors, and as.numeric() on a
# factor returns the internal level codes rather than the parsed numbers
# (Coverage is passed through as.numeric() further down in this script).
insecta_genome_stats <- read.csv(
  file = "Insecta_genomes.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Order assembly levels from low to high completeness
# (Contig, Scaffold, Chromosome).
insecta_genome_stats$Assembly_level <- factor(
  insecta_genome_stats$Assembly_level,
  levels = c("Contig", "Scaffold", "Chromosome")
)

# Separate the 15 samples to be removed (these are assemblies of high coverage
# but small assembly length; the 'Comments' section on NCBI indicated they
# derive from endosymbiont bacteria).
# %in% never returns NA, so a row with a missing Notes value is left out of
# both subsets instead of being turned into an all-NA row in each of them.
insecta_genome_keep <-
  insecta_genome_stats[insecta_genome_stats$Notes %in% "keep", ]
insecta_genome_remove <-
  insecta_genome_stats[insecta_genome_stats$Notes %in% "remove", ]

# Figure S1, lower panel: log10 assembly length against log10 assembly
# coverage for the retained assemblies, with the removed assemblies overlaid
# in light grey.
# NOTE(review): shape = Coverage_category sits in the top-level aes() and is
# inherited by every layer; presumably this is what makes stat_smooth() fit
# one line per coverage category -- confirm.
ggplot(
  data = insecta_genome_keep,
  mapping = aes(
    x = log10(as.numeric(Coverage)),
    y = log10(as.numeric(Assembly_length_Mb)),
    shape = Coverage_category
  )
) +
  # Drawn with alpha = 0, so this layer is fully transparent.
  geom_point(aes(fill = Coverage_category), pch = 21, size = 1, alpha = 0) +
  # Retained assemblies, coloured by assembly level.
  geom_point(aes(colour = Assembly_level), size = 3, alpha = 0.4) +
  # Linear fit without a confidence band.
  stat_smooth(method = "lm", colour = "black", se = FALSE, size = 0.3) +
  # Dashed vertical reference line at log10(coverage) = 0.7.
  geom_vline(xintercept = 0.7, linetype = "dashed", size = 0.5) +
  # Removed assemblies; added as the last layer so they are drawn on top.
  geom_point(
    data = insecta_genome_remove,
    mapping = aes(
      x = log10(as.numeric(Coverage)),
      y = log10(as.numeric(Assembly_length_Mb))
    ),
    colour = "lightgrey",
    size = 3
  ) +
  labs(
    x = "\nLog genome assembly coverage",
    y = "Log assembly length (Mb)\n"
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 12)
  )

# Figure S1, upper panel: histogram of log10 assembly coverage for the
# retained assemblies, with the same dashed reference line at 0.7 that the
# scatter plot uses.
ggplot(
  data = insecta_genome_keep,
  mapping = aes(x = log10(as.numeric(Coverage)))
) +
  geom_histogram(binwidth = 0.05, colour = "black", fill = "black") +
  geom_vline(xintercept = 0.7, linetype = "dashed", size = 0.5) +
  labs(
    x = "\nLog genome assembly coverage",
    y = "Count\n"
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_text(size = 12)
  )

# Partition the retained assemblies by coverage category ("low" / "high").
# Assemblies with no coverage information are classified as low coverage.
low_coverage_assemblies <- insecta_genome_keep[
  insecta_genome_keep[["Coverage_category"]] == "low",
]
high_coverage_assemblies <- insecta_genome_keep[
  insecta_genome_keep[["Coverage_category"]] == "high",
]

# Save the high-coverage assemblies to disk; this file will be used to make
# the unique genera dataset.
write.csv(
  high_coverage_assemblies,
  file = "high_coverage_assemblies.csv"
)

# Test for correlation between coverage and genome assembly size (both on a
# log10 scale), separately for the low- and the high-coverage assemblies.
# The results are stored so that their p-values can be reused below, and
# printed explicitly so they are still shown when the script is source()d.
coverage_cor_low <- cor.test(
  log10(as.numeric(low_coverage_assemblies$Coverage)),
  log10(low_coverage_assemblies$Assembly_length_Mb)
)
coverage_cor_high <- cor.test(
  log10(as.numeric(high_coverage_assemblies$Coverage)),
  log10(high_coverage_assemblies$Assembly_length_Mb)
)
print(coverage_cor_low)
print(coverage_cor_high)

# Correct the P values for multiple comparisons using the Bonferroni method.
# The P values are taken from the test objects instead of being typed in by
# hand (previously c(2.2e-16, 0.03496)): "2.2e-16" is only the floor at which
# R prints "< 2.2e-16", not the actual P value, and hand-copied numbers go
# stale as soon as the input data change.
pvals <- c(coverage_cor_low$p.value, coverage_cor_high$p.value)
p.adjust(pvals, method = "bonferroni")


## Figure S3: log10 assembly length against log10 flow cytometry genome size
## for the high-coverage assemblies, coloured by assembly level, with a
## linear fit.
# NOTE(review): xlim()/ylim() set scale limits, so points outside [1.5, 4]
# would be dropped before the linear fit is computed -- confirm that no
# assemblies fall outside this range.
ggplot(
  data = high_coverage_assemblies,
  mapping = aes(
    x = log10(FCM_genome_size),
    y = log10(Assembly_length_Mb)
  )
) +
  geom_point(aes(colour = Assembly_level), size = 3, alpha = 0.4) +
  stat_smooth(method = "lm", colour = "black", se = FALSE, size = 0.6) +
  ylim(1.5, 4) +
  xlim(1.5, 4) +
  labs(
    x = "\nLog flow cytometry genome size (Mb)",
    y = "Log assembly length (Mb)\n"
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 12)
  )

# Test for correlation between flow cytometry genome size estimates and
# genome assembly size (log10 scale) across all high-coverage assemblies.
# The result is stored so its p-value can be reused for the correction below,
# and printed explicitly so it is still shown when the script is source()d.
fcm_cor_all <- cor.test(
  log10(high_coverage_assemblies$FCM_genome_size),
  log10(high_coverage_assemblies$Assembly_length_Mb)
)
print(fcm_cor_all)

# Read in the data file with unique genera and FCM data.
# stringsAsFactors = FALSE keeps column types independent of the R version
# (read.csv() converted text columns to factors before R 4.0).
FCM_unique_genera <- read.csv(
  file = "Insecta_genomes_with_FCM_data.unique_genera.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
# Order assembly levels from low to high completeness.
FCM_unique_genera$Assembly_level <- factor(
  FCM_unique_genera$Assembly_level,
  levels = c("Contig", "Scaffold", "Chromosome")
)
# Keep only the rows whose coverage category is "high".
FCM_unique_genera_high_coverage_assemblies <-
  FCM_unique_genera[FCM_unique_genera$Coverage_category == "high", ]

## Plot genome assembly size vs. flow cytometry genome size estimates for the
## unique-genera dataset.
# NOTE(review): xlim()/ylim() set scale limits, so points outside [1.5, 4]
# would be dropped before the linear fit is computed -- confirm that no
# assemblies fall outside this range.
ggplot(
  data = FCM_unique_genera_high_coverage_assemblies,
  mapping = aes(x = log10(FCM_genome_size), y = log10(Assembly_length_Mb))
) +
  geom_point(aes(color = Assembly_level), size = 3, alpha = 0.4) +
  stat_smooth(method = "lm", colour = "black", se = FALSE, size = 0.6) +
  ylim(1.5, 4) +
  xlim(1.5, 4) +
  theme_bw() +
  theme(
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 12),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.position = "none"
  ) +
  xlab("\nLog flow cytometry genome size (Mb)") +
  ylab("Log assembly length (Mb)\n")

# Same correlation test, restricted to the unique-genera dataset.
fcm_cor_unique_genera <- cor.test(
  log10(FCM_unique_genera_high_coverage_assemblies$FCM_genome_size),
  log10(FCM_unique_genera_high_coverage_assemblies$Assembly_length_Mb)
)
print(fcm_cor_unique_genera)

# Correct the P values of the two FCM correlation tests for multiple
# comparisons using the Bonferroni method.
# The P values are taken from the test objects instead of being typed in by
# hand (previously c(2.2e-16, 2.2e-16)): "2.2e-16" is only the floor at which
# R prints "< 2.2e-16", not the actual P value.
pvals <- c(fcm_cor_all$p.value, fcm_cor_unique_genera$p.value)
p.adjust(pvals, method = "bonferroni")


# Read in the data file with NUMT counts.
# stringsAsFactors = FALSE keeps column types independent of the R version
# (read.csv() converted text columns to factors before R 4.0); the two
# categorical columns used below are converted to factors explicitly.
genome_size_with_NUMT_counts <- read.csv(
  file = "Insecta_genomes_with_appended_NUMT_counts_100bp_and_over_no_NA.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Order assembly levels from low to high completeness (Contig, Scaffold,
# Chromosome) and coverage categories from low to high.
genome_size_with_NUMT_counts$Assembly_level <- factor(
  genome_size_with_NUMT_counts$Assembly_level,
  levels = c("Contig", "Scaffold", "Chromosome")
)
genome_size_with_NUMT_counts$Coverage_category <- factor(
  genome_size_with_NUMT_counts$Coverage_category,
  levels = c("low", "high")
)

# Separate assemblies based on coverage category.
# factor() above turns any label other than "low"/"high" into NA; %in% never
# returns NA, so such rows are left out of both subsets instead of showing up
# as all-NA rows in each of them.
# NOTE: these assignments replace the objects of the same name that were
# built earlier from Insecta_genomes.csv.
high_coverage_assemblies <- genome_size_with_NUMT_counts[
  genome_size_with_NUMT_counts$Coverage_category %in% "high",
]
low_coverage_assemblies <- genome_size_with_NUMT_counts[
  genome_size_with_NUMT_counts$Coverage_category %in% "low",
]

## Figure S2: log10 NUMT count per coverage category. Jittered points are
## coloured by assembly level and shaped by coverage category, with a
## semi-transparent box plot drawn on top (its own outlier markers hidden).
# NOTE(review): ylim() sets scale limits, so values outside [0, 3] (including
# log10(0) = -Inf for a zero count) would be dropped before the box plot
# statistics are computed -- confirm this is intended.
ggplot(
  data = genome_size_with_NUMT_counts,
  mapping = aes(x = Coverage_category, y = log10(NUMT_100bp_and_over))
) +
  geom_jitter(
    aes(colour = Assembly_level, shape = Coverage_category),
    width = 0.2,
    size = 5,
    alpha = 0.3
  ) +
  geom_boxplot(fill = "white", alpha = 0.5, outlier.shape = NA) +
  labs(
    x = "\nCoverage category",
    y = "Log NUMT count\n"
  ) +
  ylim(0, 3) +
  # Point shapes for the two coverage categories, in factor-level order.
  scale_shape_manual(values = c(17, 16)) +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text = element_text(size = 15),
    axis.title = element_text(size = 15)
  )

# Wilcoxon rank-sum test for a difference in NUMT counts between the two
# coverage categories (run on the untransformed counts).
wilcox.test(
  NUMT_100bp_and_over ~ Coverage_category,
  data = genome_size_with_NUMT_counts
)
