#requirements: File 'NUMT_Reliable_Hits_Results.csv', after columns 'STOP', 'IPSC', and 'NUMT.Category' have been populated, and saved as 'NUMT_Reliable_Hits_Results.xlsx'.

#set working directory and load required packages
#NOTE(review): "/PATH/TO/WORKING/DIRECTORY" is a placeholder. The input workbook is read from,
#and every output PDF is written to, this directory (all file names below are relative).
setwd("/PATH/TO/WORKING/DIRECTORY")
#NOTE(review): require() only warns and returns FALSE when a package is missing, so a missing
#package surfaces later as a "could not find function" error rather than here.
require(ggplot2)   #plotting (ggplot, geoms, scales, themes, ggsave, ggplotGrob)
require(scales)    #percent_format() for the % ID axis labels
require(grid)      #NOTE(review): not referenced directly in the visible script
require(gridExtra) #marrangeGrob() for the multi-page supplemental PDFs
require(ggalt)     #NOTE(review): not referenced in the visible script
require(ggforce)   #facet_wrap_paginate() and n_pages()
require(tidyr)     #NOTE(review): not referenced in the visible script
require(ggpubr)    #NOTE(review): not referenced in the visible script
library("seqinr")  #NOTE(review): not referenced in the visible script
library(Biostrings) #NOTE(review): not referenced in the visible script
library(readxl)    #read_excel() for the input workbook

#read input file (sheet 'NUMT_Reliable_Hits_Results'; the literal string "NA" is read as missing)
all.input <- data.frame(read_excel("NUMT_Reliable_Hits_Results.xlsx", sheet = "NUMT_Reliable_Hits_Results", na = "NA"))

#fill in missing data for species with no NUMTs (rows that have no NUMT.ID)
no.numt.rows <- is.na(all.input$NUMT.ID)
all.input$Sequence.Length[no.numt.rows] <- 0
all.input$Percent.ID[no.numt.rows] <- 0
all.input$IPSC[no.numt.rows] <- "No"

#rename a column and format data
#NOTE(review): the rename is positional; it assumes column 10 is the percent-identity column
#(filled in as Percent.ID above) - confirm against the workbook layout.
names(all.input)[10] <- "X..Pairwise.Identity"
all.input$Sequence.Length <- as.numeric(all.input$Sequence.Length)
all.input$X..Pairwise.Identity <- as.numeric(all.input$X..Pairwise.Identity)
all.input$IPSC <- as.character(all.input$IPSC)
all.input$NUMT.Category <- as.character(all.input$NUMT.Category)
#fixed level order; any value not listed here becomes NA
all.input$NUMT.Category <- factor(all.input$NUMT.Category, levels = c("C1","C2","C3","C4","C5","C5*","Diagnosable"))

#Convert pairwise identity column to decimal format (e.g. 97.5 -> 0.975)
all.input$X..Pairwise.Identity <- round(all.input$X..Pairwise.Identity / 100, digits = 3)

#Replace underscores in species names with spaces
all.input$Species <- gsub("_", " ", all.input$Species)

#Add genome size to Species name, e.g. "Genus species (123 Mb)"
all.input$Species <- paste0(all.input$Species, " (", round(as.numeric(all.input$Size.Mb), digits = 0), " Mb)")

#Reorder data by Order, then Family, then Species
all.input <- all.input[with(all.input, order(Order, Family, Species)), ]

#add labels for plots, and parse data frame by order
input <- all.input
#rows with Sequence.Length 0 (species without NUMTs, see above) get a "No NUMTs" panel label;
#all other rows get an empty label. The yes/no arguments are plain values: assigning to
#input$Label inside ifelse() only worked as a side effect of argument evaluation.
input$Label <- ifelse(input$Sequence.Length == 0, "No NUMTs", "")
order.dataframe.list <- split(input, input$Order)

#Split data into five major orders + all other orders.
#order.dataframe.list is named by order (it comes from split() on the Order column), so the
#names can be tested directly; the relative order of the list elements is preserved.
major.orders <- c("Coleoptera","Diptera","Hemiptera","Hymenoptera","Lepidoptera")
is.major.order <- names(order.dataframe.list) %in% major.orders
order.dataframe.list.five <- order.dataframe.list[is.major.order]
order.dataframe.list.others <- order.dataframe.list[!is.major.order]

############################################################################################
########################## FIGURE 2 ########################################################
############################################################################################
#Scatter plot of every NUMT: % identity (x) against sequence length (y), coloured by IPSC
#("Yes"/"No"). The view is restricted to 60-100 % identity and 100-800 bp via coord_cartesian,
#which zooms without discarding data. No legend is drawn (show.legend = FALSE).
g.plot <- ggplot(input, aes(x = X..Pairwise.Identity, y = Sequence.Length)) +
  geom_point(aes(colour = IPSC), shape = 19, alpha = 0.7, size = 1, stroke = 0, show.legend = FALSE) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1), breaks = seq(0, 1, 0.1)) +
  scale_y_continuous(breaks = seq(0, 800, 100)) +
  coord_cartesian(xlim = c(0.6, 1), ylim = c(100, 800)) +
  #FALSE spelled out: the shorthand F is an ordinary variable that can be reassigned
  scale_colour_manual(values = c("Yes" = "#bcd631", "No" = "red"), drop = FALSE) +
  guides(colour = guide_legend(override.aes = list(shape = c(19, 19)))) +
  theme_bw() +
  labs(x = "% ID", y = "Sequence Length (bp)") +
  theme(axis.title.x = element_text(vjust = 0.5, face = "bold", size = 12),
        axis.title.y = element_text(vjust = 1, face = "bold", size = 12),
        axis.text.y = element_text(face = "bold", size = 9),
        axis.text.x = element_text(face = "bold", size = 7, angle = -45, hjust = 0.5, vjust = 0.25),
        panel.grid.major = element_blank())

#page size is not specified, so ggsave falls back to its default dimensions
ggsave("Figure 2.pdf", g.plot)


############################################################################################
########################## FIGURE 9 ########################################################
############################################################################################

#reduce dataframe to only species of interest
#(labels must match the "Species (size Mb)" strings built during data formatting exactly)
species.of.interest <- c("Hycleus phaleratus (91 Mb)","Gonioctena quinquepunctata (1732 Mb)","Liriomyza trifolii (70 Mb)",
                         "Rhagoletis pomonella (1223 Mb)","Orius laevigatus (151 Mb)","Philaenus spumarius (2761 Mb)",
                         "Eumacrocentrus americanus (117 Mb)","Neuroterus quercusbaccarum (2570 Mb)",
                         "Papilio polytes (227 Mb)","Parnassius apollo (1392 Mb)")
input.species.of.interest <- input[input$Species %in% species.of.interest, ]

#Category -> aesthetic lookups. The values are named so that each category always receives
#the same colour/shape/alpha: unnamed values are matched by position to whichever categories
#happen to be present, so a missing category would shift every later one.
numt.colours <- c("C1" = "#712b90", "C2" = "#00aeef", "C3" = "#84c441", "C4" = "#ffda00",
                  "C5" = "#f37121", "C5*" = "#ff0000", "Diagnosable" = "grey20")
numt.shapes <- c("C1" = 19, "C2" = 19, "C3" = 19, "C4" = 19, "C5" = 19, "C5*" = 17, "Diagnosable" = 18)
numt.alphas <- c("C1" = 0.7, "C2" = 0.7, "C3" = 0.7, "C4" = 0.7, "C5" = 0.7, "C5*" = 0.7, "Diagnosable" = 0.35)

#plot data: one panel per species, in the order listed above, filled row-wise into a
#5 x 2 grid (the plot title labels the two columns)
g.plot.2 <- ggplot(input.species.of.interest, aes(x = X..Pairwise.Identity, y = Sequence.Length)) +
  geom_point(aes(colour = NUMT.Category, shape = NUMT.Category, alpha = NUMT.Category), size = 2, stroke = 0) +
  facet_wrap(~factor(Species, levels = species.of.interest) + Family + Order, nrow = 5, ncol = 2) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1), breaks = seq(0, 1, 0.1)) +
  scale_y_continuous(breaks = seq(0, 700, 100)) +
  coord_cartesian(xlim = c(0.6, 1), ylim = c(0, 800)) +
  #drop = FALSE keeps all seven categories in the legend, which the length-7 override.aes
  #vectors in guides() below rely on
  scale_color_manual(values = numt.colours, drop = FALSE) +
  scale_shape_manual(values = numt.shapes, guide = 'none') +
  scale_alpha_manual(values = numt.alphas, guide = 'none') +
  theme_bw() +
  labs(title = "            Smallest Genome                                   Largest Genome", x = "% ID", y = "Sequence Length (bp)") +
  #shape and alpha legends are suppressed above, so the colour legend keys are given the
  #matching shapes (and a stronger alpha) by hand
  guides(colour = guide_legend(override.aes = list(shape = c(19,19,19,19,19,17,18), alpha = c(1,1,1,1,1,1,0.5)), title = "NUMT Category")) +
  theme(axis.title.x = element_text(vjust = 0.5, face = "bold", size = 12),
        axis.title.y = element_text(vjust = 1, face = "bold", size = 12),
        plot.title = element_text(size = 14, face = "bold", vjust = 1),
        axis.text.y = element_text(face = "bold", size = 9),
        axis.text.x = element_text(face = "bold", size = 7, angle = -45, hjust = 0.5, vjust = 0.25),
        panel.grid.major = element_blank(),
        strip.text.x = element_text(size = 9, colour = "black", angle = 0, margin = margin(3, 0, 3, 0)))
ggsave("Figure 9.pdf", g.plot.2, width = 8.5, height = 11, units = "in")


############################################################################################
########################## SUPPLEMENTAL FIGURE S5- MAIN ORDERS #############################
############################################################################################
#One multi-page PDF: each of the five major orders is plotted separately, 15 species panels
#(3 rows x 5 columns) per page, with pages titled "<Order> (pg k of n)".

#Category -> aesthetic lookups. The values are named so that each category always receives
#the same colour/shape/alpha: unnamed values are matched by position to whichever categories
#are present in the order being plotted, so an order lacking a category would shift the rest.
numt.colours <- c("C1" = "#712b90", "C2" = "#00aeef", "C3" = "#84c441", "C4" = "#ffda00",
                  "C5" = "#f37121", "C5*" = "#ff0000", "Diagnosable" = "grey20")
numt.shapes <- c("C1" = 19, "C2" = 19, "C3" = 19, "C4" = 19, "C5" = 19, "C5*" = 17, "Diagnosable" = 18)
numt.alphas <- c("C1" = 0.7, "C2" = 0.7, "C3" = 0.7, "C4" = 0.7, "C5" = 0.7, "C5*" = 0.7, "Diagnosable" = 0.35)

#build the page-independent part of the plot once per order
order.plots.five <- lapply(order.dataframe.list.five, function(order.df) {
  #convert species to factor, maintaining the existing row order (sorted by Family, then Species)
  order.df$Species <- factor(order.df$Species, levels = unique(order.df$Species))
  ggplot(order.df, aes(x = X..Pairwise.Identity, y = Sequence.Length)) +
    geom_point(aes(colour = NUMT.Category, shape = NUMT.Category, alpha = NUMT.Category), size = 2, stroke = 0) +
    scale_x_continuous(labels = scales::percent_format(accuracy = 1), breaks = seq(0, 1, 0.1)) +
    scale_y_continuous(breaks = seq(0, 700, 100)) +
    coord_cartesian(xlim = c(0.6, 1), ylim = c(0, 800)) +
    scale_colour_manual(values = numt.colours, guide = guide_legend(title = "NUMT Category", override.aes = list(size = 4), reverse = TRUE)) +
    scale_shape_manual(values = numt.shapes, guide = guide_legend(title = "NUMT Category", reverse = TRUE)) +
    scale_alpha_manual(values = numt.alphas, guide = guide_legend(title = "NUMT Category", reverse = TRUE)) +
    #"No NUMTs" for species without NUMTs, empty text otherwise; drawn at a fixed position
    geom_text(data = order.df, aes(label = Label), x = 0.8, y = 400, show.legend = FALSE) +
    theme_bw() +
    labs(x = "% ID", y = "Sequence Length (bp)") +
    theme(axis.title.x = element_text(vjust = 0.5, face = "bold", size = 12),
          axis.title.y = element_text(vjust = 1, face = "bold", size = 12),
          plot.title = element_text(size = 16, face = "bold", vjust = 2),
          axis.text.y = element_text(face = "bold", size = 9),
          axis.text.x = element_text(face = "bold", size = 7, angle = -45, hjust = 0.5, vjust = 0.25),
          panel.grid.major = element_blank(),
          strip.text.x = element_text(size = 9, colour = "black", angle = 0, margin = margin(3, 0, 3, 0)))
})

#number of pages each order needs, counted with the same facets that are drawn below
pages.per.order <- vapply(order.plots.five, function(order.plot) {
  n_pages(order.plot + facet_wrap_paginate(~Species + Family, nrow = 3, ncol = 5))
}, numeric(1))
pages.required <- sum(pages.per.order)

#initialize plot list using required number of pages
plot.list <- vector("list", pages.required)
plot.list.page <- 1

#lookup of which order is shown on each page of the PDF
title.index <- data.frame(Page = as.numeric(seq_len(pages.required)),
                          Order = as.character(rep(names(order.dataframe.list.five), times = pages.per.order)))

#plot each order, one at a time, onto the correct number of pages
for (count in seq_along(order.plots.five)) {
  title <- names(order.dataframe.list.five)[count] #title prefix for each page of this order
  pages <- pages.per.order[[count]]

  #seq_len() yields no iterations if an order needs zero pages (1:0 would run twice)
  for (k in seq_len(pages)) {
    plot.list[[plot.list.page]] <- order.plots.five[[count]] +
      facet_wrap_paginate(~Species + Family, nrow = 3, ncol = 5, page = k) +
      labs(title = sprintf("%s (pg %s of %s)", title, k, pages))
    plot.list.page <- plot.list.page + 1
  }
}

#print all plots to PDF, one plot per page
glist.all <- lapply(plot.list, ggplotGrob)
ggsave("Figure S5a.pdf", marrangeGrob(grobs = glist.all, nrow=1, ncol=1))


############################################################################################
########################## SUPPLEMENTAL FIGURE S5 - OTHER ORDERS ############################
############################################################################################
#One multi-page PDF with all remaining orders pooled together, 15 species panels
#(3 rows x 5 columns) per page, titled "Other Orders (pg k of n)".

#combine all "other" dataframes into a single dataframe
#(from here on order.dataframe.list.others is a data frame, not a list)
order.dataframe.list.others <- do.call(rbind, order.dataframe.list.others)
#convert species to factor, maintaining the existing row order
order.dataframe.list.others$Species <- factor(order.dataframe.list.others$Species, levels = unique(order.dataframe.list.others$Species))
title <- "Other Orders"

#Category -> aesthetic lookups. The values are named so that each category always receives
#the same colour/shape/alpha: unnamed values are matched by position to whichever categories
#are present, so a missing category would shift every later one.
numt.colours <- c("C1" = "#712b90", "C2" = "#00aeef", "C3" = "#84c441", "C4" = "#ffda00",
                  "C5" = "#f37121", "C5*" = "#ff0000", "Diagnosable" = "grey20")
numt.shapes <- c("C1" = 19, "C2" = 19, "C3" = 19, "C4" = 19, "C5" = 19, "C5*" = 17, "Diagnosable" = 18)
numt.alphas <- c("C1" = 0.7, "C2" = 0.7, "C3" = 0.7, "C4" = 0.7, "C5" = 0.7, "C5*" = 0.7, "Diagnosable" = 0.35)

#initialize plot list using correct number of pages (one panel per species, 3 x 5 per page)
panels.per.page <- 3 * 5
pages.required.others <- ceiling(length(unique(order.dataframe.list.others$Species)) / panels.per.page)
plot.list.others <- vector("list", pages.required.others)

#generate one plot per page; species without NUMTs show a "No NUMTs" label (their Label
#column is "No NUMTs", every other row has an empty label).
#seq_len() yields no iterations for zero pages, where seq(1, 0, 1) would raise an error.
for (q in seq_len(pages.required.others)) {
  g.plot.others <- ggplot(order.dataframe.list.others, aes(x = X..Pairwise.Identity, y = Sequence.Length)) +
    geom_point(aes(colour = NUMT.Category, shape = NUMT.Category, alpha = NUMT.Category), size = 2, stroke = 0) +
    facet_wrap_paginate(~Species + Family + Order, nrow = 3, ncol = 5, page = q) +
    scale_x_continuous(labels = scales::percent_format(accuracy = 1), breaks = seq(0, 1, 0.1)) +
    scale_y_continuous(breaks = seq(0, 700, 100)) +
    coord_cartesian(xlim = c(0.6, 1), ylim = c(0, 800)) +
    scale_color_manual(values = numt.colours, guide = guide_legend(title = "NUMT Category", override.aes = list(size = 4), reverse = TRUE)) +
    scale_shape_manual(values = numt.shapes, guide = guide_legend(title = "NUMT Category", reverse = TRUE)) +
    scale_alpha_manual(values = numt.alphas, guide = guide_legend(title = "NUMT Category", reverse = TRUE)) +
    geom_text(data = order.dataframe.list.others, aes(label = Label), x = 0.8, y = 400, show.legend = FALSE) +
    theme_bw() +
    labs(title = sprintf("%s (pg %s of %s)", title, q, pages.required.others), x = "% ID", y = "Sequence Length (bp)") +
    theme(axis.title.x = element_text(vjust = 0.5, face = "bold", size = 12),
          axis.title.y = element_text(vjust = 1, face = "bold", size = 12),
          plot.title = element_text(size = 16, face = "bold", vjust = 2),
          axis.text.y = element_text(face = "bold", size = 9),
          axis.text.x = element_text(face = "bold", size = 7, angle = -45, hjust = 0.5, vjust = 0.25),
          panel.grid.major = element_blank(),
          strip.text.x = element_text(size = 9, colour = "black", angle = 0, margin = margin(3, 0, 3, 0)))
  plot.list.others[[q]] <- g.plot.others
}

#print plots to PDF, one plot per page
glist.all.others <- lapply(plot.list.others, ggplotGrob)
ggsave("Figure S5b.pdf", marrangeGrob(grobs = glist.all.others, nrow=1, ncol=1))


############################################################################################
########################## SUPPLEMENTAL FIGURE S6 ##########################################
############################################################################################
#Scatter plot of genome size (x, log10) against the number of C5* NUMTs per species (y, log10).

#reduce data set to only C5* NUMTs and count for each species
#%in% treats a missing category as "not C5*"; a plain == comparison yields NA for those rows,
#and indexing a data frame with NA inserts all-NA rows.
input.red <- input[input$NUMT.Category %in% "C5*", ]
red.counts <- data.frame(table(input.red$Species))
names(red.counts) <- c("Species","Red.Count")
#species -> genome size lookup, selected by column name rather than by position
size.table <- unique(input[, c("Species", "Size.Mb")])

#add missing species (those without any C5* NUMT) to data set with a count of 0
missing.species <- setdiff(unique(input$Species), as.character(red.counts$Species))
red.counts.2 <- data.frame("Species" = c(as.character(red.counts$Species), missing.species),
                           "Red.Count" = c(red.counts$Red.Count, rep(0, length(missing.species))))
#as.numeric() guards against the size column having been read as text; the x scale is continuous
red.counts.2$Genome.Size.Mb <- as.numeric(size.table$Size.Mb[match(red.counts.2$Species, size.table$Species)])

#generate plot
#NOTE(review): both axes are log10, so the zero counts added above (and the 0 entries in the
#break vectors) transform to -Inf - confirm that species with no C5* NUMTs render as intended.
g.plot.red <- ggplot(red.counts.2, aes(x = Genome.Size.Mb, y = Red.Count)) +
  geom_point(size = 2, stroke = 0) +
  scale_x_continuous(trans = 'log10', breaks = c(seq(0,1000,100),seq(1000,4000,1000))) +
  scale_y_continuous(trans = 'log10', breaks = c(seq(0,10,1),seq(10,80,10))) +
  theme_bw() +
  labs(x = "Genome Size (Mb)", y = "Number of C5* NUMTs") +
  theme(axis.title.x = element_text(vjust = 0.5, face = "bold", size = 12),
        axis.title.y = element_text(vjust = 1, face = "bold", size = 12),
        plot.title = element_text(size = 16, face = "bold", vjust = 2),
        axis.text.y = element_text(face = "bold", size = 9),
        axis.text.x = element_text(face = "bold", size = 7, angle = -45, hjust = 0.5, vjust = 0.25),
        panel.grid.minor = element_blank())
ggsave("Figure S6.pdf", g.plot.red)

