# Requirements: the file 'NUMT_Reliable_Hits_Results.csv' with the columns 'STOP', 'IPSC', and 'NUMT.Category' populated, saved as the Excel workbook 'NUMT_Reliable_Hits_Results.xlsx'.

# The relative file names used below (the input workbook and the output PDF)
# are resolved against this directory. The path is a placeholder: replace it
# with the real working directory before running.
setwd("/PATH/TO/WORKING/DIRECTORY")
require(ggplot2)
require(scales)
require(grid)
require(gridExtra)
require(ggalt)
require(ggforce)
require(tidyr)
require(ggpubr)
library("seqinr")
library(Biostrings)
library(readxl)

# Load the reliable-hits sheet; cells containing the literal text "NA" are read
# as missing. Wrapping the result in data.frame() gives a base data frame whose
# column names are made syntactic (e.g. spaces become dots).
# NOTE(review): this file name ends in "_.xlsx", whereas the requirements
# comment at the top of the script names 'NUMT_Reliable_Hits_Results.xlsx' --
# confirm which one is correct.
input <- data.frame(
  read_excel(
    path = "NUMT_Reliable_Hits_Results_.xlsx",
    sheet = "NUMT_Reliable_Hits_Results",
    na = "NA"
  )
)

# Reduce the data set to only C5* NUMTs.
# %in% never returns NA, so rows with a missing NUMT.Category are simply
# dropped. With `==` the comparison yields NA for those rows, and logical
# indexing with NA inserts all-NA rows into the result.
input <- input[input$NUMT.Category %in% "C5*", ]

# Mean percent identity per species (one row per species in `means`)
means <- aggregate(Percent.ID ~ Species, data = input, FUN = mean)

# Copy each species' mean onto all of its rows, rounded to 3 decimals
input$Mean.Percent.ID <- round(
  means$Percent.ID[match(input$Species, means$Species)],
  digits = 3
)

# Divergence is 100 minus percent identity, for each hit and for the species mean
input$Divergence <- round(100 - input$Percent.ID, digits = 3)
input$Mean.Divergence <- round(100 - input$Mean.Percent.ID, digits = 3)

# Tabulate the number of rows per species ...
df <- data.frame(table(input$Species))
names(df) <- c("Species", "Count")

# ... and attach that count to every row of the species
input$NUMT.Count <- df$Count[match(input$Species, df$Species)]

# Convert the grouping columns to factors.
# Species: levels follow the order of first appearance in the data.
input[["Species"]] <- factor(
  input[["Species"]],
  levels = unique(input[["Species"]])
)
# Order: fixed level order; any value not listed here becomes NA.
input[["Order"]] <- factor(
  input[["Order"]],
  levels = c(
    "Lepidoptera", "Hymenoptera", "Coleoptera", "Diptera", "Hemiptera",
    "Blattodea", "Orthoptera", "Phasmatodea", "Plecoptera"
  )
)

# Custom colours, one per level of input$Order (in level order).
# The vector is named by the Order levels so that scale_colour_manual() matches
# colours by name. An unnamed vector is matched by position against the levels
# actually present in the data, so an order with no rows after filtering would
# shift the colour of every order that follows it.
number <- nlevels(input$Order)
palette <- c(
  "#ffaa00", "#cc0011", "#0099cc", "#443333", "#66cc33",
  "gray60", "gray60", "gray60", "gray60"
)
names(palette) <- levels(input$Order)

# Per-hit horizontal jitter around the NUMT count. The direction of the offset
# alternates between consecutive species (block "A": to the right, block "B":
# to the left) so neighbouring species at the same count are pulled apart.
# NOTE(review): no random seed is set, so the jitter differs between runs.

# Sort by count, then by decreasing mean divergence; this fixes the species
# order used for the A/B alternation below.
input <- input[order(input$NUMT.Count, -input$Mean.Divergence), ]

# Draw a uniform offset in [-0.45, 0.45] per row and keep only its magnitude
input$NUMT.Count.Jitter <- jitter(input$NUMT.Count, factor = 1, amount = 0.45)
input$Jitter.Diff <- with(input, NUMT.Count - NUMT.Count.Jitter)
input$Jitter.Diff.Abs <- abs(input$Jitter.Diff)

# Alternate species between blocks "A" and "B" in their sorted order
temp.df <- data.frame(
  "Species" = unique(input$Species),
  "Block" = rep_len(c("A", "B"), length.out = length(unique(input$Species)))
)
input$Jitter.Block <- temp.df$Block[match(input$Species, temp.df$Species)]

# Sign the magnitude: +1 for block "A", -1 for block "B"
input$Jitter.Diff.New <- with(
  input,
  Jitter.Diff.Abs * (1 - 2 * (Jitter.Block == "B"))
)
input$NUMT.Count.Jitter <- with(input, NUMT.Count + Jitter.Diff.New)

# Per-species jitter: a single x offset shared by all rows of a species, used
# as the x position of that species' mean-divergence marker.

# One count and one uniform offset in [-0.45, 0.45] per species
temp.df$NUMT.Count <- input$NUMT.Count[match(temp.df$Species, input$Species)]
temp.df$NUMT.Count.Jitter.Mean <- jitter(
  temp.df$NUMT.Count,
  factor = 1,
  amount = 0.45
)

# Broadcast the species-level value to the rows and keep only its magnitude
input$NUMT.Count.Jitter.Mean <-
  temp.df$NUMT.Count.Jitter.Mean[match(input$Species, temp.df$Species)]
input$NUMT.Count.Jitter.Mean.Diff <- with(
  input,
  NUMT.Count - NUMT.Count.Jitter.Mean
)
input$NUMT.Count.Jitter.Mean.Diff.Abs <- abs(input$NUMT.Count.Jitter.Mean.Diff)

# Same side as the per-hit jitter: +1 for block "A", -1 for block "B"
input$Jitter.Mean.Diff.New <- with(
  input,
  NUMT.Count.Jitter.Mean.Diff.Abs * (1 - 2 * (Jitter.Block == "B"))
)
input$NUMT.Count.Jitter.Mean <- with(input, NUMT.Count + Jitter.Mean.Diff.New)

# Plot: divergence of every hit against the jittered NUMT count of its species.
# Logical arguments are spelled TRUE/FALSE; T and F are ordinary variables that
# can be reassigned.
g.plot <- ggplot(input, aes(x = NUMT.Count.Jitter, y = Divergence)) +
  # One point per hit, coloured by Order; the only layer shown in the legend
  geom_point(
    aes(colour = Order),
    size = 1.2, alpha = 0.75, show.legend = TRUE
  ) +
  # Open circle at the species' mean divergence (species with more than one hit)
  geom_point(
    data = input[input$NUMT.Count > 1, ],
    aes(x = NUMT.Count.Jitter.Mean, y = Mean.Divergence, colour = Order),
    size = 1.5, shape = 1, stroke = 1.5, show.legend = FALSE
  ) +
  # Dashed line from the species' mean marker to each of its hits
  geom_segment(
    data = input[input$NUMT.Count > 1, ],
    aes(
      y = Mean.Divergence, x = NUMT.Count.Jitter.Mean,
      xend = NUMT.Count.Jitter, yend = Divergence,
      group = Species, colour = Order
    ),
    size = 0.4, alpha = 0.8, linetype = "dashed", show.legend = FALSE
  ) +
  theme_bw() +
  labs(x = "# NUMTs", y = "Divergence (%)") +
  # Hard-coded axis breaks (no tick at 7)
  scale_x_continuous(breaks = c(1:6, 8, 9, 10)) +
  scale_y_continuous(breaks = c(5, 10, 15, 20, 25)) +
  scale_colour_manual(values = palette) +
  theme(
    axis.title.x = element_text(vjust = 0.5, face = "bold", size = 10),
    axis.title.y = element_text(vjust = 1, face = "bold", size = 10),
    axis.text.y = element_text(face = "bold", size = 9),
    axis.text.x = element_text(
      face = "bold", size = 7, angle = -45, hjust = 0.5, vjust = 0.25
    ),
    # Keep only the minor vertical grid lines, which fall between the breaks
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_line(colour = "grey", size = 0.25),
    panel.grid.minor.y = element_blank()
  )

# Write the figure as a landscape US-letter PDF. The plot object is passed
# explicitly rather than relying on ggsave()'s default of last_plot(), so an
# unrelated plot created in the same session cannot be saved by mistake.
ggsave("Figure 10.pdf", plot = g.plot, width = 11, height = 8.5, units = "in")
