#requirements: 'Table S2.xlsx' and 'results_mitogenome_coding_only.csv'

#set working directory and load required packages
#replace the placeholder path with the folder that holds the two input files;
#they are read (and the PDFs are written) using paths relative to it
setwd("PATH/TO/WORKING/DIRECTORY")
#library() stops with an error when a package is missing, whereas require()
#only warns and returns FALSE, letting the script fail later on
library(readxl)
library(ggplot2)
library(scales)
library(grid)
library(gridExtra)
library(ggpubr)

############################################################################################
########################## FIGURE 5 - UPPER PLOT ###########################################
############################################################################################

#read the raw data from the first sheet of the workbook
input <- read_excel("Table S2.xlsx", sheet = 1)

#reformat headers for ggplot: spaces become "_", parentheses are removed and
#commas become "_in"
names(input) <- local({
  headers <- names(input)
  #each entry is c(literal text to find, replacement), applied in this order
  substitutions <- list(c(" ", "_"), c("(", ""), c(")", ""), c(",", "_in"))
  for (pair in substitutions) {
    headers <- gsub(pair[1], pair[2], headers, fixed = TRUE)
  }
  headers
})

#round x up to the nearest multiple of `to` (50 unless stated otherwise);
#works element-wise on numeric vectors
roundUp50 <- function(x, to = 50) {
  multiples_needed <- ceiling(x / to)
  multiples_needed * to
}

#linear model on the untransformed values, with genome assembly size (Mb) as
#the response and total NUMT count as the predictor
#NOTE(review): the figure built from these values puts NUMT count on y and
#draws its fit on log10-scaled axes, so the R^2 annotated there (taken from
#this raw-scale model) is presumably not the R^2 of the drawn line - confirm
lm_NUMTs <- lm(Genome_Asssembly_Size_Mb ~ NUMT_Count_Total, data = input)
#p-value of the slope (row 2, column 4 of the coefficient table), 7 digits
P <- format(coef(summary(lm_NUMTs))[2, 4], digits = 7)
#R squared of the model, 3 significant digits
R2 <- format(summary(lm_NUMTs)$r.squared, digits = 3)

#scatter plot of total NUMT count (y) against genome assembly size in Mb (x)
g.plot <- ggplot(input, aes(x = Genome_Asssembly_Size_Mb, y = NUMT_Count_Total)) +
  geom_point() +
  #straight-line fit drawn without a confidence band
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, colour = "red") +
  #stat_cor(cor.coef.name = "rho", method = "spearman", color = "red", geom = "label", label.x = 1, label.y = 2) +
  #red label showing the R2 string; parse = TRUE renders it as an expression
  annotate(geom = "label", x = 90, y = 100,
           label = paste("R^2 == ", R2),
           size = 5,
           colour = "red",
           parse = TRUE) +
  #both axes are log10-transformed, with breaks that double at every step
  scale_x_continuous(trans = "log10",
                     breaks = c(50, 100, 200, 400, 800, 1600, 3200, 6400)) +
  scale_y_continuous(trans = "log10",
                     breaks = c(1, 2, 4, 8, 16, 32, 64, 128, 256, 512)) +
  theme_bw() +
  labs(x = "Log Genomic Assembly Length (Mb)", y = "Log NUMT Count") +
  #restrict the visible window of the panel
  coord_cartesian(ylim = c(1, 500), xlim = c(50, 6000)) +
  #no aesthetic in this plot produces a legend, so the legend.* settings
  #below currently have nothing to act on
  theme(axis.title.x = element_text(vjust = 0.5, face = "bold", size = 12),
        axis.title.y = element_text(vjust = 1, face = "bold", size = 12),
        plot.title = element_text(size = 16, face = "bold", vjust = 2),
        axis.text.y = element_text(face = "bold", size = 9),
        axis.text.x = element_text(face = "bold", size = 9, hjust = 0.5, vjust = 0.25),
        panel.grid.major = element_blank(),
        legend.position = c(0.9, 0.8),
        legend.background = element_rect(size = 0.5, linetype = "solid", colour = "black"),
        legend.title = element_blank())
#display the plot on the active graphics device
g.plot

#write the plot to a PDF in the working directory
ggsave("Figure_5_upper.pdf", plot = g.plot)


############################################################################################
########################## FIGURE 5 - LOWER PLOT ###########################################
############################################################################################

#input raw data (this replaces the `input` object used for the upper plot)
#TRUE is spelled out because T is an ordinary variable that can be reassigned
input <- read.csv("results_mitogenome_coding_only.csv", header = TRUE)

#helper that rounds x up to the nearest multiple of `to` (default 50)
roundUp50 <- function(x, to = 50) {
  ceiling(x / to) * to
}

#linear model on the untransformed values, with genome size (Mb) as the
#response and the average mitogenome NUMT count as the predictor
#NOTE(review): the figure built from these values puts the NUMT count on y
#and draws its fit on log10-scaled axes, so the R^2 annotated there (taken
#from this raw-scale model) is presumably not the R^2 of the drawn line -
#confirm
lm_NUMTs <- lm(Genome.Size.Mb ~ Average_NUMTs_mitogenome, data = input)
#p-value of the slope (row 2, column 4 of the coefficient table), 7 digits
P <- format(coef(summary(lm_NUMTs))[2, 4], digits = 7)
#R squared of the model, 3 significant digits
R2 <- format(summary(lm_NUMTs)$r.squared, digits = 3)


#plot NUMT count vs genome size: average mitogenome NUMT count on y, genome
#size in Mb on x
g.plot <- ggplot(input, aes(x = Genome.Size.Mb, y = Average_NUMTs_mitogenome)) +
  geom_point() +
  #straight-line fit drawn without a confidence band
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, colour = "red") +
  #stat_cor(cor.coef.name = "rho", method = "spearman", color = "red", geom = "label", label.x = 1, label.y = 2) +
  #red label showing the R2 string; parse = TRUE renders it as an expression
  annotate(geom = "label", x = 90, y = 100,
           label = paste("R^2 == ", R2),
           size = 5,
           colour = "red",
           parse = TRUE) +
  #both axes are log10-transformed, with breaks that double at every step
  scale_x_continuous(trans = "log10",
                     breaks = c(50, 100, 200, 400, 800, 1600, 3200, 6400)) +
  scale_y_continuous(trans = "log10",
                     breaks = c(1, 2, 4, 8, 16, 32, 64, 128, 256, 512)) +
  theme_bw() +
  labs(x = "Log Genomic Assembly Length (Mb)", y = "Log Average Mitogenome NUMT Count") +
  #restrict the visible window of the panel
  coord_cartesian(ylim = c(1, 500), xlim = c(50, 6400)) +
  #no aesthetic in this plot produces a legend, so the legend.* settings
  #below currently have nothing to act on
  theme(axis.title.x = element_text(vjust = 0.5, face = "bold", size = 12),
        axis.title.y = element_text(vjust = 1, face = "bold", size = 12),
        plot.title = element_text(size = 16, face = "bold", vjust = 2),
        axis.text.y = element_text(face = "bold", size = 9),
        axis.text.x = element_text(face = "bold", size = 9, hjust = 0.5, vjust = 0.25),
        panel.grid.major = element_blank(),
        legend.position = c(0.9, 0.8),
        legend.background = element_rect(size = 0.5, linetype = "solid", colour = "black"),
        legend.title = element_blank())
#display the plot on the active graphics device
g.plot

#write the plot to a PDF in the working directory
ggsave("Figure_5_lower.pdf", plot = g.plot)