#requirements: File 'NUMT_Reliable_Hits_Results.csv' for 1500 bp data set, after columns 'STOP' and 'IPSC' have been populated, and saved as 'NUMT_Reliable_Hits_Results_1500bp.csv'.

#set working directory and load required packages
setwd("/PATH/TO/WORKING/DIRECTORY")
library(readxl)
require(ggplot2)
require(scales)
require(grid)
require(gridExtra)

#input raw data (cells containing "NA" are read as missing values)
input <- read.table("NUMT_Reliable_Hits_Results_1500bp.csv", header = TRUE, sep = ",", na.strings = "NA")

#remove unnecessary rows and columns: keep high-coverage hits that have a sequence length
#which() discards rows where the test evaluates to NA; a bare logical index would
#instead return one all-NA row for each of them
keep.rows <- which(input$Sequence.Length != "N/A" & input$Coverage.Category == "high")
input <- input[keep.rows, c("Sequence.Length", "IPSC")]

#format data
#convert via as.character() so the lengths are parsed from their text even if the column was read as a factor
input$Sequence.Length <- as.numeric(as.character(input$Sequence.Length))
#explicit level order for IPSC status; any value outside these three levels becomes NA
input$IPSC <- factor(input$IPSC, levels = c("Yes", "No", "unknown"))

#round x up to the next multiple of `to` (default 50); used for the NUMT count ceiling
#x may be a single number or a numeric vector; values already on a multiple are unchanged
roundUp50 <- function(x, to = 50) {
  multiples <- ceiling(x / to)
  multiples * to
}

#build an unstyled histogram only to read back the tallest bin count
#binwidth must match the final plot so the count ceiling is computed on the same bins
basic.plot <- ggplot(data = input, mapping = aes(x = Sequence.Length)) +
  geom_histogram(binwidth = 10)
#layer 1 of the built plot holds the computed bins; `count` is the number of NUMTs per bin
count.max <- max(ggplot_build(basic.plot)$data[[1]][["count"]])

#plot NUMT length histogram with 10 bp bin width and colour by IPSC status
#NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for lines in favour of `linewidth`, and
#ggplot2 >= 3.5.0 prefers `legend.position.inside` for numeric legend positions; the older
#arguments are kept here so the script still runs on older ggplot2 - confirm the target version
g.plot <- ggplot(input, aes(x = Sequence.Length)) +
  geom_histogram(binwidth = 10, colour = "black", aes(fill = IPSC)) +
  #dashed blue reference lines at 150, 300, 450 and 600 bp
  geom_vline(xintercept = c(150, 300, 450, 600), size = 0.65, color = "blue", linetype = 2) +
  #x ticks every 50 bp up to 500 bp, then every 100 bp
  scale_x_continuous(breaks = c(seq(0, 500, 50), seq(600, 1600, 100))) +
  scale_y_continuous(breaks = seq(0, roundUp50(count.max), 50), expand = expansion(mult = c(0, .1))) +
  #colours and legend labels are keyed by IPSC level name rather than by position, so a
  #level that is absent from the data cannot shift the pairing onto the wrong level
  scale_fill_manual(values = c(Yes = "limegreen", No = "red", unknown = "grey"),
                    labels = c(Yes = "IPSC Present", No = "IPSC Absent", unknown = "Unknown")) +
  theme_bw() +
  labs(x = "NUMT Sequence Length (bp)", y = "NUMT Count") +
  #fixed window: 0-1700 bp wide, y axis capped at the tallest bin rounded up to a multiple of 50
  coord_cartesian(xlim = c(0, 1700), ylim = c(0, roundUp50(count.max)), expand = FALSE) +
  theme(axis.title.x = element_text(vjust = 0.5, face = "bold", size = 12),
        axis.title.y = element_text(vjust = 1, face = "bold", size = 12),
        plot.title = element_text(size = 16, face = "bold", vjust = 2),
        axis.text.y = element_text(face = "bold", size = 9),
        axis.text.x = element_text(face = "bold", size = 9, hjust = 0.5, vjust = 0.25),
        panel.grid.major = element_blank(),
        legend.position = c(0.9, 0.8),
        legend.background = element_rect(size = 0.5, linetype = "solid", colour = "black"),
        legend.title = element_blank())
#display the plot on the active graphics device
g.plot

#write the figure to the working directory
ggsave("Figure 3.pdf", g.plot)

