library(ggrepel)
library(ggplot2)
library(jtools) # for summ()
library(interactions)
library(sjPlot) 
library(ggeffects) 


# Use the ggeffects theme as the default ggplot2 theme for every plot below.
# Further plot modifiers from sjPlot, such as legend_style() or font_size(),
# can still be added to individual plots.
theme_set(theme_ggeffects())

# Working directory: the ROH_<species>_density.out files are read from here
# and the output PNG is written here.
# Spaces inside an R string need no escaping; a backslash followed by a space
# is not a valid R escape sequence and stops the script from parsing.
setwd("/Users/Shared/Previously Relocated Items/Security/projects/2017_SEAsian_macaque_genomz/ROH")

# Species codes; each one has an input file named ROH_<code>_density.out.
pairwise_vector <- c("bor", "mau", "ton", "hec", "nig", "nge", "tog", "bru", "sum")

# Read each table into a data frame named after its species code
# (bor, mau, ..., sum) and rescale its ROH length column.
# length is divided by 100000, i.e. it ends up in units of 100 kb if the input
# lengths are in base pairs (TODO confirm the input unit).
# NOTE: the data frame called `sum` shares its name with base::sum(); function
# calls such as sum(x) still resolve to the function.
for (pair in pairwise_vector) {
  a <- read.table(paste0("ROH_", pair, "_density.out"), header = TRUE)
  a$length <- a$length / 100000
  assign(pair, a)
}


# bor: ROH length models ----
# Gene density of each ROH = gene count / rescaled ROH length.
bor$gene_density_on_ROHs <- bor$num_genes / bor$length
# Keep only the ROHs flagged as containing genes.
bor_allgenez <- bor[bor$containsgenes == 1, ]

# Fit the five models; the response is always log(length).
# All ROHs: presence/absence of genes.
bor_genes_or_not <- lm(formula = log(length) ~ containsgenes, data = bor)
# Genic ROHs only: gene density.
bor_number_of_genes <- lm(formula = log(length) ~ gene_density_on_ROHs, data = bor_allgenez)
# All ROHs: gene density.
bor_number_of_genes_all_ROHs <- lm(formula = log(length) ~ gene_density_on_ROHs, data = bor)
# Genic ROHs only: presence/absence of Ninteract genes.
bor_Ninteract <- lm(formula = log(length) ~ containsNinteractgenez, data = bor_allgenez)
# Genic ROHs only: Ninteract presence, gene density and their interaction.
bor_mod <- lm(formula = log(length) ~ containsNinteractgenez * gene_density_on_ROHs, data = bor_allgenez)

# Print the summaries; the notes are the original author's reading of them.
summary(bor_genes_or_not)              # ROHs with genes are significantly longer
summary(bor_number_of_genes)           # more genes, shorter ROHs
summary(bor_number_of_genes_all_ROHs)  # more genes, longer ROHs (but only because non-genic ROHs are very short)
summary(bor_Ninteract)                 # Ninteract present, longer ROHs
summary(bor_mod)
# Interaction is significant, with opposite signs for containsNinteractgenez
# and gene_density_on_ROHs: as gene density increases, the increase in ROH
# length due to Ninteract genes gets smaller.
  

# mau: ROH length models ----
# Gene density of each ROH = gene count / rescaled ROH length.
mau$gene_density_on_ROHs <- mau$num_genes / mau$length
# Keep only the ROHs flagged as containing genes.
mau_allgenez <- mau[mau$containsgenes == 1, ]

# Fit the five models; the response is always log(length).
# All ROHs: presence/absence of genes.
mau_genes_or_not <- lm(formula = log(length) ~ containsgenes, data = mau)
# Genic ROHs only: gene density.
mau_number_of_genes <- lm(formula = log(length) ~ gene_density_on_ROHs, data = mau_allgenez)
# All ROHs: gene density.
mau_number_of_genes_all_ROHs <- lm(formula = log(length) ~ gene_density_on_ROHs, data = mau)
# Genic ROHs only: presence/absence of Ninteract genes.
mau_Ninteract <- lm(formula = log(length) ~ containsNinteractgenez, data = mau_allgenez)
# Genic ROHs only: Ninteract presence, gene density and their interaction.
mau_mod <- lm(formula = log(length) ~ containsNinteractgenez * gene_density_on_ROHs, data = mau_allgenez)

# Print the summaries; the notes are the original author's reading of them.
summary(mau_genes_or_not)              # ROHs with genes are significantly longer
summary(mau_number_of_genes)           # more genes, shorter ROHs
summary(mau_number_of_genes_all_ROHs)  # more genes, longer ROHs (but only because non-genic ROHs are very short)
summary(mau_Ninteract)                 # Ninteract present, longer ROHs
summary(mau_mod)
# Interaction is significant, with opposite signs for containsNinteractgenez
# and gene_density_on_ROHs: as gene density increases, the increase in ROH
# length due to Ninteract genes gets smaller.
    

# ton: ROH length models ----
# Gene density of each ROH = gene count / rescaled ROH length.
ton$gene_density_on_ROHs <- ton$num_genes / ton$length
# Keep only the ROHs flagged as containing genes.
ton_allgenez <- ton[ton$containsgenes == 1, ]

# Fit the five models; the response is always log(length).
# All ROHs: presence/absence of genes.
ton_genes_or_not <- lm(formula = log(length) ~ containsgenes, data = ton)
# Genic ROHs only: gene density.
ton_number_of_genes <- lm(formula = log(length) ~ gene_density_on_ROHs, data = ton_allgenez)
# All ROHs: gene density.
ton_number_of_genes_all_ROHs <- lm(formula = log(length) ~ gene_density_on_ROHs, data = ton)
# Genic ROHs only: presence/absence of Ninteract genes.
ton_Ninteract <- lm(formula = log(length) ~ containsNinteractgenez, data = ton_allgenez)
# Genic ROHs only: Ninteract presence, gene density and their interaction.
ton_mod <- lm(formula = log(length) ~ containsNinteractgenez * gene_density_on_ROHs, data = ton_allgenez)

# Print the summaries; the notes are the original author's reading of them.
summary(ton_genes_or_not)              # ROHs with genes are significantly longer
summary(ton_number_of_genes)           # more genes, shorter ROHs
summary(ton_number_of_genes_all_ROHs)  # more genes, longer ROHs (but only because non-genic ROHs are very short)
summary(ton_Ninteract)                 # Ninteract present, longer ROHs
summary(ton_mod)
# Interaction is significant, with opposite signs for containsNinteractgenez
# and gene_density_on_ROHs: as gene density increases, the increase in ROH
# length due to Ninteract genes gets smaller.

    

        
# hec: ROH length models ----
# Gene density of each ROH = gene count / rescaled ROH length.
hec$gene_density_on_ROHs <- hec$num_genes / hec$length
# Keep only the ROHs flagged as containing genes.
hec_allgenez <- hec[hec$containsgenes == 1, ]

# Fit the five models; the response is always log(length).
# All ROHs: presence/absence of genes.
hec_genes_or_not <- lm(formula = log(length) ~ containsgenes, data = hec)
# Genic ROHs only: gene density.
hec_number_of_genes <- lm(formula = log(length) ~ gene_density_on_ROHs, data = hec_allgenez)
# All ROHs: gene density.
hec_number_of_genes_all_ROHs <- lm(formula = log(length) ~ gene_density_on_ROHs, data = hec)
# Genic ROHs only: presence/absence of Ninteract genes.
hec_Ninteract <- lm(formula = log(length) ~ containsNinteractgenez, data = hec_allgenez)
# Genic ROHs only: Ninteract presence, gene density and their interaction.
hec_mod <- lm(formula = log(length) ~ containsNinteractgenez * gene_density_on_ROHs, data = hec_allgenez)

# Print the summaries; the notes are the original author's reading of them.
summary(hec_genes_or_not)              # ROHs with genes are significantly longer
summary(hec_number_of_genes)           # more genes, shorter ROHs
summary(hec_number_of_genes_all_ROHs)  # more genes, longer ROHs (but only because non-genic ROHs are very short)
summary(hec_Ninteract)                 # Ninteract present, longer ROHs
summary(hec_mod)
# Interaction is significant, with opposite signs for containsNinteractgenez
# and gene_density_on_ROHs: as gene density increases, the increase in ROH
# length due to Ninteract genes gets smaller.
    
    
# nge: ROH length models ----
# Gene density of each ROH = gene count / rescaled ROH length.
nge$gene_density_on_ROHs <- nge$num_genes / nge$length
# Keep only the ROHs flagged as containing genes.
nge_allgenez <- nge[nge$containsgenes == 1, ]

# Fit the five models; the response is always log(length).
# All ROHs: presence/absence of genes.
nge_genes_or_not <- lm(formula = log(length) ~ containsgenes, data = nge)
# Genic ROHs only: gene density.
nge_number_of_genes <- lm(formula = log(length) ~ gene_density_on_ROHs, data = nge_allgenez)
# All ROHs: gene density.
nge_number_of_genes_all_ROHs <- lm(formula = log(length) ~ gene_density_on_ROHs, data = nge)
# Genic ROHs only: presence/absence of Ninteract genes.
nge_Ninteract <- lm(formula = log(length) ~ containsNinteractgenez, data = nge_allgenez)
# Genic ROHs only: Ninteract presence, gene density and their interaction.
nge_mod <- lm(formula = log(length) ~ containsNinteractgenez * gene_density_on_ROHs, data = nge_allgenez)

# Print the summaries; the notes are the original author's reading of them.
summary(nge_genes_or_not)              # ROHs with genes are significantly longer
summary(nge_number_of_genes)           # more genes, shorter ROHs
summary(nge_number_of_genes_all_ROHs)  # more genes, longer ROHs (but only because non-genic ROHs are very short)
summary(nge_Ninteract)                 # Ninteract present, longer ROHs
summary(nge_mod)
# Interaction is significant, with opposite signs for containsNinteractgenez
# and gene_density_on_ROHs: as gene density increases, the increase in ROH
# length due to Ninteract genes gets smaller.

    
# nig: ROH length models ----
# Gene density of each ROH = gene count / rescaled ROH length.
nig$gene_density_on_ROHs <- nig$num_genes / nig$length
# Keep only the ROHs flagged as containing genes.
nig_allgenez <- nig[nig$containsgenes == 1, ]

# Fit the five models; the response is always log(length).
# All ROHs: presence/absence of genes.
nig_genes_or_not <- lm(formula = log(length) ~ containsgenes, data = nig)
# Genic ROHs only: gene density.
nig_number_of_genes <- lm(formula = log(length) ~ gene_density_on_ROHs, data = nig_allgenez)
# All ROHs: gene density.
nig_number_of_genes_all_ROHs <- lm(formula = log(length) ~ gene_density_on_ROHs, data = nig)
# Genic ROHs only: presence/absence of Ninteract genes.
nig_Ninteract <- lm(formula = log(length) ~ containsNinteractgenez, data = nig_allgenez)
# Genic ROHs only: Ninteract presence, gene density and their interaction.
nig_mod <- lm(formula = log(length) ~ containsNinteractgenez * gene_density_on_ROHs, data = nig_allgenez)

# Print the summaries; the notes are the original author's reading of them.
summary(nig_genes_or_not)              # ROHs with genes are significantly longer
summary(nig_number_of_genes)           # more genes, shorter ROHs
summary(nig_number_of_genes_all_ROHs)  # more genes, longer ROHs (but only because non-genic ROHs are very short)
summary(nig_Ninteract)                 # Ninteract present, longer ROHs
summary(nig_mod)
# Interaction is significant, with opposite signs for containsNinteractgenez
# and gene_density_on_ROHs: as gene density increases, the increase in ROH
# length due to Ninteract genes gets smaller.
    
    
# bru: ROH length models ----
# Gene density of each ROH = gene count / rescaled ROH length.
bru$gene_density_on_ROHs <- bru$num_genes / bru$length
# Keep only the ROHs flagged as containing genes.
bru_allgenez <- bru[bru$containsgenes == 1, ]

# Fit the five models; the response is always log(length).
# All ROHs: presence/absence of genes.
bru_genes_or_not <- lm(formula = log(length) ~ containsgenes, data = bru)
# Genic ROHs only: gene density.
bru_number_of_genes <- lm(formula = log(length) ~ gene_density_on_ROHs, data = bru_allgenez)
# All ROHs: gene density.
bru_number_of_genes_all_ROHs <- lm(formula = log(length) ~ gene_density_on_ROHs, data = bru)
# Genic ROHs only: presence/absence of Ninteract genes.
bru_Ninteract <- lm(formula = log(length) ~ containsNinteractgenez, data = bru_allgenez)
# Genic ROHs only: Ninteract presence, gene density and their interaction.
bru_mod <- lm(formula = log(length) ~ containsNinteractgenez * gene_density_on_ROHs, data = bru_allgenez)

# Print the summaries; the notes are the original author's reading of them.
summary(bru_genes_or_not)              # ROHs with genes are significantly longer
summary(bru_number_of_genes)           # more genes, shorter ROHs
summary(bru_number_of_genes_all_ROHs)  # more genes, longer ROHs (but only because non-genic ROHs are very short)
summary(bru_Ninteract)                 # Ninteract present, longer ROHs
summary(bru_mod)
# Interaction is significant, with opposite signs for containsNinteractgenez
# and gene_density_on_ROHs: as gene density increases, the increase in ROH
# length due to Ninteract genes gets smaller.

# sum: ROH length models ----
# Gene density of each ROH = gene count / rescaled ROH length.
sum$gene_density_on_ROHs <- sum$num_genes / sum$length
# Keep only the ROHs flagged as containing genes.
sum_allgenez <- sum[sum$containsgenes == 1, ]

# Fit the five models; the response is always log(length).
# All ROHs: presence/absence of genes.
sum_genes_or_not <- lm(formula = log(length) ~ containsgenes, data = sum)
# Genic ROHs only: gene density.
sum_number_of_genes <- lm(formula = log(length) ~ gene_density_on_ROHs, data = sum_allgenez)
# All ROHs: gene density.
sum_number_of_genes_all_ROHs <- lm(formula = log(length) ~ gene_density_on_ROHs, data = sum)
# Genic ROHs only: presence/absence of Ninteract genes.
sum_Ninteract <- lm(formula = log(length) ~ containsNinteractgenez, data = sum_allgenez)
# Genic ROHs only: Ninteract presence, gene density and their interaction.
sum_mod <- lm(formula = log(length) ~ containsNinteractgenez * gene_density_on_ROHs, data = sum_allgenez)

# Print the summaries; the notes are the original author's reading of them.
summary(sum_genes_or_not)              # ROHs with genes are significantly longer
summary(sum_number_of_genes)           # more genes, shorter ROHs
summary(sum_number_of_genes_all_ROHs)  # more genes, longer ROHs (but only because non-genic ROHs are very short)
summary(sum_Ninteract)                 # Ninteract present, longer ROHs
summary(sum_mod)
# Interaction is significant, with opposite signs for containsNinteractgenez
# and gene_density_on_ROHs: as gene density increases, the increase in ROH
# length due to Ninteract genes gets smaller.
    

# tog: ROH length models ----
# Gene density of each ROH = gene count / rescaled ROH length.
tog$gene_density_on_ROHs <- tog$num_genes / tog$length
# Keep only the ROHs flagged as containing genes.
tog_allgenez <- tog[tog$containsgenes == 1, ]

# Fit the five models; the response is always log(length).
# All ROHs: presence/absence of genes.
tog_genes_or_not <- lm(formula = log(length) ~ containsgenes, data = tog)
# Genic ROHs only: gene density.
tog_number_of_genes <- lm(formula = log(length) ~ gene_density_on_ROHs, data = tog_allgenez)
# All ROHs: gene density.
tog_number_of_genes_all_ROHs <- lm(formula = log(length) ~ gene_density_on_ROHs, data = tog)
# Genic ROHs only: presence/absence of Ninteract genes.
tog_Ninteract <- lm(formula = log(length) ~ containsNinteractgenez, data = tog_allgenez)
# Genic ROHs only: Ninteract presence, gene density and their interaction.
tog_mod <- lm(formula = log(length) ~ containsNinteractgenez * gene_density_on_ROHs, data = tog_allgenez)

# Print the summaries; the notes are the original author's reading of them.
summary(tog_genes_or_not)              # ROHs with genes are significantly longer
summary(tog_number_of_genes)           # more genes, shorter ROHs
summary(tog_number_of_genes_all_ROHs)  # more genes, longer ROHs (but only because non-genic ROHs are very short)
summary(tog_Ninteract)                 # Ninteract present, longer ROHs
summary(tog_mod)
# Interaction is significant, with opposite signs for containsNinteractgenez
# and gene_density_on_ROHs: as gene density increases, the increase in ROH
# length due to Ninteract genes gets smaller.
    

# plot predicted values

# Builds one panel of model predictions against gene density, grouped by
# containsNinteractgenez and drawn in blue and red, with legend, plot title
# and grid lines switched off. The x axis is always limited to 0-50.
#   model         fitted interaction model (<species>_mod)
#   x_title       x-axis name handed to scale_x_continuous()
#   y_title       y-axis name handed to scale_y_continuous()
#   show_x_title  forwarded to plot() as show.x.title
#   y_limits      limits for scale_y_continuous(); NULL keeps the default
#   y_breaks      breaks for scale_y_continuous(); waiver() keeps the default
# Returns the ggplot object.
predicted_length_panel <- function(model, x_title = "", y_title = "",
                                   show_x_title = FALSE, y_limits = NULL,
                                   y_breaks = waiver()) {
  predicted <- ggpredict(model, c("gene_density_on_ROHs", "containsNinteractgenez"))
  # plot() is called directly rather than through a pipe, so the script does
  # not depend on some attached package providing a pipe operator.
  plot(predicted,
       show.y.title = FALSE, show.title = FALSE,
       show.x.title = show_x_title,
       show.legend = FALSE,
       colors = c("blue", "red"),
       limit.range = FALSE) +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
    scale_x_continuous(name = x_title, limits = c(0, 50)) +
    scale_y_continuous(name = y_title, limits = y_limits, breaks = y_breaks) +
    theme(axis.text.x = element_text(size = 14),
          axis.title.x = element_text(size = 16),
          axis.text.y = element_text(size = 14),
          axis.title.y = element_text(size = 16))
}

# One panel per species. Axis titles differ on purpose: only ton carries the
# y-axis title and only bru the x-axis title.
bor_plot <- predicted_length_panel(bor_mod, y_title = " ",
                                   y_limits = c(0, 2), y_breaks = c(0, 1, 2))
mau_plot <- predicted_length_panel(mau_mod)
ton_plot <- predicted_length_panel(ton_mod, y_title = "ROH length (100kb)")
hec_plot <- predicted_length_panel(hec_mod)
nig_plot <- predicted_length_panel(nig_mod)
nge_plot <- predicted_length_panel(nge_mod, y_title = " ")
tog_plot <- predicted_length_panel(tog_mod, x_title = " ", y_title = " ",
                                   show_x_title = TRUE)
bru_plot <- predicted_length_panel(bru_mod, x_title = "Genes/100kb",
                                   show_x_title = TRUE)
sum_plot <- predicted_length_panel(sum_mod, y_limits = c(0, 3))

# Disabled legend styling that was drafted for sum_plot (kept for reference):
#   sum_plot +
#     # remove legend key border color & background
#     scale_color_manual(labels = c("Other genes", "Ninteract"), values = c("blue", "red")) +
#     theme(legend.key = element_blank()) +
#     theme(legend.box.background = element_blank()) +
#     theme(legend.title = element_blank()) +
#     theme(legend.text = element_text(size = 16)) +
#     theme(legend.position = c(0.8, 0.6)) +
#     theme(legend.background = element_rect(color = NA))
#     # theme(legend.background = element_rect(fill = alpha("white", 0)))
      
    
    
# Write the nine predicted-value panels to one PNG (250 x 200 mm at 300 dpi).
# Tags A-I follow the order of the list: bor, sum, mau, ton, hec, nge, nig,
# bru, tog.
png("Predicted_values_ROH.png", width = 250, height = 200, units = "mm", res = 300)
predicted_grid <- plot_grid(list(bor_plot, sum_plot, mau_plot,
                                 ton_plot, hec_plot, nge_plot,
                                 nig_plot, bru_plot, tog_plot),
                            tags = c("A", "B",
                                     "C", "D",
                                     "E", "F",
                                     "G", "H",
                                     "I"),
                            # alternative tags, one species code per panel:
                            #tags = c("bor","mau",
                            #         "ton","hec",
                            #         "nig","nge",
                            #         "tog","bru",
                            #         "sum"),
                            margin = c(0.1, 0.1, 0.1, 0.1)) +
  theme_bw()
# Print explicitly: top-level auto-printing does not happen when the script is
# run through source(), which would leave the PNG empty.
print(predicted_grid)
dev.off()


# Distribution of gene density over the genic ROHs of all nine species pooled.
all_data_allgenez <- rbind(
  bor_allgenez, mau_allgenez, ton_allgenez,
  hec_allgenez, nig_allgenez, nge_allgenez,
  bru_allgenez, tog_allgenez, sum_allgenez
)
# Histogram with unit-wide bins from 0 to 152, showing only densities 0-30.
hist(all_data_allgenez$gene_density_on_ROHs,
     xlim = c(0, 30),
     breaks = seq(0, 152, by = 1),
     warn.unused = F)

# Proportion of genic ROHs with a gene density above 20.
with(
  all_data_allgenez,
  length(gene_density_on_ROHs[gene_density_on_ROHs > 20]) / length(gene_density_on_ROHs)
)

# Proportion of genic ROHs with a gene density below 5.
with(
  all_data_allgenez,
  length(gene_density_on_ROHs[gene_density_on_ROHs < 5]) / length(gene_density_on_ROHs)
)

    
      
    
# Per-species summaries of the genic ROHs, printed as tables with one entry
# per <species>_allgenez data frame.
genic_roh_names <- paste0(
  c("bor", "mau", "ton", "hec", "nig", "nge", "bru", "tog", "sum"),
  "_allgenez"
)
genic_rohs <- mget(genic_roh_names, envir = globalenv())

# what is the maximum number of Ninteract genes on an ROH in each species?
vapply(genic_rohs, function(rohs) max(rohs$num_ninteractgenes), numeric(1))

# what is the mean gene density of genic ROHs with (containsNinteractgenez == 1)
# and without (== 0) Ninteract genes in each species?
t(vapply(genic_rohs, function(rohs) {
  has_ninteract <- rohs$containsNinteractgenez
  c(with_Ninteract = mean(rohs$gene_density_on_ROHs[has_ninteract == 1], na.rm = TRUE),
    without_Ninteract = mean(rohs$gene_density_on_ROHs[has_ninteract == 0], na.rm = TRUE))
}, numeric(2)))
    
    
# what is the distribution of gene_density_on_ROHs on Ninteract and non-Ninteract ROHs in each species?

# Largest gene density among the ROHs with (containsNinteractgenez == 1) and
# without (== 0) Ninteract genes, as a named vector of two values.
max_density_by_ninteract <- function(genic_rohs) {
  gene_density <- genic_rohs$gene_density_on_ROHs
  has_ninteract <- genic_rohs$containsNinteractgenez
  c(with_Ninteract = max(gene_density[has_ninteract == 1], na.rm = TRUE),
    without_Ninteract = max(gene_density[has_ninteract == 0], na.rm = TRUE))
}

# Turns a 0/1 flag into an ordered factor with levels "0" < "1".
ninteract_factor <- function(flag) {
  factor(flag, levels = c("0", "1"), ordered = TRUE)
}

# Density curves of gene density, one per level of group_f, coloured blue and
# red in level order. geom_density() and stat_density(geom = "line") both draw
# the density of the same variable; both layers are kept as in the original
# plots. Expects a group_f column. Returns the ggplot object.
density_by_ninteract_plot <- function(genic_rohs) {
  ggplot(genic_rohs) +
    geom_density(aes(x = gene_density_on_ROHs, colour = group_f)) +
    stat_density(aes(x = gene_density_on_ROHs, colour = group_f),
                 geom = "line", position = "identity") +
    scale_color_manual(values = c("blue", "red")) +
    theme_bw()
}

# For each species: print the two maxima, add the group_f column to the
# <species>_allgenez data frame, and draw the density plot.
max_density_by_ninteract(bor_allgenez)
bor_allgenez$group_f <- ninteract_factor(bor_allgenez$containsNinteractgenez)
density_by_ninteract_plot(bor_allgenez)

max_density_by_ninteract(mau_allgenez)
mau_allgenez$group_f <- ninteract_factor(mau_allgenez$containsNinteractgenez)
density_by_ninteract_plot(mau_allgenez)

max_density_by_ninteract(ton_allgenez)
ton_allgenez$group_f <- ninteract_factor(ton_allgenez$containsNinteractgenez)
density_by_ninteract_plot(ton_allgenez)

max_density_by_ninteract(hec_allgenez)
hec_allgenez$group_f <- ninteract_factor(hec_allgenez$containsNinteractgenez)
density_by_ninteract_plot(hec_allgenez)

max_density_by_ninteract(nig_allgenez)
nig_allgenez$group_f <- ninteract_factor(nig_allgenez$containsNinteractgenez)
density_by_ninteract_plot(nig_allgenez)

max_density_by_ninteract(nge_allgenez)
nge_allgenez$group_f <- ninteract_factor(nge_allgenez$containsNinteractgenez)
density_by_ninteract_plot(nge_allgenez)

max_density_by_ninteract(bru_allgenez)
bru_allgenez$group_f <- ninteract_factor(bru_allgenez$containsNinteractgenez)
density_by_ninteract_plot(bru_allgenez)

max_density_by_ninteract(tog_allgenez)
tog_allgenez$group_f <- ninteract_factor(tog_allgenez$containsNinteractgenez)
density_by_ninteract_plot(tog_allgenez)

max_density_by_ninteract(sum_allgenez)
sum_allgenez$group_f <- ninteract_factor(sum_allgenez$containsNinteractgenez)
density_by_ninteract_plot(sum_allgenez)
    
    
# Author's note: every species has at most 4 Ninteract genes on a single ROH,
# except mau, which has one ROH with 7 Ninteract genes on chr 19.
# Distinct Ninteract gene counts seen on genic mau ROHs:
unique(mau_allgenez$num_ninteractgenes)
# The mau ROH(s) carrying 7 Ninteract genes:
mau_allgenez[mau_allgenez$num_ninteractgenes == 7, ]

library(emmeans)
# Estimated slope (trend) of the response against gene_density_on_ROHs for
# the values of containsNinteractgenez in tog_mod. The reported "trend" is a
# slope, not an intercept.
emtrends(tog_mod, specs = ~ containsNinteractgenez, var = "gene_density_on_ROHs")
# Same slopes, followed by the pairwise contrast between them (printed last).
emtrends(tog_mod, specs = pairwise ~ containsNinteractgenez, var = "gene_density_on_ROHs")
# Interaction plot of the model predictions at gene densities 0, 10, ..., 80,
# with confidence intervals.
mylist <- list(gene_density_on_ROHs = seq(0, 80, by = 10))
emmip(tog_mod, containsNinteractgenez ~ gene_density_on_ROHs, at = mylist, CIs = TRUE)

library(sjPlot)
library(sjmisc)
# sjPlot view of the interaction term of tog_mod.
plot_model(tog_mod, type = "int")

library(effects)
# effects view of all model effects: one line per group, confidence bands.
plot(allEffects(tog_mod), multiline = TRUE, ci.style = "bands")
    
# Scatter plots of log(ROH length) against gene density for genic ROHs, with
# the two fitted lines of the interaction model drawn on top
# (blue: containsNinteractgenez == 0, red: containsNinteractgenez == 1).

# Draws one such plot with base graphics.
#   genic_rohs  data frame of genic ROHs (<species>_allgenez)
#   model       the matching <species>_mod, i.e.
#               lm(log(length) ~ containsNinteractgenez * gene_density_on_ROHs)
#   ...         further arguments for plot(), e.g. ylim
# The model is fitted to log(length), so the points are plotted on the log
# scale as well; otherwise the fitted lines would not match the points.
plot_log_length_vs_density <- function(genic_rohs, model, ...) {
  without_ninteract <- genic_rohs[genic_rohs$containsNinteractgenez == 0, ]
  with_ninteract <- genic_rohs[genic_rohs$containsNinteractgenez == 1, ]
  # Coefficient order: (Intercept), containsNinteractgenez,
  # gene_density_on_ROHs, containsNinteractgenez:gene_density_on_ROHs.
  beta <- unname(coef(model))

  # The axis ranges are set by the ROHs without Ninteract genes.
  plot(x = without_ninteract$gene_density_on_ROHs,
       y = log(without_ninteract$length),
       col = rgb(red = 0, green = 0, blue = 1, alpha = 0.25), pch = 19,
       xlab = "Density", ylab = "log(ROH length)", ...)
  abline(a = beta[1], b = beta[3], col = "blue", lwd = 2)
  points(x = with_ninteract$gene_density_on_ROHs,
         y = log(with_ninteract$length),
         col = rgb(red = 1, green = 0, blue = 0, alpha = 0.25), pch = 19)
  abline(a = beta[1] + beta[2], b = beta[3] + beta[4], col = "red", lwd = 2)
  invisible(NULL)
}

plot_log_length_vs_density(tog_allgenez, tog_mod)
plot_log_length_vs_density(sum_allgenez, sum_mod)
# NOTE(review): ylim = c(0, 20) hides ROHs whose log(length) is below 0, if
# there are any - confirm this range is intended for a log-scale axis.
plot_log_length_vs_density(bor_allgenez, bor_mod, ylim = c(0, 20))

library(ggplot2)
# Line plot of ROH length against gene density for tog_allgenez, with one line
# per containsNinteractgenez group; the plot is stored in plotz and then shown.
plotz <- ggplot(
  data = tog_allgenez,
  mapping = aes(x = gene_density_on_ROHs, y = length, group = containsNinteractgenez)
) +
  # optional: coord_cartesian(ylim = c(0,4)) to restrict y to the range of the
  # dependent variable (0-4 in the example this was taken from)
  geom_line(mapping = aes(color = containsNinteractgenez), size = 2) +
  labs(
    title = "Density and Ninteract as Predictors of ROH_Length",
    x = "Density",
    y = "Length"
  ) +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  scale_fill_grey()
plotz
    
    
        
    
# Split the hec rows into three groups by their gene flags.
# rows with containsgenes == 0
hec_no_genez <- hec[hec$containsgenes == 0, ]
# rows with containsNinteractgenez == 1
hec_Ninteract <- hec[hec$containsNinteractgenez == 1, ]
# rows with containsgenes == 1 but containsNinteractgenez == 0
hec_nonNinteract <- hec[hec$containsgenes == 1 & hec$containsNinteractgenez == 0, ]



# Fit three models of ROH length for hec. Each assignment overwrites hec_mod,
# so only the last (interaction) model is available to the plotting code below.

# 1) Ninteract flag alone (glm called without a family argument)
hec_mod <- glm(length ~ containsNinteractgenez, data=hec)
summary(hec_mod)

# 2) gene density alone (glm called without a family argument)
hec_mod <- glm(length ~ gene_density_on_ROHs, data=hec)
summary(hec_mod)

# 3) lm with an interaction term
hec_mod <- lm(length ~ containsNinteractgenez*gene_density_on_ROHs, data=hec)
summary(hec_mod)

plot(hec$gene_density_on_ROHs, hec$length,
     pch = 20,
     col = "steelblue",
     main = "Different Intercepts, Different Slopes")

# The coefficients follow the order of the terms in the formula
# containsNinteractgenez*gene_density_on_ROHs:
#   [1] intercept
#   [2] containsNinteractgenez           (shift of the intercept)
#   [3] gene_density_on_ROHs             (slope against density)
#   [4] containsNinteractgenez:gene_density_on_ROHs (change of the slope)
# so the slope is coefficient 3, not coefficient 2.
# (assumes containsNinteractgenez enters the model as a single 0/1 column)
hec_coef <- unname(coef(hec_mod))

# line for containsNinteractgenez == 0
abline(a = hec_coef[1], b = hec_coef[3],
       col = "red",
       lwd = 1.5)

# line for containsNinteractgenez == 1
abline(a = hec_coef[1] + hec_coef[2], b = hec_coef[3] + hec_coef[4],
       col = "green",
       lwd = 1.5)



# Scatter plot of ROH length against gene density (x multiplied by 10000) for
# hec, built from three point layers: hec_Ninteract, hec_nonNinteract and
# hec_no_genez. The plot is only stored in hec_plot here, not printed, and
# hec_plot is assigned again in the #hec section further down.
# NOTE(review): every subset() below filters on a `color` column; nothing in
# the visible part of the script adds `color` to hec or its subsets -- confirm
# the column exists, otherwise these subset() calls cannot be evaluated.
# NOTE(review): the drawing colours do not match the filter values (rows
# filtered as "pink" are drawn gray, "gray" rows pink, "black" rows red) --
# confirm this is intended.
hec_plot <- ggplot(hec, aes(x = gene_density_on_ROHs*10000, y = length)) +
  # earlier attempts at adding regression lines, kept for reference:
  #geom_smooth(subset(hec_Ninteract), method=lm, se=T, fullrange=TRUE) +
  #geom_point()
  #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
  #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
  # rows with containsNinteractgenez == 1
  geom_point(data = subset(hec_Ninteract, color == "pink"),
             aes(x = gene_density_on_ROHs*10000, y = length), color = 'gray') +
  # rows with genes but containsNinteractgenez == 0
  geom_point(data = subset(hec_nonNinteract, color == 'gray'),
             aes(x = gene_density_on_ROHs*10000, y = length), color = 'pink') +
  # rows with containsgenes == 0
  geom_point(data = subset(hec_no_genez, color == 'black'),
             aes(x = gene_density_on_ROHs*10000, y = length), color = 'red')



# Scatter of ROH length against gene density for hec_allgenez (presumably the
# hec rows that contain genes), with one fitted line per
# containsNinteractgenez group.
plot(hec_allgenez$gene_density_on_ROHs, hec_allgenez$length,
     main = "Different Intercepts, Different Slopes",
     col = "steelblue",
     pch = 20)

# Coefficients in formula order:
#   [1] intercept, [2] density slope, [3] Ninteract shift, [4] density:Ninteract
mod2_coef <- coef(
  lm(hec_allgenez$length ~ hec_allgenez$gene_density_on_ROHs + hec_allgenez$containsNinteractgenez + hec_allgenez$gene_density_on_ROHs:hec_allgenez$containsNinteractgenez)
)

# line for containsNinteractgenez == 0
abline(a = mod2_coef[1], b = mod2_coef[2], col = "red", lwd = 1.5)

# line for containsNinteractgenez == 1
abline(a = mod2_coef[1] + mod2_coef[3], b = mod2_coef[2] + mod2_coef[4], col = "green", lwd = 1.5)



# Scatter of ROH length against gene density (x multiplied by 10000) for
# hec_allgenez, coloured and shaped by containsNinteractgenez, with a separate
# lm smoother drawn from hec_nonNinteract and from hec_Ninteract.
ggplot(hec_allgenez, aes(x = gene_density_on_ROHs*10000, y = length)) +
  geom_point(
    aes(colour = containsNinteractgenez, shape = as.factor(containsNinteractgenez)),
    alpha = 5 / 6,
    size = 3
  ) +
  # one smoother per group, each fitted to its own data frame
  geom_smooth(
    data = hec_nonNinteract,
    mapping = aes(colour = containsNinteractgenez),
    method = "lm"
  ) +
  geom_smooth(
    data = hec_Ninteract,
    mapping = aes(colour = containsNinteractgenez),
    method = "lm"
  ) +
  theme_classic()






# Fit the same model to five taxa: ROH length as a function of
# containsNinteractgenez, gene_density_on_ROHs and their interaction, using
# glm() without a family argument, and show each summary.
# Each model is fit on the full per-taxon data frame (nig, tog, nge, bru, sum),
# not on a *_allgenez subset.
# NOTE(review): tog_mod and sum_mod are already used by plotting code earlier
# in the script, so these assignments presumably replace earlier fits --
# confirm which fit each plot is meant to use.
nig_mod <- glm(length ~ containsNinteractgenez*gene_density_on_ROHs, data=nig)
summary(nig_mod)
tog_mod <- glm(length ~ containsNinteractgenez*gene_density_on_ROHs, data=tog)
summary(tog_mod)
nge_mod <- glm(length ~ containsNinteractgenez*gene_density_on_ROHs, data=nge)
summary(nge_mod)
bru_mod <- glm(length ~ containsNinteractgenez*gene_density_on_ROHs, data=bru)
summary(bru_mod)
# `sum` here is the data frame loaded for the "sum" taxon, not base::sum()
sum_mod <- glm(length ~ containsNinteractgenez*gene_density_on_ROHs, data=sum)
summary(sum_mod)


#nem
    # NOTE(review): `nem` is not among the data frames loaded by the
    # pairwise_vector loop at the top of this file -- presumably it is created
    # elsewhere; verify before running this section.
    # keep only the rows that contain genes
    my_data_only_genez <- nem[nem$containsgenes == 1,]
    # What are the names of the Ninteract genes with no polymorphism (FW_H = 'NA')
    my_data_only_genez$Ninteract_acronym[is.na(my_data_only_genez$FW_H) &
                                           (my_data_only_genez$containsNinteractgenez == 1)]
    # none for nem

    # explore relationship between FW_H and number of genes in Ninteract windows
    my_data_only_nonNinteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0,]
    my_data_only_Ninteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1,]
    my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[complete.cases(my_data_only_nonNinteractgenez),]

    # drop every row with a missing value in any column before fitting
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez),]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(FW_H ~ number_of_genes, data=my_data_only_genez)
    # get the fitted values (y = mx+b)
    fitted <- mod$coefficients[2]*my_data_only_genez$number_of_genes + mod$coefficients[1]
    #cbind fitted to data
    my_data_only_genez <- cbind(my_data_only_genez,fitted)

    # calculate cooks d for all data
    cooksd <- cooks.distance(mod)
    #cbind cooksd to data
    my_data_only_genez <- cbind(my_data_only_genez,cooksd)

    # row masks used below, computed once
    # a row is treated as an outlier when its Cook's distance is at least
    # four times the mean Cook's distance
    cooksd_cutoff <- 4 * mean(cooksd, na.rm = TRUE)
    is_outlier <- cooksd >= cooksd_cutoff
    is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
    is_nonNinteract <- my_data_only_genez$containsNinteractgenez == 0
    # TRUE when the observed FW_H lies below the fitted line
    is_below_fit <- my_data_only_genez$FW_H < my_data_only_genez$fitted

    # make a color column for non-Ninteract and Ninteract windows
    my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
    # make an alpha column for non-Ninteract and Ninteract windows
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # mark the rows that cooksd suggests are outliers, but only for Ninteract genes
    my_data_only_genez$color[is_outlier & is_Ninteract] <- "red"
    # now color the ones that are below the fitted line blue
    my_data_only_genez$color[is_outlier & is_below_fit & is_Ninteract] <- "blue"

    # get some numbers
    # What is the expected proportion of lower outliers
    # (non-Ninteract rows with high cooksD that lie below the fitted line)
    nrow(my_data_only_genez[is_outlier & is_nonNinteract & is_below_fit,])/
      nrow(my_data_only_genez[is_nonNinteract,])
    #  0.02484882
    # What is the observed proportion of lower outliers
    # (Ninteract rows with high cooksD that lie below the fitted line)
    nrow(my_data_only_genez[is_outlier & is_Ninteract & is_below_fit,])/
      nrow(my_data_only_genez[is_Ninteract,])
    #  0.06862745

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[is_outlier & is_Ninteract & is_below_fit]

    #  UQCRH    MRPL55   NDUFAF3  ATP5J2   CYC1     NDUFS8   C11orf83 NDUFA4L2 COX5A    SCO2     MRPL28   MRPS34   PET100  
   # COX6B1           COX6B1 


    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # now plot the data with the color representing outliers for N_interact genes
    # (the plot is stored in nem_plot; it is not printed here)
    # NOTE(review): rows flagged "blue" (outliers below the fitted line) are
    # drawn in red, the same as the "red" rows -- confirm this is intended.
    # NOTE(review): the tag reads "bor" although this section uses `nem` and
    # stores nem_plot -- confirm which label is intended.
    nem_plot <- ggplot(my_data_only_genez, aes(x = number_of_genes, y = FW_H, color=color, fill=color)) +
      geom_smooth(data = my_data_only_genez, method=lm, se=TRUE, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha), color = 'gray') +
      geom_point(data = subset(my_data_only_genez, color == 'pink'),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha), color = 'pink') +
      geom_point(data = subset(my_data_only_genez, color == 'red'),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha),color = 'red') +
      geom_point(data = subset(my_data_only_genez, color == 'blue'),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha),color = 'red') +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#                       ) +
      xlim(0,16) + scale_y_continuous(limits = c(-8,6), breaks = c(-6.0,-3.0,0,3.0,6.0)) + 
      labs(x = element_blank(), y="H", tag = "bor") +
      theme_classic(base_size=16) + theme(legend.position = "none") +
      theme(axis.text.x=element_blank()) + #theme(axis.text.y=element_blank()) +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
    
   
#mau
    # keep only the rows that contain genes
    my_data_only_genez <- mau[mau$containsgenes == 1,]

    # What are the names of the Ninteract genes with no polymorphism (FW_H = 'NA')
    my_data_only_genez$Ninteract_acronym[is.na(my_data_only_genez$FW_H) &
                                           (my_data_only_genez$containsNinteractgenez == 1)]
    # ACAD9

    # explore relationship between FW_H and number of genes in Ninteract windows
    my_data_only_nonNinteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0,]
    my_data_only_Ninteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1,]
    my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[complete.cases(my_data_only_nonNinteractgenez),]

    # drop every row with a missing value in any column before fitting
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez),]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(FW_H ~ number_of_genes, data=my_data_only_genez)
    # get the fitted values (y = mx+b)
    fitted <- mod$coefficients[2]*my_data_only_genez$number_of_genes + mod$coefficients[1]
    #cbind fitted to data
    my_data_only_genez <- cbind(my_data_only_genez,fitted)

    # calculate cooks d for all data
    cooksd <- cooks.distance(mod)
    #cbind cooksd to data
    my_data_only_genez <- cbind(my_data_only_genez,cooksd)

    # row masks used below, computed once
    # a row is treated as an outlier when its Cook's distance is at least
    # four times the mean Cook's distance
    cooksd_cutoff <- 4 * mean(cooksd, na.rm = TRUE)
    is_outlier <- cooksd >= cooksd_cutoff
    is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
    is_nonNinteract <- my_data_only_genez$containsNinteractgenez == 0
    # TRUE when the observed FW_H lies below the fitted line
    is_below_fit <- my_data_only_genez$FW_H < my_data_only_genez$fitted

    # make a column to specify whether a gene is an Ninteract gene or not
    my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # mark the rows that cooksd suggests are outliers, but only for Ninteract genes
    my_data_only_genez$color[is_outlier & is_Ninteract] <- "red"
    # now color the ones that are below the fitted line blue
    my_data_only_genez$color[is_outlier & is_below_fit & is_Ninteract] <- "blue"

    # get some numbers
    # What is the expected proportion of lower outliers
    # (non-Ninteract rows with high cooksD that lie below the fitted line)
    nrow(my_data_only_genez[is_outlier & is_nonNinteract & is_below_fit,])/
      nrow(my_data_only_genez[is_nonNinteract,])
    #  0.02516556
    # What is the observed proportion of lower outliers
    # (Ninteract rows with high cooksD that lie below the fitted line)
    nrow(my_data_only_genez[is_outlier & is_Ninteract & is_below_fit,])/
      nrow(my_data_only_genez[is_Ninteract,])
    #  0.07881773

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[is_outlier & is_Ninteract & is_below_fit]

    #  MRPL55        MRPL53        MRPL30        NDUFAF3       UQCRQ         MRPL18        ATP5J2        C11orf83     
    # MRPL40        MRPS34        EARS2,NDUFAB1 COA3          MRPL38        NDUFS7        COX6B1        ATP5SL   


    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # now plot the data with the color representing outliers for N_interact genes
    # (the plot is stored in mau_plot; it is not printed here)
    # NOTE(review): rows flagged "blue" (outliers below the fitted line) are
    # drawn in red, the same as the "red" rows -- confirm this is intended.
    mau_plot <- ggplot(my_data_only_genez, aes(x = number_of_genes, y = FW_H, color=color, fill=color)) +
      geom_smooth(data = my_data_only_genez, method=lm, se=TRUE, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha), color = 'gray') +
      geom_point(data = subset(my_data_only_genez, color == 'pink'),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha), color = 'pink') +
      geom_point(data = subset(my_data_only_genez, color == 'red'),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha),color = 'red') +
      geom_point(data = subset(my_data_only_genez, color == 'blue'),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha),color = 'red') +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#      ) +
      xlim(0,16) + scale_y_continuous(limits = c(-8,6), breaks = c(-6.0,-3.0,0,3.0,6.0)) +
      labs(x = element_blank(), y="H", tag = "mau") +
      theme_classic(base_size=16) + theme(legend.position = "none") +
      theme(axis.text.x=element_blank()) + #theme(axis.text.y=element_blank()) +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
    
#ton
    # keep only the rows that contain genes
    my_data_only_genez <- ton[ton$containsgenes == 1,]

    # What are the names of the Ninteract genes with no polymorphism (FW_H = 'NA')
    my_data_only_genez$Ninteract_acronym[is.na(my_data_only_genez$FW_H) &
                                           (my_data_only_genez$containsNinteractgenez == 1)]

    # NDUFAF3    GADD45GIP1


    # explore relationship between FW_H and number of genes in Ninteract windows
    my_data_only_nonNinteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0,]
    my_data_only_Ninteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1,]
    my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[complete.cases(my_data_only_nonNinteractgenez),]

    # drop every row with a missing value in any column before fitting
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez),]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(FW_H ~ number_of_genes, data=my_data_only_genez)
    # get the fitted values (y = mx+b)
    fitted <- mod$coefficients[2]*my_data_only_genez$number_of_genes + mod$coefficients[1]
    #cbind fitted to data
    my_data_only_genez <- cbind(my_data_only_genez,fitted)

    # calculate cooks d for all data
    cooksd <- cooks.distance(mod)
    #cbind cooksd to data
    my_data_only_genez <- cbind(my_data_only_genez,cooksd)

    # row masks used below, computed once
    # a row is treated as an outlier when its Cook's distance is at least
    # four times the mean Cook's distance
    cooksd_cutoff <- 4 * mean(cooksd, na.rm = TRUE)
    is_outlier <- cooksd >= cooksd_cutoff
    is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
    is_nonNinteract <- my_data_only_genez$containsNinteractgenez == 0
    # TRUE when the observed FW_H lies below the fitted line
    is_below_fit <- my_data_only_genez$FW_H < my_data_only_genez$fitted

    # make a column to specify whether a gene is an Ninteract gene or not
    my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # mark the rows that cooksd suggests are outliers, but only for Ninteract genes
    my_data_only_genez$color[is_outlier & is_Ninteract] <- "red"
    # now color the ones that are below the fitted line blue
    my_data_only_genez$color[is_outlier & is_below_fit & is_Ninteract] <- "blue"

    # get some numbers
    # What is the expected proportion of lower outliers
    # (non-Ninteract rows with high cooksD that lie below the fitted line)
    nrow(my_data_only_genez[is_outlier & is_nonNinteract & is_below_fit,])/
      nrow(my_data_only_genez[is_nonNinteract,])
    #  0.02562974
    # What is the observed proportion of lower outliers
    # (Ninteract rows with high cooksD that lie below the fitted line)
    nrow(my_data_only_genez[is_outlier & is_Ninteract & is_below_fit,])/
      nrow(my_data_only_genez[is_Ninteract,])
    #  0.05445545

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[is_outlier & is_Ninteract & is_below_fit]

#  ATP5F1   MRPS21   NDUFA2   SURF1    MRPS2    C11orf83 MRPL17   FOXRED1  MRPL10   MRPL34   COX6B1  

    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # now plot the data with the color representing outliers for N_interact genes
    # (the plot is stored in ton_plot; it is not printed here)
    # NOTE(review): rows flagged "blue" (outliers below the fitted line) are
    # drawn in red, the same as the "red" rows -- confirm this is intended.
    ton_plot <- ggplot(my_data_only_genez, aes(x = number_of_genes, y = FW_H, color=color, fill=color)) +
      geom_smooth(data = my_data_only_genez, method=lm, se=TRUE, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha), color = 'gray') +
      geom_point(data = subset(my_data_only_genez, color == 'pink'),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha), color = 'pink') +
      geom_point(data = subset(my_data_only_genez, color == 'red'),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha),color = 'red') +
      geom_point(data = subset(my_data_only_genez, color == 'blue'),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha),color = 'red') +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#      ) +
      xlim(0,16) + scale_y_continuous(limits = c(-8,6), breaks = c(-6.0,-3.0,0,3.0,6.0)) +
      labs(x = element_blank(), y="H", tag = "ton") +
      theme_classic(base_size=16) + theme(legend.position = "none") +
      theme(axis.text.x=element_blank()) + #theme(axis.text.y=element_blank()) +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
    
#hec
    # keep only the rows that contain genes
    my_data_only_genez <- hec[hec$containsgenes == 1,]
    # What are the names of the Ninteract genes with no polymorphism (FW_H = 'NA')
    my_data_only_genez$Ninteract_acronym[is.na(my_data_only_genez$FW_H) &
                                           (my_data_only_genez$containsNinteractgenez == 1)]

    # SYNJ2BP-COX16 MRPS26


    # explore relationship between FW_H and number of genes in Ninteract windows
    my_data_only_nonNinteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0,]
    my_data_only_Ninteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1,]
    my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[complete.cases(my_data_only_nonNinteractgenez),]

    # drop every row with a missing value in any column before fitting
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez),]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(FW_H ~ number_of_genes, data=my_data_only_genez)
    # get the fitted values (y = mx+b)
    fitted <- mod$coefficients[2]*my_data_only_genez$number_of_genes + mod$coefficients[1]
    #cbind fitted to data
    my_data_only_genez <- cbind(my_data_only_genez,fitted)

    # calculate cooks d for all data
    cooksd <- cooks.distance(mod)
    #cbind cooksd to data
    my_data_only_genez <- cbind(my_data_only_genez,cooksd)

    # row masks used below, computed once
    # a row is treated as an outlier when its Cook's distance is at least
    # four times the mean Cook's distance
    cooksd_cutoff <- 4 * mean(cooksd, na.rm = TRUE)
    is_outlier <- cooksd >= cooksd_cutoff
    is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
    is_nonNinteract <- my_data_only_genez$containsNinteractgenez == 0
    # TRUE when the observed FW_H lies below the fitted line
    is_below_fit <- my_data_only_genez$FW_H < my_data_only_genez$fitted

    # make a column to specify whether a gene is an Ninteract gene or not
    my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # mark the rows that cooksd suggests are outliers, but only for Ninteract genes
    my_data_only_genez$color[is_outlier & is_Ninteract] <- "red"
    # now color the ones that are below the fitted line blue
    my_data_only_genez$color[is_outlier & is_below_fit & is_Ninteract] <- "blue"

    # get some numbers
    # What is the expected proportion of lower outliers
    # (non-Ninteract rows with high cooksD that lie below the fitted line)
    nrow(my_data_only_genez[is_outlier & is_nonNinteract & is_below_fit,])/
      nrow(my_data_only_genez[is_nonNinteract,])
    #  0.0265428
    # What is the observed proportion of lower outliers
    # (Ninteract rows with high cooksD that lie below the fitted line)
    nrow(my_data_only_genez[is_outlier & is_Ninteract & is_below_fit,])/
      nrow(my_data_only_genez[is_Ninteract,])
    #  0.05940594

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[is_outlier & is_Ninteract & is_below_fit]

#    ATPAF1  MRPS21  MRPL55  NDUFAF7 NDUFAF3 MRPL1   HARS2   COX5A   MRPS7   PET100  MRPL34  NDUFA13
    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # now plot the data with the color representing outliers for N_interact genes
    # (the plot is stored in hec_plot, replacing any hec_plot created earlier
    # in the script; it is not printed here)
    # NOTE(review): rows flagged "blue" (outliers below the fitted line) are
    # drawn in red, the same as the "red" rows -- confirm this is intended.
    hec_plot <- ggplot(my_data_only_genez, aes(x = number_of_genes, y = FW_H, color=color, fill=color)) +
      geom_smooth(data = my_data_only_genez, method=lm, se=TRUE, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha), color = 'gray') +
      geom_point(data = subset(my_data_only_genez, color == 'pink'),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha), color = 'pink') +
      geom_point(data = subset(my_data_only_genez, color == 'red'),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha),color = 'red') +
      geom_point(data = subset(my_data_only_genez, color == 'blue'),
                 aes(x = number_of_genes, y = FW_H, alpha = alpha),color = 'red') +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#      ) +
      xlim(0,16) + scale_y_continuous(limits = c(-8,6), breaks = c(-6.0,-3.0,0,3.0,6.0)) +
      labs(x = element_blank(), y="H", tag = "hec") +
      theme_classic(base_size=16) + theme(legend.position = "none") +
      theme(axis.text.x=element_blank()) + #theme(axis.text.y=element_blank()) +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
    
#nig
    # Keep only the nig windows that contain at least one gene.
    my_data_only_genez <- nig[nig$containsgenes == 1, ]

    # What are the names of the Ninteract genes with no polymorphism (FW_H is NA)?
    my_data_only_genez$Ninteract_acronym[is.na(my_data_only_genez$FW_H) &
                                           my_data_only_genez$containsNinteractgenez == 1]

    # TARS2      HIGD1A     MRPL2      MRPS17     COX5A      MRPS26     COA3       TACO1      GADD45GIP1 NDUFA13

    # Explore the relationship between FW_H and number of genes in Ninteract windows.
    # NOTE(review): the two subsets below are not used again in this section;
    # they are kept in case later code relies on them.
    my_data_only_nonNinteractgenez <-
      my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0, ]
    my_data_only_Ninteractgenez <-
      my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1, ]
    my_data_only_nonNinteractgenez <-
      my_data_only_nonNinteractgenez[complete.cases(my_data_only_nonNinteractgenez), ]

    # Drop every row that has an NA in any column before fitting the model.
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez), ]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # Fit one linear model to all data (because there was not a significant
    # interaction term).
    mod <- lm(FW_H ~ number_of_genes, data = my_data_only_genez)
    # Fitted values on the regression line (y = mx + b).
    fitted <- coef(mod)[["number_of_genes"]] * my_data_only_genez$number_of_genes +
      coef(mod)[["(Intercept)"]]
    # Cook's distance of every observation in the model.
    cooksd <- cooks.distance(mod)
    # Append both as columns named "fitted" and "cooksd".
    my_data_only_genez <- cbind(my_data_only_genez, fitted, cooksd)

    # Row masks that are reused below; computing them once keeps the outlier
    # definition identical in every place it is applied.
    is_influential <- cooksd >= 4 * mean(cooksd, na.rm = TRUE) # cooksD is high
    is_ninteract <- my_data_only_genez$containsNinteractgenez == 1
    is_non_ninteract <- my_data_only_genez$containsNinteractgenez == 0
    is_below_fit <- my_data_only_genez$FW_H < my_data_only_genez$fitted

    # Make a column to specify whether a gene is an Ninteract gene or not.
    my_data_only_genez$color <- ifelse(is_ninteract, "pink", "gray")
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # Flag Ninteract windows whose Cook's distance suggests an outlier ...
    my_data_only_genez$color[is_influential & is_ninteract] <- "red"
    # ... and re-flag the ones that lie below the fitted line as blue.
    my_data_only_genez$color[is_influential & is_below_fit & is_ninteract] <- "blue"

    # Get some numbers.
    # Expected proportion of lower outliers (high Cook's distance and FW_H below
    # the fitted line), estimated from the non-Ninteract windows.
    sum(is_influential & is_non_ninteract & is_below_fit) / sum(is_non_ninteract)
    #  0.02512733
    # Observed proportion of lower outliers among the Ninteract windows.
    sum(is_influential & is_ninteract & is_below_fit) / sum(is_ninteract)
    #  0.08247423

    # What are the names of the Ninteract outlier windows (below the fitted line)?
    my_data_only_genez$Ninteract_acronym[is_influential & is_ninteract & is_below_fit]

#  ATP5F1       MRPL9        NDUFA2       HARS2        ATP5J2       COX6C        CYC1         MRP63        ATP5E       
# TTC19        MRPS7        MRPL12       PET100       MRPL34       MRPS12,SARS2 ATP5SL    
    
    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # OK, now plot the data with the color representing outliers for N_interact genes.
    # Layers are added in the order gray -> pink -> red -> blue so that the
    # flagged windows are drawn on top of the background points.
    # NOTE(review): alpha is mapped inside aes(), so ggplot2 passes the 0.7 / 1
    # values through an alpha scale rather than using them literally -- confirm
    # that the resulting transparency is what is wanted.
    nig_plot <- ggplot(my_data_only_genez,
                       aes(x = number_of_genes, y = FW_H, color = color, fill = color)) +
      # one regression line (with its standard-error band) through all windows
      geom_smooth(method = "lm", se = TRUE, fullrange = TRUE,
                  colour = "gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      # x and y are inherited from the plot-level aes()
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(alpha = alpha), color = "gray") +
      geom_point(data = subset(my_data_only_genez, color == "pink"),
                 aes(alpha = alpha), color = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "red"),
                 aes(alpha = alpha), color = "red") +
      # NOTE(review): windows flagged "blue" are drawn in red, exactly like the
      # "red" ones; only the text labels below tell them apart. Confirm this is
      # intended before changing it (the hec panel does the same).
      geom_point(data = subset(my_data_only_genez, color == "blue"),
                 aes(alpha = alpha), color = "red") +
      # label only the windows flagged "blue"; every other row gets an empty label
      geom_text_repel(
        #mapping = aes(label = Ninteract_acronym),
        mapping = aes(label = ifelse(color == "blue", as.character(Ninteract_acronym), '')),
        force_pull = 0,
        force = 13,
        nudge_y = 0.1, nudge_x = 15,
        color = "black",
        size = 2.5,
        box.padding = 0.5,
        #point.padding = 0.5,
        direction = "y",
        max.overlaps = Inf,
        hjust = 0, #angle = 45, #segment.curvature = -0.05,
        segment.size = 0.25,
        segment.color = 'grey50'
      ) +
      xlim(0, 16) +
      scale_y_continuous(limits = c(-8, 6), breaks = c(-6.0, -3.0, 0, 3.0, 6.0)) +
      labs(x = "Number of genes", y = "H", tag = "nig") +
      theme_classic(base_size = 16) + theme(legend.position = "none") +
      #theme(axis.text.x=element_blank()) + theme(axis.text.y=element_blank()) +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
    

library(gridExtra)
# Write the five species panels, stacked in a single column, to one PNG file.
# NOTE(review): nem_plot, mau_plot, ton_plot and hec_plot must already exist
# when this runs; they are not created in this part of the script.
png("FW_H_Ninteract_outliers.png", width = 300, height = 430, units = "mm", res = 100)
# Close the device even when drawing fails (e.g. a plot object is missing), so
# a failed run does not leave an open device and a half-written file handle.
invisible(tryCatch(
  grid.arrange(nem_plot, mau_plot,
               ton_plot, hec_plot,
               nig_plot, ncol = 1),
  finally = dev.off()
))


