library(ggrepel)
library(ggplot2)

# Working directory that holds the per-species "*_windowstats.concat_pi__density.txt"
# inputs; the output PNG is written here as well.
# The spaces in the path are written literally: a backslash followed by a space
# is not a valid escape inside an R string and stops the file from parsing.
setwd('/Users/Shared/Previously Relocated Items/Security/projects/submitted/2017_SEAsian_macaque_genomz/Fst_windows/pi_all_outputz')

# Species codes; each one names both an input file prefix and the data frame
# created from that file.
pairwise_vector <- c("nem", "mau", "ton", "hec", "nig")
# Names of the per-species plot objects ("nem_plot", "mau_plot", ...).
pairwise_vector_plot <- paste0(pairwise_vector, "_plot")

# Read each species' window statistics and bind the table to a variable named
# after the species code (nem, mau, ton, hec, nig).
for (pair in pairwise_vector) {
  stats_file <- paste0(pair, "_windowstats.concat_pi__density.txt")
  assign(pair, read.table(stats_file, header = TRUE))
}


# Per-species linear models of pi with an interaction between the N-interact
# indicator and the number of genes in the window. Each summary is shown right
# after its fit so the interaction term can be inspected.
nem_mod <- lm(pi ~ containsNinteractgenez * number_of_genes, data = nem)
summary(nem_mod)

mau_mod <- lm(pi ~ containsNinteractgenez * number_of_genes, data = mau)
summary(mau_mod)

ton_mod <- lm(pi ~ containsNinteractgenez * number_of_genes, data = ton)
summary(ton_mod)

hec_mod <- lm(pi ~ containsNinteractgenez * number_of_genes, data = hec)
summary(hec_mod)

nig_mod <- lm(pi ~ containsNinteractgenez * number_of_genes, data = nig)
summary(nig_mod)


# ---- pi vs. number of genes: outlier analysis and one panel per species ----
#
# Every species goes through the same steps, implemented once in the helpers
# below; only the final plot decoration differs between species.

# Fit pi ~ number_of_genes on the gene-containing windows of one species and
# flag N-interact windows whose Cook's distance is high.
#
# window_stats: data frame with the columns containsgenes,
#   containsNinteractgenez, number_of_genes, pi and Ninteract_acronym.
# Returns a list with
#   data             the gene-containing, complete-case windows plus the
#                    columns fitted, cooksd, color and alpha
#   expected_below   proportion of non-N-interact windows that have a high
#                    Cook's distance and lie below the fitted line
#   observed_below   the same proportion among N-interact windows
#   outlier_acronyms Ninteract_acronym of the windows counted in
#                    observed_below
analyse_pi_outliers <- function(window_stats) {
  genes <- window_stats[window_stats$containsgenes == 1, ]
  genes <- genes[complete.cases(genes), ]

  # one regression for all windows (the original analysis notes that the
  # interaction term was not significant)
  mod <- lm(pi ~ number_of_genes, data = genes)
  coefs <- coef(mod)
  # fitted values (y = mx + b)
  genes$fitted <- coefs[[2]] * genes$number_of_genes + coefs[[1]]

  # Cook's distance for every window; "high" means at least 4x the mean
  cooksd <- cooks.distance(mod)
  genes$cooksd <- cooksd
  is_influential <- cooksd >= 4 * mean(cooksd, na.rm = TRUE)

  is_ninteract <- genes$containsNinteractgenez == 1
  is_background <- genes$containsNinteractgenez == 0
  is_below_fit <- genes$pi < genes$fitted

  # colour / alpha flags: gray = non-N-interact, pink = N-interact
  genes$color <- ifelse(is_ninteract, "pink", "gray")
  genes$alpha <- ifelse(genes$color == "gray", 0.7, 1)
  # N-interact windows with high Cook's distance are flagged "red" ...
  genes$color[is_influential & is_ninteract] <- "red"
  # ... and those of them that lie below the fitted line are re-flagged "blue"
  genes$color[is_influential & is_below_fit & is_ninteract] <- "blue"

  list(
    data = genes,
    expected_below = sum(is_influential & is_background & is_below_fit) /
      sum(is_background),
    observed_below = sum(is_influential & is_ninteract & is_below_fit) /
      sum(is_ninteract),
    outlier_acronyms =
      genes$Ninteract_acronym[is_influential & is_ninteract & is_below_fit]
  )
}

# Build the undecorated panel for one species: a single gray regression band
# over all windows plus the points, drawn flag by flag.
#
# genes: the `data` element returned by analyse_pi_outliers().
# Returns a ggplot object without axis limits, labels or theme.
plot_pi_outliers <- function(genes) {
  panel <- ggplot(genes, aes(x = number_of_genes, y = pi,
                             color = color, fill = color)) +
    geom_smooth(data = genes, method = lm, se = TRUE, fullrange = TRUE,
                colour = "gray", fill = "gray")

  # flag -> colour actually drawn, in drawing order.
  # NOTE(review): windows flagged "blue" are drawn in red, so both kinds of
  # N-interact outlier look the same in the figure - confirm this is intended.
  drawn_colour <- c(gray = "gray", pink = "pink", red = "red", blue = "red")
  for (flag in names(drawn_colour)) {
    panel <- panel +
      geom_point(data = genes[genes$color == flag, , drop = FALSE],
                 aes(x = number_of_genes, y = pi, alpha = alpha),
                 color = drawn_colour[[flag]])
  }

  # Optional label layer kept from the original script (disabled):
  #   geom_text_repel(data = genes,
  #                   mapping = aes(label = ifelse(color == "red",
  #                                 as.character(Ninteract_acronym), '')),
  #                   force_pull = 0, force = 13,
  #                   nudge_y = 0.1, nudge_x = 15,
  #                   color = "black", size = 2.5, box.padding = 0.5,
  #                   direction = "y", max.overlaps = Inf, hjust = 0,
  #                   segment.size = 0.25, segment.color = 'grey50')
  panel
}

# Add the labels and theme shared by all panels.
#
# panel:        ggplot object to decorate.
# tag:          panel tag text.
# x_label:      x-axis title; blank by default.
# blank_x_text: if TRUE, the x-axis tick labels are removed.
# Returns the decorated ggplot object.
finish_pi_plot <- function(panel, tag, x_label = element_blank(),
                           blank_x_text = TRUE) {
  panel <- panel +
    labs(x = x_label, y = expression(pi), tag = tag) +
    theme_classic(base_size = 16) + theme(legend.position = "none")
  if (blank_x_text) {
    panel <- panel + theme(axis.text.x = element_blank())
  }
  panel + theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
}

# The numbers and gene names in the comments below are the results recorded in
# the original script.

#nem
nem_res <- analyse_pi_outliers(nem)
# proportion of high-Cook's-D windows below the fitted line, non-N-interact
nem_res$expected_below
#  0.01227936
# the same proportion for N-interact windows
nem_res$observed_below
#  0.04411765
# 9 out of 204
# what are the names of the Ninteract outlier windows?
nem_res$outlier_acronyms
#  MRPS21   MRPL53   UQCRC1   NDUFAF3  NDUFC1   MRPL2    C11orf83 NDUFS3   ATP5B
# NOTE(review): only this panel sets axis limits. Scale limits drop points
# outside the range before geom_smooth() is fitted, so if any window has more
# than 16 genes or pi > 0.4 the drawn line differs from the lm above - confirm.
# NOTE(review): the tag is "bor" although the data frame is `nem` - presumably
# intentional; confirm.
nem_plot <- finish_pi_plot(
  plot_pi_outliers(nem_res$data) +
    xlim(0, 16) +
    scale_y_continuous(limits = c(0.0, 0.4), breaks = c(0, 0.25, 0.5)),
  tag = "bor"
)

#mau
mau_res <- analyse_pi_outliers(mau)
mau_res$expected_below
#  0.01107335
mau_res$observed_below
#  0.05392157
# 11 out of 204
# what are the names of the Ninteract outlier windows?
mau_res$outlier_acronyms
#    [1] MRPL55        UQCRC1        NDUFAF3       NDUFA2        HARS2         C11orf83      NDUFB10       NDUFAB1,EARS2
#    [9] COA3          GADD45GIP1    COX6B1
mau_plot <- finish_pi_plot(plot_pi_outliers(mau_res$data), tag = "mau")

#ton
ton_res <- analyse_pi_outliers(ton)
ton_res$expected_below
#  0.009099879
ton_res$observed_below
#  0.03431373
# 7 out of 204
# what are the names of the Ninteract outlier windows?
ton_res$outlier_acronyms
#  MRPS21        NDUFS2        NDUFAF3       NDUFA2        MRPL43        EARS2,NDUFAB1 GADD45GIP1
ton_plot <- finish_pi_plot(plot_pi_outliers(ton_res$data), tag = "ton")

#hec
hec_res <- analyse_pi_outliers(hec)
hec_res$expected_below
#  0.01184081
hec_res$observed_below
#  0.06862745
# 14 out of 204
# what are the names of the Ninteract outlier windows?
hec_res$outlier_acronyms
#  MRPS21        MRPL55        MRPL53        UQCRC1        NDUFAF3       UQCRQ         NDUFA2        MRPL43
# C11orf83      COX5A         SYNJ2BP-COX16 MRPL40        NDUFAB1,EARS2 GADD45GIP1
hec_plot <- finish_pi_plot(plot_pi_outliers(hec_res$data), tag = "hec")

#nig
nig_res <- analyse_pi_outliers(nig)
nig_res$expected_below
#  0.009648065
nig_res$observed_below
#  0.06372549
# 13 out of 204
# what are the names of the Ninteract outlier windows?
nig_res$outlier_acronyms
#  MRPS21     MRPL55     UQCRC1     NDUFAF3    NDUFA2     HARS2      MRPL2      MRPL43     NDUFS3     COX5A      NDUFB10
# COA3       GADD45GIP1
# bottom panel: the only one that keeps the x-axis title and tick labels
nig_plot <- finish_pi_plot(plot_pi_outliers(nig_res$data), tag = "nig",
                           x_label = "Number of genes", blank_x_text = FALSE)
    

library(gridExtra)
# Stack the five species panels in a single column and write them to a PNG in
# the working directory.
png("pi_Ninteract_outliers.png", width = 300, height = 430, units = "mm", res = 100)
# `finally` closes the PNG device even when drawing fails, so a failed run
# cannot leave the device open and swallow later plots.
invisible(tryCatch(
  grid.arrange(nem_plot, mau_plot,
               ton_plot, hec_plot,
               nig_plot, ncol = 1),
  finally = dev.off()
))


