library(ggrepel)
library(ggplot2)

# Work from the directory that holds the per-population window statistics.
# NOTE(review): this is a hard-coded absolute path, so the script only runs on
# a machine with this exact directory layout.
setwd('/Users/Shared/Previously\ Relocated\ Items/Security/projects/2017_SEAsian_macaque_genomz/PopGenome_OXPHOS/FWH_all_outputz')

# Table prefixes used to build the input file names and the variable names.
pairwise_vector <- c("bor", "mau", "ton", "hec", "nig")
# Names of the plot objects created further down. Not referenced again in the
# visible part of this script.
pairwise_vector_plot <- c("nem_plot", "mau_plot", "ton_plot", "hec_plot", "nig_plot")

# Read one table per prefix and bind it to a global variable named after the
# prefix (bor, mau, ton, hec, nig).
for (pair in pairwise_vector) {
  a <- read.table(
    paste0("stats_in_windows_", pair, ".csv_N_interact_FW_H_density.txt"),
    header = TRUE
  )
  assign(pair, a)
}

# The "bor" table is also made available under the name "nem", which the
# per-population sections below use.
nem <- bor

# Keep only the windows flagged as containing genes (containsgenes == 1).
keep_gene_windows <- function(windows) {
  windows[windows$containsgenes == 1, ]
}

bor_allgenez <- keep_gene_windows(bor)
mau_allgenez <- keep_gene_windows(mau)
ton_allgenez <- keep_gene_windows(ton)
hec_allgenez <- keep_gene_windows(hec)
nig_allgenez <- keep_gene_windows(nig)

# Models with an interaction term: FW_H as a function of N-interact status,
# number of genes, and their interaction. glm() is called without a family
# argument, so its default family is used. The nem_* model is fitted to
# bor_allgenez. All fits are made first; the summaries then print in the
# order nem, mau, ton, hec, nig.
nem_mod <- glm(
  FW_H ~ containsNinteractgenez * number_of_genes,
  data = bor_allgenez
)
mau_mod <- glm(
  FW_H ~ containsNinteractgenez * number_of_genes,
  data = mau_allgenez
)
ton_mod <- glm(
  FW_H ~ containsNinteractgenez * number_of_genes,
  data = ton_allgenez
)
hec_mod <- glm(
  FW_H ~ containsNinteractgenez * number_of_genes,
  data = hec_allgenez
)
nig_mod <- glm(
  FW_H ~ containsNinteractgenez * number_of_genes,
  data = nig_allgenez
)

summary(nem_mod)
summary(mau_mod)
summary(ton_mod)
summary(hec_mod)
summary(nig_mod)


# Models without an interaction term, using only the number of genes as the
# predictor of FW_H. The nem_* model is fitted to bor_allgenez. Fits come
# first; summaries then print in the order nem, mau, ton, hec, nig.
nem_mod_num_genz <- glm(FW_H ~ number_of_genes, data = bor_allgenez)
mau_mod_num_genz <- glm(FW_H ~ number_of_genes, data = mau_allgenez)
ton_mod_num_genz <- glm(FW_H ~ number_of_genes, data = ton_allgenez)
hec_mod_num_genz <- glm(FW_H ~ number_of_genes, data = hec_allgenez)
nig_mod_num_genz <- glm(FW_H ~ number_of_genes, data = nig_allgenez)

summary(nem_mod_num_genz)
summary(mau_mod_num_genz)
summary(ton_mod_num_genz)
summary(hec_mod_num_genz)
summary(nig_mod_num_genz)

# Models without an interaction term, using only N-interact status
# (containsNinteractgenez) as the predictor of FW_H. The nem_* model is
# fitted to bor_allgenez. Fits come first; summaries then print in the order
# nem, mau, ton, hec, nig.
nem_mod_Ninteract <- glm(FW_H ~ containsNinteractgenez, data = bor_allgenez)
mau_mod_Ninteract <- glm(FW_H ~ containsNinteractgenez, data = mau_allgenez)
ton_mod_Ninteract <- glm(FW_H ~ containsNinteractgenez, data = ton_allgenez)
hec_mod_Ninteract <- glm(FW_H ~ containsNinteractgenez, data = hec_allgenez)
nig_mod_Ninteract <- glm(FW_H ~ containsNinteractgenez, data = nig_allgenez)

summary(nem_mod_Ninteract)
summary(mau_mod_Ninteract)
summary(ton_mod_Ninteract)
summary(hec_mod_Ninteract)
summary(nig_mod_Ninteract)

# nem ----
# Outlier analysis and figure panel for the `nem` table: fit
# FW_H ~ number_of_genes over the gene-containing windows, flag N-interact
# windows with a high Cook's distance, print summary numbers, and build
# `nem_plot`.
my_data_only_genez <- nem[nem$containsgenes == 1, ]

# What are the names of the Ninteract genes with no polymorphism (FW_H is NA)?
my_data_only_genez$Ninteract_acronym[
  is.na(my_data_only_genez$FW_H) &
    my_data_only_genez$containsNinteractgenez == 1
]
# none for nem

# Split by N-interact status. These two tables are not used again in this
# section; they are still created so the workspace matches earlier runs.
my_data_only_nonNinteractgenez <-
  my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0, ]
my_data_only_Ninteractgenez <-
  my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1, ]
my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[
  complete.cases(my_data_only_nonNinteractgenez),
]

# Drop every window that has a missing value in any column.
my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez), ]

# A single regression for all windows (the interaction term was reported as
# not significant).
mod <- lm(FW_H ~ number_of_genes, data = my_data_only_genez)

# Value on the regression line for each window (y = m * x + b).
fitted <- coef(mod)[[2]] * my_data_only_genez$number_of_genes + coef(mod)[[1]]

# Cook's distance for every window in the fit; both vectors are stored as
# columns `fitted` and `cooksd`.
cooksd <- cooks.distance(mod)
my_data_only_genez <- cbind(my_data_only_genez, fitted, cooksd)

# Masks reused below. A window counts as an outlier when its Cook's distance
# is at least four times the mean Cook's distance.
high_cooksd <- cooksd >= 4 * mean(cooksd, na.rm = TRUE)
is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
below_fit <- my_data_only_genez$FW_H < my_data_only_genez$fitted

# Point categories: gray = non-N-interact, pink = N-interact, red = N-interact
# outlier, blue = N-interact outlier lying below the fitted line.
my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
my_data_only_genez$color[high_cooksd & is_Ninteract] <- "red"
my_data_only_genez$color[high_cooksd & below_fit & is_Ninteract] <- "blue"

# Expected proportion: share of non-N-interact windows that are outliers
# below the fitted line.
nrow(my_data_only_genez[
  high_cooksd & my_data_only_genez$containsNinteractgenez == 0 & below_fit,
]) /
  nrow(my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0, ])
#  0.02484882

# Observed proportion: share of N-interact windows that are outliers below
# the fitted line.
nrow(my_data_only_genez[
  high_cooksd & my_data_only_genez$containsNinteractgenez == 1 & below_fit,
]) /
  nrow(my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1, ])
#  0.06862745

# Names of the N-interact outlier windows below the fitted line.
my_data_only_genez$Ninteract_acronym[high_cooksd & is_Ninteract & below_fit]
#  UQCRH    MRPL55   NDUFAF3  ATP5J2   CYC1     NDUFS8   C11orf83 NDUFA4L2 COX5A    SCO2     MRPL28   MRPS34   PET100
# COX6B1           COX6B1

# Panel: regression line over all windows plus one point layer per category,
# drawn in the order gray, pink, red, blue. No gene labels on this panel.
# NOTE(review): the "blue" (below-line) outliers are drawn in red, like the
# "red" ones - confirm this is intended.
# NOTE(review): alpha is mapped inside aes(), so ggplot2 passes 0.7/1 through
# its alpha scale rather than using them as literal opacities; add
# scale_alpha_identity() if literal values are wanted - confirm.
nem_plot <- ggplot(
  my_data_only_genez,
  aes(x = number_of_genes, y = FW_H, color = color, fill = color)
) +
  geom_smooth(
    method = lm, se = TRUE, fullrange = TRUE,
    colour = "gray", fill = "gray"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "gray"),
    aes(alpha = alpha), color = "gray"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "pink"),
    aes(alpha = alpha), color = "pink"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "red"),
    aes(alpha = alpha), color = "red"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "blue"),
    aes(alpha = alpha), color = "red"
  ) +
  xlim(0, 16) +
  scale_y_continuous(limits = c(-8, 6), breaks = c(-6.0, -3.0, 0, 3.0, 6.0)) +
  labs(x = element_blank(), y = "H", tag = "bor") +
  theme_classic(base_size = 16) +
  theme(
    legend.position = "none",
    axis.text.x = element_blank(),
    plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm")
  )
    
   
# mau ----
# Outlier analysis and figure panel for the `mau` table: fit
# FW_H ~ number_of_genes over the gene-containing windows, flag N-interact
# windows with a high Cook's distance, print summary numbers, and build
# `mau_plot`.
my_data_only_genez <- mau[mau$containsgenes == 1, ]

# What are the names of the Ninteract genes with no polymorphism (FW_H is NA)?
my_data_only_genez$Ninteract_acronym[
  is.na(my_data_only_genez$FW_H) &
    my_data_only_genez$containsNinteractgenez == 1
]
# ACAD9

# Split by N-interact status. These two tables are not used again in this
# section; they are still created so the workspace matches earlier runs.
my_data_only_nonNinteractgenez <-
  my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0, ]
my_data_only_Ninteractgenez <-
  my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1, ]
my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[
  complete.cases(my_data_only_nonNinteractgenez),
]

# Drop every window that has a missing value in any column.
my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez), ]

# A single regression for all windows (the interaction term was reported as
# not significant).
mod <- lm(FW_H ~ number_of_genes, data = my_data_only_genez)

# Value on the regression line for each window (y = m * x + b).
fitted <- coef(mod)[[2]] * my_data_only_genez$number_of_genes + coef(mod)[[1]]

# Cook's distance for every window in the fit; both vectors are stored as
# columns `fitted` and `cooksd`.
cooksd <- cooks.distance(mod)
my_data_only_genez <- cbind(my_data_only_genez, fitted, cooksd)

# Masks reused below. A window counts as an outlier when its Cook's distance
# is at least four times the mean Cook's distance.
high_cooksd <- cooksd >= 4 * mean(cooksd, na.rm = TRUE)
is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
below_fit <- my_data_only_genez$FW_H < my_data_only_genez$fitted

# Point categories: gray = non-N-interact, pink = N-interact, red = N-interact
# outlier, blue = N-interact outlier lying below the fitted line.
my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
my_data_only_genez$color[high_cooksd & is_Ninteract] <- "red"
my_data_only_genez$color[high_cooksd & below_fit & is_Ninteract] <- "blue"

# Expected proportion: share of non-N-interact windows that are outliers
# below the fitted line.
nrow(my_data_only_genez[
  high_cooksd & my_data_only_genez$containsNinteractgenez == 0 & below_fit,
]) /
  nrow(my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0, ])
#  0.02516556

# Observed proportion: share of N-interact windows that are outliers below
# the fitted line.
nrow(my_data_only_genez[
  high_cooksd & my_data_only_genez$containsNinteractgenez == 1 & below_fit,
]) /
  nrow(my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1, ])
#  0.07881773

# Names of the N-interact outlier windows below the fitted line.
my_data_only_genez$Ninteract_acronym[high_cooksd & is_Ninteract & below_fit]
#  MRPL55        MRPL53        MRPL30        NDUFAF3       UQCRQ         MRPL18        ATP5J2        C11orf83
# MRPL40        MRPS34        EARS2,NDUFAB1 COA3          MRPL38        NDUFS7        COX6B1        ATP5SL

# Panel: regression line over all windows plus one point layer per category,
# drawn in the order gray, pink, red, blue. No gene labels on this panel.
# NOTE(review): the "blue" (below-line) outliers are drawn in red, like the
# "red" ones - confirm this is intended.
# NOTE(review): alpha is mapped inside aes(), so ggplot2 passes 0.7/1 through
# its alpha scale rather than using them as literal opacities; add
# scale_alpha_identity() if literal values are wanted - confirm.
mau_plot <- ggplot(
  my_data_only_genez,
  aes(x = number_of_genes, y = FW_H, color = color, fill = color)
) +
  geom_smooth(
    method = lm, se = TRUE, fullrange = TRUE,
    colour = "gray", fill = "gray"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "gray"),
    aes(alpha = alpha), color = "gray"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "pink"),
    aes(alpha = alpha), color = "pink"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "red"),
    aes(alpha = alpha), color = "red"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "blue"),
    aes(alpha = alpha), color = "red"
  ) +
  xlim(0, 16) +
  scale_y_continuous(limits = c(-8, 6), breaks = c(-6.0, -3.0, 0, 3.0, 6.0)) +
  labs(x = element_blank(), y = "H", tag = "mau") +
  theme_classic(base_size = 16) +
  theme(
    legend.position = "none",
    axis.text.x = element_blank(),
    plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm")
  )
    
# ton ----
# Outlier analysis and figure panel for the `ton` table: fit
# FW_H ~ number_of_genes over the gene-containing windows, flag N-interact
# windows with a high Cook's distance, print summary numbers, and build
# `ton_plot`.
my_data_only_genez <- ton[ton$containsgenes == 1, ]

# What are the names of the Ninteract genes with no polymorphism (FW_H is NA)?
my_data_only_genez$Ninteract_acronym[
  is.na(my_data_only_genez$FW_H) &
    my_data_only_genez$containsNinteractgenez == 1
]
# NDUFAF3    GADD45GIP1

# Split by N-interact status. These two tables are not used again in this
# section; they are still created so the workspace matches earlier runs.
my_data_only_nonNinteractgenez <-
  my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0, ]
my_data_only_Ninteractgenez <-
  my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1, ]
my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[
  complete.cases(my_data_only_nonNinteractgenez),
]

# Drop every window that has a missing value in any column.
my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez), ]

# A single regression for all windows (the interaction term was reported as
# not significant).
mod <- lm(FW_H ~ number_of_genes, data = my_data_only_genez)

# Value on the regression line for each window (y = m * x + b).
fitted <- coef(mod)[[2]] * my_data_only_genez$number_of_genes + coef(mod)[[1]]

# Cook's distance for every window in the fit; both vectors are stored as
# columns `fitted` and `cooksd`.
cooksd <- cooks.distance(mod)
my_data_only_genez <- cbind(my_data_only_genez, fitted, cooksd)

# Masks reused below. A window counts as an outlier when its Cook's distance
# is at least four times the mean Cook's distance.
high_cooksd <- cooksd >= 4 * mean(cooksd, na.rm = TRUE)
is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
below_fit <- my_data_only_genez$FW_H < my_data_only_genez$fitted

# Point categories: gray = non-N-interact, pink = N-interact, red = N-interact
# outlier, blue = N-interact outlier lying below the fitted line.
my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
my_data_only_genez$color[high_cooksd & is_Ninteract] <- "red"
my_data_only_genez$color[high_cooksd & below_fit & is_Ninteract] <- "blue"

# Expected proportion: share of non-N-interact windows that are outliers
# below the fitted line.
nrow(my_data_only_genez[
  high_cooksd & my_data_only_genez$containsNinteractgenez == 0 & below_fit,
]) /
  nrow(my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0, ])
#  0.02562974

# Observed proportion: share of N-interact windows that are outliers below
# the fitted line.
nrow(my_data_only_genez[
  high_cooksd & my_data_only_genez$containsNinteractgenez == 1 & below_fit,
]) /
  nrow(my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1, ])
#  0.05445545

# Names of the N-interact outlier windows below the fitted line.
my_data_only_genez$Ninteract_acronym[high_cooksd & is_Ninteract & below_fit]
#  ATP5F1   MRPS21   NDUFA2   SURF1    MRPS2    C11orf83 MRPL17   FOXRED1  MRPL10   MRPL34   COX6B1

# Panel: regression line over all windows plus one point layer per category,
# drawn in the order gray, pink, red, blue. No gene labels on this panel.
# NOTE(review): the "blue" (below-line) outliers are drawn in red, like the
# "red" ones - confirm this is intended.
# NOTE(review): alpha is mapped inside aes(), so ggplot2 passes 0.7/1 through
# its alpha scale rather than using them as literal opacities; add
# scale_alpha_identity() if literal values are wanted - confirm.
ton_plot <- ggplot(
  my_data_only_genez,
  aes(x = number_of_genes, y = FW_H, color = color, fill = color)
) +
  geom_smooth(
    method = lm, se = TRUE, fullrange = TRUE,
    colour = "gray", fill = "gray"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "gray"),
    aes(alpha = alpha), color = "gray"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "pink"),
    aes(alpha = alpha), color = "pink"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "red"),
    aes(alpha = alpha), color = "red"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "blue"),
    aes(alpha = alpha), color = "red"
  ) +
  xlim(0, 16) +
  scale_y_continuous(limits = c(-8, 6), breaks = c(-6.0, -3.0, 0, 3.0, 6.0)) +
  labs(x = element_blank(), y = "H", tag = "ton") +
  theme_classic(base_size = 16) +
  theme(
    legend.position = "none",
    axis.text.x = element_blank(),
    plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm")
  )
    
# hec ----
# Outlier analysis and figure panel for the `hec` table: fit
# FW_H ~ number_of_genes over the gene-containing windows, flag N-interact
# windows with a high Cook's distance, print summary numbers, and build
# `hec_plot`.
my_data_only_genez <- hec[hec$containsgenes == 1, ]

# What are the names of the Ninteract genes with no polymorphism (FW_H is NA)?
my_data_only_genez$Ninteract_acronym[
  is.na(my_data_only_genez$FW_H) &
    my_data_only_genez$containsNinteractgenez == 1
]
# SYNJ2BP-COX16 MRPS26

# Split by N-interact status. These two tables are not used again in this
# section; they are still created so the workspace matches earlier runs.
my_data_only_nonNinteractgenez <-
  my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0, ]
my_data_only_Ninteractgenez <-
  my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1, ]
my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[
  complete.cases(my_data_only_nonNinteractgenez),
]

# Drop every window that has a missing value in any column.
my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez), ]

# A single regression for all windows (the interaction term was reported as
# not significant).
mod <- lm(FW_H ~ number_of_genes, data = my_data_only_genez)

# Value on the regression line for each window (y = m * x + b).
fitted <- coef(mod)[[2]] * my_data_only_genez$number_of_genes + coef(mod)[[1]]

# Cook's distance for every window in the fit; both vectors are stored as
# columns `fitted` and `cooksd`.
cooksd <- cooks.distance(mod)
my_data_only_genez <- cbind(my_data_only_genez, fitted, cooksd)

# Masks reused below. A window counts as an outlier when its Cook's distance
# is at least four times the mean Cook's distance.
high_cooksd <- cooksd >= 4 * mean(cooksd, na.rm = TRUE)
is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
below_fit <- my_data_only_genez$FW_H < my_data_only_genez$fitted

# Point categories: gray = non-N-interact, pink = N-interact, red = N-interact
# outlier, blue = N-interact outlier lying below the fitted line.
my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
my_data_only_genez$color[high_cooksd & is_Ninteract] <- "red"
my_data_only_genez$color[high_cooksd & below_fit & is_Ninteract] <- "blue"

# Expected proportion: share of non-N-interact windows that are outliers
# below the fitted line.
nrow(my_data_only_genez[
  high_cooksd & my_data_only_genez$containsNinteractgenez == 0 & below_fit,
]) /
  nrow(my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0, ])
#  0.0265428

# Observed proportion: share of N-interact windows that are outliers below
# the fitted line.
nrow(my_data_only_genez[
  high_cooksd & my_data_only_genez$containsNinteractgenez == 1 & below_fit,
]) /
  nrow(my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1, ])
#  0.05940594

# Names of the N-interact outlier windows below the fitted line.
my_data_only_genez$Ninteract_acronym[high_cooksd & is_Ninteract & below_fit]
#    ATPAF1  MRPS21  MRPL55  NDUFAF7 NDUFAF3 MRPL1   HARS2   COX5A   MRPS7   PET100  MRPL34  NDUFA13

# Panel: regression line over all windows plus one point layer per category,
# drawn in the order gray, pink, red, blue. No gene labels on this panel.
# NOTE(review): the "blue" (below-line) outliers are drawn in red, like the
# "red" ones - confirm this is intended.
# NOTE(review): alpha is mapped inside aes(), so ggplot2 passes 0.7/1 through
# its alpha scale rather than using them as literal opacities; add
# scale_alpha_identity() if literal values are wanted - confirm.
hec_plot <- ggplot(
  my_data_only_genez,
  aes(x = number_of_genes, y = FW_H, color = color, fill = color)
) +
  geom_smooth(
    method = lm, se = TRUE, fullrange = TRUE,
    colour = "gray", fill = "gray"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "gray"),
    aes(alpha = alpha), color = "gray"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "pink"),
    aes(alpha = alpha), color = "pink"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "red"),
    aes(alpha = alpha), color = "red"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "blue"),
    aes(alpha = alpha), color = "red"
  ) +
  xlim(0, 16) +
  scale_y_continuous(limits = c(-8, 6), breaks = c(-6.0, -3.0, 0, 3.0, 6.0)) +
  labs(x = element_blank(), y = "H", tag = "hec") +
  theme_classic(base_size = 16) +
  theme(
    legend.position = "none",
    axis.text.x = element_blank(),
    plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm")
  )
    
# nig ----
# Outlier analysis and figure panel for the `nig` table: fit
# FW_H ~ number_of_genes over the gene-containing windows, flag N-interact
# windows with a high Cook's distance, print summary numbers, and build
# `nig_plot` (the only panel with gene labels and an x-axis title).
my_data_only_genez <- nig[nig$containsgenes == 1, ]

# What are the names of the Ninteract genes with no polymorphism (FW_H is NA)?
my_data_only_genez$Ninteract_acronym[
  is.na(my_data_only_genez$FW_H) &
    my_data_only_genez$containsNinteractgenez == 1
]
# TARS2      HIGD1A     MRPL2      MRPS17     COX5A      MRPS26     COA3       TACO1      GADD45GIP1 NDUFA13

# Split by N-interact status. These two tables are not used again in this
# section; they are still created so the workspace matches earlier runs.
my_data_only_nonNinteractgenez <-
  my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0, ]
my_data_only_Ninteractgenez <-
  my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1, ]
my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[
  complete.cases(my_data_only_nonNinteractgenez),
]

# Drop every window that has a missing value in any column.
my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez), ]

# A single regression for all windows (the interaction term was reported as
# not significant).
mod <- lm(FW_H ~ number_of_genes, data = my_data_only_genez)

# Value on the regression line for each window (y = m * x + b).
fitted <- coef(mod)[[2]] * my_data_only_genez$number_of_genes + coef(mod)[[1]]

# Cook's distance for every window in the fit; both vectors are stored as
# columns `fitted` and `cooksd`.
cooksd <- cooks.distance(mod)
my_data_only_genez <- cbind(my_data_only_genez, fitted, cooksd)

# Masks reused below. A window counts as an outlier when its Cook's distance
# is at least four times the mean Cook's distance.
high_cooksd <- cooksd >= 4 * mean(cooksd, na.rm = TRUE)
is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
below_fit <- my_data_only_genez$FW_H < my_data_only_genez$fitted

# Point categories: gray = non-N-interact, pink = N-interact, red = N-interact
# outlier, blue = N-interact outlier lying below the fitted line.
my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
my_data_only_genez$color[high_cooksd & is_Ninteract] <- "red"
my_data_only_genez$color[high_cooksd & below_fit & is_Ninteract] <- "blue"

# Expected proportion: share of non-N-interact windows that are outliers
# below the fitted line.
nrow(my_data_only_genez[
  high_cooksd & my_data_only_genez$containsNinteractgenez == 0 & below_fit,
]) /
  nrow(my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0, ])
#  0.02512733

# Observed proportion: share of N-interact windows that are outliers below
# the fitted line.
nrow(my_data_only_genez[
  high_cooksd & my_data_only_genez$containsNinteractgenez == 1 & below_fit,
]) /
  nrow(my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1, ])
#  0.08247423

# Names of the N-interact outlier windows below the fitted line.
my_data_only_genez$Ninteract_acronym[high_cooksd & is_Ninteract & below_fit]
#  ATP5F1       MRPL9        NDUFA2       HARS2        ATP5J2       COX6C        CYC1         MRP63        ATP5E
# TTC19        MRPS7        MRPL12       PET100       MRPL34       MRPS12,SARS2 ATP5SL

# Panel: regression line over all windows plus one point layer per category,
# drawn in the order gray, pink, red, blue, then repelled text labels for the
# "blue" windows only (every other window gets an empty label).
# NOTE(review): the "blue" (below-line) outliers are drawn in red, like the
# "red" ones - confirm this is intended.
# NOTE(review): alpha is mapped inside aes(), so ggplot2 passes 0.7/1 through
# its alpha scale rather than using them as literal opacities; add
# scale_alpha_identity() if literal values are wanted - confirm.
nig_plot <- ggplot(
  my_data_only_genez,
  aes(x = number_of_genes, y = FW_H, color = color, fill = color)
) +
  geom_smooth(
    method = lm, se = TRUE, fullrange = TRUE,
    colour = "gray", fill = "gray"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "gray"),
    aes(alpha = alpha), color = "gray"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "pink"),
    aes(alpha = alpha), color = "pink"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "red"),
    aes(alpha = alpha), color = "red"
  ) +
  geom_point(
    data = subset(my_data_only_genez, color == "blue"),
    aes(alpha = alpha), color = "red"
  ) +
  geom_text_repel(
    mapping = aes(
      label = ifelse(color == "blue", as.character(Ninteract_acronym), "")
    ),
    force_pull = 0,
    force = 13,
    nudge_y = 0.1, nudge_x = 15,
    color = "black",
    size = 2.5,
    box.padding = 0.5,
    direction = "y",
    max.overlaps = Inf,
    hjust = 0,
    segment.size = 0.25,
    segment.color = "grey50"
  ) +
  xlim(0, 16) +
  scale_y_continuous(limits = c(-8, 6), breaks = c(-6.0, -3.0, 0, 3.0, 6.0)) +
  labs(x = "Number of genes", y = "H", tag = "nig") +
  theme_classic(base_size = 16) +
  theme(
    legend.position = "none",
    plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm")
  )
    

# Stack the five panels in a single column and write them to a PNG file in
# the current working directory.
library(gridExtra)

png(
  filename = "FW_H_Ninteract_outliers.png",
  width = 300, height = 430, units = "mm", res = 100
)
grid.arrange(
  grobs = list(nem_plot, mau_plot, ton_plot, hec_plot, nig_plot),
  ncol = 1
)

# Close the PNG device so the file is flushed to disk.
dev.off()


