library(ggrepel)
library(ggplot2)

# NOTE(review): absolute, machine-specific working directory. Every relative
# path used below (the input tables and the output PNG) resolves against it.
setwd('/Users/Shared/Previously\ Relocated\ Items/Security/projects/2017_SEAsian_macaque_genomz/PopGenome_OXPHOS/TajD_all_outputz')

# Species codes as they appear in the input file names, and the names of the
# plot objects assembled into the final figure (the "bor" table is plotted
# as nem_plot).
pairwise_vector <- c("bor", "mau", "ton", "hec", "nig")
pairwise_vector_plot <- c("nem_plot", "mau_plot", "ton_plot", "hec_plot", "nig_plot")

# Read one windowed-statistics table per species and bind it to a variable
# named after the species code (bor, mau, ton, hec, nig).
for (pair in pairwise_vector) {
  a <- read.table(
    paste0("stats_in_windows_", pair, ".csv_N_interact_TajD_density.txt"),
    header = TRUE
  )
  assign(pair, a)
}

# The "bor" table is referred to as "nem" in the per-species analysis below.
nem <- bor

# Keep, for each species table, only the windows flagged containsgenes == 1.
# All columns are retained; rows with a missing TajD are still present here.
bor_allgenez <- bor[bor$containsgenes == 1, , drop = FALSE]
mau_allgenez <- mau[mau$containsgenes == 1, , drop = FALSE]
ton_allgenez <- ton[ton$containsgenes == 1, , drop = FALSE]
hec_allgenez <- hec[hec$containsgenes == 1, , drop = FALSE]
nig_allgenez <- nig[nig$containsgenes == 1, , drop = FALSE]


# Model TajD as a function of the N-interact flag, the number of genes in the
# window, and their interaction, one model per species.
# glm() is called without a `family` argument, so the default family applies.
# Each summary() is a top-level expression so its table is auto-printed.
# nem_mod is fitted to the "bor" table (nem is an alias of bor).
nem_mod <- glm(TajD ~ containsNinteractgenez*number_of_genes, data=bor_allgenez)
summary(nem_mod)
mau_mod <- glm(TajD ~ containsNinteractgenez*number_of_genes, data=mau_allgenez)
summary(mau_mod)
ton_mod <- glm(TajD ~ containsNinteractgenez*number_of_genes, data=ton_allgenez)
summary(ton_mod)
hec_mod <- glm(TajD ~ containsNinteractgenez*number_of_genes, data=hec_allgenez)
summary(hec_mod)
nig_mod <- glm(TajD ~ containsNinteractgenez*number_of_genes, data=nig_allgenez)
summary(nig_mod)

# Reduced models: no N-interact term and no interaction, only the number of
# genes in the window as predictor of TajD. One model per species, fitted to
# the same gene-containing subsets as the interaction models.
# nem_mod_num_genz is fitted to the "bor" table (nem is an alias of bor).
nem_mod_num_genz <- glm(TajD ~ number_of_genes, data=bor_allgenez)
summary(nem_mod_num_genz)
mau_mod_num_genz <- glm(TajD ~ number_of_genes, data=mau_allgenez)
summary(mau_mod_num_genz)
ton_mod_num_genz <- glm(TajD ~ number_of_genes, data=ton_allgenez)
summary(ton_mod_num_genz)
hec_mod_num_genz <- glm(TajD ~ number_of_genes, data=hec_allgenez)
summary(hec_mod_num_genz)
nig_mod_num_genz <- glm(TajD ~ number_of_genes, data=nig_allgenez)
summary(nig_mod_num_genz)


# Per-species outlier analysis ------------------------------------------------
#
# The five species were previously analysed by five copies of the same code.
# The shared logic now lives in analyse_tajd_outliers(); the calls below keep
# the original object names (nem_plot, mau_plot, ton_plot, hec_plot, nig_plot).

# Fit TajD ~ number_of_genes on gene-containing windows, flag high-influence
# N-interact windows, and build the scatter plot for one species.
#
# Arguments:
#   windows               data frame with the columns containsgenes,
#                         containsNinteractgenez, TajD, number_of_genes and
#                         Ninteract_acronym.
#   tag                   plot tag passed to labs(tag = ).
#   x_label               x-axis title. NULL (default) blanks the x-axis title
#                         and the x tick labels, as in the upper panels.
#   label_outliers        if TRUE, add ggrepel labels for the N-interact
#                         outliers that lie below the fitted line.
#   lower_outlier_colour  colour used to draw the N-interact outliers below the
#                         fitted line (tagged "blue" in the data).
#                         NOTE(review): the original script tagged these rows
#                         "blue" but drew them with color = 'red'; the default
#                         keeps that rendering. Confirm which was intended.
#   cooks_multiplier      a window counts as influential when its Cook's
#                         distance is >= cooks_multiplier * mean(Cook's distance).
#
# Returns a list with:
#   plot             the ggplot object
#   data             complete-case gene windows plus fitted, cooksd, color, alpha
#   model            the lm fit
#   no_polymorphism  Ninteract_acronym of N-interact windows whose TajD is NA
#   expected_lower   proportion of non-N-interact windows that are influential
#                    and below the fitted line
#   observed_lower   the same proportion among N-interact windows
#   lower_outliers   Ninteract_acronym of the influential N-interact windows
#                    below the fitted line
analyse_tajd_outliers <- function(windows, tag,
                                  x_label = NULL,
                                  label_outliers = FALSE,
                                  lower_outlier_colour = "red",
                                  cooks_multiplier = 4) {
  gene_windows <- windows[windows$containsgenes == 1, ]

  # Record N-interact windows with TajD = NA before incomplete rows are dropped.
  no_polymorphism <- gene_windows$Ninteract_acronym[
    is.na(gene_windows$TajD) & gene_windows$containsNinteractgenez == 1
  ]

  # Drop every row that has a missing value in any column.
  gene_windows <- gene_windows[complete.cases(gene_windows), ]

  # One regression for all gene windows (the interaction term was not
  # significant, so N-interact and other windows share a line).
  model <- lm(TajD ~ number_of_genes, data = gene_windows)
  coefs <- model$coefficients
  # Fitted values as slope * x + intercept, the same arithmetic as before.
  gene_windows$fitted <-
    coefs[[2]] * gene_windows$number_of_genes + coefs[[1]]
  gene_windows$cooksd <- unname(cooks.distance(model))

  is_ninteract <- gene_windows$containsNinteractgenez == 1
  is_background <- gene_windows$containsNinteractgenez == 0
  is_influential <- gene_windows$cooksd >=
    cooks_multiplier * mean(gene_windows$cooksd, na.rm = TRUE)
  is_below_fit <- gene_windows$TajD < gene_windows$fitted
  is_lower_outlier <- is_influential & is_ninteract & is_below_fit

  # Category column: gray = no N-interact gene, pink = N-interact,
  # red = influential N-interact, blue = influential N-interact below the line.
  gene_windows$color <- ifelse(is_ninteract, "pink", "gray")
  # NOTE(review): alpha is mapped inside aes(), so it goes through ggplot2's
  # alpha scale rather than being used as a literal opacity; 0.7 is therefore
  # probably not the opacity that is drawn. Confirm this is intended.
  gene_windows$alpha <- ifelse(gene_windows$color == "gray", 0.7, 1)
  gene_windows$color[is_influential & is_ninteract] <- "red"
  gene_windows$color[is_lower_outlier] <- "blue"

  # Proportion of influential windows below the fitted line, in background
  # (expected) and N-interact (observed) windows.
  expected_lower <-
    sum(is_influential & is_background & is_below_fit) / sum(is_background)
  observed_lower <- sum(is_lower_outlier) / sum(is_ninteract)
  lower_outliers <- gene_windows$Ninteract_acronym[is_lower_outlier]

  # One point layer per category so the drawing order is fixed:
  # gray first, then pink, red and finally the lower outliers on top.
  point_layer <- function(category, colour) {
    geom_point(
      data = gene_windows[gene_windows$color == category, ],
      aes(x = number_of_genes, y = TajD, alpha = alpha),
      color = colour
    )
  }

  outlier_plot <- ggplot(
    gene_windows,
    aes(x = number_of_genes, y = TajD, color = color, fill = color)
  ) +
    geom_smooth(
      data = gene_windows, method = lm, se = TRUE, fullrange = TRUE,
      colour = "gray", fill = "gray"
    ) +
    point_layer("gray", "gray") +
    point_layer("pink", "pink") +
    point_layer("red", "red") +
    point_layer("blue", lower_outlier_colour)

  if (label_outliers) {
    outlier_plot <- outlier_plot +
      geom_text_repel(
        data = gene_windows,
        mapping = aes(
          label = ifelse(color == "blue", as.character(Ninteract_acronym), "")
        ),
        force_pull = 0,
        force = 13,
        nudge_y = 0.1, nudge_x = 15,
        color = "black",
        size = 2.5,
        box.padding = 0.5,
        direction = "y",
        max.overlaps = Inf,
        hjust = 0,
        segment.size = 0.25,
        segment.color = "grey50"
      )
  }

  outlier_plot <- outlier_plot +
    xlim(0, 16) +
    scale_y_continuous(limits = c(-3, 3), breaks = c(-3, -2, -1, 0, 1, 2, 3)) +
    labs(
      x = if (is.null(x_label)) element_blank() else x_label,
      y = "Tajima's D",
      tag = tag
    ) +
    theme_classic(base_size = 16) +
    theme(legend.position = "none")
  if (is.null(x_label)) {
    outlier_plot <- outlier_plot + theme(axis.text.x = element_blank())
  }
  outlier_plot <- outlier_plot +
    theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))

  list(
    plot = outlier_plot,
    data = gene_windows,
    model = model,
    no_polymorphism = no_polymorphism,
    expected_lower = expected_lower,
    observed_lower = observed_lower,
    lower_outliers = lower_outliers
  )
}

# Fields printed for each species (auto-printed as top-level expressions).
summary_fields <- c(
  "no_polymorphism", "expected_lower", "observed_lower", "lower_outliers"
)

# nem (read from the "bor" file and tagged "bor" in the figure)
nem_result <- analyse_tajd_outliers(nem, tag = "bor")
nem_result[summary_fields]
# Previously recorded results:
#   no polymorphism : none for nem
#   expected_lower  : 0.02286971
#   observed_lower  : 0.08823529
#   lower_outliers  : UQCRH NDUFS2 COX20 MRPL47,NDUFB5 NDUFAF3 MRPL2 ATP5J2 CYC1
#                     C11orf83 NDUFS3 ATP5B SYNJ2BP-COX16 SCO2 MRPL28 MRPS34
#                     MRPL10 COA3 COX6B1
nem_plot <- nem_result$plot

# mau
mau_result <- analyse_tajd_outliers(mau, tag = "mau")
mau_result[summary_fields]
# Previously recorded results:
#   no polymorphism : ACAD9
#   expected_lower  : 0.02593819
#   observed_lower  : 0.07389163
#   lower_outliers  : IARS2 MRPL53 NDUFAF3 UQCRQ HARS2 MRPL18 ATP5J2 MRPL43
#                     USMG5 C11orf83 NDUFAB1,EARS2 COA3 NDUFS7 COX6B1 ATP5SL
mau_plot <- mau_result$plot

# ton
ton_result <- analyse_tajd_outliers(ton, tag = "ton")
ton_result[summary_fields]
# Previously recorded results:
#   no polymorphism : NDUFAF3 GADD45GIP1
#   expected_lower  : 0.01847982
#   observed_lower  : 0.06930693
#   lower_outliers  : ATPAF1 ATP5F1 NDUFS2 MRPL55 MRPL53 NDUFA2 NDUFV1 MRPL49
#                     C11orf83 MRPL17 ATP5B COX5A NDUFB10 MRPS7
ton_plot <- ton_result$plot

# hec
hec_result <- analyse_tajd_outliers(hec, tag = "hec")
hec_result[summary_fields]
# Previously recorded results:
#   no polymorphism : SYNJ2BP-COX16 MRPS26
#   expected_lower  : 0.02377793
#   observed_lower  : 0.06930693
#   lower_outliers  : MRPS21 MRPL24 NDUFAF7 MRPL53 BCS1L NDUFAF3 HARS2 SURF1
#                     USMG5 MRPL12 PET100 GADD45GIP1 MRPL34 NDUFA13
hec_plot <- hec_result$plot

# nig (bottom panel: keeps the x-axis title and tick labels, and labels the
# N-interact outliers below the fitted line)
nig_result <- analyse_tajd_outliers(
  nig,
  tag = "nig",
  x_label = "Number of genes",
  label_outliers = TRUE
)
nig_result[summary_fields]
# Previously recorded results:
#   no polymorphism : TARS2 HIGD1A MRPL2 MRPS17 COX5A MRPS26 COA3 TACO1
#                     GADD45GIP1 NDUFA13
#   expected_lower  : 0.01618563
#   observed_lower  : 0.05154639
#   lower_outliers  : ATP5F1 MRPL24 NDUFS2 MRPL53 HARS2 ATP5J2 SCO2 MRPL12
#                     POLRMT COX6B1
nig_plot <- nig_result$plot
    

library(gridExtra)

# Stack the five species panels in one column and write them to a PNG in the
# current working directory.
png("TajD_Ninteract_outliers.png", width = 300, height = 430, units = "mm", res = 100)
# Close the PNG device even if drawing fails, so that a failed run does not
# leave later plots redirected into this file.
invisible(tryCatch(
  grid.arrange(nem_plot, mau_plot,
               ton_plot, hec_plot,
               nig_plot, ncol = 1),
  finally = dev.off()
))


