library(ggrepel)
library(ggplot2)
library(interactions)


# The input tables below are read by file name only, so the working directory
# must be the folder holding the "*_windowstats.concat_FST__density.txt" files.
# NOTE(review): this is a machine-specific absolute path; adjust before running elsewhere.
setwd('/Users/Shared/Previously Relocated Items/Security/projects/2017_SEAsian_macaque_genomz/Fst_windows/all_fst_outputz')

# Labels of the ten pairwise comparisons; each label is also the prefix of the
# input file read for that comparison and the name of the data frame it is stored in.
pairwise_vector <- c(
  "nem_mau", "nem_ton", "nem_hec", "nem_nig",
  "mau_ton", "mau_hec", "mau_nig",
  "ton_hec", "ton_nig",
  "hec_nig"
)
# Matching plot-object names, one per comparison: "<pair>_plot".
pairwise_vector_plot <- paste0(pairwise_vector, "_plot")

# Read the windowed FST table of every comparison and store it in a variable
# named after the comparison (nem_mau, nem_ton, ...); later code refers to
# these data frames by name.
for (pair in pairwise_vector) {
  # `pair` is already a character string, so it is used directly in the file name
  a <- read.table(paste0(pair, "_windowstats.concat_FST__density.txt"), header = TRUE)
  assign(pair, a)
}

# Tag every comparison's data frame with its own label in a `pair` column,
# so the label can serve as a faceting variable.
for (pair in pairwise_vector) {
  assign(pair, local({
    windows <- get(pair)
    windows$pair <- pair
    windows
  }))
}

#
# For each comparison, keep only the windows flagged as containing genes
# (containsgenes == 1) and store them as "<pair>_allgenez".
for (pair in pairwise_vector) {
  assign(paste0(pair, "_allgenez"), local({
    windows <- get(pair)
    windows[windows$containsgenes == 1, ]
  }))
}

# Per-comparison model of Fst on containsNinteractgenez, number_of_genes and
# their interaction, fitted to the gene-containing windows (*_allgenez subsets).
# glm() is called without a family argument, so R's default gaussian family is
# used, i.e. this is an ordinary linear model ("lm" in the author's wording).
# Author's recorded result: no interactions, but also no significant containsNinteractgenez.
# Each summary() is its own top-level call so the coefficient table is shown
# when the script is run line by line.
nem_mau_mod <- glm(Fst ~ containsNinteractgenez*number_of_genes, data=nem_mau_allgenez)
summary(nem_mau_mod)
nem_ton_mod <- glm(Fst ~ containsNinteractgenez*number_of_genes, data=nem_ton_allgenez)
summary(nem_ton_mod)
nem_hec_mod <- glm(Fst ~ containsNinteractgenez*number_of_genes, data=nem_hec_allgenez)
summary(nem_hec_mod)
nem_nig_mod <- glm(Fst ~ containsNinteractgenez*number_of_genes, data=nem_nig_allgenez)
summary(nem_nig_mod)
mau_ton_mod <- glm(Fst ~ containsNinteractgenez*number_of_genes, data=mau_ton_allgenez)
summary(mau_ton_mod)
mau_hec_mod <- glm(Fst ~ containsNinteractgenez*number_of_genes, data=mau_hec_allgenez)
summary(mau_hec_mod)
mau_nig_mod <- glm(Fst ~ containsNinteractgenez*number_of_genes, data=mau_nig_allgenez)
summary(mau_nig_mod)
ton_hec_mod <- glm(Fst ~ containsNinteractgenez*number_of_genes, data=ton_hec_allgenez)
summary(ton_hec_mod)
ton_nig_mod <- glm(Fst ~ containsNinteractgenez*number_of_genes, data=ton_nig_allgenez)
summary(ton_nig_mod)
hec_nig_mod <- glm(Fst ~ containsNinteractgenez*number_of_genes, data=hec_nig_allgenez)
summary(hec_nig_mod)

# Interaction plots, one per comparison, drawn from the interaction models
# fitted above: containsNinteractgenez is the focal predictor (pred) and
# number_of_genes the moderator (modx). The calls are not assigned, so each
# plot is displayed directly when the line is run interactively.
interact_plot(nem_mau_mod, pred = containsNinteractgenez, modx = number_of_genes)
interact_plot(nem_ton_mod, pred = containsNinteractgenez, modx = number_of_genes)
interact_plot(nem_hec_mod, pred = containsNinteractgenez, modx = number_of_genes)
interact_plot(nem_nig_mod, pred = containsNinteractgenez, modx = number_of_genes)
interact_plot(mau_ton_mod, pred = containsNinteractgenez, modx = number_of_genes)
interact_plot(mau_hec_mod, pred = containsNinteractgenez, modx = number_of_genes)
interact_plot(mau_nig_mod, pred = containsNinteractgenez, modx = number_of_genes)
interact_plot(ton_hec_mod, pred = containsNinteractgenez, modx = number_of_genes)
interact_plot(ton_nig_mod, pred = containsNinteractgenez, modx = number_of_genes)
interact_plot(hec_nig_mod, pred = containsNinteractgenez, modx = number_of_genes)


# Reduced per-comparison models: Fst on containsNinteractgenez only (no
# number_of_genes term and no interaction), fitted to the same *_allgenez
# subsets with glm()'s default gaussian family.
# Author's recorded result: all significant.
nem_mau_mod_nointeraction <- glm(Fst ~ containsNinteractgenez, data=nem_mau_allgenez)
summary(nem_mau_mod_nointeraction)
nem_ton_mod_nointeraction <- glm(Fst ~ containsNinteractgenez, data=nem_ton_allgenez)
summary(nem_ton_mod_nointeraction)
nem_hec_mod_nointeraction <- glm(Fst ~ containsNinteractgenez, data=nem_hec_allgenez)
summary(nem_hec_mod_nointeraction)
nem_nig_mod_nointeraction <- glm(Fst ~ containsNinteractgenez, data=nem_nig_allgenez)
summary(nem_nig_mod_nointeraction)
mau_ton_mod_nointeraction <- glm(Fst ~ containsNinteractgenez, data=mau_ton_allgenez)
summary(mau_ton_mod_nointeraction)
mau_hec_mod_nointeraction <- glm(Fst ~ containsNinteractgenez, data=mau_hec_allgenez)
summary(mau_hec_mod_nointeraction)
mau_nig_mod_nointeraction <- glm(Fst ~ containsNinteractgenez, data=mau_nig_allgenez)
summary(mau_nig_mod_nointeraction)
ton_hec_mod_nointeraction <- glm(Fst ~ containsNinteractgenez, data=ton_hec_allgenez)
summary(ton_hec_mod_nointeraction)
ton_nig_mod_nointeraction <- glm(Fst ~ containsNinteractgenez, data=ton_nig_allgenez)
summary(ton_nig_mod_nointeraction)
hec_nig_mod_nointeraction <- glm(Fst ~ containsNinteractgenez, data=hec_nig_allgenez)
summary(hec_nig_mod_nointeraction)





#nem_mau
    # Builds nem_mau_plot (panel "A"): Fst against number of genes per window,
    # with Ninteract windows highlighted and their Cook's-distance outliers flagged.

    # keep only the windows flagged as containing genes
    my_data_only_genez <- nem_mau[nem_mau$containsgenes == 1,]

    # explore relationship between Fst and number of genes in Ninteract windows
    # (these two subsets are not used again within this section)
    my_data_only_nonNinteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0,]
    my_data_only_Ninteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1,]
    my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[complete.cases(my_data_only_nonNinteractgenez),]

    # drop incomplete rows so the model-derived vectors below have one entry per row
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez),]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(Fst ~ number_of_genes, data = my_data_only_genez)
    # get the fitted values (y = mx+b) and attach them as column `fitted`
    fitted <- mod$coefficients[2]*my_data_only_genez$number_of_genes + mod$coefficients[1]
    my_data_only_genez <- cbind(my_data_only_genez, fitted)

    # calculate cooks d for all data and attach it as column `cooksd`
    cooksd <- cooks.distance(mod)
    my_data_only_genez <- cbind(my_data_only_genez, cooksd)

    # row masks shared by the colouring, the proportions and the name lookup below;
    # the outlier cut-off (4 x mean Cook's distance) is computed once
    is_high_cooksd <- cooksd >= 4*mean(cooksd, na.rm = TRUE)
    is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
    is_not_Ninteract <- my_data_only_genez$containsNinteractgenez == 0
    is_above_fit <- my_data_only_genez$Fst > my_data_only_genez$fitted
    is_below_fit <- my_data_only_genez$Fst < my_data_only_genez$fitted

    # make a color column for non-Ninteract (gray) and Ninteract (pink) windows
    my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
    # make an alpha column: gray windows are slightly transparent; set before the
    # outlier recolouring so every Ninteract window keeps alpha 1
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # flag cooksd outliers, but only for Ninteract windows
    my_data_only_genez$color[is_high_cooksd & is_Ninteract] <- "red"
    # now color the ones that are below the fitted line blue
    my_data_only_genez$color[is_high_cooksd & is_below_fit & is_Ninteract] <- "blue"

    # get some numbers (left as top-level expressions so they print when run)
    # What is the expected proportion of upper outliers
    # (high cooksD, above the fitted line, among non-Ninteract windows)
    nrow(my_data_only_genez[is_high_cooksd & is_not_Ninteract & is_above_fit,])/
      nrow(my_data_only_genez[is_not_Ninteract,])
    #  0.02773819
    # What is the observed proportion of upper outliers
    # (high cooksD, above the fitted line, among Ninteract windows)
    nrow(my_data_only_genez[is_high_cooksd & is_Ninteract & is_above_fit,])/
      nrow(my_data_only_genez[is_Ninteract,])
    #  0.09313725

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[is_high_cooksd & is_Ninteract & is_above_fit]
#     [1] MRPS21        MRPL55        NDUFB3        UQCRC1        NDUFAF3       NDUFC1        NDUFA2        HARS2        
#    [9] MRPL2         C11orf83      NDUFS3        TMEM126B      ATP5B         NDUFA4L2      MRPL52        SYNJ2BP-COX16
#    [17] MRPL40        SCO2          COA3 

    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # on now plot the data with the color representing outliers for N_interact genes;
    # layers are added gray -> pink -> red -> blue so flagged windows are drawn on top
    nem_mau_plot <- ggplot(my_data_only_genez, aes(x = number_of_genes, y = Fst, color=color, fill=color)) +
      geom_smooth(data = my_data_only_genez, method=lm, se=TRUE, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'gray') +
      geom_point(data = subset(my_data_only_genez, color == 'pink'),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'pink') +
      geom_point(data = subset(my_data_only_genez, color == 'red'),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'red') +
      # NOTE(review): this layer previously used color = 'red' although the rows are
      # labelled "blue" above; switch back to 'red' if that was the intended figure.
      geom_point(data = subset(my_data_only_genez, color == 'blue'),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'blue') +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#                       ) +
      xlim(0,16) + scale_y_continuous(limits = c(0.15,0.8), breaks = c(0,0.5,0.75)) + labs(x = element_blank(), y=expression(paste(italic(F[ST]))), tag = "A") +
      theme_classic(base_size=16) + theme(legend.position = "none") +
      theme(axis.text.x=element_blank()) +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
    
   
#nem_ton
    # Builds nem_ton_plot (panel "B"): Fst against number of genes per window,
    # with Ninteract windows highlighted and their Cook's-distance outliers flagged.

    # keep only the windows flagged as containing genes
    my_data_only_genez <- nem_ton[nem_ton$containsgenes == 1,]

    # explore relationship between Fst and number of genes in Ninteract windows
    # (these two subsets are not used again within this section)
    my_data_only_nonNinteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0,]
    my_data_only_Ninteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1,]
    my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[complete.cases(my_data_only_nonNinteractgenez),]

    # drop incomplete rows so the model-derived vectors below have one entry per row
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez),]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(Fst ~ number_of_genes, data = my_data_only_genez)
    # get the fitted values (y = mx+b) and attach them as column `fitted`
    fitted <- mod$coefficients[2]*my_data_only_genez$number_of_genes + mod$coefficients[1]
    my_data_only_genez <- cbind(my_data_only_genez, fitted)

    # calculate cooks d for all data and attach it as column `cooksd`
    cooksd <- cooks.distance(mod)
    my_data_only_genez <- cbind(my_data_only_genez, cooksd)

    # row masks shared by the colouring, the proportions and the name lookup below;
    # the outlier cut-off (4 x mean Cook's distance) is computed once
    is_high_cooksd <- cooksd >= 4*mean(cooksd, na.rm = TRUE)
    is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
    is_not_Ninteract <- my_data_only_genez$containsNinteractgenez == 0
    is_above_fit <- my_data_only_genez$Fst > my_data_only_genez$fitted
    is_below_fit <- my_data_only_genez$Fst < my_data_only_genez$fitted

    # make a color column for non-Ninteract (gray) and Ninteract (pink) windows
    my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
    # make an alpha column: gray windows are slightly transparent; set before the
    # outlier recolouring so every Ninteract window keeps alpha 1
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # flag cooksd outliers, but only for Ninteract windows
    my_data_only_genez$color[is_high_cooksd & is_Ninteract] <- "red"
    # now color the ones that are below the fitted line blue
    my_data_only_genez$color[is_high_cooksd & is_below_fit & is_Ninteract] <- "blue"

    # get some numbers (left as top-level expressions so they print when run)
    # What is the expected proportion of upper outliers
    # (high cooksD, above the fitted line, among non-Ninteract windows)
    nrow(my_data_only_genez[is_high_cooksd & is_not_Ninteract & is_above_fit,])/
      nrow(my_data_only_genez[is_not_Ninteract,])
    #  0.02905383
    # What is the observed proportion of upper outliers
    # (high cooksD, above the fitted line, among Ninteract windows)
    nrow(my_data_only_genez[is_high_cooksd & is_Ninteract & is_above_fit,])/
      nrow(my_data_only_genez[is_Ninteract,])
    #  0.06372549

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[is_high_cooksd & is_Ninteract & is_above_fit]
    #     [1] MRPS21   MRPL53   UQCRC1   NDUFAF3  NDUFA2   HARS2    MRPS18B  MRPL2    C11orf83 NDUFS3   MRPL48   ATP5B    SCO2 

    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # on now plot the data with the color representing outliers for N_interact genes;
    # layers are added gray -> pink -> red -> blue so flagged windows are drawn on top
    nem_ton_plot <- ggplot(my_data_only_genez, aes(x = number_of_genes, y = Fst, color=color, fill=color)) +
      geom_smooth(data = my_data_only_genez, method=lm, se=TRUE, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'gray') +
      geom_point(data = subset(my_data_only_genez, color == 'pink'),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'pink') +
      geom_point(data = subset(my_data_only_genez, color == 'red'),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'red') +
      # NOTE(review): this layer previously used color = 'red' although the rows are
      # labelled "blue" above; switch back to 'red' if that was the intended figure.
      geom_point(data = subset(my_data_only_genez, color == 'blue'),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'blue') +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#      ) +
      xlim(0,16) + scale_y_continuous(limits = c(0.15,0.8), breaks = c(0,0.5,0.75)) + labs(x = element_blank(), y=element_blank(), tag = "B") +
      theme_classic(base_size=16) + theme(legend.position = "none") +
      theme(axis.text.x=element_blank(),axis.text.y=element_blank()) +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
    
#nem_hec
    # Builds nem_hec_plot (panel "C"): Fst against number of genes per window,
    # with Ninteract windows highlighted and their Cook's-distance outliers flagged.

    # keep only the windows flagged as containing genes
    my_data_only_genez <- nem_hec[nem_hec$containsgenes == 1,]

    # explore relationship between Fst and number of genes in Ninteract windows
    # (these two subsets are not used again within this section)
    my_data_only_nonNinteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0,]
    my_data_only_Ninteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1,]
    my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[complete.cases(my_data_only_nonNinteractgenez),]

    # drop incomplete rows so the model-derived vectors below have one entry per row
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez),]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(Fst ~ number_of_genes, data = my_data_only_genez)
    # get the fitted values (y = mx+b) and attach them as column `fitted`
    fitted <- mod$coefficients[2]*my_data_only_genez$number_of_genes + mod$coefficients[1]
    my_data_only_genez <- cbind(my_data_only_genez, fitted)

    # calculate cooks d for all data and attach it as column `cooksd`
    cooksd <- cooks.distance(mod)
    my_data_only_genez <- cbind(my_data_only_genez, cooksd)

    # row masks shared by the colouring, the proportions and the name lookup below;
    # the outlier cut-off (4 x mean Cook's distance) is computed once
    is_high_cooksd <- cooksd >= 4*mean(cooksd, na.rm = TRUE)
    is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
    is_not_Ninteract <- my_data_only_genez$containsNinteractgenez == 0
    is_above_fit <- my_data_only_genez$Fst > my_data_only_genez$fitted
    is_below_fit <- my_data_only_genez$Fst < my_data_only_genez$fitted

    # make a color column for non-Ninteract (gray) and Ninteract (pink) windows
    my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
    # make an alpha column: gray windows are slightly transparent; set before the
    # outlier recolouring so every Ninteract window keeps alpha 1
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # flag cooksd outliers, but only for Ninteract windows
    my_data_only_genez$color[is_high_cooksd & is_Ninteract] <- "red"
    # now color the ones that are below the fitted line blue
    my_data_only_genez$color[is_high_cooksd & is_below_fit & is_Ninteract] <- "blue"

    # get some numbers (left as top-level expressions so they print when run)
    # What is the expected proportion of upper outliers
    # (high cooksD, above the fitted line, among non-Ninteract windows)
    nrow(my_data_only_genez[is_high_cooksd & is_not_Ninteract & is_above_fit,])/
      nrow(my_data_only_genez[is_not_Ninteract,])
    #  0.02729964
    # What is the observed proportion of upper outliers
    # (high cooksD, above the fitted line, among Ninteract windows)
    nrow(my_data_only_genez[is_high_cooksd & is_Ninteract & is_above_fit,])/
      nrow(my_data_only_genez[is_Ninteract,])
    #  0.1029412

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[is_high_cooksd & is_Ninteract & is_above_fit]
#      [1] UQCRH         MRPS21        MRPL9         MRPL55        MRPL53        UQCRC1        NDUFAF3       UQCRQ        
#    [9] NDUFA2        HARS2         C11orf83      NDUFS3        MRPL48        ATP5B         NDUFA4L2      COX5A        
#    [17] MRPL52        COX16         SYNJ2BP-COX16 MRPL40        SCO2  

    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # on now plot the data with the color representing outliers for N_interact genes;
    # layers are added gray -> pink -> red -> blue so flagged windows are drawn on top
    nem_hec_plot <- ggplot(my_data_only_genez, aes(x = number_of_genes, y = Fst, color=color, fill=color)) +
      geom_smooth(data = my_data_only_genez, method=lm, se=TRUE, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'gray') +
      geom_point(data = subset(my_data_only_genez, color == 'pink'),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'pink') +
      geom_point(data = subset(my_data_only_genez, color == 'red'),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'red') +
      # NOTE(review): this layer previously used color = 'red' although the rows are
      # labelled "blue" above; switch back to 'red' if that was the intended figure.
      geom_point(data = subset(my_data_only_genez, color == 'blue'),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'blue') +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#      ) +
      xlim(0,16) + scale_y_continuous(limits = c(0.15,0.8), breaks = c(0,0.5,0.75)) + labs(x = element_blank(), y=expression(paste(italic(F[ST]))), tag = "C") +
      theme_classic(base_size=16) + theme(legend.position = "none") +
      theme(axis.text.x=element_blank())+
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
    
#nem_nig
    # Builds nem_nig_plot (panel "D"): Fst against number of genes per window,
    # with Ninteract windows highlighted and their Cook's-distance outliers flagged.

    # keep only the windows flagged as containing genes
    my_data_only_genez <- nem_nig[nem_nig$containsgenes == 1,]

    # explore relationship between Fst and number of genes in Ninteract windows
    # (these two subsets are not used again within this section)
    my_data_only_nonNinteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0,]
    my_data_only_Ninteractgenez <- my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1,]
    my_data_only_nonNinteractgenez <- my_data_only_nonNinteractgenez[complete.cases(my_data_only_nonNinteractgenez),]

    # drop incomplete rows so the model-derived vectors below have one entry per row
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez),]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(Fst ~ number_of_genes, data = my_data_only_genez)
    # get the fitted values (y = mx+b) and attach them as column `fitted`
    fitted <- mod$coefficients[2]*my_data_only_genez$number_of_genes + mod$coefficients[1]
    my_data_only_genez <- cbind(my_data_only_genez, fitted)

    # calculate cooks d for all data and attach it as column `cooksd`
    cooksd <- cooks.distance(mod)
    my_data_only_genez <- cbind(my_data_only_genez, cooksd)

    # row masks shared by the colouring, the proportions and the name lookup below;
    # the outlier cut-off (4 x mean Cook's distance) is computed once
    is_high_cooksd <- cooksd >= 4*mean(cooksd, na.rm = TRUE)
    is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
    is_not_Ninteract <- my_data_only_genez$containsNinteractgenez == 0
    is_above_fit <- my_data_only_genez$Fst > my_data_only_genez$fitted
    is_below_fit <- my_data_only_genez$Fst < my_data_only_genez$fitted

    # make a color column for non-Ninteract (gray) and Ninteract (pink) windows
    my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
    # make an alpha column: gray windows are slightly transparent; set before the
    # outlier recolouring so every Ninteract window keeps alpha 1
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # flag cooksd outliers, but only for Ninteract windows
    my_data_only_genez$color[is_high_cooksd & is_Ninteract] <- "red"
    # now color the ones that are below the fitted line blue
    my_data_only_genez$color[is_high_cooksd & is_below_fit & is_Ninteract] <- "blue"

    # get some numbers (left as top-level expressions so they print when run)
    # What is the expected proportion of upper outliers
    # (high cooksD, above the fitted line, among non-Ninteract windows)
    nrow(my_data_only_genez[is_high_cooksd & is_not_Ninteract & is_above_fit,])/
      nrow(my_data_only_genez[is_not_Ninteract,])
    #  0.02817674
    # What is the observed proportion of upper outliers
    # (high cooksD, above the fitted line, among Ninteract windows)
    nrow(my_data_only_genez[is_high_cooksd & is_Ninteract & is_above_fit,])/
      nrow(my_data_only_genez[is_Ninteract,])
    #  0.07352941

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[is_high_cooksd & is_Ninteract & is_above_fit]
#    [1] MRPS21   MRPL55   UQCRC1   NDUFAF3  UQCRQ    NDUFA2   HARS2    MRPL2    NDUFB6   C11orf83 NDUFS3   ATP5B    NDUFA4L2
#    [14] COX5A    COA3  
    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # on now plot the data with the color representing outliers for N_interact genes;
    # layers are added gray -> pink -> red -> blue so flagged windows are drawn on top
    nem_nig_plot <- ggplot(my_data_only_genez, aes(x = number_of_genes, y = Fst, color=color, fill=color)) +
      geom_smooth(data = my_data_only_genez, method=lm, se=TRUE, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'gray') +
      geom_point(data = subset(my_data_only_genez, color == 'pink'),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'pink') +
      geom_point(data = subset(my_data_only_genez, color == 'red'),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'red') +
      # NOTE(review): this layer previously used color = 'red' although the rows are
      # labelled "blue" above; switch back to 'red' if that was the intended figure.
      geom_point(data = subset(my_data_only_genez, color == 'blue'),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = 'blue') +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#      ) +
      xlim(0,16) + scale_y_continuous(limits = c(0.15,0.8), breaks = c(0,0.5,0.75)) + labs(x = element_blank(), y=element_blank(), tag = "D") +
      theme_classic(base_size=16) + theme(legend.position = "none") +
      theme(axis.text.x=element_blank(),axis.text.y=element_blank())+
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
    
#mau_ton
    # Windows of the mau_ton comparison that contain genes, restricted to rows
    # with no missing value in any column.
    my_data_only_genez <- mau_ton[mau_ton$containsgenes == 1, ]
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez), ]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(Fst ~ number_of_genes, data = my_data_only_genez)
    # store the fitted values (y = mx+b) as a column; they are taken from the
    # model so no global vector sharing its name with stats::fitted() is needed
    my_data_only_genez$fitted <- unname(stats::fitted(mod))

    # calculate cooks d for all data and store it as a column
    cooksd <- cooks.distance(mod)
    my_data_only_genez$cooksd <- unname(cooksd)

    # Row flags shared by the colouring and the summaries below (computed once
    # instead of being re-derived inside every expression).
    # Outlier rule: Cook's distance >= 4 x the mean Cook's distance.
    cooks_outlier <- unname(cooksd >= 4 * mean(cooksd, na.rm = TRUE))
    Ninteract_win <- my_data_only_genez$containsNinteractgenez == 1
    background_win <- my_data_only_genez$containsNinteractgenez == 0
    above_fit <- my_data_only_genez$Fst > my_data_only_genez$fitted
    below_fit <- my_data_only_genez$Fst < my_data_only_genez$fitted

    # make a column to specify whether a window is an Ninteract window or not
    my_data_only_genez$color <- ifelse(Ninteract_win, "pink", "gray")
    # NOTE(review): alpha is mapped through aes() below, so ggplot2 rescales
    # these values with its default alpha scale rather than using them literally.
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # mark Cook's distance outliers, but only for Ninteract windows
    my_data_only_genez$color[cooks_outlier & Ninteract_win] <- "red"
    # now mark the outliers that are below the fitted line "blue"
    my_data_only_genez$color[cooks_outlier & Ninteract_win & below_fit] <- "blue"

    # get some numbers
    # What is the expected proportion of upper outliers
    # (high Cook's distance, above the line, among non-Ninteract windows)
    sum(cooks_outlier & background_win & above_fit) / sum(background_win)
    #  0.02960202
    # What is the observed proportion of upper outliers
    # (high Cook's distance, above the line, among Ninteract windows)
    sum(cooks_outlier & Ninteract_win & above_fit) / sum(Ninteract_win)
    #  0.07843137

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[cooks_outlier & Ninteract_win & above_fit]
#    [1] MRPS21        MRPL55        MRPL53        UQCRC1        NDUFAF3       UQCRQ         NDUFA2        HARS2        
#    [9] MRPL18        C11orf83      MRPL17        ATP5B         SYNJ2BP-COX16 MRPS34        NDUFB10       TACO1 

    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # ok, now plot the data with the color representing outliers for N_interact genes.
    # Each color category is drawn as its own layer, so the drawing order is
    # gray, pink, red, then the "blue" (below-fit) outliers on top.
    mau_ton_plot <- ggplot(my_data_only_genez,
                           aes(x = number_of_genes, y = Fst, color = color, fill = color)) +
      geom_smooth(data = my_data_only_genez, method = "lm", se = TRUE, fullrange = TRUE,
                  colour = "gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "gray") +
      geom_point(data = subset(my_data_only_genez, color == "pink"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "red"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "red") +
      # NOTE(review): the "blue" (below-fit) outliers are drawn in red, exactly
      # like the upper outliers; confirm this is intended and not a copy-paste slip.
      geom_point(data = subset(my_data_only_genez, color == "blue"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "red") +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#      ) +
      xlim(0, 16) +
      scale_y_continuous(limits = c(0.15, 0.8), breaks = c(0, 0.5, 0.75)) +
      labs(x = element_blank(), y = expression(paste(italic(F[ST]))), tag = "E") +
      theme_classic(base_size = 16) + theme(legend.position = "none") +
      theme(axis.text.x = element_blank()) +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
    
#mau_hec
    # Windows of the mau_hec comparison that contain genes, restricted to rows
    # with no missing value in any column.
    my_data_only_genez <- mau_hec[mau_hec$containsgenes == 1, ]
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez), ]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(Fst ~ number_of_genes, data = my_data_only_genez)
    # store the fitted values (y = mx+b) as a column; they are taken from the
    # model so no global vector sharing its name with stats::fitted() is needed
    my_data_only_genez$fitted <- unname(stats::fitted(mod))

    # calculate cooks d for all data and store it as a column
    cooksd <- cooks.distance(mod)
    my_data_only_genez$cooksd <- unname(cooksd)

    # Row flags shared by the colouring and the summaries below (computed once
    # instead of being re-derived inside every expression).
    # Outlier rule: Cook's distance >= 4 x the mean Cook's distance.
    cooks_outlier <- unname(cooksd >= 4 * mean(cooksd, na.rm = TRUE))
    Ninteract_win <- my_data_only_genez$containsNinteractgenez == 1
    background_win <- my_data_only_genez$containsNinteractgenez == 0
    above_fit <- my_data_only_genez$Fst > my_data_only_genez$fitted
    below_fit <- my_data_only_genez$Fst < my_data_only_genez$fitted

    # make a column to specify whether a window is an Ninteract window or not
    my_data_only_genez$color <- ifelse(Ninteract_win, "pink", "gray")
    # NOTE(review): alpha is mapped through aes() below, so ggplot2 rescales
    # these values with its default alpha scale rather than using them literally.
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # mark Cook's distance outliers, but only for Ninteract windows
    my_data_only_genez$color[cooks_outlier & Ninteract_win] <- "red"
    # now mark the outliers that are below the fitted line "blue"
    my_data_only_genez$color[cooks_outlier & Ninteract_win & below_fit] <- "blue"

    # get some numbers
    # What is the expected proportion of upper outliers
    # (high Cook's distance, above the line, among non-Ninteract windows)
    sum(cooks_outlier & background_win & above_fit) / sum(background_win)
    #  0.03201403
    # What is the observed proportion of upper outliers
    # (high Cook's distance, above the line, among Ninteract windows)
    sum(cooks_outlier & Ninteract_win & above_fit) / sum(Ninteract_win)
    #  0.06372549

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[cooks_outlier & Ninteract_win & above_fit]
#     [1] MRPS21        MRPL55        NDUFAF3       UQCRQ         NDUFA2        HARS2         C11orf83      COX6A1       
#    [9] COX16         SYNJ2BP-COX16 MRPL40        MRPS34        MRPL10 

    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # ok, now plot the data with the color representing outliers for N_interact genes.
    # Each color category is drawn as its own layer, so the drawing order is
    # gray, pink, red, then the "blue" (below-fit) outliers on top.
    mau_hec_plot <- ggplot(my_data_only_genez,
                           aes(x = number_of_genes, y = Fst, color = color, fill = color)) +
      geom_smooth(data = my_data_only_genez, method = "lm", se = TRUE, fullrange = TRUE,
                  colour = "gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "gray") +
      geom_point(data = subset(my_data_only_genez, color == "pink"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "red"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "red") +
      # NOTE(review): the "blue" (below-fit) outliers are drawn in red, exactly
      # like the upper outliers; confirm this is intended and not a copy-paste slip.
      geom_point(data = subset(my_data_only_genez, color == "blue"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "red") +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#      ) +
      xlim(0, 16) +
      scale_y_continuous(limits = c(0.15, 0.8), breaks = c(0, 0.5, 0.75)) +
      labs(x = element_blank(), y = element_blank(), tag = "F") +
      theme_classic(base_size = 16) + theme(legend.position = "none") +
      theme(axis.text.x = element_blank(), axis.text.y = element_blank()) +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
    
#mau_nig
    # Windows of the mau_nig comparison that contain genes, restricted to rows
    # with no missing value in any column.
    my_data_only_genez <- mau_nig[mau_nig$containsgenes == 1, ]
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez), ]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(Fst ~ number_of_genes, data = my_data_only_genez)
    # store the fitted values (y = mx+b) as a column; they are taken from the
    # model so no global vector sharing its name with stats::fitted() is needed
    my_data_only_genez$fitted <- unname(stats::fitted(mod))

    # calculate cooks d for all data and store it as a column
    cooksd <- cooks.distance(mod)
    my_data_only_genez$cooksd <- unname(cooksd)

    # Row flags shared by the colouring and the summaries below (computed once
    # instead of being re-derived inside every expression).
    # Outlier rule: Cook's distance >= 4 x the mean Cook's distance.
    cooks_outlier <- unname(cooksd >= 4 * mean(cooksd, na.rm = TRUE))
    Ninteract_win <- my_data_only_genez$containsNinteractgenez == 1
    background_win <- my_data_only_genez$containsNinteractgenez == 0
    above_fit <- my_data_only_genez$Fst > my_data_only_genez$fitted
    below_fit <- my_data_only_genez$Fst < my_data_only_genez$fitted

    # make a column to specify whether a window is an Ninteract window or not
    my_data_only_genez$color <- ifelse(Ninteract_win, "pink", "gray")
    # NOTE(review): alpha is mapped through aes() below, so ggplot2 rescales
    # these values with its default alpha scale rather than using them literally.
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # mark Cook's distance outliers, but only for Ninteract windows
    my_data_only_genez$color[cooks_outlier & Ninteract_win] <- "red"
    # now mark the outliers that are below the fitted line "blue"
    my_data_only_genez$color[cooks_outlier & Ninteract_win & below_fit] <- "blue"

    # get some numbers
    # What is the expected proportion of upper outliers
    # (high Cook's distance, above the line, among non-Ninteract windows)
    sum(cooks_outlier & background_win & above_fit) / sum(background_win)
    #  0.02708036
    # What is the observed proportion of upper outliers
    # (high Cook's distance, above the line, among Ninteract windows)
    sum(cooks_outlier & Ninteract_win & above_fit) / sum(Ninteract_win)
    #  0.07843137

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[cooks_outlier & Ninteract_win & above_fit]
#   [1] MRPS21        MRPL55        UQCRC1        NDUFAF3       ACAD9         UQCRQ         NDUFA2        HARS2        
#    [9] MRPL2         C11orf83      TMEM126B      ATP5B         SYNJ2BP-COX16 MRPS34        NDUFB10       COA3         

    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # ok, now plot the data with the color representing outliers for N_interact genes.
    # Each color category is drawn as its own layer, so the drawing order is
    # gray, pink, red, then the "blue" (below-fit) outliers on top.
    mau_nig_plot <- ggplot(my_data_only_genez,
                           aes(x = number_of_genes, y = Fst, color = color, fill = color)) +
      geom_smooth(data = my_data_only_genez, method = "lm", se = TRUE, fullrange = TRUE,
                  colour = "gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "gray") +
      geom_point(data = subset(my_data_only_genez, color == "pink"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "red"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "red") +
      # NOTE(review): the "blue" (below-fit) outliers are drawn in red, exactly
      # like the upper outliers; confirm this is intended and not a copy-paste slip.
      geom_point(data = subset(my_data_only_genez, color == "blue"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "red") +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#      ) +
      xlim(0, 16) +
      scale_y_continuous(limits = c(0.15, 0.8), breaks = c(0, 0.5, 0.75)) +
      labs(x = element_blank(), y = expression(paste(italic(F[ST]))), tag = "G") +
      theme_classic(base_size = 16) + theme(legend.position = "none") +
      theme(axis.text.x = element_blank()) +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))

#ton_hec
    # Windows of the ton_hec comparison that contain genes, restricted to rows
    # with no missing value in any column.
    my_data_only_genez <- ton_hec[ton_hec$containsgenes == 1, ]
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez), ]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(Fst ~ number_of_genes, data = my_data_only_genez)
    # store the fitted values (y = mx+b) as a column; they are taken from the
    # model so no global vector sharing its name with stats::fitted() is needed
    my_data_only_genez$fitted <- unname(stats::fitted(mod))

    # calculate cooks d for all data and store it as a column
    cooksd <- cooks.distance(mod)
    my_data_only_genez$cooksd <- unname(cooksd)

    # Row flags shared by the colouring and the summaries below (computed once
    # instead of being re-derived inside every expression).
    # Outlier rule: Cook's distance >= 4 x the mean Cook's distance.
    cooks_outlier <- unname(cooksd >= 4 * mean(cooksd, na.rm = TRUE))
    Ninteract_win <- my_data_only_genez$containsNinteractgenez == 1
    background_win <- my_data_only_genez$containsNinteractgenez == 0
    above_fit <- my_data_only_genez$Fst > my_data_only_genez$fitted
    below_fit <- my_data_only_genez$Fst < my_data_only_genez$fitted

    # make a column to specify whether a window is an Ninteract window or not
    my_data_only_genez$color <- ifelse(Ninteract_win, "pink", "gray")
    # NOTE(review): alpha is mapped through aes() below, so ggplot2 rescales
    # these values with its default alpha scale rather than using them literally.
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # mark Cook's distance outliers, but only for Ninteract windows
    my_data_only_genez$color[cooks_outlier & Ninteract_win] <- "red"
    # now mark the outliers that are below the fitted line "blue"
    my_data_only_genez$color[cooks_outlier & Ninteract_win & below_fit] <- "blue"

    # get some numbers
    # What is the expected proportion of upper outliers
    # (high Cook's distance, above the line, among non-Ninteract windows)
    sum(cooks_outlier & background_win & above_fit) / sum(background_win)
    #  0.03201403
    # NOTE(review): this recorded value is identical to the one noted for
    # mau_hec; confirm it was re-run for ton_hec and is not a stale copy.
    # What is the observed proportion of upper outliers
    # (high Cook's distance, above the line, among Ninteract windows)
    sum(cooks_outlier & Ninteract_win & above_fit) / sum(Ninteract_win)
    #  0.09313725

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[cooks_outlier & Ninteract_win & above_fit]
#    [1] MRPS21        NDUFS2        MRPL53        UQCRC1        NDUFAF3       TIMMDC1       UQCRQ         NDUFA2       
#    [9] HARS2         MRPL43        C11orf83      MRPL48        COX6A1        COX16         SYNJ2BP-COX16 MRPS26       
#    [17] MRPL40        MRPL10        TACO1

    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # ok, now plot the data with the color representing outliers for N_interact genes.
    # Each color category is drawn as its own layer, so the drawing order is
    # gray, pink, red, then the "blue" (below-fit) outliers on top.
    ton_hec_plot <- ggplot(my_data_only_genez,
                           aes(x = number_of_genes, y = Fst, color = color, fill = color)) +
      geom_smooth(data = my_data_only_genez, method = "lm", se = TRUE, fullrange = TRUE,
                  colour = "gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "gray") +
      geom_point(data = subset(my_data_only_genez, color == "pink"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "red"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "red") +
      # NOTE(review): the "blue" (below-fit) outliers are drawn in red, exactly
      # like the upper outliers; confirm this is intended and not a copy-paste slip.
      geom_point(data = subset(my_data_only_genez, color == "blue"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "red") +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#      ) +
      xlim(0, 16) +
      scale_y_continuous(limits = c(0.15, 0.8), breaks = c(0, 0.5, 0.75)) +
      labs(x = element_blank(), y = element_blank(), tag = "H") +
      theme_classic(base_size = 16) + theme(legend.position = "none") +
      theme(axis.text.x = element_blank(), axis.text.y = element_blank()) +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))

#ton_nig
    # Windows of the ton_nig comparison that contain genes, restricted to rows
    # with no missing value in any column.
    my_data_only_genez <- ton_nig[ton_nig$containsgenes == 1, ]
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez), ]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(Fst ~ number_of_genes, data = my_data_only_genez)
    # store the fitted values (y = mx+b) as a column; they are taken from the
    # model so no global vector sharing its name with stats::fitted() is needed
    my_data_only_genez$fitted <- unname(stats::fitted(mod))

    # calculate cooks d for all data and store it as a column
    cooksd <- cooks.distance(mod)
    my_data_only_genez$cooksd <- unname(cooksd)

    # Row flags shared by the colouring and the summaries below (computed once
    # instead of being re-derived inside every expression).
    # Outlier rule: Cook's distance >= 4 x the mean Cook's distance.
    cooks_outlier <- unname(cooksd >= 4 * mean(cooksd, na.rm = TRUE))
    Ninteract_win <- my_data_only_genez$containsNinteractgenez == 1
    background_win <- my_data_only_genez$containsNinteractgenez == 0
    above_fit <- my_data_only_genez$Fst > my_data_only_genez$fitted
    below_fit <- my_data_only_genez$Fst < my_data_only_genez$fitted

    # make a column to specify whether a window is an Ninteract window or not
    my_data_only_genez$color <- ifelse(Ninteract_win, "pink", "gray")
    # NOTE(review): alpha is mapped through aes() below, so ggplot2 rescales
    # these values with its default alpha scale rather than using them literally.
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # mark Cook's distance outliers, but only for Ninteract windows
    my_data_only_genez$color[cooks_outlier & Ninteract_win] <- "red"
    # now mark the outliers that are below the fitted line "blue"
    my_data_only_genez$color[cooks_outlier & Ninteract_win & below_fit] <- "blue"

    # get some numbers
    # What is the expected proportion of upper outliers
    # (high Cook's distance, above the line, among non-Ninteract windows)
    sum(cooks_outlier & background_win & above_fit) / sum(background_win)
    #  0.02927311
    # What is the observed proportion of upper outliers
    # (high Cook's distance, above the line, among Ninteract windows)
    sum(cooks_outlier & Ninteract_win & above_fit) / sum(Ninteract_win)
    #  0.07843137

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[cooks_outlier & Ninteract_win & above_fit]
#    [1] ATP5F1        MRPS21        TARS2         NDUFS2        UQCRC1        NDUFAF3       TIMMDC1       NDUFA2       
#    [9] COX7A2        MRPL43        C11orf83      ATP5B         SYNJ2BP-COX16 MRPS26        NDUFB10       TACO1  

    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # ok, now plot the data with the color representing outliers for N_interact genes.
    # Each color category is drawn as its own layer, so the drawing order is
    # gray, pink, red, then the "blue" (below-fit) outliers on top.
    # Unlike the panels above it, this panel keeps its axis text and an x-axis
    # title, and uses a wider (0.5 cm) left margin.
    ton_nig_plot <- ggplot(my_data_only_genez,
                           aes(x = number_of_genes, y = Fst, color = color, fill = color)) +
      geom_smooth(data = my_data_only_genez, method = "lm", se = TRUE, fullrange = TRUE,
                  colour = "gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "gray") +
      geom_point(data = subset(my_data_only_genez, color == "pink"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "red"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "red") +
      # NOTE(review): the "blue" (below-fit) outliers are drawn in red, exactly
      # like the upper outliers; confirm this is intended and not a copy-paste slip.
      geom_point(data = subset(my_data_only_genez, color == "blue"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "red") +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#      ) +
      xlim(0, 16) +
      scale_y_continuous(limits = c(0.15, 0.8), breaks = c(0, 0.5, 0.75)) +
      labs(x = "Number of genes in window", y = expression(paste(italic(F[ST]))), tag = "I") +
      theme_classic(base_size = 16) + theme(legend.position = "none") +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.5), "cm"))
    
#hec_nig
# Panel "J": Fst versus number of genes per window for the hec_nig pair.
# Fits a single linear model to all gene-containing windows, flags windows with
# a high Cook's distance, and highlights the flagged windows that contain
# Ninteract genes.
    # keep only the windows that contain genes
    my_data_only_genez <- hec_nig[hec_nig$containsgenes == 1, ]

    # explore relationship between Fst and number of genes in Ninteract windows
    # (subsets are taken before the complete-case filter applied below)
    my_data_only_nonNinteractgenez <-
      my_data_only_genez[my_data_only_genez$containsNinteractgenez == 0, ]
    my_data_only_Ninteractgenez <-
      my_data_only_genez[my_data_only_genez$containsNinteractgenez == 1, ]
    my_data_only_nonNinteractgenez <-
      my_data_only_nonNinteractgenez[complete.cases(my_data_only_nonNinteractgenez), ]

    # drop rows with any missing value so that the model residuals line up
    # one-to-one with the rows of the data frame
    my_data_only_genez <- my_data_only_genez[complete.cases(my_data_only_genez), ]
    #dim(my_data_only_genez)
    #head(my_data_only_genez)

    # calculate a lm for all data (because there was not a significant interaction term)
    mod <- lm(Fst ~ number_of_genes, data = my_data_only_genez)
    # get the fitted values (y = mx+b) and attach them as a "fitted" column
    fitted <- mod$coefficients[2] * my_data_only_genez$number_of_genes + mod$coefficients[1]
    my_data_only_genez <- cbind(my_data_only_genez, fitted)

    # calculate cooks d for all data and attach it as a "cooksd" column
    cooksd <- cooks.distance(mod)
    my_data_only_genez <- cbind(my_data_only_genez, cooksd)

    # row masks that are reused for the colouring and the summary numbers below
    is_outlier <- cooksd >= 4 * mean(cooksd, na.rm = TRUE)  # cooksD is high
    is_Ninteract <- my_data_only_genez$containsNinteractgenez == 1
    non_Ninteract <- my_data_only_genez$containsNinteractgenez == 0
    above_fit <- my_data_only_genez$Fst > my_data_only_genez$fitted
    below_fit <- my_data_only_genez$Fst < my_data_only_genez$fitted

    # make a column to specify whether a gene is an Ninteract gene or not
    my_data_only_genez$color <- ifelse(is_Ninteract, "pink", "gray")
    my_data_only_genez$alpha <- ifelse(my_data_only_genez$color == "gray", 0.7, 1)
    # mark cooksd outliers, but only for Ninteract genes
    my_data_only_genez$color[is_outlier & is_Ninteract] <- "red"
    # now relabel the outliers that are below the fitted line as "blue"
    my_data_only_genez$color[is_outlier & below_fit & is_Ninteract] <- "blue"


    # get some numbers
    # What is the expected proportion of upper outliers
    # (high cooksD, not an Ninteract gene, above the fitted line)
    sum(is_outlier & non_Ninteract & above_fit) / sum(non_Ninteract)
    #  0.02905383
    # What is the observed proportion of upper outliers
    # (high cooksD, an Ninteract gene, above the fitted line)
    sum(is_outlier & is_Ninteract & above_fit) / sum(is_Ninteract)
    #  0.08823529

    # what are the names of the Ninteract outlier windows?
    my_data_only_genez$Ninteract_acronym[is_outlier & is_Ninteract & above_fit]
#    [1] MRPS21        MRPL55        UQCRC1        NDUFAF3       ACAD9         UQCRQ         NDUFA2        HARS2        
#    [9] MRPL43        C11orf83      COX5A         COX16         SYNJ2BP-COX16 MRPS26        MRPS34        MRPL10       
#    [17] COA3          TACO1  

    # make the color column into an ordered factor
    #my_data_only_genez$color <- factor(my_data_only_genez$color, levels = c("gray", "pink", "red"), 
    #                                   ordered = is.ordered(my_data_only_genez$color))
    # now plot the data with the color representing outliers for N_interact genes;
    # each colour class is added as its own point layer, in the order gray, pink, red, blue
    hec_nig_plot <- ggplot(my_data_only_genez,
                           aes(x = number_of_genes, y = Fst, color = color, fill = color)) +
      geom_smooth(data = my_data_only_genez, method = lm, se = TRUE, fullrange = TRUE,
                  colour = "gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color == 'gray'), method=lm, se=T, fullrange=TRUE, colour="gray", fill = "gray") +
      #geom_smooth(data = subset(my_data_only_genez, color != 'gray'), method=lm, se=T, fullrange=TRUE, colour="pink", fill = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "gray"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "gray") +
      geom_point(data = subset(my_data_only_genez, color == "pink"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "pink") +
      geom_point(data = subset(my_data_only_genez, color == "red"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "red") +
      # NOTE(review): the "blue" class (outliers below the fitted line) is drawn in
      # red here, exactly as in the preceding panel -- confirm whether blue was intended
      geom_point(data = subset(my_data_only_genez, color == "blue"),
                 aes(x = number_of_genes, y = Fst, alpha = alpha), color = "red") +
#      geom_text_repel( data = my_data_only_genez,
#                       #mapping = aes(label = Ninteract_acronym),
#                       mapping = aes(label = ifelse(color == "red",as.character(Ninteract_acronym),'')),
#                       force_pull = 0,
#                       force = 13,
#                       nudge_y = 0.1, nudge_x = 15,
#                       color = "black",
#                       size = 2.5,
#                       box.padding = 0.5, 
#                       #point.padding = 0.5,
#                       direction     = "y",
#                       max.overlaps = Inf,
#                       hjust = 0, #angle = 45, #segment.curvature = -0.05,
#                       segment.size = 0.25,
#                       segment.color = 'grey50'
#      ) +
      xlim(0, 16) +
      scale_y_continuous(limits = c(0.15, 0.8), breaks = c(0, 0.5, 0.75)) +
      # right-hand panel: no y-axis title or tick labels
      labs(x = "Number of genes in window", y = element_blank(), tag = "J") +
      theme_classic(base_size = 16) + theme(legend.position = "none") +
      theme(axis.text.y = element_blank()) +
      theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"))
    
# Assemble the ten pairwise panels into a single two-column figure and write
# it to "Fst_Ninteract_outliers.png" (300 x 430 mm at 100 dpi).
library(gridExtra)
png("Fst_Ninteract_outliers.png", width = 300, height = 430, units = "mm", res = 100)
# panels are laid out row by row, in the order listed
grid.arrange(
  grobs = list(
    nem_mau_plot, nem_ton_plot,
    nem_hec_plot, nem_nig_plot,
    mau_ton_plot, mau_hec_plot,
    mau_nig_plot, ton_hec_plot,
    ton_nig_plot, hec_nig_plot
  ),
  ncol = 2
)
# close the PNG device so the file is flushed to disk
dev.off()


