library("rstudioapi")
library(ggplot2)
library(dplyr)
library(car)
library(lme4)
library(tidyr)
library(MuMIn)
library(rsq)
library(ggpubr)
library("caret")
library("corrplot")
library("reshape2")
library("glmnet")
library(randomForest)
library(caret)
library(broom)
library(gbm)
require(caTools)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(performance)

# Work relative to this script's folder so that "data1.csv" and the generated
# CSV/HTML files resolve next to it. getActiveDocumentContext() is provided by
# rstudioapi, so this line needs the script to be open in an RStudio session.
setwd(dirname(getActiveDocumentContext()$path))

# Trim the upper tail of several columns at once.
#
# The total share `perc` (e.g. perc = 0.01) is split evenly over the columns in
# `list_col_names`. For each column, rows whose value is at or above the
# (1 - perc / length(list_col_names)) quantile are removed. Quantiles are always
# computed on the untrimmed input `df`, while rows are removed cumulatively.
# Rows that are NA in a column are kept for that column.
#
# df              data frame to trim
# perc            total upper-tail share to remove
# list_col_names  character vector with the names of numeric columns in `df`
#
# Returns the trimmed data frame. Stops if a requested column is not in `df`.
remove_out_simple_all <- function(df, perc, list_col_names) {
  unknown_cols <- setdiff(list_col_names, names(df))
  if (length(unknown_cols) > 0) {
    # A missing column gives an NA threshold and a zero-length row mask, which
    # would silently drop every row.
    stop("Columns not found in df: ", paste(unknown_cols, collapse = ", "),
         call. = FALSE)
  }

  perc_each <- perc / length(list_col_names)
  df_clean <- df
  for (name in list_col_names) {
    thresh <- quantile(df[[name]], 1 - perc_each, na.rm = TRUE)
    keep <- df_clean[[name]] < thresh | is.na(df_clean[[name]])
    # drop = FALSE keeps a data frame even when df has a single column.
    df_clean <- df_clean[keep, , drop = FALSE]
  }
  df_clean
}


# Trim the upper tail of a single column.
#
# Removes the rows of `df` whose value in `col_name` is at or above the
# (1 - perc) quantile of that column (e.g. perc = 0.01). Rows where the column
# is NA are kept.
#
# df        data frame to trim
# perc      upper-tail share to remove
# col_name  name of a numeric column in `df`
#
# Returns the trimmed data frame. Stops if `col_name` is not a column of `df`.
remove_out_simple <- function(df, perc, col_name) {
  if (!col_name %in% names(df)) {
    # A missing column gives an NA threshold and a zero-length row mask, which
    # would silently drop every row.
    stop("Column not found in df: ", col_name, call. = FALSE)
  }

  thresh <- quantile(df[[col_name]], 1 - perc, na.rm = TRUE)
  keep <- df[[col_name]] < thresh | is.na(df[[col_name]])
  # drop = FALSE keeps a data frame even when df has a single column.
  df[keep, , drop = FALSE]
}


# Print the diagnostics used for the mixed-effects (glmer) models, in this
# order: model summary, Anova table, Nakagawa R2, collinearity check and
# singularity check. Each result is printed as soon as it is computed.
model_stats_glmer <- function(model) {
  report <- function(diagnostic) print(diagnostic(model))

  report(summary)
  report(Anova)
  report(r2_nakagawa)
  report(check_collinearity)
  report(check_singularity)
}

# Print the diagnostics used for the plain logistic (glm) models, in this
# order: model summary, Anova table, Nagelkerke R2, collinearity check and
# singularity check. Each result is printed as soon as it is computed.
model_stats_glm <- function(model) {
  report <- function(diagnostic) print(diagnostic(model))

  report(summary)
  report(Anova)
  report(r2_nagelkerke)
  report(check_collinearity)
  report(check_singularity)
}

# Load the raw data set and drop everything that cannot be analysed.
#
# Reads "data1.csv" from the working directory, removes the projects listed in
# `excluded_projects` (the reason for each is given next to its name), and
# keeps only rows that have no missing value in any column.
# Returns the resulting data frame.
get_df <- function() {
  excluded_projects <- c(
    "CXF",              # no git history
    "Cayenne",          # no git history
    "Jena",             # no git history
    "Guacamole",        # no git history
    "ServiceMix",       # no git history
    "MyFaces",          # large umbrella project
    "myfaces-tobago",   # no emails
    "myfaces-trinidad", # no emails
    "Openmeetings",     # no git history
    "ODFToolkit",       # too large, errors extracting data
    "Brooklyn"          # markdown only project, we exclude it
  )

  raw <- read.csv("data1.csv")

  # Rows with an NA project are dropped here as well, as a `!=` comparison
  # inside filter() would do.
  kept <- raw %>%
    filter(!is.na(project), !project %in% excluded_projects)

  kept[complete.cases(kept), ]
}

# Sokrates fails to extract some data for some projects; we double check this
# by comparing the total number of files with the #files (tokei). A project is
# flagged as problematic when it has at least one row with files != 0 but
# total_number_of_files == 0. Ambari is fine, so we keep that one; every other
# flagged project is removed completely.
#
# data  data frame with the columns project, files and total_number_of_files.
#       Defaults to the global `df`, which is what existing zero-argument calls
#       rely on; pass a data frame explicitly to filter any other data set.
#
# Returns `data` without the rows of the flagged projects.
remove_problematic_projects <- function(data = df) {
  flagged <- data %>%
    filter(files != 0, total_number_of_files == 0) %>%
    select(project) %>%
    unique() %>%
    filter(project != "Ambari")

  data %>% filter(!project %in% unlist(flagged))
}

# Load the data, then drop the projects flagged by
# remove_problematic_projects(). Called without arguments, that helper works on
# the global `df`, so `df` has to be assigned before it is called.
df <- get_df()
df <- remove_problematic_projects()


# VARIABLES TO REMOVE OUTLIERS
# Column names, grouped by theme, whose upper tails are trimmed below.

# Size / structure of the code base.
code <- c("code", "directories", 
          "top_level_dirs")

# Code-quality metrics (file size, conditional complexity, unit size,
# duplication, tests).
# NOTE(review): "unit_size_very_high_risk_count" and "most_complex_unit_loc"
# are used by the models further down but are not listed here, so they are not
# trimmed - confirm whether that is intended.
quality <- c(
  "negligible_risk_file_size_count",
  "low_risk_file_size_count",
  "medium_risk_file_size_count",
  "high_risk_file_size_count", 
  "very_high_risk_file_size_count",
  "negligible_risk_file_size_loc",
  "low_risk_file_size_loc",
  "medium_risk_file_size_loc",
  "high_risk_file_size_loc", 
  "very_high_risk_file_size_loc",
  "conditional_complexity_negligible_risk_count",
  "conditional_complexity_low_risk_count",
  "conditional_complexity_medium_risk_count",
  "conditional_complexity_high_risk_count", 
  "conditional_complexity_very_high_risk_count",
  "conditional_complexity_negligible_risk_loc",
  "conditional_complexity_low_risk_loc",
  "conditional_complexity_medium_risk_loc",
  "conditional_complexity_high_risk_loc", 
  "conditional_complexity_very_high_risk_loc",
  "unit_size_negligible_risk_count",
  "unit_size_low_risk_count",
  "unit_size_medium_risk_count",
  "unit_size_high_risk_count",
  "unit_size_negligible_risk_loc",
  "unit_size_low_risk_loc",
  "unit_size_medium_risk_loc",
  "unit_size_high_risk_loc",
  "unit_size_very_high_risk_loc",
  "number_of_units",
  "unit_duplicates_count",
  "most_complex_unit_mcabe_index", 
  "duplication_percentage",
  "lines_of_code_test",
  "number_of_files_generated",
  "lines_of_code_build_and_deployment",
  "test_vs_main_lines_of_code_percentage"
)

# Development-process metrics (activity, contributors, emails, commit types).
process <- c("active_days",
             "files_added",
             "files_deleted",
             "added_lines",
             "deleted_lines",
             "avg_files_modified_commit",
             "authors",
             "commits",
             "minor_contributors", 
             "major_contributors", 
             "new_contributors", 
             "emails",
             "corrective",
             "perfective",
             "features",
             "unknown",
             "non_functional"
)

# All columns to trim, as one character vector.
vars <- unlist(c(code, quality, process))

# Remove 3% of top outliers
# The 3% is a total: remove_out_simple_all() splits it evenly over the columns
# in `vars`, so each column loses only a small share of its top values.
df <- remove_out_simple_all(df, 0.03, vars)



# Diagnostic only (not used further down): rows that have files according to
# `files` but no lines of code in the negligible-risk file-size bucket.
missing_data1 <- df %>% filter(files > 0 ) %>% filter(negligible_risk_file_size_loc == 0)


# REMOVE DATA
# If a language (e.g., XSL) only appears in 2 or 3 projects, remove those projects.
# If we only have few observations of a project with XSL, SVG, Shell as main PL, we remove those observations.
few_rows_languages <- c("XSL", "SVG",  "Shell", "Plain Text", "Markdown")
uncommon_lang_projects <- c("Lua", "Go", "Erlang", "CSS", "C#", "Kotlin", "Autoconf")

# Every project that has at least one row with an uncommon main language.
project_uncommon_langs_to_remove <- df %>% filter(programming_lang %in% unlist(uncommon_lang_projects)) %>% select(project) %>% unique()

# Step 1: drop the individual rows whose main language is one of few_rows_languages.
df_all <- df %>% filter(! programming_lang %in% unlist(few_rows_languages))
# Step 2: drop the uncommon-language projects entirely. This must continue from
# df_all; starting again from df would discard the filter applied in step 1.
df_all <- df_all %>% filter(!project %in% unlist(project_uncommon_langs_to_remove))


### TABLE 1 STATS ###
# Write per-variable summary statistics, split by `status`, to a CSV file.
#
# For every name in `variables` one row is written with six values: mean,
# median and max over the rows with status == 1 (first three columns), followed
# by mean, median and max over the rows with status == 0 (last three columns).
# Rows appear in the order of `variables`; the variable name itself is not
# written. The name and the three status == 1 values are also printed.
#
# df           data frame with a `status` column and the columns in `variables`
# variables    character vector of numeric column names
# output_file  path of the CSV file to write
#
# Returns the table invisibly. Stops if a requested column is not in `df`.
stats <- function(df, variables, output_file) {
  unknown_cols <- setdiff(variables, names(df))
  if (length(unknown_cols) > 0) {
    stop("Columns not found in df: ", paste(unknown_cols, collapse = ", "),
         call. = FALSE)
  }

  # which() drops rows with an NA status, like filter() would.
  rows_status_1 <- which(df$status == 1)
  rows_status_0 <- which(df$status == 0)

  # One row per variable. Sizing the table from `variables` (instead of a fixed
  # 27 rows that were then appended to) avoids leading all-NA rows in the CSV.
  vals <- data.frame(matrix(NA_real_, nrow = length(variables), ncol = 6))
  colnames(vals) <- c("mean", "median", "max", "mean", "median", "max")

  for (k in seq_along(variables)) {
    i <- variables[[k]]
    values_1 <- df[[i]][rows_status_1]
    values_0 <- df[[i]][rows_status_0]

    mean_1 <- mean(values_1)
    median_1 <- median(values_1)
    max_1 <- max(values_1)

    print(i)
    print(mean_1)
    print(median_1)
    print(max_1)

    vals[k, ] <- c(mean_1, median_1, max_1,
                   mean(values_0), median(values_0), max(values_0))
  }

  write.csv(vals, file = output_file, row.names = FALSE)
  invisible(vals)
}

# TABLE 1 in the paper
# `cols` must exist before stats() is called; it was previously only assigned
# in the correlation-matrix section at the end of the script, so running the
# script from top to bottom stopped here. The same vector is assigned again in
# that section - keep the two lists identical.
cols <- c("code",
          "commits",
          "directories",
          "top_level_dirs",
          "incubation_month",
          "authors",
          "major_contributors",
          "minor_contributors",
          "new_contributors",
          "files_added",
          "files_deleted",
          "files",
          "avg_files_modified_commit",
          "active_days",
          "emails",
          "corrective",
          "features",
          "perfective",
          "non_functional",
          "number_of_units",
          "test_vs_main_lines_of_code_percentage",
          "conditional_complexity_medium_risk_count",
          "conditional_complexity_very_high_risk_count",
          "very_high_risk_file_size_count",
          "unit_size_very_high_risk_count",
          "duplication_percentage",
          "most_complex_unit_loc"
)
stats(df, cols, "stats-new-stdev.csv")


# MODELS FOR TABLE 3 
# Three nested mixed-effects logistic regressions of `status`, fitted on
# df_all. Each has random intercepts for incubation_month and
# programming_lang, and each later model adds predictors to the previous one.
# Most count predictors enter as log(x + 0.001); presumably the small constant
# is there to keep zero values finite.
# The order of the fixed effects below is the order of the rows in Table 3.

# Model 1: code-base size only.
base_model <- glmer(status ~ 
                      log(code + 0.001) + 
                      log(directories + 0.001) + 
                      log(top_level_dirs + 0.001) + 
                      # scale(incubation_month) + 
                      (1|incubation_month) +
                      (1 | programming_lang), data=df_all, family = "binomial",control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=10000000))
)

model_stats_glmer(base_model)

# Model 2: size + development-process metrics.
base_proc_model <- glmer(status ~ 
                           log(code + 0.001) + 
                           log(directories + 0.001) + 
                           log(top_level_dirs + 0.001) + 
                           #scale(incubation_month) + 
                           scale(major_contributors) +
                           log(minor_contributors + 0.001) +
                           log(new_contributors + 0.001) + 
                           log(files_added + 0.001) +
                           log(files_deleted + 0.001) +
                           #log(added_lines + 0.001) + # HIGHLY CORRELATED
                           #log(deleted_lines + 0.001) + # HIGHLY CORRELATED
                           log(avg_files_modified_commit + 0.001) + 
                           scale(active_days) +
                           log(emails + 0.001) +
                           log(features + 0.001) +
                           log(corrective + 0.001) +
                           log(perfective + 0.001) +    
                           log(non_functional + 0.001) + (1|incubation_month) +
                           (1 | programming_lang) , data=df_all, family = "binomial",control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=10000000))
)

model_stats_glmer(base_proc_model)
# Visual model checks, one plot at a time instead of a combined panel.
check_model(base_proc_model, panel=FALSE)
# model_performance(base_proc_model)

# Model 3: size + process + code-quality metrics.
base_proc_quality_model <- glmer(status ~ 
                                   log(code + 0.001) + 
                                   log(directories + 0.001) + 
                                   log(top_level_dirs + 0.001) + 
                                   #scale(incubation_month) + 
                                   scale(major_contributors) +
                                   log(minor_contributors + 0.001) +
                                   log(new_contributors + 0.001) + 
                                   log(files_added + 0.001) +
                                   log(files_deleted + 0.001) +
                                   #log(added_lines + 0.001) + # HIGHLY CORRELATED
                                   #log(deleted_lines + 0.001) + # HIGHLY CORRELATED
                                   log(avg_files_modified_commit + 0.001) + 
                                   scale(active_days) +
                                   log(emails + 0.001) +
                                   log(features + 0.001) +
                                   log(corrective + 0.001) +
                                   log(perfective + 0.001) +    
                                   log(non_functional + 0.001) + 
                                   log(test_vs_main_lines_of_code_percentage +0.001)  +
                                   log(conditional_complexity_medium_risk_count + 0.001) + 
                                   log(conditional_complexity_very_high_risk_count + 0.001) + 
                                   log(very_high_risk_file_size_count +0.001)  +
                                   log(unit_size_very_high_risk_count +0.001)  +
                                   log(duplication_percentage + 0.001) + 
                                   log(most_complex_unit_loc + 0.001) + 
                                   (1|incubation_month) +
                                   (1 | programming_lang), data=df_all, family = "binomial",control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=10000000))
)

model_stats_glmer(base_proc_quality_model)
check_model(base_proc_quality_model, panel = FALSE)

# Side-by-side fit indices for the three nested models.
compare_performance(base_model, base_proc_model, base_proc_quality_model)

# TABLE 3
# HTML regression table for the three mixed-effects models.
# pred.labels is unnamed, so it must contain exactly one label per fixed
# effect, in the order the terms appear in base_proc_quality_model. The
# previous vector still contained an "Incubation Month" entry (incubation_month
# is a random intercept in these models, not a fixed effect) and listed
# Corrective before Features, which did not match the formulas.
tab_model(base_model, base_proc_model, base_proc_quality_model,
          show.ci = FALSE,
          # show.aic = TRUE,
          # show.aicc = TRUE,
          # show.loglik = TRUE,
          dv.labels = c("Base", "Base, Processes", "Base, Processes, Quality"),
          pred.labels = c(
            "Intercept",
            "SLOC",                               # log(code)
            "Directories",                        # log(directories)
            "Top Level Directories",              # log(top_level_dirs)
            "Major Contributors",                 # scale(major_contributors)
            "Minor Contributors",                 # log(minor_contributors)
            "New Contributors",                   # log(new_contributors)
            "Files Added",                        # log(files_added)
            "Files Deleted",                      # log(files_deleted)
            "Avg. Files Modified per Commit",     # log(avg_files_modified_commit)
            "Active Days",                        # scale(active_days)
            "Emails",                             # log(emails)
            "Features",                           # log(features)
            "Corrective",                         # log(corrective)
            "Perfective",                         # log(perfective)
            "Non Functional",                     # log(non_functional)
            "Test/Main Lines of Code Percentage", # log(test_vs_main_lines_of_code_percentage)
            "#Functions /w McCabe Index 11-25",   # log(conditional_complexity_medium_risk_count)
            "#Functions /w McCabe Index >51 ",    # log(conditional_complexity_very_high_risk_count)
            "Very Large File Size Count",         # log(very_high_risk_file_size_count)
            "Very Large Function Size Count",     # log(unit_size_very_high_risk_count)
            "Code Duplication Percentage",        # log(duplication_percentage)
            "Most Complex Function LOC"           # log(most_complex_unit_loc)
          ),
          file="models-all-glmer.html")

## JAVA PROJECTS ##

# Projects that have at least one row with Java as programming_lang.
java_projects <- df %>% filter(programming_lang == "Java") %>% group_by(project) %>% select(project) %>% unique()
# All rows of those projects, including rows where programming_lang is not Java.
# NOTE(review): this subset is taken from `df`, not from the language-filtered
# `df_all` used for the Table 3 models - confirm that this is intended.
java_df <- df %>% filter(project %in% java_projects$project)

# Only the rows whose programming_lang is Java. Not used by the models below,
# which are fitted on java_df.
java <- df[df$programming_lang == "Java",]

# MODELS FOR TABLE 2
# Three nested logistic regressions of `status`, fitted on java_df. Unlike the
# Table 3 models these are plain glm fits: incubation_month is a scaled fixed
# effect and there are no random effects. The order of the terms below is the
# order of the rows in Table 2.
# The model checks and the comparison in this section now use the Java models;
# they previously pointed at the all-language glmer models from the Table 3
# section.

# Model 1: code-base size and incubation month.
java_glm_base_model <- glm(status ~
                             log(code + 0.001) +
                             log(top_level_dirs + 0.001) +
                             scale(incubation_month),
                           data = java_df, family = "binomial")

model_stats_glm(java_glm_base_model)

# Model 2: model 1 + development-process metrics.
java_glm_base_proc_model <- glm(status ~
                                  log(code + 0.001) +
                                  log(top_level_dirs + 0.001) +
                                  scale(incubation_month) +
                                  scale(major_contributors) +
                                  log(minor_contributors + 0.001) +
                                  log(new_contributors + 0.001) +
                                  log(files_added + 0.001) +
                                  log(files_deleted + 0.001) +
                                  log(avg_files_modified_commit + 0.001) +
                                  log(emails + 0.001) +
                                  log(corrective + 0.001) +
                                  log(features + 0.001) +
                                  log(perfective + 0.001) +
                                  log(non_functional + 0.001),
                                data = java_df, family = "binomial")

model_stats_glm(java_glm_base_proc_model)
check_model(java_glm_base_proc_model, panel = FALSE)
# model_performance(java_glm_base_proc_model)

# Model 3: model 2 + code-quality metrics.
java_glm_base_proc_quality_model <- glm(status ~
                                          log(code + 0.001) +
                                          log(top_level_dirs + 0.001) +
                                          scale(incubation_month) +
                                          scale(major_contributors) +
                                          log(minor_contributors + 0.001) +
                                          log(new_contributors + 0.001) +
                                          log(files_added + 0.001) +
                                          log(files_deleted + 0.001) +
                                          log(avg_files_modified_commit + 0.001) +
                                          log(emails + 0.001) +
                                          log(corrective + 0.001) +
                                          log(features + 0.001) +
                                          log(perfective + 0.001) +
                                          log(non_functional + 0.001) +
                                          log(test_vs_main_lines_of_code_percentage + 0.001) +
                                          log(conditional_complexity_medium_risk_count + 0.001) +
                                          log(conditional_complexity_very_high_risk_count + 0.001) +
                                          log(very_high_risk_file_size_count + 0.001) +
                                          log(unit_size_very_high_risk_count + 0.001) +
                                          log(duplication_percentage + 0.001),
                                        data = java_df, family = "binomial")

model_stats_glm(java_glm_base_proc_quality_model)
check_model(java_glm_base_proc_quality_model, panel = FALSE)

# Side-by-side fit indices for the three nested Java models.
compare_performance(java_glm_base_model,
                    java_glm_base_proc_model,
                    java_glm_base_proc_quality_model)

# TABLE 2
# HTML regression table for the three Java-only logistic models, without
# confidence intervals. pred.labels is unnamed: one label per coefficient, in
# the term order of java_glm_base_proc_quality_model (term noted next to each).
tab_model(
  java_glm_base_model,
  java_glm_base_proc_model,
  java_glm_base_proc_quality_model,
  file = "models-java-glm.html",
  show.ci = FALSE,
  dv.labels = c("Base", "Base, Processes", "Base, Processes, Quality"),
  pred.labels = c(
    "Intercept",
    "SLOC",                               # log(code)
    "Top Level Directories",              # log(top_level_dirs)
    "Incubation Month",                   # scale(incubation_month)
    "Major Contributors",                 # scale(major_contributors)
    "Minor Contributors",                 # log(minor_contributors)
    "New Contributors",                   # log(new_contributors)
    "Files Added",                        # log(files_added)
    "Files Deleted",                      # log(files_deleted)
    "Avg. Files Modified per Commit",     # log(avg_files_modified_commit)
    "Emails",                             # log(emails)
    "Corrective",                         # log(corrective)
    "Features",                           # log(features)
    "Perfective",                         # log(perfective)
    "Non Functional",                     # log(non_functional)
    "Test/Main Lines of Code Percentage", # log(test_vs_main_lines_of_code_percentage)
    "#Functions /w McCabe Index 11-25",   # log(conditional_complexity_medium_risk_count)
    "#Functions /w McCabe Index >51 ",    # log(conditional_complexity_very_high_risk_count)
    "Very Large File Size Count",         # log(very_high_risk_file_size_count)
    "Very Large Function Size Count",     # log(unit_size_very_high_risk_count)
    "Code Duplication Percentage"         # log(duplication_percentage)
  )
)



### PLOTS ###

# The plots use freshly loaded data, i.e. without the outlier trimming and
# language filtering applied to the modelling data. Calling
# remove_problematic_projects() with no arguments works on the global `df`,
# which made the get_df() result above it irrelevant; the same exclusion rule
# is therefore applied to df_plots directly (projects with a row where
# files != 0 but total_number_of_files == 0, except Ambari).
df_plots <- get_df()
plot_problem_projects <- df_plots %>%
  filter(files != 0, total_number_of_files == 0, project != "Ambari") %>%
  distinct(project)
df_plots <- df_plots %>%
  filter(!project %in% plot_problem_projects$project)

# Readable legend labels: status 1 is "graduated", anything else "retired".
df_plots$status <- ifelse(df_plots$status == 1, "graduated", "retired")

# Split projects by their last observed incubation month (n).
project_duration <- df_plots %>%
  group_by(project) %>%
  summarise(n = max(incubation_month))
slow_projects <- project_duration %>% filter(n > 24)
fast_projects <- project_duration %>% filter(n <= 24)

# Both groups are plotted over their first 24 incubation months only.
slow_projects_df <- df_plots %>%
  filter(project %in% slow_projects$project, incubation_month <= 24)
fast_projects_df <- df_plots %>%
  filter(project %in% fast_projects$project, incubation_month <= 24)



# Build one trend panel: a smoothed curve of `y` over incubation month, one
# curve per status.
#
# data     data frame with incubation_month, status and the columns used in `y`
# y        unquoted column or expression for the y axis (e.g. commits / authors)
# y_label  y-axis title
# title    plot title
#
# Returns a ggplot object.
plot_incubation_trend <- function(data, y, y_label, title) {
  ggplot(data, aes(x = incubation_month, y = {{ y }})) +
    geom_smooth(aes(group = status, color = status)) +
    scale_x_continuous(name = "Incubation Month", breaks = c(5, 10, 15, 20, 25)) +
    labs(x = "Incubation Month", y = y_label, title = title) +
    theme(axis.text.x = element_text(color = "black", size = 14),
          axis.text.y = element_text(color = "black", size = 14),
          axis.title.x = element_text(color = "black", size = 14),
          axis.title.y = element_text(color = "black", size = 14),
          legend.position = c(0.92, 0.94),
          legend.text = element_text(size = 16),
          legend.title = element_blank()) +
    # NOTE(review): status is mapped to `color`, not `fill`, so this fill scale
    # does not look like it changes the line colours. scale_color_manual() may
    # have been intended - confirm before changing, since it alters the figures.
    scale_fill_manual(values = c("#00AFBB", "#FC4E07"))
}

# Active days
a <- plot_incubation_trend(slow_projects_df, active_days,
                           "Active Days", "Incubating > 24 months ")
b <- plot_incubation_trend(fast_projects_df, active_days,
                           "Active Days", "Incubating <= 24 months ")

ggarrange(a, b, legend = "bottom", common.legend = TRUE)

# Avg Commits per Contributor plot
a <- plot_incubation_trend(slow_projects_df, commits / authors,
                           "Avg. Commits per Contributor", "Incubating > 24 months ")
b <- plot_incubation_trend(fast_projects_df, commits / authors,
                           "Avg. Commits per Contributor", "Incubating <= 24 months ")

ggarrange(a, b, legend = "bottom", common.legend = TRUE)

# The last two panels use all projects, restricted to the first 24 months.
first_24_months_df <- df_plots[df_plots$incubation_month <= 24, ]

# Ratio between number of files and contributors
a <- plot_incubation_trend(first_24_months_df,
                           files / (minor_contributors + major_contributors),
                           "Ratio between number of files and contributors",
                           "Ratio between number of files and contributors")

# SLOC
b <- plot_incubation_trend(first_24_months_df, code,
                           "Lines of code", "Lines of code")

ggarrange(a, b, legend = "bottom", common.legend = TRUE)


## CORRELATION MATRIX ##
# Columns that go into the correlation matrix. The same vector is also passed
# to stats() for Table 1.
cols <- c("code",
          "commits",
          "directories",
          "top_level_dirs",
          "incubation_month",
          "authors",
          "major_contributors", 
          "minor_contributors", 
          "new_contributors", 
          "files_added",
          "files_deleted",
          "files",
          "avg_files_modified_commit",
          "active_days",
          "emails",
          "corrective",
          "features",
          "perfective",
          "non_functional",
          "number_of_units",
          "test_vs_main_lines_of_code_percentage",
          "conditional_complexity_medium_risk_count",
          "conditional_complexity_very_high_risk_count",
          "very_high_risk_file_size_count",
          "unit_size_very_high_risk_count",
          "duplication_percentage",
          "most_complex_unit_loc"
)
# Positions of those columns in df. A name that is not a column of df yields NA
# here, which makes the subsetting below fail.
idx <- match(cols, names(df))
# idx <- sort(c(idx-1, idx))

# The selected columns of the (outlier-trimmed) modelling data, in `cols` order.
NewDF <- df[,idx] 

# Pairwise correlations of all columns of `df`.
#
# Runs Hmisc::rcorr() on `df` converted to a matrix and turns every element of
# its result into a data frame.
#
# df  data frame with numeric columns
#
# Returns a named list of data frames, one per element returned by rcorr().
cors <- function(df) {
  M <- Hmisc::rcorr(as.matrix(df))
  # Base lapply() instead of map(): none of the packages attached at the top of
  # this script exports map(), so the previous call could not be resolved.
  lapply(M, function(m) data.frame(m))
}

# Correlations of all columns of `df` in long format, ready for plotting.
#
# Takes the list returned by cors(df), reshapes each table to one row per
# (measure1, measure2) pair and spreads the tables into one column each (named
# after the list elements, with `P` renamed to `p`). Three helper columns are
# added:
#   sig_p     TRUE when p < .05
#   p_if_sig  p when sig_p is TRUE, NA otherwise
#   r_if_sig  r when sig_p is TRUE, NA otherwise
#
# df  data frame with numeric columns
#
# Returns a data frame with one row per pair of columns of `df`.
formatted_cors <- function(df) {
  # Moves the row names into a leading `measure1` column. Written with base R
  # because the helper used before (rownames_to_column) is not exported by any
  # package attached at the top of this script.
  with_measure1 <- function(tbl) {
    data.frame(measure1 = rownames(tbl), tbl, row.names = NULL,
               check.names = FALSE, stringsAsFactors = FALSE)
  }

  long_tables <- lapply(cors(df), function(tbl) {
    # names_to is passed by name: it is not the third positional argument of
    # pivot_longer() in every tidyr version.
    pivot_longer(with_measure1(tbl), -measure1, names_to = "measure2")
  })

  long_tables %>%
    bind_rows(.id = "id") %>%
    pivot_wider(names_from = id, values_from = value) %>%
    rename(p = P) %>%
    mutate(sig_p = ifelse(p < .05, TRUE, FALSE),
           p_if_sig = ifelse(sig_p, p, NA),
           r_if_sig = ifelse(sig_p, r, NA))
}

# Correlation heatmap of the columns in NewDF: every tile is coloured by the
# correlation coefficient r, and the coefficient is printed only where
# r_if_sig is not NA (p < .05). The title now describes the plotted data; it
# previously referred to the mtcars example data set.
formatted_cors(NewDF) %>%
  ggplot(aes(measure1, measure2, fill = r, label = round(r_if_sig, 2))) +
  geom_tile() +
  labs(x = NULL, y = NULL, fill = "Pearson's\nCorrelation",
       title = "Correlations between project metrics",
       subtitle = "Only significant Pearson's correlation coefficients shown") +
  scale_fill_gradient2(mid = "#FBFEF9", low = "#0C6291", high = "#A63446",
                       limits = c(-1, 1)) +
  geom_text() +
  theme_classic() +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  # The "Roboto" font has to be available to the graphics device.
  theme(text = element_text(family = "Roboto"))
