library(lubridate) # days(x), etc.
library(effsize) # cliff.delta(...)
#library(orddom) # common-language effect size / A statistic

# colors with alpha channel
library(grDevices)
gray_lighter <- adjustcolor("gray70", alpha.f=0.2)
gray_light <- adjustcolor("gray48", alpha.f=1)
gray_dark <- adjustcolor("gray34", alpha.f=1)
blue_light <- adjustcolor("skyblue1", alpha.f=1)
blue_dark <- adjustcolor("steelblue4", alpha.f=1)
red_light <- adjustcolor("salmon", alpha.f=1)
red_dark <- adjustcolor("firebrick3", alpha.f=1)

# Symmetric set difference: the sorted elements that occur in exactly one of
# x and y.
complete_setdiff <- function(x, y) {
  sort(c(setdiff(x, y), setdiff(y, x)))
}

# DISPERSION OF VALUES
# see http://stackoverflow.com/a/12867538 and
# https://en.wikipedia.org/wiki/Outlier#Tukey.27s_test

# Tukey fences for the values in v.
#   v: numeric vector (NAs are ignored when computing the quartiles)
#   k: fence multiplier; k=1.5 (outlier) or k=3 (far outlier) typical
# Returns a one-row data frame with the columns `lower` (Q1 - k*IQR) and
# `upper` (Q3 + k*IQR).
outlier_thresholds <- function(v, k) {
  quartiles <- quantile(v, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  data.frame(
    lower = quartiles[1] - k * iqr,
    upper = quartiles[2] + k * iqr
  )
}

# Logical mask, same length as v, that is TRUE where a value lies outside the
# Tukey fences. NA values in v yield NA in the mask.
#   k: k=1.5 (outlier) or k=3 (far outlier) typical, see Tukey test
outlier_filter <- function(v, k) {
  thresholds <- outlier_thresholds(v, k)
  v < thresholds$lower | v > thresholds$upper
}

# The values of v that lie outside the Tukey fences.
# which() drops NA mask entries so that missing values are not reported as
# outliers.
#   k: k=1.5 (outlier) or k=3 (far outlier) typical, see Tukey test
outliers <- function(v, k) {
  v[which(outlier_filter(v, k))]
}

# Positions (1-based indices into v) of the values outside the Tukey fences.
#   k: k=1.5 (outlier) or k=3 (far outlier) typical, see Tukey test
outlier_pos <- function(v, k) {
  which(outlier_filter(v, k))
}

# Percentage (0-100) of the elements of v that are outliers. The denominator is
# length(v), i.e. it includes NA elements.
#   k: k=1.5 (outlier) or k=3 (far outlier) typical, see Tukey test
outliers_percent <- function(v, k) {
  length(outliers(v, k)) / length(v) * 100
}

# Row index of the first occurrence of every distinct commit_df$hash_value.
# Prints "<hash>: <index>" for each distinct value and returns the integer
# vector of indices (in order of first appearance).
get_unique_hash_values_filter <- function(commit_df) {
  hash_values <- unique(commit_df$hash_value)
  # match() returns the first matching position, i.e. min(which(x == value)).
  res <- match(hash_values, commit_df$hash_value)
  for (i in seq_along(hash_values)) {
    print(paste0(hash_values[i], ": ", res[i]))
  }
  res
}

# First Monday on or after `date` (same time of day), evaluated in UTC.
# Returns a POSIXct in UTC; works element-wise on vectors.
get_next_monday <- function(date) {
  date <- as.POSIXlt(date, tz = "UTC")
  days_ahead <- (1L - date$wday) %% 7L # wday: 1 = Monday
  # UTC has no DST transitions, so one day is always 86400 seconds.
  as.POSIXct(date, tz = "UTC") + days_ahead * 86400
}

# Last Sunday on or before `date` (same time of day), evaluated in UTC.
# Returns a POSIXct in UTC; works element-wise on vectors.
get_previous_sunday <- function(date) {
  date <- as.POSIXlt(date, tz = "UTC")
  days_back <- date$wday # wday: 0 = Sunday
  as.POSIXct(date, tz = "UTC") - days_back * 86400
}

# Number of builds of `project` per week.
#   project:   value to match against builds_df$gh_project_name
#   builds_df: data frame with the columns gh_project_name and
#              gh_build_started_at
# Returns a numeric vector with one entry per week, or numeric(0) when the
# project has no builds or its builds do not span a full Monday-Sunday range.
get_builds_per_week <- function(project, builds_df) {
  project_builds <- builds_df[which(builds_df$gh_project_name == project), ]
  if (nrow(project_builds) == 0) {
    return(numeric(0))
  }
  started_at <- project_builds$gh_build_started_at

  min_date <- min(started_at)
  max_date <- max(started_at)
  next_monday <- get_next_monday(min_date)
  previous_sunday <- get_previous_sunday(max_date)

  # as.integer() truncates partial days.
  max_offset <- as.integer(
    difftime(previous_sunday, next_monday, units = "days")
  ) - 7L
  if (max_offset < 0) {
    return(numeric(0))
  }

  # NOTE(review): as in get_commits_per_week, the windows start at min_date
  # (not next_monday) and cover monday .. monday + 6 days -- confirm that this
  # is the intended week definition.
  week_offsets <- seq(from = 0, to = max_offset, by = 7)
  vapply(week_offsets, function(offset) {
    monday <- min_date + days(offset)
    sunday <- monday + days(6)
    sum(started_at >= monday & started_at <= sunday)
  }, numeric(1))
}

# Mean number of builds per week for every project in builds_df.
# Returns a data frame with the columns `project` and `mean_builds_per_week`.
get_mean_builds_per_week <- function(builds_df) {
  projects <- unique(builds_df$gh_project_name)
  res <- data.frame(
    project = projects,
    mean_builds_per_week = numeric(length(projects)),
    stringsAsFactors = FALSE
  )
  for (i in seq_along(projects)) {
    project <- projects[i]
    print(paste0("Project ", i, ": ", project))
    res$mean_builds_per_week[i] <- mean(get_builds_per_week(project, builds_df))
  }
  res
}

# Time in days between consecutive builds of `project`.
# builds_df is expected to be already sorted by start date.
# Returns a numeric vector of length (number of builds - 1); numeric(0) when
# the project has fewer than two builds.
get_time_between_builds <- function(project, builds_df) {
  project_builds <- builds_df[which(builds_df$gh_project_name == project), ]
  started_at <- project_builds$gh_build_started_at
  if (length(started_at) < 2) {
    return(numeric(0))
  }
  as.numeric(
    difftime(tail(started_at, -1), head(started_at, -1), units = "days")
  )
}
# Mean and standard deviation of the time (in days) between consecutive builds
# for every project in builds_df.
# Returns a data frame with the columns `project`, `mean_time_between_builds`
# and `sd`.
get_mean_time_between_builds <- function(builds_df) {
  projects <- unique(builds_df$gh_project_name)
  # get_time_between_builds() expects the builds ordered by start date.
  sorted_builds <- builds_df[order(builds_df$gh_build_started_at), ]
  res <- data.frame(
    project = projects,
    mean_time_between_builds = numeric(length(projects)),
    sd = numeric(length(projects)),
    stringsAsFactors = FALSE
  )
  for (i in seq_along(projects)) {
    project <- projects[i]
    print(paste0("Project ", i, ": ", project))
    time_between_builds <- get_time_between_builds(project, sorted_builds)
    res$mean_time_between_builds[i] <- mean(time_between_builds)
    res$sd[i] <- sd(time_between_builds)
  }
  res
}

####################### CHECK IF BUILDS FOR DEFAULT BRANCH EXIST ###########################

# Counts, for every row of `projects`, the distinct tr_build_id values among
# the builds on that project's default branch.
#   projects: data frame with the columns gh_project_name and default_branch
#   builds:   data frame with the columns gh_project_name, git_branch and
#             tr_build_id
# Returns a data frame with the columns `project` and
# `builds_for_default_branch`.
check_builds_default_branch <- function(projects, builds) {
  project_count <- nrow(projects)
  res <- data.frame(
    project = character(project_count),
    builds_for_default_branch = integer(project_count),
    stringsAsFactors = FALSE
  )
  for (i in seq_len(project_count)) {
    project <- projects$gh_project_name[i]
    print(paste0("Project ", i, ": ", project))
    default_branch <- projects$default_branch[i]
    # which() drops NA comparisons (e.g. an unknown default branch); indexing
    # with NA would produce all-NA rows that get counted as one extra build.
    on_default_branch <- which(
      builds$gh_project_name == project & builds$git_branch == default_branch
    )
    # as.character() keeps the label (not the integer code) for factor input.
    res$project[i] <- as.character(project)
    res$builds_for_default_branch[i] <-
      length(unique(builds$tr_build_id[on_default_branch]))
  }
  res
}

####################### COMMIT RATE ###############################

# Number of commits of `project` per week.
#   project:               value to match against sorted_commits_df$project
#   sorted_commits_df:     data frame with the columns project and commit_date
#   ignore_inactive_weeks: if TRUE, weeks without commits are dropped
# Returns a numeric vector with one entry per week, or numeric(0) when the
# project has no commits or its commits do not span a full Monday-Sunday range.
get_commits_per_week <- function(project, sorted_commits_df, ignore_inactive_weeks) {
  project_rows <- which(sorted_commits_df$project == project)
  if (length(project_rows) == 0) {
    return(numeric(0))
  }
  commit_dates <- sorted_commits_df$commit_date[project_rows]

  min_date <- min(commit_dates)
  max_date <- max(commit_dates)
  next_monday <- get_next_monday(min_date)
  previous_sunday <- get_previous_sunday(max_date)

  # as.integer() truncates partial days.
  max_offset <- as.integer(
    difftime(previous_sunday, next_monday, units = "days")
  ) - 7L
  if (max_offset < 0) {
    return(numeric(0))
  }

  # NOTE(review): the windows start at min_date (not next_monday) and cover
  # monday .. monday + 6 days -- confirm that this is the intended week
  # definition.
  week_offsets <- seq(from = 0, to = max_offset, by = 7)
  commits_per_week <- vapply(week_offsets, function(offset) {
    monday <- min_date + days(offset)
    sunday <- monday + days(6)
    sum(commit_dates >= monday & commit_dates <= sunday)
  }, numeric(1))

  if (ignore_inactive_weeks) {
    # only include weeks with commits
    commits_per_week <- commits_per_week[commits_per_week > 0]
  }
  commits_per_week
}

# Weekly commit counts of all projects in long format.
# Returns a data frame with one row per project and week (columns
# `commit_count` and `project`). Projects appear in reverse order of their
# first appearance in commits_df, because each project's rows are placed in
# front of the earlier ones. When no project has any weekly counts, an empty
# data frame with the columns `project` and `commit_count` is returned.
get_all_commits_per_week <- function(commits_df, ignore_inactive_weeks) {
  projects <- unique(commits_df$project)
  sorted_commits <- commits_df[order(commits_df$commit_date), ]

  # Collect the per-project frames and bind them once instead of growing a
  # data frame inside the loop.
  per_project <- vector("list", length(projects))
  for (i in seq_along(projects)) {
    project <- projects[i]
    print(paste0("Project ", i, ": ", project))
    commits_per_week <- get_commits_per_week(
      project, sorted_commits, ignore_inactive_weeks
    )
    if (length(commits_per_week) > 0) {
      per_project[[i]] <- data.frame(
        commit_count = commits_per_week,
        project = rep(project, length(commits_per_week)),
        stringsAsFactors = FALSE
      )
    }
  }

  per_project <- Filter(Negate(is.null), per_project)
  if (length(per_project) == 0) {
    return(data.frame(
      project = character(0),
      commit_count = numeric(0),
      stringsAsFactors = FALSE
    ))
  }
  do.call(rbind, rev(per_project))
}

# Median and interquartile range of the weekly commit counts per project.
# Returns a data frame with the columns `project`, `median_commits_per_week`
# and `iqr`; both statistics are NA for projects without weekly counts.
get_median_commit_rate <- function(commits_df, ignore_inactive_weeks) {
  projects <- unique(commits_df$project)
  sorted_commits <- commits_df[order(commits_df$commit_date), ]
  res <- data.frame(
    project = projects,
    median_commits_per_week = numeric(length(projects)),
    iqr = numeric(length(projects)),
    stringsAsFactors = FALSE
  )
  for (i in seq_along(projects)) {
    project <- projects[i]
    print(paste0("Project ", i, ": ", project))
    commits_per_week <- get_commits_per_week(
      project, sorted_commits, ignore_inactive_weeks
    )
    if (length(commits_per_week) == 0) {
      res$median_commits_per_week[i] <- NA
      res$iqr[i] <- NA
    } else {
      res$median_commits_per_week[i] <- median(commits_per_week)
      res$iqr[i] <- IQR(commits_per_week)
    }
  }
  res
}

################# COMMIT SIZE (BREADTH) ##################################

# Median and interquartile range of commits_df$file_count per project; commits
# with a missing file_count are ignored.
# Returns a data frame with the columns `project`, `median_file_count` and
# `iqr`.
get_median_file_count <- function(commits_df) {
  projects <- unique(commits_df$project)
  res <- data.frame(
    project = projects,
    median_file_count = numeric(length(projects)),
    iqr = numeric(length(projects)),
    stringsAsFactors = FALSE
  )
  for (i in seq_along(projects)) {
    project <- projects[i]
    print(paste0("Project ", i, ": ", project))
    selected <- which(
      commits_df$project == project & !is.na(commits_df$file_count)
    )
    file_counts <- commits_df$file_count[selected]
    res$median_file_count[i] <- median(file_counts)
    res$iqr[i] <- IQR(file_counts)
  }
  res
}

################# COMMIT SIZE (DEPTH) ##################################

# Median and interquartile range of line_count / file_count per project.
# Only commits with a non-missing normalized_line_count are considered.
# Returns a data frame with the columns `project`,
# `median_normalized_line_count` and `iqr`; both statistics are NA for projects
# without such commits.
get_median_normalized_line_count_all <- function(commits_df) {
  projects <- unique(commits_df$project)
  res <- data.frame(
    project = character(length(projects)),
    median_normalized_line_count = numeric(length(projects)),
    iqr = numeric(length(projects)),
    stringsAsFactors = FALSE
  )
  for (i in seq_along(projects)) {
    project <- projects[i]
    print(paste0("Project ", i, ": ", project))
    # as.character() keeps the label (not the integer code) for factor input.
    res$project[i] <- as.character(project)
    selected <- which(
      commits_df$project == project & !is.na(commits_df$normalized_line_count)
    )
    if (length(selected) > 0) {
      # NOTE(review): the filter uses the normalized_line_count column, but the
      # statistic is recomputed from line_count / file_count -- presumably the
      # two agree; confirm.
      normalized_line_count <-
        commits_df$line_count[selected] / commits_df$file_count[selected]
      res$median_normalized_line_count[i] <- median(normalized_line_count)
      res$iqr[i] <- IQR(normalized_line_count)
    } else {
      res$median_normalized_line_count[i] <- NA
      res$iqr[i] <- NA
    }
  }
  res
}

################# MERGE RATIO ##################################

# Share of merges among all commits of `project`:
# merges / (commits + merges), counting the rows of merges_df and commits_df
# whose `project` equals `project`.
# Returns NA when the project has no rows in merges_df.
get_merge_ratio <- function(project, commits_df, merges_df) {
  merge_count <- length(which(merges_df$project == project))
  if (merge_count == 0) {
    return(NA)
  }
  commit_count <- nrow(commits_df[commits_df$project == project, ])
  merge_count / (commit_count + merge_count)
}

# Merge ratio (see get_merge_ratio) of every project that occurs in commits_df
# or merges_df.
# Returns a data frame with the columns `project` and `merge_ratio`.
get_project_merge_ratio <- function(commits_df, merges_df) {
  projects <- unique(c(commits_df$project, merges_df$project))
  res <- data.frame(
    project = projects,
    merge_ratio = numeric(length(projects)),
    stringsAsFactors = FALSE
  )
  for (i in seq_along(projects)) {
    project <- projects[i]
    print(paste0("Project ", i, ": ", project))
    res$merge_ratio[i] <- get_merge_ratio(project, commits_df, merges_df)
  }
  res
}

################# PULL REQUEST RATIO ##################################

# Share of the merges of `project` that have a non-missing pull_request_id.
# Returns NA when the project has no rows in merges_df.
get_pr_ratio <- function(project, merges_df) {
  project_rows <- which(merges_df$project == project)
  merge_count <- length(project_rows)
  if (merge_count == 0) {
    return(NA)
  }
  pr_count <- sum(!is.na(merges_df$pull_request_id[project_rows]))
  pr_count / merge_count
}

# Pull request ratio (see get_pr_ratio) of every project in merges_df.
# Returns a data frame with the columns `project` and `pr_ratio`.
get_project_pr_ratio <- function(merges_df) {
  projects <- unique(merges_df$project)
  res <- data.frame(
    project = projects,
    pr_ratio = numeric(length(projects)),
    stringsAsFactors = FALSE
  )
  for (i in seq_along(projects)) {
    project <- projects[i]
    print(paste0("Project ", i, ": ", project))
    res$pr_ratio[i] <- get_pr_ratio(project, merges_df)
  }
  res
}