library(htmlTable)
library(OIsurv)
library(survival)
library(car)
library(survminer)
library(ggplot2)
# Only do this once
# sd <- read.csv("survival_data.pypi_2008_2017-12_6.csv")
# sd$age_start = c(0, unlist(lapply(2:nrow(sd),
# function(x){ ifelse(sd[(x-1),"name"] == sd[x,"name"],
# sd[(x-1),"age"],
# 0) })))
# write.table(sd, file="survival_data.csv", sep=";", row.names = FALSE)
# sd.noskip <- read.csv("no_skip_last_year.csv")
# sd.noskip$age_start = c(0, unlist(lapply(2:nrow(sd.noskip),
# function(x){ ifelse(sd.noskip[(x-1),"name"] == sd.noskip[x,"name"],
# sd.noskip[(x-1),"age"],
# 0) })))
# write.table(sd.noskip, file="no_skip_last_year.repeated.csv", sep=";", row.names = FALSE)
sd <- read.csv("survival_data.csv", sep=";")
# View(sd[1:100,])
sd$license_type = as.factor(sd$license_code)
table(sd$license_type)
names(sd)
nrow(sd)
length(unique(sd$name))
sd$fy = sd$age <= 12
sd$fhy = sd$age <= 6
sd$has_university = sd$university>=0.1
sd$has_commercial = sd$commercial>0.1
sd$has_backporting = sd$backporting==1
sd$upstreams_sq = sd$upstreams ^ 2
sd$downstreams_sq = sd$downstreams ^ 2
sd$commits_sq = sd$commits ^ 2
library(sqldf)
age_stats = sqldf("select name, min(month) as 'start_month',
max(age) as 'death_age', max(dead) as 'dead'
from sd group by name")
View(head(age_stats))
table(age_stats$start_month=="2016-12")
table(age_stats$start_month=="2016-11")
# boxplot(dead ~ death_age, age_stats[age_stats$death_age<18,])
pdf(file="pypi-per-month.pdf", width=6, height=3.5)
par(mar=c(4, 4, 1, 0))
plot(table(age_stats[age_stats$start_month!="2016-12",]$start_month),
# type = "l",
ylab="Number of new PyPI packages", xlab="Time", xaxt="n")
axis(1, at=c(1,13,25,37,49,61,73,85,97,109),
labels=c("2008-01","2009-01","2010-01","2011-01","2012-01",
"2013-01","2014-01","2015-01","2016-01","2017-01"))
dev.off()
hist(age_stats[!(age_stats$name %in% age_stats[age_stats$start_month<"2012-01",]$name),]$death_age)
# ag = age_stats[!(age_stats$name %in% age_stats[age_stats$start_month<"2012-01",]$name),]$death_age
# length(ag)
ag = age_stats[!(age_stats$name %in% age_stats[age_stats$start_month<"2012-01",]$name) &
age_stats$dead==1,]$death_age
length(ag)
plot(table(ag[ag>1]))
# Putting death_age = 0 and death_age = 1 together because of the way
# the data was collected: months are calendar months (if project starts
# on Jan 31st and dies on Feb 1st, death_age = 1)
dead_prematurely = age_stats[age_stats$dead &
(age_stats$death_age==0 |
age_stats$death_age==1),]$name
length(dead_prematurely)
# dead_prematurely6 = age_stats[age_stats$dead &
# (age_stats$death_age<6),]$name
# length(dead_prematurely6)
# Split the data to model premies separately
length(unique(subset(sd, !(name %in% age_stats[age_stats$start_month<"2012-01",]$name))$name))
nrow(sd)
sd.prem = subset(sd, !(name %in% unique(age_stats[age_stats$start_month<"2012-01",]$name)))
nrow(sd.prem)
length(unique(sd.prem$name))
sd.prem = subset(sd.prem, age<6)
# (name %in% dead_prematurely | age<=1))
# (name %in% dead_prematurely6 | age<6))
nrow(sd.prem)
length(unique(sd.prem$name))
table(sd.prem$dead)
sd.prem.sample = subset(sd.prem) #, age_start!=age)
nrow(sd.prem.sample)
# sd.prem.sample = rbind(sd.prem[sd.prem$dead==0,],
# sd.prem[sd.prem$dead==1,][sample(nrow(sd.prem[sd.prem$dead==1,]),
# nrow(sd.prem[sd.prem$dead==0,])),])
nrow(sd.prem.sample)
table(sd.prem.sample$dead)
# t = table(sd.prem.f$name)
# head(t[t>1])
sd.prem.sample.f = subset(sd.prem.sample,
commits<=exp(6) &
contributors<=exp(3.5) &
q90<=1 &
issues<=exp(5) &
submitters<=exp(5) &
downstreams<40 &
t_downstreams<=exp(8) &
dc_katz<0.001 &
non_dev_issues<5 &
cc_degree<50 &
upstreams<30)
m.prem = glm(dead ~ log(commits+1) +
# log(commits_sq + 1) +
log(contributors+1) +
# log(q90+1) +
# log(issues+1) +
log(non_dev_issues+1) +
# log(submitters+1) +
# log(non_dev_submitters+1) +
log(cc_degree+1) +
# log(downstreams+1) +
downstreams +
# log(t_downstreams+1) +
# log(upstreams+1) +
upstreams +
# log(d_upstreams+1) +
d_upstreams +
# log(t_upstreams+1),
scale(log(dc_katz+1)) +
has_university +
has_commercial +
# log(commercial+1) +
has_backporting +
license_type +
org,
family="binomial",
data=sd.prem.sample.f)
vif(m.prem)
summary(m.prem)
Anova(m.prem, type=2)
library(pscl)
pR2(m.prem)
# plot(m.prem)
source("helpers.r")
library(texreg)
library(xtable)
file="tex_model_glm.csv"
modelNames=c("Early-stage abandoners")
caption="Early-stage abandoners model; response: abandoned=TRUE"
mList = list(m1=m.prem)
makeTexRegCox(mList, file, modelNames, caption, digits=2)
print_Anova_glm(m.prem, "anova_model_glm_1.csv")
# print_anova(rddFit, "~/R/anova_model_fresh_rdd_1.csv")
# 2-month+ survivors
sd.f = subset(sd, !(name %in% dead_prematurely) &
!(name %in% age_stats[age_stats$start_month<"2012-01",]$name) &
age_start!=age & age>=6)
nrow(sd.f)
table(sd.f$dead)
# sd.0 = subset(sd, age_start!=age)
# sfit0 = survfit(Surv(sd.0$age_start, sd.0$age, sd.0$dead) ~ 1)
sd.f = subset(sd, age_start!=age & !(name %in% unique(age_stats[age_stats$start_month<"2012-01",]$name)))
s = Surv(sd.f$age_start, sd.f$age, sd.f$dead)
sfit = survfit(s ~ 1)
# plot(sfit, xlab="months", ylab="proportion not dead")
library(ggplot2)
ggsurvplot(
sfit,
# sfit0,
data = sd.f,
# data = sd.0,
size = 1, # change line size
palette = c("#2E9FDF"),
#c("#E7B800", "#2E9FDF"),# custom color palettes
conf.int = TRUE, # Add confidence interval
# pval = TRUE, # Add p-value
# risk.table = TRUE, # Add risk table
# risk.table.col = "strata",# Risk table color by groups
# legend.labs =
# c("Male", "Female"), # Change legend labels
# risk.table.height = 0.25, # Useful to change when you have multiple groups
legend = "none",
xlab = "Time in months", # customize X axis label.
break.time.by = 12, # break X axis in time intervals by 500.
ggtheme = theme_light()
# ggtheme = theme_bw() # Change ggplot2 theme
)
ggsave("base-survival.pdf", width = 6, height = 3.5)
# s = Surv(sd$age, sd$dead)
# sfit = survfit(s ~ 1)
# plot(sfit)
sd.f$commits_time = log(sd.f$commits+1) * sd.f$age_start
sd.f$downstreams_time = log(sd.f$downstreams+1) * log(sd.f$age_start+1)
sd.f$upstreams_time = log(sd.f$upstreams+1) * log(sd.f$age_start+1)
sd.f$q90_time = log(sd.f$q90+1) * log(sd.f$age_start)
sd.f$has_downstreams = sd.f$downstreams > 0
y = subset(sd.f,
commits<=exp(6) &
contributors<=exp(3.5) &
q90<=exp(2.5) &
issues<=exp(5) &
submitters<=exp(4) &
upstreams<=exp(4) &
downstreams<=500 &
t_downstreams<=exp(8) &
dc_katz<0.002 &
cc_degree<30)
hist(log(y$commits+1))
hist(log(y$contributors+1))
hist(log(y$q90+1))
hist(log(y$issues+1))
hist(log(y$submitters+1))
hist(log(y$upstreams+1))
hist(log(y$downstreams+1))
boxplot(log(downstreams+1) ~ age, y)
boxplot(log(upstreams+1) ~ age, y)
boxplot(log(upstreams_time+1) ~ age, y)
boxplot(log(q90+1) ~ age, y)
sm_repeating <- coxph(Surv(age_start, age, dead) ~
# sm_repeating <- coxph(Surv(age, dead) ~
# cluster(name) +
log(commits+1) +
log(contributors+1) +
# log(q90+1) +
# q90_time +
log(non_dev_issues+1) +
# log(issues+1) +
# log(non_dev_issues+1) + #* log(contributors+1) +
# log(submitters+1) +
# log(non_dev_submitters+1) +
log(cc_degree+1) +
# log(downstreams+1) +
downstreams +
# downstreams_time +
upstreams +
d_upstreams +
# log(upstreams_sq+1) +
# upstreams_time +
# log(t_downstreams+1) +
# log(t_upstreams+1),
scale(log(dc_katz+1)) +
has_university +
has_commercial +
has_backporting +
license_type +
org,
data=y)
# data=sd.prem.sample.f)
summary(sm_repeating)
hist(log(sd.prem.sample.f$commits+1))
hist(log(sd.prem.sample.f$contributors+1))
hist(log(sd.prem.sample.f$q90+1))
hist(log(sd.prem.sample.f$issues+1))
hist(log(sd.prem.sample.f$submitters+1))
hist(log(sd.prem.sample.f$upstreams+1))
hist(log(sd.prem.sample.f$downstreams+1))
boxplot(log(downstreams+1) ~ age, sd.prem.sample.f)
boxplot(downstreams ~ age, sd.prem.sample.f)
boxplot(log(upstreams+1) ~ age, sd.prem.sample.f)
boxplot(log(q90+1) ~ age, sd.prem.sample.f)
boxplot(dead ~ age, sd.prem.sample.f)
vif(sm_repeating)
# library("survminer")
# options("scipen"=100, "digits"=4)
cox.zph(sm_repeating) %>%
extract2("table") %>%
txtRound(digits = 2) %>%
knitr::kable(align = "r")
test.ph = cox.zph(sm_repeating)
ggcoxzph(test.ph)
Anova(sm_repeating, type=2)
round(exp(sm_repeating$coefficients), 2)
sm_repeating_inter <- coxph(Surv(age_start, age, dead) ~
# sm_repeating <- coxph(Surv(age, dead) ~
# cluster(name) +
log(commits+1) * log(non_dev_issues+1) +
# log(contributors+1) +
# log(q90+1) +
# q90_time +
# log(non_dev_issues+1) +
# log(issues+1) +
log(contributors+1) * log(non_dev_issues+1) +
# log(submitters+1) +
# log(non_dev_submitters+1) +
log(cc_degree+1) +
# log(downstreams+1) +
downstreams +
# downstreams_time +
# log(upstreams+1)+
upstreams +
# log(upstreams_sq+1)+
upstreams_sq +
d_upstreams +
# log(d_upstreams+1)+
# upstreams_time +
# log(t_downstreams+1) +
# log(t_upstreams+1),
scale(log(dc_katz+1)) +
has_university +
has_commercial +
has_backporting +
license_type +
org,
data=subset(y)) #, downstreams<=10 & upstreams<=20))
summary(sm_repeating_inter)
vif(sm_repeating_inter)
source("helpers.r")
library(texreg)
library(xtable)
file="tex_model_cox.csv"
modelNames=c("Abandoners")
caption="All abandoners except early-stage model"
mList = list(m1=sm_repeating)
makeTexRegCox(mList, file, modelNames, caption, digits=2)
print_Anova_glm(sm_repeating, "anova_model_cox_1.csv")
file="tex_model_all.csv"
modelNames=c("Early-stage abandoners","Non-early-stage abandoners","Non-early-stage abandoners")
caption="All abandoners except early-stage model"
mList = list(m1=m.prem, m2=sm_repeating, m3=sm_repeating_inter)
makeTexRegCox(mList, file, modelNames, caption, digits=2)
print_Anova_glm(m.prem, "anova_model_all_1.csv")
print_Anova_glm(sm_repeating, "anova_model_all_2.csv")
print_Anova_glm(sm_repeating_inter, "anova_model_all_3.csv")
# print_anova(rddFit, "~/R/anova_model_fresh_rdd_1.csv")
################ BELOW BE PUMPKINS ################
# sd.f = subset(sd, age_start!=age)
nrow(sd)
sd.f = droplevels(
subset(sd, age_start!=age &
!(name %in% age_stats[age_stats$start_month<"2012-01",]$name) &
commits<=exp(6) &
contributors<=exp(3.5) &
q90<=exp(3) &
issues<=exp(5) &
submitters<=exp(5) &
upstreams<=exp(4) &
downstreams<=exp(6.5) &
t_downstreams<=exp(8) &
dc_katz<=0.01)
)
nrow(sd.f)
# Projects being born per month
# plot(table(sd[sd$age==0,]$month))
# Projects dying per month
# plot(table(sd[sd$dead==1,]$month))
plot(table(sd.f$month))
sd_last <- sd[sd$last_observation == 1, ]
sd_new <- sd_last[sample(nrow(sd_last), 2000), ]
cox.fit.last <- coxph(Surv(age, dead) ~
+ q90
, data=sd_new)
summary(cox.fit.last)
cox.zph.last <- survival::cox.zph(cox.fit.last)
plot(cox.zph.last)
sd_last <- read.csv("survival_data4R_last.csv")
# sd_last$university <- as.integer(sd_last$uni > 0.2)
sd_last$commercial <- as.integer(sd_last$comm > 0.2)
sd_last$trivial <- as.integer(sd_last$max_contrib <= 1)
sd_full <- read.csv("survival_data4R.csv")
# sd_full$university <- as.integer(sd_full$uni > 0.2)
sd_full$commercial <- as.integer(sd_full$comm > 0.2)
sd_full$trivial <- as.integer(sd_full$max_contrib <= 1)
?survival::Surv
# baseline hazard
surv_obj = survfit(Surv(age, dead) ~ 1, data=sdl)
plot(surv_obj, xlab="months", ylab="proportion not dead")
title("Baseline survival function")
sd_last_split = survSplit(
Surv(age, dead) ~ .,
data=sd_last,
cut=c(1),
episode = 'tgroup'
)
head(sd_last_split)
cox.fit.last <- coxph(Surv(age, dead) ~
+ log(contributors1 + 1)
+ q90
+ log(non_dev_submitters + 1)
+ log(closed_issues + 1)
+ log(connectivity1000 + 1)
+ a_downstreams
+ upstreams
+ d_upstreams0
+ org
+ university
+ cc_closeness_1
, data=sd_last)
summary(cox.fit.last)
cox.zph.last <- cox.zph(cox.fit.last)
cox.zph.last
cox.zph.full <- cox.zph(cox.fit.full)
cox.zph.full
cox.fit.full <- coxph(Surv(age, dead) ~
+ log(contributors1 + 1)
+ q90
+ log(non_dev_submitters + 1)
+ log(closed_issues + 1)
+ log(connectivity1000 + 1)
+ a_downstreams
+ upstreams
+ d_upstreams0
+ org
+ university
+ cc_closeness_1
, data=sd_full)
summary(cox.fit.full)
cox.zph.full <- cox.zph(cox.fit.full)
cox.zph.full
ggcoxzph(cox.zph.full)
?sample
# sample dataset to reduce memory consumption
ggcoxdiagnostics(cox.fit.full)
surv_obj = survfit(Surv(age, dead) ~ !commercial, data=sdl)
plot(surv_obj, xlab="months", ylab="proportion not dead", col=c("blue", "red"))
legend("topright", c("commercial", "not"), col=c("blue", "red"), lty=1)
title("Commercial vs not")
?legend
surv_obj = survival::survfit(survival::Surv(age, dead) ~ !uni, data=sdl)
plot(surv_obj, xlab="months", ylab="proportion not dead", col=c("blue", "red"))
legend("topright", c("university", "not"), col=c("blue", "red"), lty=1)
title("University vs not")
surv_obj = survival::survfit(survival::Surv(age, dead) ~ !trivial, data=sdl)
plot(surv_obj, xlab="months", ylab="proportion not dead", col=c("blue", "red"))
legend("topright", c("trivial", "not"), col=c("blue", "red"), lty=1)
title("Trivial vs not")
# sd is just the same data without first year duplicates for compatibility
sd = sdl[(sdl$age > 11) | sdl$dead,]
head(sd)
s <- coxph(Surv(age, dead) ~
# fy # S INSIGNIFICANT
+ log(contributors1 + 1)
# + fy*log(contributors1 + 1) # S INSIGNIFICANT
#
+ q90 #+ gini # + commits # + q50 + q70
+ fy*q90
+ log(non_dev_submitters + 1)
# + fy*log(non_dev_submitters + 1) # S INSIGNIFICANT
#
+ log(closed_issues + 1)
# + fy*log(closed_issues + 1) # P INSIGNIFICANT
#
+ log(connectivity1000 + 1)
# + fy*log(connectivity1000 + 1) # S INSIGNIFICANT
#
+ a_downstreams # + a_downstreams + ac_downstreams + c_downstreams
+ fy*a_downstreams
#
+ upstreams
# + fy*upstreams # S INSIGNIFICANT
#
+ d_upstreams0
# + fy*d_upstreams0 # NA
#
+ org
+ fy*org
#
+ university
# + fy*university # P INSIGNIFICANT
# + commercial
# + dc_katz
# + fy*dc_katz
#
+ cc_closeness_1
# + fy*cc_closeness_1 # P INSIGNIFICANT
, data=sdl)
summary(s)
vif(s)
anova(s)
plot(survfit(s), xlab="months", ylab="proportion not dead")
# TODO:
# get number of commits in general for active developers
# also number of projects
s_nov <- coxph(Surv(age, dead) ~
+ log(contributors1 + 1)
+ q90 #+ gini # + commits # + q50 + q70
+ log(non_dev_submitters + 1)
# + non_dev_issues + issues # useless
+ log(closed_issues + 1)
# connectivity in the last month/year
+ log(connectivity1000 + 1)
+ a_downstreams # + a_downstreams + ac_downstreams + c_downstreams
# upstreams / a_upstreams:
+ upstreams
# binarize dead upstreams
+ d_upstreams0
+ org
+ dc_katz
+ cc_closeness_1
, data=sd)
summary(s_nov)
vif(s_nov)
anova(s_nov)
# Repeating observations
# Index([u'age', u'date', u'project', u'q50', u'gini', u'contributors',
# u'upstreams', u'commits', u'submitters', u'c_upstreams',
# u'c_downstreams', u'dead', u'ac_downstreams', u'connectivity',
# u'non_dev_issues', u'a_downstreams', u'closed_issues', u'ac_upstreams',
# u'a_upstreams', u'downstreams', u'issues', u'q70'],
# dtype='object')
# TO_CHECK (1|project_id) - random effect term
sm_repeating <- coxph(Surv(age, dead) ~ cluster(project)
+ log(contributors + 1) + q70 + gini
+ submitters
+ closed_issues
+ log(connectivity + 1)
+ a_downstreams
+ a_upstreams
, data=sd)
summary(sm_repeating)
vif(sm_repeating)
plot(survfit(s_nov), xlab="months", ylab="proportion not dead")
library(car)
vif(s)