"""This script enables us to run statistical tests on our dataset. In particular, we use the Kruskal-Wallis H-test to compare three categories. Because this involves multiple comparisons of p-values, we then run an additional post-hoc test (Dunn's test). Finally, to report our p-values, we also compute the effect size."""
import scipy
import scipy.stats as stats
import json
import csv
import seaborn as sns; sns.set()
import missingno as msno
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Posthoc test
import scikit_posthocs as sp
import statsmodels as sm
# Kruskal-Wallis H-test
from numpy.random import seed
from numpy.random import randn
from scipy.stats import kruskal
from sksurv.preprocessing import OneHotEncoder
from sksurv.metrics import concordance_index_censored
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sksurv.datasets import load_veterans_lung_cancer
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.svm import FastKernelSurvivalSVM
from sksurv.kernels import clinical_kernel
# Read the three per-category CSV files into df1, df2 and df3, all with the
# same parsing options (comma separator, UTF-8, csv.QUOTE_ALL quoting).
df1, df2, df3 = (
    pd.read_csv(csv_path, sep=',', encoding='utf-8', quoting=csv.QUOTE_ALL)
    for csv_path in (
        "~/research/scripts/onboarding/onboarding_notebook/category_1.csv",
        "~/research/scripts/onboarding/onboarding_notebook/category_2.csv",
        "~/research/scripts/onboarding/onboarding_notebook/category_3.csv",
    )
)
#!open .
from __future__ import division
"""This script was developed by Neil Ernst and is available on GitHub at:
https://github.com/neilernst/cliffsDelta
"""
def cliffsDelta(lst1, lst2, **dull):
    """Compute Cliff's delta for two samples and label its magnitude.

    Over all pairs ``(a, b)`` with ``a`` from ``lst1`` and ``b`` from ``lst2``,
    delta is ``(count(a > b) - count(a < b)) / (len(lst1) * len(lst2))``.

    :param lst1: first sample; must support ``len()`` and ``sorted()``.
    :param lst2: second sample; same requirements.
    :param dull: optional ``small`` / ``medium`` / ``large`` thresholds. Any
        threshold that is not supplied keeps its default value.
    :return: tuple ``(delta, size)`` where ``size`` is the label produced by
        ``lookup_size`` for ``delta``.
    :raises ValueError: if either sample is empty (delta is undefined).
    """
    # effect sizes from (Hess and Kromrey, 2004); caller values override them,
    # so passing only some of the thresholds no longer leads to a KeyError.
    thresholds = {'small': 0.147, 'medium': 0.33, 'large': 0.474}
    thresholds.update(dull)

    m, n = len(lst1), len(lst2)
    if m == 0 or n == 0:
        # Previously surfaced as ZeroDivisionError / UnboundLocalError.
        raise ValueError('cliffsDelta requires two non-empty samples')

    lst2 = sorted(lst2)
    j = more = less = 0
    # Walk both sorted samples once; j only ever moves forward through lst2.
    for repeats, x in runs(sorted(lst1)):
        # After this loop, j == number of lst2 values strictly below x.
        while j <= (n - 1) and lst2[j] < x:
            j += 1
        more += j * repeats
        # Skip ties so that n - j == number of lst2 values strictly above x.
        while j <= (n - 1) and lst2[j] == x:
            j += 1
        less += (n - j) * repeats
    d = (more - less) / (m * n)
    size = lookup_size(d, thresholds)
    return d, size
def lookup_size(delta: float, dull: dict) -> str:
    """Translate a delta value into a qualitative effect-size label.

    :type delta: float
    :type dull: dict, a dictionary of small, medium, large thresholds.
    :return: 'negligible', 'small', 'medium' or 'large', chosen by the band
        that ``abs(delta)`` falls into.
    """
    magnitude = abs(delta)
    label = None  # stays None only when no band matches (e.g. a NaN delta)
    if magnitude < dull['small']:
        label = 'negligible'
    elif dull['small'] <= magnitude < dull['medium']:
        label = 'small'
    elif dull['medium'] <= magnitude < dull['large']:
        label = 'medium'
    elif magnitude >= dull['large']:
        label = 'large'
    return label
def runs(lst):
    """Iterator, chunks repeated values.

    Yields ``(count, value)`` for each maximal run of consecutive equal items
    in ``lst``; ``value`` is the last item of the run. An empty input yields
    nothing (it used to fail with UnboundLocalError at the final yield).

    :param lst: any iterable; items are compared with ``!=``.
    """
    iterator = iter(lst)
    try:
        previous = next(iterator)
    except StopIteration:
        # Nothing to chunk.
        return
    count = 1
    for item in iterator:
        if item != previous:
            # The run ended just before this item; emit it and start a new one.
            yield count, previous
            count = 0
        count += 1
        previous = item
    # Emit the final run, which the loop above never closes.
    yield count, previous
## Kruskal-Wallis H-test on 'ProportionCommit' across the three categories,
## followed by Dunn's post-hoc test with Holm-adjusted p-values.
## Load data
data1, data2, data3 = df1['ProportionCommit'], df2['ProportionCommit'], df3['ProportionCommit']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(data1, data2, data3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
#sp.posthoc_conover(x, p_adjust = 'holm')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
## Kruskal-Wallis H-test on 'Commits' across the three categories, followed by
## Dunn's post-hoc test (Holm-adjusted) and pairwise Cliff's delta.
## Load data
data1, data2, data3 = df1['Commits'], df2['Commits'], df3['Commits']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(data1, data2, data3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
#sp.posthoc_conover(x, p_adjust = 'holm')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# 'effect\_size' contained an invalid escape sequence and printed a stray
# backslash; the label is now plain 'effect_size'.
print('The cliff delta between Cat-I and Cat-II of Commits distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta between Cat-II and Cat-III of Commits distributions = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
print('The cliff delta between Cat-I and Cat-III of Commits distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
## Kruskal-Wallis H-test on 'Efforts' across the three categories, followed by
## Dunn's post-hoc test (Holm-adjusted) and pairwise Cliff's delta.
## Load data
data1, data2, data3 = df1['Efforts'], df2['Efforts'], df3['Efforts']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(data1, data2, data3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
#sp.posthoc_conover(x, p_adjust = 'holm')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# Output label corrected from the misspelled 'effect_szise'.
print('The cliff delta between Cat-I and Cat-II of Efforts distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta between Cat-II and Cat-III of Efforts distributions = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
print('The cliff delta between Cat-I and Cat-III of Efforts distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
## Kruskal-Wallis H-test on 'Buggy' across the three categories, followed by
## Dunn's post-hoc test (Holm-adjusted) and pairwise Cliff's delta.
## The printed messages label this column as "Quality".
## Load data
data1, data2, data3 = df1['Buggy'], df2['Buggy'], df3['Buggy']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(data1, data2, data3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
#sp.posthoc_conover(x, p_adjust = 'holm')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# 'effect\_size' contained an invalid escape sequence and printed a stray
# backslash; the label is now plain 'effect_size'.
print('The cliff delta between Cat-I and Cat-II of Quality distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta between Cat-II and Cat-III of Quality distributions = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
print('The cliff delta between Cat-I and Cat-III of Quality distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
## Kruskal-Wallis H-test on 'Tech_Diversity' across the three categories,
## followed by Dunn's post-hoc test (Holm-adjusted) and pairwise Cliff's delta.
## Load data
data1, data2, data3 = df1['Tech_Diversity'], df2['Tech_Diversity'], df3['Tech_Diversity']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(data1, data2, data3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
#sp.posthoc_conover(x, p_adjust = 'holm')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# 'effect\_size' contained an invalid escape sequence and printed a stray
# backslash; the label is now plain 'effect_size'.
print('The cliff delta between Cat-I and Cat-II of Technical-Diversity distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta between Cat-II and Cat-III of Technical-Diversity distributions = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
print('The cliff delta between Cat-I and Cat-III of Technical-Diversity distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
## Kruskal-Wallis H-test on 'CodeDiversity' across the three categories,
## followed by Dunn's post-hoc test (Holm-adjusted) and pairwise Cliff's delta.
## Load data
data1, data2, data3 = df1['CodeDiversity'], df2['CodeDiversity'], df3['CodeDiversity']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(data1, data2, data3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
#sp.posthoc_conover(x, p_adjust = 'holm')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# 'effect\_size' contained an invalid escape sequence and printed a stray
# backslash; the label is now plain 'effect_size'.
print('The cliff delta between Cat-I and Cat-II of Code-Diversity distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta between Cat-II and Cat-III of Code-Diversity distributions = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
print('The cliff delta between Cat-I and Cat-III of Code-Diversity distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
## Kruskal-Wallis H-test on 'Affiliated' across the three categories, followed
## by Dunn's post-hoc test (Holm-adjusted) and pairwise Cliff's delta.
## Load data
data1, data2, data3 = df1['Affiliated'], df2['Affiliated'], df3['Affiliated']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
#sp.posthoc_conover(x, p_adjust = 'holm')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# 'effect\_size' contained an invalid escape sequence and printed a stray
# backslash; the label is now plain 'effect_size'.
print('The cliff delta between Cat-I and Cat-II of Affiliation distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta between Cat-II and Cat-III of Affiliation distributions = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
print('The cliff delta between Cat-I and Cat-III of Affiliation distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
## Kruskal-Wallis H-test on 'Time2Commit' across the three categories, followed
## by Dunn's post-hoc test (Holm-adjusted) and pairwise Cliff's delta.
## Load data
data1, data2, data3 = df1['Time2Commit'], df2['Time2Commit'], df3['Time2Commit']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
#sp.posthoc_conover(x, p_adjust = 'holm')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# 'effect\_size' contained an invalid escape sequence and printed a stray
# backslash; the label is now plain 'effect_size'.
print('The cliff delta between Cat-I and Cat-II of Time2First-Commit distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta between Cat-II and Cat-III of Time2First-Commit distributions = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
print('The cliff delta between Cat-I and Cat-III of Time2First-Commit distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
# ## Females df1, df2, and df3
# df1_female = df1.groupby(['Gender']).get_group(('female'))
# df2_female = df2.groupby(['Gender']).get_group(('female'))
# df3_female = df3.groupby(['Gender']).get_group(('female'))
# ## Males df1, df2, and df3
# df1_male = df1.groupby(['Gender']).get_group(('male'))
# df2_male = df2.groupby(['Gender']).get_group(('male'))
# df3_male = df3.groupby(['Gender']).get_group(('male'))
# ## Neutrals df1, df2, and df3
# df1_neutral = df1.groupby(['Gender']).get_group(('neutral'))
# df2_neutral = df2.groupby(['Gender']).get_group(('neutral'))
# df3_neutral = df3.groupby(['Gender']).get_group(('neutral'))
##############################################################
#########. FROM CAT TO NUMNERICAL
# y1 = np.array([45, 42, 27]) ## Male
# y2 = np.array([11, 12, 20]) ## Neutral
# y3 = np.array([ 5, 7, 14]) ## Female
################################ Medians ##############################
# Hard-coded samples (seven values each) per gender group and category.
# NOTE(review): how these numbers were derived is not shown in this file --
# presumably from the 'Gender' column of df1/df2/df3; confirm with the author.
## Category I
femaledataset1 = [4, 3, 3, 4, 5, 3, 5]
maledataset1 = [51, 51, 52, 45, 45, 46, 44]
neutraldataset1 = [6, 7, 6, 12, 11, 12, 12]
## Category II
femaledataset2 = [4, 5, 4, 4, 6, 6, 7]
maledataset2 = [48, 47, 47, 48, 45, 44, 42]
neutraldataset2 = [9, 9, 10, 9, 10, 11, 12]
## Category III
femaledataset3 = [10, 13, 13, 15, 14, 14, 15]
maledataset3 = [34, 32, 30, 23, 25, 27, 24]
neutraldataset3 = [17, 16, 18, 23, 22, 20, 22]
import statistics as ss
# Print the median: as a bare expression it was computed and then discarded
# whenever this code runs outside an interactive cell.
print(ss.median(neutraldataset3))
## Kruskal-Wallis H-test on the hard-coded female samples of the three
## categories, followed by Dunn's post-hoc test (Holm-adjusted) and pairwise
## Cliff's delta.
## Data
data1 = femaledataset1 #df1_female['Gender']
data2 = femaledataset2 #df2_female['Gender']
data3 = femaledataset3 #df3_female['Gender']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# Output label corrected from the misspelled 'effect_szise'.
# Pair order here: (1, 2), (1, 3), (2, 3).
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
## Kruskal-Wallis H-test on the hard-coded male samples of the three
## categories, followed by Dunn's post-hoc test (Holm-adjusted) and pairwise
## Cliff's delta.
data1 = maledataset1 #df1_male['Gender']
data2 = maledataset2 #df2_male['Gender']
data3 = maledataset3 #df3_male['Gender']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# Output label corrected from the misspelled 'effect_szise'.
# Pair order here: (1, 2), (2, 3), (1, 3).
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
## Kruskal-Wallis H-test on the hard-coded neutral samples of the three
## categories, followed by Dunn's post-hoc test (Holm-adjusted) and pairwise
## Cliff's delta.
data1 = neutraldataset1 #df1_neutral['Gender']
data2 = neutraldataset2 #df2_neutral['Gender']
data3 = neutraldataset3 #df3_neutral['Gender']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# 'effect\_size' contained an invalid escape sequence and printed a stray
# backslash; the label is now plain 'effect_size'.
print('The cliff delta of Neutrals distributions Cat-I vs. Cat-II = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta of Neutrals distributions Cat-II vs. Cat-III = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
print('The cliff delta of Neutrals distributions Cat-I vs. Cat-III = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
## Kruskal-Wallis H-test on 'Density' across the three categories, followed by
## Dunn's post-hoc test with Holm-adjusted p-values.
## Load data
data1, data2, data3 = df1['Density'], df2['Density'], df3['Density']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
## Kruskal-Wallis H-test on 'Longivity' across the three categories, followed
## by Dunn's post-hoc test (Holm-adjusted) and pairwise Cliff's delta.
## Load data
data1, data2, data3 = df1['Longivity'], df2['Longivity'], df3['Longivity']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# Output label corrected from the misspelled 'effect_szise'.
# Pair order here: (1, 2), (1, 3), (2, 3).
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
## Kruskal-Wallis H-test on 'Density' across the three categories, followed by
## Dunn's post-hoc test (Holm-adjusted) and pairwise Cliff's delta.
## Load data
data1, data2, data3 = df1['Density'], df2['Density'], df3['Density']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# Output label corrected from the misspelled 'effect_szise'.
# Pair order here: (1, 2), (1, 3), (2, 3).
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
## Kruskal-Wallis H-test on 'Accept_Rate' across the three categories, followed
## by Dunn's post-hoc test (Holm-adjusted) and pairwise Cliff's delta.
## Load data
data1, data2, data3 = df1['Accept_Rate'], df2['Accept_Rate'], df3['Accept_Rate']
## processing data as list
x1, x2, x3 = list(data1), list(data2), list(data3)
x = [x1, x2, x3]
# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
# k, df and H are not read by any code visible here; H = 5.99 is presumably the
# chi-square critical value for df = 2 at alpha = 0.05 -- TODO confirm.
k = 3
df = k-1
H = 5.99
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
# Print the pairwise result: as a bare expression it was computed and then
# discarded whenever this code runs outside an interactive cell.
print(sp.posthoc_dunn(x, p_adjust = 'holm'))
# Output label corrected from the misspelled 'effect_szise'.
# Pair order here: (1, 2), (1, 3), (2, 3).
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x2))
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x1, x3))
print('The cliff delta of the distributions = %.3f and The effect_size is: %s' % cliffsDelta(x2, x3))
# from scipy.stats import norm # scipy's normal distribution module
# import matplotlib.pyplot as plt
# import numpy as np
# %matplotlib inline
# fig = plt.figure(figsize=(10, 4))
# x = np.linspace(-4, 4, 101)
# y = norm.pdf(x) # probability density function
# pct95_1 = x < norm.ppf(0.025); pct975_1 = x < norm.ppf(0.0125); pct99_1 = x < norm.ppf(0.005)
# pct95_2 = x > norm.ppf(0.975); pct975_2 = x > norm.ppf(0.9875); pct99_2 = x > norm.ppf(0.995)
# ax = plt.subplot(111)
# ax.plot(x, y)
# opacity = 0.25
# shading = "k"
# sections = [pct95_1, pct95_2, pct975_1, pct975_2, pct99_1, pct99_2]
# for section in sections:
# ax.fill_between(x[section], np.zeros(sum(section)), y[section], color=shading, alpha=opacity)
# ax.text(0.1, 0.9, "Two-tailed t-Test", transform=ax.transAxes)
# ax.set_ylim(0, 0.41)
# plt.show()
# fig = plt.figure(figsize=(10, 4))
# x = np.linspace(-4, 4, 101)
# y = norm.pdf(x)
# pct95 = x < norm.ppf(0.05); pct975 = x < norm.ppf(0.025); pct99 = x < norm.ppf(0.01)
# sections = [pct95, pct975, pct99]
# ax = plt.subplot(111)
# ax.plot(x, y)
# for section in sections:
# ax.fill_between(x[section], np.zeros(sum(section)), y[section], color='k', alpha=0.25)
# ax.text(0.1, 0.9, "Left-tailed t-Test", transform=ax.transAxes)
# ax.set_ylim(0, 0.41)
# plt.show()
# df1 = pd.read_csv("~/research/scripts/onboarding/onboarding_notebook/category_1.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
# df2 = pd.read_csv("~/research/scripts/onboarding/onboarding_notebook/category_2.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
# df3 = pd.read_csv("~/research/scripts/onboarding/onboarding_notebook/category_3.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
#df3.groupby(['Gender', 'Events'])['Gender'].count()