Statistical testing

This script enable us too run statistical testing on our dataset. In particular, we used Kruskal-Wallis H-test, and to compare three .categories and then we had to account for multiple comparison of p-values so we did an additional test post-hoc; Dunny test and finally to report our p-values, we need the effect size as well.

Import Libraries

In [1]:
import scipy
import scipy.stats as stats
import json
import csv
import seaborn as sns; sns.set()
import missingno as msno
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings

# Posthoc test
import scikit_posthocs as sp
import statsmodels as sm
# Kruskal-Wallis H-test
from numpy.random import seed
from numpy.random import randn
from scipy.stats import kruskal


from sksurv.preprocessing import OneHotEncoder
from sksurv.metrics import concordance_index_censored
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sksurv.datasets import load_veterans_lung_cancer
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.svm import FastKernelSurvivalSVM
from sksurv.kernels import clinical_kernel

Load dataset

In [9]:
df1 = pd.read_csv("~/research/scripts/onboarding/onboarding_notebook/category_1.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
df2 = pd.read_csv("~/research/scripts/onboarding/onboarding_notebook/category_2.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
df3 = pd.read_csv("~/research/scripts/onboarding/onboarding_notebook/category_3.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
In [10]:
#!open .

Statistical testing -- complete dataset with confidential info.

Comparing Three categories Cat I vs. Cat II and Cat II vs. Cat III

Cliff Delta

In [2]:
from __future__ import division

"""This script is developed by Neil Ernst and available at GitHub at: 
    https://github.com/neilernst/cliffsDelta
"""
def cliffsDelta(lst1, lst2, **dull):

    """Returns delta and true if there are more than 'dull' differences"""
    if not dull:
        dull = {'small': 0.147, 'medium': 0.33, 'large': 0.474} # effect sizes from (Hess and Kromrey, 2004)
    m, n = len(lst1), len(lst2)
    lst2 = sorted(lst2)
    j = more = less = 0
    for repeats, x in runs(sorted(lst1)):
        while j <= (n - 1) and lst2[j] < x:
            j += 1
        more += j*repeats
        while j <= (n - 1) and lst2[j] == x:
            j += 1
        less += (n - j)*repeats
    d = (more - less) / (m*n)
    size = lookup_size(d, dull)
    return d, size


def lookup_size(delta: float, dull: dict) -> str:
    """
    :type delta: float
    :type dull: dict, a dictionary of small, medium, large thresholds.
    """
    delta = abs(delta)
    if delta < dull['small']:
        return 'negligible'
    if dull['small'] <= delta < dull['medium']:
        return 'small'
    if dull['medium'] <= delta < dull['large']:
        return 'medium'
    if delta >= dull['large']:
        return 'large'


def runs(lst):
    """Iterator, chunks repeated values"""
    for j, two in enumerate(lst):
        if j == 0:
            one, i = two, 0
        if one != two:
            yield j - i, one
            i = j
        one = two
    yield j - i + 1, two

Commits proportion

In [12]:
## Load data
data1 = df1['ProportionCommit']
data2 = df2['ProportionCommit']
data3 = df3['ProportionCommit']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(data1, data2, data3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
Statistics=669.2977660270117, p=4.6114418029198995e-146
Different distributions (reject H0)
In [13]:
#sp.posthoc_conover(x, p_adjust = 'holm')
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[13]:
1 2 3
1 -1.000000e+00 7.211036e-01 3.289690e-109
2 7.211036e-01 -1.000000e+00 2.195114e-112
3 3.289690e-109 2.195114e-112 -1.000000e+00

Comparing Commits

In [14]:
## Load data
data1 = df1['Commits']
data2 = df2['Commits']
data3 = df3['Commits']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(data1, data2, data3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
Statistics=nan, p=nan
Different distributions (reject H0)
In [15]:
#sp.posthoc_conover(x, p_adjust = 'holm')
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[15]:
1 2 3
1 -1.000000e+00 3.917513e-01 1.264505e-121
2 3.917513e-01 -1.000000e+00 3.333606e-113
3 1.264505e-121 3.333606e-113 -1.000000e+00
In [16]:
print('The cliff delta between Cat-I and Cat-II of Commits distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x2))
The cliff delta between Cat-I and Cat-II of Commits distributions = -0.043 and The effect\_size is: negligible
In [17]:
print('The cliff delta between Cat-II and Cat-III of Commits distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x2, x3))
The cliff delta between Cat-II and Cat-III of Commits distributions = -0.907 and The effect\_size is: large
In [18]:
print('The cliff delta between Cat-I and Cat-III of Commits distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x3))
The cliff delta between Cat-I and Cat-III of Commits distributions = -0.914 and The effect\_size is: large

Comparing Efforts

In [214]:
## Load data
data1 = df1['Efforts']
data2 = df2['Efforts']
data3 = df3['Efforts']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(data1, data2, data3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
Statistics=513.1652137900775, p=3.694789264988701e-112
Different distributions (reject H0)
In [215]:
#sp.posthoc_conover(x, p_adjust = 'holm')
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[215]:
1 2 3
1 -1.000000e+00 5.727693e-02 6.514003e-93
2 5.727693e-02 -1.000000e+00 6.621532e-77
3 6.514003e-93 6.621532e-77 -1.000000e+00
In [216]:
print('The cliff delta between Cat-I and Cat-II of Efforts distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x1, x2))
The cliff delta between Cat-I and Cat-II of Efforts distributions = 0.086 and The effect_szise is: negligible
In [217]:
print('The cliff delta between Cat-II and Cat-III of Efforts distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x2, x3))
The cliff delta between Cat-II and Cat-III of Efforts distributions = 0.742 and The effect_szise is: large
In [218]:
print('The cliff delta between Cat-I and Cat-III of Efforts distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x1, x3))
The cliff delta between Cat-I and Cat-III of Efforts distributions = 0.794 and The effect_szise is: large

Comparing Quality

In [283]:
## Load data
data1 = df1['Buggy']
data2 = df2['Buggy']
data3 = df3['Buggy']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(data1, data2, data3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
Statistics=349.84148199784545, p=1.0786670529336249e-76
Different distributions (reject H0)
In [284]:
#sp.posthoc_conover(x, p_adjust = 'holm')
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[284]:
1 2 3
1 -1.000000e+00 6.516144e-01 4.180242e-60
2 6.516144e-01 -1.000000e+00 4.290944e-57
3 4.180242e-60 4.290944e-57 -1.000000e+00
In [285]:
print('The cliff delta between Cat-I and Cat-II of Quality distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x2))
The cliff delta between Cat-I and Cat-II of Quality distributions = -0.023 and The effect\_size is: negligible
In [286]:
print('The cliff delta between Cat-II and Cat-III of Quality distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x2, x3))
The cliff delta between Cat-II and Cat-III of Quality distributions = -0.636 and The effect\_size is: large
In [287]:
print('The cliff delta between Cat-I and Cat-III of Quality distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x3))
The cliff delta between Cat-I and Cat-III of Quality distributions = -0.644 and The effect\_size is: large

Technical Diversity --- How many projects contributing to?

In [293]:
## Load data
data1 = df1['Tech_Diversity']
data2 = df2['Tech_Diversity']
data3 = df3['Tech_Diversity']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(data1, data2, data3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
    
Statistics=979.8387416121095, p=1.7010641089438713e-213
Different distributions (reject H0)
In [294]:
#sp.posthoc_conover(x, p_adjust = 'holm')
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[294]:
1 2 3
1 -1.000000e+00 2.925983e-01 1.944583e-167
2 2.925983e-01 -1.000000e+00 3.258837e-155
3 1.944583e-167 3.258837e-155 -1.000000e+00
In [295]:
print('The cliff delta between Cat-I and Cat-II of Technical-Diversity distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x2))
The cliff delta between Cat-I and Cat-II of Technical-Diversity distributions = -0.053 and The effect\_size is: negligible
In [296]:
print('The cliff delta between Cat-II and Cat-III of Technical-Diversity distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x2, x3))
The cliff delta between Cat-II and Cat-III of Technical-Diversity distributions = -0.961 and The effect\_size is: large
In [297]:
print('The cliff delta between Cat-I and Cat-III of Technical-Diversity distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x3))
The cliff delta between Cat-I and Cat-III of Technical-Diversity distributions = -0.968 and The effect\_size is: large

Code Diversity --- Compute-engine, Edge computing, IoT, networking, security, etc.

In [303]:
## Load data
data1 = df1['CodeDiversity']
data2 = df2['CodeDiversity']
data3 = df3['CodeDiversity']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(data1, data2, data3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
    
Statistics=309.27826493833413, p=6.935504890963666e-68
Different distributions (reject H0)
In [304]:
#sp.posthoc_conover(x, p_adjust = 'holm')
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[304]:
1 2 3
1 -1.000000e+00 2.911554e-12 7.344661e-68
2 2.911554e-12 -1.000000e+00 1.971502e-25
3 7.344661e-68 1.971502e-25 -1.000000e+00
In [305]:
print('The cliff delta between Cat-I and Cat-II of Code-Diversity distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x2))
The cliff delta between Cat-I and Cat-II of Code-Diversity distributions = -0.273 and The effect\_size is: small
In [306]:
print('The cliff delta between Cat-II and Cat-III of Code-Diversity distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x2, x3))
The cliff delta between Cat-II and Cat-III of Code-Diversity distributions = -0.409 and The effect\_size is: medium
In [307]:
print('The cliff delta between Cat-I and Cat-III of Code-Diversity distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x3))
The cliff delta between Cat-I and Cat-III of Code-Diversity distributions = -0.669 and The effect\_size is: large

Affiliated

In [270]:
## Load data
data1 = df1['Affiliated']
data2 = df2['Affiliated']
data3 = df3['Affiliated']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
    
Statistics=292.047889819127, p=3.824780354828373e-64
Different distributions (reject H0)
In [271]:
#sp.posthoc_conover(x, p_adjust = 'holm')
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[271]:
1 2 3
1 -1.000000e+00 9.530684e-03 1.279365e-56
2 9.530684e-03 -1.000000e+00 3.000640e-40
3 1.279365e-56 3.000640e-40 -1.000000e+00
In [272]:
print('The cliff delta between Cat-I and Cat-II of Affiliation distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x2))
The cliff delta between Cat-I and Cat-II of Affiliation distributions = -0.082 and The effect_szise is: negligible
In [273]:
print('The cliff delta between Cat-II and Cat-III of Affiliation distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x2, x3))
The cliff delta between Cat-II and Cat-III of Affiliation distributions = -0.422 and The effect_szise is: medium
In [274]:
print('The cliff delta between Cat-I and Cat-III of Affiliation distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x3))
The cliff delta between Cat-I and Cat-III of Affiliation distributions = -0.504 and The effect_szise is: large

Time to first commit

In [275]:
## Load data
data1 = df1['Time2Commit']
data2 = df2['Time2Commit']
data3 = df3['Time2Commit']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
    
Statistics=518.165432285491, p=3.0325364015770204e-113
Different distributions (reject H0)
In [276]:
#sp.posthoc_conover(x, p_adjust = 'holm')
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[276]:
1 2 3
1 -1.000000e+00 6.993636e-01 1.131578e-87
2 6.993636e-01 -1.000000e+00 1.555446e-84
3 1.131578e-87 1.555446e-84 -1.000000e+00
In [280]:
print('The cliff delta between Cat-I and Cat-II of Time2First-Commit distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x2))
The cliff delta between Cat-I and Cat-II of Time2First-Commit distributions = -0.010 and The effect\_size is: negligible
In [281]:
print('The cliff delta between Cat-II and Cat-III of Time2First-Commit distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x2, x3))
The cliff delta between Cat-II and Cat-III of Time2First-Commit distributions = 0.746 and The effect\_size is: large
In [282]:
print('The cliff delta between Cat-I and Cat-III of Time2First-Commit distributions = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x3))
The cliff delta between Cat-I and Cat-III of Time2First-Commit distributions = 0.811 and The effect\_size is: large

Gender By Category [female, male, neutral]

In [76]:
# ## Females df1, df2, and df3
# df1_female  = df1.groupby(['Gender']).get_group(('female'))
# df2_female  = df2.groupby(['Gender']).get_group(('female'))
# df3_female  = df3.groupby(['Gender']).get_group(('female'))

# ## Males df1, df2, and df3
# df1_male    = df1.groupby(['Gender']).get_group(('male'))
# df2_male    = df2.groupby(['Gender']).get_group(('male'))
# df3_male    = df3.groupby(['Gender']).get_group(('male'))

# ## Neutrals df1, df2, and df3
# df1_neutral = df1.groupby(['Gender']).get_group(('neutral'))
# df2_neutral = df2.groupby(['Gender']).get_group(('neutral'))
# df3_neutral = df3.groupby(['Gender']).get_group(('neutral'))
##############################################################

#########. FROM CAT TO NUMNERICAL
# y1 = np.array([45, 42, 27])   ## Male
# y2 = np.array([11, 12, 20])  ## Neutral
# y3 = np.array([ 5,  7, 14]) ## Female

################################ Medians ##############################
## Category I
femaledataset1  = [4,   3,  3,  4,  5,  3,  5] 
maledataset1    = [51, 51, 52, 45, 45, 46, 44]                            
neutraldataset1 = [ 6,  7,  6, 12, 11, 12, 12] 

## Category II
femaledataset2  = [ 4,  5,  4,  4,  6,  6,  7]
maledataset2    = [48, 47, 47, 48, 45, 44, 42]
neutraldataset2 = [ 9,  9, 10,  9, 10, 11, 12]

## Category III
femaledataset3  = [10, 13, 13, 15, 14, 14, 15]        
maledataset3    = [34, 32, 30, 23, 25, 27, 24] 
neutraldataset3 = [17, 16, 18, 23, 22, 20, 22] 
In [103]:
import statistics as ss
ss.median(neutraldataset3)
Out[103]:
20
In [ ]:

In [78]:
 

Females comparisons

In [79]:
## Data 
data1 = femaledataset1    #df1_female['Gender']
data2 = femaledataset2    #df2_female['Gender']
data3 = femaledataset3    #df3_female['Gender']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
Statistics=15.240621447518006, p=0.0004903894317895102
Different distributions (reject H0)
In [80]:
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[80]:
1 2 3
1 -1.000000 0.20684 0.000384
2 0.206840 -1.00000 0.020450
3 0.000384 0.02045 -1.000000
In [81]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x1, x2))
The cliff delta of the distributions = -0.592 and The effect_szise is: large
In [82]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x1, x3))
The cliff delta of the distributions = -1.000 and The effect_szise is: large
In [83]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x2, x3))
The cliff delta of the distributions = -1.000 and The effect_szise is: large

Males comparisons

In [84]:
data1 = maledataset1 #df1_male['Gender']
data2 = maledataset2 #df2_male['Gender'] 
data3 = maledataset3 #df3_male['Gender']

## processing data as list
x1 = list(data1) # df1_male
x2 = list(data2) #df2_male
x3 = list(data3) #df3_male
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
 
Statistics=13.701976874300637, p=0.0010584090049194882
Different distributions (reject H0)
In [85]:
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[85]:
1 2 3
1 -1.000000 0.604302 0.001789
2 0.604302 -1.000000 0.007113
3 0.001789 0.007113 -1.000000
In [86]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x1, x2))
The cliff delta of the distributions = 0.245 and The effect_szise is: small
In [87]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x2, x3))
The cliff delta of the distributions = 1.000 and The effect_szise is: large
In [88]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x1, x3))
The cliff delta of the distributions = 1.000 and The effect_szise is: large

Neutral Comparisons

In [89]:
data1 = neutraldataset1 #df1_neutral['Gender']
data2 = neutraldataset2 #df2_neutral['Gender']
data3 = neutraldataset3 #df3_neutral['Gender']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
 
Statistics=13.523559226581572, p=0.001157168028824208
Different distributions (reject H0)
In [90]:
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[90]:
1 2 3
1 -1.000000 0.965441 0.004035
2 0.965441 -1.000000 0.004035
3 0.004035 0.004035 -1.000000
In [91]:
print('The cliff delta of Neutrals distributions Cat-I vs. Cat-II = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x2))
The cliff delta of Neutrals distributions Cat-I vs. Cat-II = 0.020 and The effect\_size is: negligible
In [92]:
print('The cliff delta of Neutrals distributions Cat-II vs. Cat-III = %.3f and The effect\_size is: %s' % cliffsDelta(x2, x3))
The cliff delta of Neutrals distributions Cat-II vs. Cat-III = -1.000 and The effect\_size is: large
In [93]:
print('The cliff delta of Neutrals distributions Cat-I vs. Cat-III = %.3f and The effect\_size is: %s' % cliffsDelta(x1, x3))
The cliff delta of Neutrals distributions Cat-I vs. Cat-III = -1.000 and The effect\_size is: large

Commit density == Lines of code per commit for each developer.

Commit density – Lines of code per commit for each developer.

In [47]:
## Load data
data1 = df1['Density']
data2 = df2['Density']
data3 = df3['Density']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
Statistics=155.92795477906947, p=1.3825308547698142e-34
Different distributions (reject H0)
In [48]:
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[48]:
1 2 3
1 -1.000000e+00 4.449379e-01 1.539600e-28
2 4.449379e-01 -1.000000e+00 4.481624e-25
3 1.539600e-28 4.481624e-25 -1.000000e+00

Logivity

In [46]:
## Load data
data1 = df1['Longivity']
data2 = df2['Longivity']
data3 = df3['Longivity']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
Statistics=nan, p=nan
Different distributions (reject H0)
In [47]:
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[47]:
1 2 3
1 -1.000000e+00 1.858889e-02 8.963070e-153
2 1.858889e-02 -1.000000e+00 3.711013e-127
3 8.963070e-153 3.711013e-127 -1.000000e+00
In [48]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x1, x2))
The cliff delta of the distributions = -0.140 and The effect_szise is: negligible
In [49]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x1, x3))
The cliff delta of the distributions = -0.999 and The effect_szise is: large
In [50]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x2, x3))
The cliff delta of the distributions = -0.998 and The effect_szise is: large

Commits Density

In [36]:
## Load data
data1 = df1['Density']
data2 = df2['Density']
data3 = df3['Density']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
Statistics=155.92795477906947, p=1.3825308547698142e-34
Different distributions (reject H0)
In [37]:
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[37]:
1 2 3
1 -1.000000e+00 4.449379e-01 1.539600e-28
2 4.449379e-01 -1.000000e+00 4.481624e-25
3 1.539600e-28 4.481624e-25 -1.000000e+00
In [38]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x1, x2))
The cliff delta of the distributions = -0.035 and The effect_szise is: negligible
In [39]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x1, x3))
The cliff delta of the distributions = -0.437 and The effect_szise is: medium
In [40]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x2, x3))
The cliff delta of the distributions = -0.416 and The effect_szise is: medium

Patch_Acceptance_Rate

In [36]:
## Load data
data1 = df1['Accept_Rate']
data2 = df2['Accept_Rate']
data3 = df3['Accept_Rate']

## processing data as list
x1 = list(data1)
x2 = list(data2)
x3 = list(data3)
x = [x1, x2, x3]

# compare samples
stat, p = kruskal(x1, x2, x3)
print(f'Statistics={stat}, p={p}')
# interpret
k = 3
df = k-1 
H = 5.99
alpha = 0.05
if p > alpha:
	print('Same distributions (fail to reject H0)')
else:
	print('Different distributions (reject H0)')
Statistics=nan, p=nan
Different distributions (reject H0)
In [37]:
sp.posthoc_dunn(x, p_adjust = 'holm')
Out[37]:
1 2 3
1 -1.000000e+00 6.250840e-01 2.929445e-22
2 6.250840e-01 -1.000000e+00 3.134132e-24
3 2.929445e-22 3.134132e-24 -1.000000e+00
In [38]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x1, x2))
The cliff delta of the distributions = 0.027 and The effect_szise is: negligible
In [39]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x1, x3))
The cliff delta of the distributions = -0.400 and The effect_szise is: medium
In [40]:
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(x2, x3))
The cliff delta of the distributions = -0.405 and The effect_szise is: medium
In [27]:
# from scipy.stats import norm # scipy's normal distribution module
# import matplotlib.pyplot as plt
# import numpy as np
# %matplotlib inline

# fig = plt.figure(figsize=(10, 4))
# x = np.linspace(-4, 4, 101)
# y = norm.pdf(x) # probability density function

# pct95_1 = x < norm.ppf(0.025); pct975_1 = x < norm.ppf(0.0125); pct99_1 = x < norm.ppf(0.005)
# pct95_2 = x > norm.ppf(0.975); pct975_2 = x > norm.ppf(0.9875); pct99_2 = x > norm.ppf(0.995)

# ax = plt.subplot(111)
# ax.plot(x, y)
# opacity = 0.25
# shading = "k"
# sections = [pct95_1, pct95_2, pct975_1, pct975_2, pct99_1, pct99_2]
# for section in sections:
#     ax.fill_between(x[section], np.zeros(sum(section)), y[section], color=shading, alpha=opacity)

# ax.text(0.1, 0.9, "Two-tailed t-Test", transform=ax.transAxes)
# ax.set_ylim(0, 0.41)
# plt.show()
In [199]:
# fig = plt.figure(figsize=(10, 4))
# x = np.linspace(-4, 4, 101)
# y = norm.pdf(x)

# pct95 = x < norm.ppf(0.05); pct975 = x < norm.ppf(0.025); pct99 = x < norm.ppf(0.01)
# sections = [pct95, pct975, pct99]
# ax = plt.subplot(111)
# ax.plot(x, y)
# for section in sections:
#     ax.fill_between(x[section], np.zeros(sum(section)), y[section], color='k', alpha=0.25)

# ax.text(0.1, 0.9, "Left-tailed t-Test", transform=ax.transAxes)
# ax.set_ylim(0, 0.41)
# plt.show()
In [168]:
# df1 = pd.read_csv("~/research/scripts/onboarding/onboarding_notebook/category_1.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
# df2 = pd.read_csv("~/research/scripts/onboarding/onboarding_notebook/category_2.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
# df3 = pd.read_csv("~/research/scripts/onboarding/onboarding_notebook/category_3.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
In [128]:
#df3.groupby(['Gender', 'Events'])['Gender'].count()
In [ ]: