In [ ]:
 
In [ ]:
# External resources:
"""
1. Stackalytics source code: https://opendev.org/x/stackalytics
2. 



CSV file --AFTER FILTERING OUT CONFIDENTIAL INFO:
    1. commit_proportion23.csv
    
"""
In [5]:
import scipy
import scipy.stats as stats
import json
import csv
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
%matplotlib inline

import warnings
#import sksurv


from sksurv.preprocessing import OneHotEncoder
from sksurv.metrics import concordance_index_censored
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sksurv.datasets import load_veterans_lung_cancer
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.svm import FastKernelSurvivalSVM
from sksurv.kernels import clinical_kernel
In [3]:
 

Gender commit density

In [6]:
import seaborn as sns
sns.set(style="ticks", palette="dark", color_codes=True)

# Load the commit-proportion table; this cell reads its Gender, Density and
# Categories columns.
dfszz = pd.read_csv("./commit_proportion23.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
fig = plt.figure(figsize=(6,4), dpi= 120)

# One split violin per Gender value: the halves are separated by "Categories"
# and the inner lines mark the quartiles.
g=sns.violinplot(x="Gender",  y="Density", hue="Categories", data=dfszz, split=True,
                palette="Set2", scale="count", inner="quartile", scale_hue=False, bw=.2)
sns.despine(left=True)
plt.xticks(fontsize=14)
g.set_yscale('linear')

# The x axis is the Gender column, so it is labelled by gender (same wording
# as the acceptance-rate-per-gender plot) rather than by release.
plt.xlabel("OpenStack Gender Contributors", fontsize=16)
plt.ylabel("Commit density per gender", fontsize=16)

g.legend(loc=0)
plt.show()

Commit acceptance rate per Gender

In [24]:
import seaborn as sns
sns.set(style="ticks", palette="dark", color_codes=True)

# Commit-proportion table; Gender, Accept_Rate and Categories are plotted.
dfszz = pd.read_csv("./commit_proportion23.csv", sep=',', encoding='utf-8', quoting=csv.QUOTE_ALL)
fig = plt.figure(figsize=(4, 3), dpi=120)

# Split violins (one per Gender, halves by Categories) with quartile lines.
violin_style = dict(split=True, palette="Set2", scale="count",
                    inner="quartile", scale_hue=False, bw=.2)
g = sns.violinplot(data=dfszz, x="Gender", y="Accept_Rate", hue="Categories",
                   **violin_style)
sns.despine(left=True)
plt.xticks(fontsize=16)
g.set_yscale('linear')

# Axis captions.
plt.xlabel("OpenStack Gender Contributors", fontsize=16)
plt.ylabel("Commit acceptance rate", fontsize=16)

g.legend()
plt.show()
#fig.savefig("commit_accept_rate23.pdf")

Commit Acceptance Rate

In [1]:
 
In [26]:
import seaborn as sns
sns.set(style="ticks", palette="dark", color_codes=True)

# Load the commit-proportion table; this cell reads its Events, Accept_Rate
# and Categories columns.
dfszz = pd.read_csv("./commit_proportion23.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
fig = plt.figure(figsize=(6,4), dpi= 120)

# One split violin per value of "Events"; halves are separated by "Categories"
# and the inner lines mark the quartiles.
g=sns.violinplot(x="Events",  y="Accept_Rate", hue="Categories", data=dfszz, split=True,
                palette="Set2", scale="count", inner="quartile", scale_hue=False, bw=.2)
sns.despine(left=True)
g.set_yscale('linear')

# Replace the tick labels with release names ("Ocata" is the release name).
# NOTE(review): assumes the Events values are plotted in exactly this release
# order -- confirm against the CSV.
g.set_xticklabels(['Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Ocata', 'Pike'])

plt.xlabel("OpenStack Releases", fontsize=16)
plt.ylabel("Commit acceptance rate", fontsize=16)

g.legend(loc=0)
plt.show()
#fig.savefig("commit_cat23.pdf")
In [28]:
import seaborn as sns
sns.set(style="ticks", palette="dark", color_codes=True)

# Load the Cat-1/Cat-2 commit-proportion table; this cell reads its Events,
# Accept_Rate and Categories columns.
dfszz = pd.read_csv("./commit_proportion12.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
fig = plt.figure(figsize=(6,4), dpi= 120)

# Side-by-side (not split) violins per value of "Events", coloured by
# "Categories".
g=sns.violinplot(x="Events",  y="Accept_Rate", hue="Categories", data=dfszz,
                palette="Set2",  scale_hue=True, bw=.2)
sns.despine(left=True)
plt.xticks(rotation=60)
g.set_yscale('linear')
g.set(xlabel='OpenStack Releases', ylabel='Commit acceptance rate')

# Replace the tick labels with release names ("Ocata" is the release name).
# NOTE(review): assumes the Events values are plotted in exactly this release
# order -- confirm against the CSV.
g.set_xticklabels(['Cactus', 'Diablo', 'Essex', 'Folsom', 'Grizzly', 'Havana', 'Icehouse', 'Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Ocata', 'Pike'])

g.legend(loc=0)
plt.show()
In [ ]:
 

Commit Density Cat I vs. Cat II

In [24]:
import seaborn as sns
sns.set(style="ticks", palette="dark", color_codes=True)

# Load the Cat-1/Cat-2 commit-proportion table; this cell reads its Events,
# Density and Categories columns.
dfszz = pd.read_csv("./commit_proportion12.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
fig = plt.figure(figsize=(6,4), dpi= 120)

# Side-by-side (not split) violins per value of "Events", coloured by
# "Categories".
g=sns.violinplot(x="Events",  y="Density", hue="Categories", data=dfszz,
                palette="Set2",  scale_hue=True, bw=.2)
sns.despine(left=True)
plt.xticks(rotation=60)
g.set_yscale('linear')
g.set(xlabel='OpenStack Releases', ylabel='Commit density')

# Replace the tick labels with release names ("Ocata" is the release name).
# NOTE(review): assumes the Events values are plotted in exactly this release
# order -- confirm against the CSV.
g.set_xticklabels(['Cactus', 'Diablo', 'Essex', 'Folsom', 'Grizzly', 'Havana', 'Icehouse', 'Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Ocata', 'Pike'])

g.legend(loc=0)
plt.show()

Commit Density Cat II vs. Cat III

In [7]:
# Split-violin plot of the Density column from commit_proportion23.csv,
# grouped on the x axis by "Events" and split by "Categories".
import seaborn as sns
sns.set(style="ticks", palette="dark", color_codes=True)
dfszz = pd.read_csv("./commit_proportion23.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
fig = plt.figure(figsize=(6,4), dpi= 120)

#plt.figure(figsize=(6,4))    

# Draw a nested violinplot and split the violins for easier comparison
g=sns.violinplot(x="Events",  y="Density", hue="Categories", data=dfszz, split=True,
                palette="Set2", scale="count", inner="quartile", scale_hue=False, bw=.2)
sns.despine(left=True)
#plt.xticks(rotation=60)
g.set_yscale('linear')
#g.set(xlabel='OpenStack Releases', ylabel='Commit density')
# NOTE(review): only two tick labels are supplied here, while another cell
# plots the same file and the same x column with seven release names. If
# "Events" has more than two levels these labels will not line up with the
# ticks (and the "OpenStack Releases" caption below would not describe
# 'Cat-2'/'Cat-3') -- confirm which labelling is intended.
g.set_xticklabels(['Cat-2', 'Cat-3'])

plt.xlabel("OpenStack Releases", fontsize=16)
plt.ylabel("Commit density", fontsize=18)

g.legend(loc=0)
plt.show()
In [ ]:
 
In [29]:
import seaborn as sns
sns.set(style="ticks", palette="dark", color_codes=True)

# Load the commit-proportion table; this cell reads its Events, Density and
# Categories columns.
dfszz = pd.read_csv("./commit_proportion23.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
fig = plt.figure(figsize=(6,4), dpi= 120)

# One split violin per value of "Events"; halves are separated by "Categories"
# and the inner lines mark the quartiles.
g=sns.violinplot(x="Events",  y="Density", hue="Categories", data=dfszz, split=True,
                palette="Set2", scale="count", inner="quartile", scale_hue=False, bw=.2)
sns.despine(left=True)
g.set_yscale('linear')

# Replace the tick labels with release names ("Ocata" is the release name).
# NOTE(review): assumes the Events values are plotted in exactly this release
# order -- confirm against the CSV.
g.set_xticklabels(['Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Ocata', 'Pike'])

plt.xlabel("OpenStack Releases", fontsize=16)
plt.ylabel("Commit density", fontsize=18)

g.legend(loc=0)
plt.show()
#fig.savefig("commit_cat23.pdf")
In [ ]:
 

RQ1 Productivity in Software ecosystem

In [9]:
import seaborn as sns
sns.set(style="ticks", palette="dark", color_codes=True)

# Load the commit-proportion table; this cell reads its Events, Commits and
# Categories columns.
dfszz = pd.read_csv("./commit_proportion23.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
fig = plt.figure(figsize=(6,4), dpi= 120)

# One split violin per value of "Events"; halves are separated by "Categories"
# and every violin is scaled to the same width.
g=sns.violinplot(x="Events",  y="Commits", hue="Categories", data=dfszz, split=True,
                palette="Set2", scale="width", inner="quartile")
sns.despine(left=True)
g.set_yscale('linear')
g.set(xlabel='OpenStack Releases', ylabel='Commits')

# Replace the tick labels with release names ("Ocata" is the release name).
# NOTE(review): assumes the Events values are plotted in exactly this release
# order -- confirm against the CSV.
g.set_xticklabels(['Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Ocata', 'Pike'])
plt.show()
#fig.savefig("commit_cat23.pdf")
In [8]:
# import seaborn as sns
# sns.set(style="ticks", palette="pastel", color_codes=True)
# dfszz = pd.read_csv("./commit_proportion23.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
# fig = plt.figure(figsize=(6,4),dpi= 120)

# #plt.figure(figsize=(6,4))

# # Draw a nested violinplot and split the violins for easier comparison
# g=sns.violinplot(x="Events",  y="Proportion", hue="Categories", data=dfszz, split=True,
#                 palette="muted", scale="width", inner="quartile")
# sns.despine(left=True)
# plt.xticks(rotation=60)
# g.set_yscale('linear')
# g.set(xlabel='OpenStack Releases', ylabel='Proportion of commits')
# g.set_xticklabels(['Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Octava', 'Pike'])
# plt.show()
# #fig.savefig("commit_pro_bean_cat23.pdf")
In [ ]:
 
In [86]:
# Two side-by-side box plots of the "Commits" column per release:
# category_1.csv on the left, category_2.csv on the right.
sns.set(style='ticks')
fig = plt.figure(dpi= 120)

plt.subplots_adjust(left=None, bottom=None, right=0.9,
                    top=None, wspace=0.5, hspace=None)

# ---------------- Left panel: category 1 ----------------
ax = fig.add_subplot(221)
ax.set_title('Cat-1 commits proportions')
sns.set(style="ticks")
ax.set_yscale("linear")

df_nobug = pd.read_csv("./category_1.csv", sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)

# One box per value of "Events". whis=(0, 100) extends the whiskers to the
# minimum and maximum; it is the percentile-pair spelling of the former
# whis="range" option, which current Matplotlib no longer accepts.
sns.boxplot(x="Events", y="Commits", data=df_nobug,
            whis=(0, 100), palette="bright", ax=ax)

# Overlay every observation as a small grey point.
sns.swarmplot(x="Events", y="Commits", data=df_nobug,
              size=3, color=".4", linewidth=0, ax=ax)

ax.xaxis.grid(True)
plt.xticks(rotation=60)
ax.set(ylabel="Commits")
# NOTE(review): assumes the Events values are plotted in this release order.
ax.set_xticklabels(['Cactus', 'Diablo', 'Essex', 'Folsom', 'Grizzly', 'Havana', 'Icehouse'])


# ---------------- Right panel: category 2 ----------------
ax = fig.add_subplot(222)
ax.set_title('Cat-2 commits proportions')
sns.set(style="ticks")
ax.set_yscale("linear")

df_buggy = pd.read_csv("./category_2.csv", sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)

# Same box + swarm layout as the left panel.
sns.boxplot(x="Events", y="Commits", data=df_buggy,
            whis=(0, 100), palette="bright", ax=ax)
sns.swarmplot(x="Events", y="Commits", data=df_buggy,
              size=3, color=".4", linewidth=0, ax=ax)

ax.xaxis.grid(True)
ax.set(ylabel="")
plt.xticks(rotation=60)
# Called without fig/ax, so it applies to the axes of the current figure.
sns.despine(trim=True, left=True)
# NOTE(review): assumes the Events values are plotted in this release order.
ax.set_xticklabels(['Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Ocata', 'Pike'])
plt.show()
#fig.savefig("szz_cat12.pdf")

RQ3 Gender Diversity

In [23]:
from decimal import *

r = [0,1,2]
x = ["Cat-I", "Cat-II", "Cat-III"]

# Head-counts per category, one array per gender group.
y1 = np.array([46, 43, 27])   # Male
y2 = np.array([11, 12, 20])   # Neutral
y3 = np.array([ 4,  6, 14])   # Female

sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})

# Per-category totals, then each group as a percentage of its category.
snum = y1 + y2 + y3
y1, y2, y3 = [(counts / snum) * 100. for counts in (y1, y2, y3)]

plt.figure(figsize=(3,4), dpi= 120)

# Stacked segments, bottom to top: (heights, baseline, colour, legend label).
segments = [
    (y1, 0.0,     '#87ceeb', "Male"),
    (y2, y1,      '#e2e7e4', "Neutral"),
    (y3, y1 + y2, '#ffadda', "Female"),
]
for heights, baseline, colour, name in segments:
    plt.bar(x, heights, bottom=baseline, color=colour, label=name, width = 0.4)

# Write the rounded percentage at the vertical centre of every segment.
for heights, baseline, _colour, _name in segments:
    for xpos, ypos, yval in zip(x, baseline + heights/2, heights):
        plt.text(xpos, ypos, "%d%%"%round(yval), ha="center", va="center", fontsize=12)

plt.ylim(0,105)
plt.yticks( fontsize=14)
plt.xlabel("Categories I, II, & III", fontsize=16)
plt.ylabel("% Gender representation", fontsize=16)

plt.legend(bbox_to_anchor=(0.5,0.5), loc='center')
plt.savefig('gender_cats.pdf', bbox_inches='tight', pad_inches=0.02)
In [14]:
 
In [15]:
 
In [19]:
from decimal import *

r = [0,1,2,3,4,5,6]
x = ["Cactus","Diablo","Essex","Folsom","Grizzly","Havana","Icehouse"] ## Direct

## Category I: head-counts per release, one array per gender group.
y1 = np.array([51, 51, 52, 48, 48, 48, 46]) ## Male    #                              ==> 45 == 74%
y2 = np.array([ 6,  7,  6,  9,  8, 10, 10]) ## Neutral # [12,  6,  7, 11, 12, 15, 17] ==> 11 == 18%
y3 = np.array([4,   3,  3,  4,  5,  3,  5]) ## Female  # [4,   3,  3,  5,  4,  5,  6] ==>  5 == 18%

sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})

# Per-release totals, then each group as a percentage of its release.
snum = y1 + y2 + y3
y1, y2, y3 = [(counts / snum) * 100. for counts in (y1, y2, y3)]

plt.figure(figsize=(6,4), dpi= 120)

# Stacked segments, bottom to top: (heights, baseline, colour, legend label).
segments = [
    (y1, 0.0,     '#87ceeb', "Male"),
    (y2, y1,      '#e2e7e4', "Neutral"),
    (y3, y1 + y2, '#ffadda', "Female"),
]
for heights, baseline, colour, name in segments:
    plt.bar(x, heights, bottom=baseline, color=colour, label=name, width = 0.5)

# Write the rounded percentage at the vertical centre of every segment.
for heights, baseline, _colour, _name in segments:
    for xpos, ypos, yval in zip(x, baseline + heights/2, heights):
        plt.text(xpos, ypos, "%d%%"%round(yval), ha="center", va="center", fontsize=12)

plt.ylim(0,105)
plt.yticks( fontsize=14)
plt.xlabel("OpenStack Releases", fontsize=16)
plt.ylabel("% Gender in Category I", fontsize=16)

# No legend is drawn on this figure.
plt.savefig('gender_cat1.pdf', bbox_inches='tight', pad_inches=0.02)
In [11]:
#!open .
In [20]:
from decimal import *

r = [0,1,2,3,4,5,6]
x = ["Juno","Kilo","Liberty","Mitaka","Newton","Ocata","Pike"] ## OUI

## Category II: head-counts per release, one array per gender group.
y1 = np.array([48, 47, 47, 48, 45, 44, 42]) ## Male     #[35, 34, 31, 31, 32, 33, 32] ==> 42 == 69%
y2 = np.array([ 9,  9, 10,  9, 10, 11, 12]) ## Neutral  #[21, 22, 24, 24, 23, 21, 21] ==> 12 == 20%
y3 = np.array([ 4,  5,  4,  4,  6,  6,  7]) ## Female   #[ 5,  5,  6,  6,  6,  7,  8] ==>  7 == 11%

sns.set_style("ticks", {"xtick.major.size": 14, "ytick.major.size": 14})

# Per-release totals, then each group as a percentage of its release.
snum = y1 + y2 + y3
y1, y2, y3 = [(counts / snum) * 100. for counts in (y1, y2, y3)]

plt.figure(figsize=(6,4), dpi= 120)

# Stacked segments, bottom to top: (heights, baseline, colour, legend label).
segments = [
    (y1, 0.0,     '#87ceeb', "Male"),
    (y2, y1,      '#e2e7e4', "Neutral"),
    (y3, y1 + y2, '#ffadda', "Female"),
]
for heights, baseline, colour, name in segments:
    plt.bar(x, heights, bottom=baseline, color=colour, label=name, width = 0.5)

# Write the rounded percentage at the vertical centre of every segment.
for heights, baseline, _colour, _name in segments:
    for xpos, ypos, yval in zip(x, baseline + heights/2, heights):
        plt.text(xpos, ypos, "%d%%"%round(yval), ha="center", va="center", fontsize=12)

plt.ylim(0,105)
plt.yticks( fontsize=14)
plt.xlabel("OpenStack Releases", fontsize=16)
plt.ylabel("% Gender in Category II", fontsize=16)

# No legend is drawn on this figure.
plt.savefig('gender_cat2.pdf', bbox_inches='tight', pad_inches=0.02)
In [13]:
 
In [21]:
from decimal import *

x = ["Juno","Kilo","Liberty","Mitaka","Newton","Ocata","Pike"] ## OUI

## Category III: head-counts per release, one array per gender group.
y1 = np.array([34, 32, 30, 23, 25, 27, 24]) ## Male    # ==> 27 == 44%
y2 = np.array([17, 16, 18, 23, 22, 20, 22]) ## Neutral # ==> 20 == 33%
y3 = np.array([10, 13, 13, 15, 14, 14, 15]) ## Female  # ==> 14 == 23%

sns.set_style("ticks", {"xtick.major.size": 12, "ytick.major.size": 12})

# Per-release totals, then each group as a percentage of its release.
snum = y1 + y2 + y3
y1, y2, y3 = [(counts / snum) * 100. for counts in (y1, y2, y3)]

plt.figure(figsize=(6,4), dpi= 120)

# Segment specs: (heights, baseline, colour, legend label).
male    = (y1, 0.0,     '#87ceeb', "Male")
neutral = (y2, y1,      '#e2e7e4', "Neutral")
female  = (y3, y1 + y2, '#ffadda', "Female")

# Bars are added top segment first, so the legend lists Female, Neutral, Male.
for heights, baseline, colour, name in (female, neutral, male):
    plt.bar(x, heights, bottom=baseline, color=colour, label=name, width = 0.5)

# Write the rounded percentage at the vertical centre of every segment.
for heights, baseline, _colour, _name in (male, female, neutral):
    for xpos, ypos, yval in zip(x, baseline + heights/2, heights):
        plt.text(xpos, ypos, "%d%%"%round(yval), ha="center", va="center", fontsize=12)

plt.ylim(0,105)
plt.yticks(fontsize=13)
plt.xlabel("OpenStack Releases", fontsize=16)
plt.ylabel("% Gender in Category III", fontsize=16)

plt.legend(bbox_to_anchor=(0.5,0.5), loc='upper center')

plt.savefig('gender_cat3.pdf', bbox_inches='tight', pad_inches=0.02)
In [10]:
import seaborn as sn; sn.set()
import matplotlib.pyplot as plt
sn.set(style='ticks')

# Category-1 table; this cell plots its "Buggy" column per value of "Events".
df = pd.read_csv("./category_1.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)

# One box per value of "Events". whis=(0, 100) extends the whiskers to the
# minimum and maximum; it is the percentile-pair spelling of the former
# whis="range" option, which current Matplotlib no longer accepts.
# (`sns` is the alias bound by the notebook's import cell; this cell's own
# import binds the same module as `sn`.)
sns.boxplot(x="Events", y="Buggy", data=df,
            whis=(0, 100), palette="vlag", width=0.3)

# Tweak the visual presentation
plt.xticks(rotation=60)
sns.despine(trim=True, left=True)

plt.show()

RQ2 Quality

In [72]:
# Read the chi-square input table and order its rows by bug status, then by
# onboarding flag (both ascending).
df = pd.read_csv("./szz_chi_sqaure.csv", sep=',', encoding='utf-8', quoting=csv.QUOTE_ALL)
df = df.sort_values(by=["bug_status", "onboarded"], axis=0, ascending=True)
In [73]:
# Chi-squared test of independence between `onboarded` and `bug_status`.
from scipy.stats import chi2_contingency
from scipy.stats import chi2

# Observed contingency table: rows are `onboarded`, columns are `bug_status`.
tab = pd.crosstab([df.onboarded], [df.bug_status])
table = tab

stat, p, dof, expected = chi2_contingency(table)
print('dof= {}\n'.format(dof))
print("\nexpected =\n {}".format(np.round(expected, 3)))

print("\n----------- interpret test-statistic ----------------\n")
# Start from the significance level and derive the confidence level from it:
# computing alpha as 1.0 - 0.99 leaves floating-point residue
# (0.010000000000000009) in the printed value.
alpha = 0.01
prob = 1 - alpha
critical = chi2.ppf(prob, dof)
print('probability = %.3f,\ncritical = %.3f,\nstat = %.3f,' % (prob, critical, stat))
if abs(stat) >= critical:
    print('Dependent (reject H0) ')
else:
    print('Independent (fail to reject H0)')

print("\n++++++++++++++ interpret p-value ++++++++++++++\n")
print("significance = {}, p = {}".format(alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
dof= 1


expected =
 [[1038.642 1584.358]
 [ 417.358  636.642]]

----------- interpret test-statistic ----------------

probability = 0.990,
critical = 6.635,
stat = 0.131,
Independent (fail to reject H0)

++++++++++++++ interpret p-value ++++++++++++++

significance = 0.010000000000000009, p = 0.7171764798662714
Independent (fail to reject H0)
In [ ]:
 
In [75]:
# Two side-by-side box plots of the "SZZ" column per release:
# category_1.csv on the left, category_2.csv on the right.
sns.set(style='ticks')
fig = plt.figure(dpi= 120)

plt.subplots_adjust(left=None, bottom=None, right=0.9, top=None, wspace=0.5, hspace=None)

# ---------------- Left panel: category 1 ----------------
ax = fig.add_subplot(221)
ax.set_title('Cat-1 Bug Inducing commits')
sns.set(style="ticks")
ax.set_yscale("linear")

df_nobug = pd.read_csv("./category_1.csv", sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)

# One box per value of "Events". whis=(0, 100) extends the whiskers to the
# minimum and maximum; it is the percentile-pair spelling of the former
# whis="range" option, which current Matplotlib no longer accepts.
sns.boxplot(x="Events", y="SZZ", data=df_nobug,
            whis=(0, 100), palette="bright", ax=ax)

# Overlay every observation as a small grey point.
sns.swarmplot(x="Events", y="SZZ", data=df_nobug,
              size=3, color=".4", linewidth=0, ax=ax)

ax.xaxis.grid(True)
plt.xticks(rotation=60)
ax.set(ylabel="Bug-Inducing commits")
# NOTE(review): assumes the Events values are plotted in this release order.
ax.set_xticklabels(['Cactus', 'Diablo', 'Essex', 'Folsom', 'Grizzly', 'Havana', 'Icehouse'])


# ---------------- Right panel: category 2 ----------------
ax = fig.add_subplot(222)
ax.set_title('Cat-2 Bug Inducing commits')
sns.set(style="ticks")
ax.set_yscale("linear")

df_buggy = pd.read_csv("./category_2.csv", sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)

# Same box + swarm layout as the left panel.
sns.boxplot(x="Events", y="SZZ", data=df_buggy,
            whis=(0, 100), palette="bright", ax=ax)
sns.swarmplot(x="Events", y="SZZ", data=df_buggy,
              size=3, color=".4", linewidth=0, ax=ax)

ax.xaxis.grid(True)
ax.set(ylabel="")
plt.xticks(rotation=60)
# Called without fig/ax, so it applies to the axes of the current figure.
sns.despine(trim=True, left=True)
# NOTE(review): assumes the Events values are plotted in this release order.
ax.set_xticklabels(['Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Ocata', 'Pike'])
plt.show()
#fig.savefig("szz_cat12.pdf")
In [166]:
# fig = plt.figure(dpi= 120)

# plt.subplots_adjust(left=None, bottom=None, right=0.9, top=None, wspace=0.5, hspace=None)

# ax = fig.add_subplot(221)
# ax.set_title('Cat-2 Bug Inducing commits')
# sns.set(style="ticks")

# # Initialize the figure with a logarithmic x axis
# #f, ax = plt.subplots(figsize=(5, 3))
# ax.set_yscale("linear")

# # Load the example planets dataset
# df_nobug = pd.read_csv("./category_2.csv", sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)


# # Plot the orbital period with horizontal boxes
# sns.boxplot(x="Events", y="SZZ", data=df_nobug,
#             whis="range", palette="bright")

# # Add in points to show each observation
# #sns.swarmplot(x="Events", y="SZZ", data=df_nobug,
# #              size=3, color=".4", linewidth=0)

# # Tweak the visual presentation
# ax.xaxis.grid(True)
# ax.set(ylabel="Bug-Inducing commits")
# ax.set(xlabel="OpenStack Releases")
# plt.xticks(rotation=60)
# ax.set_xticklabels(['Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Octava', 'Pike'])
# #sns.despine(trim=True, left=True)


# ######### SET sub plot 2

# ax = fig.add_subplot(222)
# ax.set_title('Cat-3 Bug Inducing commits')
# sns.set(style="ticks")

# # Initialize the figure with a logarithmic x axis
# #fig, axx = plt.subplots(figsize=(5, 3))
# ax.set_yscale("linear")

# # Load the example planets dataset
# df_buggy = pd.read_csv("./category_3.csv", sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)

# # Plot the orbital period with horizontal boxes
# sns.boxplot(x="Events", y="SZZ", data=df_buggy, 
#             whis="range", palette="bright")

# # Add in points to show each observation
# #sns.swarmplot(x="Events", y="SZZ", data=df_buggy, 
# #              size=3, color=".4", linewidth=0)

# #plt.ylim(0, 15500)

# # Tweak the visual presentation
# ax.xaxis.grid(True)
# ax.set(ylabel="")
# ax.set(xlabel="OpenStack Releases")
# plt.xticks(rotation=60)
# sns.despine(trim=True, left=True)
# ax.set_xticklabels(['Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Octava', 'Pike'])

# #sns.despine(offset=10, trim=True)
# plt.show()
# #fig.savefig("szz_boxplot_cat23.pdf")
In [193]:
#df = pd.read_csv("./category_2.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
#df = pd.read_csv("./category_3.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)

# releases = df_buggy["Events"].unique()
# bugInducing = df_buggy["SZZ"]

# bugInducing
In [ ]:
 
In [14]:
# import seaborn as sns
# sns.set(style="whitegrid", palette="pastel", color_codes=True)

# # Load the example tips dataset
# tips = sns.load_dataset("tips")

# # Draw a nested violinplot and split the violins for easier comparison
# sns.violinplot(x="day", y="total_bill", hue="sex", data=tips, split=True,
#                inner="quart", palette={"Male": "b", "Female": "y"})
# sns.despine(left=True)
# plt.show()
In [26]:
 
In [27]:
 
In [11]:
 
In [178]:
 
In [12]:
import seaborn as sns
sns.set(style="ticks", palette="pastel", color_codes=True)

# Load the SZZ table; this cell reads its Events, SZZ and Categories columns.
dfszz = pd.read_csv("./SZZ.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
fig = plt.figure(figsize=(6,4), dpi= 120)

# One split violin per value of "Events"; halves are separated by "Categories"
# and every violin is scaled to the same width.
g=sns.violinplot(x="Events",  y="SZZ", hue="Categories", data=dfszz, split=True,
                palette="Set2", scale="width", inner="quartile")
sns.despine(left=True)
g.set_yscale('linear')

# Replace the tick labels with release names ("Ocata" is the release name).
# NOTE(review): assumes the Events values are plotted in exactly this release
# order -- confirm against the CSV.
g.set_xticklabels(['Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Ocata', 'Pike'])

# Axis captions (set once, with the final font sizes).
plt.xlabel("OpenStack Releases", fontsize=16)
plt.ylabel("Bug Inducing commits", fontsize=18)


plt.show()
#fig.savefig("szz_bean23.pdf")
In [179]:
# Split-violin plot of the "Effort" column from Efforts.csv, grouped on the
# x axis by "Events" and split by "Category".
import seaborn as sns
sns.set(style="ticks", palette="pastel", color_codes=True)
dfszz = pd.read_csv("./Efforts.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
fig = plt.figure(figsize=(6,4), dpi= 120)

#plt.figure(figsize=(6,4))

# Draw a nested violinplot and split the violins for easier comparison
g=sns.violinplot(x="Events",  y="Effort", hue="Category", data=dfszz, split=True,
                palette="Set2", scale="width", inner="quartile")
sns.despine(left=True)
#plt.xticks(rotation=60)
g.set_yscale('linear')
# NOTE(review): the plotted y column is "Effort" and the tick labels below are
# gender groups, yet the axis captions say "OpenStack Releases" and
# "Bug Inducing commits" -- they look copied from the SZZ plot; confirm the
# intended captions and that "Events" really holds three gender levels here.
g.set(xlabel='OpenStack Releases', ylabel='Bug Inducing commits')
g.set_xticklabels(['Males', 'Females', 'Neutrals'])

# These calls overwrite the captions set via g.set(...) above (same text,
# larger font).
plt.xlabel("OpenStack Releases", fontsize=16)
plt.ylabel("Bug Inducing commits", fontsize=18)


plt.show()
#fig.savefig("szz_bean23.pdf")
In [ ]:
 
In [ ]:
# Mann-Whitney U test comparing the "SZZ" values of Cat-2 and Cat-3.
from scipy.stats import mannwhitneyu

# NOTE(review): `cat2` and `cat3` are not assigned anywhere in the visible
# notebook, so this cell fails on a fresh kernel. Presumably they are the
# category_2.csv / category_3.csv tables loaded as DataFrames -- confirm and
# load them explicitly before this cell.
data1 = cat2["SZZ"]
data2 = cat3["SZZ"]

# State the alternative explicitly: SciPy's default for `alternative` differs
# between releases, and the interpretation below ("same" vs. "different"
# distribution) is a two-sided question.
stat, p = mannwhitneyu(data1, data2, alternative='two-sided')
print('Statistics={}, p-value= {}'.format(stat, p))

# Interpret at the 5% significance level.
alpha = 0.05
if p > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0)')
    
In [67]:
from __future__ import division

"""
https://github.com/neilernst/cliffsDelta
"""
def cliffsDelta(lst1, lst2, **dull):
    """Compute Cliff's delta between two samples and classify its magnitude.

    Delta is ``(#pairs with x > y - #pairs with x < y) / (m * n)`` over all
    pairs ``x`` from ``lst1`` and ``y`` from ``lst2``; ties count for neither.

    :param lst1: first sample (any sized, sortable collection of values).
    :param lst2: second sample (same requirements as ``lst1``).
    :param dull: optional ``small``, ``medium`` and ``large`` thresholds given
        as keyword arguments; when none are given the values from
        Hess and Kromrey (2004) are used.
    :returns: tuple ``(d, size)`` where ``d`` is the delta in ``[-1, 1]`` and
        ``size`` is the label produced by ``lookup_size``.
    :raises ValueError: if either sample is empty (delta is undefined).
    """
    if not dull:
        dull = {'small': 0.147, 'medium': 0.33, 'large': 0.474} # effect sizes from (Hess and Kromrey, 2004)
    m, n = len(lst1), len(lst2)
    # Guard before any work: with an empty sample the m*n denominator is zero.
    if m == 0 or n == 0:
        raise ValueError("cliffsDelta requires two non-empty samples")
    lst2 = sorted(lst2)
    # j walks lst2 once; `more` counts pairs with y < x, `less` pairs with y > x.
    j = more = less = 0
    for repeats, x in runs(sorted(lst1)):
        # Advance past every y strictly smaller than x.
        while j <= (n - 1) and lst2[j] < x:
            j += 1
        more += j*repeats
        # Skip the ties; everything after them is strictly greater than x.
        while j <= (n - 1) and lst2[j] == x:
            j += 1
        less += (n - j)*repeats
    d = (more - less) / (m*n)
    size = lookup_size(d, dull)
    return d, size


def lookup_size(delta: float, dull: dict) -> str:
    """Map an effect size onto a qualitative label.

    :type delta: float, the effect size; only its absolute value is used.
    :type dull: dict, with 'small', 'medium' and 'large' threshold entries.
    :returns: 'negligible', 'small', 'medium' or 'large'.
    """
    magnitude = abs(delta)
    if magnitude < dull['small']:
        return 'negligible'
    # Half-open bands [lower, upper) between consecutive thresholds.
    for label, upper in (('small', 'medium'), ('medium', 'large')):
        if dull[label] <= magnitude < dull[upper]:
            return label
    if magnitude >= dull['large']:
        return 'large'
    return None


def runs(lst):
    """Iterator, chunks repeated values.

    Yields one ``(count, value)`` pair for each run of equal consecutive
    items in ``lst``, in order. An empty input yields nothing.
    """
    count = 0
    for item in lst:
        # A change of value closes the run collected so far.
        if count and item != current:
            yield count, current
            count = 0
        current = item
        count += 1
    # Emit the final run; skipped entirely when the input was empty.
    if count:
        yield count, current
    

You can report the results of Mann-Whitney's U test as follows:

The medians of Group A and Group B were 2.5 and 3.5, respectively. We ran a Mann-Whitney's U test to evaluate the difference in the responses of our 5-Likert scale question. We found a significant effect of Group (The mean ranks of Group A and Group B were 7.8 and 13.2, respectively; U = 23, Z = -2.11, p < 0.05, r = 0.47).

In [ ]:
# Wilcoxon signed-rank test on the "Buggy" values of Cat-2 vs. Cat-3, followed
# by each sample's median absolute deviation and the Cliff's delta effect size.
from scipy.stats import wilcoxon
from scipy import stats
import numpy as np

# NOTE(review): `cat2` and `cat3` are not assigned anywhere in the visible
# notebook, so this cell fails on a fresh kernel. Presumably they are the
# category_2.csv / category_3.csv tables loaded as DataFrames -- confirm and
# load them explicitly before this cell.
data1 = cat2["Buggy"]
data2 = cat3["Buggy"]

# The signed-rank test works on paired observations, so both columns must
# have the same length and be aligned row by row.
stat, p = wilcoxon(data1, data2)
print('Statistics={}, p-value= {}'.format(stat, p))

# stats.median_absolute_deviation is no longer available in current SciPy.
# Its replacement, median_abs_deviation, is unscaled by default;
# scale="normal" applies the normal-consistency scaling that the old function
# used by default. Fall back to the old name on SciPy versions that predate
# the replacement.
if hasattr(stats, "median_abs_deviation"):
    med1 = stats.median_abs_deviation(data1, axis=None, scale="normal")
    med2 = stats.median_abs_deviation(data2, axis=None, scale="normal")
else:
    med1 = stats.median_absolute_deviation(data1, axis=None)
    med2 = stats.median_absolute_deviation(data2, axis=None)
print("The medians absolute deviation of Cat-2 = %.3f, and Cat-3 = %.3f" %(med1, med2))

# cliffsDelta returns (delta, size label), which fills both placeholders.
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(data1, data2))

# Interpret the signed-rank p-value at the 5% significance level.
alpha = 0.05
if p > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0)')
In [ ]:
 
In [70]:
# # Kruskal-Wallis H-test
# from numpy.random import seed
# from numpy.random import randn
# from scipy.stats import kruskal
# # seed the random number generator
# seed(1)
# # generate three independent samples
# data1 = 5 * randn(100) + 50
# data2 = 5 * randn(100) + 50
# data3 = 5 * randn(100) + 52
# # compare samples
# stat, p = kruskal(data1, data2, data3)
# print('Statistics=%.3f, p=%.3f' % (stat, p))
# # interpret
# alpha = 0.05
# if p > alpha:
# 	print('Same distributions (fail to reject H0)')
# else:
# 	print('Different distributions (reject H0)')
In [71]:
# # Friedman test
# from numpy.random import seed
# from numpy.random import randn
# from scipy.stats import friedmanchisquare
# # seed the random number generator
# seed(1)
# # generate three independent samples
# data1 = 5 * randn(100) + 50
# data2 = 5 * randn(100) + 50
# data3 = 5 * randn(100) + 52
# # compare samples
# stat, p = friedmanchisquare(data1, data2, data3)
# print('Statistics=%.3f, p=%.3f' % (stat, p))
# # interpret
# alpha = 0.05
# if p > alpha:
# 	print('Same distributions (fail to reject H0)')
# else:
# 	print('Different distributions (reject H0)')
In [73]:
# ## Prepare Data
# df_raw = pd.read_csv("./category_2.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
# df = df_raw[['Events', 'SZZ', 'Gender']].groupby('Events').apply(lambda x: x.mean())
# #df.sort_values('SZZ', inplace=True)
# df
In [74]:
# Prepare Data
#dfcat2 = pd.read_csv("./category_2.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
In [15]:
# import seaborn as sns; sns.set()
# import matplotlib.pyplot as plt
# sn.set(style='ticks')
# df_buggy = pd.read_csv("./category_2.csv", sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)

# ax = sns.scatterplot(x="Events", y="Tech_Diversity",
#                       hue="Gender", style="Gender", data=df_buggy)
In [90]:
# Split violin plot of "Commits" against "Core" from Technical_diversity.csv,
# with the two halves of each violin separated by the "Category" column.
df_diversity = pd.read_csv("./Technical_diversity.csv", sep=',', encoding='utf-8', quoting=csv.QUOTE_ALL)

fig, ax = plt.subplots(figsize=(6, 4), dpi=120)
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# The former `with sns.axes_style(style=None)` wrapper was a no-op and is dropped.
sns.violinplot(x="Core", y="Commits", hue="Category", data=df_diversity,
               scale="area", split=True, inner="quartile",
               palette="Set2", ax=ax)

ax.set_xlabel("Number of core Project teams")

# with sns.axes_style(style=None):
#     ax = sns.kdeplot(df_diversity.Core, df_diversity.Commits);



# #fig.set_title('Contributed commits/project')
# sns.set(style="ticks")

# #fig.set_yscale("linear")
# with sns.axes_style(style=None):
#     sns.violinplot("Tech_Diversity", "Commits", hue="Category", data=df_diversity,
#                    split=True, inner="quartile",
#                    palette="Set2");#["lightblue", "lightpink"] bright

# plt.xlabel("Number of Project teams")    
#plt.legend().remove()
#plt.legend(bbox_to_anchor=(0.5,0.5), loc='lower left')
Out[90]:
Text(0.5, 0, 'Number of core Project teams')
In [84]:
# Split violin plot of "Gender" against "Tech_Diversity", halves separated by "Category".
# NOTE(review): reuses the `df_diversity` frame loaded by an earlier cell; the reload
# below is intentionally left disabled, as in the original.
fig, ax = plt.subplots(figsize=(6, 4), dpi=120)
#df_diversity = pd.read_csv("./Technical_diversity.csv", sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)

# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.violinplot(x="Tech_Diversity", y="Gender", hue="Category", data=df_diversity,
               scale="area", split=True, inner="quartile",
               palette="Set2", ax=ax)

ax.set_xlabel("Number of Project teams")
ax.legend(bbox_to_anchor=(1.01, 0.5), loc='upper center')
Out[84]:
<matplotlib.legend.Legend at 0x1240f9390>
In [82]:
# with sns.axes_style(style='ticks'):
#     g = sns.catplot("Tech_Diversity", "Commits", "Gender",
#                        data=df_diversity, kind="box")
#     g.set_axis_labels("Number of Projects", "Commits to projects");
In [91]:
# sns.jointplot("Tech_Diversity", "Commits", data=df_diversity, kind='reg');

!open ./Technical_diversity.csv
In [163]:
#####. TECHNICAL DIVERSITY
# Two side-by-side split violin plots built from Technical_diversity.csv:
#   left  - "Commits" per value of "Core",
#   right - "Core" per value of "Gender".
# Both are split by the "Category" column.
df_diversity = pd.read_csv("./Technical_diversity.csv", sep=',', encoding='utf-8', quoting=csv.QUOTE_ALL)

# Apply the theme before any axes exist so that both panels pick it up.
sns.set(style="ticks")

fig = plt.figure(figsize=(6, 4), dpi=120)
fig.subplots_adjust(right=0.9, wspace=0.5)

######### Sub plot 1: commits per number of core projects
ax_commits = fig.add_subplot(221)
# x / y are passed by keyword (seaborn >= 0.12 rejects positional x / y) and the
# target axes is passed explicitly instead of relying on the "current axes".
# No `legend=False` here: this panel is the one that carries the legend.
sns.violinplot(x="Core", y="Commits", hue="Category", data=df_diversity,
               split=True, inner="quartile", palette="Set2", ax=ax_commits)
ax_commits.set_xlabel("Number of Core Project")
ax_commits.legend(bbox_to_anchor=(0.5, 0.7), loc='lower left')

######### Sub plot 2: core projects per gender
ax_gender = fig.add_subplot(222)
ax_gender.set_title('Gender across projects')
sns.violinplot(x="Gender", y="Core", hue="Category", data=df_diversity,
               split=True, inner="quartile", palette="Set2", ax=ax_gender)
ax_gender.set_ylabel("#Core Projects")
ax_gender.set_xlabel("Gender")

# The legend of the first panel already explains the hue; drop the duplicate.
gender_legend = ax_gender.get_legend()
if gender_legend is not None:
    gender_legend.remove()

plt.show()

#fig.savefig("tech-diversity-cat23.pdf")
In [158]:
# import numpy as np
# import matplotlib.pyplot as plt

# # Create some mock data
# t = df_diversity["Tech_Diversity"]
# data1 = df_diversity["Commits"] 
# data2 = df_diversity["Core"]

# fig, ax1 = plt.subplots()

# color = 'tab:red'
# ax1.set_xlabel('time (s)')
# ax1.set_ylabel('exp', color=color)
# ax1.plot(t, data1, color=color)
# ax1.tick_params(axis='y', labelcolor=color)

# ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

# color = 'tab:blue'
# ax2.set_ylabel('sin', color=color)  # we already handled the x-label with ax1
# ax2.plot(t, data2, color=color)
# ax2.tick_params(axis='y', labelcolor=color)

# fig.tight_layout()  # otherwise the right y-label is slightly clipped
# plt.show()

RQ1 Effort

In [32]:
# Split violin plot of "Effort" per "Events" value from Efforts.csv,
# halves separated by "Category".
sns.set(style="ticks", palette="bright", color_codes=True)
df_effort = pd.read_csv("./Efforts.csv", sep=',', encoding='utf-8', quoting=csv.QUOTE_ALL)

fig, ax = plt.subplots(figsize=(6, 4), dpi=120)

# Draw a nested violinplot and split the violins for easier comparison
g = sns.violinplot(x="Events", y="Effort", hue="Category", data=df_effort, split=True,
                   palette="Set2", scale="width", inner="quartile", ax=ax)
sns.despine(left=True)

# NOTE(review): the labels are applied positionally, so this assumes the "Events"
# values are drawn in release order Juno..Pike - confirm against the CSV.
# 'Ocata' is the OpenStack release between Newton and Pike (was misspelled 'Octava').
g.set_xticklabels(['Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Ocata', 'Pike'])

# Axis labels are set once, with their final font sizes.
g.set_xlabel("OpenStack Releases", fontsize=16)
g.set_ylabel("Effort made by contributors", fontsize=17)

plt.show()
#fig.savefig("effort-bean-cat23.pdf")
In [34]:
!open ./Efforts.csv

Effort -- Category 1-2

In [13]:
# Grouped box plot of "Effort" per "Events" value from Effort12.csv,
# one box per "Category" within each group.
sns.set(style="ticks", palette="pastel", color_codes=True)
df_effort = pd.read_csv("./Effort12.csv", sep=',', encoding='utf-8', quoting=csv.QUOTE_ALL)

fig, ax = plt.subplots(figsize=(6, 4), dpi=120)

# Draw a grouped boxplot (the previous comment wrongly called it a violinplot)
g = sns.boxplot(x="Events", y="Effort", hue="Category", data=df_effort,
                palette="Set2", ax=ax)  # muted
sns.despine(left=True)
g.set(xlabel='OpenStack Releases', ylabel='Effort made by contributors 1-2')

# NOTE(review): the labels are applied positionally, so this assumes the "Events"
# values are drawn in release order Cactus..Pike - confirm against the CSV.
# 'Ocata' is the OpenStack release between Newton and Pike (was misspelled 'Octava').
g.set_xticklabels(["Cactus", "Diablo", "Essex", "Folsom", "Grizzly", "Havana", "Icehouse",
                   'Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Ocata', 'Pike'],
                  rotation=60)
plt.show()
#fig.savefig("effort-bean-cat12.pdf")
In [87]:
# Load the category 1 and category 2 tables with identical CSV options.
dfcat1, dfcat2 = (
    pd.read_csv(csv_path, sep=',', encoding='utf-8', quoting=csv.QUOTE_ALL)
    for csv_path in ("./category_1.csv", "./category_2.csv")
)
In [ ]:
 
In [88]:
from __future__ import division

"""
https://github.com/neilernst/cliffsDelta
"""
def cliffsDelta(lst1, lst2, **dull):
    """Compute Cliff's delta effect size between two samples.

    The statistic is ``(#(x > y) - #(x < y)) / (m * n)`` taken over every pair
    ``(x, y)`` with ``x`` from ``lst1`` and ``y`` from ``lst2``; it lies in
    ``[-1, 1]``.  Both samples are sorted once, and runs of equal values in
    ``lst1`` are handled together, so no explicit pairwise loop is needed.

    :param lst1: first sample (sized iterable of mutually comparable values).
    :param lst2: second sample (sized iterable of mutually comparable values).
    :param dull: optional keyword thresholds ``small``, ``medium`` and
        ``large`` used to label the magnitude.  Any threshold that is not
        supplied falls back to the Hess and Kromrey (2004) value.
    :returns: tuple ``(d, size)`` where ``d`` is the delta and ``size`` is the
        label produced by ``lookup_size``.
    :raises ValueError: if either sample is empty (delta is undefined).
    """
    # effect sizes from (Hess and Kromrey, 2004); caller-supplied values win.
    thresholds = {'small': 0.147, 'medium': 0.33, 'large': 0.474}
    thresholds.update(dull)

    m, n = len(lst1), len(lst2)
    if m == 0 or n == 0:
        # Previously this surfaced as an UnboundLocalError / ZeroDivisionError.
        raise ValueError("cliffsDelta requires two non-empty samples")

    lst2 = sorted(lst2)
    j = more = less = 0
    for repeats, x in runs(sorted(lst1)):
        # Advance past every y < x: each of them is "dominated" by x.
        while j <= (n - 1) and lst2[j] < x:
            j += 1
        more += j*repeats
        # Skip ties (y == x); whatever remains in lst2 is strictly greater than x.
        while j <= (n - 1) and lst2[j] == x:
            j += 1
        less += (n - j)*repeats
    d = (more - less) / (m*n)
    size = lookup_size(d, thresholds)
    return d, size


def lookup_size(delta: float, dull: dict) -> str:
    """Translate a delta value into a qualitative effect-size label.

    The sign of ``delta`` is ignored.  ``dull`` maps ``'small'``, ``'medium'``
    and ``'large'`` to the lower bound of the respective band; magnitudes
    below the ``'small'`` bound are labelled ``'negligible'``.

    :type delta: float
    :type dull: dict, a dictionary of small, medium, large thresholds.
    """
    magnitude = abs(delta)
    label = None
    if magnitude < dull['small']:
        label = 'negligible'
    elif dull['small'] <= magnitude < dull['medium']:
        label = 'small'
    elif dull['medium'] <= magnitude < dull['large']:
        label = 'medium'
    elif magnitude >= dull['large']:
        label = 'large'
    return label


def runs(lst):
    """Yield ``(length, value)`` for each run of consecutive equal values.

    ``lst`` may be any iterable.  The yielded ``value`` is the last element
    of its run.  An empty input yields nothing (it used to raise
    ``UnboundLocalError`` when the generator was consumed).
    """
    iterator = iter(lst)
    try:
        previous = next(iterator)
    except StopIteration:
        return  # nothing to chunk
    count = 1
    for item in iterator:
        if previous != item:
            # The run that just ended is complete: emit it and start afresh.
            yield count, previous
            count = 0
        count += 1
        previous = item
    # The final run is never closed by a differing element; emit it here.
    yield count, previous
    
In [89]:
# Wilcoxon signed-rank test on the "Efforts" column of the category 1 and
# category 2 tables, followed by each sample's median absolute deviation and
# the Cliff's delta effect size.
from scipy.stats import wilcoxon
from scipy import stats
import numpy as np

# NOTE(review): the signed-rank test treats the two columns as paired,
# equal-length samples - confirm that rows of category_1.csv and category_2.csv
# really correspond; for independent samples Mann-Whitney U would be the usual choice.
data1 = dfcat1["Efforts"]
data2 = dfcat2["Efforts"]

# compare samples
stat, p = wilcoxon(data1, data2)
print('Statistics={}, p-value= {}'.format(stat, p))

# `stats.median_absolute_deviation` was deprecated in SciPy 1.5 and removed in 1.9.
# `median_abs_deviation(..., scale='normal')` keeps the normal-consistent scaling
# (~1.4826) that the old function applied by default.
med1 = stats.median_abs_deviation(data1, axis=None, scale='normal')
med2 = stats.median_abs_deviation(data2, axis=None, scale='normal')
# data1 / data2 come from dfcat1 / dfcat2, so the samples are labelled Cat-1 / Cat-2
# (the message previously said Cat-2 / Cat-3).
print("The medians absolute deviation of Cat-1 = %.3f, and Cat-2 = %.3f" %(med1, med2))
print('The cliff delta of the distributions = %.3f and The effect_szise is: %s' % cliffsDelta(data1, data2))

# interpret
alpha = 0.05
if p > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0)')
Statistics=2849.5, p-value= 0.10124908250502146
The medians absolute deviation of Cat-2 = 4.448, and Cat-3 = 4.448
The cliff delta of the distributions = 0.086 and The effect_szise is: negligible
Same distribution (fail to reject H0)

Corporate diversity

In [8]:
# import random
# # Draw plot
# import matplotlib.patches as patches
# %matplotlib inline
# # Prepare Data
# df = pd.read_csv("./corporate_diversity.csv")

# # Prepare Data
# #df = df.groupby('Releases').size().reset_index(name='Companies')
# n = df['Releases'].unique().__len__()+1
# all_colors = list(plt.cm.colors.cnames.keys())
# random.seed(9)
# c = random.choices(all_colors, k=n)

# # Plot Bars
# fig = plt.figure(figsize=(6,4), dpi= 120)
# plt.bar(df['Releases'], df['Companies'], color="red", width=.2)
# for i, val in enumerate(df['Companies'].values):
#     plt.text(i, val, int(val), horizontalalignment='center',
#              verticalalignment='bottom', fontdict={'fontweight':500, 'size':12})

# # Decoration
# plt.gca().set_xticklabels(df['Releases'], rotation=60, horizontalalignment= 'right',
#                           fontsize=12)

# # Add patches to color the X axis labels
# p1 = patches.Rectangle((.50, -0.005), width=.40, height=.10, alpha=.2,
#                        facecolor='green', transform=fig.transFigure)
# p2 = patches.Rectangle((.120, -0.005), width=.340, height=.10, alpha=.2,
#                        facecolor='yellow', transform=fig.transFigure)
# fig.add_artist(p1)
# fig.add_artist(p2)


# sns.despine(left=True)
# plt.yticks(fontsize=10)
# plt.ylabel('#Corporations across release cycle', fontsize=12)
# plt.xlabel('OpenStack Releases', fontsize=12)
# plt.ylim(0, 600)
# plt.show()
# fig.savefig("company_diversity.pdf")

Pie Chart of top companies by commits.

The "Others" category shows the summed commits of the companies that contributed fewer commits.

In [93]:
# Line chart of the "Companies" column against "Releases" from corporate_diversity.csv.
# Prepare Data
df = pd.read_csv("./corporate_diversity.csv")

# Plot the line.  The unused random colour list that used to be built here is
# gone; it relied on the accidental `plt.cm.colors` attribute.
fig, ax = plt.subplots(figsize=(6, 4), dpi=120)
ax.plot(df['Releases'], df['Companies'], color="red")

# Annotate every point with its value.
# NOTE(review): the x position is the row index, which assumes "Releases" holds
# unique strings (categorical axis with ticks at 0..n-1) - confirm against the CSV.
for i, val in enumerate(df['Companies'].values):
    ax.text(i, val, int(val), horizontalalignment='center',
            verticalalignment='bottom', fontdict={'fontweight': 500, 'size': 12})

# Decoration: restyle the tick labels matplotlib already produced instead of
# overwriting them with set_xticklabels(), which needs fixed tick positions.
plt.setp(ax.get_xticklabels(), rotation=60, horizontalalignment='right', fontsize=12)

sns.despine(left=True)
ax.tick_params(axis='y', labelsize=10)
ax.set_ylabel('#Corporations across release cycle', fontsize=16)
ax.set_xlabel('OpenStack Releases', fontsize=16)
ax.set_ylim(0, 600)
plt.show()
In [17]:
# Import
df = pd.read_csv("./company_commits.csv")


# Draw Plot
fig, ax = plt.subplots(figsize=(12, 8), subplot_kw=dict(aspect="equal"), dpi= 120)

data = df['Commits']
categories = df['Company']
explode = [0,0.1,0,0,0,0,0,0,0,0,0,0,0,0.1]

def func(pct, allvals):
    absolute = int(pct/100.*np.sum(allvals))
    return "{:.1f}%".format(pct)

wedges, texts, autotexts = ax.pie(data, 
                                  autopct=lambda pct: func(pct, data),
                                  textprops=dict(color="black"), 
                                  #colors=plt.cm.Dark2.colors,
                                  startangle=140,
                                  #colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b"],
                                  colors = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
                                            '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
                                            '#9a6324', '#a9a9a9'],
                                  rotatelabels=True,
                                  explode=explode)

# Decoration
ax.legend(wedges, categories, title="Compnaies", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
plt.setp(autotexts, size=10, weight=200)
ax.set_title("Commit proportion by compnaies")
plt.show()
#fig.savefig("company_commit_prop.pdf")
In [19]:
# Treemap of the "Commits" column of company_commits.csv.
import squarify

df = pd.read_csv("./company_commits.csv")

# Tile label: value of the first column, then the value of the third column in
# parentheses on a second line.  `.iloc` makes the positional access explicit;
# plain `row[0]` on a label-indexed Series is deprecated in pandas 2.x.
labels = df.apply(lambda row: str(row.iloc[0]) + "\n  (" + str(row.iloc[2]) + ")", axis=1)
sizes = df['Commits'].values.tolist()

# `sns.palplot()` only draws a palette and returns None, so no colours were
# ever handed to squarify (and a stray figure appeared).  `color_palette()`
# returns the actual colours, one per tile.
colors = sns.color_palette("husl", len(sizes))

# Draw Plot
fig = plt.figure(figsize=(8, 5), dpi=120)
squarify.plot(sizes=sizes, label=labels, color=colors, alpha=.6)

# Decorate (fixes the "compnaies" typo in the title)
plt.title('Commit proportion by companies')
plt.axis('off')

# Save before showing: some backends release the figure once it has been shown.
fig.savefig("company_commit_treemap.pdf")
plt.show()
In [20]:
# #!open ./Efforts.csv
# # msno.matrix(df)
# # msno.heatmap(df)

# import seaborn as sns
# df_effort = pd.read_csv("./Efforts.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)

# df_effort.groupby(["Affiliated", "Category"])["Affiliated"].describe()
In [82]:
# Count plot of rows per "Category" in Affiliation12.csv, bars split by "Affiliated".
df_effort = pd.read_csv("./Affiliation12.csv", sep=',', encoding='utf-8', quoting=csv.QUOTE_ALL)

# catplot is a figure-level function and creates its own figure.  The former
# plt.figure() call only produced the empty "Figure ... with 0 Axes" output and
# left `fig` pointing at that blank figure.
g = sns.catplot(x="Category", hue="Affiliated", kind="count",
                palette="Set2", edgecolor=".1", aspect=0.51,
                data=df_effort)
fig = g.fig  # the figure that actually holds the plot (e.g. for savefig)

sns.despine(left=True)
#plt.xticks(rotation=60)

g.set_xticklabels(['Cat-I', 'Cat-II'], fontsize=16)
plt.yticks(fontsize=16)
plt.ylabel('Hired vs. Volunteer contributors', fontsize=20)
plt.xlabel('OpenStack contributors', fontsize=18)

# Retitle the figure-level legend and replace its entries, in order.
legend = g._legend
legend.set_title('Status')
for legend_text, new_label in zip(legend.texts, ['Hire', 'Vols']):
    legend_text.set_text(new_label)




#plt.show()
#plt.legend() #.remove()
#fig.savefig("affiliation_cat12.pdf")
<Figure size 720x480 with 0 Axes>
In [6]:
 
In [78]:
# Count plot of rows per "Category" in Efforts.csv, bars split by "Affiliated".
#sns.set(style="ticks", palette="pastel", color_codes=True)
df_effort = pd.read_csv("./Efforts.csv", sep=',', encoding='utf-8', quoting=csv.QUOTE_ALL)
#df_effort = pd.read_csv("./Affiliation12.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)

# catplot is a figure-level function and creates its own figure.  The former
# plt.figure() call only produced the empty "Figure ... with 0 Axes" output and
# left `fig` pointing at that blank figure.
g = sns.catplot(x="Category", hue="Affiliated", kind="count",
                palette="Set2", edgecolor=".1", aspect=0.51,
                data=df_effort)
fig = g.fig  # the figure that actually holds the plot (e.g. for savefig)

sns.despine(left=True)
#plt.xticks(rotation=60)

plt.yticks(fontsize=16)
g.set_xticklabels(['Cat-II', 'Cat-III'], fontsize=16)
plt.ylabel('Hired vs. Volunteer contributors', fontsize=20)
plt.xlabel('OpenStack contributors', fontsize=18)

# Retitle the figure-level legend and replace its entries, in order.
legend = g._legend
legend.set_title('Status')
for legend_text, new_label in zip(legend.texts, ['Hire', 'Vols']):
    legend_text.set_text(new_label)

# Enlarge the legend entries.  The former `plt.legend.fontsize = 'x-large'`
# merely set an attribute on the pyplot function and changed nothing.
plt.setp(legend.get_texts(), fontsize='x-large')
    
<Figure size 720x480 with 0 Axes>

Chi-Square --- Analysis

In [11]:
# dfszz = pd.read_csv("./commit_proportion23.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)

# dfszz.groupby(['Categories','Commits'])['Commits'].size().reset_index(name='counts')

# df_b4 = pd.read_csv('./category_3.csv', sep=",")
# df_b4.groupby(['Events', 'Gender']).size().reset_index(name='counts')

# #df_b4.groupby('Gender')[['Events']].describe()
# #df_b4.groupby('Events', as_index=False).agg({"Gender": "count"})
# .size().reset_index(name='counts')
In [ ]:
# Contingency table: `df.collab` values as rows against `df.bug_status` values as columns.
# NOTE(review): `df` is whatever frame an earlier cell left behind; the CSVs loaded
# into `df` in the visible cells are not shown to carry `collab` / `bug_status`
# columns - confirm which data set this cell expects before running it.
tab = pd.crosstab([df.collab], [df.bug_status])
In [ ]:
# Chi-squared test of independence on the contingency table `tab`, interpreted
# both through the critical value and through the p-value.
from scipy.stats import chi2_contingency
from scipy.stats import chi2
# from pprint import pprint
# contingency table


# table = [[2648, 219666],
# 		 [10341, 404212]
#         ]

table = tab

# table = [[2648, 10341],
#  		 [219666, 404212]
#          ]
#pprint(table)
stat, p, dof, expected = chi2_contingency(table)
# .apply(lambda x: round(x, 3))
print('dof= {}\n'.format(dof))
print("\nexpected =\n {}".format(np.round(expected, 3)))
print("\n----------- interpret test-statistic ----------------\n")
# Fix the significance level and derive the confidence from it.  Computing
# alpha as 1.0 - 0.99 gave 0.010000000000000009, which leaked into the output.
alpha = 0.01
prob = 1.0 - alpha
critical = chi2.ppf(prob, dof)
print('probability = %.3f,\ncritical = %.3f,\nstat = %.3f,' % (prob, critical, stat))
# The chi-squared statistic is non-negative, so no abs() is needed.
if stat >= critical:
    print('Dependent (reject H0) ')
else:
    print('Independent (fail to reject H0)')
print("\n++++++++++++++ interpret p-value ++++++++++++++\n")
print("significance = {}, p = {}".format(alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
In [210]:
# import seaborn as sns
# #sns.set(style="ticks", palette="pastel", color_codes=True)
# #df_effort = pd.read_csv("./Efforts.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
# df_effort = pd.read_csv("./Affiliation12.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
# fig = plt.figure(figsize=(6,4), dpi= 120)

# g = sns.catplot(x="Category", hue="Active", kind="count",
#             palette="Set2", edgecolor=".1", aspect=0.51,
#             data=df_effort);


# sns.despine(left=True)
# #plt.xticks(rotation=60)

# plt.yticks(fontsize=16)
# g.set_xticklabels(['Cat-I', 'Cat-II'], fontsize=16)
# plt.ylabel('Hired vs. Volunteer contributors', fontsize=20)
# plt.xlabel('OpenStack Releases', fontsize=18)
In [44]:
# Count plot of rows per "Category" in Affiliation12.csv, bars split by "Affiliated".
# NOTE(review): this cell was headed "Logivity", but the plot below only counts
# affiliation per category - confirm the intended metric.
#sns.set(style="ticks", palette="pastel", color_codes=True)
#df_effort = pd.read_csv("./Efforts.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
df_effort = pd.read_csv("./Affiliation12.csv", sep=',', encoding='utf-8', quoting=csv.QUOTE_ALL)

# catplot is a figure-level function and creates its own figure.  The former
# plt.figure() call only produced the empty "Figure ... with 0 Axes" output.
g = sns.catplot(x="Category", hue="Affiliated", kind="count",
                palette="Set2", edgecolor=".1", aspect=0.51,
                data=df_effort)
fig = g.fig  # the figure that actually holds the plot

sns.despine(left=True)
#plt.xticks(rotation=60)

plt.yticks(fontsize=16)
g.set_xticklabels(['Cat-I', 'Cat-II'], fontsize=16)
plt.ylabel('Hired vs. Volunteer contributors', fontsize=20)
# The x axis shows contributor categories, not releases.
plt.xlabel('OpenStack contributors', fontsize=18)

# Find the legend: check each axes in turn.  The loop used to inspect
# g.axes.flat[0] on every iteration instead of the current axes.
leg = None
for ax in g.axes.flat:
    leg = ax.get_legend()
    if leg is not None:
        break
# or legend may be on a figure
if leg is None:
    leg = g._legend

# change legend texts
new_title = 'Status'
leg.set_title(new_title)
new_labels = ['Hire', 'Vols']
for t, l in zip(leg.texts, new_labels):
    t.set_text(l)

#plt.show()


#plt.legend(title='Status', labels=['Hired', 'Volunteers'], bbox_to_anchor=(1.01,0.5), loc='best')
# plt.show(g)

#############
<Figure size 720x480 with 0 Axes>
In [211]:
# import seaborn as sns
# # sns.set(style="ticks", palette="bright", color_codes=True)
# # df_effort = pd.read_csv("./Affiliation12.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
# # fig = plt.figure(figsize=(6,4),dpi= 120)

# # # Draw a nested violinplot and split the violins for easier comparison
# # g=sns.boxplot(x="Events",  y="Logivity", hue="Category", data=df_effort,
# #                 palette="Set2")
# # sns.despine(left=True)
# # plt.xticks(rotation=60)
# # g.set_yscale('linear')
# # g.set(xlabel='OpenStack Releases', ylabel='Effort made by contributors')
# # g.set_xticklabels(['Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Octava', 'Pike'])
# # plt.show()
# # #fig.savefig("effort-bean-cat23.pdf")

# df_effort = pd.read_csv("./Affiliation12.csv",sep=',', encoding='utf-8',quoting=csv.QUOTE_ALL)
# fig = plt.figure(figsize=(6,4), dpi= 120)

# g = sns.catplot(x="Category", hue="Affiliated", kind="count",
#             palette="Set2", edgecolor=".1", aspect=0.51,
#             data=df_effort);


# sns.despine(left=True)
# #plt.xticks(rotation=60)

# plt.yticks(fontsize=16)
# g.set_xticklabels(['Cat-I', 'Cat-II'], fontsize=16)
# plt.ylabel('Hired vs. Volunteer contributors', fontsize=20)
# plt.xlabel('OpenStack Releases', fontsize=18)
In [181]:
######.  EFFORT
# Grouped box plot of "Effort" per "Events" value from Effort12.csv,
# one box per "Category" within each group.
sns.set(style="ticks", palette="pastel", color_codes=True)
df_effort = pd.read_csv("./Effort12.csv", sep=',', encoding='utf-8', quoting=csv.QUOTE_ALL)

fig, ax = plt.subplots(figsize=(6, 4), dpi=120)

# Draw a grouped boxplot (the previous comment wrongly called it a violinplot)
g = sns.boxplot(x="Events", y="Effort", hue="Category", data=df_effort,
                palette="Set2", ax=ax)  # muted

sns.despine(left=True)
g.set(xlabel='OpenStack Releases', ylabel='Effort made by contributors 1-2')

# NOTE(review): the labels are applied positionally, so this assumes the "Events"
# values are drawn in release order Cactus..Pike - confirm against the CSV.
# 'Ocata' is the OpenStack release between Newton and Pike (was misspelled 'Octava').
g.set_xticklabels(["Cactus", "Diablo", "Essex", "Folsom", "Grizzly", "Havana", "Icehouse",
                   'Juno', 'Kilo', 'Liberty', 'Mitaka', 'Newton', 'Ocata', 'Pike'],
                  rotation=60)
plt.show()
#fig.savefig("effort-bean-cat12.pdf")
In [ ]:
 

Polar chart comparing metrics for onboarding vs. direct integration

In [47]:
# #!/usr/bin/env python3

# """

# efault template: 'plotly'
#     Available templates:
#         ['ggplot2', 'seaborn', 'simple_white', 'plotly',
#          'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
#          'ygridoff', 'gridon', 'none']
         
         
#   Cat2
# Females	10
# Effort	9
# Density	158.8553474
# Accept_Rate	0.3677
# Technical Diversity	1
# Software Quality	23
# Time2Commit	267.84 (9 mths)
# Commit density 158.8553474 (/25) == 6.38
# longivity 1046 ==> 34 mths
# Hired by company 58.54%
# -------------------

# Cat3
# Females	13	
# Effort	3
# Density	255.8331579
# Accept_Rate	0.4902
# Technical Diversity	3
# Software Quality	21
# Time2Commit	118.9875 (4 mths)
# Commit density	255.8331579 (/25) == 10.24
# longivity 1298 ==> 43 mths
# Hired by company 82%
# ----------------------
# 'Females','Technical diversity', 'Commit density', 'Hired', 'Longivity', 'Acceptance rate', 'First commit', 'Buggy Commits', 'Efforts'
# 10,1,6.38,58.54,34,36,9,23,9
# 13,3,10.24,82,43,49,4,21,3
# """
# import plotly.graph_objects as go

# categories = ['GD(f)','TD', 'Den', 'CD', 'Rt', 
#               'CAR', 'TFC', 'SZZ', 'Eft']

# fig = go.Figure()


# fig.add_trace(go.Scatterpolar(
#       r=[13,3,10.24,82,43,49,4,21,3],
#       theta=categories,
#       fill='toself',
#       mode = "lines+markers",
#         fillcolor = 'white', # #FFAA70
#         line =  dict(
#             color = 'red'
#         ),
#         marker = dict(
#         color = "black", # #8090c7
#         symbol = "square",
#         size = 7
#       ),
#       name='Cat-III'
# ))

# fig.add_trace(go.Scatterpolar(
#       r=[10,1,6.38,58.54,34,36,9,23,9],
#       theta=categories,
#       fill='toself',
#       mode = 'markers',
#         #fillcolor = 'green', #
#         line =  dict(
#             color = 'green'
#         ),
#         marker = dict(
#         color = "blue", # #8090c7
#         symbol = "star",
#         size = 8
#       ),
#       name='Cat-II'
# ))

# fig.update_layout(
#   polar=dict(
#     radialaxis=dict(
#       visible=True,
#       range=[0, 90]
#     )),
#     font=dict(
#         size=16
#     ),
#     legend=dict(
#         font=dict(
#             size=16
#         )
#     ),
#     template="presentation",
#   showlegend=True
# )

# fig.show()
In [3]:
# import numpy as np
# import matplotlib.pyplot as plt


# N = 1
# menMeans = (45)
# womenMeans = (5)
# neutralMeans = (11)
# menStd = (2)
# womenStd = (3)
# neutralStd = ()


# ind = np.arange(N)    # the x locations for the groups
# width = 0.15       # the width of the bars: can also be len(x) sequence

# p1 = plt.bar(ind, menMeans, width, yerr=menStd)
# p2 = plt.bar(ind, womenMeans, width,
#              bottom=menMeans, yerr=womenStd)

# plt.ylabel('Scores')
# plt.title('Scores by group and gender')
# plt.xticks(ind, ('Cat1', 'Cat2', 'Cat3'))
# plt.yticks(np.arange(0, 81, 10))
# plt.legend((p1[0], p2[0]), ('Men', 'Women'))

# plt.show()
In [71]:
 
In [41]:
# import matplotlib.pyplot as plt


# def make_patch_spines_invisible(ax):
#     ax.set_frame_on(True)
#     ax.patch.set_visible(False)
#     for sp in ax.spines.values():
#         sp.set_visible(False)


# fig, host = plt.subplots()
# fig.subplots_adjust(right=0.75)

# par1 = host.twinx()
# par2 = host.twinx()

# # Offset the right spine of par2.  The ticks and label have already been
# # placed on the right by twinx above.
# par2.spines["right"].set_position(("axes", 1.2))
# # Having been created by twinx, par2 has its frame off, so the line of its
# # detached spine is invisible.  First, activate the frame but make the patch
# # and spines invisible.
# make_patch_spines_invisible(par2)
# # Second, show the right spine.
# par2.spines["right"].set_visible(True)

# p1, = host.plot([0, 1, 2], [0, 1, 2], "b-", label="Density")
# p2, = par1.plot([0, 1, 2], [0, 3, 2], "r-", label="Temperature")
# p3, = par2.plot([0, 1, 2], [50, 30, 15], "g-", label="Velocity")

# host.set_xlim(0, 2)
# host.set_ylim(0, 2)
# par1.set_ylim(0, 4)
# par2.set_ylim(1, 65)

# host.set_xlabel("Distance")
# host.set_ylabel("Density")
# par1.set_ylabel("Temperature")
# par2.set_ylabel("Velocity")

# host.yaxis.label.set_color(p1.get_color())
# par1.yaxis.label.set_color(p2.get_color())
# par2.yaxis.label.set_color(p3.get_color())

# tkw = dict(size=4, width=1.5)
# host.tick_params(axis='y', colors=p1.get_color(), **tkw)
# par1.tick_params(axis='y', colors=p2.get_color(), **tkw)
# par2.tick_params(axis='y', colors=p3.get_color(), **tkw)
# host.tick_params(axis='x', **tkw)

# lines = [p1, p2, p3]

# host.legend(lines, [l.get_label() for l in lines])

# plt.show()

Summary plot - Radar

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import pi

# For visualization purposes, we use a radar chart to show an overall view of
# how Cat-II and Cat-III performed.  The metrics are not on the same scale.
#
# One row per category ('cat_id'), one column per metric.
df = pd.DataFrame({'cat_id': ['cat3', 'cat2'],
                   'GD(f)': [15, 10],
                   'TD'   : [6,   1],
                   'Den'  : [14,  7],
                   'CD'   : [40, 29],
                   'Rt'   : [23, 17],
                   'PAR'  : [27, 18],
                   'TFC'  : [4,   8],
                   'SZZ'  : [9, 13],
                   'Eft'  : [3,   6]},
                  columns=['cat_id', 'GD(f)', 'TD', 'Den', 'CD', 'Rt', 'PAR', 'TFC', 'SZZ', 'Eft'])

# Metric names: every column except the leading 'cat_id' identifier.
categories = list(df)[1:]

# Per-metric mean over the two categories.  The mean is restricted to the metric
# columns: DataFrame.mean() over the string 'cat_id' column raises TypeError on
# pandas >= 2.0 (older versions silently skipped it).
values = df[categories].mean().values.flatten().tolist()
values += values[:1] # repeat the first value to close the circular graph

# One evenly spaced angle per metric, closed by repeating the first angle.
angles = [n / float(len(categories)) * 2 * pi for n in range(len(categories))]
angles += angles[:1]
In [11]:
# Radar chart of the two rows of `df` (row 0 labelled Cat-3, row 1 labelled Cat-2)
# over the metrics listed in `categories`, using the closed `angles` list.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6), subplot_kw=dict(polar=True))

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, color='black', size=16)

# One radial grid line per unit from 1 to 39.  Only the first 15 carry a label
# (drawn in white); the rest are padded with '' because current matplotlib
# requires exactly one label per tick and otherwise raises ValueError.
radial_ticks = np.arange(1, 40)
radial_labels = [str(tick) if tick <= 15 else '' for tick in radial_ticks]
ax.set_yticks(radial_ticks)
ax.set_yticklabels(radial_labels, color='white', size=16)

ax.set_ylim(0, 40)
ax.set_rlabel_position(40)

# part 1: first row of df, closed by repeating its first value
val_c1 = df.loc[0].drop('cat_id').values.flatten().tolist()
val_c1 += val_c1[:1]

ax.plot(angles, val_c1, linewidth=4, marker='*', markersize=10, linestyle='solid', color='#CD853F', label='Cat-3')
ax.fill(angles, val_c1, '#FFFFFF', alpha=1.0)

# part 2: second row of df, closed the same way
val_c2 = df.loc[1].drop('cat_id').values.flatten().tolist()
val_c2 += val_c2[:1]

ax.plot(angles, val_c2, linewidth=3, marker='o', markersize=8, linestyle='solid', color='g', label='Cat-2')
ax.fill(angles, val_c2, '#FFFF00', alpha=0.3)

## Legend
ax.legend(loc='upper right', bbox_to_anchor=(1.1, 0.1), fontsize=16)

# Save before showing: some backends release the figure once it has been shown.
fig.savefig("polr_summary.pdf")
plt.show()

!open polr_summary.pdf