Varsha Waingankar
https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
This analysis studies the case of customers' default payments in Taiwan and compares the predictive accuracy of the probability of default across several methods.
There are 25 variables, each described below.
# There are 25 variables:
#ID: ID of each client
#LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
#SEX: Gender (1=male, 2=female)
#EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
#MARRIAGE: Marital status (1=married, 2=single, 3=others)
#AGE: Age in years
#PAY_0: Repayment status in September, 2005
#(-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above)
#PAY_2: Repayment status in August, 2005 (scale same as above)
#PAY_3: Repayment status in July, 2005 (scale same as above)
#PAY_4: Repayment status in June, 2005 (scale same as above)
#PAY_5: Repayment status in May, 2005 (scale same as above)
#PAY_6: Repayment status in April, 2005 (scale same as above)
#BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
#BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
#BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
#BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
#BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
#BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
#PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
#PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
#PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
#PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
#PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
#PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
#default.payment.next.month: Default payment (1=yes, 0=no)
#importing all the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from imblearn.pipeline import make_pipeline as make_pipeline_imb # to chain resampling and model fitting in a single step
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, fbeta_score, confusion_matrix, precision_recall_curve, accuracy_score
#Reading the data using pandas
df = pd.read_excel("default of credit card clients.xls")
df.head()
#Replacing the column name for convenience
df.rename(columns={"default payment next month": "default"}, inplace = True)
#checking the columns
df.columns
df['default'].value_counts()
The data is highly imbalanced, with roughly a 78:22 split between non-default and default.
plt.figure(2)
labels = 'No default - 0', 'Default - 1'
colors = ['gold', 'yellowgreen']
plt.pie(df.groupby('default').size(),labels=labels, colors=colors,autopct='%1.1f%%', shadow=True, startangle=140)
#plt.axis('equal')
plt.show()
#Checking for null values
#function to do it
for c in df:
    if df[c].isnull().values.any():
        print("nan values present in " + c)
    else:
        print("{}: no null values".format(c))
Reduce the unknown EDUCATION values to category 4 (others).
df['EDUCATION'].unique()
#Change values for education (1 = graduate school; 2 = university; 3 = high school; 4 = others)
#The unknown values 0, 5, and 6 will be changed to 4
fil = (df['EDUCATION'] == 5) | (df['EDUCATION'] == 6) | (df['EDUCATION']== 0)
df.loc[fil, 'EDUCATION'] = 4
df['EDUCATION'].value_counts()
df['MARRIAGE'].unique()
Reduce the unknown MARRIAGE value (0) to category 3 (others).
df.loc[df['MARRIAGE'] == 0, 'MARRIAGE'] = 3
df['MARRIAGE'].value_counts()
sns.distplot(df['LIMIT_BAL'],kde=True)
sns.distplot(df['AGE'],kde=True)
The target variable "default" correlates most strongly with the repayment status variables (PAY_0 through PAY_6), indicating that repayment status is the best feature for predicting default.
corr = df.drop(['ID'], axis=1).corr()
f, ax = plt.subplots(figsize=(7, 7))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5)
continuous_var = df.describe().columns
_ = df.hist(column=continuous_var, figsize=(20,20))
Transformation to normalize values: if the distribution is not Gaussian or the standard deviation is very small, the min-max scaler works better than standardization.
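As a quick illustration, a minimal sketch of the formula the scaler applies to each column independently (the balance values here are hypothetical):
import numpy as np
# Min-max scaling maps each column into [0, 1]: x' = (x - min) / (max - min)
x = np.array([20000., 120000., 500000.])         # hypothetical LIMIT_BAL values
x_scaled = (x - x.min()) / (x.max() - x.min())
print(x_scaled)                                  # [0.         0.20833333 1.        ]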
minmax_scale = preprocessing.MinMaxScaler().fit(df)
df_minmax = minmax_scale.transform(df)
df_minmax = pd.DataFrame(df_minmax, columns= list(df))
df_minmax.hist(figsize=(20,20))
plt.show()
#Checking if gender has any association with the Limit Balance.
#Roughly equal across genders, so no clear relationship
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)
sns.barplot(x='SEX',y='LIMIT_BAL', data=df,ax=ax)
#Education alone does not clearly indicate whether a person will default
pd.crosstab(df['EDUCATION'], df['default']).plot(kind='bar')
plt.title('Frequency for educational qualification')
plt.xlabel('Education')
plt.ylabel('Frequency of default')
plt.show()
print("1 : graduate school; 2 : university; 3 : high school; 4 : others")
#Marital status also doesn't provide insights if a person will default or not
pd.crosstab(df['MARRIAGE'],df['default']).plot(kind='bar')
plt.title('Frequency of marital status')
plt.xlabel('Marital Status')
plt.ylabel('Frequency of default')
plt.show()
print("1 : married; 2 : single; 3 : others ")
#VALUES AFTER APPLYING MINMAX SCALER
pd.crosstab(df_minmax['MARRIAGE'],df_minmax['default']).plot(kind='bar')
plt.title('Frequency of marital status')
plt.xlabel('Marital Status')
plt.ylabel('Frequency of default')
plt.show()
#print("1 : married; 2 : single; 3 : others ")
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)
sns.barplot(x='EDUCATION',y='LIMIT_BAL', data=df,ax=ax)
plt.title("Education level and amount of limit balance")
print("1 : graduate school; 2 : university; 3 : high school; 4 : others")
plt.show()
temp = df["default"].value_counts()
df1 = pd.DataFrame({'default': temp.index,'values': temp.values})
plt.figure(figsize = (6,6))
plt.title('Default Credit Card Clients - target value - data unbalance\n (Default = 1, Not Default = 0)')
sns.barplot(x = 'default', y="values", data=df1)
locs, labels = plt.xticks()
plt.show()
#Perform oversampling to balance the data
X = df_minmax.drop(["default"], axis=1).values #Setting the X to do the split
y = df_minmax["default"].values # transforming the values in array
X_train, X_test, y_train, y_test=train_test_split(X, y, random_state=2, test_size=0.20)
# Separate majority and minority classes
df_majority = df_minmax[df_minmax['default']==0]
df_minority = df_minmax[df_minmax['default']==1]
print(df_majority['default'].count())
print("-----------")
print(df_minority['default'].count())
print("-----------")
print(df['default'].value_counts())
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
prediction = logreg.predict(X_test)
print("accuaracy of model")
a= accuracy_score(y_test, prediction)
a=a*100
print(a)
Since the class distribution is roughly 78:22, running a model on the imbalanced data yields about 80 percent accuracy, which is not meaningful: always guessing the majority class gives nearly the same result.
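As a sanity check, a minimal sketch of that majority-class baseline using sklearn's DummyClassifier (an illustrative addition, not part of the original analysis):
from sklearn.dummy import DummyClassifier
# Always predicting the majority class (non-default) already scores ~78 percent
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)
print(accuracy_score(y_test, dummy.predict(X_test)) * 100)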
Random oversampling of the minority class is performed to get equal proportions of both classes. Random oversampling simply replicates existing minority-class data points.
from sklearn.utils import resample
# Upsample minority class
df_minority_oversampling = resample(df_minority,
                                    replace=True,     # sample with replacement
                                    n_samples=22677,  # approximately match the majority class count
                                    random_state=587) # reproducible results
# Combine majority class with upsampled minority class
df_oversample = pd.concat([df_majority, df_minority_oversampling])
# Display new class counts
print("Now the distribution of non default and default are almost close")
df_oversample['default'].value_counts()
#using the new data frame - oversampled dataframe --- oversampling of minority class
X = df_oversample.drop(["default"], axis=1).values #Setting the X to do the split
y = df_oversample["default"].values # transforming the values in array
X_train, X_test, y_train, y_test=train_test_split(X, y, random_state=2, test_size=0.20)
# Create dictionary for storing values of all models
prediction = dict()
#Run the logistic Regression model
#import the linear_model class from sklearn package
from sklearn import linear_model
#create an object of the class, logreg is the object of class LogisticRegression
logreg = linear_model.LogisticRegression(C=1e5)
#call .fit on X_train (the predictors) and y_train (the target); 80 percent of the data is used for training
logreg.fit(X_train, y_train)
#Model learns from training process
#After training, predict the class for the remaining 20 percent of the data
prediction['Logistic'] = logreg.predict(X_test)
#after predicting we check for the accuracy
#Accuracy compares the actual target classes in the test data against the predicted classes
print("accuracy of model")
a = accuracy_score(y_test, prediction['Logistic']) * 100
print(a)
#Print the confusion matrix
#Confusion matrix is classifying Actual and predicted
#False negative ---Predicted as negative but actually positive
#True Positive ----Predicted as positive and actually positive
#True Negative ---- Predicted as negative and actually negative
#False Positive----Predicted as positive but actually negative
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, prediction['Logistic'])  # avoid shadowing the function name
#print(cm)
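# Illustrative addition (assumes binary labels with 1 = default): unpack the
# four cells described above and derive precision and recall from them.
tn, fp, fn, tp = cm.ravel()
print("precision:", tp / (tp + fp))  # of predicted defaults, how many were actual defaults
print("recall:", tp / (tp + fn))     # of actual defaults, how many were caught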
import scikitplot as skplt
skplt.metrics.plot_confusion_matrix(y_test, prediction['Logistic'])
plt.show()
skplt.metrics.plot_confusion_matrix(y_test, prediction['Logistic'],normalize=True)
plt.show()
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_test,prediction['Logistic'])
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
To classify a data point, we find its k nearest neighbors (k is usually chosen odd) and take a majority vote on their labels; the majority class label is assigned to the point. Ties (possible when k is even) are typically broken by distance, preferring the closer neighbors.
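For illustration only (sklearn's KNeighborsClassifier below is what we actually use), a minimal sketch of the voting step:
import numpy as np
from collections import Counter

def knn_vote(X_tr, y_tr, x_query, k=5):
    dists = np.linalg.norm(X_tr - x_query, axis=1)      # Euclidean distance to every training point
    nearest = np.argsort(dists)[:k]                     # indices of the k nearest neighbors
    return Counter(y_tr[nearest]).most_common(1)[0][0]  # majority label wins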
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)
prediction['KNN']= classifier.predict(X_test)
print("accuaracy of model")
a= accuracy_score(y_test, prediction['KNN'])
a=a*100
print(a)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, prediction['KNN'])
#print(cm)
import scikitplot as skplt
skplt.metrics.plot_confusion_matrix(y_test, prediction['KNN'])
plt.show()
skplt.metrics.plot_confusion_matrix(y_test, prediction['KNN'],normalize=True)
plt.show()
average_precision = average_precision_score(y_test,prediction['KNN'])
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
Gini impurity measures how often a randomly chosen element from a set would be incorrectly labeled if it were labeled randomly according to the distribution of labels in the subset.
The decision tree classifier splits the data points on the values of each variable and selects the variable that creates the most homogeneous subsets (which are heterogeneous with respect to each other).
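Concretely, for class proportions p_i in a node, Gini impurity is G = 1 - sum(p_i^2); a minimal sketch:
import numpy as np

def gini(labels):
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

print(gini([0, 0, 0, 0]))  # 0.0   -- pure node, never mislabeled
print(gini([0, 0, 1, 1]))  # 0.5   -- maximally mixed two-class node
print(gini([0, 0, 0, 1]))  # 0.375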
#Calling the DecisionTreeClassifier class
clf_gini = DecisionTreeClassifier(criterion="gini", random_state=100,
                                  max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)
prediction['DecisionTree'] = clf_gini.predict(X_test)
print("accuracy of the model")
a=accuracy_score(y_test, prediction['DecisionTree'])
a=a*100
print(a)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, prediction['DecisionTree'])
#print(cm)
import scikitplot as skplt
skplt.metrics.plot_confusion_matrix(y_test, prediction['DecisionTree'])
plt.show()
skplt.metrics.plot_confusion_matrix(y_test, prediction['DecisionTree'],normalize=True)
plt.show()
average_precision = average_precision_score(y_test,prediction['DecisionTree'])
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
from io import StringIO  # sklearn.externals.six was removed in newer sklearn versions
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(clf_gini, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
In particular, trees that are grown very deep tend to learn highly irregular patterns: they overfit their training sets, i.e. they have low bias but very high variance. Random forests are a way of averaging multiple deep decision trees, trained on different parts of the same training set, with the goal of reducing that variance.
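To make the averaging idea concrete, a minimal bagging sketch (illustration only; the RandomForestClassifier below additionally subsamples features at each split):
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(9)
trees = []
for _ in range(11):
    idx = rng.randint(0, len(X_train), len(X_train))   # bootstrap sample with replacement
    trees.append(DecisionTreeClassifier().fit(X_train[idx], y_train[idx]))
votes = np.mean([t.predict(X_test) for t in trees], axis=0)  # average the tree votes
bagged_pred = (votes >= 0.5).astype(int)                     # majority decision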
#calling the RandomForest Classifier
clf = RandomForestClassifier(n_jobs=-1,      # use all available CPU cores
                             random_state=9,
                             n_estimators=11,
                             verbose=False)
clf.fit(X_train,y_train)
prediction['RandomForest'] = clf.predict(X_test)
a= accuracy_score(prediction['RandomForest'], y_test)
a= a*100
print(a)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, prediction['RandomForest'])
#print(cm)
import scikitplot as skplt
skplt.metrics.plot_confusion_matrix(y_test, prediction['RandomForest'])
plt.show()
skplt.metrics.plot_confusion_matrix(y_test, prediction['RandomForest'],normalize=True)
plt.show()
average_precision = average_precision_score(y_test,prediction['RandomForest'])
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
import pydot
# Limit depth of tree to 3 levels
rf_small = RandomForestClassifier(n_estimators=10, max_depth = 3)
rf_small.fit(X_train, y_train)
# Extract the small tree
tree_small = rf_small.estimators_[5]
dfnew = df_minmax.iloc[:, :-1]
feature_list =list(dfnew.columns)
# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_list, rounded = True, filled =True,precision = 1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png');
Image(graph.create_png())
Check the best features to use for predictive modeling.
#Plot of feature importances, i.e. the best features to use while modeling
#The number of estimators can be tuned; changing it will change the accuracy
target = 'default'
predictors = [ 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
'BILL_AMT1','BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
train_df, val_df = train_test_split(df_minmax, test_size=0.20, random_state=10, shuffle=True)
clf = RandomForestClassifier(n_jobs=-1,      # use all available CPU cores
                             random_state=9,
                             n_estimators=10,
                             verbose=False)
clf.fit(train_df[predictors], train_df[target].values)
preds = clf.predict(val_df[predictors])
tmp = pd.DataFrame({'Feature': predictors, 'Feature importance': clf.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()
accuracy_score(preds,val_df[target])
from sklearn.metrics import accuracy_score
for model, predicted in prediction.items():
    accuracy = accuracy_score(y_test, predicted)
    print(model, accuracy * 100)
Accuracy shows how well the model performs overall, but on imbalanced classification problems it can be misleading; the confusion matrix and AUC are more solid metrics for checking a classifier's performance.
#Plotting the ROC - Area Under the Curve for all the models
cmp = 0
colors = ['b', 'g', 'y', 'm', 'k']
for model, predicted in prediction.items():
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, predicted)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.plot(false_positive_rate, true_positive_rate, colors[cmp], label='%s: AUC %0.2f' % (model, roc_auc))
    cmp += 1
plt.title('Classifiers comparison with ROC')
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
SMOTE is an over-sampling approach in which the minority class is over-sampled by creating "synthetic" examples rather than by over-sampling with replacement. Because it does not simply replicate existing points, it reduces the chance of overfitting.
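A minimal sketch of how one synthetic point is created (the two points here are hypothetical): pick a minority sample x_i and one of its minority-class nearest neighbors x_nn, then interpolate between them.
import numpy as np

rng = np.random.RandomState(4)
x_i = np.array([0.2, 0.5])        # hypothetical minority sample
x_nn = np.array([0.4, 0.3])       # hypothetical nearest minority-class neighbor
lam = rng.rand()                  # random interpolation factor in [0, 1]
x_new = x_i + lam * (x_nn - x_i)  # synthetic point on the segment between them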
#Perform oversampling to balance the data
X = df_minmax.drop(["default"], axis=1).values #Setting the X to do the split
y = df_minmax["default"].values # transforming the values in array
# splitting data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.20)
classifier = RandomForestClassifier
# build model with SMOTE imblearn
smote_pipeline = make_pipeline_imb(SMOTE(random_state=4),
                                   classifier(random_state=42))
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)
#Showing the difference before and after the transformation
print("normal data distribution: {}".format(Counter(y)))
X_smote, y_smote = SMOTE().fit_resample(X, y)  # fit_sample in older imblearn versions
print("SMOTE data distribution: {}".format(Counter(y_smote)))
#Train test split
X_train, X_test, y_train, y_test=train_test_split(X_smote, y_smote, random_state=2, test_size=0.20)
prediction=dict()
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)
prediction['KNN']= classifier.predict(X_test)
print("accuaracy of model")
a= accuracy_score(y_test, prediction['KNN'])
a=a*100
print(a)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, prediction['KNN'])
skplt.metrics.plot_confusion_matrix(y_test, prediction['KNN'],normalize=True)
plt.show()
average_precision = average_precision_score(y_test,prediction['KNN'])
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
clf = RandomForestClassifier(n_jobs=-1,      # use all available CPU cores
                             random_state=9,
                             n_estimators=11,
                             verbose=False)
clf.fit(X_train,y_train)
prediction['RandomForest'] = clf.predict(X_test)
a= accuracy_score(prediction['RandomForest'], y_test)
a= a*100
print(a)
skplt.metrics.plot_confusion_matrix(y_test, prediction['RandomForest'],normalize=True)
plt.show()
average_precision = average_precision_score(y_test,prediction['RandomForest'])
print('Average precision-recall score: {0:0.2f}'.format(average_precision))
from sklearn.metrics import accuracy_score
for model, predicted in prediction.items():
    accuracy = accuracy_score(y_test, predicted)
    print(model, accuracy * 100)
cmp = 0
colors = ['b', 'g', 'y', 'm', 'k']
for model, predicted in prediction.items():
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, predicted)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.plot(false_positive_rate, true_positive_rate, colors[cmp], label='%s: AUC %0.2f' % (model, roc_auc))
    cmp += 1
plt.title('Classifiers comparison with ROC')
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
Undersampling is not expected to produce results as good, because it discards observations; a model needs a sufficiently large number of observations to perform well.
#Perform undersampling to balance the data
X = df_minmax.drop(["default"], axis=1).values #Setting the X to do the split
y = df_minmax["default"].values # transforming the values in array
X_train, X_test, y_train, y_test=train_test_split(X, y, random_state=2, test_size=0.20)
# Separate majority and minority classes
df_majority = df_minmax[df_minmax['default']==0]
df_minority = df_minmax[df_minmax['default']==1]
print(df_majority['default'].count())
print("-----------")
print(df_minority['default'].count())
print("-----------")
print(df['default'].value_counts())
from sklearn.utils import resample
# Downsample majority class
df_majority_undersampling = resample(df_majority,
                                     replace=False,    # sample without replacement
                                     n_samples=6677,   # approximately match the minority class count
                                     random_state=587) # reproducible results
# Combine minority class with downsampled majority class
df_undersample = pd.concat([df_minority, df_majority_undersampling])
# Display new class counts
df_undersample['default'].value_counts()
#using the new data frame - undersampled dataframe --- undersampling of majority class
X = df_undersample.drop(["default"], axis=1).values #Setting the X to do the split
y = df_undersample["default"].values # transforming the values in array
X_train, X_test, y_train, y_test=train_test_split(X, y, random_state=2, test_size=0.20)
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)
prediction['KNN']= classifier.predict(X_test)
clf = RandomForestClassifier(n_jobs=-1,      # use all available CPU cores
                             random_state=9,
                             n_estimators=11,
                             verbose=False)
clf.fit(X_train,y_train)
prediction['RandomForest'] = clf.predict(X_test)
for model, predicted in prediction.items():
    accuracy = accuracy_score(y_test, predicted)
    print(model, accuracy * 100)