# Regression
Code used for the regression analysis.
The input for this notebook is `netlogo_output/df_sum.csv`, as explained in `data_wrangling.md`.

In [1]:
# DEPENDENCIES
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
# `scipy.stats.stats` is a deprecated private namespace; `pearsonr` is
# exposed publicly from `scipy.stats`, which works on old and new SciPy alike.
from scipy.stats import pearsonr
from sklearn import preprocessing

# Global plot styling: seaborn white grid background and a base font size of 15.
sns.set_style('whitegrid')
plt.rcParams.update({'font.size': 15})

In [2]:
# IMPORT + CLEAN DATA
# Read the summary table and, in one chain, keep only rows that are
# complete and whose correlation p-value is below 0.05.
df = (
    pd.read_csv("netlogo_output/df_sum.csv")
    .dropna()                            # can't have NaN's
    .loc[lambda frame: frame.p < 0.05]   # can't have insignificant correlations
)

# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,popsize,hostility,memsize,numskills,dist_cor,p
0,100,0.2,1,1,0.007378,0.004465384
1,100,0.2,1,21,0.029851,1.219621e-30
2,100,0.2,1,41,0.026136,7.196704e-24
3,100,0.2,1,61,0.085331,2.7659409999999998e-238
4,100,0.2,1,81,0.017161,3.748902e-11


In [3]:
# Simple OLS: dist_cor ~ const + popsize
# DV is the distance correlation column; IV is popsize.
y = df["dist_cor"]

# Intercept column added to the IV (author's note: R^2 drops if a constant is added)
X = sm.add_constant(df["popsize"])

# Fit the model and display its summary table
model = sm.OLS(endog=y, exog=X).fit()
model.summary()

0,1,2,3
Dep. Variable:,dist_cor,R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.014
Method:,Least Squares,F-statistic:,11.61
Date:,"Sat, 07 Nov 2020",Prob (F-statistic):,0.000692
Time:,13:56:02,Log-Likelihood:,143.5
No. Observations:,736,AIC:,-283.0
Df Residuals:,734,BIC:,-273.8
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3635,0.015,24.195,0.000,0.334,0.393
popsize,-8.872e-05,2.6e-05,-3.407,0.001,-0.000,-3.76e-05

0,1,2,3
Omnibus:,341.314,Durbin-Watson:,1.671
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45.918
Skew:,-0.223,Prob(JB):,1.07e-10
Kurtosis:,1.861,Cond. No.,1180.0


In [4]:
# Simple OLS: dist_cor ~ const + hostility
y = df["dist_cor"]
X = sm.add_constant(df["hostility"])  # IV plus an intercept column

model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(exog=X)  # fitted values for the same rows used to fit
model.summary()

0,1,2,3
Dep. Variable:,dist_cor,R-squared:,0.017
Model:,OLS,Adj. R-squared:,0.016
Method:,Least Squares,F-statistic:,12.66
Date:,"Sat, 07 Nov 2020",Prob (F-statistic):,0.000397
Time:,13:56:07,Log-Likelihood:,144.02
No. Observations:,736,AIC:,-284.0
Df Residuals:,734,BIC:,-274.8
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2807,0.013,21.587,0.000,0.255,0.306
hostility,0.0767,0.022,3.558,0.000,0.034,0.119

0,1,2,3
Omnibus:,607.422,Durbin-Watson:,1.677
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55.697
Skew:,-0.287,Prob(JB):,8.04e-13
Kurtosis:,1.78,Cond. No.,3.74


In [5]:
# Simple OLS: dist_cor ~ const + memsize
y = df["dist_cor"]
X = sm.add_constant(df["memsize"])  # IV plus an intercept column

model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(exog=X)  # fitted values for the same rows used to fit
model.summary()

0,1,2,3
Dep. Variable:,dist_cor,R-squared:,0.101
Model:,OLS,Adj. R-squared:,0.1
Method:,Least Squares,F-statistic:,82.42
Date:,"Sat, 07 Nov 2020",Prob (F-statistic):,1e-18
Time:,13:56:10,Log-Likelihood:,176.88
No. Observations:,736,AIC:,-349.8
Df Residuals:,734,BIC:,-340.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2242,0.013,17.834,0.000,0.200,0.249
memsize,0.0023,0.000,9.079,0.000,0.002,0.003

0,1,2,3
Omnibus:,66.158,Durbin-Watson:,1.863
Prob(Omnibus):,0.0,Jarque-Bera (JB):,35.368
Skew:,-0.378,Prob(JB):,2.09e-08
Kurtosis:,2.237,Cond. No.,90.0


In [11]:
# Simple OLS: dist_cor ~ const + numskills
y = df["dist_cor"]
X = sm.add_constant(df["numskills"])  # IV plus an intercept column

model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(exog=X)  # fitted values for the same rows used to fit
model.summary()

0,1,2,3
Dep. Variable:,dist_cor,R-squared:,0.361
Model:,OLS,Adj. R-squared:,0.36
Method:,Least Squares,F-statistic:,415.3
Date:,"Sun, 20 Sep 2020",Prob (F-statistic):,1.68e-73
Time:,13:38:11,Log-Likelihood:,302.72
No. Observations:,736,AIC:,-601.4
Df Residuals:,734,BIC:,-592.2
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1413,0.011,13.419,0.000,0.121,0.162
numskills,0.0043,0.000,20.378,0.000,0.004,0.005

0,1,2,3
Omnibus:,26.185,Durbin-Watson:,1.147
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14.224
Skew:,0.155,Prob(JB):,0.000815
Kurtosis:,2.394,Cond. No.,89.0


### Multiple Linear Regression

In [16]:
# Two-predictor OLS: dist_cor ~ const + numskills + memsize
y = df["dist_cor"]
X = sm.add_constant(df[["numskills", "memsize"]])  # both IVs plus an intercept column

# Fit and display the summary table
model = sm.OLS(endog=y, exog=X).fit()
model.summary()

0,1,2,3
Dep. Variable:,dist_cor,R-squared:,0.468
Model:,OLS,Adj. R-squared:,0.466
Method:,Least Squares,F-statistic:,322.1
Date:,"Sun, 20 Sep 2020",Prob (F-statistic):,4.16e-101
Time:,13:42:13,Log-Likelihood:,369.8
No. Observations:,736,AIC:,-733.6
Df Residuals:,733,BIC:,-719.8
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0428,0.013,3.394,0.001,0.018,0.068
numskills,0.0043,0.000,22.476,0.000,0.004,0.005
memsize,0.0023,0.000,12.107,0.000,0.002,0.003

0,1,2,3
Omnibus:,29.689,Durbin-Watson:,1.41
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16.512
Skew:,0.195,Prob(JB):,0.00026
Kurtosis:,2.379,Cond. No.,152.0


### Full model

In [24]:
# Full model: dist_cor regressed on all four independent variables at once.
# `IVs` is defined here explicitly so the cell runs on a fresh kernel; the
# list and its order match the predictors in this cell's saved summary table.
IVs = ["popsize", "hostility", "memsize", "numskills"]

X = df[IVs]
y = df["dist_cor"]
X = sm.add_constant(X)  # add the intercept column

# Fit ordinary least squares and display the summary table
model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,dist_cor,R-squared:,0.497
Model:,OLS,Adj. R-squared:,0.494
Method:,Least Squares,F-statistic:,180.6
Date:,"Sun, 20 Sep 2020",Prob (F-statistic):,1.46e-107
Time:,14:17:09,Log-Likelihood:,390.65
No. Observations:,736,AIC:,-771.3
Df Residuals:,731,BIC:,-748.3
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0509,0.017,2.943,0.003,0.017,0.085
popsize,-8.504e-05,1.87e-05,-4.559,0.000,-0.000,-4.84e-05
hostility,0.0722,0.015,4.668,0.000,0.042,0.103
memsize,0.0023,0.000,12.323,0.000,0.002,0.003
numskills,0.0043,0.000,23.049,0.000,0.004,0.005

0,1,2,3
Omnibus:,28.997,Durbin-Watson:,1.489
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13.951
Skew:,0.104,Prob(JB):,0.000935
Kurtosis:,2.358,Cond. No.,2170.0
