{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Regression\n", "Code used for regression analysis.\n", "The input for this script is `df_sum.csv` as explained in `data_wrangling.md`." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# DEPENDENCIES\n", "import itertools\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "import statsmodels.api as sm\n", "# pearsonr is taken from the public scipy.stats namespace;\n", "# scipy.stats.stats is a deprecated private module\n", "from scipy.stats import pearsonr\n", "from sklearn import preprocessing\n", "\n", "# PLOT STYLE (applies to every figure in the notebook)\n", "sns.set_style('whitegrid')\n", "plt.rcParams.update({'font.size': 15})" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
popsizehostilitymemsizenumskillsdist_corp
01000.2110.0073784.465384e-03
11000.21210.0298511.219621e-30
21000.21410.0261367.196704e-24
31000.21610.0853312.765941e-238
41000.21810.0171613.748902e-11
\n", "
" ], "text/plain": [ " popsize hostility memsize numskills dist_cor p\n", "0 100 0.2 1 1 0.007378 4.465384e-03\n", "1 100 0.2 1 21 0.029851 1.219621e-30\n", "2 100 0.2 1 41 0.026136 7.196704e-24\n", "3 100 0.2 1 61 0.085331 2.765941e-238\n", "4 100 0.2 1 81 0.017161 3.748902e-11" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# IMPORT DATA\n", "df = pd.read_csv(\"netlogo_output/df_sum.csv\")\n", "\n", "# CLEAN DATA\n", "df = df.dropna() # can't have NaN's\n", "df = df[df.p < 0.05] # can't have insignificant correlations\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
OLS Regression Results
Dep. Variable: dist_cor R-squared: 0.016
Model: OLS Adj. R-squared: 0.014
Method: Least Squares F-statistic: 11.61
Date: Sat, 07 Nov 2020 Prob (F-statistic): 0.000692
Time: 13:56:02 Log-Likelihood: 143.50
No. Observations: 736 AIC: -283.0
Df Residuals: 734 BIC: -273.8
Df Model: 1
Covariance Type: nonrobust
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err t P>|t| [0.025 0.975]
const 0.3635 0.015 24.195 0.000 0.334 0.393
popsize -8.872e-05 2.6e-05 -3.407 0.001 -0.000 -3.76e-05
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Omnibus: 341.314 Durbin-Watson: 1.671
Prob(Omnibus): 0.000 Jarque-Bera (JB): 45.918
Skew: -0.223 Prob(JB): 1.07e-10
Kurtosis: 1.861 Cond. No. 1.18e+03


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.18e+03. This might indicate that there are
strong multicollinearity or other numerical problems." ], "text/plain": [ "\n", "\"\"\"\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: dist_cor R-squared: 0.016\n", "Model: OLS Adj. R-squared: 0.014\n", "Method: Least Squares F-statistic: 11.61\n", "Date: Sat, 07 Nov 2020 Prob (F-statistic): 0.000692\n", "Time: 13:56:02 Log-Likelihood: 143.50\n", "No. Observations: 736 AIC: -283.0\n", "Df Residuals: 734 BIC: -273.8\n", "Df Model: 1 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const 0.3635 0.015 24.195 0.000 0.334 0.393\n", "popsize -8.872e-05 2.6e-05 -3.407 0.001 -0.000 -3.76e-05\n", "==============================================================================\n", "Omnibus: 341.314 Durbin-Watson: 1.671\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 45.918\n", "Skew: -0.223 Prob(JB): 1.07e-10\n", "Kurtosis: 1.861 Cond. No. 1.18e+03\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "[2] The condition number is large, 1.18e+03. 
This might indicate that there are\n", "strong multicollinearity or other numerical problems.\n", "\"\"\"" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get IV and DV\n", "X = df[\"popsize\"]\n", "y = df[\"dist_cor\"]\n", "\n", "# add an intercept column: without it the fit is forced through the origin and\n", "# statsmodels reports the uncentered R^2, so R^2 is lower once the constant is\n", "# included, but it is then the usual (centered) R^2\n", "X = sm.add_constant(X)\n", "\n", "# fit the model\n", "model = sm.OLS(y, X).fit()\n", "# predictions = model.predict(X)\n", "\n", "# show summary\n", "model.summary()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
OLS Regression Results
Dep. Variable: dist_cor R-squared: 0.017
Model: OLS Adj. R-squared: 0.016
Method: Least Squares F-statistic: 12.66
Date: Sat, 07 Nov 2020 Prob (F-statistic): 0.000397
Time: 13:56:07 Log-Likelihood: 144.02
No. Observations: 736 AIC: -284.0
Df Residuals: 734 BIC: -274.8
Df Model: 1
Covariance Type: nonrobust
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err t P>|t| [0.025 0.975]
const 0.2807 0.013 21.587 0.000 0.255 0.306
hostility 0.0767 0.022 3.558 0.000 0.034 0.119
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Omnibus: 607.422 Durbin-Watson: 1.677
Prob(Omnibus): 0.000 Jarque-Bera (JB): 55.697
Skew: -0.287 Prob(JB): 8.04e-13
Kurtosis: 1.780 Cond. No. 3.74


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified." ], "text/plain": [ "\n", "\"\"\"\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: dist_cor R-squared: 0.017\n", "Model: OLS Adj. R-squared: 0.016\n", "Method: Least Squares F-statistic: 12.66\n", "Date: Sat, 07 Nov 2020 Prob (F-statistic): 0.000397\n", "Time: 13:56:07 Log-Likelihood: 144.02\n", "No. Observations: 736 AIC: -284.0\n", "Df Residuals: 734 BIC: -274.8\n", "Df Model: 1 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const 0.2807 0.013 21.587 0.000 0.255 0.306\n", "hostility 0.0767 0.022 3.558 0.000 0.034 0.119\n", "==============================================================================\n", "Omnibus: 607.422 Durbin-Watson: 1.677\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 55.697\n", "Skew: -0.287 Prob(JB): 8.04e-13\n", "Kurtosis: 1.780 Cond. No. 3.74\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "\"\"\"" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = df[\"hostility\"]\n", "y = df[\"dist_cor\"]\n", "X = sm.add_constant(X)\n", "model = sm.OLS(y, X).fit()\n", "predictions = model.predict(X)\n", "model.summary()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
OLS Regression Results
Dep. Variable: dist_cor R-squared: 0.101
Model: OLS Adj. R-squared: 0.100
Method: Least Squares F-statistic: 82.42
Date: Sat, 07 Nov 2020 Prob (F-statistic): 1.00e-18
Time: 13:56:10 Log-Likelihood: 176.88
No. Observations: 736 AIC: -349.8
Df Residuals: 734 BIC: -340.6
Df Model: 1
Covariance Type: nonrobust
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err t P>|t| [0.025 0.975]
const 0.2242 0.013 17.834 0.000 0.200 0.249
memsize 0.0023 0.000 9.079 0.000 0.002 0.003
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Omnibus: 66.158 Durbin-Watson: 1.863
Prob(Omnibus): 0.000 Jarque-Bera (JB): 35.368
Skew: -0.378 Prob(JB): 2.09e-08
Kurtosis: 2.237 Cond. No. 90.0


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified." ], "text/plain": [ "\n", "\"\"\"\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: dist_cor R-squared: 0.101\n", "Model: OLS Adj. R-squared: 0.100\n", "Method: Least Squares F-statistic: 82.42\n", "Date: Sat, 07 Nov 2020 Prob (F-statistic): 1.00e-18\n", "Time: 13:56:10 Log-Likelihood: 176.88\n", "No. Observations: 736 AIC: -349.8\n", "Df Residuals: 734 BIC: -340.6\n", "Df Model: 1 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const 0.2242 0.013 17.834 0.000 0.200 0.249\n", "memsize 0.0023 0.000 9.079 0.000 0.002 0.003\n", "==============================================================================\n", "Omnibus: 66.158 Durbin-Watson: 1.863\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 35.368\n", "Skew: -0.378 Prob(JB): 2.09e-08\n", "Kurtosis: 2.237 Cond. No. 90.0\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "\"\"\"" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = df[\"memsize\"]\n", "y = df[\"dist_cor\"]\n", "X = sm.add_constant(X)\n", "model = sm.OLS(y, X).fit()\n", "predictions = model.predict(X)\n", "model.summary()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
OLS Regression Results
Dep. Variable: dist_cor R-squared: 0.361
Model: OLS Adj. R-squared: 0.360
Method: Least Squares F-statistic: 415.3
Date: Sun, 20 Sep 2020 Prob (F-statistic): 1.68e-73
Time: 13:38:11 Log-Likelihood: 302.72
No. Observations: 736 AIC: -601.4
Df Residuals: 734 BIC: -592.2
Df Model: 1
Covariance Type: nonrobust
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err t P>|t| [0.025 0.975]
const 0.1413 0.011 13.419 0.000 0.121 0.162
numskills 0.0043 0.000 20.378 0.000 0.004 0.005
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Omnibus: 26.185 Durbin-Watson: 1.147
Prob(Omnibus): 0.000 Jarque-Bera (JB): 14.224
Skew: 0.155 Prob(JB): 0.000815
Kurtosis: 2.394 Cond. No. 89.0


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified." ], "text/plain": [ "\n", "\"\"\"\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: dist_cor R-squared: 0.361\n", "Model: OLS Adj. R-squared: 0.360\n", "Method: Least Squares F-statistic: 415.3\n", "Date: Sun, 20 Sep 2020 Prob (F-statistic): 1.68e-73\n", "Time: 13:38:11 Log-Likelihood: 302.72\n", "No. Observations: 736 AIC: -601.4\n", "Df Residuals: 734 BIC: -592.2\n", "Df Model: 1 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const 0.1413 0.011 13.419 0.000 0.121 0.162\n", "numskills 0.0043 0.000 20.378 0.000 0.004 0.005\n", "==============================================================================\n", "Omnibus: 26.185 Durbin-Watson: 1.147\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 14.224\n", "Skew: 0.155 Prob(JB): 0.000815\n", "Kurtosis: 2.394 Cond. No. 
89.0\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "\"\"\"" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = df[\"numskills\"]\n", "y = df[\"dist_cor\"]\n", "X = sm.add_constant(X)\n", "model = sm.OLS(y, X).fit()\n", "predictions = model.predict(X)\n", "model.summary()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Multiple Linear Regression" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
OLS Regression Results
Dep. Variable: dist_cor R-squared: 0.468
Model: OLS Adj. R-squared: 0.466
Method: Least Squares F-statistic: 322.1
Date: Sun, 20 Sep 2020 Prob (F-statistic): 4.16e-101
Time: 13:42:13 Log-Likelihood: 369.80
No. Observations: 736 AIC: -733.6
Df Residuals: 733 BIC: -719.8
Df Model: 2
Covariance Type: nonrobust
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err t P>|t| [0.025 0.975]
const 0.0428 0.013 3.394 0.001 0.018 0.068
numskills 0.0043 0.000 22.476 0.000 0.004 0.005
memsize 0.0023 0.000 12.107 0.000 0.002 0.003
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Omnibus: 29.689 Durbin-Watson: 1.410
Prob(Omnibus): 0.000 Jarque-Bera (JB): 16.512
Skew: 0.195 Prob(JB): 0.000260
Kurtosis: 2.379 Cond. No. 152.


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified." ], "text/plain": [ "\n", "\"\"\"\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: dist_cor R-squared: 0.468\n", "Model: OLS Adj. R-squared: 0.466\n", "Method: Least Squares F-statistic: 322.1\n", "Date: Sun, 20 Sep 2020 Prob (F-statistic): 4.16e-101\n", "Time: 13:42:13 Log-Likelihood: 369.80\n", "No. Observations: 736 AIC: -733.6\n", "Df Residuals: 733 BIC: -719.8\n", "Df Model: 2 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const 0.0428 0.013 3.394 0.001 0.018 0.068\n", "numskills 0.0043 0.000 22.476 0.000 0.004 0.005\n", "memsize 0.0023 0.000 12.107 0.000 0.002 0.003\n", "==============================================================================\n", "Omnibus: 29.689 Durbin-Watson: 1.410\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 16.512\n", "Skew: 0.195 Prob(JB): 0.000260\n", "Kurtosis: 2.379 Cond. No. 
152.\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "\"\"\"" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = df[[\"numskills\", \"memsize\"]]\n", "y = df[\"dist_cor\"]\n", "X = sm.add_constant(X)\n", "\n", "model = sm.OLS(y, X).fit()\n", "model.summary()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Full model" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
OLS Regression Results
Dep. Variable: dist_cor R-squared: 0.497
Model: OLS Adj. R-squared: 0.494
Method: Least Squares F-statistic: 180.6
Date: Sun, 20 Sep 2020 Prob (F-statistic): 1.46e-107
Time: 14:17:09 Log-Likelihood: 390.65
No. Observations: 736 AIC: -771.3
Df Residuals: 731 BIC: -748.3
Df Model: 4
Covariance Type: nonrobust
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err t P>|t| [0.025 0.975]
const 0.0509 0.017 2.943 0.003 0.017 0.085
popsize -8.504e-05 1.87e-05 -4.559 0.000 -0.000 -4.84e-05
hostility 0.0722 0.015 4.668 0.000 0.042 0.103
memsize 0.0023 0.000 12.323 0.000 0.002 0.003
numskills 0.0043 0.000 23.049 0.000 0.004 0.005
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Omnibus: 28.997 Durbin-Watson: 1.489
Prob(Omnibus): 0.000 Jarque-Bera (JB): 13.951
Skew: 0.104 Prob(JB): 0.000935
Kurtosis: 2.358 Cond. No. 2.17e+03


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.17e+03. This might indicate that there are
strong multicollinearity or other numerical problems." ], "text/plain": [ "\n", "\"\"\"\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: dist_cor R-squared: 0.497\n", "Model: OLS Adj. R-squared: 0.494\n", "Method: Least Squares F-statistic: 180.6\n", "Date: Sun, 20 Sep 2020 Prob (F-statistic): 1.46e-107\n", "Time: 14:17:09 Log-Likelihood: 390.65\n", "No. Observations: 736 AIC: -771.3\n", "Df Residuals: 731 BIC: -748.3\n", "Df Model: 4 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const 0.0509 0.017 2.943 0.003 0.017 0.085\n", "popsize -8.504e-05 1.87e-05 -4.559 0.000 -0.000 -4.84e-05\n", "hostility 0.0722 0.015 4.668 0.000 0.042 0.103\n", "memsize 0.0023 0.000 12.323 0.000 0.002 0.003\n", "numskills 0.0043 0.000 23.049 0.000 0.004 0.005\n", "==============================================================================\n", "Omnibus: 28.997 Durbin-Watson: 1.489\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 13.951\n", "Skew: 0.104 Prob(JB): 0.000935\n", "Kurtosis: 2.358 Cond. No. 2.17e+03\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "[2] The condition number is large, 2.17e+03. 
This might indicate that there are\n", "strong multicollinearity or other numerical problems.\n", "\"\"\"" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Independent variables of the full model. Defined in this cell because the\n", "# name is not set anywhere else in the notebook (NameError on a fresh kernel).\n", "# The order matches the coefficient table of the stored output.\n", "IVs = [\"popsize\", \"hostility\", \"memsize\", \"numskills\"]\n", "\n", "X = df[IVs]\n", "y = df[\"dist_cor\"]\n", "\n", "# add the intercept column so the reported R^2 is the centered one\n", "X = sm.add_constant(X)\n", "\n", "model = sm.OLS(y, X).fit()\n", "model.summary()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "@webio": { "lastCommId": null, "lastKernelId": null }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }