import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import netCDF4 as nc
import pandas as pd
import os
import csv
from glob import glob
import xarray as xr
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import re
from datetime import datetime
# Step 1: Data Preparation
# Load your dataset into a pandas DataFrame
os.chdir('/Users/chenchenren/postdoc/paper/2N and water-US/regression/')
excel_file = "./soybean_reg_data.csv"
# Read the specific sheet from the Excel file
ds = pd.read_csv(excel_file)
df_soybean = ds
print(df_soybean)
geoid year tmp pre irrigation1 lnyield lnfer \ 0 1001.0 2009.0 24.140660 8.808839 0.718851 7.609614 2.022537 1 1001.0 2010.0 25.675484 5.438990 0.830087 7.035731 1.910005 2 1001.0 2011.0 24.544823 5.270162 0.595086 7.466514 2.025641 3 1001.0 2013.0 24.036592 7.514143 0.001241 7.926855 2.906756 4 1003.0 2008.0 24.727230 8.675528 0.001980 7.934111 1.240602 ... ... ... ... ... ... ... ... 17577 55141.0 2016.0 17.486904 7.304833 0.000000 8.134343 2.860158 17578 55141.0 2017.0 16.590237 6.467205 0.000000 7.978877 2.633171 17579 55141.0 2018.0 17.266659 8.373576 0.000000 7.838077 2.639034 17580 55141.0 2019.0 16.238592 8.735889 0.000000 7.912185 2.671287 17581 55141.0 2020.0 16.202635 6.832954 0.000000 8.173033 2.625827 group zone 0 1 5.0 1 1 5.0 2 1 5.0 3 1 5.0 4 1 5.0 ... ... ... 17577 2 1.0 17578 2 1.0 17579 2 1.0 17580 2 1.0 17581 2 1.0 [17582 rows x 9 columns]
# Filter data for irrigated soybean
data = df_soybean[df_soybean['group'] == 1]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['irrigation12'] = data['irrigation1'] * data['irrigation1'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1256, but rank is 7 warnings.warn('covariance of constraints does not have full '
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.709 Model: OLS Adj. R-squared: 0.651 Within R-squared: 0.243 Method: Least Squares F-statistic: 21038509288831.2 Date: Thu, 29 Aug 2024 Prob (F-statistic): 0.00e+00 Time: 14:41:13 Log-Likelihood: 3156.68 No. Observations: 7507 AIC: -3799.4 Df Residuals: 6250 BIC: 4903.6 Df Model: 1256 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 0.800648 0.388248 2.062206 3.918814e-02 0.039695 1.561600 lnfer 0.130809 0.008856 14.769889 2.290910e-49 0.113451 0.148168 irrigation1 -0.019336 0.017419 -1.110080 2.669645e-01 -0.053476 0.014804 irrigation12 0.003916 0.001987 1.970166 4.881935e-02 0.000020 0.007811 tmp 0.544660 0.036766 14.814312 1.183961e-49 0.472600 0.616719 tmp_tmp_interaction -0.012964 0.000877 -14.789376 1.715359e-49 -0.014682 -0.011246 pre 0.224017 0.016294 13.748319 5.212636e-43 0.192081 0.255953 pre_pre_interaction -0.013249 0.001157 -11.451744 2.304610e-30 -0.015517 -0.010981
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp = model.params['tmp']
coeff_tmp_tmp_interaction = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range = np.linspace(data['tmp'].min(), data['tmp'].max(), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp = coeff_tmp * tmp_range + coeff_tmp_tmp_interaction * (tmp_range ** 2)-5.5
# Calculate the standard errors and confidence intervals
# Get the covariance matrix of the parameters
cov_matrix = model.cov_params()
# Calculate the variance of the prediction
pred_var = (cov_matrix.loc['tmp', 'tmp'] * tmp_range**2 +
2 * cov_matrix.loc['tmp', 'tmp_tmp_interaction'] * tmp_range**3 +
cov_matrix.loc['tmp_tmp_interaction', 'tmp_tmp_interaction'] * tmp_range**4)
# Calculate the standard error
pred_se = np.sqrt(pred_var)
# Calculate the 95% confidence interval
ci_lower_tmp = lnyield_pred_tmp - 1.96 * pred_se
ci_upper_tmp = lnyield_pred_tmp + 1.96 * pred_se
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range, lnyield_pred_tmp, label='Predicted lnyield', color='blue')
plt.fill_between(tmp_range, ci_lower_tmp, ci_upper_tmp, color='blue', alpha=0.2, label='95% CI')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre = model.params['pre']
coeff_pre_pre_interaction = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range = np.linspace(np.percentile(data['pre'],1), np.percentile(data['pre'],99), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre = coeff_pre * pre_range + coeff_pre_pre_interaction * (pre_range ** 2)-0.8
# Calculate the standard errors and confidence intervals
# Get the covariance matrix of the parameters
cov_matrix = model.cov_params()
# Calculate the variance of the prediction
pred_var = (cov_matrix.loc['pre', 'pre'] * pre_range**2 +
2 * cov_matrix.loc['pre', 'pre_pre_interaction'] * pre_range**3 +
cov_matrix.loc['pre_pre_interaction', 'pre_pre_interaction'] * pre_range**4)
# Calculate the standard error
pred_se = np.sqrt(pred_var)
# Calculate the 95% confidence interval
ci_lower_pre = lnyield_pred_pre - 1.96 * pred_se
ci_upper_pre = lnyield_pred_pre + 1.96 * pred_se
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range, lnyield_pred_pre, label='Predicted lnyield', color='blue')
plt.fill_between(pre_range, ci_lower_pre, ci_upper_pre, color='blue', alpha=0.2, label='95% CI')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
# Filter data for rainfed soybean
data = df_soybean[df_soybean['group'] == 2]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/106284357.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/106284357.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_interaction'] = data['lnfer'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/106284357.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/106284357.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/106284357.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/106284357.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1583, but rank is 5 warnings.warn('covariance of constraints does not have full '
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.708 Model: OLS Adj. R-squared: 0.654 Within R-squared: 0.233 Method: Least Squares F-statistic: -22135224246007.9 Date: Thu, 29 Aug 2024 Prob (F-statistic): 1.00e+00 Time: 14:41:27 Log-Likelihood: 5039.48 No. Observations: 10075 AIC: -6911.0 Df Residuals: 8491 BIC: 4522.0 Df Model: 1583 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 3.566492 0.220815 16.151458 1.109087e-58 3.133701 3.999282 lnfer 0.127607 0.006789 18.797386 7.933075e-79 0.114301 0.140912 tmp 0.340510 0.024217 14.060477 6.643677e-45 0.293044 0.387975 tmp_tmp_interaction -0.007994 0.000639 -12.519552 5.836378e-36 -0.009246 -0.006743 pre 0.177062 0.013533 13.083432 4.095265e-39 0.150537 0.203587 pre_pre_interaction -0.010534 0.000976 -10.791532 3.774445e-27 -0.012447 -0.008620
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp = model.params['tmp']
coeff_tmp_tmp_interaction = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd = np.linspace(data['tmp'].min(), data['tmp'].max(), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd = coeff_tmp * tmp_range_rfd + coeff_tmp_tmp_interaction * (tmp_range_rfd ** 2)-3.1
# Calculate the standard errors and confidence intervals
# Get the covariance matrix of the parameters
cov_matrix = model.cov_params()
# Calculate the variance of the prediction
pred_var = (cov_matrix.loc['tmp', 'tmp'] * tmp_range_rfd**2 +
2 * cov_matrix.loc['tmp', 'tmp_tmp_interaction'] * tmp_range_rfd**3 +
cov_matrix.loc['tmp_tmp_interaction', 'tmp_tmp_interaction'] * tmp_range_rfd**4)
# Calculate the standard error
pred_se = np.sqrt(pred_var)
# Calculate the 95% confidence interval
ci_lower_tmp_rfd = lnyield_pred_tmp_rfd - 1.96 * pred_se
ci_upper_tmp_rfd = lnyield_pred_tmp_rfd + 1.96 * pred_se
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd, lnyield_pred_tmp_rfd, label='Predicted lnyield', color='blue')
plt.fill_between(tmp_range_rfd, ci_lower_tmp_rfd, ci_upper_tmp_rfd, color='blue', alpha=0.2, label='95% CI')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre = model.params['pre']
coeff_pre_pre_interaction = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd = np.linspace(np.percentile(data['pre'],1), np.percentile(data['pre'],99), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd = coeff_pre * pre_range_rfd + coeff_pre_pre_interaction * (pre_range_rfd ** 2)-0.6
# Calculate the standard errors and confidence intervals
# Get the covariance matrix of the parameters
cov_matrix = model.cov_params()
# Calculate the variance of the prediction
pred_var = (cov_matrix.loc['pre', 'pre'] * pre_range_rfd**2 +
2 * cov_matrix.loc['pre', 'pre_pre_interaction'] * pre_range_rfd**3 +
cov_matrix.loc['pre_pre_interaction', 'pre_pre_interaction'] * pre_range_rfd**4)
# Calculate the standard error
pred_se = np.sqrt(pred_var)
# Calculate the 95% confidence interval
ci_lower_pre_rfd = lnyield_pred_pre_rfd - 1.96 * pred_se
ci_upper_pre_rfd = lnyield_pred_pre_rfd + 1.96 * pred_se
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd, lnyield_pred_pre_rfd, label='Predicted lnyield', color='blue')
plt.fill_between(pre_range_rfd, ci_lower_pre_rfd, ci_upper_pre_rfd, color='blue', alpha=0.2, label='95% CI')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 1)]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.729 Model: OLS Adj. R-squared: 0.677 Within R-squared: 0.213 Method: Least Squares F-statistic: 46.2 Date: Thu, 29 Aug 2024 Prob (F-statistic): 6.92e-34 Time: 14:41:27 Log-Likelihood: 635.36 No. Observations: 949 AIC: -964.7 Df Residuals: 796 BIC: -221.8 Df Model: 152 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 2.772323 1.411778 1.963710 0.049564 0.005289 5.539358 lnfer 0.071517 0.016528 4.326987 0.000015 0.039123 0.103912 irrigation1 -0.066475 0.175892 -0.377929 0.705483 -0.411217 0.278267 irrigation12 0.066211 0.122418 0.540861 0.588603 -0.173724 0.306146 tmp 0.475086 0.159011 2.987752 0.002810 0.163430 0.786742 tmp_tmp_interaction -0.012017 0.004551 -2.640476 0.008279 -0.020937 -0.003097 pre 0.161312 0.041595 3.878162 0.000105 0.079787 0.242837 pre_pre_interaction -0.010554 0.003401 -3.102931 0.001916 -0.017221 -0.003888
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['irrigation12'] = data['irrigation1'] * data['irrigation1'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 152, but rank is 7 warnings.warn('covariance of constraints does not have full '
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp1 = model.params['tmp']
coeff_tmp_tmp_interaction1 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range1 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp1 = coeff_tmp1 * tmp_range1 + coeff_tmp_tmp_interaction1 * (tmp_range1 ** 2)-4.5
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range1, lnyield_pred_tmp1, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre1 = model.params['pre']
coeff_pre_pre_interaction1 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range1 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre1 = coeff_pre1 * pre_range1 + coeff_pre_pre_interaction1 * (pre_range1 ** 2)-0.57
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range1, lnyield_pred_pre1, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 2)]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['irrigation12'] = data['irrigation1'] * data['irrigation1'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 305, but rank is 7 warnings.warn('covariance of constraints does not have full '
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.715 Model: OLS Adj. R-squared: 0.647 Within R-squared: 0.310 Method: Least Squares F-statistic: 239346187613.5 Date: Thu, 29 Aug 2024 Prob (F-statistic): 0.00e+00 Time: 14:41:28 Log-Likelihood: 1205.19 No. Observations: 1585 AIC: -1798.4 Df Residuals: 1279 BIC: -155.7 Df Model: 305 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 2.799067 1.206685 2.319634 2.036067e-02 0.434009 5.164126 lnfer 0.062591 0.017941 3.488669 4.854314e-04 0.027427 0.097756 irrigation1 -0.157767 0.144642 -1.090741 2.753871e-01 -0.441261 0.125727 irrigation12 0.119411 0.104036 1.147788 2.510560e-01 -0.084495 0.323317 tmp 0.349841 0.125661 2.784010 5.369134e-03 0.103550 0.596131 tmp_tmp_interaction -0.007060 0.003242 -2.177250 2.946190e-02 -0.013415 -0.000705 pre 0.400009 0.048965 8.169204 3.104301e-16 0.304038 0.495979 pre_pre_interaction -0.029878 0.003767 -7.930857 2.176382e-15 -0.037261 -0.022494
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp2 = model.params['tmp']
coeff_tmp_tmp_interaction2 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range2 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp2 = coeff_tmp2 * tmp_range2 + coeff_tmp_tmp_interaction2 * (tmp_range2 ** 2)-4
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range2, lnyield_pred_tmp2, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre2 = model.params['pre']
coeff_pre_pre_interaction2 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range2 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre2 = coeff_pre2 * pre_range2 + coeff_pre_pre_interaction2 * (pre_range2 ** 2)-1.2
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range2, lnyield_pred_pre2, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 3)]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['irrigation12'] = data['irrigation1'] * data['irrigation1'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 342, but rank is 7 warnings.warn('covariance of constraints does not have full '
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.743 Model: OLS Adj. R-squared: 0.682 Within R-squared: 0.377 Method: Least Squares F-statistic: -8850742018281.0 Date: Thu, 29 Aug 2024 Prob (F-statistic): 1.00e+00 Time: 14:44:21 Log-Likelihood: 809.10 No. Observations: 1772 AIC: -932.2 Df Residuals: 1429 BIC: 947.4 Df Model: 342 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 2.474073 1.261171 1.961727 4.979423e-02 0.002224 4.945922 lnfer 0.135249 0.018715 7.226722 4.947932e-13 0.098568 0.171930 irrigation1 -0.002444 0.105756 -0.023113 9.815603e-01 -0.209722 0.204833 irrigation12 0.002774 0.014578 0.190293 8.490792e-01 -0.025798 0.031346 tmp 0.417687 0.120396 3.469277 5.218615e-04 0.181715 0.653659 tmp_tmp_interaction -0.009835 0.002872 -3.424827 6.151920e-04 -0.015463 -0.004207 pre 0.311809 0.034264 9.100094 9.025372e-20 0.244652 0.378966 pre_pre_interaction -0.018173 0.002632 -6.905689 4.996047e-12 -0.023331 -0.013015
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp3 = model.params['tmp']
coeff_tmp_tmp_interaction3 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range3 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp3 = coeff_tmp3 * tmp_range3 + coeff_tmp_tmp_interaction3 * (tmp_range3 ** 2)-4.3
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range3, lnyield_pred_tmp3, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre3 = model.params['pre']
coeff_pre_pre_interaction3 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range3 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre3 = coeff_pre3 * pre_range3 + coeff_pre_pre_interaction3 * (pre_range3 ** 2)-1.15
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range3, lnyield_pred_pre3, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 4)]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['irrigation12'] = data['irrigation1'] * data['irrigation1'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 336, but rank is 7 warnings.warn('covariance of constraints does not have full '
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.595 Model: OLS Adj. R-squared: 0.527 Within R-squared: 0.238 Method: Least Squares F-statistic: -10494853094051.4 Date: Thu, 29 Aug 2024 Prob (F-statistic): 1.00e+00 Time: 14:44:21 Log-Likelihood: 811.28 No. Observations: 2343 AIC: -948.6 Df Residuals: 2006 BIC: 992.3 Df Model: 336 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept -2.123577 2.011294 -1.055826 2.910477e-01 -6.065642 1.818488 lnfer 0.125002 0.015565 8.031049 9.664307e-16 0.094496 0.155509 irrigation1 -0.010905 0.024445 -0.446115 6.555139e-01 -0.058817 0.037006 irrigation12 0.002964 0.002623 1.130108 2.584308e-01 -0.002177 0.008105 tmp 0.760151 0.179720 4.229639 2.340663e-05 0.407906 1.112396 tmp_tmp_interaction -0.017714 0.003993 -4.436234 9.154629e-06 -0.025540 -0.009888 pre 0.334204 0.034323 9.737126 2.093934e-22 0.266933 0.401475 pre_pre_interaction -0.019695 0.002210 -8.911486 5.035321e-19 -0.024027 -0.015363
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp4 = model.params['tmp']
coeff_tmp_tmp_interaction4 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range4 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp4 = coeff_tmp4 * tmp_range4 + coeff_tmp_tmp_interaction4 * (tmp_range4 ** 2)-8
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range4, lnyield_pred_tmp4, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre4 = model.params['pre']
coeff_pre_pre_interaction4 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range4 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre4 = coeff_pre4 * pre_range4 + coeff_pre_pre_interaction4 * (pre_range4 ** 2)-1.25
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range4, lnyield_pred_pre4, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 5)]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.529 Model: OLS Adj. R-squared: 0.433 Within R-squared: 0.219 Method: Least Squares F-statistic: 190.6 Date: Thu, 29 Aug 2024 Prob (F-statistic): 1.16e-67 Time: 14:44:21 Log-Likelihood: 147.76 No. Observations: 858 AIC: -3.5 Df Residuals: 712 BIC: 690.7 Df Model: 145 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept -0.238525 9.524595 -0.025043 9.800207e-01 -18.906388 18.429339 lnfer 0.209968 0.023900 8.785335 1.558992e-18 0.163125 0.256811 irrigation1 0.015060 0.028786 0.523151 6.008695e-01 -0.041361 0.071480 irrigation12 0.003037 0.003670 0.827398 4.080113e-01 -0.004156 0.010230 tmp 0.611501 0.773902 0.790153 4.294383e-01 -0.905319 2.128322 tmp_tmp_interaction -0.014099 0.015590 -0.904340 3.658151e-01 -0.044656 0.016458 pre 0.191364 0.060136 3.182184 1.461687e-03 0.073499 0.309228 pre_pre_interaction -0.010960 0.003802 -2.882722 3.942549e-03 -0.018413 -0.003508
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['irrigation12'] = data['irrigation1'] * data['irrigation1'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 145, but rank is 7 warnings.warn('covariance of constraints does not have full '
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp5 = model.params['tmp']
coeff_tmp_tmp_interaction5 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range5 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp5 = coeff_tmp5 * tmp_range5 + coeff_tmp_tmp_interaction5 * (tmp_range5 ** 2)-6.5
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range5, lnyield_pred_tmp5, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre5 = model.params['pre']
coeff_pre_pre_interaction5 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range5 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre5 = coeff_pre5 * pre_range5 + coeff_pre_pre_interaction5 * (pre_range5 ** 2)-0.65
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range5, lnyield_pred_pre5, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
# Import the necessary library for setting fonts
from matplotlib import rcParams
# Set font to Arial and size to 12
rcParams['font.family'] = 'Arial'
rcParams['font.size'] = 16
# Define a list of (tmp_range, lnyield_pred) tuples for the five models
models = [(tmp_range1, lnyield_pred_tmp1),
(tmp_range2, lnyield_pred_tmp2),
(tmp_range3, lnyield_pred_tmp3),
(tmp_range4, lnyield_pred_tmp4),
(tmp_range5, lnyield_pred_tmp5)]
# Plotting the results
plt.figure(figsize=(4, 4))
# Plot the first model (One model)
plt.plot(tmp_range, lnyield_pred_tmp, label='One model', color='blue', linewidth=4)
plt.fill_between(tmp_range, ci_lower_tmp, ci_upper_tmp, color='blue', alpha=0.2, label='95% CI')
# Plot the five models
for idx, (tmp_range_, lnyield_pred_) in enumerate(models):
if idx == 0:
plt.plot(tmp_range_, lnyield_pred_, color='green', label='Five models')
else:
plt.plot(tmp_range_, lnyield_pred_, color='green')
# Add labels and title
plt.xlabel('Temperature (°C)',fontsize=18, fontfamily='Arial')
plt.ylabel('Ln Yield (kg/ha/yr)', fontsize=18, fontfamily='Arial')
x_ticks = np.linspace(tmp_range.min(), tmp_range.max(), num=5) # Custom X tick positions
y_ticks = np.linspace(-2, 1.2, num=4) # Custom Y tick positions
plt.xticks(ticks=x_ticks, labels=[f'{x:.1f}' for x in x_ticks]) # Custom X tick labels
plt.yticks(ticks=y_ticks, labels=[f'{y:.1f}' for y in y_ticks]) # Custom Y tick labels
plt.tick_params(axis='both', which='major', labelsize=16) # Adjust tick label size
# Save the entire figure as a JPG with DPI 300
plt.savefig('soybean_irr_tem.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)
# Show the plot
plt.show()
# Set font to Arial and size to 12
rcParams['font.family'] = 'Arial'
rcParams['font.size'] = 16
# Define a list of (pre_range, lnyield_pred) tuples for the five models
models = [(pre_range1, lnyield_pred_pre1),
(pre_range2, lnyield_pred_pre2),
(pre_range3, lnyield_pred_pre3),
(pre_range4, lnyield_pred_pre4),
(pre_range5, lnyield_pred_pre5)]
# Plotting the results
plt.figure(figsize=(4, 4))
# Plot the first model (One model)
plt.plot(pre_range, lnyield_pred_pre, label='One model', color='blue', linewidth=4)
plt.fill_between(pre_range, ci_lower_pre, ci_upper_pre, color='blue', alpha=0.2, label='95% CI')
# Plot the five models
for idx, (pre_range_, lnyield_pred_) in enumerate(models):
if idx == 0:
plt.plot(pre_range_, lnyield_pred_, color='green', label='Five models')
else:
plt.plot(pre_range_, lnyield_pred_, color='green')
# Add labels and title
plt.xlabel('Precipitation (100mm)',fontsize=18, fontfamily='Arial')
plt.ylabel('Ln Yield (kg/ha/yr)', fontsize=18, fontfamily='Arial')
x_ticks = np.linspace(2, 10, num=4) # Custom X tick positions
y_ticks = np.linspace(-0.4, 0.4, num=4) # Custom Y tick positions
plt.xticks(ticks=x_ticks, labels=[f'{x:.1f}' for x in x_ticks]) # Custom X tick labels
plt.yticks(ticks=y_ticks, labels=[f'{y:.1f}' for y in y_ticks]) # Custom Y tick labels
plt.tick_params(axis='both', which='major', labelsize=16) # Adjust tick label size
# Save the entire figure as a JPG with DPI 300
plt.savefig('soybean_irr_pre.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)
# Show the plot
plt.show()
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 1))]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.755 Model: OLS Adj. R-squared: 0.712 Within R-squared: 0.223 Method: Least Squares F-statistic: -24926737835298.1 Date: Thu, 29 Aug 2024 Prob (F-statistic): 1.00e+00 Time: 14:45:39 Log-Likelihood: 1144.73 No. Observations: 1777 AIC: -1767.5 Df Residuals: 1516 BIC: -336.5 Df Model: 260 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 5.043071 0.583724 8.639476 5.647164e-18 3.898992 6.187149 lnfer 0.050117 0.014981 3.345461 8.214596e-04 0.020756 0.079479 tmp 0.247224 0.066761 3.703113 2.129701e-04 0.116375 0.378074 tmp_tmp_interaction -0.004787 0.001930 -2.479632 1.315180e-02 -0.008570 -0.001003 pre 0.006076 0.022567 0.269250 7.877370e-01 -0.038154 0.050306 pre_pre_interaction 0.001494 0.001792 0.833344 4.046505e-01 -0.002019 0.005007
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2623999164.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2623999164.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_interaction'] = data['lnfer'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2623999164.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2623999164.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2623999164.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2623999164.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 260, but rank is 5 warnings.warn('covariance of constraints does not have full '
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd1 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd1 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd1 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd1 = coeff_tmp_rfd1 * tmp_range_rfd1 + coeff_tmp_tmp_interaction_rfd1 * (tmp_range_rfd1 ** 2)-2.35
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd1, lnyield_pred_tmp_rfd1, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd1 = model.params['pre']
coeff_pre_pre_interaction_rfd1 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd1 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd1 = coeff_pre_rfd1 * pre_range_rfd1 + coeff_pre_pre_interaction_rfd1 * (pre_range_rfd1 ** 2)
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd1, lnyield_pred_pre_rfd1, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 2))]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3799461747.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3799461747.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_interaction'] = data['lnfer'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3799461747.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3799461747.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3799461747.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3799461747.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 414, but rank is 5 warnings.warn('covariance of constraints does not have full '
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.645 Model: OLS Adj. R-squared: 0.590 Within R-squared: 0.263 Method: Least Squares F-statistic: 65260093579113.6 Date: Thu, 29 Aug 2024 Prob (F-statistic): 0.00e+00 Time: 14:45:39 Log-Likelihood: 2350.19 No. Observations: 3099 AIC: -3870.4 Df Residuals: 2684 BIC: -1364.3 Df Model: 414 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 4.662776 0.517787 9.005192 2.152887e-19 3.647931 5.677620 lnfer 0.071498 0.008279 8.636499 5.796193e-18 0.055272 0.087724 tmp 0.255181 0.054923 4.646170 3.381546e-06 0.147534 0.362828 tmp_tmp_interaction -0.005416 0.001451 -3.732438 1.896356e-04 -0.008260 -0.002572 pre 0.189788 0.020793 9.127347 7.019874e-20 0.149034 0.230542 pre_pre_interaction -0.012937 0.001627 -7.949341 1.875062e-15 -0.016126 -0.009747
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd2 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd2 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd2 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd2 = coeff_tmp_rfd2 * tmp_range_rfd2 + coeff_tmp_tmp_interaction_rfd2 * (tmp_range_rfd2 ** 2)-2.4
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd2, lnyield_pred_tmp_rfd2, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd2 = model.params['pre']
coeff_pre_pre_interaction_rfd2 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd2 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd2 = coeff_pre_rfd2 * pre_range_rfd2 + coeff_pre_pre_interaction_rfd2 * (pre_range_rfd2 ** 2)-0.6
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd2, lnyield_pred_pre_rfd2, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 3))]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3864630227.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3864630227.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_interaction'] = data['lnfer'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3864630227.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3864630227.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3864630227.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3864630227.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 393, but rank is 5 warnings.warn('covariance of constraints does not have full '
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.646 Model: OLS Adj. R-squared: 0.584 Within R-squared: 0.404 Method: Least Squares F-statistic: 9708370610756.8 Date: Thu, 29 Aug 2024 Prob (F-statistic): 0.00e+00 Time: 14:45:40 Log-Likelihood: 1478.89 No. Observations: 2629 AIC: -2169.8 Df Residuals: 2235 BIC: 144.7 Df Model: 393 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 5.988896 0.548497 10.918750 9.377795e-28 4.913863 7.063930 lnfer 0.135684 0.011772 11.525564 9.806759e-31 0.112611 0.158758 tmp 0.009136 0.053946 0.169361 8.655131e-01 -0.096596 0.114869 tmp_tmp_interaction 0.000200 0.001353 0.147871 8.824449e-01 -0.002451 0.002851 pre 0.475892 0.035173 13.529879 1.041943e-41 0.406954 0.544831 pre_pre_interaction -0.030512 0.002584 -11.810234 3.455978e-32 -0.035576 -0.025449
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd3 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd3 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd3 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd3 = coeff_tmp_rfd3 * tmp_range_rfd3 + coeff_tmp_tmp_interaction_rfd3 * (tmp_range_rfd3 ** 2)+0.2
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd3, lnyield_pred_tmp_rfd3, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd3 = model.params['pre']
coeff_pre_pre_interaction_rfd3 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd3 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd3 = coeff_pre_rfd3 * pre_range_rfd3 + coeff_pre_pre_interaction_rfd3 * (pre_range_rfd3 ** 2)-1.7
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd3, lnyield_pred_pre_rfd3, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 4))]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1697617172.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1697617172.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_interaction'] = data['lnfer'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1697617172.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1697617172.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1697617172.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1697617172.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 366, but rank is 5 warnings.warn('covariance of constraints does not have full '
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.732 Model: OLS Adj. R-squared: 0.658 Within R-squared: 0.364 Method: Least Squares F-statistic: 7165590151798.0 Date: Thu, 29 Aug 2024 Prob (F-statistic): 0.00e+00 Time: 14:45:40 Log-Likelihood: 606.76 No. Observations: 1690 AIC: -479.5 Df Residuals: 1323 BIC: 1514.2 Df Model: 366 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 3.928053 1.903232 2.063886 3.902855e-02 0.197787 7.658320 lnfer 0.180562 0.020036 9.011745 2.028031e-19 0.141292 0.219832 tmp 0.177480 0.168832 1.051219 2.931582e-01 -0.153426 0.508385 tmp_tmp_interaction -0.004969 0.003783 -1.313541 1.890006e-01 -0.012384 0.002446 pre 0.460187 0.054861 8.388181 4.937205e-17 0.352661 0.567713 pre_pre_interaction -0.028826 0.003597 -8.014023 1.110160e-15 -0.035876 -0.021776
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd4 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd4 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd4 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd4 = coeff_tmp_rfd4 * tmp_range_rfd4 + coeff_tmp_tmp_interaction_rfd4 * (tmp_range_rfd4 ** 2)-1
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd4, lnyield_pred_tmp_rfd4, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd4 = model.params['pre']
coeff_pre_pre_interaction_rfd4 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd4 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd4 = coeff_pre_rfd4 * pre_range_rfd4 + coeff_pre_pre_interaction_rfd4 * (pre_range_rfd4 ** 2)-1.7
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd4, lnyield_pred_pre_rfd4, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 5))]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.672 Model: OLS Adj. R-squared: 0.595 Within R-squared: 0.101 Method: Least Squares F-statistic: 30224499013205.9 Date: Thu, 29 Aug 2024 Prob (F-statistic): 0.00e+00 Time: 14:45:54 Log-Likelihood: 254.73 No. Observations: 880 AIC: -175.5 Df Residuals: 713 BIC: 622.8 Df Model: 166 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept -7.146240 4.875147 -1.465851 1.426888e-01 -16.701352 2.408871 lnfer 0.144921 0.023243 6.234911 4.520359e-10 0.099365 0.190477 tmp 1.146929 0.390122 2.939922 3.282947e-03 0.382303 1.911554 tmp_tmp_interaction -0.022834 0.007780 -2.934814 3.337475e-03 -0.038083 -0.007585 pre 0.148563 0.036180 4.106212 4.022010e-05 0.077651 0.219474 pre_pre_interaction -0.009581 0.002490 -3.847653 1.192548e-04 -0.014462 -0.004701
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3154588758.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3154588758.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_interaction'] = data['lnfer'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3154588758.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3154588758.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3154588758.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3154588758.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 166, but rank is 5 warnings.warn('covariance of constraints does not have full '
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd5 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd5 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd5 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd5 = coeff_tmp_rfd5 * tmp_range_rfd5 + coeff_tmp_tmp_interaction_rfd5 * (tmp_range_rfd5 ** 2)-13.95
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd5, lnyield_pred_tmp_rfd5, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd5 = model.params['pre']
coeff_pre_pre_interaction_rfd5 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd5 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd5 = coeff_pre_rfd5 * pre_range_rfd5 + coeff_pre_pre_interaction_rfd5 * (pre_range_rfd5 ** 2)-0.45
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd5, lnyield_pred_pre_rfd5, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
# Set font to Arial and size to 12
rcParams['font.family'] = 'Arial'
rcParams['font.size'] = 16
# Define a list of (tmp_range, lnyield_pred) tuples for the five models
models = [(tmp_range_rfd1, lnyield_pred_tmp_rfd1),
(tmp_range_rfd2, lnyield_pred_tmp_rfd2),
(tmp_range_rfd3, lnyield_pred_tmp_rfd3),
(tmp_range_rfd4, lnyield_pred_tmp_rfd4),
(tmp_range_rfd5, lnyield_pred_tmp_rfd5)]
# Plotting the results
plt.figure(figsize=(4, 4))
# Plot the first model (One model)
plt.plot(tmp_range_rfd, lnyield_pred_tmp_rfd, label='One model', color='blue', linewidth=4)
plt.fill_between(tmp_range_rfd, ci_lower_tmp_rfd, ci_upper_tmp_rfd, color='blue', alpha=0.2, label='95% CI')
# Plot the five models
for idx, (tmp_range_, lnyield_pred_) in enumerate(models):
if idx == 0:
plt.plot(tmp_range_, lnyield_pred_, color='green', label='Five models')
else:
plt.plot(tmp_range_, lnyield_pred_, color='green')
# Add labels and title
plt.xlabel('Temperature (°C)',fontsize=18, fontfamily='Arial')
plt.ylabel('Ln Yield (kg/ha/yr)', fontsize=18, fontfamily='Arial')
x_ticks = np.linspace(tmp_range_rfd.min(), tmp_range_rfd.max(), num=5) # Custom X tick positions
y_ticks = np.linspace(-0.6, 1, num=4) # Custom Y tick positions
plt.xticks(ticks=x_ticks, labels=[f'{x:.1f}' for x in x_ticks]) # Custom X tick labels
plt.yticks(ticks=y_ticks, labels=[f'{y:.1f}' for y in y_ticks]) # Custom Y tick labels
plt.tick_params(axis='both', which='major', labelsize=16) # Adjust tick label size
# Save the entire figure as a JPG with DPI 300
plt.savefig('soybean_rfd_tem.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)
# Show the plot
plt.show()
# Set font to Arial and size to 12
rcParams['font.family'] = 'Arial'
rcParams['font.size'] = 16
# Define a list of (pre_range, lnyield_pred) tuples for the five models
models = [(pre_range_rfd1, lnyield_pred_pre_rfd1),
(pre_range_rfd2, lnyield_pred_pre_rfd2),
(pre_range_rfd3, lnyield_pred_pre_rfd3),
(pre_range_rfd4, lnyield_pred_pre_rfd4),
(pre_range_rfd5, lnyield_pred_pre_rfd5)]
# Plotting the results
plt.figure(figsize=(4, 4))
# Plot the first model (One model)
plt.plot(pre_range_rfd, lnyield_pred_pre_rfd, label='One model', color='blue', linewidth=4)
plt.fill_between(pre_range_rfd, ci_lower_pre_rfd, ci_upper_pre_rfd, color='blue', alpha=0.2, label='95% CI')
# Plot the five models
for idx, (pre_range_, lnyield_pred_) in enumerate(models):
if idx == 0:
plt.plot(pre_range_, lnyield_pred_, color='green', label='Five models')
else:
plt.plot(pre_range_, lnyield_pred_, color='green')
# Add labels and title
plt.xlabel('Precipitation (100mm)',fontsize=18, fontfamily='Arial')
plt.ylabel('Ln Yield (kg/ha/yr)', fontsize=18, fontfamily='Arial')
x_ticks = np.linspace(2, 10, num=4) # Custom X tick positions
y_ticks = np.linspace(-0.3, 0.2, num=5) # Custom Y tick positions
plt.xticks(ticks=x_ticks, labels=[f'{x:.1f}' for x in x_ticks]) # Custom X tick labels
plt.yticks(ticks=y_ticks, labels=[f'{y:.1f}' for y in y_ticks]) # Custom Y tick labels
plt.tick_params(axis='both', which='major', labelsize=16) # Adjust tick label size
# Save the entire figure as a JPG with DPI 300
plt.savefig('soybean_rfd_pre.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)
# Show the plot
plt.show()
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 1)]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.732 Model: OLS Adj. R-squared: 0.679 Within R-squared: 0.220 Method: Least Squares F-statistic: 140.0 Date: Wed, 28 Aug 2024 Prob (F-statistic): 2.14e-71 Time: 17:06:55 Log-Likelihood: 639.91 No. Observations: 949 AIC: -965.8 Df Residuals: 792 BIC: -203.5 Df Model: 156 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 2.789019 1.420123 1.963927 4.953856e-02 0.005628 5.572410 lnfer 0.080143 0.016234 4.936688 7.946044e-07 0.048325 0.111962 irrigation1 0.141556 0.441856 0.320367 7.486900e-01 -0.724466 1.007579 irrigation12 0.338094 0.300276 1.125945 2.601887e-01 -0.250436 0.926623 tmp 0.475791 0.159445 2.984050 2.844602e-03 0.163285 0.788297 tmp_tmp_interaction -0.012027 0.004562 -2.636388 8.379394e-03 -0.020968 -0.003086 pre 0.148433 0.043822 3.387179 7.061526e-04 0.062543 0.234322 pre_pre_interaction -0.009605 0.003571 -2.689620 7.153343e-03 -0.016604 -0.002606 lnfer_irrigation1_tmp_interaction -0.015636 0.018682 -0.836965 4.026124e-01 -0.052252 0.020980 lnfer_irrigation12_tmp_tmp_interaction -0.000531 0.000847 -0.626245 5.311539e-01 -0.002192 0.001130 lnfer_irrigation1_pre_interaction 0.032951 0.027199 1.211498 2.257048e-01 -0.020357 0.086260 lnfer_irrigation12_pre_pre_interaction -0.001150 0.002914 -0.394539 6.931831e-01 -0.006861 0.004561
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['irrigation12'] = data['irrigation1'] * data['irrigation1'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 156, but rank is 11 warnings.warn('covariance of constraints does not have full '
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 2)]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['irrigation12'] = data['irrigation1'] * data['irrigation1'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.718 Model: OLS Adj. R-squared: 0.649 Within R-squared: 0.316 Method: Least Squares F-statistic: 3326159224107.0 Date: Wed, 28 Aug 2024 Prob (F-statistic): 0.00e+00 Time: 17:06:55 Log-Likelihood: 1212.82 No. Observations: 1585 AIC: -1805.6 Df Residuals: 1275 BIC: -141.4 Df Model: 309 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 2.592114 1.209910 2.142402 3.216113e-02 0.220734 4.963494 lnfer 0.064939 0.017695 3.669950 2.425980e-04 0.030258 0.099620 irrigation1 0.173287 0.224644 0.771386 4.404783e-01 -0.267007 0.613582 irrigation12 -0.248563 0.180658 -1.375881 1.688586e-01 -0.602646 0.105519 tmp 0.372547 0.127112 2.930853 3.380324e-03 0.123412 0.621682 tmp_tmp_interaction -0.007665 0.003293 -2.327491 1.993913e-02 -0.014119 -0.001210 pre 0.397774 0.044993 8.840842 9.499993e-19 0.309590 0.485958 pre_pre_interaction -0.029778 0.003470 -8.582117 9.314284e-18 -0.036579 -0.022978 lnfer_irrigation1_tmp_interaction -0.024499 0.032200 -0.760858 4.467420e-01 -0.087609 0.038611 lnfer_irrigation12_tmp_tmp_interaction 0.001053 0.000815 1.291736 1.964485e-01 -0.000545 0.002651 lnfer_irrigation1_pre_interaction 0.052379 0.078906 0.663820 5.068058e-01 -0.102273 0.207031 lnfer_irrigation12_pre_pre_interaction -0.005112 0.006111 -0.836408 4.029255e-01 -0.017090 0.006866
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 309, but rank is 11 warnings.warn('covariance of constraints does not have full '
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 3)]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['irrigation12'] = data['irrigation1'] * data['irrigation1'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 346, but rank is 11 warnings.warn('covariance of constraints does not have full '
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.751 Model: OLS Adj. R-squared: 0.691 Within R-squared: 0.397 Method: Least Squares F-statistic: 423233537990.0 Date: Wed, 28 Aug 2024 Prob (F-statistic): 0.00e+00 Time: 17:06:55 Log-Likelihood: 838.45 No. Observations: 1772 AIC: -982.9 Df Residuals: 1425 BIC: 918.6 Df Model: 346 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 1.818522e+00 1.276758 1.424327 1.543518e-01 -0.683879 4.320922 lnfer 1.430024e-01 0.019184 7.454390 9.028422e-14 0.105403 0.180602 irrigation1 1.697547e-01 0.096894 1.751958 7.978097e-02 -0.020155 0.359664 irrigation12 -2.752325e-02 0.014778 -1.862439 6.254118e-02 -0.056488 0.001441 tmp 4.545799e-01 0.121412 3.744123 1.810246e-04 0.216618 0.692542 tmp_tmp_interaction -1.069193e-02 0.002894 -3.694690 2.201554e-04 -0.016364 -0.005020 pre 3.924562e-01 0.035756 10.976037 4.983096e-28 0.322376 0.462536 pre_pre_interaction -2.413294e-02 0.002722 -8.864334 7.696272e-19 -0.029469 -0.018797 lnfer_irrigation1_tmp_interaction 3.313665e-03 0.004712 0.703274 4.818849e-01 -0.005921 0.012549 lnfer_irrigation12_tmp_tmp_interaction -4.956403e-08 0.000029 -0.001730 9.986199e-01 -0.000056 0.000056 lnfer_irrigation1_pre_interaction -7.708309e-02 0.019978 -3.858362 1.141494e-04 -0.116240 -0.037927 lnfer_irrigation12_pre_pre_interaction 2.357695e-03 0.000777 3.032901 2.422153e-03 0.000834 0.003881
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 4)]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['irrigation12'] = data['irrigation1'] * data['irrigation1'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 340, but rank is 11 warnings.warn('covariance of constraints does not have full '
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.605 Model: OLS Adj. R-squared: 0.538 Within R-squared: 0.258 Method: Least Squares F-statistic: 2637640612839.9 Date: Wed, 28 Aug 2024 Prob (F-statistic): 0.00e+00 Time: 17:06:56 Log-Likelihood: 842.65 No. Observations: 2343 AIC: -1003.3 Df Residuals: 2002 BIC: 960.6 Df Model: 340 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept -3.513804 2.063075 -1.703187 8.853301e-02 -7.557357 0.529749 lnfer 0.141105 0.015489 9.110276 8.217285e-20 0.110748 0.171462 irrigation1 0.174299 0.080622 2.161924 3.062401e-02 0.016282 0.332315 irrigation12 -0.022010 0.011244 -1.957382 5.030263e-02 -0.044048 0.000029 tmp 0.870480 0.184208 4.725523 2.295234e-06 0.509439 1.231521 tmp_tmp_interaction -0.020199 0.004096 -4.930965 8.182442e-07 -0.028227 -0.012170 pre 0.360418 0.035214 10.234981 1.382222e-24 0.291399 0.429437 pre_pre_interaction -0.020955 0.002283 -9.180254 4.300741e-20 -0.025428 -0.016481 lnfer_irrigation1_tmp_interaction 0.002370 0.002600 0.911446 3.620603e-01 -0.002726 0.007467 lnfer_irrigation12_tmp_tmp_interaction 0.000003 0.000011 0.293160 7.694001e-01 -0.000018 0.000024 lnfer_irrigation1_pre_interaction -0.024085 0.007587 -3.174598 1.500442e-03 -0.038955 -0.009215 lnfer_irrigation12_pre_pre_interaction 0.000268 0.000088 3.041465 2.354300e-03 0.000095 0.000441
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 5)]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.537 Model: OLS Adj. R-squared: 0.440 Within R-squared: 0.233 Method: Least Squares F-statistic: 298.4 Date: Wed, 28 Aug 2024 Prob (F-statistic): 2.25e-90 Time: 17:06:56 Log-Likelihood: 155.38 No. Observations: 858 AIC: -10.8 Df Residuals: 708 BIC: 702.4 Df Model: 149 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 1.009895 8.858575 0.114002 9.092362e-01 -16.352593 18.372384 lnfer 0.229537 0.026924 8.525439 1.522308e-17 0.176768 0.282307 irrigation1 0.103771 0.116006 0.894537 3.710344e-01 -0.123595 0.331138 irrigation12 -0.012705 0.033366 -0.380778 7.033679e-01 -0.078100 0.052691 tmp 0.486572 0.719888 0.675900 4.991040e-01 -0.924382 1.897526 tmp_tmp_interaction -0.011602 0.014526 -0.798699 4.244648e-01 -0.040072 0.016868 pre 0.259156 0.062784 4.127714 3.663872e-05 0.136101 0.382211 pre_pre_interaction -0.014688 0.004041 -3.634390 2.786387e-04 -0.022609 -0.006767 lnfer_irrigation1_tmp_interaction 0.002022 0.002848 0.709733 4.778700e-01 -0.003561 0.007605 lnfer_irrigation12_tmp_tmp_interaction 0.000004 0.000026 0.136112 8.917328e-01 -0.000048 0.000055 lnfer_irrigation1_pre_interaction -0.016022 0.008274 -1.936389 5.282012e-02 -0.032240 0.000195 lnfer_irrigation12_pre_pre_interaction 0.000124 0.000269 0.460528 6.451375e-01 -0.000404 0.000652
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['irrigation12'] = data['irrigation1'] * data['irrigation1'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 149, but rank is 11 warnings.warn('covariance of constraints does not have full '
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 1))]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/449609041.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/449609041.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_interaction'] = data['lnfer'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/449609041.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/449609041.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/449609041.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/449609041.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 264, but rank is 9 warnings.warn('covariance of constraints does not have full '
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.758 Model: OLS Adj. R-squared: 0.715 Within R-squared: 0.233 Method: Least Squares F-statistic: 96.9 Date: Wed, 28 Aug 2024 Prob (F-statistic): 4.32e-77 Time: 17:06:56 Log-Likelihood: 1156.31 No. Observations: 1777 AIC: -1782.6 Df Residuals: 1512 BIC: -329.7 Df Model: 264 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 4.111933 1.348002 3.050391 0.002285 1.469897 6.753969 lnfer 0.938364 0.759405 1.235657 0.216586 -0.550043 2.426771 tmp 0.338686 0.162593 2.083033 0.037248 0.020010 0.657361 tmp_tmp_interaction -0.008189 0.004781 -1.712763 0.086756 -0.017560 0.001182 pre 0.111375 0.063920 1.742423 0.081434 -0.013905 0.236656 pre_pre_interaction -0.005409 0.004971 -1.088106 0.276548 -0.015152 0.004334 lnfer_tmp_interaction -0.102543 0.092240 -1.111696 0.266269 -0.283331 0.078244 lnfer_tmp_tmp_interaction 0.003565 0.002747 1.297981 0.194294 -0.001818 0.008948 lnfer_pre_interaction -0.051524 0.031974 -1.611418 0.107089 -0.114193 0.011145 lnfer_pre_pre_interaction 0.003138 0.002607 1.204084 0.228557 -0.001970 0.008247
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 2))]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3655828460.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3655828460.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_interaction'] = data['lnfer'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3655828460.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3655828460.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3655828460.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3655828460.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.651 Model: OLS Adj. R-squared: 0.596 Within R-squared: 0.275 Method: Least Squares F-statistic: 98.5 Date: Wed, 28 Aug 2024 Prob (F-statistic): 1.11e-96 Time: 17:06:56 Log-Likelihood: 2376.16 No. Observations: 3099 AIC: -3914.3 Df Residuals: 2680 BIC: -1384.0 Df Model: 418 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 6.016660 0.951334 6.324447 2.541408e-10 4.152080 7.881240 lnfer -0.709806 0.570828 -1.243467 2.136957e-01 -1.828609 0.408996 tmp 0.151761 0.101951 1.488573 1.365999e-01 -0.048059 0.351581 tmp_tmp_interaction -0.003432 0.002694 -1.274149 2.026107e-01 -0.008712 0.001847 pre 0.142751 0.030351 4.703284 2.560097e-06 0.083264 0.202239 pre_pre_interaction -0.009072 0.002438 -3.721473 1.980640e-04 -0.013850 -0.004294 lnfer_tmp_interaction 0.046797 0.061703 0.758424 4.481973e-01 -0.074138 0.167732 lnfer_tmp_tmp_interaction -0.000562 0.001637 -0.343212 7.314389e-01 -0.003770 0.002646 lnfer_pre_interaction 0.036090 0.025593 1.410167 1.584905e-01 -0.014071 0.086250 lnfer_pre_pre_interaction -0.002996 0.002022 -1.481616 1.384426e-01 -0.006960 0.000967
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 418, but rank is 9 warnings.warn('covariance of constraints does not have full '
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 3))]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2831458825.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2831458825.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_interaction'] = data['lnfer'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2831458825.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2831458825.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2831458825.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2831458825.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.662 Model: OLS Adj. R-squared: 0.602 Within R-squared: 0.431 Method: Least Squares F-statistic: 40555524311549.3 Date: Wed, 28 Aug 2024 Prob (F-statistic): 0.00e+00 Time: 17:06:57 Log-Likelihood: 1539.75 No. Observations: 2629 AIC: -2283.5 Df Residuals: 2231 BIC: 54.5 Df Model: 397 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 9.859382 0.986244 9.996894 1.572514e-23 7.926378 11.792385 lnfer -2.498688 0.510061 -4.898805 9.642113e-07 -3.498388 -1.498987 tmp -0.298623 0.097980 -3.047789 2.305321e-03 -0.490661 -0.106585 tmp_tmp_interaction 0.007264 0.002446 2.970185 2.976205e-03 0.002471 0.012058 pre 0.357796 0.052796 6.776911 1.227725e-11 0.254317 0.461275 pre_pre_interaction -0.024120 0.003909 -6.171033 6.784519e-10 -0.031780 -0.016459 lnfer_tmp_interaction 0.221760 0.048105 4.609935 4.027952e-06 0.127476 0.316044 lnfer_tmp_tmp_interaction -0.005130 0.001174 -4.370418 1.240088e-05 -0.007431 -0.002830 lnfer_pre_interaction 0.061620 0.026551 2.320819 2.029663e-02 0.009581 0.113659 lnfer_pre_pre_interaction -0.003340 0.001917 -1.742288 8.145813e-02 -0.007097 0.000417
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 397, but rank is 9 warnings.warn('covariance of constraints does not have full '
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 4))]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2237879532.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2237879532.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_interaction'] = data['lnfer'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2237879532.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2237879532.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2237879532.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2237879532.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.735 Model: OLS Adj. R-squared: 0.661 Within R-squared: 0.372 Method: Least Squares F-statistic: 390901626296.6 Date: Wed, 28 Aug 2024 Prob (F-statistic): 0.00e+00 Time: 17:06:57 Log-Likelihood: 616.82 No. Observations: 1690 AIC: -491.6 Df Residuals: 1319 BIC: 1523.8 Df Model: 370 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept 6.329019 3.929353 1.610703 0.107245 -1.372371 14.030410 lnfer -1.002779 1.616194 -0.620457 0.534957 -4.170462 2.164903 tmp -0.013749 0.350351 -0.039243 0.968697 -0.700423 0.672926 tmp_tmp_interaction -0.001632 0.007999 -0.204043 0.838320 -0.017310 0.014046 pre 0.523433 0.131485 3.980927 0.000069 0.265727 0.781139 pre_pre_interaction -0.032636 0.008597 -3.796332 0.000147 -0.049485 -0.015787 lnfer_tmp_interaction 0.099726 0.143031 0.697230 0.485659 -0.180610 0.380062 lnfer_tmp_tmp_interaction -0.001795 0.003238 -0.554314 0.579364 -0.008141 0.004551 lnfer_pre_interaction -0.039611 0.051298 -0.772177 0.440009 -0.140154 0.060931 lnfer_pre_pre_interaction 0.002444 0.003376 0.723855 0.469155 -0.004173 0.009060
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 370, but rank is 9 warnings.warn('covariance of constraints does not have full '
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 5))]
# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
# Define the formula for the regression model
formula = 'lnyield ~ lnfer + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'
# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
# Extract the summary table
summary = model.summary2().tables[1]
# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]
fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()
# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)
# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)
# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)
# Format the OLS summary with the within R-squared
ols_info = f"""
OLS Regression Results
==============================================================================
Dep. Variable: lnyield R-squared: {model.rsquared:.3f}
Model: OLS Adj. R-squared: {model.rsquared_adj:.3f}
Within R-squared: {within_r2:.3f}
Method: Least Squares F-statistic: {model.fvalue:.1f}
Date: {datetime.now():%a, %d %b %Y} Prob (F-statistic): {model.f_pvalue:.2e}
Time: {datetime.now():%H:%M:%S} Log-Likelihood: {model.llf:.2f}
No. Observations: {model.nobs:.0f} AIC: {model.aic:.1f}
Df Residuals: {model.df_resid:.0f} BIC: {model.bic:.1f}
Df Model: {model.df_model:.0f}
Covariance Type: cluster
==============================================================================
"""
# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()
# Print the final summary
print(final_summary)
OLS Regression Results ============================================================================== Dep. Variable: lnyield R-squared: 0.678 Model: OLS Adj. R-squared: 0.601 Within R-squared: 0.119 Method: Least Squares F-statistic: 22.3 Date: Wed, 28 Aug 2024 Prob (F-statistic): 2.81e-24 Time: 17:06:57 Log-Likelihood: 263.68 No. Observations: 880 AIC: -185.4 Df Residuals: 709 BIC: 632.0 Df Model: 170 Covariance Type: cluster ============================================================================== Coef. Std.Err. z P>|z| [0.025 0.975] Intercept -11.462215 10.966653 -1.045188 0.295936 -32.956461 10.032030 lnfer 2.805500 5.374390 0.522013 0.601661 -7.728111 13.339112 tmp 1.390408 0.885051 1.570991 0.116185 -0.344261 3.125076 tmp_tmp_interaction -0.027157 0.017715 -1.532982 0.125280 -0.061878 0.007564 pre 0.438126 0.107906 4.060258 0.000049 0.226634 0.649617 pre_pre_interaction -0.029398 0.007329 -4.011491 0.000060 -0.043762 -0.015035 lnfer_tmp_interaction -0.156050 0.435796 -0.358082 0.720282 -1.010195 0.698094 lnfer_tmp_tmp_interaction 0.002869 0.008765 0.327329 0.743419 -0.014311 0.020049 lnfer_pre_interaction -0.159927 0.049943 -3.202202 0.001364 -0.257813 -0.062041 lnfer_pre_pre_interaction 0.010881 0.003288 3.309123 0.000936 0.004436 0.017326
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/969849701.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['pre_pre_interaction'] = data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/969849701.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_interaction'] = data['lnfer'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/969849701.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/969849701.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['tmp_tmp_interaction'] = data['tmp'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/969849701.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp'] /var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/969849701.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp'] /Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 170, but rank is 9 warnings.warn('covariance of constraints does not have full '