In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import netCDF4 as nc
import pandas as pd
import os
import csv
from glob import glob
import xarray as xr
import matplotlib.ticker as ticker 

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

import re
from datetime import datetime
In [2]:
# Step 1: Data Preparation
# Load your dataset into a pandas DataFrame
os.chdir('/Users/chenchenren/postdoc/paper/2N and water-US/regression/')
In [3]:
excel_file = "./soybean_reg_data.csv"
# Read the specific sheet from the Excel file
ds = pd.read_csv(excel_file)
df_soybean = ds
print(df_soybean)
         geoid    year        tmp       pre  irrigation1   lnyield     lnfer  \
0       1001.0  2009.0  24.140660  8.808839     0.718851  7.609614  2.022537   
1       1001.0  2010.0  25.675484  5.438990     0.830087  7.035731  1.910005   
2       1001.0  2011.0  24.544823  5.270162     0.595086  7.466514  2.025641   
3       1001.0  2013.0  24.036592  7.514143     0.001241  7.926855  2.906756   
4       1003.0  2008.0  24.727230  8.675528     0.001980  7.934111  1.240602   
...        ...     ...        ...       ...          ...       ...       ...   
17577  55141.0  2016.0  17.486904  7.304833     0.000000  8.134343  2.860158   
17578  55141.0  2017.0  16.590237  6.467205     0.000000  7.978877  2.633171   
17579  55141.0  2018.0  17.266659  8.373576     0.000000  7.838077  2.639034   
17580  55141.0  2019.0  16.238592  8.735889     0.000000  7.912185  2.671287   
17581  55141.0  2020.0  16.202635  6.832954     0.000000  8.173033  2.625827   

       group  zone  
0          1   5.0  
1          1   5.0  
2          1   5.0  
3          1   5.0  
4          1   5.0  
...      ...   ...  
17577      2   1.0  
17578      2   1.0  
17579      2   1.0  
17580      2   1.0  
17581      2   1.0  

[17582 rows x 9 columns]

All sample regression¶

In [4]:
# Filter data for irrigated soybean
data = df_soybean[df_soybean['group'] == 1]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3996640747.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1256, but rank is 7
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.709
Model:                            OLS   Adj. R-squared:                  0.651
Within R-squared:                 0.243
Method:                 Least Squares   F-statistic:                     21038509288831.2
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        14:41:13   Log-Likelihood:                 3156.68
No. Observations:                7507   AIC:                            -3799.4
Df Residuals:                    6250   BIC:                             4903.6
Df Model:                         1256                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.          z         P>|z|    [0.025    0.975]
Intercept            0.800648  0.388248   2.062206  3.918814e-02  0.039695  1.561600
lnfer                0.130809  0.008856  14.769889  2.290910e-49  0.113451  0.148168
irrigation1         -0.019336  0.017419  -1.110080  2.669645e-01 -0.053476  0.014804
irrigation12         0.003916  0.001987   1.970166  4.881935e-02  0.000020  0.007811
tmp                  0.544660  0.036766  14.814312  1.183961e-49  0.472600  0.616719
tmp_tmp_interaction -0.012964  0.000877 -14.789376  1.715359e-49 -0.014682 -0.011246
pre                  0.224017  0.016294  13.748319  5.212636e-43  0.192081  0.255953
pre_pre_interaction -0.013249  0.001157 -11.451744  2.304610e-30 -0.015517 -0.010981
In [5]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp = model.params['tmp']
coeff_tmp_tmp_interaction = model.params['tmp_tmp_interaction']

# Define the range of tmp based on the input data
tmp_range = np.linspace(data['tmp'].min(), data['tmp'].max(), 100)

# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp = coeff_tmp * tmp_range + coeff_tmp_tmp_interaction * (tmp_range ** 2)-5.5

# Calculate the standard errors and confidence intervals
# Get the covariance matrix of the parameters
cov_matrix = model.cov_params()
# Calculate the variance of the prediction
pred_var = (cov_matrix.loc['tmp', 'tmp'] * tmp_range**2 +
            2 * cov_matrix.loc['tmp', 'tmp_tmp_interaction'] * tmp_range**3 +
            cov_matrix.loc['tmp_tmp_interaction', 'tmp_tmp_interaction'] * tmp_range**4)
# Calculate the standard error
pred_se = np.sqrt(pred_var)
# Calculate the 95% confidence interval
ci_lower_tmp = lnyield_pred_tmp - 1.96 * pred_se
ci_upper_tmp = lnyield_pred_tmp + 1.96 * pred_se

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range, lnyield_pred_tmp, label='Predicted lnyield', color='blue')
plt.fill_between(tmp_range, ci_lower_tmp, ci_upper_tmp, color='blue', alpha=0.2, label='95% CI')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
In [6]:
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre = model.params['pre']
coeff_pre_pre_interaction = model.params['pre_pre_interaction']

# Define the range of pre based on the input data
pre_range = np.linspace(np.percentile(data['pre'],1), np.percentile(data['pre'],99), 100)

# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre = coeff_pre * pre_range + coeff_pre_pre_interaction * (pre_range ** 2)-0.8

# Calculate the standard errors and confidence intervals
# Get the covariance matrix of the parameters
cov_matrix = model.cov_params()
# Calculate the variance of the prediction
pred_var = (cov_matrix.loc['pre', 'pre'] * pre_range**2 +
            2 * cov_matrix.loc['pre', 'pre_pre_interaction'] * pre_range**3 +
            cov_matrix.loc['pre_pre_interaction', 'pre_pre_interaction'] * pre_range**4)
# Calculate the standard error
pred_se = np.sqrt(pred_var)
# Calculate the 95% confidence interval
ci_lower_pre = lnyield_pred_pre - 1.96 * pred_se
ci_upper_pre = lnyield_pred_pre + 1.96 * pred_se

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range, lnyield_pred_pre, label='Predicted lnyield', color='blue')
plt.fill_between(pre_range, ci_lower_pre, ci_upper_pre, color='blue', alpha=0.2, label='95% CI')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [7]:
# Filter data for rainfed soybean
data = df_soybean[df_soybean['group'] == 2]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/106284357.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/106284357.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/106284357.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/106284357.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/106284357.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/106284357.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1583, but rank is 5
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.708
Model:                            OLS   Adj. R-squared:                  0.654
Within R-squared:                 0.233
Method:                 Least Squares   F-statistic:                     -22135224246007.9
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               1.00e+00
Time:                        14:41:27   Log-Likelihood:                 5039.48
No. Observations:                10075   AIC:                            -6911.0
Df Residuals:                    8491   BIC:                             4522.0
Df Model:                         1583                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.          z         P>|z|    [0.025    0.975]
Intercept            3.566492  0.220815  16.151458  1.109087e-58  3.133701  3.999282
lnfer                0.127607  0.006789  18.797386  7.933075e-79  0.114301  0.140912
tmp                  0.340510  0.024217  14.060477  6.643677e-45  0.293044  0.387975
tmp_tmp_interaction -0.007994  0.000639 -12.519552  5.836378e-36 -0.009246 -0.006743
pre                  0.177062  0.013533  13.083432  4.095265e-39  0.150537  0.203587
pre_pre_interaction -0.010534  0.000976 -10.791532  3.774445e-27 -0.012447 -0.008620
In [8]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp = model.params['tmp']
coeff_tmp_tmp_interaction = model.params['tmp_tmp_interaction']

# Define the range of tmp based on the input data
tmp_range_rfd = np.linspace(data['tmp'].min(), data['tmp'].max(), 100)

# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd = coeff_tmp * tmp_range_rfd + coeff_tmp_tmp_interaction * (tmp_range_rfd ** 2)-3.1

# Calculate the standard errors and confidence intervals
# Get the covariance matrix of the parameters
cov_matrix = model.cov_params()
# Calculate the variance of the prediction
pred_var = (cov_matrix.loc['tmp', 'tmp'] * tmp_range_rfd**2 +
            2 * cov_matrix.loc['tmp', 'tmp_tmp_interaction'] * tmp_range_rfd**3 +
            cov_matrix.loc['tmp_tmp_interaction', 'tmp_tmp_interaction'] * tmp_range_rfd**4)
# Calculate the standard error
pred_se = np.sqrt(pred_var)
# Calculate the 95% confidence interval
ci_lower_tmp_rfd = lnyield_pred_tmp_rfd - 1.96 * pred_se
ci_upper_tmp_rfd = lnyield_pred_tmp_rfd + 1.96 * pred_se

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd, lnyield_pred_tmp_rfd, label='Predicted lnyield', color='blue')
plt.fill_between(tmp_range_rfd, ci_lower_tmp_rfd, ci_upper_tmp_rfd, color='blue', alpha=0.2, label='95% CI')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
In [9]:
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre = model.params['pre']
coeff_pre_pre_interaction = model.params['pre_pre_interaction']

# Define the range of pre based on the input data
pre_range_rfd = np.linspace(np.percentile(data['pre'],1), np.percentile(data['pre'],99), 100)

# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd = coeff_pre * pre_range_rfd + coeff_pre_pre_interaction * (pre_range_rfd ** 2)-0.6

# Calculate the standard errors and confidence intervals
# Get the covariance matrix of the parameters
cov_matrix = model.cov_params()
# Calculate the variance of the prediction
pred_var = (cov_matrix.loc['pre', 'pre'] * pre_range_rfd**2 +
            2 * cov_matrix.loc['pre', 'pre_pre_interaction'] * pre_range_rfd**3 +
            cov_matrix.loc['pre_pre_interaction', 'pre_pre_interaction'] * pre_range_rfd**4)
# Calculate the standard error
pred_se = np.sqrt(pred_var)
# Calculate the 95% confidence interval
ci_lower_pre_rfd = lnyield_pred_pre_rfd - 1.96 * pred_se
ci_upper_pre_rfd = lnyield_pred_pre_rfd + 1.96 * pred_se

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd, lnyield_pred_pre_rfd, label='Predicted lnyield', color='blue')
plt.fill_between(pre_range_rfd, ci_lower_pre_rfd, ci_upper_pre_rfd, color='blue', alpha=0.2, label='95% CI')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()

Divide by climate zone¶

In [10]:
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 1)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.729
Model:                            OLS   Adj. R-squared:                  0.677
Within R-squared:                 0.213
Method:                 Least Squares   F-statistic:                     46.2
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               6.92e-34
Time:                        14:41:27   Log-Likelihood:                 635.36
No. Observations:                949   AIC:                            -964.7
Df Residuals:                    796   BIC:                             -221.8
Df Model:                         152                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z     P>|z|    [0.025    0.975]
Intercept            2.772323  1.411778  1.963710  0.049564  0.005289  5.539358
lnfer                0.071517  0.016528  4.326987  0.000015  0.039123  0.103912
irrigation1         -0.066475  0.175892 -0.377929  0.705483 -0.411217  0.278267
irrigation12         0.066211  0.122418  0.540861  0.588603 -0.173724  0.306146
tmp                  0.475086  0.159011  2.987752  0.002810  0.163430  0.786742
tmp_tmp_interaction -0.012017  0.004551 -2.640476  0.008279 -0.020937 -0.003097
pre                  0.161312  0.041595  3.878162  0.000105  0.079787  0.242837
pre_pre_interaction -0.010554  0.003401 -3.102931  0.001916 -0.017221 -0.003888
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1997379481.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 152, but rank is 7
  warnings.warn('covariance of constraints does not have full '
In [11]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp1 = model.params['tmp']
coeff_tmp_tmp_interaction1 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range1 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp1 = coeff_tmp1 * tmp_range1 + coeff_tmp_tmp_interaction1 * (tmp_range1 ** 2)-4.5
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range1, lnyield_pred_tmp1, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre1 = model.params['pre']
coeff_pre_pre_interaction1 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range1 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre1 = coeff_pre1 * pre_range1 + coeff_pre_pre_interaction1 * (pre_range1 ** 2)-0.57
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range1, lnyield_pred_pre1, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [12]:
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 2)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/163921588.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 305, but rank is 7
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.715
Model:                            OLS   Adj. R-squared:                  0.647
Within R-squared:                 0.310
Method:                 Least Squares   F-statistic:                     239346187613.5
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        14:41:28   Log-Likelihood:                 1205.19
No. Observations:                1585   AIC:                            -1798.4
Df Residuals:                    1279   BIC:                             -155.7
Df Model:                         305                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept            2.799067  1.206685  2.319634  2.036067e-02  0.434009  5.164126
lnfer                0.062591  0.017941  3.488669  4.854314e-04  0.027427  0.097756
irrigation1         -0.157767  0.144642 -1.090741  2.753871e-01 -0.441261  0.125727
irrigation12         0.119411  0.104036  1.147788  2.510560e-01 -0.084495  0.323317
tmp                  0.349841  0.125661  2.784010  5.369134e-03  0.103550  0.596131
tmp_tmp_interaction -0.007060  0.003242 -2.177250  2.946190e-02 -0.013415 -0.000705
pre                  0.400009  0.048965  8.169204  3.104301e-16  0.304038  0.495979
pre_pre_interaction -0.029878  0.003767 -7.930857  2.176382e-15 -0.037261 -0.022494
In [13]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp2 = model.params['tmp']
coeff_tmp_tmp_interaction2 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range2 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp2 = coeff_tmp2 * tmp_range2 + coeff_tmp_tmp_interaction2 * (tmp_range2 ** 2)-4
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range2, lnyield_pred_tmp2, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre2 = model.params['pre']
coeff_pre_pre_interaction2 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range2 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre2 = coeff_pre2 * pre_range2 + coeff_pre_pre_interaction2 * (pre_range2 ** 2)-1.2
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range2, lnyield_pred_pre2, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [30]:
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 3)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2837784740.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 342, but rank is 7
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.743
Model:                            OLS   Adj. R-squared:                  0.682
Within R-squared:                 0.377
Method:                 Least Squares   F-statistic:                     -8850742018281.0
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               1.00e+00
Time:                        14:44:21   Log-Likelihood:                 809.10
No. Observations:                1772   AIC:                            -932.2
Df Residuals:                    1429   BIC:                             947.4
Df Model:                         342                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept            2.474073  1.261171  1.961727  4.979423e-02  0.002224  4.945922
lnfer                0.135249  0.018715  7.226722  4.947932e-13  0.098568  0.171930
irrigation1         -0.002444  0.105756 -0.023113  9.815603e-01 -0.209722  0.204833
irrigation12         0.002774  0.014578  0.190293  8.490792e-01 -0.025798  0.031346
tmp                  0.417687  0.120396  3.469277  5.218615e-04  0.181715  0.653659
tmp_tmp_interaction -0.009835  0.002872 -3.424827  6.151920e-04 -0.015463 -0.004207
pre                  0.311809  0.034264  9.100094  9.025372e-20  0.244652  0.378966
pre_pre_interaction -0.018173  0.002632 -6.905689  4.996047e-12 -0.023331 -0.013015
In [31]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp3 = model.params['tmp']
coeff_tmp_tmp_interaction3 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range3 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp3 = coeff_tmp3 * tmp_range3 + coeff_tmp_tmp_interaction3 * (tmp_range3 ** 2)-4.3
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range3, lnyield_pred_tmp3, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre3 = model.params['pre']
coeff_pre_pre_interaction3 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range3 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre3 = coeff_pre3 * pre_range3 + coeff_pre_pre_interaction3 * (pre_range3 ** 2)-1.15
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range3, lnyield_pred_pre3, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [32]:
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 4)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2175967506.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 336, but rank is 7
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.595
Model:                            OLS   Adj. R-squared:                  0.527
Within R-squared:                 0.238
Method:                 Least Squares   F-statistic:                     -10494853094051.4
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               1.00e+00
Time:                        14:44:21   Log-Likelihood:                 811.28
No. Observations:                2343   AIC:                            -948.6
Df Residuals:                    2006   BIC:                             992.3
Df Model:                         336                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept           -2.123577  2.011294 -1.055826  2.910477e-01 -6.065642  1.818488
lnfer                0.125002  0.015565  8.031049  9.664307e-16  0.094496  0.155509
irrigation1         -0.010905  0.024445 -0.446115  6.555139e-01 -0.058817  0.037006
irrigation12         0.002964  0.002623  1.130108  2.584308e-01 -0.002177  0.008105
tmp                  0.760151  0.179720  4.229639  2.340663e-05  0.407906  1.112396
tmp_tmp_interaction -0.017714  0.003993 -4.436234  9.154629e-06 -0.025540 -0.009888
pre                  0.334204  0.034323  9.737126  2.093934e-22  0.266933  0.401475
pre_pre_interaction -0.019695  0.002210 -8.911486  5.035321e-19 -0.024027 -0.015363
In [33]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp4 = model.params['tmp']
coeff_tmp_tmp_interaction4 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range4 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp4 = coeff_tmp4 * tmp_range4 + coeff_tmp_tmp_interaction4 * (tmp_range4 ** 2)-8
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range4, lnyield_pred_tmp4, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre4 = model.params['pre']
coeff_pre_pre_interaction4 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range4 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre4 = coeff_pre4 * pre_range4 + coeff_pre_pre_interaction4 * (pre_range4 ** 2)-1.25
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range4, lnyield_pred_pre4, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [34]:
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 5)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.529
Model:                            OLS   Adj. R-squared:                  0.433
Within R-squared:                 0.219
Method:                 Least Squares   F-statistic:                     190.6
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               1.16e-67
Time:                        14:44:21   Log-Likelihood:                 147.76
No. Observations:                858   AIC:                            -3.5
Df Residuals:                    712   BIC:                             690.7
Df Model:                         145                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z         P>|z|     [0.025     0.975]
Intercept           -0.238525  9.524595 -0.025043  9.800207e-01 -18.906388  18.429339
lnfer                0.209968  0.023900  8.785335  1.558992e-18   0.163125   0.256811
irrigation1          0.015060  0.028786  0.523151  6.008695e-01  -0.041361   0.071480
irrigation12         0.003037  0.003670  0.827398  4.080113e-01  -0.004156   0.010230
tmp                  0.611501  0.773902  0.790153  4.294383e-01  -0.905319   2.128322
tmp_tmp_interaction -0.014099  0.015590 -0.904340  3.658151e-01  -0.044656   0.016458
pre                  0.191364  0.060136  3.182184  1.461687e-03   0.073499   0.309228
pre_pre_interaction -0.010960  0.003802 -2.882722  3.942549e-03  -0.018413  -0.003508
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/4176237218.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 145, but rank is 7
  warnings.warn('covariance of constraints does not have full '
In [35]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp5 = model.params['tmp']
coeff_tmp_tmp_interaction5 = model.params['tmp_tmp_interaction']

# Define the range of tmp based on the input data
tmp_range5 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)

# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp5 = coeff_tmp5 * tmp_range5 + coeff_tmp_tmp_interaction5 * (tmp_range5 ** 2)-6.5

# Calculate the standard errors and confidence intervals

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range5, lnyield_pred_tmp5, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()



# Extract the coefficients for pre and pre_pre_interaction
coeff_pre5 = model.params['pre']
coeff_pre_pre_interaction5 = model.params['pre_pre_interaction']

# Define the range of pre based on the input data
pre_range5 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)

# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre5 = coeff_pre5 * pre_range5 + coeff_pre_pre_interaction5 * (pre_range5 ** 2)-0.65

# Calculate the standard errors and confidence intervals

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range5, lnyield_pred_pre5, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [36]:
# Import the necessary library for setting fonts
from matplotlib import rcParams

# Set font to Arial and size to 12
rcParams['font.family'] = 'Arial'
rcParams['font.size'] = 16

# Define a list of (tmp_range, lnyield_pred) tuples for the five models
models = [(tmp_range1, lnyield_pred_tmp1), 
          (tmp_range2, lnyield_pred_tmp2), 
          (tmp_range3, lnyield_pred_tmp3), 
          (tmp_range4, lnyield_pred_tmp4), 
          (tmp_range5, lnyield_pred_tmp5)]

# Plotting the results
plt.figure(figsize=(4, 4))

# Plot the first model (One model)
plt.plot(tmp_range, lnyield_pred_tmp, label='One model', color='blue', linewidth=4)
plt.fill_between(tmp_range, ci_lower_tmp, ci_upper_tmp, color='blue', alpha=0.2, label='95% CI')

# Plot the five models
for idx, (tmp_range_, lnyield_pred_) in enumerate(models):
    if idx == 0:
        plt.plot(tmp_range_, lnyield_pred_, color='green', label='Five models')
    else:
        plt.plot(tmp_range_, lnyield_pred_, color='green')

# Add labels and title
plt.xlabel('Temperature (°C)',fontsize=18, fontfamily='Arial')
plt.ylabel('Ln Yield (kg/ha/yr)', fontsize=18, fontfamily='Arial')

x_ticks = np.linspace(tmp_range.min(), tmp_range.max(), num=5)  # Custom X tick positions
y_ticks = np.linspace(-2, 1.2, num=4)  # Custom Y tick positions

plt.xticks(ticks=x_ticks, labels=[f'{x:.1f}' for x in x_ticks])  # Custom X tick labels
plt.yticks(ticks=y_ticks, labels=[f'{y:.1f}' for y in y_ticks])  # Custom Y tick labels

plt.tick_params(axis='both', which='major', labelsize=16)  # Adjust tick label size



# Save the entire figure as a JPG with DPI 300
plt.savefig('soybean_irr_tem.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)

# Show the plot
plt.show()
In [37]:
# Set font to Arial and size to 12
rcParams['font.family'] = 'Arial'
rcParams['font.size'] = 16

# Define a list of (pre_range, lnyield_pred) tuples for the five models
models = [(pre_range1, lnyield_pred_pre1), 
          (pre_range2, lnyield_pred_pre2), 
          (pre_range3, lnyield_pred_pre3), 
          (pre_range4, lnyield_pred_pre4), 
          (pre_range5, lnyield_pred_pre5)]

# Plotting the results
plt.figure(figsize=(4, 4))

# Plot the first model (One model)
plt.plot(pre_range, lnyield_pred_pre, label='One model', color='blue', linewidth=4)
plt.fill_between(pre_range, ci_lower_pre, ci_upper_pre, color='blue', alpha=0.2, label='95% CI')

# Plot the five models
for idx, (pre_range_, lnyield_pred_) in enumerate(models):
    if idx == 0:
        plt.plot(pre_range_, lnyield_pred_, color='green', label='Five models')
    else:
        plt.plot(pre_range_, lnyield_pred_, color='green')

# Add labels and title
plt.xlabel('Precipitation (100mm)',fontsize=18, fontfamily='Arial')
plt.ylabel('Ln Yield (kg/ha/yr)', fontsize=18, fontfamily='Arial')

x_ticks = np.linspace(2, 10, num=4)  # Custom X tick positions
y_ticks = np.linspace(-0.4, 0.4, num=4)  # Custom Y tick positions

plt.xticks(ticks=x_ticks, labels=[f'{x:.1f}' for x in x_ticks])  # Custom X tick labels
plt.yticks(ticks=y_ticks, labels=[f'{y:.1f}' for y in y_ticks])  # Custom Y tick labels

plt.tick_params(axis='both', which='major', labelsize=16)  # Adjust tick label size


# Save the entire figure as a JPG with DPI 300
plt.savefig('soybean_irr_pre.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)

# Show the plot
plt.show()
In [46]:
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 1))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.755
Model:                            OLS   Adj. R-squared:                  0.712
Within R-squared:                 0.223
Method:                 Least Squares   F-statistic:                     -24926737835298.1
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               1.00e+00
Time:                        14:45:39   Log-Likelihood:                 1144.73
No. Observations:                1777   AIC:                            -1767.5
Df Residuals:                    1516   BIC:                             -336.5
Df Model:                         260                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept            5.043071  0.583724  8.639476  5.647164e-18  3.898992  6.187149
lnfer                0.050117  0.014981  3.345461  8.214596e-04  0.020756  0.079479
tmp                  0.247224  0.066761  3.703113  2.129701e-04  0.116375  0.378074
tmp_tmp_interaction -0.004787  0.001930 -2.479632  1.315180e-02 -0.008570 -0.001003
pre                  0.006076  0.022567  0.269250  7.877370e-01 -0.038154  0.050306
pre_pre_interaction  0.001494  0.001792  0.833344  4.046505e-01 -0.002019  0.005007
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2623999164.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2623999164.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2623999164.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2623999164.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2623999164.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/2623999164.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 260, but rank is 5
  warnings.warn('covariance of constraints does not have full '
In [47]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd1 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd1 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd1 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd1 = coeff_tmp_rfd1 * tmp_range_rfd1 + coeff_tmp_tmp_interaction_rfd1 * (tmp_range_rfd1 ** 2)-2.35
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd1, lnyield_pred_tmp_rfd1, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd1 = model.params['pre']
coeff_pre_pre_interaction_rfd1 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd1 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd1 = coeff_pre_rfd1 * pre_range_rfd1 + coeff_pre_pre_interaction_rfd1 * (pre_range_rfd1 ** 2)
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd1, lnyield_pred_pre_rfd1, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [48]:
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 2))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3799461747.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3799461747.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3799461747.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3799461747.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3799461747.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3799461747.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 414, but rank is 5
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.645
Model:                            OLS   Adj. R-squared:                  0.590
Within R-squared:                 0.263
Method:                 Least Squares   F-statistic:                     65260093579113.6
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        14:45:39   Log-Likelihood:                 2350.19
No. Observations:                3099   AIC:                            -3870.4
Df Residuals:                    2684   BIC:                             -1364.3
Df Model:                         414                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept            4.662776  0.517787  9.005192  2.152887e-19  3.647931  5.677620
lnfer                0.071498  0.008279  8.636499  5.796193e-18  0.055272  0.087724
tmp                  0.255181  0.054923  4.646170  3.381546e-06  0.147534  0.362828
tmp_tmp_interaction -0.005416  0.001451 -3.732438  1.896356e-04 -0.008260 -0.002572
pre                  0.189788  0.020793  9.127347  7.019874e-20  0.149034  0.230542
pre_pre_interaction -0.012937  0.001627 -7.949341  1.875062e-15 -0.016126 -0.009747
In [49]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd2 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd2 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd2 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd2 = coeff_tmp_rfd2 * tmp_range_rfd2 + coeff_tmp_tmp_interaction_rfd2 * (tmp_range_rfd2 ** 2)-2.4
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd2, lnyield_pred_tmp_rfd2, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd2 = model.params['pre']
coeff_pre_pre_interaction_rfd2 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd2 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd2 = coeff_pre_rfd2 * pre_range_rfd2 + coeff_pre_pre_interaction_rfd2 * (pre_range_rfd2 ** 2)-0.6
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd2, lnyield_pred_pre_rfd2, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [50]:
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 3))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3864630227.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3864630227.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3864630227.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3864630227.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3864630227.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3864630227.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 393, but rank is 5
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.646
Model:                            OLS   Adj. R-squared:                  0.584
Within R-squared:                 0.404
Method:                 Least Squares   F-statistic:                     9708370610756.8
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        14:45:40   Log-Likelihood:                 1478.89
No. Observations:                2629   AIC:                            -2169.8
Df Residuals:                    2235   BIC:                             144.7
Df Model:                         393                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.          z         P>|z|    [0.025    0.975]
Intercept            5.988896  0.548497  10.918750  9.377795e-28  4.913863  7.063930
lnfer                0.135684  0.011772  11.525564  9.806759e-31  0.112611  0.158758
tmp                  0.009136  0.053946   0.169361  8.655131e-01 -0.096596  0.114869
tmp_tmp_interaction  0.000200  0.001353   0.147871  8.824449e-01 -0.002451  0.002851
pre                  0.475892  0.035173  13.529879  1.041943e-41  0.406954  0.544831
pre_pre_interaction -0.030512  0.002584 -11.810234  3.455978e-32 -0.035576 -0.025449
In [51]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd3 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd3 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd3 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd3 = coeff_tmp_rfd3 * tmp_range_rfd3 + coeff_tmp_tmp_interaction_rfd3 * (tmp_range_rfd3 ** 2)+0.2
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd3, lnyield_pred_tmp_rfd3, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd3 = model.params['pre']
coeff_pre_pre_interaction_rfd3 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd3 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd3 = coeff_pre_rfd3 * pre_range_rfd3 + coeff_pre_pre_interaction_rfd3 * (pre_range_rfd3 ** 2)-1.7
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd3, lnyield_pred_pre_rfd3, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [52]:
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 4))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1697617172.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1697617172.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1697617172.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1697617172.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1697617172.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/1697617172.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 366, but rank is 5
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.732
Model:                            OLS   Adj. R-squared:                  0.658
Within R-squared:                 0.364
Method:                 Least Squares   F-statistic:                     7165590151798.0
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        14:45:40   Log-Likelihood:                 606.76
No. Observations:                1690   AIC:                            -479.5
Df Residuals:                    1323   BIC:                             1514.2
Df Model:                         366                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept            3.928053  1.903232  2.063886  3.902855e-02  0.197787  7.658320
lnfer                0.180562  0.020036  9.011745  2.028031e-19  0.141292  0.219832
tmp                  0.177480  0.168832  1.051219  2.931582e-01 -0.153426  0.508385
tmp_tmp_interaction -0.004969  0.003783 -1.313541  1.890006e-01 -0.012384  0.002446
pre                  0.460187  0.054861  8.388181  4.937205e-17  0.352661  0.567713
pre_pre_interaction -0.028826  0.003597 -8.014023  1.110160e-15 -0.035876 -0.021776
In [53]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd4 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd4 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd4 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd4 = coeff_tmp_rfd4 * tmp_range_rfd4 + coeff_tmp_tmp_interaction_rfd4 * (tmp_range_rfd4 ** 2)-1
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd4, lnyield_pred_tmp_rfd4, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd4 = model.params['pre']
coeff_pre_pre_interaction_rfd4 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd4 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd4 = coeff_pre_rfd4 * pre_range_rfd4 + coeff_pre_pre_interaction_rfd4 * (pre_range_rfd4 ** 2)-1.7
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd4, lnyield_pred_pre_rfd4, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [58]:
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 5))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.672
Model:                            OLS   Adj. R-squared:                  0.595
Within R-squared:                 0.101
Method:                 Least Squares   F-statistic:                     30224499013205.9
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        14:45:54   Log-Likelihood:                 254.73
No. Observations:                880   AIC:                            -175.5
Df Residuals:                    713   BIC:                             622.8
Df Model:                         166                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z         P>|z|     [0.025    0.975]
Intercept           -7.146240  4.875147 -1.465851  1.426888e-01 -16.701352  2.408871
lnfer                0.144921  0.023243  6.234911  4.520359e-10   0.099365  0.190477
tmp                  1.146929  0.390122  2.939922  3.282947e-03   0.382303  1.911554
tmp_tmp_interaction -0.022834  0.007780 -2.934814  3.337475e-03  -0.038083 -0.007585
pre                  0.148563  0.036180  4.106212  4.022010e-05   0.077651  0.219474
pre_pre_interaction -0.009581  0.002490 -3.847653  1.192548e-04  -0.014462 -0.004701
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3154588758.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3154588758.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3154588758.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3154588758.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3154588758.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17453/3154588758.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 166, but rank is 5
  warnings.warn('covariance of constraints does not have full '
In [59]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd5 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd5 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd5 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd5 = coeff_tmp_rfd5 * tmp_range_rfd5 + coeff_tmp_tmp_interaction_rfd5 * (tmp_range_rfd5 ** 2)-13.95
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd5, lnyield_pred_tmp_rfd5, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd5 = model.params['pre']
coeff_pre_pre_interaction_rfd5 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd5 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd5 = coeff_pre_rfd5 * pre_range_rfd5 + coeff_pre_pre_interaction_rfd5 * (pre_range_rfd5 ** 2)-0.45
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd5, lnyield_pred_pre_rfd5, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [60]:
# Set font to Arial and size to 12
rcParams['font.family'] = 'Arial'
rcParams['font.size'] = 16

# Define a list of (tmp_range, lnyield_pred) tuples for the five models
models = [(tmp_range_rfd1, lnyield_pred_tmp_rfd1), 
          (tmp_range_rfd2, lnyield_pred_tmp_rfd2), 
          (tmp_range_rfd3, lnyield_pred_tmp_rfd3), 
          (tmp_range_rfd4, lnyield_pred_tmp_rfd4), 
          (tmp_range_rfd5, lnyield_pred_tmp_rfd5)]

# Plotting the results
plt.figure(figsize=(4, 4))

# Plot the first model (One model)
plt.plot(tmp_range_rfd, lnyield_pred_tmp_rfd, label='One model', color='blue', linewidth=4)
plt.fill_between(tmp_range_rfd, ci_lower_tmp_rfd, ci_upper_tmp_rfd, color='blue', alpha=0.2, label='95% CI')

# Plot the five models
for idx, (tmp_range_, lnyield_pred_) in enumerate(models):
    if idx == 0:
        plt.plot(tmp_range_, lnyield_pred_, color='green', label='Five models')
    else:
        plt.plot(tmp_range_, lnyield_pred_, color='green')

# Add labels and title
plt.xlabel('Temperature (°C)',fontsize=18, fontfamily='Arial')
plt.ylabel('Ln Yield (kg/ha/yr)', fontsize=18, fontfamily='Arial')

x_ticks = np.linspace(tmp_range_rfd.min(), tmp_range_rfd.max(), num=5)  # Custom X tick positions
y_ticks = np.linspace(-0.6, 1, num=4)  # Custom Y tick positions

plt.xticks(ticks=x_ticks, labels=[f'{x:.1f}' for x in x_ticks])  # Custom X tick labels
plt.yticks(ticks=y_ticks, labels=[f'{y:.1f}' for y in y_ticks])  # Custom Y tick labels

plt.tick_params(axis='both', which='major', labelsize=16)  # Adjust tick label size



# Save the entire figure as a JPG with DPI 300
plt.savefig('soybean_rfd_tem.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)

# Show the plot
plt.show()
In [61]:
# Set font to Arial and size to 12
rcParams['font.family'] = 'Arial'
rcParams['font.size'] = 16

# Define a list of (pre_range, lnyield_pred) tuples for the five models
models = [(pre_range_rfd1, lnyield_pred_pre_rfd1), 
          (pre_range_rfd2, lnyield_pred_pre_rfd2), 
          (pre_range_rfd3, lnyield_pred_pre_rfd3), 
          (pre_range_rfd4, lnyield_pred_pre_rfd4), 
          (pre_range_rfd5, lnyield_pred_pre_rfd5)]

# Plotting the results
plt.figure(figsize=(4, 4))

# Plot the first model (One model)
plt.plot(pre_range_rfd, lnyield_pred_pre_rfd, label='One model', color='blue', linewidth=4)
plt.fill_between(pre_range_rfd, ci_lower_pre_rfd, ci_upper_pre_rfd, color='blue', alpha=0.2, label='95% CI')

# Plot the five models
for idx, (pre_range_, lnyield_pred_) in enumerate(models):
    if idx == 0:
        plt.plot(pre_range_, lnyield_pred_, color='green', label='Five models')
    else:
        plt.plot(pre_range_, lnyield_pred_, color='green')

# Add labels and title
plt.xlabel('Precipitation (100mm)',fontsize=18, fontfamily='Arial')
plt.ylabel('Ln Yield (kg/ha/yr)', fontsize=18, fontfamily='Arial')

x_ticks = np.linspace(2, 10, num=4)  # Custom X tick positions
y_ticks = np.linspace(-0.3, 0.2, num=5)  # Custom Y tick positions

plt.xticks(ticks=x_ticks, labels=[f'{x:.1f}' for x in x_ticks])  # Custom X tick labels
plt.yticks(ticks=y_ticks, labels=[f'{y:.1f}' for y in y_ticks])  # Custom Y tick labels

plt.tick_params(axis='both', which='major', labelsize=16)  # Adjust tick label size


# Save the entire figure as a JPG with DPI 300
plt.savefig('soybean_rfd_pre.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)

# Show the plot
plt.show()

climate zone and interaction item¶

In [34]:
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 1)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.732
Model:                            OLS   Adj. R-squared:                  0.679
Within R-squared:                 0.220
Method:                 Least Squares   F-statistic:                     140.0
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               2.14e-71
Time:                        17:06:55   Log-Likelihood:                 639.91
No. Observations:                949   AIC:                            -965.8
Df Residuals:                    792   BIC:                             -203.5
Df Model:                         156                                         
Covariance Type:               cluster                             
==============================================================================

                                           Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept                               2.789019  1.420123  1.963927  4.953856e-02  0.005628  5.572410
lnfer                                   0.080143  0.016234  4.936688  7.946044e-07  0.048325  0.111962
irrigation1                             0.141556  0.441856  0.320367  7.486900e-01 -0.724466  1.007579
irrigation12                            0.338094  0.300276  1.125945  2.601887e-01 -0.250436  0.926623
tmp                                     0.475791  0.159445  2.984050  2.844602e-03  0.163285  0.788297
tmp_tmp_interaction                    -0.012027  0.004562 -2.636388  8.379394e-03 -0.020968 -0.003086
pre                                     0.148433  0.043822  3.387179  7.061526e-04  0.062543  0.234322
pre_pre_interaction                    -0.009605  0.003571 -2.689620  7.153343e-03 -0.016604 -0.002606
lnfer_irrigation1_tmp_interaction      -0.015636  0.018682 -0.836965  4.026124e-01 -0.052252  0.020980
lnfer_irrigation12_tmp_tmp_interaction -0.000531  0.000847 -0.626245  5.311539e-01 -0.002192  0.001130
lnfer_irrigation1_pre_interaction       0.032951  0.027199  1.211498  2.257048e-01 -0.020357  0.086260
lnfer_irrigation12_pre_pre_interaction -0.001150  0.002914 -0.394539  6.931831e-01 -0.006861  0.004561
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/857463593.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 156, but rank is 11
  warnings.warn('covariance of constraints does not have full '
In [35]:
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 2)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3475410727.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.718
Model:                            OLS   Adj. R-squared:                  0.649
Within R-squared:                 0.316
Method:                 Least Squares   F-statistic:                     3326159224107.0
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        17:06:55   Log-Likelihood:                 1212.82
No. Observations:                1585   AIC:                            -1805.6
Df Residuals:                    1275   BIC:                             -141.4
Df Model:                         309                                         
Covariance Type:               cluster                             
==============================================================================

                                           Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept                               2.592114  1.209910  2.142402  3.216113e-02  0.220734  4.963494
lnfer                                   0.064939  0.017695  3.669950  2.425980e-04  0.030258  0.099620
irrigation1                             0.173287  0.224644  0.771386  4.404783e-01 -0.267007  0.613582
irrigation12                           -0.248563  0.180658 -1.375881  1.688586e-01 -0.602646  0.105519
tmp                                     0.372547  0.127112  2.930853  3.380324e-03  0.123412  0.621682
tmp_tmp_interaction                    -0.007665  0.003293 -2.327491  1.993913e-02 -0.014119 -0.001210
pre                                     0.397774  0.044993  8.840842  9.499993e-19  0.309590  0.485958
pre_pre_interaction                    -0.029778  0.003470 -8.582117  9.314284e-18 -0.036579 -0.022978
lnfer_irrigation1_tmp_interaction      -0.024499  0.032200 -0.760858  4.467420e-01 -0.087609  0.038611
lnfer_irrigation12_tmp_tmp_interaction  0.001053  0.000815  1.291736  1.964485e-01 -0.000545  0.002651
lnfer_irrigation1_pre_interaction       0.052379  0.078906  0.663820  5.068058e-01 -0.102273  0.207031
lnfer_irrigation12_pre_pre_interaction -0.005112  0.006111 -0.836408  4.029255e-01 -0.017090  0.006866
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 309, but rank is 11
  warnings.warn('covariance of constraints does not have full '
In [36]:
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 3)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1569004877.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 346, but rank is 11
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.691
Within R-squared:                 0.397
Method:                 Least Squares   F-statistic:                     423233537990.0
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        17:06:55   Log-Likelihood:                 838.45
No. Observations:                1772   AIC:                            -982.9
Df Residuals:                    1425   BIC:                             918.6
Df Model:                         346                                         
Covariance Type:               cluster                             
==============================================================================

                                               Coef.  Std.Err.          z         P>|z|    [0.025    0.975]
Intercept                               1.818522e+00  1.276758   1.424327  1.543518e-01 -0.683879  4.320922
lnfer                                   1.430024e-01  0.019184   7.454390  9.028422e-14  0.105403  0.180602
irrigation1                             1.697547e-01  0.096894   1.751958  7.978097e-02 -0.020155  0.359664
irrigation12                           -2.752325e-02  0.014778  -1.862439  6.254118e-02 -0.056488  0.001441
tmp                                     4.545799e-01  0.121412   3.744123  1.810246e-04  0.216618  0.692542
tmp_tmp_interaction                    -1.069193e-02  0.002894  -3.694690  2.201554e-04 -0.016364 -0.005020
pre                                     3.924562e-01  0.035756  10.976037  4.983096e-28  0.322376  0.462536
pre_pre_interaction                    -2.413294e-02  0.002722  -8.864334  7.696272e-19 -0.029469 -0.018797
lnfer_irrigation1_tmp_interaction       3.313665e-03  0.004712   0.703274  4.818849e-01 -0.005921  0.012549
lnfer_irrigation12_tmp_tmp_interaction -4.956403e-08  0.000029  -0.001730  9.986199e-01 -0.000056  0.000056
lnfer_irrigation1_pre_interaction      -7.708309e-02  0.019978  -3.858362  1.141494e-04 -0.116240 -0.037927
lnfer_irrigation12_pre_pre_interaction  2.357695e-03  0.000777   3.032901  2.422153e-03  0.000834  0.003881
In [37]:
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 4)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/1105145404.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 340, but rank is 11
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.605
Model:                            OLS   Adj. R-squared:                  0.538
Within R-squared:                 0.258
Method:                 Least Squares   F-statistic:                     2637640612839.9
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        17:06:56   Log-Likelihood:                 842.65
No. Observations:                2343   AIC:                            -1003.3
Df Residuals:                    2002   BIC:                             960.6
Df Model:                         340                                         
Covariance Type:               cluster                             
==============================================================================

                                           Coef.  Std.Err.          z         P>|z|    [0.025    0.975]
Intercept                              -3.513804  2.063075  -1.703187  8.853301e-02 -7.557357  0.529749
lnfer                                   0.141105  0.015489   9.110276  8.217285e-20  0.110748  0.171462
irrigation1                             0.174299  0.080622   2.161924  3.062401e-02  0.016282  0.332315
irrigation12                           -0.022010  0.011244  -1.957382  5.030263e-02 -0.044048  0.000029
tmp                                     0.870480  0.184208   4.725523  2.295234e-06  0.509439  1.231521
tmp_tmp_interaction                    -0.020199  0.004096  -4.930965  8.182442e-07 -0.028227 -0.012170
pre                                     0.360418  0.035214  10.234981  1.382222e-24  0.291399  0.429437
pre_pre_interaction                    -0.020955  0.002283  -9.180254  4.300741e-20 -0.025428 -0.016481
lnfer_irrigation1_tmp_interaction       0.002370  0.002600   0.911446  3.620603e-01 -0.002726  0.007467
lnfer_irrigation12_tmp_tmp_interaction  0.000003  0.000011   0.293160  7.694001e-01 -0.000018  0.000024
lnfer_irrigation1_pre_interaction      -0.024085  0.007587  -3.174598  1.500442e-03 -0.038955 -0.009215
lnfer_irrigation12_pre_pre_interaction  0.000268  0.000088   3.041465  2.354300e-03  0.000095  0.000441
In [38]:
# Filter data for irrigated soybean
data = df_soybean[(df_soybean['group'] == 1)&(df_soybean['zone'] == 5)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.537
Model:                            OLS   Adj. R-squared:                  0.440
Within R-squared:                 0.233
Method:                 Least Squares   F-statistic:                     298.4
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               2.25e-90
Time:                        17:06:56   Log-Likelihood:                 155.38
No. Observations:                858   AIC:                            -10.8
Df Residuals:                    708   BIC:                             702.4
Df Model:                         149                                         
Covariance Type:               cluster                             
==============================================================================

                                           Coef.  Std.Err.         z         P>|z|     [0.025     0.975]
Intercept                               1.009895  8.858575  0.114002  9.092362e-01 -16.352593  18.372384
lnfer                                   0.229537  0.026924  8.525439  1.522308e-17   0.176768   0.282307
irrigation1                             0.103771  0.116006  0.894537  3.710344e-01  -0.123595   0.331138
irrigation12                           -0.012705  0.033366 -0.380778  7.033679e-01  -0.078100   0.052691
tmp                                     0.486572  0.719888  0.675900  4.991040e-01  -0.924382   1.897526
tmp_tmp_interaction                    -0.011602  0.014526 -0.798699  4.244648e-01  -0.040072   0.016868
pre                                     0.259156  0.062784  4.127714  3.663872e-05   0.136101   0.382211
pre_pre_interaction                    -0.014688  0.004041 -3.634390  2.786387e-04  -0.022609  -0.006767
lnfer_irrigation1_tmp_interaction       0.002022  0.002848  0.709733  4.778700e-01  -0.003561   0.007605
lnfer_irrigation12_tmp_tmp_interaction  0.000004  0.000026  0.136112  8.917328e-01  -0.000048   0.000055
lnfer_irrigation1_pre_interaction      -0.016022  0.008274 -1.936389  5.282012e-02  -0.032240   0.000195
lnfer_irrigation12_pre_pre_interaction  0.000124  0.000269  0.460528  6.451375e-01  -0.000404   0.000652
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2684753267.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 149, but rank is 11
  warnings.warn('covariance of constraints does not have full '
In [39]:
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 1))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/449609041.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/449609041.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/449609041.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/449609041.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/449609041.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/449609041.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 264, but rank is 9
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.758
Model:                            OLS   Adj. R-squared:                  0.715
Within R-squared:                 0.233
Method:                 Least Squares   F-statistic:                     96.9
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               4.32e-77
Time:                        17:06:56   Log-Likelihood:                 1156.31
No. Observations:                1777   AIC:                            -1782.6
Df Residuals:                    1512   BIC:                             -329.7
Df Model:                         264                                         
Covariance Type:               cluster                             
==============================================================================

                              Coef.  Std.Err.         z     P>|z|    [0.025    0.975]
Intercept                  4.111933  1.348002  3.050391  0.002285  1.469897  6.753969
lnfer                      0.938364  0.759405  1.235657  0.216586 -0.550043  2.426771
tmp                        0.338686  0.162593  2.083033  0.037248  0.020010  0.657361
tmp_tmp_interaction       -0.008189  0.004781 -1.712763  0.086756 -0.017560  0.001182
pre                        0.111375  0.063920  1.742423  0.081434 -0.013905  0.236656
pre_pre_interaction       -0.005409  0.004971 -1.088106  0.276548 -0.015152  0.004334
lnfer_tmp_interaction     -0.102543  0.092240 -1.111696  0.266269 -0.283331  0.078244
lnfer_tmp_tmp_interaction  0.003565  0.002747  1.297981  0.194294 -0.001818  0.008948
lnfer_pre_interaction     -0.051524  0.031974 -1.611418  0.107089 -0.114193  0.011145
lnfer_pre_pre_interaction  0.003138  0.002607  1.204084  0.228557 -0.001970  0.008247
In [40]:
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 2))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3655828460.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3655828460.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3655828460.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3655828460.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3655828460.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/3655828460.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.651
Model:                            OLS   Adj. R-squared:                  0.596
Within R-squared:                 0.275
Method:                 Least Squares   F-statistic:                     98.5
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               1.11e-96
Time:                        17:06:56   Log-Likelihood:                 2376.16
No. Observations:                3099   AIC:                            -3914.3
Df Residuals:                    2680   BIC:                             -1384.0
Df Model:                         418                                         
Covariance Type:               cluster                             
==============================================================================

                              Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept                  6.016660  0.951334  6.324447  2.541408e-10  4.152080  7.881240
lnfer                     -0.709806  0.570828 -1.243467  2.136957e-01 -1.828609  0.408996
tmp                        0.151761  0.101951  1.488573  1.365999e-01 -0.048059  0.351581
tmp_tmp_interaction       -0.003432  0.002694 -1.274149  2.026107e-01 -0.008712  0.001847
pre                        0.142751  0.030351  4.703284  2.560097e-06  0.083264  0.202239
pre_pre_interaction       -0.009072  0.002438 -3.721473  1.980640e-04 -0.013850 -0.004294
lnfer_tmp_interaction      0.046797  0.061703  0.758424  4.481973e-01 -0.074138  0.167732
lnfer_tmp_tmp_interaction -0.000562  0.001637 -0.343212  7.314389e-01 -0.003770  0.002646
lnfer_pre_interaction      0.036090  0.025593  1.410167  1.584905e-01 -0.014071  0.086250
lnfer_pre_pre_interaction -0.002996  0.002022 -1.481616  1.384426e-01 -0.006960  0.000967
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 418, but rank is 9
  warnings.warn('covariance of constraints does not have full '
In [41]:
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 3))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2831458825.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2831458825.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2831458825.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2831458825.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2831458825.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2831458825.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.662
Model:                            OLS   Adj. R-squared:                  0.602
Within R-squared:                 0.431
Method:                 Least Squares   F-statistic:                     40555524311549.3
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        17:06:57   Log-Likelihood:                 1539.75
No. Observations:                2629   AIC:                            -2283.5
Df Residuals:                    2231   BIC:                             54.5
Df Model:                         397                                         
Covariance Type:               cluster                             
==============================================================================

                              Coef.  Std.Err.         z         P>|z|    [0.025     0.975]
Intercept                  9.859382  0.986244  9.996894  1.572514e-23  7.926378  11.792385
lnfer                     -2.498688  0.510061 -4.898805  9.642113e-07 -3.498388  -1.498987
tmp                       -0.298623  0.097980 -3.047789  2.305321e-03 -0.490661  -0.106585
tmp_tmp_interaction        0.007264  0.002446  2.970185  2.976205e-03  0.002471   0.012058
pre                        0.357796  0.052796  6.776911  1.227725e-11  0.254317   0.461275
pre_pre_interaction       -0.024120  0.003909 -6.171033  6.784519e-10 -0.031780  -0.016459
lnfer_tmp_interaction      0.221760  0.048105  4.609935  4.027952e-06  0.127476   0.316044
lnfer_tmp_tmp_interaction -0.005130  0.001174 -4.370418  1.240088e-05 -0.007431  -0.002830
lnfer_pre_interaction      0.061620  0.026551  2.320819  2.029663e-02  0.009581   0.113659
lnfer_pre_pre_interaction -0.003340  0.001917 -1.742288  8.145813e-02 -0.007097   0.000417
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 397, but rank is 9
  warnings.warn('covariance of constraints does not have full '
In [42]:
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 4))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2237879532.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2237879532.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2237879532.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2237879532.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2237879532.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/2237879532.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.735
Model:                            OLS   Adj. R-squared:                  0.661
Within R-squared:                 0.372
Method:                 Least Squares   F-statistic:                     390901626296.6
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        17:06:57   Log-Likelihood:                 616.82
No. Observations:                1690   AIC:                            -491.6
Df Residuals:                    1319   BIC:                             1523.8
Df Model:                         370                                         
Covariance Type:               cluster                             
==============================================================================

                              Coef.  Std.Err.         z     P>|z|    [0.025     0.975]
Intercept                  6.329019  3.929353  1.610703  0.107245 -1.372371  14.030410
lnfer                     -1.002779  1.616194 -0.620457  0.534957 -4.170462   2.164903
tmp                       -0.013749  0.350351 -0.039243  0.968697 -0.700423   0.672926
tmp_tmp_interaction       -0.001632  0.007999 -0.204043  0.838320 -0.017310   0.014046
pre                        0.523433  0.131485  3.980927  0.000069  0.265727   0.781139
pre_pre_interaction       -0.032636  0.008597 -3.796332  0.000147 -0.049485  -0.015787
lnfer_tmp_interaction      0.099726  0.143031  0.697230  0.485659 -0.180610   0.380062
lnfer_tmp_tmp_interaction -0.001795  0.003238 -0.554314  0.579364 -0.008141   0.004551
lnfer_pre_interaction     -0.039611  0.051298 -0.772177  0.440009 -0.140154   0.060931
lnfer_pre_pre_interaction  0.002444  0.003376  0.723855  0.469155 -0.004173   0.009060
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 370, but rank is 9
  warnings.warn('covariance of constraints does not have full '
In [43]:
# Filter data for rainfed soybean
data = df_soybean[(df_soybean['group'] == 2)&((df_soybean['zone'] == 5))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.678
Model:                            OLS   Adj. R-squared:                  0.601
Within R-squared:                 0.119
Method:                 Least Squares   F-statistic:                     22.3
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               2.81e-24
Time:                        17:06:57   Log-Likelihood:                 263.68
No. Observations:                880   AIC:                            -185.4
Df Residuals:                    709   BIC:                             632.0
Df Model:                         170                                         
Covariance Type:               cluster                             
==============================================================================

                               Coef.   Std.Err.         z     P>|z|     [0.025     0.975]
Intercept                 -11.462215  10.966653 -1.045188  0.295936 -32.956461  10.032030
lnfer                       2.805500   5.374390  0.522013  0.601661  -7.728111  13.339112
tmp                         1.390408   0.885051  1.570991  0.116185  -0.344261   3.125076
tmp_tmp_interaction        -0.027157   0.017715 -1.532982  0.125280  -0.061878   0.007564
pre                         0.438126   0.107906  4.060258  0.000049   0.226634   0.649617
pre_pre_interaction        -0.029398   0.007329 -4.011491  0.000060  -0.043762  -0.015035
lnfer_tmp_interaction      -0.156050   0.435796 -0.358082  0.720282  -1.010195   0.698094
lnfer_tmp_tmp_interaction   0.002869   0.008765  0.327329  0.743419  -0.014311   0.020049
lnfer_pre_interaction      -0.159927   0.049943 -3.202202  0.001364  -0.257813  -0.062041
lnfer_pre_pre_interaction   0.010881   0.003288  3.309123  0.000936   0.004436   0.017326
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/969849701.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/969849701.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/969849701.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/969849701.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/969849701.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3145/969849701.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 170, but rank is 9
  warnings.warn('covariance of constraints does not have full '
In [ ]: