In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import netCDF4 as nc
import pandas as pd
import os
import csv
from glob import glob
import xarray as xr
import matplotlib.ticker as ticker 

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

import re
from datetime import datetime
In [2]:
# Step 1: Data Preparation
# Load your dataset into a pandas DataFrame
os.chdir('/Users/chenchenren/postdoc/paper/2N and water-US/regression/')
In [3]:
# Filter the dataframe for rows where crop is 'corn'
# Path to the Excel file
excel_file = "./corn_reg_data.csv"
# Read the specific sheet from the Excel file
ds = pd.read_csv(excel_file)

df_corn =ds
print(df_corn)
         geoid    year        tmp        pre  irrigation1   lnyield     lnfer  \
0       1001.0  2011.0  22.514565   7.140138     0.502789  8.849908  6.394569   
1       1001.0  2013.0  21.656385  10.242795     0.002722  8.737623  6.622513   
2       1001.0  2017.0  22.842094  11.152807     0.018559  9.279970  6.419519   
3       1001.0  2018.0  22.934221  10.243298     1.663942  9.249052  6.429010   
4       1001.0  2019.0  23.288399   8.607845     0.354742  9.249052  6.419507   
...        ...     ...        ...        ...          ...       ...       ...   
19939  56031.0  2011.0  12.459663   3.571071     0.000000  9.105812  6.221997   
19940  56031.0  2014.0  12.139701   3.237197     0.000000  9.166642  6.244471   
19941  56043.0  2008.0  12.025408   2.338907     0.000000  9.182902  6.500314   
19942  56043.0  2011.0  12.087538   2.497200     0.000000  9.025305  6.809824   
19943  56043.0  2013.0  12.322425   2.396772     0.000000  9.164016  6.837906   

       group  zone  
0          1   5.0  
1          1   5.0  
2          1   5.0  
3          1   5.0  
4          1   5.0  
...      ...   ...  
19939      2   1.0  
19940      2   1.0  
19941      2   1.0  
19942      2   1.0  
19943      2   1.0  

[19944 rows x 9 columns]
In [4]:
excel_file = "./soybean_reg_data.csv"
# Read the specific sheet from the Excel file
ds = pd.read_csv(excel_file)
df_soybean = ds
print(df_soybean)
         geoid    year        tmp       pre  irrigation1   lnyield     lnfer  \
0       1001.0  2009.0  24.140660  8.808839     0.718851  7.609614  2.022537   
1       1001.0  2010.0  25.675484  5.438990     0.830087  7.035731  1.910005   
2       1001.0  2011.0  24.544823  5.270162     0.595086  7.466514  2.025641   
3       1001.0  2013.0  24.036592  7.514143     0.001241  7.926855  2.906756   
4       1003.0  2008.0  24.727230  8.675528     0.001980  7.934111  1.240602   
...        ...     ...        ...       ...          ...       ...       ...   
17577  55141.0  2016.0  17.486904  7.304833     0.000000  8.134343  2.860158   
17578  55141.0  2017.0  16.590237  6.467205     0.000000  7.978877  2.633171   
17579  55141.0  2018.0  17.266659  8.373576     0.000000  7.838077  2.639034   
17580  55141.0  2019.0  16.238592  8.735889     0.000000  7.912185  2.671287   
17581  55141.0  2020.0  16.202635  6.832954     0.000000  8.173033  2.625827   

       group  zone  
0          1   5.0  
1          1   5.0  
2          1   5.0  
3          1   5.0  
4          1   5.0  
...      ...   ...  
17577      2   1.0  
17578      2   1.0  
17579      2   1.0  
17580      2   1.0  
17581      2   1.0  

[17582 rows x 9 columns]

All sample regression¶

In [5]:
# Filter data for irrigated corn
data = df_corn[df_corn['group'] == 1]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1274286391.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1274286391.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1274286391.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1274286391.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1274286391.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1274286391.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1274286391.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1508, but rank is 7
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.645
Model:                            OLS   Adj. R-squared:                  0.589
Within R-squared:                 0.177
Method:                 Least Squares   F-statistic:                     3255195358504.2
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        14:46:50   Log-Likelihood:                 2533.88
No. Observations:                11044   AIC:                            -2049.8
Df Residuals:                    9535   BIC:                             8980.5
Df Model:                         1508                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.          z         P>|z|    [0.025    0.975]
Intercept            5.111854  0.307692  16.613552  5.560121e-62  4.508789  5.714919
lnfer                0.460742  0.040436  11.394350  4.461466e-30  0.381489  0.539995
irrigation1          0.033407  0.007911   4.222728  2.413634e-05  0.017901  0.048913
irrigation12        -0.001540  0.000807  -1.909466  5.620195e-02 -0.003121  0.000041
tmp                  0.013745  0.022950   0.598910  5.492332e-01 -0.031237  0.058727
tmp_tmp_interaction -0.000760  0.000673  -1.129258  2.587892e-01 -0.002079  0.000559
pre                  0.188568  0.012528  15.051895  3.354859e-51  0.164014  0.213122
pre_pre_interaction -0.007849  0.000711 -11.044381  2.333717e-28 -0.009242 -0.006456
In [6]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp = model.params['tmp']
coeff_tmp_tmp_interaction = model.params['tmp_tmp_interaction']

# Define the range of tmp based on the input data
tmp_range = np.linspace(data['tmp'].min(), data['tmp'].max(), 100)

# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp = coeff_tmp * tmp_range + coeff_tmp_tmp_interaction * (tmp_range ** 2)

# Calculate the standard errors and confidence intervals
# Get the covariance matrix of the parameters
cov_matrix = model.cov_params()
# Calculate the variance of the prediction
pred_var = (cov_matrix.loc['tmp', 'tmp'] * tmp_range**2 +
            2 * cov_matrix.loc['tmp', 'tmp_tmp_interaction'] * tmp_range**3 +
            cov_matrix.loc['tmp_tmp_interaction', 'tmp_tmp_interaction'] * tmp_range**4)
# Calculate the standard error
pred_se = np.sqrt(pred_var)
# Calculate the 95% confidence interval
ci_lower_tmp = lnyield_pred_tmp - 1.96 * pred_se
ci_upper_tmp = lnyield_pred_tmp + 1.96 * pred_se

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range, lnyield_pred_tmp, label='Predicted lnyield', color='blue')
plt.fill_between(tmp_range, ci_lower_tmp, ci_upper_tmp, color='blue', alpha=0.2, label='95% CI')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
In [7]:
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre = model.params['pre']
coeff_pre_pre_interaction = model.params['pre_pre_interaction']

# Define the range of pre based on the input data
pre_range = np.linspace(np.percentile(data['pre'],1), np.percentile(data['pre'],99), 100)

# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre = coeff_pre * pre_range + coeff_pre_pre_interaction * (pre_range ** 2)-0.8

# Calculate the standard errors and confidence intervals
# Get the covariance matrix of the parameters
cov_matrix = model.cov_params()
# Calculate the variance of the prediction
pred_var = (cov_matrix.loc['pre', 'pre'] * pre_range**2 +
            2 * cov_matrix.loc['pre', 'pre_pre_interaction'] * pre_range**3 +
            cov_matrix.loc['pre_pre_interaction', 'pre_pre_interaction'] * pre_range**4)
# Calculate the standard error
pred_se = np.sqrt(pred_var)
# Calculate the 95% confidence interval
ci_lower_pre = lnyield_pred_pre - 1.96 * pred_se
ci_upper_pre = lnyield_pred_pre + 1.96 * pred_se

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range, lnyield_pred_pre, label='Predicted lnyield', color='blue')
plt.fill_between(pre_range, ci_lower_pre, ci_upper_pre, color='blue', alpha=0.2, label='95% CI')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [8]:
# Filter data for rainfed corn
data = df_corn[df_corn['group'] == 2]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2811297337.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2811297337.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2811297337.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2811297337.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2811297337.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2811297337.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 1349, but rank is 5
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.679
Model:                            OLS   Adj. R-squared:                  0.622
Within R-squared:                 0.213
Method:                 Least Squares   F-statistic:                     34780076720926.6
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        14:46:58   Log-Likelihood:                 1728.19
No. Observations:                8900   AIC:                            -756.4
Df Residuals:                    7550   BIC:                             8820.3
Df Model:                         1349                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.          z         P>|z|    [0.025    0.975]
Intercept            4.172842  0.286888  14.545210  6.263815e-48  3.610552  4.735131
lnfer                0.607814  0.046423  13.092929  3.613962e-39  0.516826  0.698802
tmp                  0.147547  0.024813   5.946388  2.741238e-09  0.098915  0.196179
tmp_tmp_interaction -0.005172  0.000751  -6.886305  5.726009e-12 -0.006644 -0.003700
pre                  0.166593  0.014075  11.835963  2.544069e-32  0.139006  0.194180
pre_pre_interaction -0.007199  0.000845  -8.516098  1.650193e-17 -0.008856 -0.005543
In [9]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp = model.params['tmp']
coeff_tmp_tmp_interaction = model.params['tmp_tmp_interaction']

# Define the range of tmp based on the input data
tmp_range_rfd = np.linspace(data['tmp'].min(), data['tmp'].max(), 100)

# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd = coeff_tmp * tmp_range_rfd + coeff_tmp_tmp_interaction * (tmp_range_rfd ** 2)-0.7

# Calculate the standard errors and confidence intervals
# Get the covariance matrix of the parameters
cov_matrix = model.cov_params()
# Calculate the variance of the prediction
pred_var = (cov_matrix.loc['tmp', 'tmp'] * tmp_range_rfd**2 +
            2 * cov_matrix.loc['tmp', 'tmp_tmp_interaction'] * tmp_range_rfd**3 +
            cov_matrix.loc['tmp_tmp_interaction', 'tmp_tmp_interaction'] * tmp_range_rfd**4)
# Calculate the standard error
pred_se = np.sqrt(pred_var)
# Calculate the 95% confidence interval
ci_lower_tmp_rfd = lnyield_pred_tmp_rfd - 1.96 * pred_se
ci_upper_tmp_rfd = lnyield_pred_tmp_rfd + 1.96 * pred_se

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd, lnyield_pred_tmp_rfd, label='Predicted lnyield', color='blue')
plt.fill_between(tmp_range_rfd, ci_lower_tmp_rfd, ci_upper_tmp_rfd, color='blue', alpha=0.2, label='95% CI')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()
In [10]:
# Extract the coefficients for pre and pre_pre_interaction
coeff_pre = model.params['pre']
coeff_pre_pre_interaction = model.params['pre_pre_interaction']

# Define the range of pre based on the input data
pre_range_rfd = np.linspace(np.percentile(data['pre'],1), np.percentile(data['pre'],99), 100)

# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd = coeff_pre * pre_range_rfd + coeff_pre_pre_interaction * (pre_range_rfd ** 2)-0.8

# Calculate the standard errors and confidence intervals
# Get the covariance matrix of the parameters
cov_matrix = model.cov_params()
# Calculate the variance of the prediction
pred_var = (cov_matrix.loc['pre', 'pre'] * pre_range_rfd**2 +
            2 * cov_matrix.loc['pre', 'pre_pre_interaction'] * pre_range_rfd**3 +
            cov_matrix.loc['pre_pre_interaction', 'pre_pre_interaction'] * pre_range_rfd**4)
# Calculate the standard error
pred_se = np.sqrt(pred_var)
# Calculate the 95% confidence interval
ci_lower_pre_rfd = lnyield_pred_pre_rfd - 1.96 * pred_se
ci_upper_pre_rfd = lnyield_pred_pre_rfd + 1.96 * pred_se

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd, lnyield_pred_pre_rfd, label='Predicted lnyield', color='blue')
plt.fill_between(pre_range_rfd, ci_lower_pre_rfd, ci_upper_pre_rfd, color='blue', alpha=0.2, label='95% CI')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()

Divide by climate zone¶

In [11]:
# Filter data for irrigated corn
data = df_corn[(df_corn['group'] == 1)&(df_corn['zone'] == 1)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.767
Model:                            OLS   Adj. R-squared:                  0.736
Within R-squared:                 0.195
Method:                 Least Squares   F-statistic:                     106.8
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               2.33e-57
Time:                        14:46:58   Log-Likelihood:                 982.95
No. Observations:                1452   AIC:                            -1625.9
Df Residuals:                    1282   BIC:                             -728.2
Df Model:                         169                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept            3.052680  0.893944  3.414845  6.381827e-04  1.300582  4.804778
lnfer                0.276984  0.051838  5.343288  9.127539e-08  0.175384  0.378585
irrigation1          0.096729  0.032794  2.949605  3.181808e-03  0.032454  0.161005
irrigation12        -0.007572  0.003135 -2.415187  1.572714e-02 -0.013717 -0.001427
tmp                  0.526457  0.109636  4.801871  1.571897e-06  0.311575  0.741339
tmp_tmp_interaction -0.017897  0.003878 -4.614738  3.935915e-06 -0.025499 -0.010296
pre                  0.234201  0.042718  5.482543  4.192551e-08  0.150476  0.317926
pre_pre_interaction -0.014771  0.002948 -5.010080  5.440749e-07 -0.020550 -0.008993
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2640714645.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2640714645.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2640714645.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2640714645.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2640714645.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2640714645.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2640714645.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 169, but rank is 7
  warnings.warn('covariance of constraints does not have full '
In [12]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp1 = model.params['tmp']
coeff_tmp_tmp_interaction1 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range1 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp1 = coeff_tmp1 * tmp_range1 + coeff_tmp_tmp_interaction1 * (tmp_range1 ** 2)-3.8
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range1, lnyield_pred_tmp1, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre1 = model.params['pre']
coeff_pre_pre_interaction1 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range1 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre1 = coeff_pre1 * pre_range1 + coeff_pre_pre_interaction1 * (pre_range1 ** 2)-0.8
# Calculate the standard errors and confidence intervals
# Plotting the resultsplt.figure(figsize=(10, 6))
plt.plot(pre_range1, lnyield_pred_pre1, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [13]:
# Filter data for irrigated corn
data = df_corn[(df_corn['group'] == 1)&(df_corn['zone'] == 2)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2701290221.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2701290221.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2701290221.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2701290221.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2701290221.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2701290221.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2701290221.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.770
Model:                            OLS   Adj. R-squared:                  0.731
Within R-squared:                 0.291
Method:                 Least Squares   F-statistic:                     -757983258881.0
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               1.00e+00
Time:                        14:46:59   Log-Likelihood:                 1684.51
No. Observations:                2657   AIC:                            -2605.0
Df Residuals:                    2275   BIC:                             -357.0
Df Model:                         381                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.          z         P>|z|    [0.025    0.975]
Intercept            3.933547  0.565241   6.959065  3.425395e-12  2.825695  5.041398
lnfer                0.313007  0.063239   4.949596  7.436765e-07  0.189061  0.436952
irrigation1          0.024007  0.017712   1.355382  1.752960e-01 -0.010709  0.058722
irrigation12        -0.000612  0.001426  -0.429518  6.675460e-01 -0.003406  0.002182
tmp                  0.275186  0.056512   4.869494  1.118842e-06  0.164424  0.385948
tmp_tmp_interaction -0.008927  0.001766  -5.056012  4.281136e-07 -0.012387 -0.005466
pre                  0.335094  0.030346  11.042593  2.380621e-28  0.275618  0.394570
pre_pre_interaction -0.019440  0.001906 -10.200468  1.973184e-24 -0.023175 -0.015705
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 381, but rank is 7
  warnings.warn('covariance of constraints does not have full '
In [14]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp2 = model.params['tmp']
coeff_tmp_tmp_interaction2 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range2 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp2 = coeff_tmp2 * tmp_range2 + coeff_tmp_tmp_interaction2 * (tmp_range2 ** 2)-2.1
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range2, lnyield_pred_tmp2, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre2 = model.params['pre']
coeff_pre_pre_interaction2 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range2 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre2 = coeff_pre2 * pre_range2 + coeff_pre_pre_interaction2 * (pre_range2 ** 2)-1.2
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range2, lnyield_pred_pre2, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [51]:
# Filter data for irrigated corn
data = df_corn[(df_corn['group'] == 1)&(df_corn['zone'] == 3)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/867551411.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/867551411.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/867551411.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/867551411.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/867551411.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/867551411.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/867551411.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.619
Model:                            OLS   Adj. R-squared:                  0.553
Within R-squared:                 0.262
Method:                 Least Squares   F-statistic:                     -2559438698182.4
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               1.00e+00
Time:                        14:49:30   Log-Likelihood:                 715.60
No. Observations:                2826   AIC:                            -603.2
Df Residuals:                    2412   BIC:                             1858.7
Df Model:                         413                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept            6.439422  0.661505  9.734504  2.148642e-22  5.142896  7.735948
lnfer                0.465042  0.083678  5.557544  2.735973e-08  0.301037  0.629048
irrigation1          0.023505  0.018396  1.277732  2.013441e-01 -0.012550  0.059560
irrigation12        -0.000226  0.001662 -0.135874  8.919207e-01 -0.003483  0.003032
tmp                 -0.007906  0.065092 -0.121466  9.033219e-01 -0.135484  0.119671
tmp_tmp_interaction -0.001382  0.001846 -0.748837  4.539556e-01 -0.005000  0.002235
pre                  0.251086  0.028952  8.672475  4.228260e-18  0.194341  0.307831
pre_pre_interaction -0.010942  0.001708 -6.408144  1.473022e-10 -0.014289 -0.007596
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 413, but rank is 7
  warnings.warn('covariance of constraints does not have full '
In [52]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp3 = model.params['tmp']
coeff_tmp_tmp_interaction3 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range3 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp3 = coeff_tmp3 * tmp_range3 + coeff_tmp_tmp_interaction3 * (tmp_range3 ** 2)+0.5
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range3, lnyield_pred_tmp3, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre3 = model.params['pre']
coeff_pre_pre_interaction3 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range3 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre3 = coeff_pre3 * pre_range3 + coeff_pre_pre_interaction3 * (pre_range3 ** 2)-1.1
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range3, lnyield_pred_pre3, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [53]:
# Filter data for irrigated corn
data = df_corn[(df_corn['group'] == 1)&(df_corn['zone'] == 4)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3225580690.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3225580690.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3225580690.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3225580690.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3225580690.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3225580690.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3225580690.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 380, but rank is 7
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.556
Model:                            OLS   Adj. R-squared:                  0.489
Within R-squared:                 0.233
Method:                 Least Squares   F-statistic:                     2085587338502.0
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        14:49:31   Log-Likelihood:                 99.75
No. Observations:                2908   AIC:                            562.5
Df Residuals:                    2527   BIC:                             2839.1
Df Model:                         380                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept            3.279064  1.390753  2.357762  1.838550e-02  0.553238  6.004890
lnfer                0.694910  0.105472  6.588591  4.440189e-11  0.488189  0.901630
irrigation1          0.034188  0.012442  2.747852  5.998706e-03  0.009803  0.058573
irrigation12        -0.001114  0.001390 -0.801063  4.230950e-01 -0.003839  0.001611
tmp                 -0.072687  0.128949 -0.563688  5.729664e-01 -0.325421  0.180048
tmp_tmp_interaction  0.001571  0.003225  0.486982  6.262710e-01 -0.004750  0.007891
pre                  0.362097  0.039210  9.234787  2.588012e-20  0.285247  0.438948
pre_pre_interaction -0.016332  0.002041 -8.001175  1.232376e-15 -0.020333 -0.012332
In [54]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp4 = model.params['tmp']
coeff_tmp_tmp_interaction4 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range4 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp4 = coeff_tmp4 * tmp_range4 + coeff_tmp_tmp_interaction4 * (tmp_range4 ** 2)+0.9
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range4, lnyield_pred_tmp4, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre4 = model.params['pre']
coeff_pre_pre_interaction4 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range4 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre4 = coeff_pre4 * pre_range4 + coeff_pre_pre_interaction4 * (pre_range4 ** 2)-1.7
# Calculate the standard errors and confidence intervals
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range4, lnyield_pred_pre4, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [55]:
# Filter data for irrigated corn
data = df_corn[(df_corn['group'] == 1)&(df_corn['zone'] == 5)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.656
Model:                            OLS   Adj. R-squared:                  0.591
Within R-squared:                 0.092
Method:                 Least Squares   F-statistic:                     3743719045357.8
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        14:49:31   Log-Likelihood:                 102.65
No. Observations:                1201   AIC:                            174.7
Df Residuals:                    1011   BIC:                             1142.0
Df Model:                         189                                         
Covariance Type:               cluster                             
==============================================================================

                         Coef.  Std.Err.         z         P>|z|     [0.025     0.975]
Intercept            13.559728  1.563309  8.673733  4.181786e-18  10.495698  16.623758
lnfer                 0.222588  0.064952  3.426950  6.104016e-04   0.095284   0.349892
irrigation1          -0.001439  0.016073 -0.089508  9.286785e-01  -0.032940   0.030063
irrigation12         -0.000545  0.001343 -0.405702  6.849615e-01  -0.003177   0.002087
tmp                  -0.632408  0.139262 -4.541141  5.595053e-06  -0.905356  -0.359459
tmp_tmp_interaction   0.015438  0.003119  4.949351  7.446145e-07   0.009325   0.021552
pre                   0.062186  0.029144  2.133737  3.286435e-02   0.005064   0.119307
pre_pre_interaction  -0.001045  0.001525 -0.685619  4.929534e-01  -0.004034   0.001943
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/746417828.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/746417828.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/746417828.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/746417828.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/746417828.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/746417828.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/746417828.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 189, but rank is 7
  warnings.warn('covariance of constraints does not have full '
In [56]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp5 = model.params['tmp']
coeff_tmp_tmp_interaction5 = model.params['tmp_tmp_interaction']

# Define the range of tmp based on the input data
tmp_range5 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)

# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp5 = coeff_tmp5 * tmp_range5 + coeff_tmp_tmp_interaction5 * (tmp_range5 ** 2)+6.4

# Calculate the standard errors and confidence intervals

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range5, lnyield_pred_tmp5, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()



# Extract the coefficients for pre and pre_pre_interaction
coeff_pre5 = model.params['pre']
coeff_pre_pre_interaction5 = model.params['pre_pre_interaction']

# Define the range of pre based on the input data
pre_range5 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)

# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre5 = coeff_pre5 * pre_range5 + coeff_pre_pre_interaction5 * (pre_range5 ** 2)-0.3

# Calculate the standard errors and confidence intervals

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range5, lnyield_pred_pre5, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [57]:
# Import the necessary library for setting fonts
from matplotlib import rcParams

# Set font to Arial and size to 12
rcParams['font.family'] = 'Arial'
rcParams['font.size'] = 16

# Define a list of (tmp_range, lnyield_pred) tuples for the five models
models = [(tmp_range1, lnyield_pred_tmp1), 
          (tmp_range2, lnyield_pred_tmp2), 
          (tmp_range3, lnyield_pred_tmp3), 
          (tmp_range4, lnyield_pred_tmp4), 
          (tmp_range5, lnyield_pred_tmp5)]

# Plotting the results
plt.figure(figsize=(4, 4))

# Plot the first model (One model)
plt.plot(tmp_range, lnyield_pred_tmp, label='One model', color='blue', linewidth=4)
plt.fill_between(tmp_range, ci_lower_tmp, ci_upper_tmp, color='blue', alpha=0.2, label='95% CI')

# Plot the five models
for idx, (tmp_range_, lnyield_pred_) in enumerate(models):
    if idx == 0:
        plt.plot(tmp_range_, lnyield_pred_, color='green', label='Five models')
    else:
        plt.plot(tmp_range_, lnyield_pred_, color='green')

# Add labels and title
plt.xlabel('Temperature (°C)',fontsize=18, fontfamily='Arial')
plt.ylabel('Ln Yield (kg/ha/yr)', fontsize=18, fontfamily='Arial')

x_ticks = np.linspace(tmp_range.min(), tmp_range.max(), num=5)  # Custom X tick positions
y_ticks = np.linspace(-2, 2.5, num=5)  # Custom Y tick positions

plt.xticks(ticks=x_ticks, labels=[f'{x:.1f}' for x in x_ticks])  # Custom X tick labels
plt.yticks(ticks=y_ticks, labels=[f'{y:.1f}' for y in y_ticks])  # Custom Y tick labels

plt.tick_params(axis='both', which='major', labelsize=16)  # Adjust tick label size



# Save the entire figure as a JPG with DPI 300
plt.savefig('maize_irr_tem.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)

# Show the plot
plt.show()
In [58]:
# Set font to Arial and size to 12
rcParams['font.family'] = 'Arial'
rcParams['font.size'] = 16

# Define a list of (pre_range, lnyield_pred) tuples for the five models
models = [(pre_range1, lnyield_pred_pre1), 
          (pre_range2, lnyield_pred_pre2), 
          (pre_range3, lnyield_pred_pre3), 
          (pre_range4, lnyield_pred_pre4), 
          (pre_range5, lnyield_pred_pre5)]

# Plotting the results
plt.figure(figsize=(4, 4))

# Plot the first model (One model)
plt.plot(pre_range, lnyield_pred_pre, label='One model', color='blue', linewidth=4)
plt.fill_between(pre_range, ci_lower_pre, ci_upper_pre, color='blue', alpha=0.2, label='95% CI')

# Plot the five models
for idx, (pre_range_, lnyield_pred_) in enumerate(models):
    if idx == 0:
        plt.plot(pre_range_, lnyield_pred_, color='green', label='Five models')
    else:
        plt.plot(pre_range_, lnyield_pred_, color='green')

# Add labels and title
plt.xlabel('Precipitation (100mm)',fontsize=18, fontfamily='Arial')
plt.ylabel('Ln Yield (kg/ha/yr)', fontsize=18, fontfamily='Arial')

x_ticks = np.linspace(2, 12, num=5)  # Custom X tick positions
y_ticks = np.linspace(-0.4, 0.4, num=4)  # Custom Y tick positions

plt.xticks(ticks=x_ticks, labels=[f'{x:.1f}' for x in x_ticks])  # Custom X tick labels
plt.yticks(ticks=y_ticks, labels=[f'{y:.1f}' for y in y_ticks])  # Custom Y tick labels

plt.tick_params(axis='both', which='major', labelsize=16)  # Adjust tick label size

plt.legend()
# Save the entire figure as a JPG with DPI 300
plt.savefig('maize_irr_pre.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)

# Show the plot
plt.show()
In [59]:
# Filter data for rainfed corn
data = df_corn[(df_corn['group'] == 2)&((df_corn['zone'] == 1))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.802
Model:                            OLS   Adj. R-squared:                  0.771
Within R-squared:                 0.135
Method:                 Least Squares   F-statistic:                     -12799057702605.9
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               1.00e+00
Time:                        14:49:41   Log-Likelihood:                 964.68
No. Observations:                1613   AIC:                            -1495.4
Df Residuals:                    1396   BIC:                             -326.6
Df Model:                         216                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.          z         P>|z|    [0.025    0.975]
Intercept            5.252103  0.455395  11.533069  8.988165e-31  4.359545  6.144661
lnfer                0.394333  0.048448   8.139324  3.974923e-16  0.299377  0.489289
tmp                  0.198333  0.050717   3.910591  9.207056e-05  0.098930  0.297736
tmp_tmp_interaction -0.006274  0.001805  -3.475439  5.100177e-04 -0.009813 -0.002736
pre                  0.060279  0.021031   2.866151  4.154958e-03  0.019058  0.101499
pre_pre_interaction -0.002577  0.001448  -1.779549  7.514987e-02 -0.005415  0.000261
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1034416423.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1034416423.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1034416423.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1034416423.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1034416423.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1034416423.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 216, but rank is 5
  warnings.warn('covariance of constraints does not have full '
In [60]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd1 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd1 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd1 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd1 = coeff_tmp_rfd1 * tmp_range_rfd1 + coeff_tmp_tmp_interaction_rfd1 * (tmp_range_rfd1 ** 2)-1.2
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd1, lnyield_pred_tmp_rfd1, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd1 = model.params['pre']
coeff_pre_pre_interaction_rfd1 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd1 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd1 = coeff_pre_rfd1 * pre_range_rfd1 + coeff_pre_pre_interaction_rfd1 * (pre_range_rfd1 ** 2)-0.25
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd1, lnyield_pred_pre_rfd1, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [61]:
# Filter data for rainfed corn
data = df_corn[(df_corn['group'] == 2)&((df_corn['zone'] == 2))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3422232410.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3422232410.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3422232410.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3422232410.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3422232410.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3422232410.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 343, but rank is 5
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.642
Model:                            OLS   Adj. R-squared:                  0.591
Within R-squared:                 0.356
Method:                 Least Squares   F-statistic:                     -2330358109381.1
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               1.00e+00
Time:                        14:49:42   Log-Likelihood:                 1468.85
No. Observations:                2765   AIC:                            -2249.7
Df Residuals:                    2421   BIC:                             -211.6
Df Model:                         343                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.          z         P>|z|    [0.025    0.975]
Intercept           -1.102642  0.775462  -1.421917  1.550503e-01 -2.622519  0.417235
lnfer                0.927906  0.088061  10.537055  5.829597e-26  0.755309  1.100503
tmp                  0.467413  0.061618   7.585637  3.308585e-14  0.346644  0.588183
tmp_tmp_interaction -0.015473  0.001927  -8.027960  9.910679e-16 -0.019251 -0.011696
pre                  0.147393  0.021095   6.987085  2.806557e-12  0.106047  0.188738
pre_pre_interaction -0.006740  0.001355  -4.974587  6.538695e-07 -0.009396 -0.004085
In [62]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd2 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd2 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd2 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd2 = coeff_tmp_rfd2 * tmp_range_rfd2 + coeff_tmp_tmp_interaction_rfd2 * (tmp_range_rfd2 ** 2)-3.2
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd2, lnyield_pred_tmp_rfd2, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd2 = model.params['pre']
coeff_pre_pre_interaction_rfd2 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd2 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd2 = coeff_pre_rfd2 * pre_range_rfd2 + coeff_pre_pre_interaction_rfd2 * (pre_range_rfd2 ** 2)-0.65
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd2, lnyield_pred_pre_rfd2, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [63]:
# Filter data for rainfed corn
data = df_corn[(df_corn['group'] == 2)&((df_corn['zone'] == 3))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1910022112.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1910022112.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1910022112.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1910022112.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1910022112.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/1910022112.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 326, but rank is 5
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.609
Model:                            OLS   Adj. R-squared:                  0.534
Within R-squared:                 0.463
Method:                 Least Squares   F-statistic:                     83.5
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               5.64e-56
Time:                        14:49:42   Log-Likelihood:                 341.93
No. Observations:                2029   AIC:                            -29.9
Df Residuals:                    1702   BIC:                             1806.3
Df Model:                         326                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept            6.990404  0.977577  7.150747  8.630714e-13  5.074389  8.906420
lnfer                1.064302  0.135298  7.866380  3.650518e-15  0.799124  1.329480
tmp                 -0.354535  0.113748 -3.116850  1.827946e-03 -0.577476 -0.131593
tmp_tmp_interaction  0.008139  0.003117  2.611271  9.020632e-03  0.002030  0.014249
pre                  0.551791  0.072591  7.601347  2.930642e-14  0.409515  0.694067
pre_pre_interaction -0.027291  0.004045 -6.747497  1.504172e-11 -0.035218 -0.019363
In [64]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd3 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd3 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd3 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd3 = coeff_tmp_rfd3 * tmp_range_rfd3 + coeff_tmp_tmp_interaction_rfd3 * (tmp_range_rfd3 ** 2)+4
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd3, lnyield_pred_tmp_rfd3, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd3 = model.params['pre']
coeff_pre_pre_interaction_rfd3 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd3 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd3 = coeff_pre_rfd3 * pre_range_rfd3 + coeff_pre_pre_interaction_rfd3 * (pre_range_rfd3 ** 2)-2.6
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd3, lnyield_pred_pre_rfd3, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [75]:
# Filter data for rainfed corn
data = df_corn[(df_corn['group'] == 2)&((df_corn['zone'] == 4))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.643
Model:                            OLS   Adj. R-squared:                  0.549
Within R-squared:                 0.124
Method:                 Least Squares   F-statistic:                     179386718552.6
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        14:50:54   Log-Likelihood:                 0.71
No. Observations:                1329   AIC:                            556.6
Df Residuals:                    1050   BIC:                             2005.2
Df Model:                         278                                         
Covariance Type:               cluster                             
==============================================================================

                         Coef.  Std.Err.         z         P>|z|    [0.025     0.975]
Intercept            12.242545  2.358220  5.191435  2.086796e-07  7.620519  16.864571
lnfer                 0.371699  0.119700  3.105247  1.901203e-03  0.137091   0.606307
tmp                  -0.575415  0.215495 -2.670204  7.580525e-03 -0.997777  -0.153053
tmp_tmp_interaction   0.012350  0.005323  2.319926  2.034488e-02  0.001916   0.022784
pre                   0.150640  0.036275  4.152740  3.285175e-05  0.079542   0.221737
pre_pre_interaction  -0.006717  0.002009 -3.342598  8.299800e-04 -0.010655  -0.002778
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2475739950.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2475739950.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2475739950.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2475739950.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2475739950.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/2475739950.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 278, but rank is 5
  warnings.warn('covariance of constraints does not have full '
In [76]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd4 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd4 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd4 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd4 = coeff_tmp_rfd4 * tmp_range_rfd4 + coeff_tmp_tmp_interaction_rfd4 * (tmp_range_rfd4 ** 2)+6.65
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd4, lnyield_pred_tmp_rfd4, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd4 = model.params['pre']
coeff_pre_pre_interaction_rfd4 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd4 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd4 = coeff_pre_rfd4 * pre_range_rfd4 + coeff_pre_pre_interaction_rfd4 * (pre_range_rfd4 ** 2)-0.72
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd4, lnyield_pred_pre_rfd4, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [77]:
# Filter data for rainfed corn
data = df_corn[(df_corn['group'] == 2)&((df_corn['zone'] == 5))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.715
Model:                            OLS   Adj. R-squared:                  0.656
Within R-squared:                 0.129
Method:                 Least Squares   F-statistic:                     -2271384724277.0
Date:                Thu, 29 Aug 2024   Prob (F-statistic):               1.00e+00
Time:                        14:50:54   Log-Likelihood:                 -28.81
No. Observations:                1164   AIC:                            463.6
Df Residuals:                    961   BIC:                             1490.7
Df Model:                         202                                         
Covariance Type:               cluster                             
==============================================================================

                        Coef.  Std.Err.         z     P>|z|     [0.025    0.975]
Intercept           -1.589625  4.649129 -0.341919  0.732412 -10.701750  7.522500
lnfer               -0.043215  0.135072 -0.319941  0.749013  -0.307951  0.221521
tmp                  0.878089  0.396345  2.215468  0.026728   0.101268  1.654910
tmp_tmp_interaction -0.019393  0.008664 -2.238393  0.025195  -0.036374 -0.002412
pre                  0.219743  0.045706  4.807772  0.000002   0.130161  0.309325
pre_pre_interaction -0.011562  0.002710 -4.265606  0.000020  -0.016874 -0.006249
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3732171178.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3732171178.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3732171178.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3732171178.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3732171178.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_17445/3732171178.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 202, but rank is 5
  warnings.warn('covariance of constraints does not have full '
In [78]:
# Extract the coefficients for tmp and tmp_tmp_interaction
coeff_tmp_rfd5 = model.params['tmp']
coeff_tmp_tmp_interaction_rfd5 = model.params['tmp_tmp_interaction']
# Define the range of tmp based on the input data
tmp_range_rfd5 = np.linspace(np.percentile(data['tmp'],5), np.percentile(data['tmp'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_tmp_rfd5 = coeff_tmp_rfd5 * tmp_range_rfd5 + coeff_tmp_tmp_interaction_rfd5 * (tmp_range_rfd5 ** 2)-9.85
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(tmp_range_rfd5, lnyield_pred_tmp_rfd5, label='Predicted lnyield', color='blue')
plt.xlabel('tmp')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between tmp and lnyield')
plt.legend()
plt.show()


# Extract the coefficients for pre and pre_pre_interaction
coeff_pre_rfd5 = model.params['pre']
coeff_pre_pre_interaction_rfd5 = model.params['pre_pre_interaction']
# Define the range of pre based on the input data
pre_range_rfd5 = np.linspace(np.percentile(data['pre'],5), np.percentile(data['pre'],95), 100)
# Calculate predicted lnyield using the quadratic form
lnyield_pred_pre_rfd5 = coeff_pre_rfd5 * pre_range_rfd5 + coeff_pre_pre_interaction_rfd5 * (pre_range_rfd5 ** 2)-0.91
# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(pre_range_rfd5, lnyield_pred_pre_rfd5, label='Predicted lnyield', color='blue')
plt.xlabel('pre')
plt.ylabel('lnyield')
plt.title('Nonlinear Relationship between pre and lnyield')
plt.legend()
plt.show()
In [79]:
# Set font to Arial and size to 12
rcParams['font.family'] = 'Arial'
rcParams['font.size'] = 16

# Define a list of (tmp_range, lnyield_pred) tuples for the five models
models = [(tmp_range_rfd1, lnyield_pred_tmp_rfd1), 
          (tmp_range_rfd2, lnyield_pred_tmp_rfd2), 
          (tmp_range_rfd3, lnyield_pred_tmp_rfd3), 
          (tmp_range_rfd4, lnyield_pred_tmp_rfd4), 
          (tmp_range_rfd5, lnyield_pred_tmp_rfd5)]

# Plotting the results
plt.figure(figsize=(4, 4))

# Plot the first model (One model)
plt.plot(tmp_range_rfd, lnyield_pred_tmp_rfd, label='One model', color='blue', linewidth=4)
plt.fill_between(tmp_range_rfd, ci_lower_tmp_rfd, ci_upper_tmp_rfd, color='blue', alpha=0.2, label='95% CI')

# Plot the five models
for idx, (tmp_range_, lnyield_pred_) in enumerate(models):
    if idx == 0:
        plt.plot(tmp_range_, lnyield_pred_, color='green', label='Five models')
    else:
        plt.plot(tmp_range_, lnyield_pred_, color='green')

# Add labels and title
plt.xlabel('Temperature (°C)',fontsize=18, fontfamily='Arial')
plt.ylabel('Ln Yield (kg/ha/yr)', fontsize=18, fontfamily='Arial')

x_ticks = np.linspace(tmp_range_rfd.min(), tmp_range_rfd.max(), num=5)  # Custom X tick positions
y_ticks = np.linspace(-1, 1, num=5)  # Custom Y tick positions

plt.xticks(ticks=x_ticks, labels=[f'{x:.1f}' for x in x_ticks])  # Custom X tick labels
plt.yticks(ticks=y_ticks, labels=[f'{y:.1f}' for y in y_ticks])  # Custom Y tick labels

plt.tick_params(axis='both', which='major', labelsize=16)  # Adjust tick label size

# Save the entire figure as a JPG with DPI 300
plt.savefig('maize_rfd_tem.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)

# Show the plot
plt.show()
In [80]:
# Set font to Arial and size to 12
rcParams['font.family'] = 'Arial'
rcParams['font.size'] = 16

# Define a list of (pre_range, lnyield_pred) tuples for the five models
models = [(pre_range_rfd1, lnyield_pred_pre_rfd1), 
          (pre_range_rfd2, lnyield_pred_pre_rfd2), 
          (pre_range_rfd3, lnyield_pred_pre_rfd3), 
          (pre_range_rfd4, lnyield_pred_pre_rfd4), 
          (pre_range_rfd5, lnyield_pred_pre_rfd5)]

# Plotting the results
plt.figure(figsize=(4, 4))

# Plot the first model (One model)
plt.plot(pre_range_rfd, lnyield_pred_pre_rfd, label='One model', color='blue', linewidth=4)
plt.fill_between(pre_range_rfd, ci_lower_pre_rfd, ci_upper_pre_rfd, color='blue', alpha=0.2, label='95% CI')

# Plot the five models
for idx, (pre_range_, lnyield_pred_) in enumerate(models):
    if idx == 0:
        plt.plot(pre_range_, lnyield_pred_, color='green', label='Five models')
    else:
        plt.plot(pre_range_, lnyield_pred_, color='green')

# Add labels and title
plt.xlabel('Precipitation (100mm)',fontsize=18, fontfamily='Arial')
plt.ylabel('Ln Yield (kg/ha/yr)', fontsize=18, fontfamily='Arial')

x_ticks = np.linspace(2, 12, num=5)  # Custom X tick positions
y_ticks = np.linspace(-0.6, 0.4, num=5)  # Custom Y tick positions

plt.xticks(ticks=x_ticks, labels=[f'{x:.1f}' for x in x_ticks])  # Custom X tick labels
plt.yticks(ticks=y_ticks, labels=[f'{y:.1f}' for y in y_ticks])  # Custom Y tick labels

plt.tick_params(axis='both', which='major', labelsize=16)  # Adjust tick label size


# Save the entire figure as a JPG with DPI 300
plt.savefig('maize_rfd_pre.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)

# Show the plot
plt.show()

climate zone and interaction item¶

In [35]:
# Filter data for irrigated corn
data = df_corn[(df_corn['group'] == 1)&(df_corn['zone'] == 1)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.769
Model:                            OLS   Adj. R-squared:                  0.738
Within R-squared:                 0.203
Method:                 Least Squares   F-statistic:                     -316470679904.5
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               1.00e+00
Time:                        17:02:15   Log-Likelihood:                 989.80
No. Observations:                1452   AIC:                            -1631.6
Df Residuals:                    1278   BIC:                             -712.8
Df Model:                         173                                         
Covariance Type:               cluster                             
==============================================================================

                                           Coef.  Std.Err.         z         P>|z|    [0.025    0.975]
Intercept                               2.649869  0.938825  2.822537  4.764534e-03  0.809805  4.489932
lnfer                                   0.289802  0.051848  5.589490  2.277374e-08  0.188182  0.391422
irrigation1                             0.417184  0.221199  1.886013  5.929321e-02 -0.016358  0.850726
irrigation12                           -0.025987  0.011875 -2.188390  2.864119e-02 -0.049261 -0.002713
tmp                                     0.567397  0.112572  5.040287  4.648339e-07  0.346759  0.788035
tmp_tmp_interaction                    -0.019233  0.003970 -4.844130  1.271675e-06 -0.027014 -0.011451
pre                                     0.237504  0.043151  5.503992  3.712862e-08  0.152929  0.322079
pre_pre_interaction                    -0.014896  0.002958 -5.036466  4.742043e-07 -0.020693 -0.009099
lnfer_irrigation1_tmp_interaction      -0.003786  0.002849 -1.329037  1.838357e-01 -0.009370  0.001797
lnfer_irrigation12_tmp_tmp_interaction  0.000027  0.000011  2.586742  9.688822e-03  0.000007  0.000048
lnfer_irrigation1_pre_interaction      -0.001136  0.002160 -0.525699  5.990974e-01 -0.005370  0.003099
lnfer_irrigation12_pre_pre_interaction -0.000027  0.000018 -1.453211  1.461653e-01 -0.000063  0.000009
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/728768620.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/728768620.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/728768620.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/728768620.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/728768620.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/728768620.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/728768620.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 173, but rank is 11
  warnings.warn('covariance of constraints does not have full '
In [36]:
# Filter data for irrigated corn
data = df_corn[(df_corn['group'] == 1)&(df_corn['zone'] == 2)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/1463388600.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/1463388600.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/1463388600.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/1463388600.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/1463388600.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/1463388600.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/1463388600.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.782
Model:                            OLS   Adj. R-squared:                  0.745
Within R-squared:                 0.327
Method:                 Least Squares   F-statistic:                     120.6
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               1.13e-115
Time:                        17:02:15   Log-Likelihood:                 1754.74
No. Observations:                2657   AIC:                            -2737.5
Df Residuals:                    2271   BIC:                             -465.9
Df Model:                         385                                         
Covariance Type:               cluster                             
==============================================================================

                                           Coef.  Std.Err.          z         P>|z|    [0.025    0.975]
Intercept                               3.504853  0.581110   6.031303  1.626432e-09  2.365897  4.643808
lnfer                                   0.338743  0.059702   5.673910  1.395748e-08  0.221730  0.455757
irrigation1                             0.344109  0.091500   3.760741  1.694111e-04  0.164772  0.523446
irrigation12                           -0.015341  0.004296  -3.570881  3.557820e-04 -0.023761 -0.006921
tmp                                     0.243285  0.059594   4.082347  4.458323e-05  0.126482  0.360088
tmp_tmp_interaction                    -0.007862  0.001850  -4.248628  2.150837e-05 -0.011488 -0.004235
pre                                     0.430679  0.031464  13.687868  1.199786e-42  0.369010  0.492348
pre_pre_interaction                    -0.025172  0.001932 -13.031066  8.146185e-39 -0.028959 -0.021386
lnfer_irrigation1_tmp_interaction      -0.001285  0.001061  -1.211259  2.257960e-01 -0.003365  0.000795
lnfer_irrigation12_tmp_tmp_interaction  0.000004  0.000003   1.249374  2.115284e-01 -0.000002  0.000010
lnfer_irrigation1_pre_interaction      -0.006504  0.001022  -6.364307  1.961734e-10 -0.008508 -0.004501
lnfer_irrigation12_pre_pre_interaction  0.000050  0.000010   5.190225  2.100405e-07  0.000031  0.000069
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 385, but rank is 11
  warnings.warn('covariance of constraints does not have full '
In [37]:
# Filter data for irrigated corn
data = df_corn[(df_corn['group'] == 1)&(df_corn['zone'] == 3)]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['irrigation12'] = data['irrigation1'] * data['irrigation1']
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer + irrigation1 + irrigation12 + tmp + tmp_tmp_interaction + pre + pre_pre_interaction+ lnfer_irrigation1_tmp_interaction+lnfer_irrigation12_tmp_tmp_interaction+lnfer_irrigation1_pre_interaction +lnfer_irrigation12_pre_pre_interaction + C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2621376652.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2621376652.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['irrigation12'] = data['irrigation1'] * data['irrigation1']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2621376652.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2621376652.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2621376652.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2621376652.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2621376652.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['irrigation1'] * data['tmp'] * data['tmp']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.633
Model:                            OLS   Adj. R-squared:                  0.569
Within R-squared:                 0.289
Method:                 Least Squares   F-statistic:                     -217776744522.7
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               1.00e+00
Time:                        17:02:16   Log-Likelihood:                 768.82
No. Observations:                2826   AIC:                            -701.6
Df Residuals:                    2408   BIC:                             1784.0
Df Model:                         417                                         
Covariance Type:               cluster                             
==============================================================================

                                           Coef.  Std.Err.          z         P>|z|    [0.025    0.975]
Intercept                               6.389456  0.666257   9.590075  8.802091e-22  5.083616  7.695296
lnfer                                   0.437887  0.081787   5.353988  8.603674e-08  0.277587  0.598186
irrigation1                             0.120379  0.093586   1.286283  1.983442e-01 -0.063047  0.303805
irrigation12                           -0.008318  0.004713  -1.765050  7.755538e-02 -0.017555  0.000919
tmp                                    -0.064156  0.062492  -1.026640  3.045899e-01 -0.186638  0.058325
tmp_tmp_interaction                     0.000135  0.001770   0.076487  9.390319e-01 -0.003334  0.003605
pre                                     0.373887  0.033229  11.251865  2.267459e-29  0.308759  0.439014
pre_pre_interaction                    -0.017997  0.001895  -9.495813  2.185004e-21 -0.021711 -0.014282
lnfer_irrigation1_tmp_interaction       0.001887  0.000959   1.968281  4.903566e-02  0.000008  0.003767
lnfer_irrigation12_tmp_tmp_interaction -0.000003  0.000003  -1.260237  2.075838e-01 -0.000009  0.000002
lnfer_irrigation1_pre_interaction      -0.007009  0.000846  -8.288348  1.148319e-16 -0.008667 -0.005352
lnfer_irrigation12_pre_pre_interaction  0.000036  0.000005   7.591397  3.164756e-14  0.000026  0.000045
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 417, but rank is 11
  warnings.warn('covariance of constraints does not have full '
In [38]:
# Filter data for rainfed corn
data = df_corn[(df_corn['group'] == 2)&((df_corn['zone'] == 1))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.803
Model:                            OLS   Adj. R-squared:                  0.772
Within R-squared:                 0.142
Method:                 Least Squares   F-statistic:                     17185022218331.2
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        17:02:16   Log-Likelihood:                 971.11
No. Observations:                1613   AIC:                            -1500.2
Df Residuals:                    1392   BIC:                             -309.9
Df Model:                         220                                         
Covariance Type:               cluster                             
==============================================================================

                              Coef.  Std.Err.         z     P>|z|    [0.025    0.975]
Intercept                  5.037728  2.531666  1.989887  0.046603  0.075754  9.999703
lnfer                      0.461925  0.491526  0.939779  0.347331 -0.501447  1.425298
tmp                        0.108871  0.352919  0.308486  0.757712 -0.582837  0.800579
tmp_tmp_interaction       -0.005214  0.012338 -0.422636  0.672561 -0.029396  0.018967
pre                        0.389833  0.211415  1.843922  0.065194 -0.024533  0.804200
pre_pre_interaction       -0.020473  0.013800 -1.483511  0.137939 -0.047520  0.006575
lnfer_tmp_interaction      0.011324  0.068910  0.164334  0.869468 -0.123737  0.146385
lnfer_tmp_tmp_interaction -0.000005  0.002426 -0.002022  0.998387 -0.004759  0.004749
lnfer_pre_interaction     -0.058714  0.037238 -1.576734  0.114857 -0.131700  0.014271
lnfer_pre_pre_interaction  0.003161  0.002448  1.290948  0.196722 -0.001638  0.007959
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2124702747.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2124702747.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2124702747.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2124702747.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2124702747.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2124702747.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 220, but rank is 9
  warnings.warn('covariance of constraints does not have full '
In [39]:
# Filter data for rainfed corn
data = df_corn[(df_corn['group'] == 2)&((df_corn['zone'] == 2))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/3708700106.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/3708700106.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/3708700106.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/3708700106.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/3708700106.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/3708700106.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 347, but rank is 9
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.645
Model:                            OLS   Adj. R-squared:                  0.594
Within R-squared:                 0.361
Method:                 Least Squares   F-statistic:                     98355558560.4
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        17:02:16   Log-Likelihood:                 1479.48
No. Observations:                2765   AIC:                            -2263.0
Df Residuals:                    2417   BIC:                             -201.1
Df Model:                         347                                         
Covariance Type:               cluster                             
==============================================================================

                              Coef.  Std.Err.         z     P>|z|    [0.025     0.975]
Intercept                  9.558795  5.008853  1.908380  0.056342 -0.258378  19.375967
lnfer                     -1.027908  0.913702 -1.124993  0.260592 -2.818730   0.762914
tmp                       -0.811899  0.583617 -1.391150  0.164180 -1.955768   0.331970
tmp_tmp_interaction        0.024148  0.017884  1.350292  0.176922 -0.010903   0.059199
pre                        0.025515  0.186386  0.136893  0.891116 -0.339794   0.390824
pre_pre_interaction       -0.002868  0.011725 -0.244611  0.806758 -0.025848   0.020112
lnfer_tmp_interaction      0.233981  0.107230  2.182052  0.029106  0.023814   0.444148
lnfer_tmp_tmp_interaction -0.007265  0.003294 -2.205842  0.027395 -0.013721  -0.000810
lnfer_pre_interaction      0.022029  0.034336  0.641564  0.521156 -0.045269   0.089326
lnfer_pre_pre_interaction -0.000660  0.002160 -0.305279  0.760154 -0.004894   0.003575
In [40]:
# Filter data for rainfed corn
data = df_corn[(df_corn['group'] == 2)&((df_corn['zone'] == 3))]

# Prepare the interaction terms
data['pre_pre_interaction'] = data['pre'] * data['pre']
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']

data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']

# Define the formula for the regression model
formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'

# Run the regression using the formula interface with clustered standard errors
model = sm.OLS.from_formula(formula, data=data).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})

# Extract the summary table
summary = model.summary2().tables[1]

# Filter out rows corresponding to dummy variables for geoid
exclude_terms = ['C(geoid)']
pattern = '|'.join([re.escape(term) for term in exclude_terms])
filtered_summary = summary.loc[~summary.index.str.contains(pattern)]

fixed_effects_only_model = sm.OLS.from_formula('lnyield ~ C(geoid)', data=data).fit()

# Calculate the total sum of squares (TSS) using the model with only fixed effects
tss_within = np.sum((fixed_effects_only_model.resid) ** 2)

# Calculate the residual sum of squares (RSS) from your main model
rss_within = np.sum((model.resid) ** 2)

# Calculate the within R-squared
within_r2 = 1 - (rss_within / tss_within)

# Format the OLS summary with the within R-squared
ols_info = f"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       {model.rsquared:.3f}
Model:                            OLS   Adj. R-squared:                  {model.rsquared_adj:.3f}
Within R-squared:                 {within_r2:.3f}
Method:                 Least Squares   F-statistic:                     {model.fvalue:.1f}
Date:                {datetime.now():%a, %d %b %Y}   Prob (F-statistic):               {model.f_pvalue:.2e}
Time:                        {datetime.now():%H:%M:%S}   Log-Likelihood:                 {model.llf:.2f}
No. Observations:                {model.nobs:.0f}   AIC:                            {model.aic:.1f}
Df Residuals:                    {model.df_resid:.0f}   BIC:                             {model.bic:.1f}
Df Model:                         {model.df_model:.0f}                                         
Covariance Type:               cluster                             
==============================================================================
"""

# Combine the OLS info with the filtered summary
final_summary = ols_info + "\n" + filtered_summary.to_string()

# Print the final summary
print(final_summary)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2848172568.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_pre_interaction'] = data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2848172568.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2848172568.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] * data['pre']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2848172568.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tmp_tmp_interaction'] = data['tmp'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2848172568.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_3056/2848172568.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] * data['tmp']
/Users/chenchenren/anaconda3/lib/python3.11/site-packages/statsmodels/base/model.py:1888: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 330, but rank is 9
  warnings.warn('covariance of constraints does not have full '
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               lnyield   R-squared:                       0.610
Model:                            OLS   Adj. R-squared:                  0.534
Within R-squared:                 0.465
Method:                 Least Squares   F-statistic:                     9439797092.9
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               0.00e+00
Time:                        17:02:17   Log-Likelihood:                 346.20
No. Observations:                2029   AIC:                            -30.4
Df Residuals:                    1698   BIC:                             1828.3
Df Model:                         330                                         
Covariance Type:               cluster                             
==============================================================================

                               Coef.   Std.Err.         z     P>|z|    [0.025     0.975]
Intercept                  19.811681  10.060982  1.969160  0.048935  0.092518  39.530843
lnfer                      -1.146685   1.778483 -0.644755  0.519086 -4.632448   2.339077
tmp                        -1.959786   1.092110 -1.794495  0.072734 -4.100282   0.180710
tmp_tmp_interaction         0.051468   0.029999  1.715623  0.086231 -0.007330   0.110266
pre                         1.053770   0.537326  1.961136  0.049863  0.000630   2.106911
pre_pre_interaction        -0.056528   0.030081 -1.879235  0.060212 -0.115485   0.002428
lnfer_tmp_interaction       0.277258   0.187402  1.479485  0.139011 -0.090043   0.644559
lnfer_tmp_tmp_interaction  -0.007477   0.005139 -1.454984  0.145674 -0.017550   0.002595
lnfer_pre_interaction      -0.087917   0.089388 -0.983539  0.325342 -0.263115   0.087281
lnfer_pre_pre_interaction   0.005110   0.004994  1.023229  0.306199 -0.004678   0.014898
In [ ]: