import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import netCDF4 as nc
import pandas as pd
import os
import csv
from glob import glob
import xarray as xr
import matplotlib.ticker as ticker 

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.colors import LinearSegmentedColormap, Normalize
from mpl_toolkits.axes_grid1 import make_axes_locatable

import geopandas as gpd
import matplotlib.colors as mcolors


# Step 1: Data Preparation
# Load your dataset into a pandas DataFrame
os.chdir('/Users/chenchenren/postdoc/paper/2N and water-US/Figure 3 Yiled under warming scenario/')


excel_file = "../regression/corn_reg_data.csv"
# Read the specific sheet from the Excel file
reg= pd.read_csv(excel_file)


excel_file = "../regression/climate_zone.csv"
# Read the specific sheet from the Excel file
climate_zone= pd.read_csv(excel_file)


# Load data from an Excel file
excel_file = "scenario_management.xlsx"
sheet_name = "maize"
data = pd.read_excel(excel_file, sheet_name=sheet_name)
data=pd.merge(data, climate_zone, on='geoid', how='left')

data1=data[data['sample'] == 1].copy()
data1=data1.sort_values(by=['geoid', 'scenario'])
print(data1)

data2015=data1[data1['scenario'] == 2015].copy()
print(data2015)


data1['tmp'] = data1.groupby(['geoid','scenario'])['tmp'].transform('mean')
data1['pre'] = data1.groupby(['geoid','scenario'])['pre'].transform('mean')
data1['tmp2015'] = data1.groupby(['geoid','scenario'])['tmp2015'].transform('mean')
data1['pre2015'] = data1.groupby(['geoid','scenario'])['pre2015'].transform('mean')
data1=data1.drop_duplicates(subset=['geoid','scenario'])
data1=data1.drop(['model','lnyield'], axis=1)
print(data1)

       geoid  year       tmp        pre       har  irrigation1  lnyield_obs  \
30022   1001  2015  23.28228   7.960044  446.6812     0.429451     9.093130   
30023   1001  2015  22.57526   8.138788  446.6812     0.429451     9.093130   
30026   1001  2015  22.91238   8.173495  446.6812     0.429451     9.093130   
30027   1001  2015  22.94868  10.270980  446.6812     0.429451     9.093130   
30029   1001  2015  23.06275   8.530843  446.6812     0.429451     9.093130   
...      ...   ...       ...        ...       ...          ...          ...   
30018  56043  2015  12.24377   2.675818  972.0000     0.000000     9.124074   
30020  56043  2015  13.58658   3.059424  972.0000     0.000000     9.124074   
50961  56043  2015  12.85831   3.102477  972.0000     0.000000     9.124074   
50963  56043  2015  13.56218   2.712176  972.0000     0.000000     9.124074   
50967  56043  2015  11.19868   2.964274  972.0000     0.000000     9.124074   

          lnfer  lnyield  group  sample  scenario    pre2015   model  \
30022  6.438559      NaN      1       1       1.5   5.544486    gfdl   
30023  6.438559      NaN      1       1       1.5   8.369947    ipsl   
30026  6.438559      NaN      1       1       1.5   6.570490     mpi   
30027  6.438559      NaN      1       1       1.5  11.500060     mri   
30029  6.438559      NaN      1       1       1.5  11.407870  ukesm1   
...         ...      ...    ...     ...       ...        ...     ...   
30018  6.780481      NaN      2       1    2015.0   2.675818     mpi   
30020  6.780481      NaN      2       1    2015.0   3.059424  ukesm1   
50961  6.780481      NaN      2       1    2015.0   3.102477    gfdl   
50963  6.780481      NaN      2       1    2015.0   2.712176    ipsl   
50967  6.780481      NaN      2       1    2015.0   2.964274     mri   

        tmp2015  zone  
30022  23.74543   3.0  
30023  21.99282   3.0  
30026  22.97267   3.0  
30027  23.24994   3.0  
30029  21.75993   3.0  
...         ...   ...  
30018  12.24377   1.0  
30020  13.58658   1.0  
50961  12.85831   1.0  
50963  13.56218   1.0  
50967  11.19868   1.0  

[31425 rows x 16 columns]
       geoid  year       tmp        pre       har  irrigation1  lnyield_obs  \
19546   1001  2015  23.74543   5.544486  446.6812     0.429451     9.093130   
30024   1001  2015  21.99282   8.369947  446.6812     0.429451     9.093130   
30025   1001  2015  22.97267   6.570490  446.6812     0.429451     9.093130   
30028   1001  2015  23.24994  11.500060  446.6812     0.429451     9.093130   
30030   1001  2015  21.75993  11.407870  446.6812     0.429451     9.093130   
...      ...   ...       ...        ...       ...          ...          ...   
30018  56043  2015  12.24377   2.675818  972.0000     0.000000     9.124074   
30020  56043  2015  13.58658   3.059424  972.0000     0.000000     9.124074   
50961  56043  2015  12.85831   3.102477  972.0000     0.000000     9.124074   
50963  56043  2015  13.56218   2.712176  972.0000     0.000000     9.124074   
50967  56043  2015  11.19868   2.964274  972.0000     0.000000     9.124074   

          lnfer  lnyield  group  sample  scenario    pre2015   model  \
19546  6.438559      NaN      1       1    2015.0   5.544486    gfdl   
30024  6.438559      NaN      1       1    2015.0   8.369947    ipsl   
30025  6.438559      NaN      1       1    2015.0   6.570490     mpi   
30028  6.438559      NaN      1       1    2015.0  11.500060     mri   
30030  6.438559      NaN      1       1    2015.0  11.407870  ukesm1   
...         ...      ...    ...     ...       ...        ...     ...   
30018  6.780481      NaN      2       1    2015.0   2.675818     mpi   
30020  6.780481      NaN      2       1    2015.0   3.059424  ukesm1   
50961  6.780481      NaN      2       1    2015.0   3.102477    gfdl   
50963  6.780481      NaN      2       1    2015.0   2.712176    ipsl   
50967  6.780481      NaN      2       1    2015.0   2.964274     mri   

        tmp2015  zone  
19546  23.74543   3.0  
30024  21.99282   3.0  
30025  22.97267   3.0  
30028  23.24994   3.0  
30030  21.75993   3.0  
...         ...   ...  
30018  12.24377   1.0  
30020  13.58658   1.0  
50961  12.85831   1.0  
50963  13.56218   1.0  
50967  11.19868   1.0  

[10475 rows x 16 columns]
       geoid  year        tmp        pre        har  irrigation1  lnyield_obs  \
30022   1001  2015  22.956270   8.614830   446.6812     0.429451     9.093130   
19547   1001  2015  24.564876   8.484914   446.6812     0.429451     9.093130   
19546   1001  2015  22.744158   8.678571   446.6812     0.429451     9.093130   
19551   1003  2015  24.364442  11.696664  2463.8170     1.376478     9.058842   
19553   1003  2015  25.822928  11.421008  2463.8170     1.376478     9.058842   
...      ...   ...        ...        ...        ...          ...          ...   
30013  56031  2015  15.676956   4.132068  3807.0000     0.000000     9.136227   
30014  56031  2015  13.419220   4.092800  3807.0000     0.000000     9.136227   
50962  56043  2015  12.759980   2.925866   972.0000     0.000000     9.124074   
30016  56043  2015  14.796224   2.868278   972.0000     0.000000     9.124074   
30018  56043  2015  12.689904   2.902834   972.0000     0.000000     9.124074   

          lnfer  group  sample  scenario    pre2015    tmp2015  zone  
30022  6.438559      1       1       1.5   8.678571  22.744158   3.0  
19547  6.438559      1       1       3.0   8.678571  22.744158   3.0  
19546  6.438559      1       1    2015.0   8.678571  22.744158   3.0  
19551  5.456232      1       1       1.5  11.511937  24.155120   3.0  
19553  5.456232      1       1       3.0  11.511937  24.155120   3.0  
...         ...    ...     ...       ...        ...        ...   ...  
30013  6.229183      2       1       3.0   4.092800  13.419220   1.0  
30014  6.229183      2       1    2015.0   4.092800  13.419220   1.0  
50962  6.780481      2       1       1.5   2.902834  12.689904   1.0  
30016  6.780481      2       1       3.0   2.902834  12.689904   1.0  
30018  6.780481      2       1    2015.0   2.902834  12.689904   1.0  

[6285 rows x 14 columns]


import pandas as pd
import statsmodels.api as sm

# Load data from an Excel file
excel_file = "scenario_management.xlsx"
sheet_name = "maize"
data = pd.read_excel(excel_file, sheet_name=sheet_name)

# Assuming climate_zone is already loaded or defined
data = pd.merge(data, climate_zone, on='geoid', how='left')

# Create interaction terms
data['tmp_tmp_interaction'] = data['tmp'] ** 2
data['pre_pre_interaction'] = data['pre'] ** 2
data['irrigation12'] = data['irrigation1'] ** 2
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation12'] * data['tmp'] ** 2
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation12'] * data['pre'] ** 2

data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] ** 2
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] ** 2

response_variable = 'lnyield'

# Function to perform regression analysis
def perform_regression(group_data, predictor_variables):
    X = sm.add_constant(group_data[predictor_variables])
    y = group_data[response_variable]
    return sm.OLS(y, X).fit()

# Initialize a nested dictionary to store regression results
reg = {}
data_rows = []

sample_data = data[data['sample'] == 0].copy()
project_data = data.copy()

zone_ = [1, 2, 3]
group_ = [1, 2]

# Check the unique values in the 'zone' and 'group' columns
print("Unique zones:", sample_data['zone'].unique())
print("Unique groups:", sample_data['group'].unique())

# Iterate through groups and zones in sample_data
for zone in zone_:
    reg[zone] = {}
    for group in group_:
        print(f"\nSubset for Zone {zone}, Group {group}:")
 # Calculate y_temp_base based on regression parameters
        tmp_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['tmp'].values
        pre_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['pre'].values
        lnfer_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['lnfer'].values
        irrigation1_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['irrigation1'].values

        if group == 1:
            group_data1 = sample_data[(sample_data['zone'] == zone) & (sample_data['group'] == group)]
            predictor_variables = ['lnfer', 'tmp', 'irrigation1', 'irrigation12', 'tmp_tmp_interaction',
                                    'pre', 'pre_pre_interaction', 'lnfer_irrigation1_pre_interaction', 'lnfer_irrigation12_pre_pre_interaction',
                                    'lnfer_irrigation1_tmp_interaction', 'lnfer_irrigation12_tmp_tmp_interaction']
            print(f"\nSubset for Zone {zone}, Group {group}:")
            
            reg_model = perform_regression(group_data1, predictor_variables)
            print(f"Regression const for Zone {zone}, Group {group}: {reg_model.params['lnfer']}")
            y_temp_base = (
                reg_model.params['tmp_tmp_interaction'] * tmp_values**2 +
                reg_model.params['tmp'] * tmp_values +
                reg_model.params['pre_pre_interaction'] * pre_values**2 +
                reg_model.params['pre'] * pre_values +
                reg_model.params['lnfer'] * lnfer_values +
                reg_model.params['irrigation1'] * irrigation1_values +
                reg_model.params['irrigation12'] * irrigation1_values**2 +
                reg_model.params['lnfer_irrigation1_tmp_interaction'] * irrigation1_values * tmp_values * lnfer_values +
                reg_model.params['lnfer_irrigation12_tmp_tmp_interaction'] * irrigation1_values**2 * lnfer_values * tmp_values**2 +
                reg_model.params['lnfer_irrigation1_pre_interaction'] * irrigation1_values * pre_values * lnfer_values +
                reg_model.params['lnfer_irrigation12_pre_pre_interaction'] * irrigation1_values**2 * lnfer_values * pre_values**2 +
                reg_model.params['const']
            )
        else:
            group_data2 = sample_data[(sample_data['zone'] == zone) & (sample_data['group'] == group)]
            predictor_variables = ['lnfer', 'tmp', 'tmp_tmp_interaction', 'irrigation1', 'irrigation12', 'pre', 'pre_pre_interaction',
                                    'lnfer_tmp_interaction', 'lnfer_tmp_tmp_interaction',
                                    'lnfer_pre_interaction', 'lnfer_pre_pre_interaction']
            print(f"\nSubset for Zone {zone}, Group {group}:") 
            reg_model = perform_regression(group_data2, predictor_variables)
             
            print(f"Regression const for Zone {zone}, Group {group}: {reg_model.params['lnfer']}")         
            y_temp_base = (
                reg_model.params['tmp_tmp_interaction'] * tmp_values**2 +
                reg_model.params['tmp'] * tmp_values +
                reg_model.params['pre_pre_interaction'] * pre_values**2 +
                reg_model.params['pre'] * pre_values +
                reg_model.params['lnfer'] * lnfer_values +
                reg_model.params['lnfer_tmp_interaction'] * tmp_values * lnfer_values +
                reg_model.params['lnfer_tmp_tmp_interaction'] * lnfer_values * tmp_values**2 +
                reg_model.params['lnfer_pre_interaction'] * pre_values * lnfer_values +
                reg_model.params['lnfer_pre_pre_interaction'] * lnfer_values * pre_values**2 +
                reg_model.params['const']
            )

        # Extract additional information
        geoid_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['geoid'].values
        scenario_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['scenario'].values
        lnyield_obs_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['lnyield_obs'].values
        har_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['har'].values

        for tmp, pre, lnfer, irrigation1, y_temp_base_val, geoid, scenario, lnyield_obs, har in zip(
            tmp_values, pre_values, lnfer_values, irrigation1_values, y_temp_base, geoid_values, scenario_values, lnyield_obs_values, har_values
        ):
            data_rows.append([tmp, pre, lnfer, irrigation1, y_temp_base_val, geoid, scenario, lnyield_obs, har])

# Create a DataFrame from the data
results_base = pd.DataFrame(data_rows, columns=['tmp', 'pre', 'lnfer', 'irrigation1',
                                                'y_temp_base', 'geoid', 'scenario', 
                                                'lnyield_obs', 'har'])

# Filter for specific scenarios
results_base_t = results_base[(results_base['scenario'] == 1.5) | (results_base['scenario'] == 3)]
# Rename the 'y_temp_base' column to 'y_temp_base_t'
results_base_t = results_base_t.rename(columns={'y_temp_base': 'y_temp_base_t'})

# Select relevant columns
results_base_t = results_base_t[['geoid', 'scenario', 'y_temp_base_t', 'lnyield_obs', 'har']]
# Remove duplicates based on 'geoid' and 'scenario' columns
results_base_t = results_base_t.drop_duplicates(subset=['geoid', 'scenario'])
print(results_base_t)

Unique zones: [3. 2. 1.]
Unique groups: [1 2]

Subset for Zone 1, Group 1:

Subset for Zone 1, Group 1:
Regression const for Zone 1, Group 1: 0.45801946844805047

Subset for Zone 1, Group 2:

Subset for Zone 1, Group 2:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[16], line 91
     89 formula = 'lnyield ~ lnfer  + tmp + tmp_tmp_interaction + pre + pre_pre_interaction +lnfer_tmp_interaction+lnfer_tmp_tmp_interaction+lnfer_pre_interaction+lnfer_pre_pre_interaction+ C(geoid)'
     90 # Run the regression using the formula interface with clustered standard errors
---> 91 reg_model = sm.OLS.from_formula(formula, data=group_data2).fit(cov_type='cluster', cov_kwds={'groups': data['geoid']})
     94 print(f"Regression const for Zone {zone}, Group {group}: {reg_model.params['lnfer']}")         
     95 y_temp_base = (
     96     reg_model.params['tmp_tmp_interaction'] * tmp_values**2 +
     97     reg_model.params['tmp'] * tmp_values +
   (...)
    105     reg_model.params['const']
    106 )

File ~/anaconda3/lib/python3.11/site-packages/statsmodels/regression/linear_model.py:373, in RegressionModel.fit(self, method, cov_type, cov_kwds, use_t, **kwargs)
    370     self.df_resid = self.nobs - self.rank
    372 if isinstance(self, OLS):
--> 373     lfit = OLSResults(
    374         self, beta,
    375         normalized_cov_params=self.normalized_cov_params,
    376         cov_type=cov_type, cov_kwds=cov_kwds, use_t=use_t)
    377 else:
    378     lfit = RegressionResults(
    379         self, beta,
    380         normalized_cov_params=self.normalized_cov_params,
    381         cov_type=cov_type, cov_kwds=cov_kwds, use_t=use_t,
    382         **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/statsmodels/regression/linear_model.py:1652, in RegressionResults.__init__(self, model, params, normalized_cov_params, scale, cov_type, cov_kwds, use_t, **kwargs)
   1650             use_t = use_t_2
   1651         # TODO: warn or not?
-> 1652     self.get_robustcov_results(cov_type=cov_type, use_self=True,
   1653                                use_t=use_t, **cov_kwds)
   1654 for key in kwargs:
   1655     setattr(self, key, kwargs[key])

File ~/anaconda3/lib/python3.11/site-packages/statsmodels/regression/linear_model.py:2599, in RegressionResults.get_robustcov_results(self, cov_type, use_t, **kwargs)
   2595     if adjust_df:
   2596         # need to find number of groups
   2597         # duplicate work
   2598         self.n_groups = n_groups = len(np.unique(groups))
-> 2599     res.cov_params_default = sw.cov_cluster(
   2600         self, groups, use_correction=use_correction)
   2602 elif groups.ndim == 2:
   2603     if hasattr(groups, 'values'):

File ~/anaconda3/lib/python3.11/site-packages/statsmodels/stats/sandwich_covariance.py:531, in cov_cluster(results, group, use_correction)
    528 else:
    529     clusters = np.unique(group)
--> 531 scale = S_crosssection(xu, group)
    533 nobs, k_params = xu.shape
    534 n_groups = len(clusters) #replace with stored group attributes if available

File ~/anaconda3/lib/python3.11/site-packages/statsmodels/stats/sandwich_covariance.py:485, in S_crosssection(x, group)
    476 def S_crosssection(x, group):
    477     '''inner covariance matrix for White on group sums sandwich
    478 
    479     I guess for a single categorical group only,
   (...)
    483 
    484     '''
--> 485     x_group_sums = group_sums(x, group).T  #TODO: why transposed
    487     return S_white_simple(x_group_sums)

File ~/anaconda3/lib/python3.11/site-packages/statsmodels/tools/grouputils.py:105, in group_sums(x, group, use_bincount)
    102     if np.max(group) > 2 * x.shape[0]:
    103         group = pd.factorize(group)[0]
--> 105     return np.array([np.bincount(group, weights=x[:, col])
    106                      for col in range(x.shape[1])])
    107 else:
    108     uniques = np.unique(group)

File ~/anaconda3/lib/python3.11/site-packages/statsmodels/tools/grouputils.py:105, in <listcomp>(.0)
    102     if np.max(group) > 2 * x.shape[0]:
    103         group = pd.factorize(group)[0]
--> 105     return np.array([np.bincount(group, weights=x[:, col])
    106                      for col in range(x.shape[1])])
    107 else:
    108     uniques = np.unique(group)

File <__array_function__ internals>:200, in bincount(*args, **kwargs)

ValueError: The weights and list don't have the same length.


# Load data from an Excel file
excel_file = "scenario_management.xlsx"
sheet_name = "maize"
data = pd.read_excel(excel_file, sheet_name=sheet_name)

# Assuming climate_zone is already loaded or defined
data = pd.merge(data, climate_zone, on='geoid', how='left')

# Create interaction terms
data['tmp_tmp_interaction'] = data['tmp'] ** 2
data['pre_pre_interaction'] = data['pre'] ** 2
data['irrigation12'] = data['irrigation1'] ** 2
data['lnfer_irrigation1_tmp_interaction'] = data['lnfer'] * data['irrigation1'] * data['tmp']
data['lnfer_irrigation12_tmp_tmp_interaction'] = data['lnfer'] * data['irrigation12'] * data['tmp'] ** 2
data['lnfer_irrigation1_pre_interaction'] = data['lnfer'] * data['irrigation1'] * data['pre']
data['lnfer_irrigation12_pre_pre_interaction'] = data['lnfer'] * data['irrigation12'] * data['pre'] ** 2

data['lnfer_tmp_interaction'] = data['lnfer'] * data['tmp']
data['lnfer_tmp_tmp_interaction'] = data['lnfer'] * data['tmp'] ** 2
data['lnfer_pre_interaction'] = data['lnfer'] * data['pre']
data['lnfer_pre_pre_interaction'] = data['lnfer'] * data['pre'] ** 2

response_variable = 'lnyield'

# Function to perform regression analysis
def perform_regression(group_data, predictor_variables):
    X = sm.add_constant(group_data[predictor_variables])
    y = group_data[response_variable]
    return sm.OLS(y, X).fit()

# Initialize a nested dictionary to store regression results
reg = {}
data_rows = []

sample_data = data[data['sample'] == 0].copy()
project_data = data1

zone_ = [1, 2, 3]
group_ = [1, 2]

# Check the unique values in the 'zone' and 'group' columns
print("Unique zones:", sample_data['zone'].unique())
print("Unique groups:", sample_data['group'].unique())

# Iterate through groups and zones in sample_data
for zone in zone_:
    reg[zone] = {}
    for group in group_:
        print(f"\nSubset for Zone {zone}, Group {group}:")
        tmp_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['tmp'].values
        pre_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['pre'].values
        lnfer_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['lnfer'].values
        irrigation1_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['irrigation1'].values

        if group == 1:
            print(f"\nSubset for Zone {zone}, Group {group}:")
            group_data = sample_data[(sample_data['zone'] == zone) & (sample_data['group'] == group)]
            predictor_variables = ['lnfer', 'tmp', 'irrigation1', 'irrigation12', 'tmp_tmp_interaction',
                                    'pre', 'pre_pre_interaction', 'lnfer_irrigation1_pre_interaction', 'lnfer_irrigation12_pre_pre_interaction',
                                    'lnfer_irrigation1_tmp_interaction', 'lnfer_irrigation12_tmp_tmp_interaction']
            reg_model = perform_regression(group_data, predictor_variables)
            print(f"Regression const for Zone {zone}, Group {group}: {reg_model.params['lnfer']}")
            y_temp_base = (
                reg_model.params['tmp_tmp_interaction'] * tmp_values**2 +
                reg_model.params['tmp'] * tmp_values +
                reg_model.params['pre_pre_interaction'] * pre_values**2 +
                reg_model.params['pre'] * pre_values +
                reg_model.params['lnfer'] * lnfer_values +
                reg_model.params['irrigation1'] * irrigation1_values +
                reg_model.params['irrigation12'] * irrigation1_values**2 +
                reg_model.params['lnfer_irrigation1_tmp_interaction'] * irrigation1_values * tmp_values * lnfer_values +
                reg_model.params['lnfer_irrigation12_tmp_tmp_interaction'] * irrigation1_values**2 * lnfer_values * tmp_values**2 +
                reg_model.params['lnfer_irrigation1_pre_interaction'] * irrigation1_values * pre_values * lnfer_values +
                reg_model.params['lnfer_irrigation12_pre_pre_interaction'] * irrigation1_values**2 * lnfer_values * pre_values**2 +
                reg_model.params['const']
            )
        else:
            group_data = sample_data[(sample_data['zone'] == zone) & (sample_data['group'] == group)]
            predictor_variables = ['lnfer', 'tmp', 'tmp_tmp_interaction', 'irrigation1', 'irrigation12', 'pre', 'pre_pre_interaction',
                                    'lnfer_tmp_interaction', 'lnfer_tmp_tmp_interaction',
                                    'lnfer_pre_interaction', 'lnfer_pre_pre_interaction']
            reg_model = perform_regression(group_data, predictor_variables)
            print(f"\nSubset for Zone {zone}, Group {group}:")
            print(f"Regression const for Zone {zone}, Group {group}: {reg_model.params['lnfer']}")
            y_temp_base = (
                reg_model.params['tmp_tmp_interaction'] * tmp_values**2 +
                reg_model.params['tmp'] * tmp_values +
                reg_model.params['pre_pre_interaction'] * pre_values**2 +
                reg_model.params['pre'] * pre_values +
                reg_model.params['lnfer'] * lnfer_values +
                reg_model.params['lnfer_tmp_interaction'] * tmp_values * lnfer_values +
                reg_model.params['lnfer_tmp_tmp_interaction'] * lnfer_values * tmp_values**2 +
                reg_model.params['lnfer_pre_interaction'] * pre_values * lnfer_values +
                reg_model.params['lnfer_pre_pre_interaction'] * lnfer_values * pre_values**2 +
                reg_model.params['const']
            )

        geoid_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['geoid'].values
        scenario_values = project_data[(project_data['group'] == group) & (project_data['zone'] == zone)]['scenario'].values
            
        for tmp, pre, lnfer, irrigation1,y_temp_base_val, geoid, scenario in zip(
            tmp_values, pre_values, lnfer_values, irrigation1_values,
            y_temp_base, geoid_values, scenario_values
        ):
            data_rows.append([tmp, pre, lnfer, irrigation1, y_temp_base_val, geoid, scenario])


# Create a DataFrame from the data
results_base = pd.DataFrame(data_rows, columns=['tmp','pre','lnfer','irrigation1',
                                               'y_temp_base', 'geoid', 'scenario'])

results_base = results_base[results_base['scenario'] == 2015]

# Rename the 'y_temp_base' column to 'y_temp_base2015'
results_base.rename(columns={'y_temp_base': 'y_temp_base2015'}, inplace=True)

# Keep only the desired columns in results_2015
results_base2015 = results_base[['geoid',  'y_temp_base2015']]

print(results_base2015)

Unique zones: [3. 2. 1.]
Unique groups: [1 2]

Subset for Zone 1, Group 1:

Subset for Zone 1, Group 1:
Regression const for Zone 1, Group 1: 0.45801946844805047

Subset for Zone 1, Group 2:

Subset for Zone 1, Group 2:
Regression const for Zone 1, Group 2: -0.1479070385993198

Subset for Zone 2, Group 1:

Subset for Zone 2, Group 1:
Regression const for Zone 2, Group 1: 0.45801945047853226

Subset for Zone 2, Group 2:

Subset for Zone 2, Group 2:
Regression const for Zone 2, Group 2: -0.14790934219663043

Subset for Zone 3, Group 1:

Subset for Zone 3, Group 1:
Regression const for Zone 3, Group 1: 0.45801945576264025

Subset for Zone 3, Group 2:

Subset for Zone 3, Group 2:
Regression const for Zone 3, Group 2: -0.14791391669856416
      geoid  y_temp_base2015
2      8001         8.480254
5      8013         8.916054
8      8025         8.732799
11     8039         8.666240
14     8069         8.752595
...     ...              ...
6272  48481         9.196748
6275  48489         8.295716
6278  48491         7.688966
6281  48493         8.843162
6284  48507         8.524203

[2095 rows x 2 columns]


data = results_base_t.merge(results_base, left_on=['geoid'], right_on=['geoid'], how='left')
data = data.drop(columns=['scenario_y'])
data.rename(columns={'scenario_x': 'scenario'}, inplace=True)

data = data.sort_values(by=['geoid', 'scenario'])
print(data)

data['yield_warming']=np.exp(data['y_temp_base_t']-data['y_temp_base2015']+data['lnyield_obs'])
data['gap_yield_warming']=(data['yield_warming']-np.exp(data['lnyield_obs']))/1000 #tonne/ha
data['gap_production_warming']=data['gap_yield_warming']*data['har']/10000  #10000 tonne

data['yield_obs']=(np.exp(data['lnyield_obs']))/1000 #tonne/ha
data['production_obs']=data['yield_obs']*data['har']/10000  #10000 tonne

print(data)
alldata=data

# Extract the corresponding lnfer_value and irrigation1_value
output_df = data[['geoid', 'scenario','gap_yield_warming','gap_production_warming']]
# Rename the columns to match your desired output
output_df.columns = ['geoid', 'scenario','gap_yield_warming','gap_production_warming']

# Define the output CSV file pat
output_csv_file = "maize_tem.csv"

# Save the 'output_df' DataFrame to a CSV file
output_df.to_csv(output_csv_file, index=False)

print(f"Data saved to '{output_csv_file}'")

      geoid  scenario  y_temp_base_t  lnyield_obs        har        tmp  \
3799   1001       1.5       9.365828     9.093130   446.6812  22.744158   
3248   1001       3.0       9.390342     9.093130   446.6812  22.744158   
3249   1003       1.5       9.008146     9.058842  2463.8170  24.155120   
3250   1003       3.0       8.974818     9.058842  2463.8170  24.155120   
3251   1005       1.5       9.247303     9.133368   808.6837  23.166352   
...     ...       ...            ...          ...        ...        ...   
1070  56029       3.0       8.798372     9.065753  1269.0000   9.406989   
1071  56031       1.5       9.339298     9.136227  3807.0000  13.419220   
1072  56031       3.0       9.271814     9.136227  3807.0000  13.419220   
1135  56043       1.5       9.479943     9.124074   972.0000  12.689904   
1073  56043       3.0       9.560979     9.124074   972.0000  12.689904   

            pre     lnfer  irrigation1  y_temp_base2015  
3799   8.678571  6.438559     0.429451         9.419198  
3248   8.678571  6.438559     0.429451         9.419198  
3249  11.511937  5.456232     1.376478         9.019800  
3250  11.511937  5.456232     1.376478         9.019800  
3251   8.473777  6.234782     0.153295         9.295728  
...         ...       ...          ...              ...  
1070   3.394136  5.616840     0.000000         8.733599  
1071   4.092800  6.229183     0.000000         9.293578  
1072   4.092800  6.229183     0.000000         9.293578  
1135   2.902834  6.780481     0.000000         9.435028  
1073   2.902834  6.780481     0.000000         9.435028  

[4190 rows x 10 columns]
      geoid  scenario  y_temp_base_t  lnyield_obs        har        tmp  \
3799   1001       1.5       9.365828     9.093130   446.6812  22.744158   
3248   1001       3.0       9.390342     9.093130   446.6812  22.744158   
3249   1003       1.5       9.008146     9.058842  2463.8170  24.155120   
3250   1003       3.0       8.974818     9.058842  2463.8170  24.155120   
3251   1005       1.5       9.247303     9.133368   808.6837  23.166352   
...     ...       ...            ...          ...        ...        ...   
1070  56029       3.0       8.798372     9.065753  1269.0000   9.406989   
1071  56031       1.5       9.339298     9.136227  3807.0000  13.419220   
1072  56031       3.0       9.271814     9.136227  3807.0000  13.419220   
1135  56043       1.5       9.479943     9.124074   972.0000  12.689904   
1073  56043       3.0       9.560979     9.124074   972.0000  12.689904   

            pre     lnfer  irrigation1  y_temp_base2015  yield_warming  \
3799   8.678571  6.438559     0.429451         9.419198    8431.748247   
3248   8.678571  6.438559     0.429451         9.419198    8640.996972   
3249  11.511937  5.456232     1.376478         9.019800    8494.620047   
3250  11.511937  5.456232     1.376478         9.019800    8216.172302   
3251   8.473777  6.234782     0.153295         9.295728    8821.461742   
...         ...       ...          ...              ...            ...   
1070   3.394136  5.616840     0.000000         8.733599    9232.882507   
1071   4.092800  6.229183     0.000000         9.293578    9720.060517   
1072   4.092800  6.229183     0.000000         9.293578    9085.755718   
1135   2.902834  6.780481     0.000000         9.435028    9594.921006   
1073   2.902834  6.780481     0.000000         9.435028   10404.826799   

      gap_yield_warming  gap_production_warming  yield_obs  production_obs  
3799          -0.462232               -0.020647   8.893981        0.397277  
3248          -0.252984               -0.011300   8.893981        0.397277  
3249          -0.099573               -0.024533   8.594193        2.117452  
3250          -0.378021               -0.093137   8.594193        2.117452  
3251          -0.437693               -0.035395   9.259154        0.748773  
...                 ...                     ...        ...             ...  
1070           0.579090                0.073486   8.653793        1.098166  
1071           0.434396                0.165375   9.285664        3.535052  
1072          -0.199908               -0.076105   9.285664        3.535052  
1135           0.421423                0.040962   9.173498        0.891664  
1073           1.231328                0.119685   9.173498        0.891664  

[4190 rows x 15 columns]
Data saved to 'maize_tem.csv'


test = alldata[(alldata['scenario'] == 1.5)&(alldata['geoid'] == 1001)]

print(test)

      geoid  scenario  y_temp_base_t  lnyield_obs       har        tmp  \
3799   1001       1.5       9.365828      9.09313  446.6812  22.744158   

           pre     lnfer  irrigation1  y_temp_base2015  yield_warming  \
3799  8.678571  6.438559     0.429451         9.419198    8431.748247   

      gap_yield_warming  gap_production_warming  yield_obs  production_obs  
3799          -0.462232               -0.020647   8.893981        0.397277


data = alldata[alldata['scenario'] == 3]
data1= data[data['gap_production_warming'] >0]
print(data)
print(data1)
print(np.min(data['gap_production_warming']),np.max(data['gap_production_warming']),np.mean(data['gap_production_warming']))
print(np.percentile(data['gap_production_warming'], 5),np.percentile(data['gap_production_warming'], 95))
print("sum of gap production", np.sum(data['gap_production_warming']))
print("sum of production total", np.sum(data['production_obs']))
print("sum of production total in warming", np.sum(data1['production_obs']))

# Replace values in 'gap_yield' greater than p90 with p90
data.loc[data['gap_production_warming'] > 2.1, 'gap_production_warming'] = 2.1
data.loc[data['gap_production_warming'] < -2.1, 'gap_production_warming'] = -2.1

data['geoid'] = data['geoid'].astype(float)


# Load US county boundaries shapefile using geopandas
shapefile_path = '/Users/chenchenren/postdoc/paper/2N and water-US/Data/cb_2018_us_county_20m/surface_shapefile.shp'
gdf_county = gpd.read_file(shapefile_path)

# Ensure 'GEOID' column in the shapefile has the 'object' data type
gdf_county['GEOID'] = gdf_county['GEOID'].astype(float)

# Rename the 'geoid' column to 'GEOID' in year_2020_data
data.rename(columns={'geoid': 'GEOID'}, inplace=True)

# Convert the 'GEOID' column in the 'data' DataFrame to 'object' data type
data['GEOID'] = data['GEOID'].astype(float)
# Now, merge 'data' with county boundaries based on 'GEOID' attribute
merged_data = gdf_county.merge(data, on='GEOID', how='left')

# Filter the data to include only geometries within the specified longitude and latitude range
filtered_data = merged_data.cx[-130:-60, 25:50]

cmap = mcolors.ListedColormap(['white'] + list(plt.cm.Spectral(np.linspace(0, 1, 255))))
norm = mcolors.Normalize(vmin=-2.2, vmax=2.2)

# Plot both county boundaries and 'y_temp' values on the same map within the specified range
fig, ax = plt.subplots(figsize=(10, 4))
shapefile_path = '/Users/chenchenren/postdoc/paper/2N and water-US/Data/cb_2018_us_state_20m/cb_2018_us_state_20m.shp'
gdf_state = gpd.read_file(shapefile_path)
gdf_state.rename(columns={'GEOID': 'GEOID_state'}, inplace=True)
gdf_state = gdf_state.cx[-130:-60, 25:50]
gdf_state.boundary.plot(ax=ax, linewidth=0.3, color='grey')  # Plot county boundaries in grey

sc = plt.scatter([], [], c=[], cmap=cmap, norm=norm)  # Create an empty ScalarMappable for the colorbar

# Define custom colorbar ticks
custom_ticks = [-2, -1, 0, 1, 2]

# Adjust the height of the colorbar by setting the shrink parameter
cbar = plt.colorbar(sc, ax=ax, label='Production change\n($10^{4}$ t)', orientation='vertical', pad=0.02, shrink=0.8, ticks=custom_ticks)

# Set the font size of the colorbar label
# Set the colorbar label text, font size, and padding
cbar.ax.set_ylabel('Production change\n($10^{4}$ tonnes/yr)', fontsize=16, fontfamily='Arial')
cbar.ax.tick_params(labelsize=16)  # Adjust tick label fontsize
cbar.ax.set_yticklabels(custom_ticks, fontsize=16, fontfamily='Arial')

filtered_data.plot(column='gap_production_warming', cmap=cmap, ax=ax, norm=norm)

# Add labels, titles, or other plot configurations as needed
ax.set_xticks([])  # Remove x-axis ticks
ax.set_yticks([])  # Remove y-axis ticks
ax.set_frame_on(False)  # Turn off the frame

plt.text(0.25, 0.2, '(c) Maize', transform=plt.gcf().transFigure,
         fontsize=18, fontweight='bold', fontfamily='Arial', ha='left', va='bottom')

# Save the entire figure as a JPG with DPI 300
plt.savefig('maize_all_production_3.jpg', format='jpg', dpi=300, bbox_inches='tight', pad_inches=0.1)

# Show the plot
plt.show()

      geoid  scenario  y_temp_base_t  lnyield_obs         har        tmp  \
3248   1001       3.0       9.390342     9.093130    446.6812  22.744158   
3250   1003       3.0       8.974818     9.058842   2463.8170  24.155120   
3800   1005       3.0       9.207111     9.133368    808.6837  23.166352   
3253   1009       3.0       9.452216     8.896661    517.2429  21.386674   
3254   1011       3.0       9.456756     9.043855    386.7075  22.986824   
...     ...       ...            ...          ...         ...        ...   
1065  56015       3.0       9.132208     9.101155  11643.5000  14.455618   
1067  56021       3.0       9.237525     8.890697   4066.9880  13.101518   
1070  56029       3.0       8.798372     9.065753   1269.0000   9.406989   
1072  56031       3.0       9.271814     9.136227   3807.0000  13.419220   
1073  56043       3.0       9.560979     9.124074    972.0000  12.689904   

            pre     lnfer  irrigation1  y_temp_base2015  yield_warming  \
3248   8.678571  6.438559     0.429451         9.419198    8640.996972   
3250  11.511937  5.456232     1.376478         9.019800    8216.172302   
3800   8.473777  6.234782     0.153295         9.295728    8473.937109   
3253   9.825275  6.467910     0.143544         9.504825    6933.032790   
3254   8.412398  6.550778     0.142055         9.440683    8603.527030   
...         ...       ...          ...              ...            ...   
1065   4.212446  6.046691     0.000000         9.216206    8243.303499   
1067   4.287189  5.969113     0.000000         9.157526    7869.079132   
1070   3.394136  5.616840     0.000000         8.733599    9232.882507   
1072   4.092800  6.229183     0.000000         9.293578    9085.755718   
1073   2.902834  6.780481     0.000000         9.435028   10404.826799   

      gap_yield_warming  gap_production_warming  yield_obs  production_obs  
3248          -0.252984               -0.011300   8.893981        0.397277  
3250          -0.378021               -0.093137   8.594193        2.117452  
3800          -0.785217               -0.063499   9.259154        0.748773  
3253          -0.374500               -0.019371   7.307533        0.377977  
3254           0.137175                0.005305   8.466352        0.327400  
...                 ...                     ...        ...             ...  
1065          -0.722339               -0.841055   8.965642       10.439145  
1067           0.604999                0.246052   7.264080        2.954293  
1070           0.579090                0.073486   8.653793        1.098166  
1072          -0.199908               -0.076105   9.285664        3.535052  
1073           1.231328                0.119685   9.173498        0.891664  

[2095 rows x 15 columns]
      geoid  scenario  y_temp_base_t  lnyield_obs        har        tmp  \
3254   1011       3.0       9.456756     9.043855   386.7075  22.986824   
3263   1031       3.0       9.558609     8.575626  2001.3650  23.365386   
3273   1045       3.0       9.498349     8.883370   823.0410  23.346208   
3290   1067       3.0       9.250031     8.858751  1445.5350  23.346362   
3292   1069       3.0       9.573791     8.947829   925.2023  23.538006   
...     ...       ...            ...          ...        ...        ...   
1062  55141       3.0       9.765169     9.058293  9748.5210  13.706112   
1064  56003       3.0       9.178361     9.037472  1950.7500  12.497862   
1067  56021       3.0       9.237525     8.890697  4066.9880  13.101518   
1070  56029       3.0       8.798372     9.065753  1269.0000   9.406989   
1073  56043       3.0       9.560979     9.124074   972.0000  12.689904   

           pre     lnfer  irrigation1  y_temp_base2015  yield_warming  \
3254  8.412398  6.550778     0.142055         9.440683    8603.527030   
3263  9.032545  6.731169     0.113650         9.545826    5369.064610   
3273  8.621824  6.616074     0.128754         9.472996    7396.215499   
3290  8.435011  6.049888     0.176296         9.205317    7357.425076   
3292  8.458624  6.752632     0.280205         9.526649    8062.435291   
...        ...       ...          ...              ...            ...   
1062  7.787753  6.466931     0.000000         9.753810    8687.599013   
1064  2.879550  6.291661     0.000000         9.153077    8627.898510   
1067  4.287189  5.969113     0.000000         9.157526    7869.079132   
1070  3.394136  5.616840     0.000000         8.733599    9232.882507   
1073  2.902834  6.780481     0.000000         9.435028   10404.826799   

      gap_yield_warming  gap_production_warming  yield_obs  production_obs  
3254           0.137175                0.005305   8.466352        0.327400  
3263           0.068196                0.013648   5.300869        1.060897  
3273           0.185164                0.015240   7.211051        0.593499  
3290           0.321735                0.046508   7.035690        1.017034  
3292           0.371259                0.034349   7.691176        0.711589  
...                 ...                     ...        ...             ...  
1062           0.098123                0.095656   8.589476        8.373469  
1064           0.215415                0.042022   8.412483        1.641065  
1067           0.604999                0.246052   7.264080        2.954293  
1070           0.579090                0.073486   8.653793        1.098166  
1073           1.231328                0.119685   9.173498        0.891664  

[597 rows x 15 columns]
-15.121197088855459 14.56734233552591 -0.44799736855099326
-3.548367489973667 1.0275948159689103
sum of gap production -938.5544871143309
sum of production total 33517.93639932836
sum of production total in warming 10222.009370426244

/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_78167/3664264077.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['geoid'] = data['geoid'].astype(float)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_78167/3664264077.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns={'geoid': 'GEOID'}, inplace=True)
/var/folders/vd/0_phd7hx2n51y4412862zww00000gp/T/ipykernel_78167/3664264077.py:29: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['GEOID'] = data['GEOID'].astype(float)