In [26]:
# STEP 1: Install dependency
!pip install pycountry

# STEP 2: Imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import patsy

# STEP 3: Read the three raw inputs (GDP per capita, cultural exports, V-Dem)
gdp = pd.read_csv('/content/GDPPC.csv')
culture = pd.read_csv('/content/culture_exports.csv')
vdem = pd.read_csv('/content/V-Dem-CY-Core-v15.csv')

# STEP 4: Trim stray whitespace from every header so the column lookups
# below match exactly.
for _frame in (gdp, culture, vdem):
    _frame.columns = _frame.columns.str.strip()

# STEP 5: Keep only the 'NY.GDP.PCAP.CD' series and pivot it from wide
# (one column per year) to long (one row per geoUnit-year).
is_gdp_pc = gdp['Series Code'].eq('NY.GDP.PCAP.CD')
gdp = gdp.loc[is_gdp_pc]
gdp_long = pd.melt(
    gdp,
    id_vars=['Country Name', 'geoUnit', 'Series Name', 'Series Code'],
    var_name='year',
    value_name='gdp_per_capita',
).assign(
    # The melted 'year' values are the former column headers; pull the first
    # 4-digit run out of each one. Headers without one become NaN.
    year=lambda d: d['year'].str.extract(r'(\d{4})', expand=False).astype(float),
    # Non-numeric GDP cells are coerced to NaN rather than raising.
    gdp_per_capita=lambda d: pd.to_numeric(d['gdp_per_capita'], errors='coerce'),
)

# STEP 6: Cultural exports -> rename the value column, force a numeric year,
# keep the three columns used downstream and drop incomplete rows.
culture = (
    culture
    .rename(columns={'value': 'cultural_exports'})
    .assign(year=lambda d: pd.to_numeric(d['year'], errors='coerce'))
    .loc[:, ['geoUnit', 'year', 'cultural_exports']]
    .dropna()
)

# STEP 7: V-Dem -> keep the merge keys plus v2x_libdem, complete rows only.
vdem = vdem.loc[:, ['geoUnit', 'year', 'v2x_libdem']].dropna()

# STEP 8: Left-join GDP and V-Dem onto the cultural-exports rows by
# (geoUnit, year), then keep only rows where every column is present.
# NOTE(review): gdp_per_capita is not a term in the STEP 10 formula, so here it
# only restricts the sample through dropna() -- confirm that is intended.
join_keys = ['geoUnit', 'year']
gdp_slice = gdp_long[join_keys + ['gdp_per_capita']]
merged = (
    culture
    .merge(gdp_slice, on=join_keys, how='left')
    .merge(vdem, on=join_keys, how='left')
    .dropna()
    # STEP 9: log(1 + x) transform of cultural exports (the dependent variable).
    .assign(log_exports=lambda d: np.log1p(d['cultural_exports']))
)

# STEP 10: OLS of log exports on v2x_libdem with one dummy per year
# (C(year)), using an HC3 robust covariance estimate.
formula = 'log_exports ~ v2x_libdem + C(year)'
y, X = patsy.dmatrices(formula, data=merged, return_type='dataframe')
model = sm.OLS(endog=y, exog=X).fit(cov_type='HC3')

# STEP 11: Report the estimation sample size and the regression table
print("✅ Final merged shape:", merged.shape)
print(model.summary())


✅ Final merged shape: (1043, 6)
                            OLS Regression Results                            
Dep. Variable:            log_exports   R-squared:                       0.032
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                     4.896
Date:                Fri, 25 Jul 2025   Prob (F-statistic):           5.94e-06
Time:                        02:03:27   Log-Likelihood:                -714.96
No. Observations:                1043   AIC:                             1448.
Df Residuals:                    1034   BIC:                             1492.
Df Model:                           8                                         
Covariance Type:                  HC3                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept 