* Collapse census data for CHC project
* Bryan Stuart
* This do-file starts with the Census/ACS microdata linked to the NUMIDENT and
* collapses it to means by birth year, birth month, survey year, and county of birth
* Updated 5/5/2016 (reflect various changes in basic processing)
* Updated 7/11/2016 (Incorporate geographically consistent "super counties" and use age 25-64 for disability)
* Updated 7/28/2016 (Subgroups are white male, white female, nonwhite male, nonwhite female)
* Updated 9/6/2016 (Collapse by month of birth instead of year) 
* Updated 2/12/2017 (add disability index with different sample)
* Updated 2018 to include LE estimates and those who moved and stayed - Matt Tarduno
* Updated 8/31/2021 cap drop some variables due to dtacr 20 differences - Kate Moulton
* Updated 11/5/2021 to restrict LE sample to the full info sample and add corresponding LE3 weights - Kate Moulton

* Start a fresh log for this run; close any log a previous run left open
capture log close
log using "$output/dtacr_23_birthmo_newsub.log", replace

* Build the county-level Chetty lookup that is merged onto the microdata below.
* cty2000 is a 5-digit code (state + county); split it into its two parts so
* the lookup can be keyed on statefip/coufip.
use "$datap/chetty_data_040816.dta", clear
isid cty2000
generate statefip = floor(cty2000/1000)
generate coufip = mod(cty2000, 1000)
isid statefip coufip
rename cz cz_chetty
keep statefip coufip cz_chetty e_rank_b_cty e_rank_b_cz pct_causal_p25*
tempfile temp
save `temp'

* Load a random $sample percent of the person-level Census/ACS file
use if runiform() <= $sample/100 using "$datatemp_basic/dtacr_20.dta", clear

* Cohort and age window: born 1950-1980, observed at ages 25-64
keep if inrange(birthyr,1950,1980) & inrange(age,25,64)

* Sex must be known
keep if female != .

* Single race code built from the three race dummies; drop if it is missing
generate race = race_white + 2*race_black + 3*race_other
keep if race != .
tabulate race female, missing

* County of birth must be identified (state and county FIPS of birth)
drop if co_fips_1==. | st_fips==.


*** Added 10/28/2018: flag people observed outside / inside their county of birth
* 5-digit county codes: birth county vs. county where the person is observed
generate birth_fips = 1000*st_fips + co_fips_1
generate obs_fips = 1000*statefip + coufip
* mover and stayer are complementary 0/1 indicators
generate mover = (birth_fips != obs_fips)
generate stayer = (birth_fips == obs_fips)


* A PIK is required (this should not remove anyone)
keep if pik != ""

* Remove every record of any PIK that shows up more than once within a survey year
generate one = 1
sort year pik
bysort year pik: egen num_pik = total(one)
tabulate num_pik, missing
* mult_pik marks records whose PIK is duplicated within their survey year
generate mult_pik = num_pik>1 & num_pik!=.
tabulate year if mult_pik==1
* num_mult_pik counts, per PIK, how many of its records are so marked
bysort pik: egen num_mult_pik = total(mult_pik)
keep if !(num_mult_pik>0 & num_mult_pik!=.)

* Month of birth is required because cells are defined by birth month
summarize birthyr birthmo
keep if birthmo != .

tabulate match_step, missing
tabulate birthyr match_step, missing


* Bring in Chetty data, keyed on the state/county codes statefip and coufip.
* keep(1 3) keeps every person and discards lookup rows with no matching person.
merge m:1 statefip coufip using `temp', gen(_mergechetty) keep(1 3)
gen nei_12 = e_rank_b_cty


* 7/23 this code merges in LE measures
* Merge keys: sex (copied from gender) and the quartile of hhincome.
* NOTE(review): incq is an unweighted quartile pooled over all survey years -- confirm
* this matches how the quartiles in chetty_reformatted.dta were defined.

gen sex=gender

xtile incq = hhincome, nq(4)

* keep(1 3) added for consistency with every other merge in this section. Without it,
* year/state/quartile/sex rows with no matching person were appended as empty
* observations and were counted in the missing-data diagnostics further down until
* the disclosure-sample keep removed them.
merge m:1 year statefip incq sex using "/projects/programs/foodstamps/master_folder/LE_build/dataSTATA/chetty_reformatted.dta", nogen keep(1 3)

* note 7.11.2019: RW Gompertz version does not have LE by cohort (that is, there is no 'year' to merge on)
*merge m:1 coufip sex year using "/projects/programs/foodstamps/master_folder/LE_build/dataSTATA/LE_gompertz_reformatted.dta", nogen

* 9/1/21 use the individual estimations instead
* NOTE(review): these four merges key on coufip without statefip -- confirm that coufip
* in the LE files is coded the same way as coufip in the microdata.
merge m:1 sex birthyr coufip using "/projects/programs/foodstamps/master_folder/LE_build/dataSTATA/LE_gompertz_reformatted_estimation1.dta", nogen keep(1 3)
merge m:1 sex year coufip using "/projects/programs/foodstamps/master_folder/LE_build/dataSTATA/LE_gompertz_reformatted_estimation2.dta", nogen keep(1 3)
merge m:1 sex birthyr coufip using "/projects/programs/foodstamps/master_folder/LE_build/dataSTATA/LE_gompertz_reformatted_estimation3.dta", nogen keep(1 3)
merge m:1 sex birthyr coufip using "/projects/programs/foodstamps/master_folder/LE_build/dataSTATA/LE_gompertz_reformatted_estimation4.dta", nogen keep(1 3)

* Which counties had no match in the Chetty data?
tab coufip statefip if _mergechetty==1

* Sanity check: do race and sex shares jump across birth years or survey years?
* For each characteristic, show the weighted mean and sd by birthyr, then by year.
foreach v of varlist race_white race_black race_other female {
	foreach byv of varlist birthyr year {
		tabstat `v' [aw=perwt], by(`byv') stats(mean sd)
	}
}

* Nonwhite = anyone not flagged as white
generate race_nonwhite = !(race_white == 1)
tabulate race_nonwhite race_black, missing


**** FOR DISCLOSURE ****

* Index-sample membership: a person is flagged for an index when at least one of
* its components is non-missing.
* Updated 1/16/17: the disability (dis_) components are not used to choose the
* disclosure sample, so there is no flag_dis.

* Human capital components hc_1 ... hc_6
generate flag_hc = 0
forvalues i = 1/6 {
	replace flag_hc = 1 if hc_`i' != .
}

* Neighborhood components nei_1 ... nei_12
generate flag_nei = 0
forvalues i = 1/12 {
	replace flag_nei = 1 if nei_`i' != .
}

* Economic self-sufficiency components, skipping ess_7 (not incarcerated)
generate flag_ess = 0
foreach i of numlist 1/6 8/10 {
	replace flag_ess = 1 if ess_`i' != .
}

* keeper marks people flagged for all three indices
egen flag = rowtotal(flag_*)
summarize flag_*
tabulate flag, missing
generate keeper = flag==3


* Does adding the HSA ESS index change the sample?
generate flag_hsa = 0
foreach v of varlist hc_2 ess_2 ess_5 ess_8 ess_9 ess_10 {
	replace flag_hsa = 1 if `v' != .
}
generate keeper_hsa = (flag + flag_hsa) == 4
tabulate keeper_hsa, missing
drop keeper_hsa flag_hsa

* Incarcerated people (ess_7==0) should carry no outcome other than
* "not incarcerated", but some do. Blank every hc, dis, and nei component for them
* so the sample size stays consistent across the dis, hc, and nei indices.
foreach stub in hc dis {
	forvalues i = 1/6 {
		replace `stub'_`i' = . if ess_7==0
	}
}
forvalues i = 1/12 {
	replace nei_`i' = . if ess_7==0
}

drop flag_* flag
* Restriction to the main disclosure sample is currently switched off:
*keep if keeper==1

**** END OF SAMPLE CUTS FOR DISCLOSURE ****

** Binary "value is positive" indicators for the variables that are logged later.
* Each indicator is 1 when the source variable is non-missing and above zero,
* and 0 otherwise (including when the source is missing). Any existing copy is
* dropped first so the indicator is always rebuilt from the current data.
foreach src in ess_6 ess_5 ess_8 ess_10 nei_1 nei_5 {
	capture drop `src'_1
	generate `src'_1 = (`src' > 0 & `src' != .)
}




*** Another check, added 1/23/2017
* Here we want to generate a complete-information sample, except that we will
* allow people to be missing either house value or rent, and we will allow them to
* be missing labor income
*
* Result: fullinfo = 1 for people aged 25-54 who are not incarcerated, who have
* at least one of nei_1 / nei_2, and who have every variable in the loop below
* non-missing (ess_5 excepted). Everyone else gets fullinfo = 0.
gen fullinfo = 1
replace fullinfo = 0 if !inrange(age,25,54)	// These variables are limited to ages 25-54
replace fullinfo = 0 if ess_7 == 0		// Incarcerated (reverse-coded variable)
* neiflag = 1 only when house value (nei_1) and gross rent (nei_2) are both missing
gen neiflag = nei_1==. & nei_2==.
tab neiflag, m
replace fullinfo = 0 if neiflag == 1		// Missing both home value and gross rent
* Loop over the required variables. ess_7, ess_5_1, nei_1, nei_2 and nei_1_1 are
* deliberately left out of the list. The *_1 indicators that are in the list are
* built with cond(...,1,0) earlier in this file, so they are never missing and
* cannot change fullinfo; they only add (all-zero) missing_* diagnostics.
foreach v of varlist hc_1 hc_2 hc_3 hc_4 hc_5 hc_6 ess_1 ess_2 ess_3 ess_4 ess_5 ///
	ess_6 /*ess_7*/ ess_8 ess_9 ess_10 /*ess_5_1*/ ess_6_1 ess_8_1 ess_10_1 ///
	/*nei_1 nei_2*/ nei_3 nei_4 nei_5 nei_6 nei_7 nei_8 nei_9 nei_10 nei_11 nei_12 ///
	/*nei_1_1*/ nei_5_1 {

	* Diagnostics for the log: weighted summary and number of missing values
	sum `v' [aw=perwt]
	count if `v' == .
	* Temporary 0/1 missingness marker, used for the diagnostics after the loop
	gen missing_`v' = `v' == .
	if "`v'"!="ess_5" {
		replace fullinfo = 0 if `v' == .	// Don't do this for ess_5 (labor income)
	}
}
egen totmissing = rowtotal(missing_*)	// Total number of variables missing
sum totmissing, d
sum missing_* if totmissing==1		// Among those with only one variable missing, which is most common?
correlate missing_*			// Which variables tend to be missing together?
* The missingness markers were only needed for the diagnostics above
drop missing_* totmissing neiflag

tab fullinfo, m



* NEXT, the disability-index sample: all six dis_ components must be present
generate fullinfo_dis = 1
forvalues i = 1/6 {
	summarize dis_`i' [aw=perwt]
	replace fullinfo_dis = 0 if dis_`i'==.
}

tabulate fullinfo_dis, missing
tabulate fullinfo fullinfo_dis, missing


* FINALLY, the incarcerated are kept as their own sample. incarc is 1 when
* ess_7 is 0, and is defined only for ages 25-54 in survey years 2006-2013
* (missing otherwise).
generate incarc = (ess_7 == 0) if inrange(age,25,54) & inrange(year,2006,2013)
summarize perwt if ess_7 == 0
tabulate fullinfo incarc, missing


** KEEP only the disclosure samples: the full-information sample (NEI/HC/ESS),
** the disability sample (DIS), and the incarcerated sample
keep if fullinfo==1 | fullinfo_dis==1 | incarc == 1

* Sample membership by survey year, then by birth year
foreach byv of varlist year birthyr {
	foreach s of varlist fullinfo fullinfo_dis incarc {
		tabulate `byv' `s', missing
	}
}

* 11/5/21: the LE measures are only used for the full-information sample,
* so blank LE1-LE4 for everyone else
forvalues k = 1/4 {
	replace LE`k' = . if fullinfo != 1
}
* How many full-information people have no LE3?
count if fullinfo == 1 & LE3 == .

**** END OF SAMPLE CUTS FOR DISCLOSURE ****







*** Log-transform nine variables in place:
* ess_5 ess_6 ess_8 ess_10 nei_1 nei_2 nei_5 nei_10 nei_11
*** Six of them have companion binary indicators (built earlier) so that sample
*** size is preserved when the log is undefined:
* ess_5 ess_6 ess_8 ess_10 nei_1 nei_5
foreach v of varlist ess_5 ess_6 ess_8 ess_10 nei_1 nei_2 nei_5 nei_10 nei_11 {
	if "`v'"!="ess_10" {
		replace `v' = ln(`v')
	}
	else {
		* Income from public sources is reverse coded, so flip the sign
		* before and after taking the log
		replace `v' = -ln(-`v')
	}
	summarize `v'
}


* "Not incarcerated" (ess_7) is only meaningful inside the incarceration or
* full-information samples, and the variable does not exist for 2000-2005
replace ess_7 = . if incarc != 1 & fullinfo != 1
replace ess_7 = . if year<=2005





* Standardize every HC / ESS / NEI outcome against the 1950-1954 birth cohorts.
* For each outcome and each group, take the perwt-weighted mean and sd among
* people born 1950-1954 (ages 25-54, full-information sample), then apply them
* to everyone in that group: z = (x - mean) / sd.
* Updated 1/10/2017: includes the binary indicators for the logged variables;
* incarceration (ess_7) is not part of the ESS index. The dis_ variables are
* standardized separately below.

* Sample common to all groups, plus the extra restriction that defines each group
local base "inrange(age,25,54) & fullinfo==1"
local sel_all ""
local sel_wm "& race_white==1 & female==0"
local sel_wf "& race_white==1 & female==1"
local sel_nm "& (race_black==1 | race_other==1) & female==0"
local sel_nf "& (race_black==1 | race_other==1) & female==1"

foreach v of varlist hc_1 hc_2 hc_3 hc_4 hc_5 hc_6 ///
	ess_1 ess_2 ess_3 ess_4 ess_5 ess_6 ess_8 ess_9 ess_10 ///
	nei_1 nei_2 nei_3 nei_4 nei_5 nei_6 nei_7 nei_8 nei_9 nei_10 nei_11 nei_12 ///
	ess_5_1 ess_6_1 ess_8_1 ess_10_1 nei_1_1 nei_5_1 {

	* Groups: all, white males, white females, nonwhite males, nonwhite females
	foreach g in all wm wf nm nf {
		summarize `v' if inrange(birthyr,1950,1954) & `base' `sel_`g'' [aw=perwt]
		generate z_`v'_`g' = (`v'-r(mean))/r(sd) if `base' `sel_`g''
	}

	compress	// experienced memory issues before
}


* Standardize the disability outcomes against the 1950-1954 birth cohorts.
* Same procedure as for the other indices, but on the disability sample:
* ages 25-64 (not 25-54) and fullinfo_dis==1.

* Sample common to all groups, plus the extra restriction that defines each group
local base_dis "inrange(age,25,64) & fullinfo_dis==1"
local sel_all ""
local sel_wm "& race_white==1 & female==0"
local sel_wf "& race_white==1 & female==1"
local sel_nm "& (race_black==1 | race_other==1) & female==0"
local sel_nf "& (race_black==1 | race_other==1) & female==1"

forvalues i = 1/6 {
	* Groups: all, white males, white females, nonwhite males, nonwhite females
	foreach g in all wm wf nm nf {
		summarize dis_`i' if inrange(birthyr,1950,1954) & `base_dis' `sel_`g'' [aw=perwt]
		generate z_dis_`i'_`g' = (dis_`i'-r(mean))/r(sd) if `base_dis' `sel_`g''
	}

	compress	// experienced memory issues before
}
*/



**** Super counties ****
* Convert county-of-birth codes to "super counties" that are
* consistent from 1950-1990
* NOTE(review): countyFipsToREISfips and fixCounty are not defined in this file;
* presumably the do-file run on the next line defines them -- confirm.
do "$dofile/countyStandardizeGNIS1950.do"
* 5-digit county-of-birth code (state FIPS * 1000 + county FIPS)
gen fips = 1000*st_fips + co_fips_1
* Save original fips
gen fips_orig = fips
countyFipsToREISfips, county(fips)
fixCounty, county(fips)
* Replace county fips code with new version.
* The state variable is written out in full (st_fips, previously st_fip) so the line
* does not rely on variable abbreviation, which fails under "set varabbrev off" or if
* another variable starting with st_fip is ever added.
replace co_fips_1 = fips - 1000*st_fips



* Collapse into birth-year/survey-year/county cells
* But first generate variables that count the number of raw observations in each cell
* Use these as weights in the analysis. Each obs* variable is 1 for a person who
* belongs to the corresponding sample and has a positive person weight, and missing
* otherwise, so its (raw)sum within a cell is the cell's observation count.

* Weights for "full information" sample (ages 25-54)
sum perwt if inrange(age,25,54) & fullinfo==1
gen obs = 1 if perwt>0 & perwt!=. & inrange(age,25,54) & fullinfo==1
tab obs fullinfo, m

* Weights for "full information" disability sample (ages 25-64)
sum perwt if inrange(age,25,64) & fullinfo_dis==1
gen obs_dis = 1 if perwt>0 & perwt!=. & inrange(age,25,64) & fullinfo_dis==1
tab obs_dis fullinfo_dis, m

* Weight for incarceration regression (ages 25-54, survey years 2006-2013)
sum perwt if inrange(age,25,54) & (incarc==1 | fullinfo==1)
gen obs_inc = 1 if perwt>0 & perwt!=. & inrange(age,25,54) & (incarc==1 | fullinfo==1) & inrange(year,2006,2013)
tab obs_inc, m

* Weights for the LE1-LE4 samples: full-information observations with that LE measure
forvalues k = 1/4 {
	gen obs_LE`k' = 1 if obs == 1 & !missing(LE`k')
	tab obs_LE`k', m
}


* Couple of quick data checks before collapsing: observation counts by birth year
* and by survey year for each of the three samples
tabstat obs, by(birthyr) stat(sum)
tabstat obs, by(year) stat(sum)
tabstat obs_dis, by(birthyr) stat(sum)
tabstat obs_dis, by(year) stat(sum)
tabstat obs_inc, by(birthyr) stat(sum)
* Fixed: this line previously repeated obs_dis, so obs_inc by survey year was never shown
tabstat obs_inc, by(year) stat(sum)


***** Disability outcomes: cell means for people age 25-64 (fullinfo_dis sample)
* Edits 11.05.2021: only LE_chetty is carried through the disability collapses,
* not the other LE measures.
* Each pass collapses the microdata to one row per cell and saves it to a
* tempfile named temp_<group>_dis for the merge step further down.

* Cell definition
local cell "st_fips co_fips_1 birthyr stayer birthmo year"

*** Pooled race and sex: every match (exactmatch=0), then exact matches only
*** (exactmatch=1). The exact-match version is only produced for the pooled
*** group; it could be extended to race/sex.
foreach s in all exact {
	preserve
	if "`s'"=="exact" {
		keep if match_step=="EXACT"
	}
	keep if fullinfo_dis==1
	collapse (mean) LE_chetty* z_dis*_all dis_* age_dis=age female_dis=female ///
		race_white_dis=race_white ///
		(rawsum) perwt_dis=perwt obs_dis [aw=perwt], ///
		by(`cell')
	* Tag the un-normalized variables with the group suffix
	foreach v of varlist LE_chetty* dis_* perwt* obs* age* female* race_white* {
		rename `v' `v'_all
	}
	generate exactmatch = ("`s'"=="exact")
	tempfile temp_`s'_dis
	save `temp_`s'_dis'
	restore
}

*** Race-by-sex subgroups: white males, white females, nonwhite males, nonwhite females
local keep_wm "race_white==1 & female==0"
local keep_wf "race_white==1 & female==1"
local keep_nm "female==0 & (race_black==1 | race_other==1)"
local keep_nf "female==1 & (race_black==1 | race_other==1)"

foreach g in wm wf nm nf {
	preserve
	keep if `keep_`g''
	keep if fullinfo_dis==1
	collapse (mean) LE_chetty* z_dis*_`g' dis_* age_dis=age ///
		(rawsum) perwt_dis=perwt obs_dis [aw=perwt], ///
		by(`cell')
	* Tag the un-normalized variables with the group suffix
	foreach v of varlist LE_chetty* dis_* age* perwt* obs* {
		rename `v' `v'_`g'
	}
	tempfile temp_`g'_dis
	save `temp_`g'_dis'
	restore
}
*/


***** Non-disability outcomes: cell means for people age 25-54 who are in the
***** full-information sample or the incarcerated sample
* Each pass collapses the microdata to one row per cell and saves it to a
* tempfile named temp_<group> for the merge step further down.

* Cell definition, and the totals taken as raw (unweighted) sums in every pass
local cell "st_fips co_fips_1 birthyr stayer birthmo year"
local sums "perwt* obs obs_inc obs_LE*"

*** Pooled race and sex: every match (exactmatch=0), then exact matches only
*** (exactmatch=1). The exact-match version is only produced for the pooled
*** group; it could be extended to race/sex.
foreach s in all exact {
	preserve
	keep if inrange(birthyr,1950,1980) & inrange(age,25,54)
	if "`s'"=="exact" {
		keep if match_step=="EXACT"
	}
	keep if fullinfo==1 | incarc==1
	collapse (mean) LE* z_*_all hc_* ess_* nei_* age female race_white race_black ///
		race_nonwhite tractobs ///
		(rawsum) `sums' [aw=perwt], ///
		by(`cell')
	* Tag the un-normalized variables with the group suffix
	foreach v of varlist LE* hc_* ess_* nei_* tractobs perwt* obs* age* female* race_* {
		rename `v' `v'_all
	}
	generate exactmatch = ("`s'"=="exact")
	tempfile temp_`s'
	save `temp_`s''
	restore
}

*** Race-by-sex subgroups: white males, white females, nonwhite males, nonwhite females
local keep_wm "race_white==1 & female==0"
local keep_wf "race_white==1 & female==1"
local keep_nm "female==0 & (race_black==1 | race_other==1)"
local keep_nf "female==1 & (race_black==1 | race_other==1)"

foreach g in wm wf nm nf {
	preserve
	keep if `keep_`g''
	keep if inrange(birthyr,1950,1980) & inrange(age,25,54)
	keep if fullinfo==1 | incarc==1
	collapse (mean) LE* z_*_`g' hc_* ess_* nei_* age tractobs ///
		(rawsum) `sums' [aw=perwt], ///
		by(`cell')
	* Tag the un-normalized variables with the group suffix
	foreach v of varlist LE* hc_* ess_* nei_* age* tractobs perwt* obs* {
		rename `v' `v'_`g'
	}
	tempfile temp_`g'
	save `temp_`g''
	restore
}



*** MERGE
* Exact-match cells: put the disability and non-disability versions side by side
use `temp_exact_dis', clear
merge 1:1 st_fips co_fips_1 birthyr birthmo stayer year using `temp_exact', gen(_mergeexact)
tempfile temp_exact_all
save `temp_exact_all'


* All-match cells. Start from the disability file: some cells contain only people
* aged 55-64, who have disability outcomes but no non-disability outcomes.
use `temp_all_dis', clear
local cell "st_fips co_fips_1 birthyr stayer birthmo year"

* Disability outcomes for the four race-by-sex groups
foreach g in wm wf nm nf {
	merge 1:1 `cell' using `temp_`g'_dis', gen(_merge`g'_dis)
}

* Non-disability outcomes, pooled and for the four race-by-sex groups
foreach g in all wm wf nm nf {
	merge 1:1 `cell' using `temp_`g'', gen(_merge`g')
}

* Stack the exact-match cells (exactmatch==1) underneath
append using `temp_exact_all'
*/





* Create indices and apply labels, once per group.
* `x' is the group suffix used in variable names (all, wm, wf, nm, nf) and `g' is
* the readable group description used in every variable label. (The ESS and NEI
* labels previously showed the suffix code instead of the description.)
* Stata variable labels are limited to 80 characters, so two labels are worded
* compactly to stay within the limit for the longest group name.
foreach x in all wm wf nm nf {

if "`x'"=="all" local g "all"
if "`x'"=="wm" local g "white males"
if "`x'"=="wf" local g "white females"
if "`x'"=="nm" local g "nonwhite males"
if "`x'"=="nf" local g "nonwhite females"

* note that rowmean function ignores missing values (they don't contribute to mean)
* Updated January 2017: Take Not Incarcerated out of ESS index, composite is only ESS + NEI + HC
egen z_hc_`x' = rowmean(z_hc_1_`x' z_hc_2_`x' z_hc_3_`x' z_hc_4_`x' z_hc_5_`x' z_hc_6_`x') 	
egen z_ess_`x' = rowmean(z_ess_1_`x' z_ess_2_`x' z_ess_3_`x' z_ess_4_`x' z_ess_5_`x' ///
	z_ess_6_`x' z_ess_8_`x' z_ess_9_`x' z_ess_10_`x' z_ess_5_1_`x' z_ess_6_1_`x' ///
	z_ess_8_1_`x' z_ess_10_1_`x') 
egen z_dis_`x' = rowmean(z_dis_1_`x' z_dis_2_`x' z_dis_3_`x' z_dis_4_`x' z_dis_5_`x' ///
	z_dis_6_`x')	// /*z_dis_7_`x' z_dis_8_`x'*/ 
egen z_nei_`x' = rowmean(z_nei_1_`x' z_nei_2_`x' z_nei_3_`x' z_nei_4_`x' z_nei_5_`x' ///
	z_nei_6_`x' z_nei_7_`x' z_nei_8_`x' z_nei_9_`x' z_nei_10_`x' z_nei_11_`x' z_nei_12_`x' ///
	z_nei_1_1_`x' z_nei_5_1_`x') 
egen z_composite_`x' = rowmean(z_hc_`x' z_ess_`x' z_nei_`x')	

* New index: approximation of Hoynes Schanzenbach Almond 2016 ESS index
* Updated 1/16/17 to add binary indicators for logged variables
egen z_hsa_`x' = rowmean(z_hc_2_`x' z_ess_2_`x' z_ess_5_`x' z_ess_8_`x' ///
	z_ess_9_`x' z_ess_10_`x' z_ess_5_1_`x' z_ess_8_1_`x' z_ess_10_1_`x')

la var z_hc_1_`x' "Years of schooling, `g' (normalized)"
la var z_hc_2_`x' "High school or GED completed, `g' (normalized)"
la var z_hc_3_`x' "Attended some college, `g' (normalized)"
la var z_hc_4_`x' "Completed 4 year college, `g' (normalized)"
la var z_hc_5_`x' "Completed professional or doctoral degree, `g' (normalized)"
la var z_hc_6_`x' "Has a professional job, `g' (normalized)"

la var hc_1_`x' "Years of schooling, `g'"
la var hc_2_`x' "High school or GED completed, `g'"
la var hc_3_`x' "Attended some college, `g'"
la var hc_4_`x' "Completed 4 year college, `g'"
la var hc_5_`x' "Completed professional or doctoral degree, `g'"
la var hc_6_`x' "Has a professional job, `g'"

la var z_ess_1_`x' "In labor force, `g' (normalized)"
la var z_ess_2_`x' "Worked last year, `g' (normalized)"
la var z_ess_3_`x' "Number of weeks worked last year, `g' (normalized)"
la var z_ess_4_`x' "Usual hours worked per week, `g' (normalized)"
la var z_ess_5_`x' "Log labor income (wage + business/farm), `g' (normalized)"
la var z_ess_6_`x' "Log non-labor income not from public sources, `g' (normalized)"
*la var z_ess_7_`x' "Not incarcerated (avail. 2006-2013), `g' (normalized)"
la var z_ess_8_`x' "Log small family income to poverty ratio, `g' (normalized)"
la var z_ess_9_`x' "Not in poverty indicator, `g' (normalized)"
la var z_ess_10_`x' "- log of income from public sources, `g' (normalized)"
la var z_ess_5_1_`x' "Labor income greater than 0, `g' (normalized)"
* "> 0" rather than "greater than 0" keeps this label within 80 characters
la var z_ess_6_1_`x' "Non-labor income not from public sources > 0, `g' (normalized)"
la var z_ess_8_1_`x' "Small family income greater than 0, `g' (normalized)"
la var z_ess_10_1_`x' "No income from public sources, `g' (normalized)"

la var ess_1_`x' "In labor force, `g'"
la var ess_2_`x' "Worked last year, `g'"
la var ess_3_`x' "Number of weeks worked last year, `g'"
la var ess_4_`x' "Usual hours worked per week, `g'"
la var ess_5_`x' "Log labor income (wage + business/farm), `g'"
la var ess_5_1_`x' "Positive labor income (wage + business/farm), `g'"
la var ess_6_`x' "Log non-labor income not from public sources, `g'"
la var ess_6_1_`x' "Positive non-labor income not from public sources, `g'"
la var ess_7_`x' "Not incarcerated (avail. 2006-2013), `g'"
la var ess_8_`x' "Log small family income to poverty ratio, `g'"
la var ess_8_1_`x' "Positive small family income to poverty ratio, `g'"
la var ess_9_`x' "Not in poverty indicator, `g'"
la var ess_10_`x' "- log of income from public sources, `g'"
la var ess_10_1_`x' "No income from public sources, `g'"

la var z_dis_1_`x' "No work disability (only avail. 2000-2007), `g' (normalized)"
la var z_dis_2_`x' "No ambulatory difficulty, `g' (normalized)"
la var z_dis_3_`x' "No cognitive difficulty, `g' (normalized)"
la var z_dis_4_`x' "No independent living difficulty, `g' (normalized)"
la var z_dis_5_`x' "No vision/hearing difficulty, `g' (normalized)"
la var z_dis_6_`x' "No self-care difficulty, `g' (normalized)"
*la var z_dis_7_`x' "Share which lived to 2000 (NUMIDENT), `g'"
*la var z_dis_8_`x' "Age at death, conditional on death (NUMIDENT), `g'"

la var dis_1_`x' "No work disability (only avail. 2000-2007), `g'"
la var dis_2_`x' "No ambulatory difficulty, `g'"
la var dis_3_`x' "No cognitive difficulty, `g'"
la var dis_4_`x' "No independent living difficulty, `g'"
la var dis_5_`x' "No vision/hearing difficulty, `g'"
la var dis_6_`x' "No self-care difficulty, `g'"
*la var z_dis_7_`x' "Share which lived to 2000 (NUMIDENT), `g'"
*la var z_dis_8_`x' "Age at death, conditional on death (NUMIDENT), `g'"

la var z_nei_1_`x' "Log house value, `g' (normalized)"
la var z_nei_2_`x' "Log gross rent, `g' (normalized)"
la var z_nei_3_`x' "Home ownership, `g' (normalized)"
la var z_nei_4_`x' "Residence with single family, `g' (normalized)"
* Worded compactly to keep this label within 80 characters
la var z_nei_5_`x' "Log tract mean small family income/poverty ratio, `g' (normalized)"
la var z_nei_6_`x' "-1 x teen pregnancy rate in tract, `g' (normalized)"
la var z_nei_7_`x' "-1 x share single head of household in tract, `g' (normalized)"
la var z_nei_8_`x' "-1 x share of kids in poverty in tract, `g' (normalized)"
la var z_nei_9_`x' "Mean home ownership in tract, `g' (normalized)"
la var z_nei_10_`x' "Log median value of home in tract, `g' (normalized)"
la var z_nei_11_`x' "Log median gross rent in tract, `g' (normalized)"
la var z_nei_12_`x' "Chetty and Hendren, absolute upward mobility, `g' (normalized)"
la var z_nei_1_1_`x' "House value greater than 0, `g' (normalized)"
la var z_nei_5_1_`x' "Mean small family income in tract greater than 0, `g' (normalized)"

la var nei_1_`x' "Log house value, `g'"
la var nei_1_1_`x' "Positive house value, `g'"
la var nei_2_`x' "Log gross rent, `g'"
la var nei_3_`x' "Home ownership, `g'"
la var nei_4_`x' "Residence with single family, `g'"
la var nei_5_`x' "Log mean small family income to poverty ratio in tract, `g'"
la var nei_5_1_`x' "Positive mean small family income to poverty ratio in tract, `g'"
la var nei_6_`x' "-1 x teen pregnancy rate in tract, `g'"
la var nei_7_`x' "-1 x share single head of household in tract, `g'"
la var nei_8_`x' "-1 x share of kids in poverty in tract, `g'"
la var nei_9_`x' "Mean home ownership in tract, `g'"
la var nei_10_`x' "Log median value of home in tract, `g'"
la var nei_11_`x' "Log median gross rent in tract, `g'"
la var nei_12_`x' "Chetty and Hendren, absolute upward mobility, `g'"

la var z_hc_`x' "Human capital sub-index, `g'"
la var z_ess_`x' "Economic self-sufficiency sub-index, `g'"
la var z_dis_`x' "Physical health sub-index, `g'"
la var z_nei_`x' "Neighborhood sub-index, `g'"
la var z_composite_`x' "Composite index, `g'"
la var z_hsa_`x' "Hoynes Almond Schanzenbach ESS sub-index, `g'"

*la var obs1999_`x' "Number of NUMIDENT obs used to construct dis_7, `g'"
*la var obsdead_`x' "Number of NUMIDENT obs used to construct dis_8, `g'"
*la var yearwgt_`x' "Weight for collapse across survey years, `g'" 
la var tractobs_`x' "Number of observations in census tract, `g'"
la var perwt_`x' "Sum of person weights, `g'"
la var obs_`x' "Number of observations from Census/ACS, `g'"
la var obs_dis_`x' "Number of dis observs from Census/ACS, `g'"
la var obs_LE1_`x' "Number of LE1 sample obs from Census/ACS, `g'"
la var obs_LE2_`x' "Number of LE2 sample obs from Census/ACS, `g'"
la var obs_LE3_`x' "Number of LE3 sample obs from Census/ACS, `g'"
la var obs_LE4_`x' "Number of LE4 sample obs from Census/ACS, `g'"

}

* Overview of the finished cell-level file
summarize

describe

* Cell sizes for the all-match rows (pooled sex and race)
summarize obs_* if exactmatch==0

* How do the subgroup merges look among cells with more than 100, then more than
* 20, pooled observations? (The same tabulation for _mergeall is not run.)
foreach cut in 100 20 {
	foreach g in wm wf nm nf {
		tabulate _merge`g' if obs_all>`cut' & exactmatch==0
	}
}


* Sanity check on the collapsed data: do race shares jump across birth years or
* survey years? First weighted by the number of observations in each cell...
foreach v of varlist race_white_all race_black_all race_nonwhite_all {
	foreach byv of varlist birthyr year {
		tabstat `v' [aw=obs_all] if exactmatch==0, by(`byv') stats(mean sd)
	}
}

* ...then unweighted
foreach v of varlist race_white_all race_black_all race_nonwhite_all {
	foreach byv of varlist birthyr year {
		tabstat `v' if exactmatch==0, by(`byv') stats(mean sd)
	}
}



* Shrink storage types, write the cell-level file, and finish the log
compress
save "$datatemp/dtacr_23_birthmo_newsub.dta", replace

log close
