* Collapse census data	
* Bryan Stuart
* Updated 5/2/2016 (use birth month for food stamps project)
* Updated 5/23/2016 (collapse to birth-year level instead of birth-month)
* Updated 6/6/2016 (new way of collapsing)
* Updated 7/11/2016 (Incorporate geographically consistent "super counties")
* Updated 12/2016 (Restrict sample to those we want to disclose for index results)
* Updated 12/14/2016 (Test implications of converting dollar variables to logs)

* Start a fresh log for this run, closing any log that is still open
cap log close
log using "$output/dtacr_23.log", replace 

*local for testing
*local teststat "if st_fips == 46"

* Prepare Chetty data for merge:
* split the 5-digit county code into separate state and county parts, keep only
* the measures used further down, and park the result in a tempfile
use "$datap/chetty_data_040816.dta", clear
isid cty2000	// 5 digit: state + county
rename cz cz_chetty
gen statefip = floor(cty2000/1000)
gen coufip = cty2000 - 1000*statefip
isid statefip coufip
keep statefip coufip cz_chetty e_rank_b_cty e_rank_b_cz pct_causal_p25*
tempfile temp
save `temp'

* Census data at micro level (the optional teststat local, when defined above,
* restricts the load for testing)
use "$datatemp_basic/dtacr_20.dta" `teststat', clear

***** IMPOSE SAMPLE RESTRICTIONS *****

* Birth cohorts 1950-1980 observed at ages 25-54
keep if inrange(birthyr,1950,1980) & inrange(age,25,54)

* Sex must be present
keep if female!=.
* Single race code built from the three race dummies
gen race = race_white + 2*race_black + 3*race_other
* Race must be present
keep if race!=.
tab race female, m

* County and state of birth must both be present
drop if co_fips_1==. | st_fips==.

* PIK must be present (shouldn't do anything)
keep if pik!=""

* Duplicate PIKs: count records per PIK within survey year, then drop every
* record (in all years) of any PIK that is duplicated in at least one year
gen one = 1
bys year pik: egen num_pik = total(one)
tab num_pik, mi
gen mult_pik = num_pik>1 & num_pik!=.
tab year if mult_pik==1
bys pik: egen num_mult_pik = total(mult_pik)
drop if num_mult_pik>0 & num_mult_pik!=.

* Birth month must be present
su birthyr birthmo
keep if birthmo!=.

* Distribution of match_step, overall and by birth year
tab match_step, mi
tab birthyr match_step, mi

* Attach the Chetty measures by state/county code; unmatched Chetty rows are
* discarded, unmatched census rows are kept
merge m:1 statefip coufip using `temp', gen(_mergechetty) keep(1 3)
gen nei_12 = e_rank_b_cty

* Which counties found no Chetty record?
tab coufip statefip if _mergechetty==1

* Quick data checks: do observables jump across birth or survey years?
foreach demog in race_white race_black race_other female {
	foreach dim in birthyr year {
		tabstat `demog' [aw=perwt], by(`dim') stats(mean sd)
	}
}

* Non-white indicator: 1 whenever race_white is anything other than 1
gen race_nonwhite = (race_white != 1)
tab race_nonwhite race_black, m






**** FOR DISCLOSURE ****
* For the incarcerated (ess_7==0) every index component other than
* "not incarcerated" should be missing. Some records carry values anyway, so
* blank them out here to keep the sample size consistent across the
* dis, hc, and nei indices.
local blank_list hc_1 hc_2 hc_3 hc_4 hc_5 hc_6 ///
	dis_1 dis_2 dis_3 dis_4 dis_5 dis_6 ///
	nei_1 nei_2 nei_3 nei_4 nei_5 nei_6 nei_7 nei_8 nei_9 nei_10 nei_11 nei_12
foreach outcome of varlist `blank_list' {
	replace `outcome' = . if ess_7==0
}



* Identify index sample membership: a record is flagged for a domain when at
* least one of that domain's components differs from "."
* ess_7 is deliberately left out of the ESS list, and no DIS flag is built
* (Updated 1/16/17: Not using DIS in this sample)
local member_hc  hc_1 hc_2 hc_3 hc_4 hc_5 hc_6
local member_nei nei_1 nei_2 nei_3 nei_4 nei_5 nei_6 nei_7 nei_8 nei_9 nei_10 nei_11 nei_12
local member_ess ess_1 ess_2 ess_3 ess_4 ess_5 ess_6 ess_8 ess_9 ess_10
foreach dom in hc nei ess {
	gen flag_`dom' = 0
	foreach comp of varlist `member_`dom'' {
		replace flag_`dom' = 1 if `comp' != .
	}
}

* Number of domains each record belongs to; keeper requires all three
egen flag = rowtotal(flag_*)
sum flag_*
tab flag, m
gen keeper = flag==3
drop flag_* flag



*** Extra check, added 1/19/2017
* Shared component lists (ess_7 is excluded on purpose)
local full_hc_ess hc_1 hc_2 hc_3 hc_4 hc_5 hc_6 ess_1 ess_2 ess_3 ess_4 ess_5 ///
	ess_6 ess_8 ess_9 ess_10 ess_5_1 ess_6_1 ess_8_1 ess_10_1
local full_nei nei_1 nei_2 nei_3 nei_4 nei_5 nei_6 nei_7 nei_8 nei_9 nei_10 nei_11 nei_12 ///
	nei_1_1 nei_5_1

* How would the sample change if we limited it to people who have all non-missing data?
gen nomissing = 1
foreach comp of varlist `full_hc_ess' `full_nei' {
	sum `comp' [aw=perwt]
	count if `comp' == .
	replace nomissing = 0 if `comp' == .
}

tab nomissing keeper, m


* What if it was limited to people with non-missing data in HC and ESS?
gen nomissing_2cat = 1
foreach comp of varlist `full_hc_ess' {
	sum `comp' [aw=perwt]
	count if `comp' == .
	replace nomissing_2cat = 0 if `comp' == .
}

tab nomissing_2cat keeper, m
tab nomissing_2cat nomissing, m


*** Another check, added 1/23/2017
* Complete-information sample with two allowances: a person may be missing
* either house value (nei_1) or gross rent (nei_2) but not both, and may be
* missing labor income (ess_5)
gen nomissing3 = 1
gen neiflag = nei_1==. & nei_2==.
tab neiflag, m
* Missing both home value and gross rent
replace nomissing3 = 0 if neiflag == 1

* Components checked here; ess_7, ess_5_1, nei_1, nei_2 and nei_1_1 are left out
local relaxed_list hc_1 hc_2 hc_3 hc_4 hc_5 hc_6 ess_1 ess_2 ess_3 ess_4 ess_5 ///
	ess_6 ess_8 ess_9 ess_10 ess_6_1 ess_8_1 ess_10_1 ///
	nei_3 nei_4 nei_5 nei_6 nei_7 nei_8 nei_9 nei_10 nei_11 nei_12 ///
	nei_5_1
foreach comp of varlist `relaxed_list' {
	sum `comp' [aw=perwt]
	count if `comp' == .
	gen missing_`comp' = `comp' == .
	* Labor income (ess_5) gets a missing indicator but never disqualifies
	if "`comp'"=="ess_5" continue
	replace nomissing3 = 0 if missing_`comp' == 1
}

* Total number of variables missing
egen totmissing = rowtotal(missing_*)
sum totmissing, d
* Among those with only one variable missing, which is most common?
sum missing_* if totmissing==1
* Which variables tend to be missing together?
correlate missing_*

tab nomissing3 keeper, m
tab nomissing3 nomissing_2cat, m

drop missing_* totmissing neiflag

* Is labor income more likely to be missing if you're not in workforce?
foreach lfvar in ess_1 ess_2 {
	tab `lfvar' ess_5_1, m
}

** KEEP only disclosure sample
** Exactly one of the three keep lines below should be active; the other two
** must stay commented out. keeper, nomissing and nomissing_2cat are the
** sample indicators built earlier in this file.
keep if keeper==1		/* Restrict to main disclosure sample */
*keep if nomissing == 1		// Restrict to NO-MISSING sample
*keep if nomissing_2cat == 1	// Restrict to NO-MISSING sample for HC and ESS



**** END OF SAMPLE CUTS FOR DISCLOSURE ****




/**** Look at consequences of logging dollar variables *****

gen ess_10_rev = -ess_10

* Generate variables in logs
foreach var in ess_5 ess_6 ess_8 ess_10_rev nei_1 nei_2 nei_5 nei_10 nei_11 {
	if "`var'"=="ess_5" {
		local w = ", width(8500)"
		local w2 = ", width(.2)"
	}
	if "`var'"=="ess_8" {
		local w = ", width(100)"
		local w2 = ", width(.22)"
	}
	else {
		local w = ""
		local w2 = ""
	}

	gen log_`var' = log(`var')
	gen zero_`var' = (`var'<=0) if `var'!=.
	tab zero_`var' race_white
	tab zero_`var' female
	tabstat zero_`var', by(birthyr)
	tabstat zero_`var', by(year)
	
	* All
	graph tw (hist `var' `w') (kdensity `var'), title("Distribution of `var', all") subtitle("Individual-level data")
	graph export "$output/pre_`var'.pdf", replace
	graph tw (hist log_`var' `w2') (kdensity log_`var'), title("Distribution of log of `var', all") subtitle("Individual-level data")
	graph export "$output/pre_log_`var'.pdf", replace

	* White
	graph tw (hist `var' if race_white==1 `w') ///
		(kdensity `var' if race_white==1), title("Distribution of `var', white") subtitle("Individual-level data")
	graph export "$output/pre_`var'_white.pdf", replace
	graph tw (hist log_`var' if race_white==1 `w2') ///
		(kdensity log_`var' if race_white==1), title("Distribution of log of `var', white") subtitle("Individual-level data")
	graph export "$output/pre_log_`var'_white.pdf", replace
	
	* Nonwhite
	graph tw (hist `var' if race_white==0 `w') ///
		(kdensity `var' if race_white==0), title("Distribution of `var', nonwhite") subtitle("Individual-level data")
	graph export "$output/pre_`var'_nonwhite.pdf", replace
	graph tw (hist log_`var' if race_white==0 `w2') ///
		(kdensity log_`var' if race_white==0), title("Distribution of log of `var', nonwhite") subtitle("Individual-level data")
	graph export "$output/pre_log_`var'_nonwhite.pdf", replace
	
	* Women
	graph tw (hist `var' if female==1 `w') ///
		(kdensity `var' if female==1), title("Distribution of `var', women") subtitle("Individual-level data")
	graph export "$output/pre_`var'_women.pdf", replace
	graph tw (hist log_`var' if female==1 `w2') ///
		(kdensity log_`var' if female==1), title("Distribution of log of `var', women") subtitle("Individual-level data")
	graph export "$output/pre_log_`var'_women.pdf", replace
	
	* Men
	graph tw (hist `var' if female==0 `w') ///
		(kdensity `var' if female==0), title("Distribution of `var', men") subtitle("Individual-level data")
	graph export "$output/pre_`var'_men.pdf", replace
	graph tw (hist log_`var' if female==0 `w2') ///
		(kdensity log_`var' if female==0), title("Distribution of log of `var', men") subtitle("Individual-level data")
	graph export "$output/pre_log_`var'_men.pdf", replace
}

drop ess_10_rev
drop zero_* log_*
*/



*** Now we move forward with 9 variables logged:
* ess_5 ess_6 ess_8 ess_10 nei_1 nei_2 nei_5 nei_10 nei_11
*** Of these, six have variables for which we need to add binary indicators to preserve sample size:
* log_ess_5 log_ess_6 log_ess_8 log_ess_10 log_nei_1 log_nei_5
* Each variable is overwritten in place with its log and then summarized.
local dollar_list ess_5 ess_6 ess_8 ess_10 nei_1 nei_2 nei_5 nei_10 nei_11
foreach money of varlist `dollar_list' {
	if "`money'"!="ess_10" {
		replace `money' = log(`money')
	}
	else {
		* Reverse coded income from public sources: flip the sign, log, flip back
		replace `money' = -log(-`money')
	}
	sum `money'
}




* Normalize each variable 
* according to 1950-1954 mean and sd
* Updated 1/10/2017 (Add binary indicators for newly logged variables, removed incarceration from ESS)
* For every outcome, a z-score is built for the full sample and for each
* subgroup, using the perwt-weighted mean and sd of the 1950-1954 birth
* cohorts within that same group.
local agewin  "inrange(age,25,54)"
local refcoh  "inrange(birthyr,1950,1954) & `agewin'"
local cond_white    "race_white==1"
local cond_nonwhite "(race_black==1|race_other==1)"
local cond_men      "female==0"
local cond_women    "female==1"
local z_list hc_1 hc_2 hc_3 hc_4 hc_5 hc_6 ///
	ess_1 ess_2 ess_3 ess_4 ess_5 ess_6 ess_8 ess_9 ess_10 ess_11 ///
	dis_1 dis_2 dis_3 dis_4 dis_5 dis_6 ///
	nei_1 nei_2 nei_3 nei_4 nei_5 nei_6 nei_7 nei_8 nei_9 nei_10 nei_11 nei_12 ///
	ess_5_1 ess_6_1 ess_8_1 ess_10_1 nei_1_1 nei_5_1
foreach outcome of varlist `z_list' {
	* All
	su `outcome' if `refcoh' [aw=perwt]
	gen z_`outcome'_all = (`outcome'-r(mean))/r(sd) if `agewin'
	* Whites, non-whites, men, women (in that order)
	foreach grp in white nonwhite men women {
		su `outcome' if `refcoh' & `cond_`grp'' [aw=perwt]
		gen z_`outcome'_`grp' = (`outcome'-r(mean))/r(sd) if `cond_`grp'' & `agewin'
	}

	compress	// experienced memory issues before
}




**** FOR DISCLOSURE ****
* (Doing this again post-log, just to make sure sample size doesn't change. Could
* eventually just do it once)
* Identify index sample membership, now also counting the binary indicators
* Updated 1/16/17: Not using DIS in this sample, so no DIS flag is built;
* ess_7 is left out of the ESS list
local post_hc  hc_1 hc_2 hc_3 hc_4 hc_5 hc_6
local post_nei nei_1 nei_2 nei_3 nei_4 nei_5 nei_6 nei_7 nei_8 nei_9 nei_10 nei_11 nei_12 ///
	nei_1_1 nei_5_1
local post_ess ess_1 ess_2 ess_3 ess_4 ess_5 ess_6 ess_8 ess_9 ess_10 ///
	ess_5_1 ess_6_1 ess_8_1 ess_10_1
foreach dom in hc nei ess {
	gen flag_`dom' = 0
	foreach comp of varlist `post_`dom'' {
		replace flag_`dom' = 1 if `comp' != .
	}
}
egen flag = rowtotal(flag_*)
sum flag_*
tab flag, m
gen keeper_postlog = flag==3

* Should match perfectly
tab keeper_postlog keeper, m

*keep if keeper==1
drop keeper_postlog







* NEW SECTION, 7/8/2016: Convert county codes to "super counties" that are
* consistent from 1950-1980
* The do-file run here is external to this script.
* NOTE(review): countyFipsToREISfips and fixCounty are presumably programs
* defined by that do-file -- confirm.
do "$dofile/countyStandardizeGNIS1950.do"
* Combined birth state + birth county code (state part times 1000 plus county part)
gen fips = 1000*st_fips + co_fips_1
* Save original fips
gen fips_orig = fips
* NOTE(review): both calls are handed fips through county() and appear to
* recode it in place -- verify against the external do-file
countyFipsToREISfips, county(fips)
fixCounty, county(fips)
* Replace county fips code with new version
* (strip the state part back off the possibly recoded fips)
replace co_fips_1 = fips - 1000*st_fips






* Collapse into birth-year/survey-year/county cells
* obs marks records with a strictly positive, non-"." person weight; its raw
* sum below gives the record count in each cell
gen obs = 1 if perwt>0 & perwt!=.


*** Collapse for all 
* Collapse for all matches: disclosure sample, cohorts 1950-1980, ages 25-54
preserve
keep if inrange(birthyr,1950,1980) & inrange(age,25,54) & keeper==1
* perwt-weighted cell means, plus unweighted sums of the weights and of obs
collapse (mean) z_*_all hc_* ess_* dis_* nei_* age female ///
	race_white race_black race_nonwhite tractobs ///
	(rawsum) perwt* obs [aw=perwt], by(st_fips co_fips_1 birthyr year)
* Tag every non-normalized cell variable with the group suffix
foreach cellvar of varlist hc_* ess_* dis_* nei_* tractobs perwt* obs* age* female* race_* {
	rename `cellvar' `cellvar'_all
}
gen exactmatch=0
tempfile temp_all
save `temp_all'
restore
* Collapse for exact matches only [currently only doing this for entire group; could extend to race/sex]
* Same cells and same _all suffix as the pooled collapse, but restricted to
* records whose match_step is "EXACT"; exactmatch=1 tells the two apart
preserve
keep if inrange(birthyr,1950,1980) & inrange(age,25,54) ///
	& match_step=="EXACT" & keeper==1
collapse (mean) z_*_all hc_* ess_* dis_* nei_* age female ///
	race_white race_black race_nonwhite tractobs ///
	(rawsum) perwt* obs [aw=perwt], by(st_fips co_fips_1 birthyr year)
foreach cellvar of varlist hc_* ess_* dis_* nei_* tractobs perwt* obs* age* female* race_* {
	rename `cellvar' `cellvar'_all
}
gen exactmatch=1
tempfile temp_exact
save `temp_exact'
restore
* Collapse for whites: disclosure sample, cohorts 1950-1980, ages 25-54
preserve
keep if race_white==1 & keeper==1 ///
	& inrange(birthyr,1950,1980) & inrange(age,25,54)
* perwt-weighted cell means, plus unweighted sums of the weights and of obs
collapse (mean) z_*_white hc_* ess_* dis_* nei_* age female tractobs ///
	(rawsum) perwt* obs [aw=perwt], by(st_fips co_fips_1 birthyr year)
* Tag every non-normalized cell variable with the group suffix
foreach cellvar of varlist hc_* ess_* dis_* nei_* age* female* tractobs perwt* obs* {
	rename `cellvar' `cellvar'_white
}
tempfile temp_w
save `temp_w'
restore
* Collapse for non-whites (race_black or race_other): disclosure sample,
* cohorts 1950-1980, ages 25-54
preserve
keep if (race_black==1 | race_other==1) & keeper==1 ///
	& inrange(birthyr,1950,1980) & inrange(age,25,54)
* perwt-weighted cell means, plus unweighted sums of the weights and of obs
collapse (mean) z_*_nonwhite hc_* ess_* dis_* nei_* age female tractobs ///
	(rawsum) perwt* obs [aw=perwt], by(st_fips co_fips_1 birthyr year)
* Tag every non-normalized cell variable with the group suffix
foreach cellvar of varlist hc_* ess_* dis_* nei_* age* female* tractobs perwt* obs* {
	rename `cellvar' `cellvar'_nonwhite
}
tempfile temp_b
save `temp_b'
restore
* Collapse for men: disclosure sample, cohorts 1950-1980, ages 25-54
preserve
keep if female==0 & keeper==1 ///
	& inrange(birthyr,1950,1980) & inrange(age,25,54)
* perwt-weighted cell means, plus unweighted sums of the weights and of obs
collapse (mean) z_*_men hc_* ess_* dis_* nei_* age ///
	race_white race_black race_nonwhite tractobs ///
	(rawsum) perwt* obs [aw=perwt], by(st_fips co_fips_1 birthyr year)
* Tag every non-normalized cell variable with the group suffix
foreach cellvar of varlist hc_* ess_* dis_* nei_* age* race_* tractobs perwt* obs* {
	rename `cellvar' `cellvar'_men
}
tempfile temp_m
save `temp_m'
restore
* Collapse for women: disclosure sample, cohorts 1950-1980, ages 25-54
preserve
keep if female==1 & keeper==1 ///
	& inrange(birthyr,1950,1980) & inrange(age,25,54)
* perwt-weighted cell means, plus unweighted sums of the weights and of obs
collapse (mean) z_*_women hc_* ess_* dis_* nei_* age ///
	race_white race_black race_nonwhite tractobs ///
	(rawsum) perwt* obs [aw=perwt], by(st_fips co_fips_1 birthyr year)
* Tag every non-normalized cell variable with the group suffix
foreach cellvar of varlist hc_* ess_* dis_* nei_* age* race_* tractobs perwt* obs* {
	rename `cellvar' `cellvar'_women
}
tempfile temp_f
save `temp_f'
restore


/* Put exact disab and non-disab observations together
use `temp_exact_dis', clear
merge 1:1 st_fips co_fips_1 birthyr /*birthmo*/ year using `temp_exact', gen(_mergeexact)
tempfile temp_exact_all
save `temp_exact_all'


* Combine census files -- start with disability outcomes because some counties
* will have observations where people are 55-64, and thus disability outcomes,
* but no people under age 55, so no non-disability outcomes
use `temp_all_dis', clear
merge 1:1 st_fips co_fips_1 birthyr /*birthmo*/ year using `temp_w_dis', gen(_mergew_dis)
merge 1:1 st_fips co_fips_1 birthyr /*birthmo*/ year using `temp_b_dis', gen(_mergeb_dis)
merge 1:1 st_fips co_fips_1 birthyr /*birthmo*/ year using `temp_m_dis', gen(_mergem_dis)
merge 1:1 st_fips co_fips_1 birthyr /*birthmo*/ year using `temp_f_dis', gen(_mergef_dis)

* Now merge non-disability outcomes
merge 1:1 st_fips co_fips_1 birthyr /*birthmo*/ year using `temp_all', gen(_mergeall)
merge 1:1 st_fips co_fips_1 birthyr /*birthmo*/ year using `temp_w', gen(_mergew)
merge 1:1 st_fips co_fips_1 birthyr /*birthmo*/ year using `temp_b', gen(_mergeb)
merge 1:1 st_fips co_fips_1 birthyr /*birthmo*/ year using `temp_m', gen(_mergem)
merge 1:1 st_fips co_fips_1 birthyr /*birthmo*/ year using `temp_f', gen(_mergef)

append using `temp_exact_all'
*/


* Start from the pooled cells, bolt on each subgroup's cells side by side
* (white, non-white, men, women, in that order), then stack the exact-match
* cells underneath as extra rows
use "`temp_all'", clear
foreach sfx in w b m f {
	merge 1:1 st_fips co_fips_1 birthyr year using `temp_`sfx'', gen(_merge`sfx')
}
append using `temp_exact'

* Create indices and apply labels
* For each group (all, white, nonwhite, men, women) this loop
*   1. builds the HC, ESS, DIS and NEI sub-indices as the row mean of that
*      group's normalized (z_) components,
*   2. builds the composite index as the row mean of the HC, ESS and NEI
*      sub-indices (DIS is not part of the composite), and
*   3. attaches variable labels to components, indices and cell counts.
foreach x in all white nonwhite men women {

* note that rowmean function ignores missing values (they don't contribute to mean)
egen z_hc_`x' = rowmean(z_hc_1_`x' z_hc_2_`x' z_hc_3_`x' z_hc_4_`x' z_hc_5_`x' z_hc_6_`x') 	
egen z_ess_`x' = rowmean(z_ess_1_`x' z_ess_2_`x' z_ess_3_`x' z_ess_4_`x' z_ess_5_`x' ///
	z_ess_6_`x' z_ess_8_`x' z_ess_9_`x' z_ess_10_`x' z_ess_5_1_`x' z_ess_6_1_`x' ///
	z_ess_8_1_`x' z_ess_10_1_`x') 
* dis_7 and dis_8 are not part of the DIS index
egen z_dis_`x' = rowmean(z_dis_1_`x' z_dis_2_`x' z_dis_3_`x' z_dis_4_`x' z_dis_5_`x' ///
	z_dis_6_`x')
* FIX: the last component used to be z_nei_5 a second time, which counted
* tract income twice and left the nei_5_1 binary indicator out of the index.
* The ESS index above includes all of its _1 indicators; NEI now does too.
egen z_nei_`x' = rowmean(z_nei_1_`x' z_nei_2_`x' z_nei_3_`x' z_nei_4_`x' z_nei_5_`x' ///
	z_nei_6_`x' z_nei_7_`x' z_nei_8_`x' z_nei_9_`x' z_nei_10_`x' z_nei_11_`x' z_nei_12_`x' ///
	z_nei_1_1_`x' z_nei_5_1_`x') 
egen z_composite_`x' = rowmean(z_hc_`x' z_ess_`x' z_nei_`x')	

* Updated 1/10/2017: Took out z_ess_7_`x', added binary variables
* Updated 1/16/17: Not using DIS index in the composite index

/* Added 6/17/2016: Three new index outcomes:
* 1.) ESS leaving out "not incarcerated," which is only available some years
* 2.) DIS leaving out "no work disability," which is only available some years
* 3.) DIS using only 2000-2007
egen z_ess2_`x' = rowmean(z_ess_1_`x' z_ess_2_`x' z_ess_3_`x' z_ess_4_`x' z_ess_5_`x' ///
	z_ess_6_`x' z_ess_8_`x' z_ess_9_`x' z_ess_10_`x') 
egen z_dis2_`x' = rowmean(z_dis_2_`x' z_dis_3_`x' z_dis_4_`x' z_dis_5_`x' ///
	z_dis_6_`x')
egen z_dis3_`x' = rowmean(z_dis3_1_`x' z_dis3_2_`x' z_dis3_3_`x' z_dis3_4_`x' z_dis3_5_`x' ///
	z_dis3_6_`x')*/

* --- Human capital: normalized components, then cell means ---
la var z_hc_1_`x' "Years of schooling, `x' (normalized)"
la var z_hc_2_`x' "High school or GED completed, `x' (normalized)"
la var z_hc_3_`x' "Attended some college, `x' (normalized)"
la var z_hc_4_`x' "Completed 4 year college, `x' (normalized)"
la var z_hc_5_`x' "Completed professional or doctoral degree, `x' (normalized)"
la var z_hc_6_`x' "Has a professional job, `x' (normalized)"

la var hc_1_`x' "Years of schooling, `x'"
la var hc_2_`x' "High school or GED completed, `x'"
la var hc_3_`x' "Attended some college, `x'"
la var hc_4_`x' "Completed 4 year college, `x'"
la var hc_5_`x' "Completed professional or doctoral degree, `x'"
la var hc_6_`x' "Has a professional job, `x'"

* --- Economic self-sufficiency: normalized components, then cell means ---
la var z_ess_1_`x' "In labor force, `x' (normalized)"
la var z_ess_2_`x' "Worked last year, `x' (normalized)"
la var z_ess_3_`x' "Number of weeks worked last year, `x' (normalized)"
la var z_ess_4_`x' "Usual hours works per week, `x' (normalized)"
la var z_ess_5_`x' "Log labor income (wage + business/farm), `x' (normalized)"
la var z_ess_6_`x' "Log non-labor income not from public sources, `x' (normalized)"
*la var z_ess_7_`x' "Not incarcerated (avail. 2006-2013), `x' (normalized)"
la var z_ess_8_`x' "Log small family income to poverty ratio, `x' (normalized)"
la var z_ess_9_`x' "Not in poverty indicator, `x' (normalized)"
la var z_ess_10_`x' "- log of income from public sources, `x' (normalized)"
la var z_ess_5_1_`x' "Labor income greater than 0, `x' (normalized)"
la var z_ess_6_1_`x' "Non-labor income not from public sources greater than 0, `x' (normalized)"
la var z_ess_8_1_`x' "Small family income greater than 0, `x' (normalized)"
la var z_ess_10_1_`x' "No income from public sources, `x' (normalized)"

la var ess_1_`x' "In labor force, `x'"
la var ess_2_`x' "Worked last year, `x'"
la var ess_3_`x' "Number of weeks worked last year, `x'"
la var ess_4_`x' "Usual hours works per week, `x'"
la var ess_5_`x' "Log labor income (wage + business/farm), `x'"
la var ess_5_1_`x' "Positive labor income (wage + business/farm), `x'"
la var ess_6_`x' "Log non-labor income not from public sources, `x'"
la var ess_6_1_`x' "Positive non-labor income not from public sources, `x'"
la var ess_7_`x' "Not incarcerated (avail. 2006-2013), `x'"
la var ess_8_`x' "Log small family income to poverty ratio, `x'"
la var ess_8_1_`x' "Positive small family income to poverty ratio, `x'"
la var ess_9_`x' "Not in poverty indicator, `x'"
la var ess_10_`x' "- log of income from public sources, `x'"
la var ess_10_1_`x' "No income from public sources, `x'"
la var ess_11_`x' "Labor income (wage only), `x'"

* --- Physical health / disability: normalized components, then cell means ---
la var z_dis_1_`x' "No work disability (only avail. 2000-2007), `x' (normalized)"
la var z_dis_2_`x' "No ambulatory difficulty, `x' (normalized)"
la var z_dis_3_`x' "No cognitive difficulty, `x' (normalized)"
la var z_dis_4_`x' "No indepenent living difficulty, `x' (normalized)"
la var z_dis_5_`x' "No vision/hearing difficulty difficulty, `x' (normalized)"
la var z_dis_6_`x' "No self-care difficulty, `x' (normalized)"
*la var z_dis_7_`x' "Share which lived to 2000 (NUMIDENT), `x'"
*la var z_dis_8_`x' "Age at death, conditional on death (NUMIDENT), `x'"

*la var z_dis3_1_`x' "No work disability, restricted to 2000-2007, `x' (normalized)"
*la var z_dis3_2_`x' "No ambulatory difficulty, restricted to 2000-2007, `x' (normalized)"
*la var z_dis3_3_`x' "No cognitive difficulty, restricted to 2000-2007, `x' (normalized)"
*la var z_dis3_4_`x' "No indepenent living difficulty, restricted to 2000-2007, `x' (normalized)"
*la var z_dis3_5_`x' "No vision/hearing difficulty difficulty, restricted to 2000-2007, `x' (normalized)"
*la var z_dis3_6_`x' "No self-care difficulty, restricted to 2000-2007, `x' (normalized)"

la var dis_1_`x' "No work disability (only avail. 2000-2007), `x'"
la var dis_2_`x' "No ambulatory difficulty, `x'"
la var dis_3_`x' "No cognitive difficulty, `x'"
la var dis_4_`x' "No independent living difficulty, `x'"
la var dis_5_`x' "No vision/hearing difficulty difficulty, `x'"
la var dis_6_`x' "No self-care difficulty, `x'"
*la var z_dis_7_`x' "Share which lived to 2000 (NUMIDENT), `x'"
*la var z_dis_8_`x' "Age at death, conditional on death (NUMIDENT), `x'"

* --- Neighborhood: normalized components, then cell means ---
la var z_nei_1_`x' "Log house value, `x' (normalized)"
la var z_nei_2_`x' "Log gross rent, `x' (normalized)"
la var z_nei_3_`x' "Home ownership, `x' (normalized)"
la var z_nei_4_`x' "Residence with single family, `x' (normalized)"
la var z_nei_5_`x' "Log mean small family income to poverty ratio in tract, `x' (normalized)"
la var z_nei_6_`x' "-1 x teen pregnancy rate in tract, `x' (normalized)"
la var z_nei_7_`x' "-1 x share single head of household in tract, `x' (normalized)"
la var z_nei_8_`x' "-1 x share of kids in poverty in tract, `x' (normalized)"
la var z_nei_9_`x' "Mean home ownership in tract, `x' (normalized)"
la var z_nei_10_`x' "Log median value of home in tract, `x' (normalized)"
la var z_nei_11_`x' "Log median gross rent in tract, `x' (normalized)"
la var z_nei_12_`x' "Chetty and Hendren, absolute upward mobility, `x' (normalized)"
la var z_nei_1_1_`x' "House value greater than 0, `x' (normalized)"
la var z_nei_5_1_`x' "Mean small family income in tract greater than 0, `x' (normalized)"

la var nei_1_`x' "Log house value, `x'"
la var nei_1_1_`x' "Positive house value, `x'"
la var nei_2_`x' "Log gross rent, `x'"
la var nei_3_`x' "Home ownership, `x'"
la var nei_4_`x' "Residence with single family, `x'"
la var nei_5_`x' "Log mean small family income to poverty ratio in tract, `x'"
la var nei_5_1_`x' "Positive mean small family income to poverty ratio in tract, `x'"
la var nei_6_`x' "-1 x teen pregnancy rate in tract, `x'"
la var nei_7_`x' "-1 x share single head of household in tract, `x'"
la var nei_8_`x' "-1 x share of kids in poverty in tract, `x'"
la var nei_9_`x' "Mean home ownership in tract, `x'"
la var nei_10_`x' "Log median value of home in tract, `x'"
la var nei_11_`x' "Log median gross rent in tract, `x'"
la var nei_12_`x' "Chetty and Hendren, absolute upward mobility, `x'"

* --- Sub-indices and composite ---
la var z_hc_`x' "Human capital sub-index, `x'"
la var z_ess_`x' "Economic self-sufficiency sub-index, `x'"
la var z_dis_`x' "Physical health sub-index, `x'"
la var z_nei_`x' "Neighborhood sub-index, `x'"
la var z_composite_`x' "Composite index, `x'"

*la var z_dis2_`x' "Physical health sub-index, work disability outcome excluded, `x'"
*la var z_ess2_`x' "Economic self-sufficiency sub-index, incarceration outcome excluded, `x'"
*la var z_dis3_`x' "Physical health sub-index, restricted to 2000-2007, `x'"

* --- Cell counts and weights ---
*la var obs1999_`x' "Number of NUMIDENT obs used to construct dis_7, `x'"
*la var obsdead_`x' "Number of NUMIDENT obs used to construct dis_8, `x'"
*la var yearwgt_`x' "Weight for collapse across survey years, `x'" 
*la var yearwgt_`x'_dis "Weight for collapse of disability index across survey years, `x'" 
la var tractobs_`x' "Number of observations in census tract, `x'"
la var perwt_`x' "Sum of person weights, `x'"
*la var perwt_dis_`x' "Sum of person weights for disability measure, `x'"
la var obs_`x' "Number of observations from Census/ACS, `x'"
*la var obs_dis_`x' "Number of observations for disability index, `x'"
*la var obs_dis3_`x' "Number of obs for disab index, restricted to 2000-2007, `x'"

}

* Overview of the combined cell-level file
su

des

* examine cell size
su obs_* if exactmatch==0, d	// pooled sex and race

* Among pooled cells above each size threshold, how often does each subgroup
* file have a matching cell? (_mergeall is not tabulated; the pooled file is
* the master)
foreach minobs in 100 20 {
	foreach mflag in _mergew _mergeb _mergem _mergef {
		tab `mflag' if obs_all>`minobs' & exactmatch==0
	}
}

/*
tab _mergew_dis if obs_dis_all>100 & exactmatch==0
tab _mergeb_dis if obs_dis_all>100 & exactmatch==0
tab _mergem_dis if obs_dis_all>100 & exactmatch==0
tab _mergef_dis if obs_dis_all>100 & exactmatch==0

tab _mergew_dis if obs_dis_all>20 & exactmatch==0
tab _mergeb_dis if obs_dis_all>20 & exactmatch==0
tab _mergem_dis if obs_dis_all>20 & exactmatch==0
tab _mergef_dis if obs_dis_all>20 & exactmatch==0
*/



/* Run a couple of quick data checks here: Do observables jump across birth or survey years?
tabstat race_white [aw=obs_all], by(birthyr) stats(mean sd)
tabstat race_white [aw=obs_all], by(year) stats(mean sd)
tabstat race_black [aw=obs_all], by(birthyr) stats(mean sd)
tabstat race_black [aw=obs_all], by(year) stats(mean sd)
tabstat race_nonwhite [aw=obs_all], by(birthyr) stats(mean sd)
tabstat race_nonwhite [aw=obs_all], by(year) stats(mean sd)

tabstat race_white, by(birthyr) stats(mean sd)
tabstat race_white, by(year) stats(mean sd)
tabstat race_black, by(birthyr) stats(mean sd)
tabstat race_black, by(year) stats(mean sd)
tabstat race_nonwhite, by(birthyr) stats(mean sd)
tabstat race_nonwhite, by(year) stats(mean sd)
*/

/* Examine missing cells by year

foreach v in hc_1 hc_2 hc_3 hc_4 hc_5 hc_6 /// 
ess_1 ess_2 ess_3 ess_4 ess_5 ess_6 ess_7 ess_8 ess_9 ess_10 ///
dis_1 dis_2 dis_3 dis_4 dis_5 dis_6 ///
nei_1 nei_2 nei_3 nei_4 nei_5 nei_6 nei_7 nei_8 nei_9 nei_10 nei_11 nei_12 {
	foreach y in all men women white nonwhite {
		
		capture drop missing_`v'_`y'
		gen missing_`v'_`y' = (z_`v'_`y' == .)

		tabstat missing_`v'_`y', by(year) statistics(mean n)
		drop missing_`v'_`y'
	}
}
*/



/* Generate variables in logs
foreach g in all women men white nonwhite {
	foreach var in ess_5_`g' ess_6_`g' ess_8_`g' ess_10_`g' nei_1_`g' nei_2_`g' ///
		nei_5_`g' nei_10_`g' nei_11_`g' {
	
		if "`var'"=="ess_5_`g'" {
			local w = ", width(8500)"
			local w2 = ", width(.2)"
		}
		if "`var'"=="ess_8_`g'" {
			local w = ", width(100)"
			local w2 = ", width(.22)"
		}
		if "`var'"=="ess_10_`g'" {
			gen ess_10_`g'_rev = -ess_10_`g'
			local var = "ess_10_`g'_rev"
		}
		else {
			local w = ""
			local w2 = ""
		}
	
		gen log_`var' = log(`var')
		gen zero_`var' = (`var'<=0) if `var'!=.
		tabstat zero_`var', by(birthyr)
		tabstat zero_`var', by(year)
		
		* All
		graph tw (hist `var' `w') (kdensity `var'), title("Distribution of `var'") ////
			subtitle("Collapsed data")
		graph export "$output/post_`var'.pdf", replace
		graph tw (hist log_`var' `w2') (kdensity log_`var'), title("Distribution of log of `var'") ///
			subtitle("Collapsed data")
		graph export "$output/post_log_`var'.pdf", replace
	}
}

cap drop ess_10_*_rev log_* zero_*
*/

* Check distributions of logged and collapsed data:
* one histogram with kernel density overlay per logged variable and group,
* each exported as a PDF named after the variable
foreach g in all women men white nonwhite {
	foreach stub in ess_5 ess_6 ess_8 ess_10 nei_1 nei_2 nei_5 nei_10 nei_11 {
		local cellvar `stub'_`g'

		graph tw (hist `cellvar') (kdensity `cellvar'), ///
			title("Distribution of `cellvar'") ///
			subtitle("Data logged then collapsed to county/year/survey year level")
		graph export "$output/hist_`cellvar'.pdf", replace
	}
}

* Shrink storage types, write the cell-level file, and close the log
compress
save "$datatemp/dtacr_23.dta", replace

log close
