# Source code for geosnap.util.cenpy_fetch

"""Utility functions for downloading Census data."""

import pandas
import sys
from tqdm.auto import tqdm


def _expand_range(spec):
    """Expand a ``"STEM001:003"`` range into its individual column names.

    The text before the colon is a full column name whose last three
    characters are a zero-padded counter; the text after the colon is the
    last counter value (inclusive).

    Raises
    ------
    ValueError
        If `spec` does not contain exactly one ``":"`` or the counters are
        not integers.
    """
    first, last = spec.split(":")
    stem = first[:-3]
    return [
        stem + str(number).rjust(3, "0")
        for number in range(int(first[-3:]), int(last) + 1)
    ]


def process_columns(input_columns):
    """Flatten codebook expressions into the list of columns they reference.

    Parameters
    ----------
    input_columns : iterable of str
        Expressions combining column names with ``+``, ``-``, parentheses
        and ``start:stop`` ranges.

    Returns
    -------
    list of str
        Every referenced column name, with ranges expanded. Expressions are
        consumed from the end of the input; the trailing operands of a
        multi-column expression are queued behind all remaining expressions.
    """
    # "+" separates operands just like "-", so normalise it to a delimiter
    pending = [expr.replace("+", ",") for expr in input_columns]
    outcols = []
    while pending:
        expr = pending.pop()
        expr = expr.replace("-", ",").replace("(", "").replace(")", "")
        col, *rest = [part.strip() for part in expr.split(",")]
        # keep the first operand now, push the others back for a later pass
        for extra in rest:
            pending.insert(0, extra)
        if ":" in col:
            outcols.extend(_expand_range(col))
        else:
            outcols.append(col)
    return outcols


def normalize_relation(relation):
    """Rewrite a codebook relation so every range is an explicit sum.

    Parameters
    ----------
    relation : str
        A column name, a ``start:stop`` range, or several of those joined
        with ``+``.

    Returns
    -------
    str
        The relation with each range replaced by its members joined with
        ``+``. A relation with neither ``+`` nor ``:`` is returned unchanged.
    """
    parts = relation.split("+")
    if len(parts) > 1:
        return "+".join(normalize_relation(part.strip()) for part in parts)
    if ":" in relation:
        return "+".join(_expand_range(relation))
    return relation


def main():
    """Import the Census dependencies and define the ACS download routine.

    `geosnap.data.data_store` and `cenpy.products` are imported here rather
    than at module level, and `fetch_acs` is defined as a closure over them.

    NOTE(review): `main` only defines `fetch_acs`; it neither calls nor
    returns it, so running this module as a script downloads nothing.
    Confirm whether that is intended.
    """
    from geosnap.data import data_store
    from cenpy import products

    def fetch_acs(level="tract", state="all", year=2017):
        """Collect the variables defined in `geosnap.data.data_store.codebook` from the Census API.

        Parameters
        ----------
        level : str
            Census geographic tabulation unit e.g. "block", "tract", or
            "county" (the default is 'tract').
        state : str
            State for which data should be collected, e.g. "Maryland".
            If 'all' (default) the function will loop through each state
            and return a combined dataframe.
        year : int
            ACS release year to query (the default is 2017).

        Returns
        -------
        pandas.DataFrame
            Indexed by "GEOID" and restricted to the columns named in the
            codebook's ``variable`` column.

        Raises
        ------
        ValueError
            If `state` is 'all' and no state could be downloaded.

        Examples
        --------
        >>> dc = fetch_acs(state='District of Columbia', year=2015)

        """
        # presumably a table of states with a "name" column -- it is sorted
        # by that column below; verify against data_store
        states = data_store.states()

        _variables = data_store.codebook.copy()
        acs_relations = _variables["acs"].dropna()
        acsvars = process_columns(acs_relations)
        evalcols = [normalize_relation(rel) for rel in acs_relations.tolist()]
        varnames = _variables.dropna(subset=["acs"])["variable"]
        evals = [name + "=" + expr for name, expr in zip(varnames, evalcols)]

        if state == "all":
            dfs = []
            state_names = states.sort_values(by="name").name.tolist()
            with tqdm(total=len(state_names), file=sys.stdout) as pbar:
                for state_name in state_names:
                    try:
                        dfs.append(
                            products.ACS(year).from_state(
                                state_name, level=level, variables=acsvars.copy()
                            )
                        )
                    except Exception:
                        # best effort: report the state and keep going, but
                        # let KeyboardInterrupt/SystemExit propagate
                        tqdm.write("{state} failed".format(state=state_name))
                    finally:
                        pbar.update(1)
            if not dfs:
                raise ValueError("ACS download failed for every state")
            df = pandas.concat(dfs)
        else:
            # pass the state positionally, exactly as the loop above does
            df = products.ACS(year).from_state(
                state, level=level, variables=acsvars.copy()
            )

        df.set_index("GEOID", inplace=True)
        df = df.apply(lambda x: pandas.to_numeric(x, errors="coerce"), axis=1)

        # compute additional variables from lookup table: first the ACS
        # relations, then the codebook formulas; a failing expression is
        # reported and skipped
        for row in evals + _variables["formula"].dropna().tolist():
            try:
                df.eval(row, inplace=True, engine="python")
            except Exception as e:
                print(str(row) + " " + str(e))

        wanted = set(_variables.variable.tolist())
        keeps = [col for col in df.columns if col in wanted]
        df = df[keeps]
        return df
# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__": main()