Source code for geosnap.io.storage

"""Tools for creating and manipulating neighborhood datasets."""

import os
import pathlib
import zipfile
from warnings import warn

from appdirs import user_data_dir

import geopandas as gpd
import pandas as pd
import quilt3

from .._data import datasets
from .util import adjust_inflation, convert_gdf

_fipstable = pd.read_csv(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "stfipstable.csv"),
    converters={"FIPS Code": str},
)

appname = "geosnap"
appauthor = "geosnap"
data_dir = user_data_dir(appname, appauthor)
if not os.path.exists(data_dir):
    pathlib.Path(data_dir).mkdir(parents=True, exist_ok=True)

# look for local storage and create if missing
try:
    from quilt3.data.geosnap_data import storage
except ImportError:
    storage = quilt3.Package()


def store_census():
    """Save census data to the local quilt package storage.

    Returns
    -------
    None
        Data will be available in the geosnap.data.datasets and will be used
        in place of streaming data for all census queries. The raster package
        is 3.05 GB.

    """
    quilt3.Package.install("census/tracts_cartographic", "s3://quilt-cgs")
    quilt3.Package.install("census/administrative", "s3://quilt-cgs")
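
# A minimal usage sketch (assumes network access and enough local disk space;
# the install calls above stream the quilt packages from S3):
#
#   >>> from geosnap.io.storage import store_census
#   >>> store_census()  # later census queries are served from local storage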


def store_blocks_2000():
    """Save 2000 census block data to the local quilt package storage.

    Returns
    -------
    None
        Data will be available in the geosnap.data.datasets and will be used
        in place of streaming data for all census queries.

    """
    quilt3.Package.install("census/blocks_2000", "s3://quilt-cgs")


def store_blocks_2010():
    """Save 2010 census block data to the local quilt package storage.

    Returns
    -------
    None
        Data will be available in the geosnap.data.datasets and will be used
        in place of streaming data for all census queries.

    """
    quilt3.Package.install("census/blocks_2010", "s3://quilt-cgs")
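
# Usage sketch for the block-level variants (same pattern as store_census;
# each call installs a single quilt package locally):
#
#   >>> from geosnap.io.storage import store_blocks_2000, store_blocks_2010
#   >>> store_blocks_2000()
#   >>> store_blocks_2010()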


def store_ltdb(sample, fullcount):
    """
    Read & store data from Brown's Longitudinal Tract Database (LTDB).

    Parameters
    ----------
    sample : str
        file path of the zip file containing the standard Sample CSV files
        downloaded from
        https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx

    fullcount : str
        file path of the zip file containing the standard Fullcount CSV files
        downloaded from
        https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx

    Returns
    -------
    None
        Data are stored in the local quilt package storage.

    """
    sample_zip = zipfile.ZipFile(sample)
    fullcount_zip = zipfile.ZipFile(fullcount)

    def _ltdb_reader(path, file, year, dropcols=None):

        df = pd.read_csv(
            path.open(file),
            na_values=["", " ", 99999, -999],
            converters={0: str, "placefp10": str},
            low_memory=False,
            encoding="latin1",
        )

        if dropcols:
            df.drop(dropcols, axis=1, inplace=True)
        df.columns = df.columns.str.lower()
        names = df.columns.values.tolist()
        names[0] = "geoid"
        newlist = []

        # ignoring the first 4 columns, remove year suffix from column names
        for name in names[4:]:
            newlist.append(name[:-2])
        colnames = names[:4] + newlist
        df.columns = colnames

        # prepend a 0 when FIPS is too short
        df["geoid"] = df["geoid"].str.rjust(11, "0")
        df.set_index("geoid", inplace=True)

        df["year"] = year

        inflate_cols = [
            "mhmval",
            "mrent",
            "incpc",
            "hinc",
            "hincw",
            "hincb",
            "hinch",
            "hinca",
        ]

        inflate_available = list(set(df.columns).intersection(set(inflate_cols)))

        if len(inflate_available):
            df = adjust_inflation(df, inflate_available, year)
        return df

    # read in Brown's LTDB data, both the sample and fullcount files for each
    # year; population, housing units & occupied housing units appear in both
    # "sample" and "fullcount" files -- currently drop sample and keep fullcount

    sample70 = _ltdb_reader(
        sample_zip,
        "ltdb_std_all_sample/ltdb_std_1970_sample.csv",
        dropcols=["POP70SP1", "HU70SP", "OHU70SP"],
        year=1970,
    )

    fullcount70 = _ltdb_reader(fullcount_zip, "LTDB_Std_1970_fullcount.csv", year=1970)

    sample80 = _ltdb_reader(
        sample_zip,
        "ltdb_std_all_sample/ltdb_std_1980_sample.csv",
        dropcols=["pop80sf3", "pop80sf4", "hu80sp", "ohu80sp"],
        year=1980,
    )

    fullcount80 = _ltdb_reader(fullcount_zip, "LTDB_Std_1980_fullcount.csv", year=1980)

    sample90 = _ltdb_reader(
        sample_zip,
        "ltdb_std_all_sample/ltdb_std_1990_sample.csv",
        dropcols=["POP90SF3", "POP90SF4", "HU90SP", "OHU90SP"],
        year=1990,
    )

    fullcount90 = _ltdb_reader(fullcount_zip, "LTDB_Std_1990_fullcount.csv", year=1990)

    sample00 = _ltdb_reader(
        sample_zip,
        "ltdb_std_all_sample/ltdb_std_2000_sample.csv",
        dropcols=["POP00SF3", "HU00SP", "OHU00SP"],
        year=2000,
    )

    fullcount00 = _ltdb_reader(fullcount_zip, "LTDB_Std_2000_fullcount.csv", year=2000)

    sample10 = _ltdb_reader(
        sample_zip, "ltdb_std_all_sample/ltdb_std_2010_sample.csv", year=2010
    )

    # join the sample and fullcount variables into a single df for the year
    ltdb_1970 = sample70.drop(columns=["year"]).join(
        fullcount70.iloc[:, 7:], how="left"
    )
    ltdb_1980 = sample80.drop(columns=["year"]).join(
        fullcount80.iloc[:, 7:], how="left"
    )
    ltdb_1990 = sample90.drop(columns=["year"]).join(
        fullcount90.iloc[:, 7:], how="left"
    )
    ltdb_2000 = sample00.drop(columns=["year"]).join(
        fullcount00.iloc[:, 7:], how="left"
    )
    ltdb_2010 = sample10

    df = pd.concat([ltdb_1970, ltdb_1980, ltdb_1990, ltdb_2000, ltdb_2010], sort=True)

    renamer = dict(
        zip(
            datasets.codebook()["ltdb"].tolist(),
            datasets.codebook()["variable"].tolist(),
        )
    )

    df.rename(renamer, axis="columns", inplace=True)

    # compute additional variables from lookup table
    for row in datasets.codebook()["formula"].dropna().tolist():
        df.eval(row, inplace=True)

    keeps = df.columns[
        df.columns.isin(datasets.codebook()["variable"].tolist() + ["year"])
    ]
    df = df[keeps]

    df.to_parquet(os.path.join(data_dir, "ltdb.parquet"), compression="brotli")
    storage.set("ltdb", os.path.join(data_dir, "ltdb.parquet"))
    storage.build("geosnap_data/storage")
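
# Usage sketch (the zip paths below are hypothetical placeholders for the
# Sample and Fullcount archives downloaded from the LTDB site):
#
#   >>> from geosnap.io.storage import store_ltdb
#   >>> store_ltdb(
#   ...     sample="/path/to/LTDB_Std_All_Sample.zip",
#   ...     fullcount="/path/to/LTDB_Std_All_fullcount.zip",
#   ... )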


def store_ncdb(filepath):
    """
    Read & store data from Geolytics's Neighborhood Change Database.

    Parameters
    ----------
    filepath : str
        location of the input CSV file extracted from your Geolytics DVD

    """
    ncdb_vars = datasets.codebook()["ncdb"].dropna()[1:].values

    names = []
    for name in ncdb_vars:
        for suffix in ["7", "8", "9", "0", "1", "2"]:
            names.append(name + suffix)
        names.append("GEO2010")

    c = pd.read_csv(filepath, nrows=1).columns
    c = pd.Series(c.values)

    keep = []
    for _, col in c.items():
        for name in names:
            if col.startswith(name):
                keep.append(col)

    df = pd.read_csv(
        filepath,
        usecols=keep,
        engine="c",
        na_values=["", " ", 99999, -999],
        converters={
            "GEO2010": str,
            "COUNTY": str,
            "COUSUB": str,
            "DIVISION": str,
            "REGION": str,
            "STATE": str,
        },
    )

    cols = df.columns
    fixed = []
    for col in cols:
        if col.endswith("D"):
            fixed.append("D" + col[:-1])
        elif col.endswith("N"):
            fixed.append("N" + col[:-1])
        elif col.endswith("1A"):
            fixed.append(col[:-2] + "2")

    orig = []
    for col in cols:
        if col.endswith("D"):
            orig.append(col)
        elif col.endswith("N"):
            orig.append(col)
        elif col.endswith("1A"):
            orig.append(col)

    renamer = dict(zip(orig, fixed))
    df.rename(renamer, axis="columns", inplace=True)

    df = df[df.columns[df.columns.isin(names)]]

    df = pd.wide_to_long(
        df, stubnames=ncdb_vars, i="GEO2010", j="year", suffix="(7|8|9|0|1|2)"
    ).reset_index()

    df["year"] = df["year"].replace(
        {7: 1970, 8: 1980, 9: 1990, 0: 2000, 1: 2010, 2: 2010}
    )
    df = df.groupby(["GEO2010", "year"]).first()

    mapper = dict(zip(datasets.codebook().ncdb, datasets.codebook().variable))

    df.reset_index(inplace=True)

    df = df.rename(mapper, axis="columns")

    df = df.set_index("geoid")

    for row in datasets.codebook()["formula"].dropna().tolist():
        try:
            df.eval(row, inplace=True)
        except Exception:
            warn("Unable to compute " + str(row))

    keeps = df.columns[
        df.columns.isin(datasets.codebook()["variable"].tolist() + ["year"])
    ]

    df = df[keeps]

    df = df.loc[df.n_total_pop != 0]

    df.to_parquet(os.path.join(data_dir, "ncdb.parquet"), compression="brotli")

    storage.set("ncdb", os.path.join(data_dir, "ncdb.parquet"))
    storage.build("geosnap_data/storage")
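
# Usage sketch (the CSV path is a hypothetical placeholder for the export
# produced from the Geolytics DVD):
#
#   >>> from geosnap.io.storage import store_ncdb
#   >>> store_ncdb("/path/to/ncdb_export.csv")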


def _fips_filter(
    state_fips=None, county_fips=None, msa_fips=None, fips=None, data=None
):

    data = data.copy()
    fips_list = []
    for each in [state_fips, county_fips, fips]:
        if isinstance(each, (str,)):
            each = [each]
        if isinstance(each, (list,)):
            fips_list += each
    if msa_fips:
        fips_list += datasets.msa_definitions()[
            datasets.msa_definitions()["CBSA Code"] == msa_fips
        ]["stcofips"].tolist()

    df = data[data.geoid.str.startswith(tuple(fips_list))]

    return df


def _from_db(
    data, state_fips=None, county_fips=None, msa_fips=None, fips=None, years=None
):

    data = data[data.year.isin(years)]
    data = data.reset_index()

    df = _fips_filter(
        state_fips=state_fips,
        county_fips=county_fips,
        msa_fips=msa_fips,
        fips=fips,
        data=data,
    )

    # we know we're using 2010, need to drop the year column so no conflicts
    tracts = datasets.tracts_2010(convert=False)
    tracts = tracts[["geoid", "wkb"]]
    tracts = tracts[tracts.geoid.isin(df.geoid)]
    tracts = convert_gdf(tracts)

    gdf = df.merge(tracts, on="geoid", how="left").set_index("geoid")
    gdf = gpd.GeoDataFrame(gdf)

    return gdf
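
# Illustration of the FIPS prefix filtering above (the toy frame is purely
# hypothetical; _fips_filter keeps rows whose geoid starts with any of the
# requested FIPS codes):
#
#   >>> toy = pd.DataFrame(
#   ...     {"geoid": ["06037101110", "48201223100"], "year": [2010, 2010]}
#   ... )
#   >>> _fips_filter(state_fips="06", data=toy)  # only the "06..." row remains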