"""Tools for creating and manipulating neighborhood datasets."""
import multiprocessing
import os
import pathlib
from warnings import warn
import geopandas as gpd
import pandas as pd
import quilt3
from appdirs import user_data_dir
from requests.exceptions import Timeout
from shapely import wkb, wkt
# identifiers handed to appdirs to resolve the per-user data directory
appname = appauthor = "geosnap"
data_dir = user_data_dir(appname, appauthor)

# create the data directory (including missing parents) the first time through
if not os.path.exists(data_dir):
    os.makedirs(data_dir, exist_ok=True)

# open the local storage package; start from an empty package when none exists yet
try:
    storage = quilt3.Package.browse("geosnap_data/storage")
except FileNotFoundError:
    storage = quilt3.Package()
class _Map(dict):
"""tabbable dict."""
def __init__(self, *args, **kwargs):
super(_Map, self).__init__(*args, **kwargs)
for arg in args:
if isinstance(arg, dict):
for k, v in arg.iteritems():
self[k] = v
if kwargs:
for k, v in kwargs.iteritems():
self[k] = v
def __getattr__(self, attr):
return self.get(attr)
def __setattr__(self, key, value):
self.__setitem__(key, value)
def __setitem__(self, key, value):
super(_Map, self).__setitem__(key, value)
self.__dict__.update({key: value})
def __delattr__(self, item):
self.__delitem__(item)
def __delitem__(self, key):
super(_Map, self).__delitem__(key)
del self.__dict__[key]
def _deserialize_wkb(str):
    """Load a geometry from a hex-encoded well-known binary string."""
    geometry = wkb.loads(str, hex=True)
    return geometry
def _deserialize_wkt(str):
    """Load a geometry from a well-known text string."""
    geometry = wkt.loads(str)
    return geometry
def _convert_gdf(df):
"""Convert DataFrame to GeoDataFrame.
DataFrame to GeoDataFrame by converting wkt/wkb geometry representation
back to Shapely object.
Parameters
----------
df : pandas.DataFrame
dataframe with column named either "wkt" or "wkb" that stores
geometric information as well-known text or well-known binary,
(hex encoded) respectively.
Returns
-------
geopandas.GeoDataFrame
geodataframe with converted `geometry` column.
"""
df = df.copy()
df.reset_index(inplace=True, drop=True)
if "wkt" in df.columns.tolist():
with multiprocessing.Pool() as P:
df["geometry"] = P.map(_deserialize_wkt, df["wkt"])
df = df.drop(columns=["wkt"])
else:
with multiprocessing.Pool() as P:
df["geometry"] = P.map(_deserialize_wkb, df["wkb"])
df = df.drop(columns=["wkb"])
df = gpd.GeoDataFrame(df)
df.crs = {"init": "epsg:4326"}
return df
class DataStore:
    """Storage for geosnap data. Currently supports US Census data.

    Unless otherwise noted, data are collected from the U.S. Census Bureau's TIGER/LINE Files
    https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2018 and converted to
    parquet files.

    Attributes
    ----------
    tracts_cartographic : quilt3.Package or None
        package holding the cartographic tract tables, or None when neither
        a local copy nor the s3 bucket could be reached.
    administrative : quilt3.Package or None
        package holding states, counties, MSAs and MSA definitions, or None
        when neither a local copy nor the s3 bucket could be reached.

    """

    def __init__(self):
        """Instantiate a new DataStore object."""
        # start from a defined state so a failed lookup never leaves the
        # instance half-built (this class is instantiated at import time)
        self.tracts_cartographic = None
        self.administrative = None

        try:  # if any of these aren't found, stream them instead
            administrative = quilt3.Package.browse("census/administrative")
            tracts_cartographic = quilt3.Package.browse("census/tracts_cartographic")
        except FileNotFoundError:
            warn(
                "Unable to locate local census data. Streaming instead.\n"
                "If you plan to use census data repeatedly you can store it locally "
                "with the io.store_census function for better performance"
            )
            try:
                tracts_cartographic = quilt3.Package.browse(
                    "census/tracts_cartographic", "s3://spatial-ucr"
                )
                administrative = quilt3.Package.browse(
                    "census/administrative", "s3://spatial-ucr"
                )
            except Timeout:
                warn(
                    "Unable to locate local census data and unable to reach s3 bucket. "
                    "You will be unable to use built-in data during this session. "
                    "If you need these data, please try downloading a local copy "
                    "with the io.store_census function, then restart your "
                    "python kernel and try again."
                )
                # keep both packages as None; the session continues without them
                return

        self.tracts_cartographic = tracts_cartographic
        self.administrative = administrative

    def __dir__(self):
        """List the dataset accessors offered for tab completion."""
        return [
            "blocks_2000",
            "blocks_2010",
            "codebook",
            "counties",
            "ltdb",
            "msa_definitions",
            "msas",
            "ncdb",
            "states",
            "tracts_1990",
            "tracts_2000",
            "tracts_2010",
        ]

    @staticmethod
    def _as_list(states):
        """Normalize a `states` argument (None, scalar or iterable) to a list."""
        if states is None:
            return []
        if isinstance(states, (str, int)):
            return [states]
        return list(states)

    def _blocks(self, year, states, convert, fips):
        """Load census blocks for `year`, one parquet table per requested state."""
        if states is None:
            raise TypeError(
                "`states` is required: pass a state fips code or a list of them"
            )

        package_name = f"census/blocks_{year}"
        try:  # if the package isn't found locally, stream it instead
            package = quilt3.Package.browse(package_name)
        except FileNotFoundError:
            warn(
                f"Unable to locate local census {year} block data. Streaming instead.\n"
                "If you plan to use census data repeatedly you can store it locally "
                f"with the io.store_blocks_{year} function for better performance"
            )
            try:
                package = quilt3.Package.browse(package_name, "s3://spatial-ucr")
            except Timeout:
                warn(
                    "Unable to locate local census data and unable to reach s3 bucket. "
                    "You will be unable to use built-in data during this session. "
                    "If you need these data, please try downloading a local copy "
                    f"with the io.store_blocks_{year} function, then restart your "
                    "python kernel and try again."
                )
                # nothing can be returned without the package; surface the real cause
                raise

        # keyed by state so a state listed twice is only included once
        frames = {}
        for state in self._as_list(states):
            frame = package[f"{state}.parquet"]()
            if fips:
                # copy so the year column is not written into a slice
                frame = frame[frame["geoid"].str.startswith(fips)].copy()
            frame["year"] = year
            frames[state] = frame

        blocks = pd.concat(list(frames.values()), sort=True)
        return _convert_gdf(blocks) if convert else blocks

    def _tracts(self, year, states, convert):
        """Load the national cartographic tracts for `year`, optionally subset by state."""
        tracts = self.tracts_cartographic[f"tracts_{year}_500k.parquet"]()
        wanted = self._as_list(states)
        if wanted:
            # first two characters of the geoid are compared against the requested states;
            # copy so the year column is not written into a slice
            tracts = tracts[tracts.geoid.str[:2].isin(wanted)].copy()
        tracts["year"] = year
        return _convert_gdf(tracts) if convert else tracts

    def blocks_2000(self, states=None, convert=True, fips=None):
        """Census blocks for 2000.

        Parameters
        ----------
        states : str, int or list-like
            state fips code, or list of state fips codes, to return as a dataframe.
            Required; one `<state>.parquet` table is loaded per entry.
        convert : bool
            if True, return geodataframe, else return dataframe (the default is True).
        fips : str, optional
            if given, keep only blocks whose `geoid` starts with this prefix.

        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            2000 blocks as a geodataframe or as a dataframe with geometry
            stored as well-known binary on the 'wkb' column.

        Raises
        ------
        TypeError
            if `states` is None.
        requests.exceptions.Timeout
            if the data are not stored locally and streaming them times out.

        """
        return self._blocks(2000, states, convert, fips)

    def blocks_2010(self, states=None, convert=True, fips=None):
        """Census blocks for 2010.

        Parameters
        ----------
        states : str, int or list-like
            state fips code, or list of state fips codes, to return as a dataframe.
            Required; one `<state>.parquet` table is loaded per entry.
        convert : bool
            if True, return geodataframe, else return dataframe (the default is True).
        fips : str, optional
            if given, keep only blocks whose `geoid` starts with this prefix.

        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            2010 blocks as a geodataframe or as a dataframe with geometry
            stored as well-known binary on the 'wkb' column.

        Raises
        ------
        TypeError
            if `states` is None.
        requests.exceptions.Timeout
            if the data are not stored locally and streaming them times out.

        """
        return self._blocks(2010, states, convert, fips)

    def tracts_1990(self, states=None, convert=True):
        """Nationwide Census Tracts as drawn in 1990 (cartographic 500k).

        Parameters
        ----------
        states : str or list-like, optional
            state fips code(s) used to subset the national dataframe;
            all states are returned when omitted.
        convert : bool
            if True, return geodataframe, else return dataframe (the default is True).

        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            1990 tracts as a geodataframe or as a dataframe with geometry
            stored as well-known binary on the 'wkb' column.

        """
        return self._tracts(1990, states, convert)

    def tracts_2000(self, states=None, convert=True):
        """Nationwide Census Tracts as drawn in 2000 (cartographic 500k).

        Parameters
        ----------
        states : str or list-like, optional
            state fips code(s) used to subset the national dataframe;
            all states are returned when omitted.
        convert : bool
            if True, return geodataframe, else return dataframe (the default is True).

        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            2000 tracts as a geodataframe or as a dataframe with geometry
            stored as well-known binary on the 'wkb' column.

        """
        return self._tracts(2000, states, convert)

    def tracts_2010(self, states=None, convert=True):
        """Nationwide Census Tracts as drawn in 2010 (cartographic 500k).

        Parameters
        ----------
        states : str or list-like, optional
            state fips code(s) used to subset the national dataframe;
            all states are returned when omitted.
        convert : bool
            if True, return geodataframe, else return dataframe (the default is True).

        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            2010 tracts as a geodataframe or as a dataframe with geometry
            stored as well-known binary on the 'wkb' column.

        """
        return self._tracts(2010, states, convert)

    def msas(self, convert=True):
        """Metropolitan Statistical Areas as drawn in 2010.

        Data come from the U.S. Census Bureau's most recent TIGER/LINE files
        https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2018&layergroup=Core+Based+Statistical+Areas

        Parameters
        ----------
        convert : bool
            if True, return geodataframe, else return dataframe (the default is True).

        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            2010 MSAs, sorted by name, as a geodataframe or as a dataframe with
            geometry stored as well-known binary on the 'wkb' column.

        """
        msas = self.administrative["msas.parquet"]().sort_values(by="name")
        return _convert_gdf(msas) if convert else msas

    def states(self, convert=True):
        """States.

        Parameters
        ----------
        convert : bool
            if True, return geodataframe, else return dataframe (the default is True).

        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            US States as a geodataframe or as a dataframe with geometry
            stored as well-known binary on the 'wkb' column.

        """
        states = self.administrative["states.parquet"]()
        return _convert_gdf(states) if convert else states

    def counties(self, convert=True):
        """Nationwide counties as drawn in 2010.

        Parameters
        ----------
        convert : bool
            if True, return geodataframe, else return dataframe (the default is True).

        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            2010 counties as a geodataframe or as a dataframe with geometry
            stored as well-known binary on the 'wkb' column.

        """
        counties = self.administrative["counties.parquet"]()
        return _convert_gdf(counties) if convert else counties

    def msa_definitions(self):
        """2010 Metropolitan Statistical Area definitions.

        Data come from the U.S. Census Bureau's most recent delineation files, available at
        https://www.census.gov/geographies/reference-files/time-series/demo/metro-micro/delineation-files.html

        Returns
        -------
        pandas.DataFrame.
            dataframe that stores state/county --> MSA crosswalk definitions.

        """
        return self.administrative["msa_definitions.parquet"]()

    def ltdb(self):
        """Longitudinal Tract Database (LTDB).

        Returns
        -------
        pandas.DataFrame or geopandas.GeoDataFrame
            LTDB as a long-form geo/dataframe, or None (with a warning) when
            the data have not been stored locally.

        """
        try:
            return storage["ltdb"]()
        except KeyError:
            warn(
                "Unable to locate LTDB data. Try saving the data again "
                "using the `store_ltdb` function"
            )
            return None

    def ncdb(self):
        """Geolytics Neighborhood Change Database (NCDB).

        Returns
        -------
        pandas.DataFrame
            NCDB as a long-form dataframe, or None (with a warning) when
            the data have not been stored locally.

        """
        try:
            return storage["ncdb"]()
        except KeyError:
            warn(
                "Unable to locate NCDB data. Try saving the data again "
                "using the `store_ncdb` function"
            )
            return None

    def codebook(self):
        """Codebook.

        Returns
        -------
        pandas.DataFrame
            codebook that stores variable names, definitions, and formulas.

        """
        # variables.csv is resolved relative to this module's own location
        return pd.read_csv(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), "io/variables.csv")
        )
# Module-level DataStore instance; constructing it here runs DataStore.__init__ at import time.
datasets = DataStore()