Source code for geosnap.analyze.analytics

"""Tools for the spatial analysis of neighborhood change."""

import numpy as np
import pandas as pd
from libpysal.weights import attach_islands
from libpysal.weights.contiguity import Queen, Rook
from libpysal.weights.distance import KNN
from sklearn.preprocessing import StandardScaler

from .cluster import (
    azp,
    affinity_propagation,
    gaussian_mixture,
    hdbscan,
    kmeans,
    max_p,
    skater,
    spectral,
    spenc,
    ward,
    ward_spatial,
)


def cluster(
    gdf,
    n_clusters=6,
    method=None,
    best_model=False,
    columns=None,
    verbose=False,
    time_var="year",
    id_var="geoid",
    return_model=False,
    scaler=None,
    **kwargs,
):
    """Create a geodemographic typology by running a cluster analysis on the
    study area's neighborhood attributes.

    Parameters
    ----------
    gdf : pandas.DataFrame
        long-form (geo)DataFrame containing neighborhood attributes
    n_clusters : int
        the number of clusters to model. The default is 6.
    method : str
        the clustering algorithm used to identify neighborhood types
    best_model : bool
        if using a gaussian mixture model, use BIC to choose the best
        n_clusters. The default is False.
    columns : list-like
        subset of columns on which to apply the clustering
    verbose : bool
        whether to print warning messages. The default is False.
    time_var : str
        which column on the dataframe defines time and/or sequencing of the
        long-form data. The default is "year".
    id_var : str
        which column on the long-form dataframe identifies the stable units
        over time. In a wide-form dataset, this would be the unique index.
    return_model : bool
        whether to return the fitted cluster model in addition to the
        DataFrame. The default is False.
    scaler : sklearn.preprocessing scaler
        an instantiated scikit-learn preprocessing class that will be used to
        rescale the data. Defaults to sklearn.preprocessing.StandardScaler.

    Returns
    -------
    pandas.DataFrame
        DataFrame with a column of neighborhood cluster labels appended as a
        new column. Will overwrite columns of the same name.
    """
    if not columns:
        raise ValueError("You must provide a subset of columns as input")
    if not method:
        raise ValueError("You must choose a clustering algorithm to use")

    times = gdf[time_var].unique()
    gdf = gdf.set_index([time_var, id_var])

    # this is the dataset we'll operate on
    data = gdf.copy()[columns]
    data = data.dropna(how="any", subset=columns)

    # if the user doesn't specify, use the StandardScaler
    if not scaler:
        scaler = StandardScaler()
    # rescale the data within each time period
    for time in times:
        data.loc[time] = scaler.fit_transform(data.loc[time].values)
    # the rescaler can create nans if a column has no variance, so fill with 0
    data = data.fillna(0)

    specification = {
        "ward": ward,
        "kmeans": kmeans,
        "affinity_propagation": affinity_propagation,
        "gaussian_mixture": gaussian_mixture,
        "spectral": spectral,
        "hdbscan": hdbscan,
    }

    # run the cluster model, then join the labels back to the original data
    model = specification[method](
        data,
        n_clusters=n_clusters,
        best_model=best_model,
        verbose=verbose,
        **kwargs,
    )
    labels = model.labels_.astype(str)
    data = data.reset_index()
    clusters = pd.DataFrame(
        {method: labels, time_var: data[time_var], id_var: data[id_var]}
    )
    clusters.set_index([time_var, id_var], inplace=True)
    gdf = gdf.join(clusters, how="left")
    gdf = gdf.reset_index()
    if return_model:
        return gdf, model
    return gdf
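A minimal usage sketch for the function above (not part of the module source); the column names and values here are hypothetical, and the long-form frame follows the "year"/"geoid" defaults documented in the signature:

import pandas as pd

# hypothetical long-form input: two periods x four units, two attributes
df = pd.DataFrame(
    {
        "year": [2000] * 4 + [2010] * 4,
        "geoid": ["a", "b", "c", "d"] * 2,
        "median_income": [30, 45, 60, 52, 31, 47, 61, 55],
        "p_poverty": [0.30, 0.15, 0.05, 0.10, 0.28, 0.14, 0.06, 0.09],
    }
)

# label each unit with a k-means cluster; the result gains a "kmeans" column
labeled = cluster(
    df, n_clusters=2, method="kmeans", columns=["median_income", "p_poverty"]
)
print(labeled[["year", "geoid", "kmeans"]])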
def cluster_spatial(
    gdf,
    n_clusters=6,
    spatial_weights="rook",
    method=None,
    columns=None,
    threshold_variable="count",
    threshold=10,
    time_var="year",
    id_var="geoid",
    return_model=False,
    scaler=None,
    **kwargs,
):
    """Create a *spatial* geodemographic typology by running a cluster
    analysis on the metro area's neighborhood attributes while including a
    contiguity constraint.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        long-form geodataframe holding neighborhood attribute and geometry data.
    n_clusters : int
        the number of clusters to model. The default is 6.
    spatial_weights : str or libpysal weights class
        'queen' or 'rook' spatial weights matrix specification, or a custom
        libpysal weights class. The default is "rook".
    method : str
        the clustering algorithm used to identify neighborhood types
    columns : list-like
        subset of columns on which to apply the clustering
    threshold_variable : str
        for max-p, which variable should define `p`. The default is "count",
        which will grow regions until the threshold number of polygons have
        been aggregated.
    threshold : numeric
        threshold to use for max-p clustering. The default is 10.
    time_var : str
        which column on the dataframe defines time and/or sequencing of the
        long-form data. The default is "year".
    id_var : str
        which column on the long-form dataframe identifies the stable units
        over time. In a wide-form dataset, this would be the unique index.
    return_model : bool
        whether to return the fitted cluster model in addition to the
        GeoDataFrame. The default is False.
    scaler : sklearn.preprocessing scaler
        an instantiated scikit-learn preprocessing class that will be used to
        rescale the data. Defaults to sklearn.preprocessing.StandardScaler.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with a column of neighborhood cluster labels appended as
        a new column. Will overwrite columns of the same name.
    """
    if not columns:
        raise ValueError("You must provide a subset of columns as input")
    if not method:
        raise ValueError("You must choose a clustering algorithm to use")

    times = gdf[time_var].unique()
    gdf = gdf.set_index([time_var, id_var])

    # this is the dataset we'll operate on
    data = gdf.copy()[columns + ["geometry"]]

    contiguity_weights = {"queen": Queen, "rook": Rook}
    if spatial_weights in contiguity_weights:
        W = contiguity_weights[spatial_weights]
    else:
        W = spatial_weights

    specification = {
        "azp": azp,
        "spenc": spenc,
        "ward_spatial": ward_spatial,
        "skater": skater,
        "max_p": max_p,
    }

    # if the user doesn't specify, use the StandardScaler
    if not scaler:
        scaler = StandardScaler()

    dfs = []
    # loop over each time period, standardize the data, and build a weights matrix
    for time in times:
        df = data.loc[time].dropna(how="any", subset=columns).reset_index()
        df[time_var] = time
        df[columns] = scaler.fit_transform(df[columns].values)
        # contiguity weights, plus nearest-neighbor weights used to reconnect islands
        w0 = W.from_dataframe(df)
        w1 = KNN.from_dataframe(df, k=1)
        ws = [w0, w1]
        # the rescaler can create nans if a column has no variance, so fill with 0
        df = df.fillna(0)

        if threshold_variable and threshold_variable != "count":
            # align the threshold variable with this period's observations
            threshold_var = (
                gdf[threshold_variable].loc[time].reindex(df[id_var]).values
            )
            ws[0] = attach_islands(ws[0], ws[1])
        elif threshold_variable == "count":
            threshold_var = np.ones(len(df))
            ws[0] = attach_islands(ws[0], ws[1])
        else:
            threshold_var = None

        model = specification[method](
            df[columns],
            w=ws[0],
            n_clusters=n_clusters,
            threshold_variable=threshold_var,
            threshold=threshold,
            **kwargs,
        )
        labels = model.labels_.astype(str)
        clusters = pd.DataFrame(
            {method: labels, time_var: df[time_var], id_var: df[id_var]}
        )
        clusters.set_index([time_var, id_var], inplace=True)
        dfs.append(gdf.loc[time].join(clusters, how="left"))

    gdf = pd.concat(dfs).reset_index()
    if return_model:
        return gdf, model
    return gdf
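A comparable sketch for the spatial variant (again illustrative, with hypothetical data). It builds a single-period 2x2 grid of square "tracts" so the default rook contiguity matrix can be formed, and relies on the default threshold_variable="count" path shown in the function body:

import geopandas as gpd
from shapely.geometry import box

# hypothetical grid of four unit-square tracts for one time period
gdf = gpd.GeoDataFrame(
    {
        "year": [2010] * 4,
        "geoid": ["a", "b", "c", "d"],
        "median_income": [30, 45, 60, 52],
        "geometry": [
            box(0, 0, 1, 1),
            box(1, 0, 2, 1),
            box(0, 1, 1, 2),
            box(1, 1, 2, 2),
        ],
    }
)

# spatially constrained clustering; the result gains a "ward_spatial" column
labeled = cluster_spatial(
    gdf, n_clusters=2, method="ward_spatial", columns=["median_income"]
)
print(labeled[["geoid", "ward_spatial"]])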