Source code for geosnap.analyze.analytics

"""Tools for the spatial analysis of neighborhood change."""

from collections import namedtuple

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from libpysal.weights import attach_islands
from libpysal.weights.contiguity import Queen, Rook
from libpysal.weights.distance import KNN

from .._data import _Map
from .cluster import (
    affinity_propagation,
    azp,
    gaussian_mixture,
    hdbscan,
    kmeans,
    max_p,
    skater,
    spectral,
    spenc,
    ward,
    ward_spatial,
)

ModelResults = namedtuple(
    "model", ["X", "columns", "labels", "instance", "W"], rename=False
)
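

# A short illustrative sketch (not part of the library) of how a ModelResults
# tuple is typically consumed once a model has been fit; `results` here is a
# hypothetical return value from one of the functions below.
def _describe_model_results(results):
    """Summarize a fitted ModelResults tuple."""
    print(f"fit on {results.X.shape[0]} rows across {len(results.columns)} columns")
    print(f"{len(np.unique(results.labels))} distinct cluster labels")
    if results.W is not None:
        # W is only populated by the spatially constrained methods
        print(f"spatial weights defined for {results.W.n} observations")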


def cluster(
    gdf,
    n_clusters=6,
    method=None,
    best_model=False,
    columns=None,
    verbose=False,
    time_var="year",
    id_var="geoid",
    scaler="std",
    pooling="fixed",
    **kwargs,
):
    """Create a geodemographic typology by running a cluster analysis on the
    study area's neighborhood attributes.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame, required
        long-form GeoDataFrame containing neighborhood attributes
    n_clusters : int, required
        the number of clusters to model. The default is 6.
    method : str in ['kmeans', 'ward', 'affinity_propagation', 'spectral', 'gaussian_mixture', 'hdbscan'], required
        the clustering algorithm used to identify neighborhood types
    best_model : bool, optional
        if using a gaussian mixture model, use BIC to choose the best
        n_clusters (the default is False).
    columns : list-like, required
        subset of columns on which to apply the clustering
    verbose : bool, optional
        whether to print warning messages (the default is False).
    time_var : str, optional
        which column on the dataframe defines time and/or sequencing of the
        long-form data. Default is "year"
    id_var : str, optional
        which column on the long-form dataframe identifies the stable units
        over time. In a wide-form dataset, this would be the unique index
    scaler : None or scaler from sklearn.preprocessing, optional
        a scikit-learn preprocessing class that will be used to rescale the
        data. Defaults to sklearn.preprocessing.StandardScaler
    pooling : ["fixed", "pooled", "unique"], optional (default='fixed')
        How to treat temporal data when applying scaling. Options include:

        * fixed : scaling is fixed to each time period
        * pooled : data are pooled across all time periods
        * unique : if scaling, apply the scaler to each time period, then
          generate clusters unique to each time period.

    Returns
    -------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame with a column of neighborhood cluster labels appended as
        a new column. If the cluster method already exists as a column on the
        DataFrame, the new column name will be incremented.
    model : named tuple
        A tuple with attributes X, columns, labels, instance, W, which store
        the input matrix, column labels, cluster labels, fitted model
        instance, and spatial weights matrix. If pooling is "unique", a
        dictionary of such tuples keyed on the time variable is returned
        instead.
    model_name : str
        name of model to be stored in a Community

    """
    specification = {
        "ward": ward,
        "kmeans": kmeans,
        "affinity_propagation": affinity_propagation,
        "gaussian_mixture": gaussian_mixture,
        "spectral": spectral,
        "hdbscan": hdbscan,
    }
    if scaler == "std":
        scaler = StandardScaler()
    if method not in specification.keys():
        raise ValueError(
            "`method` must be one of ['kmeans', 'ward', 'affinity_propagation', "
            "'spectral', 'gaussian_mixture', 'hdbscan']"
        )
    # if we already have a column named after the clustering method, increment it
    if method in gdf.columns.tolist():
        model_name = method + str(len(gdf.columns[gdf.columns.str.startswith(method)]))
    else:
        model_name = method
    if not columns:
        raise ValueError("You must provide a subset of columns as input")

    times = gdf[time_var].unique()
    gdf = gdf.set_index([time_var, id_var])

    # this is the dataset we'll operate on
    data = gdf.copy()[columns]
    data = data.dropna(how="any", subset=columns)

    if scaler:
        if pooling in ["fixed", "unique"]:
            # if fixed (or unique), scale within each time period
            for time in times:
                data.loc[time] = scaler.fit_transform(data.loc[time].values)
        elif pooling == "pooled":
            # if pooled, scale the whole series at once
            data.loc[:, columns] = scaler.fit_transform(data.values)

    # the scaler can create nans if a column has no variance, so fill with 0
    data = data.fillna(0)

    if pooling != "unique":
        # run the cluster model, then join the labels back to the original data
        model = specification[method](
            data,
            n_clusters=n_clusters,
            best_model=best_model,
            verbose=verbose,
            **kwargs,
        )
        labels = model.labels_.astype(str)
        data = data.reset_index()
        clusters = pd.DataFrame(
            {model_name: labels, time_var: data[time_var], id_var: data[id_var]}
        )
        clusters.set_index([time_var, id_var], inplace=True)
        clusters = clusters[~clusters.index.duplicated(keep="first")]
        gdf = gdf.join(clusters, how="left")
        gdf = gdf.reset_index()
        results = ModelResults(
            X=data[columns].values,
            columns=columns,
            labels=model.labels_,
            instance=model,
            W=None,
        )
        return gdf, results, model_name

    elif pooling == "unique":
        models = _Map()
        gdf[model_name] = np.nan
        data = data.reset_index()
        for time in times:
            df = data[data[time_var] == time]
            model = specification[method](
                df[columns],
                n_clusters=n_clusters,
                best_model=best_model,
                verbose=verbose,
                **kwargs,
            )
            labels = model.labels_.astype(str)
            clusters = pd.DataFrame(
                {model_name: labels, time_var: time, id_var: df[id_var]}
            )
            clusters.set_index([time_var, id_var], inplace=True)
            gdf.update(clusters)
            results = ModelResults(
                X=df[columns].values,
                columns=columns,
                labels=model.labels_,
                instance=model,
                W=None,
            )
            models[time] = results
        gdf = gdf.reset_index()
        return gdf, models, model_name
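

# A minimal usage sketch for `cluster` (illustrative only): it assumes a
# long-form GeoDataFrame with "year" and "geoid" columns plus the attribute
# columns named below, which are hypothetical stand-ins for real variables.
def _example_cluster(tracts):
    """Fit a k-means typology and inspect the label counts."""
    labeled, results, name = cluster(
        tracts,
        n_clusters=5,
        method="kmeans",
        columns=["median_income", "pct_poverty"],  # hypothetical attribute columns
    )
    # `labeled` carries the new cluster-label column, named `name`
    print(labeled[name].value_counts())
    return labeled, results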
def cluster_spatial(
    gdf,
    n_clusters=6,
    spatial_weights="rook",
    method=None,
    columns=None,
    threshold_variable="count",
    threshold=10,
    time_var="year",
    id_var="geoid",
    scaler="std",
    weights_kwargs=None,
    **kwargs,
):
    """Create a *spatial* geodemographic typology by running a cluster
    analysis on the metro area's neighborhood attributes and including a
    contiguity constraint.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        long-form geodataframe holding neighborhood attribute and geometry data.
    n_clusters : int
        the number of clusters to model. The default is 6.
    spatial_weights : ['queen', 'rook'] or libpysal.weights.W object
        spatial weights matrix specification. By default, geosnap will
        calculate Rook weights, but you can also pass a libpysal.weights.W
        object for more control over the specification.
    method : str in ['ward_spatial', 'spenc', 'skater', 'azp', 'max_p']
        the clustering algorithm used to identify neighborhood types
    columns : array-like
        subset of columns on which to apply the clustering
    threshold_variable : str
        for max-p, which variable should define `p`. The default is "count",
        which will grow regions until the threshold number of polygons have
        been aggregated
    threshold : numeric
        threshold to use for max-p clustering (the default is 10).
    time_var : str
        which column on the dataframe defines time and/or sequencing of the
        long-form data. Default is "year"
    id_var : str
        which column on the long-form dataframe identifies the stable units
        over time. In a wide-form dataset, this would be the unique index
    weights_kwargs : dict
        If passing a libpysal.weights.W instance to spatial_weights, these
        additional keyword arguments will be passed to the weights constructor
    scaler : None or scaler class from sklearn.preprocessing
        a scikit-learn preprocessing class that will be used to rescale the
        data. Defaults to sklearn.preprocessing.StandardScaler

    Returns
    -------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame with a column of neighborhood cluster labels appended as
        a new column. If the cluster method already exists as a column on the
        DataFrame, the new column name will be incremented.
    models : dict of named tuples
        tab-completable dictionary of named tuples keyed on the Community's
        time variable (e.g. year). The tuples store model results and have
        attributes X, columns, labels, instance, W, which store the input
        matrix, column labels, cluster labels, fitted model instance, and
        spatial weights matrix
    model_name : str
        name of model to be stored in a Community

    """
    specification = {
        "azp": azp,
        "spenc": spenc,
        "ward_spatial": ward_spatial,
        "skater": skater,
        "max_p": max_p,
    }
    if not method:
        raise ValueError("You must choose a clustering algorithm to use")
    if method not in specification.keys():
        raise ValueError(
            "`method` must be one of ['ward_spatial', 'spenc', 'skater', "
            "'azp', 'max_p']"
        )
    if method in gdf.columns.tolist():
        model_name = method + str(len(gdf.columns[gdf.columns.str.startswith(method)]))
    else:
        model_name = method
    if not columns:
        raise ValueError("You must provide a subset of columns as input")

    if scaler == "std":
        scaler = StandardScaler()

    times = gdf[time_var].unique()
    gdf = gdf.set_index([time_var, id_var])

    # this is the dataset we'll operate on
    data = gdf.copy()[columns + ["geometry"]]

    contiguity_weights = {"queen": Queen, "rook": Rook}
    if spatial_weights in contiguity_weights.keys():
        W = contiguity_weights[spatial_weights]
    else:
        W = spatial_weights

    models = _Map()
    gdf[model_name] = np.nan

    # loop over each time period, standardize the data, and build a weights matrix
    for time in times:
        df = data.loc[time].dropna(how="any", subset=columns).reset_index()
        df[time_var] = time

        if scaler:
            df[columns] = scaler.fit_transform(df[columns].values)

        if weights_kwargs:
            w0 = W.from_dataframe(df, **weights_kwargs)
        else:
            w0 = W.from_dataframe(df)
        w1 = KNN.from_dataframe(df, k=1)
        ws = [w0, w1]

        if threshold_variable and threshold_variable != "count":
            # align the threshold values with the observations used to fit the model
            threshold_var = (
                gdf.loc[time, threshold_variable].reindex(df[id_var]).values
            )
            ws[0] = attach_islands(ws[0], ws[1])
        elif threshold_variable == "count":
            threshold_var = np.ones(len(df))
            ws[0] = attach_islands(ws[0], ws[1])
        else:
            threshold_var = None

        model = specification[method](
            df[columns],
            w=ws[0],
            n_clusters=n_clusters,
            threshold_variable=threshold_var,
            threshold=threshold,
            **kwargs,
        )

        labels = model.labels_.astype(str)
        clusters = pd.DataFrame(
            {model_name: labels, time_var: df[time_var], id_var: df[id_var]}
        )
        clusters = clusters.drop_duplicates(subset=[id_var])
        clusters.set_index([time_var, id_var], inplace=True)
        gdf.update(clusters)
        results = ModelResults(
            X=df[columns].values,
            columns=columns,
            labels=model.labels_,
            instance=model,
            W=ws[0],
        )
        models[time] = results

    gdf = gdf.reset_index()
    return gdf, models, model_name
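

# A minimal usage sketch for `cluster_spatial` (illustrative only), assuming
# the same hypothetical long-form GeoDataFrame as above, with valid polygon
# geometries from which contiguity weights can be built.
def _example_cluster_spatial(tracts):
    """Fit a spatially constrained Ward clustering with Queen weights."""
    labeled, models, name = cluster_spatial(
        tracts,
        n_clusters=5,
        spatial_weights="queen",
        method="ward_spatial",
        columns=["median_income", "pct_poverty"],  # hypothetical attribute columns
    )
    # one ModelResults per time period, keyed on the time variable
    for year, results in models.items():
        print(year, len(np.unique(results.labels)), "clusters")
    return labeled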