Source code for geosnap.analyze.cluster

"""Wrappers for multivariate clustering algorithms."""

from warnings import warn

import numpy as np
from region.max_p_regions.heuristics import MaxPRegionsHeu
from region.p_regions.azp import AZP
from region.skater.skater import Spanning_Forest
from sklearn.cluster import (
    AffinityPropagation,
    AgglomerativeClustering,
    KMeans,
    MiniBatchKMeans,
    SpectralClustering,
)
from sklearn.mixture import GaussianMixture
from spenc import SPENC

# Sklearn a-spatial models


def ward(X, n_clusters=5, **kwargs):
    """Fit an agglomerative clustering model that uses Ward linkage.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    n_clusters : int, optional, default: 5
        The number of clusters to form.
    **kwargs
        Accepted for call-signature compatibility; not used by this function.

    Returns
    -------
    fitted model : sklearn.cluster.AgglomerativeClustering instance

    """
    # scikit-learn estimators return themselves from ``fit``.
    return AgglomerativeClustering(n_clusters=n_clusters, linkage="ward").fit(X)
def kmeans(
    X,
    n_clusters,
    init="k-means++",
    n_init=10,
    max_iter=300,
    tol=0.0001,
    verbose=0,
    random_state=None,
    copy_x=True,
    n_jobs=None,
    algorithm="auto",
    precompute_distances="auto",
    **kwargs
):
    """K-Means clustering.

    Inputs with more than 12000 rows are fitted with ``MiniBatchKMeans``;
    smaller inputs are fitted with ``KMeans``.

    Parameters
    ----------
    X : array-like
        n x k attribute data. Must expose a ``shape`` attribute.
    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.
    init : str or array-like, optional, default: "k-means++"
        Initialization method, passed to the estimator.
    n_init : int, optional, default: 10
        Passed to the estimator.
    max_iter : int, optional, default: 300
        Passed to the estimator.
    tol : float, optional, default: 0.0001
        Passed to the estimator.
    verbose : int, optional, default: 0
        Verbosity level, passed to the estimator.
    random_state : int, RandomState instance or None, optional
        Passed to the estimator.
    copy_x : bool, optional, default: True
        Passed to ``KMeans`` only.
    n_jobs : int or None, optional, default: None
        Passed to ``KMeans`` only.
    algorithm : str, optional, default: "auto"
        Passed to ``KMeans`` only.
    precompute_distances : optional, default: "auto"
        Passed to ``KMeans`` only.
    **kwargs
        Accepted for call-signature compatibility; not used by this function.

    Returns
    -------
    fitted model : sklearn.cluster.KMeans or
        sklearn.cluster.MiniBatchKMeans instance

    """
    if X.shape[0] > 12000:
        model = MiniBatchKMeans(
            n_clusters=n_clusters,
            init=init,
            n_init=n_init,
            max_iter=max_iter,
            tol=tol,
            verbose=verbose,
            random_state=random_state,
        )
    else:
        # NOTE(review): `precompute_distances` and `n_jobs` appear to have been
        # removed from KMeans in recent scikit-learn releases -- confirm the
        # supported scikit-learn version before relying on them.
        model = KMeans(
            n_clusters=n_clusters,
            # Forward the caller's `init` and `verbose`; they were previously
            # overridden with hard-coded values in this branch.
            init=init,
            n_init=n_init,
            max_iter=max_iter,
            tol=tol,
            precompute_distances=precompute_distances,
            verbose=verbose,
            random_state=random_state,
            copy_x=copy_x,
            n_jobs=n_jobs,
            algorithm=algorithm,
        )
    model.fit(X)
    return model
def affinity_propagation(
    X,
    damping=0.8,
    preference=-1000,
    max_iter=500,
    convergence_iter=15,
    copy=True,
    affinity="euclidean",
    verbose=False,
    **kwargs
):
    """Cluster observations with Affinity Propagation.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    damping : float, optional, default: 0.8
        The damping parameter passed to scikit-learn's affinity
        propagation algorithm
    preference : array-like, shape (n_samples,) or float, optional, default: -1000
        The preference parameter passed to scikit-learn's affinity
        propagation algorithm
    max_iter : int, optional, default: 500
        Maximum number of iterations
    convergence_iter : int, optional, default: 15
        Passed to scikit-learn's affinity propagation algorithm
    copy : bool, optional, default: True
        Passed to scikit-learn's affinity propagation algorithm
    affinity : str, optional, default: "euclidean"
        Passed to scikit-learn's affinity propagation algorithm
    verbose : bool, optional, default: False
        Passed to scikit-learn's affinity propagation algorithm
    **kwargs
        Accepted for call-signature compatibility; not used by this function.

    Returns
    -------
    fitted cluster instance : sklearn.cluster.AffinityPropagation

    """
    settings = dict(
        preference=preference,
        damping=damping,
        max_iter=max_iter,
        convergence_iter=convergence_iter,
        copy=copy,
        affinity=affinity,
        verbose=verbose,
    )
    # scikit-learn estimators return themselves from ``fit``.
    return AffinityPropagation(**settings).fit(X)
def spectral(
    X,
    n_clusters,
    eigen_solver=None,
    random_state=None,
    n_init=10,
    gamma=1.0,
    affinity="rbf",
    n_neighbors=10,
    eigen_tol=0.0,
    assign_labels="kmeans",
    degree=3,
    coef0=1,
    kernel_params=None,
    n_jobs=-1,
    **kwargs
):
    """Spectral Clustering.

    Every named argument is handed unchanged to
    ``sklearn.cluster.SpectralClustering``.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.
    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities.
    random_state : int, RandomState instance or None (default)
        A pseudo random number generator used for the initialization of the
        lobpcg eigen vectors decomposition when eigen_solver='amg' and by
        the K-Means initialization. Use an int to make the randomness
        deterministic.
    n_init : int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.
    gamma : float, default=1.0
        Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
        Ignored for affinity='nearest_neighbors'.
    affinity : string or callable, default 'rbf'
        How to construct the affinity matrix.
    n_neighbors : integer, default=10
        Number of neighbors to use when constructing the affinity matrix
        using the nearest neighbors method. Ignored for affinity='rbf'.
    eigen_tol : float, optional, default: 0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when eigen_solver='arpack'.
    assign_labels : str, default='kmeans'
        Passed to ``SpectralClustering``.
    degree : float, default=3
        Degree of the polynomial kernel. Ignored by other kernels.
    coef0 : float, default=1
        Zero coefficient for polynomial and sigmoid kernels.
        Ignored by other kernels.
    kernel_params : optional, default: None
        Passed to ``SpectralClustering``.
    n_jobs : int or None, optional (default=-1)
        The number of parallel jobs to run.
        None means 1 unless in a joblib.parallel_backend context.
        -1 means using all processors.
    **kwargs : dict
        Additional keyword arguments; accepted but not used by this function.

    Returns
    -------
    fitted cluster instance : sklearn.cluster.SpectralClustering

    """
    settings = dict(
        n_clusters=n_clusters,
        eigen_solver=eigen_solver,
        random_state=random_state,
        n_init=n_init,
        gamma=gamma,
        affinity=affinity,
        n_neighbors=n_neighbors,
        eigen_tol=eigen_tol,
        assign_labels=assign_labels,
        degree=degree,
        coef0=coef0,
        kernel_params=kernel_params,
        n_jobs=n_jobs,
    )
    # scikit-learn estimators return themselves from ``fit``.
    return SpectralClustering(**settings).fit(X)
def gaussian_mixture(
    X,
    n_clusters=5,
    covariance_type="full",
    best_model=False,
    max_clusters=10,
    random_state=None,
    **kwargs
):
    """Clustering with Gaussian Mixture Model.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    n_clusters : int, optional, default: 5
        The number of clusters to form. Not used when `best_model` is set.
    covariance_type : str, optional, default: "full"
        The covariance parameter passed to scikit-learn's GaussianMixture
        algorithm. Not used when `best_model` is set, because the search
        tries "spherical", "tied", "diag" and "full" in turn.
    best_model : bool, optional, default: False
        Option for finding endogenous K according to Bayesian Information
        Criterion. When set, one model is fitted per covariance type and
        per component count in 1..`max_clusters`, and the model with the
        lowest BIC is returned.
    max_clusters : int, optional, default: 10
        The max number of clusters to test if using `best_model` option
    random_state : int, optional, default: None
        The seed used to generate replicable results
    **kwargs
        Accepted for call-signature compatibility; not used by this function.

    Returns
    -------
    fitted cluster instance : sklearn.mixture.GaussianMixture
        The fitted model, with an added ``labels_`` attribute holding
        ``model.predict(X)``.

    Raises
    ------
    ValueError
        If `best_model` is set and `max_clusters` is smaller than 1, or if
        no candidate model produced a comparable BIC value.

    """
    if random_state is None:
        warn(
            "Note: Gaussian Mixture Clustering is probabilistic--"
            "cluster labels may be different for different runs. If you need consistency, "
            "you should set the `random_state` parameter"
        )

    # Truthiness rather than `is True`, so numpy booleans also enable the search.
    if best_model:
        if max_clusters < 1:
            # An empty search range would otherwise leave no model to return.
            raise ValueError(
                "`max_clusters` must be at least 1 when `best_model` is used"
            )

        # selection routine from
        # https://plot.ly/scikit-learn/plot-gmm-selection/
        lowest_bic = np.inf
        best_gmm = None
        for cv_type in ("spherical", "tied", "diag", "full"):
            for n_components in range(1, max_clusters + 1):
                # Fit a Gaussian mixture with EM
                gmm = GaussianMixture(
                    n_components=n_components,
                    random_state=random_state,
                    covariance_type=cv_type,
                )
                gmm.fit(X)
                candidate_bic = gmm.bic(X)
                # Strict comparison keeps the earliest candidate on ties.
                if candidate_bic < lowest_bic:
                    lowest_bic = candidate_bic
                    best_gmm = gmm

        if best_gmm is None:
            raise ValueError(
                "no candidate Gaussian mixture produced a comparable BIC value"
            )
        model = best_gmm
    else:
        model = GaussianMixture(
            n_components=n_clusters,
            random_state=random_state,
            covariance_type=covariance_type,
        )
        model.fit(X)

    # GaussianMixture has no `labels_`; attach it so the result looks like
    # the other fitted clusterers returned by this module.
    model.labels_ = model.predict(X)
    return model
def hdbscan(X, min_cluster_size=5, gen_min_span_tree=True, **kwargs):
    """Clustering with Hierarchical DBSCAN.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    min_cluster_size : int, default: 5
        the minimum number of points necessary to generate a cluster
    gen_min_span_tree : bool, default: True
        Passed to ``HDBSCAN`` as its ``gen_min_span_tree`` option.
    **kwargs
        Accepted for call-signature compatibility; not used by this function.

    Returns
    -------
    fitted cluster instance : hdbscan.hdbscan.HDBSCAN

    Raises
    ------
    ImportError
        If the optional ``hdbscan`` package is not installed.

    """
    # Imported lazily because hdbscan is an optional dependency.
    try:
        from hdbscan import HDBSCAN
    except ImportError as err:
        raise ImportError(
            "You must have the hdbscan package installed to use this function"
        ) from err

    # `gen_min_span_tree` was previously accepted but never forwarded.
    model = HDBSCAN(
        min_cluster_size=min_cluster_size, gen_min_span_tree=gen_min_span_tree
    )
    model.fit(X)
    return model
# Spatially Explicit/Encouraged Methods
def ward_spatial(X, w, n_clusters=5, **kwargs):
    """Ward-linkage agglomerative clustering under a spatial connectivity constraint.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    w : libpysal.weights.W instance
        spatial weights matrix; its ``sparse`` attribute is used as the
        connectivity matrix
    n_clusters : int, optional, default: 5
        The number of clusters to form.
    **kwargs
        Accepted for call-signature compatibility; not used by this function.

    Returns
    -------
    fitted cluster instance : sklearn.cluster.AgglomerativeClustering

    """
    connectivity = w.sparse
    clusterer = AgglomerativeClustering(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )
    # scikit-learn estimators return themselves from ``fit``.
    return clusterer.fit(X)
def spenc(X, w, n_clusters=5, gamma=1, **kwargs):
    """Spatially encouraged spectral clustering.

    :cite:`wolf2018`

    Parameters
    ----------
    X : array-like
        n x k attribute data
    w : libpysal.weights.W instance
        spatial weights matrix; its ``sparse`` attribute is passed to
        ``SPENC.fit`` alongside `X`
    n_clusters : int, optional, default: 5
        The number of clusters to form.
    gamma : int, default: 1
        Passed to the ``SPENC`` constructor.
    **kwargs
        Accepted for call-signature compatibility; not used by this function.

    Returns
    -------
    fitted cluster instance : spenc.SPENC

    """
    clusterer = SPENC(n_clusters=n_clusters, gamma=gamma)
    adjacency = w.sparse
    clusterer.fit(X, adjacency)
    return clusterer
def skater(
    X, w, n_clusters=5, floor=-np.inf, trace=False, islands="increase", **kwargs
):
    """SKATER spatial clustering algorithm.

    Parameters
    ----------
    X : array-like
        n x k attribute data. Must expose a ``values`` attribute
        (for example a pandas DataFrame).
    w : libpysal.weights.W instance
        spatial weights matrix
    n_clusters : int, optional, default: 5
        The number of clusters to form.
    floor : float, optional, default: -np.inf
        Passed to ``Spanning_Forest.fit`` as its ``quorum`` argument.
    trace : bool, optional, default: False
        Passed to ``Spanning_Forest.fit`` as its ``trace`` argument.
    islands : str, optional, default: "increase"
        Passed to ``Spanning_Forest.fit`` as its ``islands`` argument.
        NOTE(review): presumably controls how disconnected observations are
        treated -- confirm against the region package documentation.
    **kwargs
        Accepted for call-signature compatibility; not used by this function.

    Returns
    -------
    fitted cluster instance : region.skater.skater.Spanning_Forest
        The fitted model, with ``labels_`` set to its ``current_labels_``.

    """
    model = Spanning_Forest()
    # `islands` was previously accepted but never forwarded to the estimator.
    model.fit(
        n_clusters,
        w,
        data=X.values,
        quorum=floor,
        trace=trace,
        islands=islands,
    )
    # Expose the labels under the same attribute name as the other
    # clusterers in this module.
    model.labels_ = model.current_labels_
    return model
def azp(X, w, n_clusters=5, **kwargs):
    """AZP clustering algorithm.

    Parameters
    ----------
    X : array-like
        n x k attribute data. Must expose a ``values`` attribute
        (for example a pandas DataFrame).
    w : libpysal.weights.W instance
        spatial weights matrix
    n_clusters : int, optional, default: 5
        The number of clusters to form; passed to ``AZP.fit_from_w`` as
        ``n_regions``.
    **kwargs
        Accepted for call-signature compatibility; not used by this function.

    Returns
    -------
    fitted cluster instance : region.p_regions.azp.AZP

    """
    regionalizer = AZP()
    attributes = X.values
    regionalizer.fit_from_w(attr=attributes, w=w, n_regions=n_clusters)
    return regionalizer
def max_p(X, w, threshold_variable="count", threshold=10, **kwargs):
    """Max-p clustering algorithm :cite:`Duque2012`.

    Parameters
    ----------
    X : array-like
        n x k attribute data. Must expose a ``values`` attribute
        (for example a pandas DataFrame).
    w : libpysal.weights.W instance
        spatial weights matrix
    threshold_variable : str, default: "count"
        attribute variable to use as floor when calculating regions;
        passed as the third positional argument to
        ``MaxPRegionsHeu.fit_from_w``
    threshold : int, default: 10
        threshold value applied to `threshold_variable`; passed as the
        fourth positional argument to ``MaxPRegionsHeu.fit_from_w``.
        NOTE(review): earlier documentation called this an upper limit,
        while `threshold_variable` is described as a floor -- confirm the
        direction against the region package documentation.
    **kwargs
        Accepted for call-signature compatibility; not used by this function.

    Returns
    -------
    fitted cluster instance : region.max_p_regions.heuristics.MaxPRegionsHeu

    """
    regionalizer = MaxPRegionsHeu()
    attributes = X.values
    regionalizer.fit_from_w(w, attributes, threshold_variable, threshold)
    return regionalizer