Source code for geosnap.analyze.cluster

from warnings import warn

import numpy as np
from region.max_p_regions.heuristics import MaxPRegionsHeu
from region.p_regions.azp import AZP
from region.skater.skater import Spanning_Forest
from sklearn.cluster import (
    AffinityPropagation,
    AgglomerativeClustering,
    KMeans,
    MiniBatchKMeans,
    SpectralClustering,
)
from sklearn.mixture import GaussianMixture
from spenc import SPENC

# Sklearn a-spatial models


def ward(X, n_clusters=5, **kwargs):
    """Agglomerative clustering using Ward linkage.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    n_clusters : int, optional, default: 5
        The number of clusters to form.

    Returns
    -------
    model: sklearn AgglomerativeClustering instance

    """
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
    model.fit(X)
    return model
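
# Example (illustrative sketch, not part of the geosnap API): the ward wrapper
# returns a fitted sklearn estimator, so cluster assignments live on
# ``labels_``. The data below is synthetic, for demonstration only:
#
#   X = np.random.random((100, 4))   # 100 observations, 4 attributes
#   model = ward(X, n_clusters=5)
#   labels = model.labels_           # one cluster id per row of X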

def kmeans(
    X,
    n_clusters,
    init="k-means++",
    n_init=10,
    max_iter=300,
    tol=0.0001,
    verbose=0,
    random_state=None,
    copy_x=True,
    n_jobs=None,
    algorithm="auto",
    precompute_distances="auto",
    **kwargs
):
    """K-Means clustering.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    n_clusters : int
        The number of clusters to form as well as the number of centroids to
        generate.

    Returns
    -------
    model: sklearn KMeans instance

    """
    if X.shape[0] > 12000:
        # use the mini-batch implementation for large samples
        model = MiniBatchKMeans(
            n_clusters=n_clusters,
            init=init,
            n_init=n_init,
            max_iter=max_iter,
            tol=tol,
            verbose=verbose,
            random_state=random_state,
        )
    else:
        model = KMeans(
            n_clusters=n_clusters,
            init=init,
            n_init=n_init,
            max_iter=max_iter,
            tol=tol,
            precompute_distances=precompute_distances,
            verbose=verbose,
            random_state=random_state,
            copy_x=copy_x,
            n_jobs=n_jobs,
            algorithm=algorithm,
        )
    model.fit(X)
    return model
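
# Example (illustrative sketch with synthetic data): ``kmeans`` dispatches on
# sample size, so large inputs transparently get the mini-batch variant:
#
#   small = np.random.random((1000, 4))
#   large = np.random.random((20000, 4))
#   type(kmeans(small, n_clusters=5))   # KMeans
#   type(kmeans(large, n_clusters=5))   # MiniBatchKMeans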

def affinity_propagation(
    X,
    damping=0.8,
    preference=-1000,
    max_iter=500,
    convergence_iter=15,
    copy=True,
    affinity="euclidean",
    verbose=False,
    **kwargs
):
    """Clustering with Affinity Propagation.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    preference : array-like, shape (n_samples,) or float, optional, default: -1000
        The preference parameter passed to scikit-learn's affinity propagation
        algorithm
    damping : float, optional, default: 0.8
        The damping parameter passed to scikit-learn's affinity propagation
        algorithm
    max_iter : int, optional, default: 500
        Maximum number of iterations

    Returns
    -------
    model: sklearn AffinityPropagation instance

    """
    model = AffinityPropagation(
        preference=preference,
        damping=damping,
        max_iter=max_iter,
        convergence_iter=convergence_iter,
        copy=copy,
        affinity=affinity,
        verbose=verbose,
    )
    model.fit(X)
    return model
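
# Example (illustrative sketch): affinity propagation infers the number of
# clusters from the data; lower (more negative) ``preference`` values yield
# fewer clusters. Synthetic data for demonstration only:
#
#   X = np.random.random((100, 4))
#   model = affinity_propagation(X, preference=-1000)
#   model.labels_                     # cluster assignments
#   model.cluster_centers_indices_   # rows of X chosen as exemplars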

def spectral(
    X,
    n_clusters,
    eigen_solver=None,
    random_state=None,
    n_init=10,
    gamma=1.0,
    affinity="rbf",
    n_neighbors=10,
    eigen_tol=0.0,
    assign_labels="kmeans",
    degree=3,
    coef0=1,
    kernel_params=None,
    n_jobs=-1,
    **kwargs
):
    """Spectral clustering.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    n_clusters : int
        The number of clusters to form as well as the number of centroids to
        generate.
    eigen_solver : str, optional, default: None
        The eigenvalue decomposition strategy passed to scikit-learn
        ('arpack', 'lobpcg', or 'amg').
    random_state : int or RandomState, optional, default: None
        The seed used to generate replicable results.
    n_init : int, optional, default: 10
        Number of times the k-means label assignment runs with different
        centroid seeds.
    gamma : float, optional, default: 1.0
        Kernel coefficient for the rbf affinity.
    affinity : str, optional, default: 'rbf'
        How to construct the affinity matrix (e.g. 'rbf' or
        'nearest_neighbors').
    n_neighbors : int, optional, default: 10
        Number of neighbors used when affinity='nearest_neighbors'.
    eigen_tol : float, optional, default: 0.0
        Stopping tolerance for the eigendecomposition when using arpack.
    assign_labels : str, optional, default: 'kmeans'
        Strategy for assigning labels in the embedding space ('kmeans' or
        'discretize').
    degree : int, optional, default: 3
        Degree of the polynomial kernel.
    coef0 : float, optional, default: 1
        Zero coefficient for polynomial and sigmoid kernels.
    kernel_params : dict, optional, default: None
        Parameters passed to a kernel given as a callable.
    n_jobs : int, optional, default: -1
        Number of parallel jobs.

    Returns
    -------
    model: sklearn SpectralClustering instance

    """
    model = SpectralClustering(
        n_clusters=n_clusters,
        eigen_solver=eigen_solver,
        random_state=random_state,
        n_init=n_init,
        gamma=gamma,
        affinity=affinity,
        n_neighbors=n_neighbors,
        eigen_tol=eigen_tol,
        assign_labels=assign_labels,
        degree=degree,
        coef0=coef0,
        kernel_params=kernel_params,
        n_jobs=n_jobs,
    )
    model.fit(X)
    return model

def gaussian_mixture(
    X,
    n_clusters=5,
    covariance_type="full",
    best_model=False,
    max_clusters=10,
    random_state=None,
    **kwargs
):
    """Clustering with Gaussian Mixture Model.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    n_clusters : int, optional, default: 5
        The number of clusters to form.
    covariance_type : str, optional, default: "full"
        The covariance parameter passed to scikit-learn's GaussianMixture
        algorithm
    best_model : bool, optional, default: False
        Option for finding endogenous K according to Bayesian Information
        Criterion
    max_clusters : int, optional, default: 10
        The max number of clusters to test if using `best_model` option
    random_state : int, optional, default: None
        The seed used to generate replicable results

    Returns
    -------
    model: sklearn GaussianMixture instance

    """
    if random_state is None:
        warn(
            "Note: Gaussian Mixture Clustering is probabilistic--"
            "cluster labels may be different for different runs. If you need "
            "consistency, you should set the `random_state` parameter"
        )

    if best_model is True:
        # BIC-based selection routine adapted from
        # https://plot.ly/scikit-learn/plot-gmm-selection/
        lowest_bic = np.inf
        bic = []
        maxn = max_clusters + 1
        n_components_range = range(1, maxn)
        cv_types = ["spherical", "tied", "diag", "full"]
        for cv_type in cv_types:
            for n_components in n_components_range:
                # fit a Gaussian mixture with EM and keep the lowest-BIC model
                gmm = GaussianMixture(
                    n_components=n_components,
                    random_state=random_state,
                    covariance_type=cv_type,
                )
                gmm.fit(X)
                bic.append(gmm.bic(X))
                if bic[-1] < lowest_bic:
                    lowest_bic = bic[-1]
                    best_gmm = gmm
        model = best_gmm
    else:
        model = GaussianMixture(
            n_components=n_clusters,
            random_state=random_state,
            covariance_type=covariance_type,
        )
    model.fit(X)
    model.labels_ = model.predict(X)
    return model
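
# Example (illustrative sketch): with ``best_model=True`` the wrapper scans
# covariance structures and component counts up to ``max_clusters`` and keeps
# the fit with the lowest BIC; hard labels come from ``model.predict(X)`` and
# are attached as ``labels_``:
#
#   X = np.random.random((200, 3))
#   model = gaussian_mixture(X, best_model=True, max_clusters=6, random_state=0)
#   model.n_components   # endogenously selected K
#   model.labels_        # hard cluster assignments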

def hdbscan(X, min_cluster_size=5, gen_min_span_tree=True, **kwargs):
    """Clustering with Hierarchical DBSCAN.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    min_cluster_size : int, default: 5
        the minimum number of points necessary to generate a cluster
    gen_min_span_tree : bool, default: True
        whether to compute and store the minimum spanning tree for later
        analysis

    Returns
    -------
    model: hdbscan HDBSCAN instance

    """
    try:
        from hdbscan import HDBSCAN
    except ImportError:
        raise ImportError(
            "You must have the hdbscan package installed to use this function"
        )
    model = HDBSCAN(
        min_cluster_size=min_cluster_size, gen_min_span_tree=gen_min_span_tree
    )
    model.fit(X)
    return model
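
# Example (illustrative sketch): HDBSCAN chooses the number of clusters itself
# and marks low-density observations as noise with the label -1:
#
#   X = np.random.random((300, 2))
#   model = hdbscan(X, min_cluster_size=10)
#   set(model.labels_)   # noise points, if any, appear as -1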
# Spatially Explicit/Encouraged Methods

def ward_spatial(X, w, n_clusters=5, **kwargs):
    """Agglomerative clustering using Ward linkage with a spatial
    connectivity constraint.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    w : PySAL W instance
        spatial weights matrix
    n_clusters : int, optional, default: 5
        The number of clusters to form.

    Returns
    -------
    model: sklearn AgglomerativeClustering instance

    """
    model = AgglomerativeClustering(
        n_clusters=n_clusters, connectivity=w.sparse, linkage="ward"
    )
    model.fit(X)
    return model
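
# Example (illustrative sketch): the spatial methods additionally require a
# PySAL W. Assuming a GeoDataFrame ``gdf`` with attribute columns ``cols`` and
# the libpysal package (the source of the W instances used here):
#
#   from libpysal.weights import Queen
#   w = Queen.from_dataframe(gdf)     # contiguity-based spatial weights
#   model = ward_spatial(gdf[cols].values, w, n_clusters=5)
#   gdf["cluster"] = model.labels_    # clusters respect the contiguity graph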

def spenc(X, w, n_clusters=5, gamma=1, **kwargs):
    """Spatially encouraged spectral clustering :cite:`wolf2018`.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    w : PySAL W instance
        spatial weights matrix
    n_clusters : int, optional, default: 5
        The number of clusters to form.
    gamma : int, default: 1
        kernel coefficient passed to SPENC's affinity kernel

    Returns
    -------
    model: spenc SPENC instance

    """
    model = SPENC(n_clusters=n_clusters, gamma=gamma)
    model.fit(X, w.sparse)
    return model

def skater(
    X, w, n_clusters=5, floor=-np.inf, trace=False, islands="increase", **kwargs
):
    """SKATER spatial clustering algorithm.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    w : PySAL W instance
        spatial weights matrix
    n_clusters : int, optional, default: 5
        The number of clusters to form.
    floor : int, optional, default: -np.inf
        minimum number of observations each cluster must contain (passed to
        the underlying model as `quorum`)
    trace : bool, optional, default: False
        whether to store intermediate labelings as the tree is pruned
    islands : str, optional, default: "increase"
        island-handling strategy passed to the underlying Spanning_Forest.fit

    Returns
    -------
    model: region Spanning_Forest instance

    """
    model = Spanning_Forest()
    model.fit(
        n_clusters, w, data=X.values, quorum=floor, trace=trace, islands=islands
    )
    model.labels_ = model.current_labels_
    return model
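
# Example (illustrative sketch): ``skater`` (like ``azp`` and ``max_p`` below)
# reads ``X.values``, so pass a pandas DataFrame rather than a bare array.
# ``w`` is a PySAL W over the same observations, as above:
#
#   import pandas as pd
#   df = pd.DataFrame(np.random.random((50, 3)), columns=["a", "b", "c"])
#   model = skater(df, w, n_clusters=5, floor=5)   # >= 5 observations/cluster
#   model.labels_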

def azp(X, w, n_clusters=5, **kwargs):
    """AZP clustering algorithm.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    w : PySAL W instance
        spatial weights matrix
    n_clusters : int, optional, default: 5
        The number of clusters to form.

    Returns
    -------
    model: region AZP instance

    """
    model = AZP()
    model.fit_from_w(attr=X.values, w=w, n_regions=n_clusters)
    return model

def max_p(X, w, threshold_variable="count", threshold=10, **kwargs):
    """Max-p clustering algorithm :cite:`Duque2012`.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    w : PySAL W instance
        spatial weights matrix
    threshold_variable : str, default: "count"
        attribute variable to use as the floor when calculating regions
    threshold : int, default: 10
        minimum value of `threshold_variable` that each region must contain

    Returns
    -------
    model: region MaxPRegionsHeu instance

    """
    model = MaxPRegionsHeu()
    model.fit_from_w(w, X.values, threshold_variable, threshold)
    return model
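
# Example (illustrative sketch): max-p discovers the number of regions
# endogenously; ``threshold`` is the floor each region must reach on the
# threshold variable. With ``w`` as above and a DataFrame ``df`` that includes
# a ``count`` column:
#
#   model = max_p(df, w, threshold_variable="count", threshold=10)
#   model.labels_   # region ids; the number of regions is data-driven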