"""
Base class for cluster sets, i.e. clustering for a range of number of clusters
"""
try:
    # The ABC aliases were removed from ``collections`` in Python 3.10.
    from collections.abc import Sequence
except ImportError:  # pragma: no cover - very old interpreters only
    from collections import Sequence

import numpy as np

from phasik.classes.clustering import ClusterSet
from phasik.drawing.drawing import plot_events, plot_phases
from phasik.drawing.drawing_clusters import plot_cluster_sets, relabel_clustersets
from phasik.drawing.utils import adjust_margin, display_name
# Names exported by ``from <this module> import *``.
__all__ = ['ClusterSets']
class ClusterSets(Sequence):
    """Base class for sets of clusters (partitions) of timepoints.

    A ``ClusterSets`` behaves as a read-only sequence of ``ClusterSet``
    objects, one per requested maximum value in `ns_max`.

    Attributes
    ----------
    cluster_sets : sequence of phasik.ClusterSet
        The underlying ClusterSet objects (read-only property).
        ``clusters_sets`` is kept as a backward-compatible alias.
    clusters : numpy array of int
        Summary array of the cluster labels, with dim (len(ns_max), len(times))
    n_clusters : numpy array of int
        Number of clusters in each cluster set (partition)
    times : list of (int or float)
        Sorted list of time associated to each clustered snapshot.
        Taken from the first cluster set; must be the same in all sets.
    distance_metric : str
        Distance metric used to compute the distance between snapshots, e.g. 'euclidean',
        with sklearn.metrics.pairwise.paired_distances.
        It must be one of the options allowed by scipy.spatial.distance.pdist
        for its metric parameter (e.g. 'chebyshev', 'cityblock', 'correlation',
        'cosine', 'euclidean', 'hamming', 'jaccard', etc.), or a metric listed
        in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        Taken from the first cluster set; must be the same in all sets.
    n_max_type : str
        Method that was used to determine when to stop clustering when creating these
        cluster sets. e.g. A cluster set can be created by clustering until a particular
        number of clusters has been reached ('maxclust'), or until every cluster is at
        least a certain distance away from each other ('distance').
    ns_max : list of int
        List of values corresponding to the n_max_type described above, in other words,
        list of numbers clusters to be computed. The number of elements in this list
        is the number of ClusterSet computed.
    silhouettes_average : numpy array
        Value of average silhouette for each clustering
    """

    def __init__(self, cluster_sets, n_max_type, ns_max):
        """
        Parameters
        ----------
        cluster_sets : iterable of ClusterSet
            Any iterable is accepted; one that is not already a sequence
            (e.g. a generator) is materialised into a list.
        n_max_type : str
            Method that was used to determine when to stop clustering when creating these
            cluster sets. e.g. A cluster set can be created by clustering until a particular
            number of clusters has been reached ('maxclust'), or until every cluster is at
            least a certain distance away from each other ('distance')
        ns_max : list of int
            List of values corresponding to the n_max_type described above, in other words,
            list of numbers clusters to be computed. The number of elements in this list
            is the number of ClusterSet computed.

        Raises
        ------
        ValueError
            If `cluster_sets` is empty.
        """
        # The collection is traversed several times and indexed below, so a
        # one-shot iterable has to be stored first. Real sequences are kept
        # as-is so the caller's object is still the one returned later.
        if not isinstance(cluster_sets, Sequence):
            cluster_sets = list(cluster_sets)
        if len(cluster_sets) == 0:
            raise ValueError("cluster_sets must contain at least one ClusterSet")

        first = cluster_sets[0]

        self._cluster_sets = cluster_sets
        # One row of cluster labels per cluster set.
        self.clusters = np.array([cs.clusters for cs in cluster_sets])
        self.n_clusters = np.array([cs.n_clusters for cs in cluster_sets])
        self.times = first.times  # times must be the same in all sets
        self.distance_metric = first.distance_metric  # must be the same in all sets
        self.n_max_type = n_max_type
        self.ns_max = ns_max
        self.silhouettes_average = np.array(
            [cs.silhouette_average for cs in cluster_sets]
        )

    def __len__(self):
        """Return the number of ClusterSet objects held."""
        return len(self._cluster_sets)

    def __getitem__(self, key):
        """Return the ClusterSet(s) stored at `key` (index or slice)."""
        return self._cluster_sets[key]

    @property
    def cluster_sets(self):
        """Returns the underlying collection of ClusterSet"""
        return self._cluster_sets

    @property
    def clusters_sets(self):
        """Returns the underlying collection of ClusterSet

        Alias of `cluster_sets`, kept for backward compatibility.
        """
        return self._cluster_sets

    @classmethod
    def from_distance_matrix(cls, distance_matrix, n_max_type, ns_clusters_max, cluster_method):
        """Generates ClusterSets from a distance matrix

        One ClusterSet is built with ``ClusterSet.from_distance_matrix`` for
        each value in `ns_clusters_max`.

        Parameters
        ----------
        distance_matrix : phasik.DistanceMatrix
            Distance matrix from which to cluster
        n_max_type : str
            The method that determines when to stop clustering. For example, cluster set
            can be created by clustering until a particular number of clusters has been
            reached ('maxclust'), or until every cluster is at least a certain distance
            away from each other ('distance').
        ns_clusters_max : list of int
            List of values corresponding to the n_max_type described above, in other words,
            list of numbers clusters to be computed. The number of elements in this list
            is the number of ClusterSet computed.
        cluster_method : str
            Clustering method used to cluster the temporal network snapshots. Examples :
            'k_means', 'centroid', 'average', 'complete', 'weighted', 'median', 'single', 'ward'

        Returns
        -------
        ClusterSets
        """
        cluster_sets = [
            ClusterSet.from_distance_matrix(
                distance_matrix, n_max_type, n_max, cluster_method
            )
            for n_max in ns_clusters_max
        ]
        return cls(cluster_sets, n_max_type, ns_clusters_max)

    def plot(self, axs=None, coloring="consistent", with_silhouettes=False, with_n_clusters=False):
        """Plots these cluster sets as a scatter graph

        Thin wrapper around ``plot_cluster_sets``; all arguments are
        forwarded unchanged.

        Parameters
        ----------
        axs : matplotlib.Axes, optional
            Axes on which to plot
        coloring : {'ascending', 'consistent', None}
        with_silhouettes : bool
            If True, also plot the average silhouettes on a 2nd axis. Defaults to False.
        with_n_clusters : bool
            If True, also plot the actual number of clusters on a 3rd axis. Defaults to False.

        Returns
        -------
        The value returned by ``plot_cluster_sets``.
        """
        return plot_cluster_sets(self, axs, coloring, with_silhouettes, with_n_clusters)

    def plot_silhouette_samples(self, axs, coloring="consistent"):
        """Plot the silhouette samples of each cluster set, one per axes

        Parameters
        ----------
        axs : matplotlib.Axes, or (nested) list or numpy array of matplotlib.Axes
            Axes on which to plot; should hold at least as many items as there
            are cluster sets in this class. It is flattened before use, and
            cluster sets beyond the number of axes are not plotted.
        coloring : {'consistent', None}
            If 'consistent' (default), the cluster sets are first passed through
            ``relabel_clustersets``; any other value plots them as stored.

        Returns
        -------
        None
        """
        to_plot = relabel_clustersets(self) if coloring == "consistent" else self

        # ``.flatten()`` only exists on numpy arrays; going through an object
        # array also accepts plain lists, nested lists and a single Axes.
        flat_axs = np.asarray(axs, dtype=object).ravel()

        for cluster_set, ax in zip(to_plot._cluster_sets, flat_axs):
            cluster_set.plot_silhouette_samples(ax=ax)