Getting started

Demo - Dissimilarity Based Selection Synthetic Data

In this tutorial, we show how QC-Selector works with 2D synthetic data, where each axis represents one feature. Each example uses two synthetic datasets: the first consists of randomly generated data points, and the second consists of data points belonging to different clusters. Using 2D feature data makes the results easy to visualize.

Generating Synthetic Data

# install the selector module
!pip install git+https://github.com/theochem/Selector
!pip install rdkit
import selector
import matplotlib.pyplot as plt
import numpy as np
from typing import Any, Tuple, Union
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances

def generate_synthetic_data(n_samples: int = 100,
                            n_features: int = 2,
                            n_clusters: int = 2,
                            cluster_std: float = 1.0,
                            center_box: Tuple[float, float] = (-10.0, 10.0),
                            metric: str = "euclidean",
                            shuffle: bool = True,
                            random_state: int = 42,
                            pairwise_dist: bool = False,
                            **kwargs: Any,
                            ) -> Union[Tuple[np.ndarray, np.ndarray],
                                       Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """Create blob-shaped synthetic samples, optionally with their distance matrix.

    The samples are produced by scikit-learn's `make_blobs`; when requested,
    the pairwise distances are computed with scikit-learn's
    `pairwise_distances`.

    Parameters
    ----------
    n_samples : int, optional
        Number of samples to generate. Default=100.
    n_features : int, optional
        Number of features per sample. Default=2.
    n_clusters : int, optional
        Number of clusters; forwarded to `make_blobs` as `centers`. Default=2.
    cluster_std : float, optional
        Standard deviation of the clusters. Default=1.0.
    center_box : tuple[float, float], optional
        Bounding box for each cluster center when centers are generated at
        random. Default=(-10.0, 10.0).
    metric : str, optional
        Metric used for the pairwise distances. For the supported metrics see
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html.
        Only used when `pairwise_dist` is True. Default="euclidean".
    shuffle : bool, optional
        Whether to shuffle the samples. Default=True.
    random_state : int, optional
        Random state used for generating the synthetic data. Default=42.
    pairwise_dist : bool, optional
        If True, also compute and return the pairwise distances between
        samples. Default=False.
    **kwargs : Any, optional
        Extra keyword arguments forwarded to `pairwise_distances`.

    Returns
    -------
    syn_data : np.ndarray
        The synthetic data.
    class_labels : np.ndarray
        The integer cluster-membership label of each sample.
    dist : np.ndarray
        The pairwise distances between samples. Only returned when
        `pairwise_dist` is True.

    """
    blob_options = {
        "n_samples": n_samples,
        "n_features": n_features,
        "centers": n_clusters,
        "cluster_std": cluster_std,
        "center_box": center_box,
        "shuffle": shuffle,
        "random_state": random_state,
        # keep the return value a plain (data, labels) pair
        "return_centers": False,
    }
    # pylint: disable=W0632
    syn_data, class_labels = make_blobs(**blob_options)

    # Two-element result unless the caller asked for the distance matrix.
    if not pairwise_dist:
        return syn_data, class_labels

    dist = pairwise_distances(X=syn_data, Y=None, metric=metric, **kwargs)
    return syn_data, class_labels, dist
# Data set 1: 100 two-feature samples drawn from a single blob, together with
# their Euclidean pairwise-distance matrix.
coords, class_labels, arr_dist = generate_synthetic_data(
    n_samples=100,
    n_features=2,
    n_clusters=1,
    pairwise_dist=True,
    metric="euclidean",
    random_state=42,
)

# Data set 2: same size and seed, but spread over three blobs.
coords_cluster, class_labels_cluster, arr_dist_cluster = generate_synthetic_data(
    n_samples=100,
    n_features=2,
    n_clusters=3,
    pairwise_dist=True,
    metric="euclidean",
    random_state=42,
)
def graph_data(coords, selected=None, reference=False):
    """Scatter-plot 2D points and optionally highlight a selection.

    Parameters
    ----------
    coords : np.ndarray
        Point coordinates; column 0 is plotted on the x axis and column 1 on
        the y axis.
    selected : sequence of int, optional
        Row indices of `coords` to highlight in red. Each highlighted point is
        annotated with its 1-based position in `selected`. Default=None (no
        highlighting).
    reference : bool, optional
        If True, the first point (row 0) is drawn in black. Default=False.

    """
    plt.figure(dpi=150)
    plt.scatter(coords[:, 0], coords[:, 1])
    # Compare against None explicitly: truth-testing a NumPy index array with
    # more than one element raises ValueError.
    if selected is not None:
        for order, idx in enumerate(selected, start=1):
            plt.scatter(coords[idx, 0], coords[idx, 1], c='r')
            plt.text(coords[idx, 0], coords[idx, 1], str(order))
    if reference:
        plt.scatter(coords[0, 0], coords[0, 1], c='black')
    plt.show()
# Show both raw data sets before any selection is made.
graph_data(coords=coords)
graph_data(coords=coords_cluster)
_images/starting_6_0.png _images/starting_6_1.png

Running Dissimilarity Algorithms

Brute Strength - MaxMin

# MaxMin method
from selector.methods.partition import *
from selector.methods.dissimilarity import *

# Single-blob data: pick 12 points with MaxMin from the precomputed
# distance matrix.
selector = MaxMin()
selected_ids1 = selector.select(arr=arr_dist, size=12)

# Three-cluster data: same call, with the cluster labels supplied as well.
selector = MaxMin()
selected_ids2 = selector.select(
    arr=arr_dist_cluster,
    labels=class_labels_cluster,
    size=12,
)
graph_data(coords, selected=selected_ids1)
graph_data(coords_cluster, selected=selected_ids2)
_images/starting_10_0.png _images/starting_10_1.png

Adapted Optimizable K-Dissimilarity Selection (OptiSim)

# Single-blob data: 12 points chosen by OptiSim from the raw coordinates.
selected_id3 = OptiSim().select(coords, size=12)
# Three-cluster data: same call, with the cluster labels supplied as well.
selected_id4 = OptiSim().select(
    coords_cluster,
    size=12,
    labels=class_labels_cluster,
)

graph_data(coords, selected=selected_id3)
graph_data(coords_cluster, selected=selected_id4)
_images/starting_12_0.png _images/starting_12_1.png

Directed Sphere Exclusion

# Single-blob data: 12 points chosen by directed sphere exclusion.
selected_id5 = DirectedSphereExclusion().select(coords, size=12)
# Three-cluster data: same call, with the cluster labels supplied as well.
selected_id6 = DirectedSphereExclusion().select(
    coords_cluster,
    size=12,
    labels=class_labels_cluster,
)

graph_data(coords, selected=selected_id5)
graph_data(coords_cluster, selected=selected_id6)
_images/starting_14_0.png _images/starting_14_1.png

Grid Partitioning Method

# Single-blob data: 12 points chosen with the "equisized_independent" grid.
selected_id7 = GridPartitioning(2, "equisized_independent").select(coords, size=12)
# Three-cluster data: same configuration, with the cluster labels supplied.
selected_id8 = GridPartitioning(2, "equisized_independent").select(
    coords_cluster,
    size=12,
    labels=class_labels_cluster,
)

graph_data(coords, selected=selected_id7)
graph_data(coords_cluster, selected=selected_id8)
# 20, 5, 3, "equisized_independent"
_images/starting_16_0.png _images/starting_16_1.png

MaxMin selection on clustered data, without providing the cluster labels

# Select 15 points from the three-cluster distance matrix; no `labels`
# argument is passed here.
selector = MaxMin()
selected_ids = selector.select(arr=arr_dist_cluster, size=15)
graph_data(coords_cluster, selected=selected_ids)
_images/starting_18_0.png
import numpy as np
from sklearn.metrics import pairwise_distances
# Fix the seed so the mocked clusters are reproducible.
np.random.seed(42)
# Three 2D Gaussian clusters of unequal size (3, 6 and 7 points) centred at
# 0, 10 and 20; the draw order matters for reproducibility.
cluster_one = np.random.normal(loc=0, scale=1, size=(3, 2))
cluster_two = np.random.normal(loc=10, scale=1, size=(6, 2))
cluster_three = np.random.normal(loc=20, scale=1, size=(7, 2))
# One integer label per point, in the same order as the stacked coordinates.
labels_mocked = np.hstack([[0] * 3, [1] * 6, [2] * 7])

mocked_cluster_coords = np.vstack((cluster_one, cluster_two, cluster_three))
# MaxMin is given a callable that turns the coordinates into Euclidean
# pairwise distances.
selector = MaxMin(lambda points: pairwise_distances(points, metric='euclidean'))
selected_mocked = selector.select(
    mocked_cluster_coords,
    size=15,
    labels=labels_mocked,
)

graph_data(mocked_cluster_coords, selected=selected_mocked)

selected_mocked
_images/starting_19_0.png
[0, 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 10, 11, 9, 13]

Brute Strength - maxsum

# Brute-strength selection of 12 points with the "maxsum" method, driven by
# the precomputed distance matrix of the single-blob data.
# NOTE(review): `DissimilaritySelection` is not defined in this file - confirm
# it is provided by the star imports from `selector.methods` before running.
selector = DissimilaritySelection(num_selected=12,
                                  arr_dist=arr_dist,
                                  random_seed=42,
                                  method="maxsum")
selector.starting_idx = 0
selected_ids2 = selector.select()
# graph_data takes the coordinates first and the selected indices second.
graph_data(coords, selected_ids2)
_images/starting_22_0.png

Grid Partitioning - Equisized Independent

# Grid partitioning (equisized independent) selection of 12 points.
# NOTE(review): the original passed grid_method="grid_partioning" (misspelled,
# and a method name rather than a grid type) and no dissim_func. The two
# keywords below mirror the "Equisized Dependent" cell - confirm against the
# DissimilaritySelection API.
# NOTE(review): `DissimilaritySelection` is not defined in this file - confirm
# it is provided by the star imports from `selector.methods` before running.
selector = DissimilaritySelection(features=coords,
                                  num_selected=12,
                                  arr_dist=arr_dist,
                                  dissim_func="grid_partitioning",
                                  grid_method="equisized_independent",
                                  random_seed=42)
selector.starting_idx = 0
selected_ids3 = selector.select()
# graph_data takes the coordinates first and the selected indices second.
graph_data(coords, selected_ids3)
_images/starting_25_0.png

Grid Partitioning - Equisized Dependent

# Grid partitioning (equisized dependent) selection of 12 points from the
# single-blob coordinates.
# NOTE(review): `DissimilaritySelection` is not defined in this file - confirm
# it is provided by the star imports from `selector.methods` before running.
selector = DissimilaritySelection(num_selected=12,
                                  features = coords,
                                  random_seed=42,
                                  dissim_func="grid_partitioning",
                                  grid_method="equisized_dependent")
selector.starting_idx = 0
selected_ids4 = selector.select()
# graph_data takes the coordinates first and the selected indices second.
graph_data(coords, selected_ids4)
_images/starting_28_0.png

Directed Sphere Exclusion

# Directed sphere exclusion selection of 12 points from the single-blob
# coordinates.
# NOTE(review): `DissimilaritySelection` is not defined in this file - confirm
# it is provided by the star imports from `selector.methods` before running.
selector = DissimilaritySelection(num_selected=12,
                                  features=coords,
                                  dissim_func="sphere_exclusion",
                                  random_seed=42)
selector.starting_idx = 0
selected_ids5 = selector.select()
# Pass the coordinates first; `reference=True` additionally marks point 0 in
# black. The original call bound True to `selected` instead.
graph_data(coords, selected_ids5, reference=True)
_images/starting_31_0.png

OptiSim

# OptiSim selection of 12 points from the single-blob coordinates.
# NOTE(review): `DissimilaritySelection` is not defined in this file - confirm
# it is provided by the star imports from `selector.methods` before running.
selector = DissimilaritySelection(num_selected=12,
                                  features=coords,
                                  dissim_func="optisim",
                                  random_seed=42)
selector.starting_idx = 0
selected_ids6 = selector.select()
# graph_data takes the coordinates first and the selected indices second.
graph_data(coords, selected_ids6)
_images/starting_34_0.png