Module sortedness.local
Expand source code
# Copyright (c) 2023. Davi Pereira dos Santos
# This file is part of the sortedness project.
# Please respect the license - more about this in the section (*) below.
#
# sortedness is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# sortedness is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with sortedness. If not, see <http://www.gnu.org/licenses/>.
#
# (*) Removing authorship by any means, e.g. by distribution of derived
# works or verbatim, obfuscated, compiled or rewritten versions of any
# part of this work is illegal and it is unethical regarding the effort and
# time spent here.
#
import gc
from functools import partial
import numpy as np
import pathos.multiprocessing as mp
from numpy import eye, mean, sqrt, ndarray
from numpy.random import permutation
from scipy.spatial.distance import cdist, pdist, squareform
from scipy.stats import rankdata, kendalltau, weightedtau
from sortedness.parallel import rank_alongrow, rank_alongcol
def remove_diagonal(X):
n_points = len(X)
nI = ~eye(n_points, dtype=bool) # Mask to remove diagonal.
return X[nI].reshape(n_points, -1)
weightedtau.isweightedtau = True
def sortedness(X, X_, i=None, f=weightedtau, distance_dependent=True, return_pvalues=False, parallel=True, parallel_n_trigger=500, parallel_kwargs=None, **kwargs):
"""
Calculate the sortedness (stress-like correlation-based measure that ignores distance proportions) value for each point
Functions available as scipy correlation coefficients:
ฯ-sortedness (Spearman),
๐-sortedness (Kendall's ๐),
w๐-sortedness (Sebastiano Vigna weighted Kendall's ๐) โ default
# TODO?: add flag to break extremely rare cases of ties that persist after projection (implies a much slower algorithm)
This probably doesn't make any difference on the result, except on categorical, pathological or toy datasets
Values can be lower due to the presence of ties, but only when the projection isn't perfect for all points.
In the end, it might be even desired to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to be kept than a specific order.
Parameters
----------
X
matrix with an instance by row in a given space (often the original one)
X_
matrix with an instance by row in another given space (often the projected one)
i
None: calculate sortedness for all instances
`int`: index of the instance of interest
f
Distance criteria:
callable = scipy correlation function:
weightedtau (weighted Kendallโs ฯ is the default), kendalltau, spearmanr
Meaning of resulting values for correlation-based functions:
1.0: perfect projection (regarding order of examples)
0.0: random projection (enough distortion to have no information left when considering the overall ordering)
-1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering)
return_pvalues
For scipy correlation functions, return a 2-column matrix 'corr, pvalue' instead of just 'corr'
This makes more sense for Kendall's tau. [the weighted version might not have yet a established pvalue calculation method at this moment]
The null hypothesis is that the projection is random, i.e., sortedness = 0.0.
parallel
None: Avoid high-memory parallelization
True: Full parallelism
False: No parallelism
parallel_kwargs
Any extra argument to be provided to pathos parallelization
parallel_n_trigger
Threshold to disable parallelization for small n values
kwargs
Arguments to be passed to the correlation measure
Returns
-------
list of sortedness values (or tuples that also include pvalues)
>>> ll = [[i] for i in range(17)]
>>> a, b = np.array(ll), np.array(ll[0:1] + list(reversed(ll[1:])))
>>> b.ravel()
array([ 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
>>> r = sortedness(a, b)
>>> min(r), max(r)
(-1.0, 0.998638259786)
>>> rnd = np.random.default_rng(0)
>>> rnd.shuffle(ll)
>>> b = np.array(ll)
>>> b.ravel()
array([ 2, 10, 3, 11, 0, 4, 7, 5, 16, 12, 13, 6, 9, 14, 8, 1, 15])
>>> r = sortedness(a, b)
>>> r
array([ 0.19597951, -0.37003858, 0.06014087, -0.42174564, 0.2448619 ,
0.24635858, 0.16814336, 0.06919001, 0.1627886 , 0.33136454,
0.41592274, -0.10615388, 0.17549727, 0.17371479, -0.21360864,
-0.3677769 , -0.08292823])
>>> min(r), max(r)
(-0.421745643027, 0.415922739891)
>>> round(mean(r), 12)
0.040100605153
>>> import numpy as np
>>> from functools import partial
>>> from scipy.stats import spearmanr, weightedtau
>>> me = (1, 2)
>>> cov = eye(2)
>>> rng = np.random.default_rng(seed=0)
>>> original = rng.multivariate_normal(me, cov, size=12)
>>> from sklearn.decomposition import PCA
>>> projected2 = PCA(n_components=2).fit_transform(original)
>>> projected1 = PCA(n_components=1).fit_transform(original)
>>> np.random.seed(0)
>>> projectedrnd = permutation(original)
>>> s = sortedness(original, original)
>>> min(s), max(s), s
(1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
>>> s = sortedness(original, projected2)
>>> min(s), max(s), s
(1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
>>> s = sortedness(original, projected1)
>>> min(s), max(s), s
(0.432937128932, 0.944810120534, array([0.43293713, 0.53333015, 0.88412753, 0.94481012, 0.81485109,
0.81330052, 0.76691474, 0.91169619, 0.88998817, 0.90102615,
0.61372341, 0.86996213]))
>>> s = sortedness(original, projectedrnd)
>>> min(s), max(s), s
(-0.578096068617, 0.396112816715, array([ 0.21296126, -0.57809607, 0.33083346, -0.00638865, -0.16007932,
0.39611282, -0.27357934, 0.04360717, -0.54534052, 0.19042181,
-0.32805008, -0.04178184]))
>>> sortedness(original, original, f=kendalltau, return_pvalues=True)
array([[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08]])
>>> sortedness(original, projected2, f=kendalltau)
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
>>> sortedness(original, projected1, f=kendalltau)
array([0.56363636, 0.52727273, 0.81818182, 0.96363636, 0.70909091,
0.85454545, 0.74545455, 0.92727273, 0.85454545, 0.89090909,
0.6 , 0.74545455])
>>> sortedness(original, projectedrnd, f=kendalltau)
array([ 0.2 , -0.38181818, 0.23636364, -0.09090909, -0.05454545,
0.23636364, -0.09090909, 0.23636364, -0.63636364, -0.01818182,
-0.2 , -0.01818182])
>>> wf = partial(weightedtau, weigher=lambda x: 1 / (x**2 + 1))
>>> sortedness(original, original, f=wf, return_pvalues=True)
array([[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan]])
>>> sortedness(original, projected2, f=wf)
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
>>> sortedness(original, projected1, f=wf)
array([0.89469168, 0.89269637, 0.92922928, 0.99721669, 0.86529591,
0.97806422, 0.94330979, 0.99357377, 0.87959707, 0.92182767,
0.87256459, 0.87747329])
>>> sortedness(original, projectedrnd, f=wf)
array([ 0.23771513, -0.2790059 , 0.3718005 , -0.16623167, 0.06179047,
0.40434396, -0.00130294, 0.46569739, -0.67581876, -0.23852189,
-0.39125007, 0.12131153])
>>> np.random.seed(14980)
>>> projectedrnd = permutation(original)
>>> sortedness(original, projectedrnd)
array([ 0.21666209, -0.23798067, 0.16830046, -0.5559795 , -0.23470751,
0.35716692, -0.0054951 , -0.35890385, 0.13101743, -0.48460059,
-0.46430457, 0.27400939])
>>> sortedness(original, np.flipud(original))
array([-0.26471073, 0.39699442, 0.05022757, 0.21215372, 0.23467884,
-0.33277347, -0.31616633, 0.19381204, -0.16114967, 0.08829425,
0.3383928 , -0.31012412])
>>> original = np.array([[0],[1],[2],[3],[4],[5],[6]])
>>> projected = np.array([[6],[5],[4],[3],[2],[1],[0]])
>>> sortedness(original, projected)
array([1., 1., 1., 1., 1., 1., 1.])
>>> projected = np.array([[0],[6],[5],[4],[3],[2],[1]])
>>> sortedness(original, projected)
array([-1. , 0.42263889, 0.80668827, 0.98180162, 0.98180162,
0.82721863, 0.61648537])
>>> sortedness(original, projected, 1)
0.4226388949217901
>>> sortedness([[1,2,3,3],[1,2,7,3],[3,4,8,5],[1,8,3,5]], [[2,1,2,3],[3,1,2,3],[5,4,5,6],[9,7,6,3]], 1)
0.18181818181818177
"""
isweightedtau = False
if hasattr(f, "isweightedtau") and f.isweightedtau:
isweightedtau = True
if "rank" not in kwargs:
kwargs["rank"] = None
if parallel_kwargs is None:
parallel_kwargs = {}
result, pvalues = [], []
npoints = len(X)
if i is not None:
x = X[i] if isinstance(X, (ndarray, list)) else X.iloc[i]
x_ = X_[i] if isinstance(X_, (ndarray, list)) else X_.iloc[i]
X = np.delete(X, i, axis=0)
X_ = np.delete(X_, i, axis=0)
d_ = np.sum((X_ - x_) ** 2, axis=1)
if distance_dependent:
d = np.sum((X - x) ** 2, axis=1)
scores_X, scores_X_ = (-d, -d_) if isweightedtau else (d, d_)
corr, pvalue = f(scores_X, scores_X_, **kwargs)
return (corr, pvalue) if return_pvalues else corr
else: # pragma: no cover
raise Exception(f"Not implemented yet; it is an open problem")
D = abs(X - x).T
scores_X, scores_x_ = (-D, -d_) if isweightedtau else (D, d_)
for j in range(len(scores_X)):
corr, pvalue = f(scores_X[j], scores_x_, **kwargs)
result.append(round(corr, 12))
pvalues.append(round(pvalue, 12))
return (mean(result), mean(pvalues)) if return_pvalues else mean(result)
if distance_dependent:
tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map
sqdist_X, sqdist_X_ = tmap(lambda M: cdist(M, M, metric='sqeuclidean'), [X, X_])
D = remove_diagonal(sqdist_X)
D_ = remove_diagonal(sqdist_X_)
scores_X, scores_X_ = (-D, -D_) if isweightedtau else (D, D_)
for i in range(len(X)):
corr, pvalue = f(scores_X[i], scores_X_[i], **kwargs)
result.append(round(corr, 12))
pvalues.append(round(pvalue, 12))
else: # pragma: no cover
raise Exception(f"Not implemented yet; it is an open problem")
# for i in range(len(X)):
# corr, pvalue = sortedness(X, X_, i, f=f, distance_dependent=False, return_pvalues=True,
# parallel=parallel, parallel_n_trigger=parallel_n_trigger, parallel_kwargs=parallel_kwargs, **kwargs)
# result.append(round(corr, 12))
# pvalues.append(round(pvalue, 12))
result = np.array(result, dtype=float)
if return_pvalues:
return np.array(list(zip(result, pvalues)))
return result
def pwsortedness(X, X_, i=None, parallel=True, parallel_n_trigger=200, batches=10, debug=False, dist=None, cython=False, **parallel_kwargs):
"""
Local pairwise sortedness (ฮ๐w) based on Sebastiano Vigna weighted Kendall's ๐
Importance rankings are calculated internally based on proximity of each pair to the point of interest.
# TODO?: add flag to break extremely rare cases of ties that persist after projection (implies a much slower algorithm)
This probably doesn't make any difference on the result, except on categorical, pathological or toy datasets
Values can be lower due to the presence of ties, but only when the projection isn't prefect for all points.
In the end, it might be even desired to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to be kept than a specific order.
Parameters
----------
X
Original dataset
X_
Projected points
i
None: calculate pwsortedness for all instances
`int`: index of the instance of interest
parallel
None: Avoid high-memory parallelization
True: Full parallelism
False: No parallelism
parallel_kwargs
Any extra argument to be provided to pathos parallelization
parallel_n_trigger
Threshold to disable parallelization for small n values
batches
Parallel batch size
debug
Whether to print more info
dist
Provide distance matrices (D, D_) instead of points
X and X_ should be None
cython
Whether to:
(True) improve speed by ~2x; or,
(False) be more compatible/portable.
Returns
-------
Numpy vector
>>> import numpy as np
>>> from functools import partial
>>> from scipy.stats import spearmanr, weightedtau
>>> m = (1, 12)
>>> cov = eye(2)
>>> rng = np.random.default_rng(seed=0)
>>> original = rng.multivariate_normal(m, cov, size=12)
>>> from sklearn.decomposition import PCA
>>> projected2 = PCA(n_components=2).fit_transform(original)
>>> projected1 = PCA(n_components=1).fit_transform(original)
>>> np.random.seed(0)
>>> projectedrnd = permutation(original)
>>> r = pwsortedness(original, original)
>>> min(r), max(r), round(mean(r), 12)
(1.0, 1.0, 1.0)
>>> r = pwsortedness(original, projected2)
>>> min(r), round(mean(r), 12), max(r)
(1.0, 1.0, 1.0)
>>> r = pwsortedness(original, projected1)
>>> min(r), round(mean(r), 12), max(r)
(0.730078995423, 0.774457348878, 0.837310352695)
>>> r = pwsortedness(original, projected2[:, 1:])
>>> min(r), round(mean(r), 12), max(r)
(0.18672441995, 0.281712132364, 0.420214364305)
>>> r = pwsortedness(original, projectedrnd)
>>> min(r), round(mean(r), 12), max(r)
(-0.198780473657, -0.064598420372, 0.147224384381)
>>> pwsortedness(original, projected1)[1]
0.730078995423
>>> pwsortedness(original, projected1, cython=True)[1]
0.730078995423
>>> pwsortedness(original, projected1, i=1)
0.730078995423
"""
npoints = len(X) if X is not None else len(dist[0]) # pragma: no cover
tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map
pmap = mp.ProcessingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map
if debug: # pragma: no cover
print(1)
thread = lambda M: -pdist(M, metric="sqeuclidean")
# if i is not None:
# tmp, tmp_ = X[0], X_[0]
# X[0], X_[0] = X[i], X_[i]
# X[i], X_[i] = tmp, tmp_
scores_X, scores_X_ = tmap(thread, [X, X_]) if X is not None else (-squareform(dist[0]), -squareform(dist[1]))
if debug: # pragma: no cover
print(2)
def makeM(E, single=False):
n = len(E)
m = (n ** 2 - n) // 2
M = np.zeros((m, 1 if single else n))
if debug: # pragma: no cover
print(4)
c = 0
for i in range(n - 1): # a bit slow, but only a fraction of wtau (~5%)
h = n - i - 1
d = c + h
M[c:d] = E[i] + E[i + 1:]
c = d
if debug: # pragma: no cover
print(5)
del E
gc.collect()
return M.T
D = squareform(-scores_X) if dist is None else dist[0]
if i is None:
if debug: # pragma: no cover
print(3)
n = len(D)
M = makeM(D)
if debug: # pragma: no cover
print(6)
R = rank_alongrow(M, step=n // batches, parallel=parallel, **parallel_kwargs).T
del M
gc.collect()
if debug: # pragma: no cover
print(7)
if cython:
from sortedness.wtau import parwtau
res = np.round(parwtau(scores_X, scores_X_, npoints, R, parallel=parallel, **parallel_kwargs), 12)
del R
gc.collect()
return res
else:
def thread(r):
corr, pvalue = weightedtau(scores_X, scores_X_, rank=r)
return round(corr, 12), round(pvalue, 12)
result, pvalues = [], []
lst = (R[:, i] for i in range(len(X)))
for corrs, pvalue in pmap(thread, lst):
result.append(corrs)
pvalues.append(pvalue)
result = np.array(result, dtype=float)
return result
M = makeM(D[:, i:i + 1], single=True)
r = rankdata(M, axis=1, method="average")[0].astype(int) - 1
return round(weightedtau(scores_X, scores_X_, r)[0], 12)
# D = cdist(X, X[:1])
# M = makeM(D, single=True)
# r = rankdata(M[0], method="average").astype(int) - 1
# return round(weightedtau(scores_X, scores_X_, r)[0], 12)
def rsortedness(X, X_, f=weightedtau, return_pvalues=False, parallel=True, parallel_n_trigger=500, parallel_kwargs=None, **kwargs): # pragma: no cover
"""
Reciprocal sortedness: consider the neighborhood relation the other way around
Might be good to assess the effect of a projection on hubness, and also to serve as a loss function for a custom projection algorithm.
WARNING: this function is experimental, i.e., not as well tested as the others; it might need a better algorithm/fomula as well.
# TODO?: add flag to break (not so rare) cases of ties that persist after projection (implies a much slower algorithm)
This probably doesn't make any difference on the result, except on categorical, pathological or toy datasets
Values can be lower due to the presence of ties, but only when the projection isn't prefect for all points.
In the end, it might be even desired to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to be kept than a specific order.
Parameters
----------
f
Distance criteria:
callable = scipy correlation function:
weightedtau (weighted Kendallโs ฯ is the default), kendalltau, spearmanr
Meaning of resulting values for correlation-based functions:
1.0: perfect projection (regarding order of examples)
0.0: random projection (enough distortion to have no information left when considering the overall ordering)
-1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering)
return_pvalues
For scipy correlation functions, return a 2-column matrix 'corr, pvalue' instead of just 'corr'
This makes more sense for Kendall's tau. [the weighted version might not have yet a established pvalue calculation method at this moment]
The null hypothesis is that the projection is random, i.e., sortedness = 0.0.
parallel
None: Avoid high-memory parallelization
True: Full parallelism
False: No parallelism
parallel_kwargs
Any extra argument to be provided to pathos parallelization
parallel_n_trigger
Threshold to disable parallelization for small n values
kwargs
Arguments to be passed to the correlation measure
Returns
-------
Numpy vector
# >>> ll = [[i, ] for i in range(17)]
# >>> a, b = np.array(ll), np.array(ll[0:1] + list(reversed(ll[1:])))
# >>> b.ravel()
# array([ 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
# >>> #r = rsortedness(a, b)
# >>> #min(r), max(r)
# (-0.707870893072, 0.962964134515)
#
# >>> rnd = np.random.default_rng(0)
# >>> rnd.shuffle(ll)
# >>> b = np.array(ll)
# >>> b.ravel()
# array([ 2, 10, 3, 11, 0, 4, 7, 5, 16, 12, 13, 6, 9, 14, 8, 1, 15])
# >>> r = rsortedness(b, a)
# >>> r
# array([ 0.36861667, -0.07147685, 0.39350142, -0.04581926, -0.03951645,
# 0.31100414, -0.18107755, 0.28268222, -0.29248869, 0.19177107,
# 0.48076521, -0.17640674, 0.13098522, 0.34833996, 0.01844146,
# -0.58291518, 0.34742337])
# >>> min(r), max(r), round(mean(r), 12)
# (-0.582915181328, 0.480765206133, 0.087284118913)
# >>> rsortedness(b, a, f=kendalltau, return_pvalues=True)
# array([[ 0.2316945 , 0.04863508],
# [-0.37005403, 0.21347624],
# [ 0.17618709, 0.04863508],
# [-0.35418588, 0.21347624]])
"""
if hasattr(f, "isweightedtau") and f.isweightedtau and "rank" not in kwargs:
kwargs["rank"] = None
if parallel_kwargs is None:
parallel_kwargs = {}
npoints = len(X)
tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map
pmap = mp.ProcessingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map
D, D_ = tmap(lambda M: cdist(M, M, metric="sqeuclidean"), [X, X_])
R, R_ = (rank_alongcol(M, parallel=parallel) for M in [D, D_])
scores_X, scores_X_ = tmap(lambda M: -remove_diagonal(M), [R, R_]) # For f=weightedtau: scores = -ranks.
if hasattr(f, "isparwtau"): # pragma: no cover
raise Exception("TODO: Pairtau implementation disagree with scipy weightedtau")
# return parwtau(scores_X, scores_X_, npoints, parallel=parallel, **kwargs)
def thread(l):
lst1 = []
lst2 = []
for i in l:
corr, pvalue = f(scores_X[i], scores_X_[i], **kwargs)
lst1.append(round(corr, 12))
lst2.append(round(pvalue, 12))
return lst1, lst2
result, pvalues = [], []
try:
from shelchemy.lazy import ichunks
except Exception as e:
print("please install shelchemy library.")
exit()
jobs = pmap(thread, ichunks(range(npoints), 15, asgenerators=False))
for corrs, pvalues in jobs:
result.extend(corrs)
pvalues.extend(pvalues)
result = np.array(result, dtype=float)
if return_pvalues:
return np.array(list(zip(result, pvalues)))
return result
def stress(X, X_, metric=True, parallel=True, parallel_size_trigger=10000, **parallel_kwargs):
"""
Kruskal's "Stress Formula 1" normalized before comparing distances.
default: Euclidean
>>> import numpy as np
>>> from functools import partial
>>> from scipy.stats import spearmanr, weightedtau
>>> mean = (1, 12)
>>> cov = eye(2)
>>> rng = np.random.default_rng(seed=0)
>>> original = rng.multivariate_normal(mean, cov, size=12)
>>> s = stress(original, original*5)
>>> min(s), max(s), s
(0.0, 0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))
>>> from sklearn.decomposition import PCA
>>> projected = PCA(n_components=2).fit_transform(original)
>>> s = stress(original, projected)
>>> min(s), max(s), s
(0.0, 0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))
>>> projected = PCA(n_components=1).fit_transform(original)
>>> s = stress(original, projected)
>>> min(s), max(s), s
(0.073462710191, 0.333885390367, array([0.2812499 , 0.31103416, 0.21994448, 0.07346271, 0.2810867 ,
0.16411944, 0.17002148, 0.14748528, 0.18341208, 0.14659984,
0.33388539, 0.24110857]))
>>> stress(original, projected)
array([0.2812499 , 0.31103416, 0.21994448, 0.07346271, 0.2810867 ,
0.16411944, 0.17002148, 0.14748528, 0.18341208, 0.14659984,
0.33388539, 0.24110857])
>>> stress(original, projected, metric=False)
array([0.33947258, 0.29692937, 0.30478874, 0.10509128, 0.2516135 ,
0.2901905 , 0.1662822 , 0.13153341, 0.34299717, 0.164696 ,
0.35266095, 0.35276684])
Parameters
----------
X
matrix with an instance by row in a given space (often the original one)
X_
matrix with an instance by row in another given space (often the projected one)
metric
Stress formula version: metric or nonmetric
parallel
Parallelize processing when |X|>1000. Might use more memory.
Returns
-------
"""
tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and X.size > parallel_size_trigger else map
# TODO: parallelize cdist in slices?
if metric:
thread = lambda M, m: cdist(M, M, metric=m)
Dsq, D_ = tmap(thread, [X, X_], ["sqeuclidean", "Euclidean"]) # Slowest part (~98%).
Dsq /= np.max(Dsq)
D = sqrt(Dsq)
D_ /= np.max(D_)
else:
thread = lambda M: rankdata(cdist(M, M, metric="sqeuclidean"), method="average", axis=1) - 1
D, D_ = tmap(thread, [X, X_])
Dsq = D ** 2
sqdiff = (D - D_) ** 2
nume = sum(sqdiff)
deno = sum(Dsq)
result = np.round(sqrt(nume / deno), 12)
return result
Functions
def permutation(x)-
Randomly permute a sequence, or return a permuted range.
If
xis a multi-dimensional array, it is only shuffled along its first index.Note
New code should use the
~numpy.random.Generator.permutationmethod of a~numpy.random.Generatorinstance instead; please see the :ref:random-quick-start.Parameters
x:intorarray_like- If
xis an integer, randomly permutenp.arange(x). Ifxis an array, make a copy and shuffle the elements randomly.
Returns
out:ndarray- Permuted sequence or array range.
See Also
random.Generator.permutation- which should be used for new code.
Examples
>>> np.random.permutation(10) array([1, 7, 4, 3, 0, 9, 2, 5, 8, 6]) # random>>> np.random.permutation([1, 4, 9, 12, 15]) array([15, 1, 9, 4, 12]) # random>>> arr = np.arange(9).reshape((3, 3)) >>> np.random.permutation(arr) array([[6, 7, 8], # random [0, 1, 2], [3, 4, 5]]) def pwsortedness(X, X_, i=None, parallel=True, parallel_n_trigger=200, batches=10, debug=False, dist=None, cython=False, **parallel_kwargs)-
Local pairwise sortedness (ฮ๐w) based on Sebastiano Vigna weighted Kendall's ๐
Importance rankings are calculated internally based on proximity of each pair to the point of interest.
TODO?: add flag to break extremely rare cases of ties that persist after projection (implies a much slower algorithm)
This probably doesn't make any difference on the result, except on categorical, pathological or toy datasets Values can be lower due to the presence of ties, but only when the projection isn't prefect for all points. In the end, it might be even desired to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to be kept than a specific order.Parameters
X- Original dataset
X_- Projected points
i- None:
calculate pwsortedness for all instances
int: index of the instance of interest parallel- None: Avoid high-memory parallelization True: Full parallelism False: No parallelism
parallel_kwargs- Any extra argument to be provided to pathos parallelization
parallel_n_trigger- Threshold to disable parallelization for small n values
batches- Parallel batch size
debug- Whether to print more info
dist- Provide distance matrices (D, D_) instead of points X and X_ should be None
cython- Whether to: (True) improve speed by ~2x; or, (False) be more compatible/portable.
Returns
Numpy vector>>> import numpy as np >>> from functools import partial >>> from scipy.stats import spearmanr, weightedtau >>> m = (1, 12) >>> cov = eye(2) >>> rng = np.random.default_rng(seed=0) >>> original = rng.multivariate_normal(m, cov, size=12) >>> from sklearn.decomposition import PCA >>> projected2 = PCA(n_components=2).fit_transform(original) >>> projected1 = PCA(n_components=1).fit_transform(original) >>> np.random.seed(0) >>> projectedrnd = permutation(original)>>> r = pwsortedness(original, original) >>> min(r), max(r), round(mean(r), 12) (1.0, 1.0, 1.0) >>> r = pwsortedness(original, projected2) >>> min(r), round(mean(r), 12), max(r) (1.0, 1.0, 1.0) >>> r = pwsortedness(original, projected1) >>> min(r), round(mean(r), 12), max(r) (0.730078995423, 0.774457348878, 0.837310352695) >>> r = pwsortedness(original, projected2[:, 1:]) >>> min(r), round(mean(r), 12), max(r) (0.18672441995, 0.281712132364, 0.420214364305) >>> r = pwsortedness(original, projectedrnd) >>> min(r), round(mean(r), 12), max(r) (-0.198780473657, -0.064598420372, 0.147224384381) >>> pwsortedness(original, projected1)[1] <code>0.730078995423</code> : >>> pwsortedness(original, projected1, cython=True)[1] <code>0.730078995423</code> : >>> pwsortedness(original, projected1, i=1) <code>0.730078995423</code> : Expand source code
def pwsortedness(X, X_, i=None, parallel=True, parallel_n_trigger=200, batches=10, debug=False, dist=None, cython=False, **parallel_kwargs): """ Local pairwise sortedness (ฮ๐w) based on Sebastiano Vigna weighted Kendall's ๐ Importance rankings are calculated internally based on proximity of each pair to the point of interest. # TODO?: add flag to break extremely rare cases of ties that persist after projection (implies a much slower algorithm) This probably doesn't make any difference on the result, except on categorical, pathological or toy datasets Values can be lower due to the presence of ties, but only when the projection isn't prefect for all points. In the end, it might be even desired to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to be kept than a specific order. Parameters ---------- X Original dataset X_ Projected points i None: calculate pwsortedness for all instances `int`: index of the instance of interest parallel None: Avoid high-memory parallelization True: Full parallelism False: No parallelism parallel_kwargs Any extra argument to be provided to pathos parallelization parallel_n_trigger Threshold to disable parallelization for small n values batches Parallel batch size debug Whether to print more info dist Provide distance matrices (D, D_) instead of points X and X_ should be None cython Whether to: (True) improve speed by ~2x; or, (False) be more compatible/portable. Returns ------- Numpy vector >>> import numpy as np >>> from functools import partial >>> from scipy.stats import spearmanr, weightedtau >>> m = (1, 12) >>> cov = eye(2) >>> rng = np.random.default_rng(seed=0) >>> original = rng.multivariate_normal(m, cov, size=12) >>> from sklearn.decomposition import PCA >>> projected2 = PCA(n_components=2).fit_transform(original) >>> projected1 = PCA(n_components=1).fit_transform(original) >>> np.random.seed(0) >>> projectedrnd = permutation(original) >>> r = pwsortedness(original, original) >>> min(r), max(r), round(mean(r), 12) (1.0, 1.0, 1.0) >>> r = pwsortedness(original, projected2) >>> min(r), round(mean(r), 12), max(r) (1.0, 1.0, 1.0) >>> r = pwsortedness(original, projected1) >>> min(r), round(mean(r), 12), max(r) (0.730078995423, 0.774457348878, 0.837310352695) >>> r = pwsortedness(original, projected2[:, 1:]) >>> min(r), round(mean(r), 12), max(r) (0.18672441995, 0.281712132364, 0.420214364305) >>> r = pwsortedness(original, projectedrnd) >>> min(r), round(mean(r), 12), max(r) (-0.198780473657, -0.064598420372, 0.147224384381) >>> pwsortedness(original, projected1)[1] 0.730078995423 >>> pwsortedness(original, projected1, cython=True)[1] 0.730078995423 >>> pwsortedness(original, projected1, i=1) 0.730078995423 """ npoints = len(X) if X is not None else len(dist[0]) # pragma: no cover tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map pmap = mp.ProcessingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map if debug: # pragma: no cover print(1) thread = lambda M: -pdist(M, metric="sqeuclidean") # if i is not None: # tmp, tmp_ = X[0], X_[0] # X[0], X_[0] = X[i], X_[i] # X[i], X_[i] = tmp, tmp_ scores_X, scores_X_ = tmap(thread, [X, X_]) if X is not None else (-squareform(dist[0]), -squareform(dist[1])) if debug: # pragma: no cover print(2) def makeM(E, single=False): n = len(E) m = (n ** 2 - n) // 2 M = np.zeros((m, 1 if single else n)) if debug: # pragma: no cover print(4) c = 0 for i in range(n - 1): # a bit slow, but only a fraction of wtau (~5%) h = n - i - 1 d = c + h M[c:d] = E[i] + E[i + 1:] c = d if debug: # pragma: no cover print(5) del E gc.collect() return M.T D = squareform(-scores_X) if dist is None else dist[0] if i is None: if debug: # pragma: no cover print(3) n = len(D) M = makeM(D) if debug: # pragma: no cover print(6) R = rank_alongrow(M, step=n // batches, parallel=parallel, **parallel_kwargs).T del M gc.collect() if debug: # pragma: no cover print(7) if cython: from sortedness.wtau import parwtau res = np.round(parwtau(scores_X, scores_X_, npoints, R, parallel=parallel, **parallel_kwargs), 12) del R gc.collect() return res else: def thread(r): corr, pvalue = weightedtau(scores_X, scores_X_, rank=r) return round(corr, 12), round(pvalue, 12) result, pvalues = [], [] lst = (R[:, i] for i in range(len(X))) for corrs, pvalue in pmap(thread, lst): result.append(corrs) pvalues.append(pvalue) result = np.array(result, dtype=float) return result M = makeM(D[:, i:i + 1], single=True) r = rankdata(M, axis=1, method="average")[0].astype(int) - 1 return round(weightedtau(scores_X, scores_X_, r)[0], 12) # D = cdist(X, X[:1]) # M = makeM(D, single=True) # r = rankdata(M[0], method="average").astype(int) - 1 # return round(weightedtau(scores_X, scores_X_, r)[0], 12) def remove_diagonal(X)-
Expand source code
def remove_diagonal(X): n_points = len(X) nI = ~eye(n_points, dtype=bool) # Mask to remove diagonal. return X[nI].reshape(n_points, -1) def rsortedness(X, X_, f=<function weightedtau>, return_pvalues=False, parallel=True, parallel_n_trigger=500, parallel_kwargs=None, **kwargs)-
Reciprocal sortedness: consider the neighborhood relation the other way around
Might be good to assess the effect of a projection on hubness, and also to serve as a loss function for a custom projection algorithm.
WARNING: this function is experimental, i.e., not as well tested as the others; it might need a better algorithm/fomula as well.
TODO?: add flag to break (not so rare) cases of ties that persist after projection (implies a much slower algorithm)
This probably doesn't make any difference on the result, except on categorical, pathological or toy datasets Values can be lower due to the presence of ties, but only when the projection isn't prefect for all points. In the end, it might be even desired to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to be kept than a specific order.Parameters
f- Distance criteria: callable = scipy correlation function: weightedtau (weighted Kendallโs ฯ is the default), kendalltau, spearmanr Meaning of resulting values for correlation-based functions: 1.0: perfect projection (regarding order of examples) 0.0: random projection (enough distortion to have no information left when considering the overall ordering) -1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering)
return_pvalues- For scipy correlation functions, return a 2-column matrix 'corr, pvalue' instead of just 'corr' This makes more sense for Kendall's tau. [the weighted version might not have yet a established pvalue calculation method at this moment] The null hypothesis is that the projection is random, i.e., sortedness = 0.0.
parallel- None: Avoid high-memory parallelization True: Full parallelism False: No parallelism
parallel_kwargs- Any extra argument to be provided to pathos parallelization
parallel_n_trigger- Threshold to disable parallelization for small n values
kwargs- Arguments to be passed to the correlation measure
Returns
Numpy vector>>> ll = [[i, ] for i in range(17)]
>>> a, b = np.array(ll), np.array(ll[0:1] + list(reversed(ll[1:])))
>>> b.ravel()
array([ 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
>>> #r = rsortedness(a, b)
>>> #min(r), max(r)
(-0.707870893072, 0.962964134515)
>>> rnd = np.random.default_rng(0)
>>> rnd.shuffle(ll)
>>> b = np.array(ll)
>>> b.ravel()
array([ 2, 10, 3, 11, 0, 4, 7, 5, 16, 12, 13, 6, 9, 14, 8, 1, 15])
>>> r = rsortedness(b, a)
>>> r
array([ 0.36861667, -0.07147685, 0.39350142, -0.04581926, -0.03951645,
0.31100414, -0.18107755, 0.28268222, -0.29248869, 0.19177107,
0.48076521, -0.17640674, 0.13098522, 0.34833996, 0.01844146,
-0.58291518, 0.34742337])
>>> min(r), max(r), round(mean(r), 12)
(-0.582915181328, 0.480765206133, 0.087284118913)
>>> rsortedness(b, a, f=kendalltau, return_pvalues=True)
array([[ 0.2316945 , 0.04863508],
[-0.37005403, 0.21347624],
[ 0.17618709, 0.04863508],
[-0.35418588, 0.21347624]])
Expand source code
def rsortedness(X, X_, f=weightedtau, return_pvalues=False, parallel=True, parallel_n_trigger=500, parallel_kwargs=None, **kwargs): # pragma: no cover """ Reciprocal sortedness: consider the neighborhood relation the other way around Might be good to assess the effect of a projection on hubness, and also to serve as a loss function for a custom projection algorithm. WARNING: this function is experimental, i.e., not as well tested as the others; it might need a better algorithm/fomula as well. # TODO?: add flag to break (not so rare) cases of ties that persist after projection (implies a much slower algorithm) This probably doesn't make any difference on the result, except on categorical, pathological or toy datasets Values can be lower due to the presence of ties, but only when the projection isn't prefect for all points. In the end, it might be even desired to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to be kept than a specific order. Parameters ---------- f Distance criteria: callable = scipy correlation function: weightedtau (weighted Kendallโs ฯ is the default), kendalltau, spearmanr Meaning of resulting values for correlation-based functions: 1.0: perfect projection (regarding order of examples) 0.0: random projection (enough distortion to have no information left when considering the overall ordering) -1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering) return_pvalues For scipy correlation functions, return a 2-column matrix 'corr, pvalue' instead of just 'corr' This makes more sense for Kendall's tau. [the weighted version might not have yet a established pvalue calculation method at this moment] The null hypothesis is that the projection is random, i.e., sortedness = 0.0. parallel None: Avoid high-memory parallelization True: Full parallelism False: No parallelism parallel_kwargs Any extra argument to be provided to pathos parallelization parallel_n_trigger Threshold to disable parallelization for small n values kwargs Arguments to be passed to the correlation measure Returns ------- Numpy vector # >>> ll = [[i, ] for i in range(17)] # >>> a, b = np.array(ll), np.array(ll[0:1] + list(reversed(ll[1:]))) # >>> b.ravel() # array([ 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]) # >>> #r = rsortedness(a, b) # >>> #min(r), max(r) # (-0.707870893072, 0.962964134515) # # >>> rnd = np.random.default_rng(0) # >>> rnd.shuffle(ll) # >>> b = np.array(ll) # >>> b.ravel() # array([ 2, 10, 3, 11, 0, 4, 7, 5, 16, 12, 13, 6, 9, 14, 8, 1, 15]) # >>> r = rsortedness(b, a) # >>> r # array([ 0.36861667, -0.07147685, 0.39350142, -0.04581926, -0.03951645, # 0.31100414, -0.18107755, 0.28268222, -0.29248869, 0.19177107, # 0.48076521, -0.17640674, 0.13098522, 0.34833996, 0.01844146, # -0.58291518, 0.34742337]) # >>> min(r), max(r), round(mean(r), 12) # (-0.582915181328, 0.480765206133, 0.087284118913) # >>> rsortedness(b, a, f=kendalltau, return_pvalues=True) # array([[ 0.2316945 , 0.04863508], # [-0.37005403, 0.21347624], # [ 0.17618709, 0.04863508], # [-0.35418588, 0.21347624]]) """ if hasattr(f, "isweightedtau") and f.isweightedtau and "rank" not in kwargs: kwargs["rank"] = None if parallel_kwargs is None: parallel_kwargs = {} npoints = len(X) tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map pmap = mp.ProcessingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map D, D_ = tmap(lambda M: cdist(M, M, metric="sqeuclidean"), [X, X_]) R, R_ = (rank_alongcol(M, parallel=parallel) for M in [D, D_]) scores_X, scores_X_ = tmap(lambda M: -remove_diagonal(M), [R, R_]) # For f=weightedtau: scores = -ranks. if hasattr(f, "isparwtau"): # pragma: no cover raise Exception("TODO: Pairtau implementation disagree with scipy weightedtau") # return parwtau(scores_X, scores_X_, npoints, parallel=parallel, **kwargs) def thread(l): lst1 = [] lst2 = [] for i in l: corr, pvalue = f(scores_X[i], scores_X_[i], **kwargs) lst1.append(round(corr, 12)) lst2.append(round(pvalue, 12)) return lst1, lst2 result, pvalues = [], [] try: from shelchemy.lazy import ichunks except Exception as e: print("please install shelchemy library.") exit() jobs = pmap(thread, ichunks(range(npoints), 15, asgenerators=False)) for corrs, pvalues in jobs: result.extend(corrs) pvalues.extend(pvalues) result = np.array(result, dtype=float) if return_pvalues: return np.array(list(zip(result, pvalues))) return result def sortedness(X, X_, i=None, f=<function weightedtau>, distance_dependent=True, return_pvalues=False, parallel=True, parallel_n_trigger=500, parallel_kwargs=None, **kwargs)-
Calculate the sortedness (stress-like correlation-based measure that ignores distance proportions) value for each point Functions available as scipy correlation coefficients: ฯ-sortedness (Spearman), ๐-sortedness (Kendall's ๐), w๐-sortedness (Sebastiano Vigna weighted Kendall's ๐) โ default
TODO?: add flag to break extremely rare cases of ties that persist after projection (implies a much slower algorithm)
This probably doesn't make any difference on the result, except on categorical, pathological or toy datasets Values can be lower due to the presence of ties, but only when the projection isn't perfect for all points. In the end, it might be even desired to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to be kept than a specific order.Parameters
X- matrix with an instance by row in a given space (often the original one)
X_- matrix with an instance by row in another given space (often the projected one)
i- None:
calculate sortedness for all instances
int: index of the instance of interest f- Distance criteria: callable = scipy correlation function: weightedtau (weighted Kendallโs ฯ is the default), kendalltau, spearmanr Meaning of resulting values for correlation-based functions: 1.0: perfect projection (regarding order of examples) 0.0: random projection (enough distortion to have no information left when considering the overall ordering) -1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering)
return_pvalues- For scipy correlation functions, return a 2-column matrix 'corr, pvalue' instead of just 'corr' This makes more sense for Kendall's tau. [the weighted version might not have yet a established pvalue calculation method at this moment] The null hypothesis is that the projection is random, i.e., sortedness = 0.0.
parallel- None: Avoid high-memory parallelization True: Full parallelism False: No parallelism
parallel_kwargs- Any extra argument to be provided to pathos parallelization
parallel_n_trigger- Threshold to disable parallelization for small n values
kwargs- Arguments to be passed to the correlation measure
Returns
list of sortedness values (or tuples that also include pvalues)>>> ll = [[i] for i in range(17)] >>> a, b = np.array(ll), np.array(ll[0:1] + list(reversed(ll[1:]))) >>> b.ravel() array([ 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]) >>> r = sortedness(a, b) >>> min(r), max(r) (-1.0, 0.998638259786)>>> rnd = np.random.default_rng(0) >>> rnd.shuffle(ll) >>> b = np.array(ll) >>> b.ravel() array([ 2, 10, 3, 11, 0, 4, 7, 5, 16, 12, 13, 6, 9, 14, 8, 1, 15]) >>> r = sortedness(a, b) >>> r array([ 0.19597951, -0.37003858, 0.06014087, -0.42174564, 0.2448619 , 0.24635858, 0.16814336, 0.06919001, 0.1627886 , 0.33136454, 0.41592274, -0.10615388, 0.17549727, 0.17371479, -0.21360864, -0.3677769 , -0.08292823]) >>> min(r), max(r) (-0.421745643027, 0.415922739891) >>> round(mean(r), 12) 0.040100605153>>> import numpy as np >>> from functools import partial >>> from scipy.stats import spearmanr, weightedtau >>> me = (1, 2) >>> cov = eye(2) >>> rng = np.random.default_rng(seed=0) >>> original = rng.multivariate_normal(me, cov, size=12) >>> from sklearn.decomposition import PCA >>> projected2 = PCA(n_components=2).fit_transform(original) >>> projected1 = PCA(n_components=1).fit_transform(original) >>> np.random.seed(0) >>> projectedrnd = permutation(original)>>> s = sortedness(original, original) >>> min(s), max(s), s (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])) >>> s = sortedness(original, projected2) >>> min(s), max(s), s (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])) >>> s = sortedness(original, projected1) >>> min(s), max(s), s (0.432937128932, 0.944810120534, array([0.43293713, 0.53333015, 0.88412753, 0.94481012, 0.81485109, 0.81330052, 0.76691474, 0.91169619, 0.88998817, 0.90102615, 0.61372341, 0.86996213])) >>> s = sortedness(original, projectedrnd) >>> min(s), max(s), s (-0.578096068617, 0.396112816715, array([ 0.21296126, -0.57809607, 0.33083346, -0.00638865, -0.16007932, 0.39611282, -0.27357934, 0.04360717, -0.54534052, 0.19042181, -0.32805008, -0.04178184]))>>> sortedness(original, original, f=kendalltau, return_pvalues=True) array([[1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08]]) >>> sortedness(original, projected2, f=kendalltau) array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]) >>> sortedness(original, projected1, f=kendalltau) array([0.56363636, 0.52727273, 0.81818182, 0.96363636, 0.70909091, 0.85454545, 0.74545455, 0.92727273, 0.85454545, 0.89090909, 0.6 , 0.74545455]) >>> sortedness(original, projectedrnd, f=kendalltau) array([ 0.2 , -0.38181818, 0.23636364, -0.09090909, -0.05454545, 0.23636364, -0.09090909, 0.23636364, -0.63636364, -0.01818182, -0.2 , -0.01818182])>>> wf = partial(weightedtau, weigher=lambda x: 1 / (x**2 + 1)) >>> sortedness(original, original, f=wf, return_pvalues=True) array([[ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan]]) >>> sortedness(original, projected2, f=wf) array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]) >>> sortedness(original, projected1, f=wf) array([0.89469168, 0.89269637, 0.92922928, 0.99721669, 0.86529591, 0.97806422, 0.94330979, 0.99357377, 0.87959707, 0.92182767, 0.87256459, 0.87747329]) >>> sortedness(original, projectedrnd, f=wf) array([ 0.23771513, -0.2790059 , 0.3718005 , -0.16623167, 0.06179047, 0.40434396, -0.00130294, 0.46569739, -0.67581876, -0.23852189, -0.39125007, 0.12131153]) >>> np.random.seed(14980) >>> projectedrnd = permutation(original) >>> sortedness(original, projectedrnd) array([ 0.21666209, -0.23798067, 0.16830046, -0.5559795 , -0.23470751, 0.35716692, -0.0054951 , -0.35890385, 0.13101743, -0.48460059, -0.46430457, 0.27400939]) >>> sortedness(original, np.flipud(original)) array([-0.26471073, 0.39699442, 0.05022757, 0.21215372, 0.23467884, -0.33277347, -0.31616633, 0.19381204, -0.16114967, 0.08829425, 0.3383928 , -0.31012412]) >>> original = np.array([[0],[1],[2],[3],[4],[5],[6]]) >>> projected = np.array([[6],[5],[4],[3],[2],[1],[0]]) >>> sortedness(original, projected) array([1., 1., 1., 1., 1., 1., 1.]) >>> projected = np.array([[0],[6],[5],[4],[3],[2],[1]]) >>> sortedness(original, projected) array([-1. , 0.42263889, 0.80668827, 0.98180162, 0.98180162, 0.82721863, 0.61648537]) >>> sortedness(original, projected, 1) 0.4226388949217901 >>> sortedness([[1,2,3,3],[1,2,7,3],[3,4,8,5],[1,8,3,5]], [[2,1,2,3],[3,1,2,3],[5,4,5,6],[9,7,6,3]], 1) 0.18181818181818177Expand source code
def sortedness(X, X_, i=None, f=weightedtau, distance_dependent=True, return_pvalues=False, parallel=True, parallel_n_trigger=500, parallel_kwargs=None, **kwargs): """ Calculate the sortedness (stress-like correlation-based measure that ignores distance proportions) value for each point Functions available as scipy correlation coefficients: ฯ-sortedness (Spearman), ๐-sortedness (Kendall's ๐), w๐-sortedness (Sebastiano Vigna weighted Kendall's ๐) โ default # TODO?: add flag to break extremely rare cases of ties that persist after projection (implies a much slower algorithm) This probably doesn't make any difference on the result, except on categorical, pathological or toy datasets Values can be lower due to the presence of ties, but only when the projection isn't perfect for all points. In the end, it might be even desired to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to be kept than a specific order. Parameters ---------- X matrix with an instance by row in a given space (often the original one) X_ matrix with an instance by row in another given space (often the projected one) i None: calculate sortedness for all instances `int`: index of the instance of interest f Distance criteria: callable = scipy correlation function: weightedtau (weighted Kendallโs ฯ is the default), kendalltau, spearmanr Meaning of resulting values for correlation-based functions: 1.0: perfect projection (regarding order of examples) 0.0: random projection (enough distortion to have no information left when considering the overall ordering) -1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering) return_pvalues For scipy correlation functions, return a 2-column matrix 'corr, pvalue' instead of just 'corr' This makes more sense for Kendall's tau. [the weighted version might not have yet a established pvalue calculation method at this moment] The null hypothesis is that the projection is random, i.e., sortedness = 0.0. parallel None: Avoid high-memory parallelization True: Full parallelism False: No parallelism parallel_kwargs Any extra argument to be provided to pathos parallelization parallel_n_trigger Threshold to disable parallelization for small n values kwargs Arguments to be passed to the correlation measure Returns ------- list of sortedness values (or tuples that also include pvalues) >>> ll = [[i] for i in range(17)] >>> a, b = np.array(ll), np.array(ll[0:1] + list(reversed(ll[1:]))) >>> b.ravel() array([ 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]) >>> r = sortedness(a, b) >>> min(r), max(r) (-1.0, 0.998638259786) >>> rnd = np.random.default_rng(0) >>> rnd.shuffle(ll) >>> b = np.array(ll) >>> b.ravel() array([ 2, 10, 3, 11, 0, 4, 7, 5, 16, 12, 13, 6, 9, 14, 8, 1, 15]) >>> r = sortedness(a, b) >>> r array([ 0.19597951, -0.37003858, 0.06014087, -0.42174564, 0.2448619 , 0.24635858, 0.16814336, 0.06919001, 0.1627886 , 0.33136454, 0.41592274, -0.10615388, 0.17549727, 0.17371479, -0.21360864, -0.3677769 , -0.08292823]) >>> min(r), max(r) (-0.421745643027, 0.415922739891) >>> round(mean(r), 12) 0.040100605153 >>> import numpy as np >>> from functools import partial >>> from scipy.stats import spearmanr, weightedtau >>> me = (1, 2) >>> cov = eye(2) >>> rng = np.random.default_rng(seed=0) >>> original = rng.multivariate_normal(me, cov, size=12) >>> from sklearn.decomposition import PCA >>> projected2 = PCA(n_components=2).fit_transform(original) >>> projected1 = PCA(n_components=1).fit_transform(original) >>> np.random.seed(0) >>> projectedrnd = permutation(original) >>> s = sortedness(original, original) >>> min(s), max(s), s (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])) >>> s = sortedness(original, projected2) >>> min(s), max(s), s (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])) >>> s = sortedness(original, projected1) >>> min(s), max(s), s (0.432937128932, 0.944810120534, array([0.43293713, 0.53333015, 0.88412753, 0.94481012, 0.81485109, 0.81330052, 0.76691474, 0.91169619, 0.88998817, 0.90102615, 0.61372341, 0.86996213])) >>> s = sortedness(original, projectedrnd) >>> min(s), max(s), s (-0.578096068617, 0.396112816715, array([ 0.21296126, -0.57809607, 0.33083346, -0.00638865, -0.16007932, 0.39611282, -0.27357934, 0.04360717, -0.54534052, 0.19042181, -0.32805008, -0.04178184])) >>> sortedness(original, original, f=kendalltau, return_pvalues=True) array([[1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08]]) >>> sortedness(original, projected2, f=kendalltau) array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]) >>> sortedness(original, projected1, f=kendalltau) array([0.56363636, 0.52727273, 0.81818182, 0.96363636, 0.70909091, 0.85454545, 0.74545455, 0.92727273, 0.85454545, 0.89090909, 0.6 , 0.74545455]) >>> sortedness(original, projectedrnd, f=kendalltau) array([ 0.2 , -0.38181818, 0.23636364, -0.09090909, -0.05454545, 0.23636364, -0.09090909, 0.23636364, -0.63636364, -0.01818182, -0.2 , -0.01818182]) >>> wf = partial(weightedtau, weigher=lambda x: 1 / (x**2 + 1)) >>> sortedness(original, original, f=wf, return_pvalues=True) array([[ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan]]) >>> sortedness(original, projected2, f=wf) array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]) >>> sortedness(original, projected1, f=wf) array([0.89469168, 0.89269637, 0.92922928, 0.99721669, 0.86529591, 0.97806422, 0.94330979, 0.99357377, 0.87959707, 0.92182767, 0.87256459, 0.87747329]) >>> sortedness(original, projectedrnd, f=wf) array([ 0.23771513, -0.2790059 , 0.3718005 , -0.16623167, 0.06179047, 0.40434396, -0.00130294, 0.46569739, -0.67581876, -0.23852189, -0.39125007, 0.12131153]) >>> np.random.seed(14980) >>> projectedrnd = permutation(original) >>> sortedness(original, projectedrnd) array([ 0.21666209, -0.23798067, 0.16830046, -0.5559795 , -0.23470751, 0.35716692, -0.0054951 , -0.35890385, 0.13101743, -0.48460059, -0.46430457, 0.27400939]) >>> sortedness(original, np.flipud(original)) array([-0.26471073, 0.39699442, 0.05022757, 0.21215372, 0.23467884, -0.33277347, -0.31616633, 0.19381204, -0.16114967, 0.08829425, 0.3383928 , -0.31012412]) >>> original = np.array([[0],[1],[2],[3],[4],[5],[6]]) >>> projected = np.array([[6],[5],[4],[3],[2],[1],[0]]) >>> sortedness(original, projected) array([1., 1., 1., 1., 1., 1., 1.]) >>> projected = np.array([[0],[6],[5],[4],[3],[2],[1]]) >>> sortedness(original, projected) array([-1. , 0.42263889, 0.80668827, 0.98180162, 0.98180162, 0.82721863, 0.61648537]) >>> sortedness(original, projected, 1) 0.4226388949217901 >>> sortedness([[1,2,3,3],[1,2,7,3],[3,4,8,5],[1,8,3,5]], [[2,1,2,3],[3,1,2,3],[5,4,5,6],[9,7,6,3]], 1) 0.18181818181818177 """ isweightedtau = False if hasattr(f, "isweightedtau") and f.isweightedtau: isweightedtau = True if "rank" not in kwargs: kwargs["rank"] = None if parallel_kwargs is None: parallel_kwargs = {} result, pvalues = [], [] npoints = len(X) if i is not None: x = X[i] if isinstance(X, (ndarray, list)) else X.iloc[i] x_ = X_[i] if isinstance(X_, (ndarray, list)) else X_.iloc[i] X = np.delete(X, i, axis=0) X_ = np.delete(X_, i, axis=0) d_ = np.sum((X_ - x_) ** 2, axis=1) if distance_dependent: d = np.sum((X - x) ** 2, axis=1) scores_X, scores_X_ = (-d, -d_) if isweightedtau else (d, d_) corr, pvalue = f(scores_X, scores_X_, **kwargs) return (corr, pvalue) if return_pvalues else corr else: # pragma: no cover raise Exception(f"Not implemented yet; it is an open problem") D = abs(X - x).T scores_X, scores_x_ = (-D, -d_) if isweightedtau else (D, d_) for j in range(len(scores_X)): corr, pvalue = f(scores_X[j], scores_x_, **kwargs) result.append(round(corr, 12)) pvalues.append(round(pvalue, 12)) return (mean(result), mean(pvalues)) if return_pvalues else mean(result) if distance_dependent: tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map sqdist_X, sqdist_X_ = tmap(lambda M: cdist(M, M, metric='sqeuclidean'), [X, X_]) D = remove_diagonal(sqdist_X) D_ = remove_diagonal(sqdist_X_) scores_X, scores_X_ = (-D, -D_) if isweightedtau else (D, D_) for i in range(len(X)): corr, pvalue = f(scores_X[i], scores_X_[i], **kwargs) result.append(round(corr, 12)) pvalues.append(round(pvalue, 12)) else: # pragma: no cover raise Exception(f"Not implemented yet; it is an open problem") # for i in range(len(X)): # corr, pvalue = sortedness(X, X_, i, f=f, distance_dependent=False, return_pvalues=True, # parallel=parallel, parallel_n_trigger=parallel_n_trigger, parallel_kwargs=parallel_kwargs, **kwargs) # result.append(round(corr, 12)) # pvalues.append(round(pvalue, 12)) result = np.array(result, dtype=float) if return_pvalues: return np.array(list(zip(result, pvalues))) return result def stress(X, X_, metric=True, parallel=True, parallel_size_trigger=10000, **parallel_kwargs)-
Kruskal's "Stress Formula 1" normalized before comparing distances. default: Euclidean
>>> import numpy as np >>> from functools import partial >>> from scipy.stats import spearmanr, weightedtau >>> mean = (1, 12) >>> cov = eye(2) >>> rng = np.random.default_rng(seed=0) >>> original = rng.multivariate_normal(mean, cov, size=12) >>> s = stress(original, original*5) >>> min(s), max(s), s (0.0, 0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])) >>> from sklearn.decomposition import PCA >>> projected = PCA(n_components=2).fit_transform(original) >>> s = stress(original, projected) >>> min(s), max(s), s (0.0, 0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])) >>> projected = PCA(n_components=1).fit_transform(original) >>> s = stress(original, projected) >>> min(s), max(s), s (0.073462710191, 0.333885390367, array([0.2812499 , 0.31103416, 0.21994448, 0.07346271, 0.2810867 , 0.16411944, 0.17002148, 0.14748528, 0.18341208, 0.14659984, 0.33388539, 0.24110857])) >>> stress(original, projected) array([0.2812499 , 0.31103416, 0.21994448, 0.07346271, 0.2810867 , 0.16411944, 0.17002148, 0.14748528, 0.18341208, 0.14659984, 0.33388539, 0.24110857]) >>> stress(original, projected, metric=False) array([0.33947258, 0.29692937, 0.30478874, 0.10509128, 0.2516135 , 0.2901905 , 0.1662822 , 0.13153341, 0.34299717, 0.164696 , 0.35266095, 0.35276684])Parameters
X- matrix with an instance by row in a given space (often the original one)
X_- matrix with an instance by row in another given space (often the projected one)
metric- Stress formula version: metric or nonmetric
parallel- Parallelize processing when |X|>1000. Might use more memory.
Returns
Expand source code
def stress(X, X_, metric=True, parallel=True, parallel_size_trigger=10000, **parallel_kwargs): """ Kruskal's "Stress Formula 1" normalized before comparing distances. default: Euclidean >>> import numpy as np >>> from functools import partial >>> from scipy.stats import spearmanr, weightedtau >>> mean = (1, 12) >>> cov = eye(2) >>> rng = np.random.default_rng(seed=0) >>> original = rng.multivariate_normal(mean, cov, size=12) >>> s = stress(original, original*5) >>> min(s), max(s), s (0.0, 0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])) >>> from sklearn.decomposition import PCA >>> projected = PCA(n_components=2).fit_transform(original) >>> s = stress(original, projected) >>> min(s), max(s), s (0.0, 0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])) >>> projected = PCA(n_components=1).fit_transform(original) >>> s = stress(original, projected) >>> min(s), max(s), s (0.073462710191, 0.333885390367, array([0.2812499 , 0.31103416, 0.21994448, 0.07346271, 0.2810867 , 0.16411944, 0.17002148, 0.14748528, 0.18341208, 0.14659984, 0.33388539, 0.24110857])) >>> stress(original, projected) array([0.2812499 , 0.31103416, 0.21994448, 0.07346271, 0.2810867 , 0.16411944, 0.17002148, 0.14748528, 0.18341208, 0.14659984, 0.33388539, 0.24110857]) >>> stress(original, projected, metric=False) array([0.33947258, 0.29692937, 0.30478874, 0.10509128, 0.2516135 , 0.2901905 , 0.1662822 , 0.13153341, 0.34299717, 0.164696 , 0.35266095, 0.35276684]) Parameters ---------- X matrix with an instance by row in a given space (often the original one) X_ matrix with an instance by row in another given space (often the projected one) metric Stress formula version: metric or nonmetric parallel Parallelize processing when |X|>1000. Might use more memory. Returns ------- """ tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and X.size > parallel_size_trigger else map # TODO: parallelize cdist in slices? if metric: thread = lambda M, m: cdist(M, M, metric=m) Dsq, D_ = tmap(thread, [X, X_], ["sqeuclidean", "Euclidean"]) # Slowest part (~98%). Dsq /= np.max(Dsq) D = sqrt(Dsq) D_ /= np.max(D_) else: thread = lambda M: rankdata(cdist(M, M, metric="sqeuclidean"), method="average", axis=1) - 1 D, D_ = tmap(thread, [X, X_]) Dsq = D ** 2 sqdiff = (D - D_) ** 2 nume = sum(sqdiff) deno = sum(Dsq) result = np.round(sqrt(nume / deno), 12) return result