Source code for pseudorank
"""
.. py:currentmodule:: pseudorank
.. module:: pseudorank
   :platform: Unix, Windows
   :synopsis: Module to calculate pseudo-ranks (mid/min/max).
.. moduleauthor:: Martin Happ <martin.happ@aon.at>
"""
import pandas as pd
import numpy as np
import numba as nu
@nu.jit(nopython=True)
def _psrank_average(data: list, group: list, N: int, n: list, a: int) -> list:
    """
    Compute mid (average) pseudo-ranks for values that are already sorted.

    Args:
        data (list(float)): values in ascending order; equal neighbours are treated as ties
        group (list(int)): group index of each value, used to index ``n``
        N (int): number of observations to process
        n (list(int)): number of observations in each group
        a (int): number of groups

    Returns:
        mid pseudo-ranks (list(float)) in the same order as ``data``
    """
    tmp = [0.0 for _ in range(len(data))]
    # nothing to rank: return before touching tmp[0] / group[0]
    if len(data) == 0:
        return tmp

    # loop-invariant factor N / (2 * a), evaluated in the original operand order
    half_scale = N / a * 1 / 2

    # recursion start
    tmp[0] = 1 / 2 + half_scale / n[group[0]]
    # case: no ties -- each step adds half the weight of the previous and of the current value
    for i in range(1, N):
        tmp[i] = tmp[i - 1] + half_scale * (1 / n[group[i]] + 1 / n[group[i - 1]])

    # case: ties -- every member of a tie block gets the same value
    lpsrank = tmp[:]
    i = 0
    while i < (N - 1):
        if data[i] == data[i + 1]:
            # accumulate the weights of the tied observations following position i
            add = 0.0
            j = i + 1
            while j < N and data[i] == data[j]:
                add += 1 / n[group[j]]
                j += 1
            # tmp[i] already holds the untied value at i, so one expression covers i == 0 and i > 0
            tied = tmp[i] + half_scale * add
            for k in range(i, j):
                lpsrank[k] = tied
            # continue with the first element after the tie block
            i = j - 1
        i += 1
    return lpsrank
@nu.jit(nopython=True)
def _psrank_max(data: list, group: list, N: int, n: list, a: int) -> list:
    """
    Compute max pseudo-ranks for values that are already sorted.

    Args:
        data (list(float)): values in ascending order; equal neighbours are treated as ties
        group (list(int)): group index of each value, used to index ``n``
        N (int): number of observations to process
        n (list(int)): number of observations in each group
        a (int): number of groups

    Returns:
        max pseudo-ranks (list(float)) in the same order as ``data``
    """
    tmp = [0.0 for _ in range(len(data))]
    # nothing to rank: return before touching tmp[0] / group[0]
    if len(data) == 0:
        return tmp

    # loop-invariant factor N / a
    scale = N / a

    # recursion start
    tmp[0] = scale / n[group[0]]
    # case: no ties -- running sum of the weights up to and including position i
    for i in range(1, N):
        tmp[i] = tmp[i - 1] + scale * (1 / n[group[i]])

    # case: ties -- every member of a tie block gets the value of its last member
    lpsrank = tmp[:]
    i = 0
    while i < (N - 1):
        if data[i] == data[i + 1]:
            # total weight of the whole tie block, starting with position i itself
            add = 1 / n[group[i]]
            j = i + 1
            while j < N and data[i] == data[j]:
                add += 1 / n[group[j]]
                j += 1
            # the block value is the same for all members, so compute it once
            if i > 0:
                tied = tmp[i - 1] + scale * add
            else:
                tied = scale * add
            for k in range(i, j):
                lpsrank[k] = tied
            # continue with the first element after the tie block
            i = j - 1
        i += 1
    return lpsrank
@nu.jit(nopython=True)
def _psrank_min(data: list, group: list, N: int, n: list, a: int) -> list:
    """
    Compute min pseudo-ranks for values that are already sorted.

    Args:
        data (list(float)): values in ascending order; equal neighbours are treated as ties
        group (list(int)): group index of each value, used to index ``n``
        N (int): number of observations to process
        n (list(int)): number of observations in each group
        a (int): number of groups

    Returns:
        min pseudo-ranks (list(float)) in the same order as ``data``
    """
    tmp = [0.0 for _ in range(len(data))]
    # nothing to rank: return before touching tmp[0]
    if len(data) == 0:
        return tmp

    # loop-invariant factor N / a
    scale = N / a

    # recursion start
    tmp[0] = 1.0
    # case: no ties -- one plus the running sum of the weights strictly before position i
    for i in range(1, N):
        tmp[i] = tmp[i - 1] + scale * (1 / n[group[i - 1]])

    # case: ties -- every member of a tie block gets the value of its first member
    lpsrank = tmp[:]
    i = 0
    while i < (N - 1):
        if data[i] == data[i + 1]:
            # total weight of the whole tie block, starting with position i itself
            add = 1 / n[group[i]]
            j = i + 1
            while j < N and data[i] == data[j]:
                add += 1 / n[group[j]]
                j += 1
            # position i already holds the block value; copy it to the other members
            for k in range((i + 1), j):
                lpsrank[k] = tmp[i]
            # first element after the block: block value plus the block's total weight
            if j < N:
                lpsrank[j] = tmp[i] + scale * add
            # continue with the first element after the tie block
            i = j - 1
        i += 1
    return lpsrank
def psrank(data, group, ties_method="average"):
    """
    Function to calculate pseudo-ranks.

    Args:
        data (list(float)): values to be ranked \n
        group (list(int)): group factor \n
        ties_method (str): either 'average', 'max' or 'min' for mid, max or min pseudo-ranks \n

    Returns:
        pseudo-ranks (list(float)), in the order of ``data``; an empty list for empty input

    Raises:
        TypeError: if ``data`` or ``group`` is not a list, or ``ties_method`` is not one of
            'average', 'min', 'max'
        ValueError: if ``group`` contains missing values (``data`` and ``group`` of
            different lengths are rejected by pandas when the frame is built)
    """
    # Check inputs
    if not isinstance(data, list):
        raise TypeError("data must be a list")
    if not isinstance(group, list):
        raise TypeError("group must be a list")
    # an unknown method name keeps raising TypeError so existing handlers still match
    if (not isinstance(ties_method, str)) or (ties_method not in ['average', 'min', 'max']):
        raise TypeError("ties_method must be either 'average', 'min' or 'max'")

    df = pd.DataFrame(data={'data': data, 'grp': group})
    N = len(df)
    # nothing to rank; also avoids a == 0 and empty lists reaching the jitted helpers
    if N == 0:
        return []

    # map the group labels to integer codes 0 .. a-1
    codes = df['grp'].astype('category').cat.codes
    # a missing label gets code -1, which would index a zero group size further down
    if (codes < 0).any():
        raise ValueError("group must not contain missing values")
    df['grp'] = codes

    # sort by value and remember which original row every sorted position came from
    dff = df.sort_values(['data'])
    orig_sort = np.asarray(dff.index)
    values = dff['data'].tolist()
    groups = dff['grp'].tolist()

    # number of groups and number of observations per group
    a = len(np.unique(groups))
    n = np.bincount(groups, minlength=a).tolist()

    # calculate pseudo-ranks
    if ties_method == "max":
        ranks = _psrank_max(values, groups, N, n, a)
    elif ties_method == "min":
        ranks = _psrank_min(values, groups, N, n, a)
    else:
        ranks = _psrank_average(values, groups, N, n, a)

    # sort back: scatter every pseudo-rank to the row its value originally occupied
    result = np.empty(N, dtype=float)
    result[orig_sort] = ranks
    return result.tolist()