"""
.. py:currentmodule:: multisample
.. module:: multisample
:platform: Unix, Windows
:synopsis: Module to calculate multi sample tets: Kruskal-Wallis, Hettmansperger-Norton
.. moduleauthor:: Martin Happ <martin.happ@aon.at>
"""
import pandas as pd
import numpy as np
import PyNonpar.pseudorank as ps
import math
import scipy
import scipy.stats
from collections import namedtuple
import PyNonpar.pseudorank
[docs]def kruskal_wallis_test(data, group, pseudoranks = True):
"""
Function to calculate the Kruskal-Wallis test. It is recommended to use pseudo-ranks as ranks may lead to paradoxical results.\n
Null hypothesis H_0: F_1 = ... F_a
Args:
data (list(float)): data from all groups \n
group (list(int)): group factor \n
pseudoranks (bool): True if pseudo-ranks instead of ranks are used \n
Returns:
namedtuple('KruskalWallisResult', ('statistic', 'pvalue')):\n
test statistic (float)\n
p-value (float)
"""
N = len(data)
ranks = [0 for i in range(N)]
if pseudoranks:
ranks = PyNonpar.pseudorank.psrank(data, group)
else:
ranks = PyNonpar.pseudorank.psrank(data, [1 for i in range(N)])
d = {'data': ranks, 'grp': group}
df = pd.DataFrame(data=d)
df["grp"] = df["grp"].astype('category')
df['grp'] = df['grp'].cat.codes
a = len(np.unique(df['grp']))
# numerator
rank_means = [0.0 for i in range(a)]
n = [0 for i in range(a)]
numerator = 0.0
for i in range(a):
n[i] = len(df[df['grp'] == i]['data'])
rank_means[i] = np.mean( df[df['grp'] == i]['data'] )
numerator += n[i] * ( rank_means[i] - (N + 1)*1/2 ) ** 2
denominator = 0.0
for i in range(N):
denominator += ( df['data'][i] - (N + 1)*1/2 ) ** 2
Q_N = numerator*1/denominator*(N - 1)
p_value = 1 - scipy.stats.chi2.cdf(Q_N, a - 1)
result = namedtuple('KruskalWallisResult', ('statistic', 'pvalue'))
output = result(Q_N, p_value)
return output
[docs]def hettmansperger_norton_test(data, group, alternative = "increasing", trend = None):
"""
Function to calculate the Hettmansperger-Norton test.
Args:
data (list(float)): data from all groups \n
group (list(int)): group factor \n
alternative (str): either 'increasing', 'decreasing' or 'custom' \n
trend (list(float)): a vector specifying the alternative; only used, if alternative = 'custom' \n
Returns:
namedtuple('HettmanspergerNortonResult', ('alternative', 'weight', 'statistic', 'pvalue')): \n
chosen alternative (str) \n
trend (list(float))\n
test statistic (float)\n
one sided p-value (float)
References:
Hettmansperger, T. P., & Norton, R. M. (1987). Tests for patterned alternatives in k-sample problems. Journal of the American Statistical Association, 82(397), 292-299.
"""
d = {'data': ps.psrank(data, group, ties_method = "average"), 'grp': group}
df = pd.DataFrame(data=d)
df['grp'] = df['grp'].astype(pd.api.types.CategoricalDtype(categories=np.unique(df['grp'].tolist()), ordered=True))
df = df.sort_values(['grp'])
# dff.assign(grp = dff['grp'][dff.index].values)
# orig_sort = df.index
df = df.reset_index(drop=True)
df['codes'] = df['grp'].cat.codes
# calculate group sizes, number of groups and unweighted relative effects
N = len(df['data'])
a = len(np.unique(df['codes']))
n = [0 for x in range(a)]
p_hat = [0 for x in range(a)]
for i in range(0, a):
tmp = df[df['codes'] == i]
n[i] = len(tmp)
p_hat[i] = 1/N*(np.mean(tmp['data']) - 0.5)
# define weight function for hypothesis testing
w = np.arange(1, (a + 1))
if alternative == "decreasing":
w = w[::-1]
elif alternative == "custom":
w = trend
# defining matrices and vectors for the computing test statistic
v1 = [1 for x in range(a)]
mS = np.diag(n)
mI = np.diag(v1)
mJ = np.outer(v1, v1)
W = np.dot( mS, ( mI - np.dot( 1/N*mJ, mS ) ) )
# variance estimator
sum_psranks_squared = 0
for i in range(N):
sum_psranks_squared += (df['data'].tolist()[i] - (N + 1)/2) ** 2
sum_psranks_squared = (1/N **2)*1/(N-1)*sum_psranks_squared
sigma_part = np.dot(W, w)
sigma_hat_squared = N*sum_psranks_squared*np.dot(np.dot( sigma_part.transpose(), np.linalg.inv(mS) ), sigma_part )
# test statistic
test_hettmansperger = math.sqrt(N)*np.dot(sigma_part.transpose(), p_hat)*1/math.sqrt(sigma_hat_squared)
p_value = 1 - scipy.stats.norm.cdf(test_hettmansperger)
result = namedtuple('HettmanspergerNortonResult', ('alternative', 'weight', 'statistic', 'pvalue'))
output = result(alternative, w, test_hettmansperger, p_value)
return output