# Source code for pyleoclim.Stats

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 27 10:46:59 2017

@author: deborahkhider

Statistics toolbox for pyleoclim

"""

import numpy as np
from scipy.stats import pearsonr
from scipy.stats.mstats import gmean
from scipy.stats import t as stu
from scipy.stats import gaussian_kde
import statsmodels.api as sm
from sklearn import preprocessing
from tqdm import tqdm


"""
Simple stats
"""


def simpleStats(y, axis=None):
    """ Computes simple statistics

    Computes the mean, median, min, max, standard deviation, and
    interquartile range of a numpy array y. NaN values are ignored.

    Args:
        y (array): A Numpy array
        axis (int, tuple of ints): Optional. Axis or axes along which the
            statistics are computed; the default is to compute over the
            flattened array. If a tuple of ints, performed over multiple axes

    Returns:
        The mean, median, min, max, standard deviation and IQR by columns
    """
    # coerce input to a float array so NaN-aware reductions work
    values = np.array(y, dtype='float64')

    # upper and lower quartiles in one call; their difference is the IQR
    q75, q25 = np.nanpercentile(values, [75, 25], axis=axis)

    return (np.nanmean(values, axis=axis),
            np.nanmedian(values, axis=axis),
            np.nanmin(values, axis=axis),
            np.nanmax(values, axis=axis),
            np.nanstd(values, axis=axis),
            q75 - q25)
""" Correlation """ class Correlation(object): """ Estimates the significance of correlations """ def corr_sig(self, y1, y2, nsim=1000, method='isospectral', alpha=0.5): """ Estimates the significance of correlations between non IID time series by 3 independent methods: 1) 'ttest': T-test where d.o.f are corrected for the effect of serial correlation 2) 'isopersistent': AR(1) modeling of x and y. 3) 'isospectral': phase randomization of original inputs. (default) The T-test is parametric test, hence cheap but usually wrong except in idyllic circumstances. The others are non-parametric, but their computational requirements scales with nsim. Args: y1, y2 (array)- vector of (real) numbers of identical length, no NaNs allowed nsim (int)- the number of simulations [1000] method (str)- methods 1-3 above ['isospectral'] alpha (float)- significance level for critical value estimation [0.05] Returns: r (real): correlation between x and y \n signif (boolean): true (1) if significant; false (0) otherwise \n p (real): Fraction of time series with higher correlation coefficents than observed (approximates the p-value). \n Note that signif = True if and only if p <= alpha. """ y1 = np.array(y1, dtype=float) y2 = np.array(y2, dtype=float) assert np.size(y1) == np.size(y2), 'The size of X and the size of Y should be the same!' if method == 'ttest': (r, signif, p) = self.corr_ttest(y1, y2, alpha=alpha) elif method == 'isopersistent': (r, signif, p) = self.corr_isopersist(y1, y2, alpha=alpha, nsim=nsim) elif method == 'isospectral': (r, signif, p) = self.corr_isospec(y1, y2, alpha=alpha, nsim=nsim) return r, signif, p def corr_ttest(self, y1, y2, alpha=0.05): """ Estimates the significance of correlations between 2 time series using the classical T-test with degrees of freedom modified for autocorrelation. This function creates 'nsim' random time series that have the same power spectrum as the original time series but with random phases. 
Args: y1, y2 (array): vectors of (real) numbers with identical length, no NaNs allowed alpha (real): significance level for critical value estimation [default: 0.05] Returns: r (real)- correlation between x and y \n signif (boolean)- true (1) if significant; false (0) otherwise \n pval (real)- test p-value (the probability of the test statstic exceeding the observed one by chance alone) """ r = pearsonr(y1, y2)[0] g1 = self.ar1_fit(y1) g2 = self.ar1_fit(y2) N = np.size(y1) Ney1 = N * (1-g1) / (1+g1) Ney2 = N * (1-g2) / (1+g2) Ne = gmean([Ney1+Ney2]) assert Ne >= 10, 'Too few effective d.o.f. to apply this method!' df = Ne - 2 t = np.abs(r) * np.sqrt(df/(1-r**2)) pval = 2 * stu.cdf(-np.abs(t), df) signif = pval <= alpha return r, signif, pval def corr_isopersist(self, y1, y2, alpha=0.05, nsim=1000): ''' Computes correlation between two timeseries, and their significance. The latter is gauged via a non-parametric (Monte Carlo) simulation of correlations with nsim AR(1) processes with identical persistence properties as x and y ; the measure of which is the lag-1 autocorrelation (g). Args: y1, y2 (array): vectors of (real) numbers with identical length, no NaNs allowed alpha (real): significance level for critical value estimation [default: 0.05] nsim (int): number of simulations [default: 1000] Returns: r (real) - correlation between x and y \n signif (boolean) - true (1) if significant; false (0) otherwise \n pval (real) - test p-value (the probability of the test statstic exceeding the observed one by chance alone) Remarks: The probability of obtaining a test statistic at least as extreme as the one actually observed, assuming that the null hypothesis is true. 
\n The test is 1 tailed on |r|: Ho = { |r| = 0 }, Ha = { |r| > 0 } \n The test is rejected (signif = 1) if pval <= alpha, otherwise signif=0; \n (Some Rights Reserved) Hepta Technologies, 2009 \n v1.0 USC, Aug 10 2012, based on corr_signif.m ''' r = pearsonr(y1, y2)[0] ra = np.abs(r) y1_red, g1 = self.isopersistent_rn(y1, nsim) y2_red, g2 = self.isopersistent_rn(y2, nsim) rs = np.zeros(nsim) for i in np.arange(nsim): rs[i] = pearsonr(y1_red[:, i], y2_red[:, i])[0] rsa = np.abs(rs) xi = np.linspace(0, 1.1*np.max([ra, np.max(rsa)]), 200) kde = gaussian_kde(rsa) prob = kde(xi).T diff = np.abs(ra - xi) # min_diff = np.min(diff) pos = np.argmin(diff) pval = np.trapz(prob[pos:], xi[pos:]) rcrit = np.percentile(rsa, 100*(1-alpha)) signif = ra >= rcrit return r, signif, pval def isopersistent_rn(self, X, p): ''' Generates p realization of a red noise [i.e. AR(1)] process with same persistence properties as X (Mean and variance are also preserved). Args: X (array): vector of (real) numbers as a time series, no NaNs allowed p (int): number of simulations Returns: red (matrix) - n rows by p columns matrix of an AR1 process, where n is the size of X \n g (real) - lag-1 autocorrelation coefficient Remarks: (Some Rights Reserved) Hepta Technologies, 2008 ''' n = np.size(X) sig = np.std(X, ddof=1) g = self.ar1_fit(X) # red = red_noise(N, M, g) red = self.ar1_sim(n, p, g, sig) return red, g def ar1_fit(self, ts): ''' Return the lag-1 autocorrelation from ar1 fit. 
Args: ts (array): vector of (real) numbers as a time series Returns: g (real): lag-1 autocorrelation coefficient ''' ar1_mod = sm.tsa.AR(ts, missing='drop').fit(maxlag=1, trend='nc') g = ar1_mod.params[0] return g def ar1_sim(self, n, p, g, sig): ''' Produce p realizations of an AR1 process of length n with lag-1 autocorrelation g Args: n, p (int): dimensions as n rows by p columns g (real): lag-1 autocorrelation coefficient sig (real): the standard deviation of the original time series Returns: red (matrix): n rows by p columns matrix of an AR1 process ''' # specify model parameters (statsmodel wants lag0 coefficents as unity) ar = np.r_[1, -g] # AR model parameter ma = np.r_[1, 0.0] # MA model parameters sig_n = sig*np.sqrt(1-g**2) # theoretical noise variance for red to achieve the same variance as X red = np.empty(shape=(n, p)) # declare array # simulate AR(1) model for each column for i in np.arange(p): red[:, i] = sm.tsa.arma_generate_sample(ar=ar, ma=ma, nsample=n, burnin=50, sigma=sig_n) return red def red_noise(self, N, M, g): ''' Produce M realizations of an AR1 process of length N with lag-1 autocorrelation g Args: N, M (int): dimensions as N rows by M columns g (real): lag-1 autocorrelation coefficient Returns: red (matrix): N rows by M columns matrix of an AR1 process Remarks: (Some Rights Reserved) Hepta Technologies, 2008 J.E.G., GaTech, Oct 20th 2008 ''' red = np.zeros(shape=(N, M)) red[0, :] = np.random.randn(1, M) for i in np.arange(1, N): red[i, :] = g * red[i-1, :] + np.random.randn(1, M) return red def corr_isospec(self, y1, y2, alpha=0.05, nsim=1000): ''' Phase randomization correltation estimates Estimates the significance of correlations between non IID time series by phase randomization of original inputs. This function creates 'nsim' random time series that have the same power spectrum as the original time series but random phases. 
Args: y1, y2 (array): vectors of (real) numbers with identical length, no NaNs allowed alpha (real): significance level for critical value estimation [default: 0.05] nsim (int): number of simulations [default: 1000] Returns: r (real): correlation between y1 and y2 \n signif (boolean): true (1) if significant; false (0) otherwise \n F : Fraction of time series with higher correlation coefficents than observed (approximates the p-value). References: - Ebisuzaki, W, 1997: A method to estimate the statistical significance of a correlation when the data are serially correlated. J. of Climate, 10, 2147-2153. - Prichard, D., Theiler, J. Generating Surrogate Data for Time Series with Several Simultaneously Measured Variables (1994) Physical Review Letters, Vol 73, Number 7 (Some Rights Reserved) USC Climate Dynamics Lab, 2012. ''' r = pearsonr(y1, y2)[0] # generate phase-randomized samples using the Theiler & Prichard method Y1surr = self.phaseran(y1, nsim) Y2surr = self.phaseran(y2, nsim) # compute correlations Y1s = preprocessing.scale(Y1surr) Y2s = preprocessing.scale(Y2surr) n = np.size(y1) C = np.dot(np.transpose(Y1s), Y2s) / (n-1) rSim = np.diag(C) # compute fraction of values higher than observed F = np.sum(np.abs(rSim) >= np.abs(r)) / nsim # establish significance signif = F < alpha # significant or not? return r, signif, F def phaseran(self, recblk, nsurr): ''' Phaseran by Carlos Gias http://www.mathworks.nl/matlabcentral/fileexchange/32621-phase-randomization/content/phaseran.m Args: recblk (2D array): Row: time sample. Column: recording. An odd number of time samples (height) is expected. If that is not the case, recblock is reduced by 1 sample before the surrogate data is created. The class must be double and it must be nonsparse. nsurr (int): is the number of image block surrogates that you want to generate. Returns: surrblk: 3D multidimensional array image block with the surrogate datasets along the third dimension Reference: Prichard, D., Theiler, J. 
Generating Surrogate Data for Time Series with Several Simultaneously Measured Variables (1994) Physical Review Letters, Vol 73, Number 7 ''' # Get parameters nfrms = recblk.shape[0] if nfrms % 2 == 0: nfrms = nfrms-1 recblk = recblk[0:nfrms] len_ser = int((nfrms-1)/2) interv1 = np.arange(1, len_ser+1) interv2 = np.arange(len_ser+1, nfrms) # Fourier transform of the original dataset fft_recblk = np.fft.fft(recblk) surrblk = np.zeros((nfrms, nsurr)) for k in tqdm(np.arange(nsurr)): ph_rnd = np.random.rand(len_ser) # Create the random phases for all the time series ph_interv1 = np.exp(2*np.pi*1j*ph_rnd) ph_interv2 = np.conj(np.flipud(ph_interv1)) # Randomize all the time series simultaneously fft_recblk_surr = np.copy(fft_recblk) fft_recblk_surr[interv1] = fft_recblk[interv1] * ph_interv1 fft_recblk_surr[interv2] = fft_recblk[interv2] * ph_interv2 # Inverse transform surrblk[:, k] = np.real(np.fft.ifft(fft_recblk_surr)) return surrblk
def corrsig(y1, y2, nsim=1000, method='isospectral', alpha=0.05):
    """ Estimates the significance of correlations between non IID time series
    by 3 independent methods:
    1) 'ttest': T-test where d.o.f are corrected for the effect of serial correlation
    2) 'isopersistent': AR(1) modeling of x and y.
    3) 'isospectral': phase randomization of original inputs. (default)

    The T-test is a parametric test, hence cheap but usually wrong except in
    idyllic circumstances. The others are non-parametric, but their
    computational requirements scale with nsim.

    Args:
        y1, y2 (array): vector of (real) numbers of identical length, no NaNs allowed
        nsim (int): the number of simulations [default: 1000]
        method (str): methods 1-3 above [default: 'isospectral']
        alpha (float): significance level for critical value estimation [default: 0.05]

    Returns:
        r (real): correlation between x and y \n
        signif (int): true if significant; false otherwise \n
        p (real): Fraction of time series with higher correlation coefficents
        than observed (approximates the p-value). \n
        Note that signif = True if and only if p <= alpha.
    """
    # NOTE: default alpha was 0.5, contradicting the documented 0.05;
    # fixed so the default matches the docstring (and Correlation.corr_sig).
    corr = Correlation()
    r, signif, p = corr.corr_sig(y1, y2, nsim=nsim, method=method, alpha=alpha)

    return r, signif, p