Source code for pyleoclim.Timeseries

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 27 11:08:39 2017

@author: deborahkhider

Basic manipulation of timeseries for the pyleoclim module
"""

import numpy as np
import pandas as pd
import warnings


def bin(x, y, bin_size="", start="", end=""):
    """ Bin the values

    Args:
        x (array): the x-axis series.
        y (array): the y-axis series.
        bin_size (float): the size of the bins. Default is the average resolution.
        start (float): where/when to start binning. Default is the minimum.
        end (float): where/when to stop binning. Default is the maximum.

    Returns:
        bins - the bins (centered on the midpoint, i.e., the 100-200 bin is 150) \n
        binned_values - the binned output \n
        n - the number of data points in each bin \n
        error - the standard deviation of the values in each bin

    """
    # Make sure x and y are numpy arrays
    x = np.array(x, dtype='float64')
    y = np.array(y, dtype='float64')

    # Get the bin_size if not available
    if not bin_size:
        bin_size = np.nanmean(np.diff(x))

    # Get the start/end if not given
    if type(start) is str:
        start = np.nanmin(x)
    if type(end) is str:
        end = np.nanmax(x)

    # Set the bin midpoints
    bins = np.arange(start + bin_size/2, end + bin_size/2, bin_size)

    # Perform the calculation
    binned_values = []
    n = []
    error = []
    for val in np.nditer(bins):
        # Indices of the points falling into the current bin
        idx = [idx for idx, c in enumerate(x)
               if c >= (val - bin_size/2) and c < (val + bin_size/2)]
        if y[idx].size == 0:
            binned_values.append(np.nan)
            n.append(np.nan)
            error.append(np.nan)
        else:
            binned_values.append(np.nanmean(y[idx]))
            n.append(y[idx].size)
            error.append(np.nanstd(y[idx]))

    return bins, binned_values, n, error
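
# Usage sketch for `bin` (illustrative only): the `_demo_bin` helper and the
# sample values below are hypothetical, not part of the pyleoclim API.
def _demo_bin():
    t = np.array([0.0, 0.8, 2.1, 2.9, 4.2, 5.0, 6.7])  # irregular x-axis
    v = np.array([1.0, 1.5, 2.0, 2.2, 3.1, 3.0, 4.5])  # values to bin
    # With bin_size=2.0 and the default start/end, the bin centers fall at
    # 1.0, 3.0, 5.0, 7.0, and each bin averages the points it contains.
    bins, binned_values, n, error = bin(t, v, bin_size=2.0)
    return bins, binned_values, n, error
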
def interp(x, y, interp_step="", start="", end=""):
    """ Linear interpolation onto a new x-axis

    Args:
        x (array): the x-axis
        y (array): the y-axis
        interp_step (float): the interpolation step. Default is the mean resolution.
        start (float): where/when to start the interpolation. Default is the minimum.
        end (float): where/when to stop the interpolation. Default is the maximum.

    Returns:
        xi - the interpolated x-axis \n
        interp_values - the interpolated values

    """
    # Make sure x and y are numpy arrays
    x = np.array(x, dtype='float64')
    y = np.array(y, dtype='float64')

    # Get the interpolation step if not available
    if not interp_step:
        interp_step = np.nanmean(np.diff(x))

    # Get the start and end points if not given
    if type(start) is str:
        start = np.nanmin(np.asarray(x))
    if type(end) is str:
        end = np.nanmax(np.asarray(x))

    # Get the interpolated x-axis
    xi = np.arange(start, end, interp_step)

    # Make sure the x-axis is increasing before interpolating
    data = pd.DataFrame({"x-axis": x, "y-axis": y}).sort_values('x-axis')

    interp_values = np.interp(xi, data['x-axis'], data['y-axis'])

    return xi, interp_values
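
# Usage sketch for `interp` (illustrative only): the `_demo_interp` helper and
# the sample values are hypothetical.
def _demo_interp():
    t = np.array([0.0, 1.3, 2.1, 3.8, 5.0])  # irregular x-axis
    v = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
    # Resample onto a regular 0.5-unit grid between the series min and max;
    # values between samples are filled linearly by np.interp.
    xi, vi = interp(t, v, interp_step=0.5)
    return xi, vi
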
def onCommonAxis(x1, y1, x2, y2, interp_step="", start="", end=""):
    """ Places two timeseries on a common x-axis

    Args:
        x1 (array): x-axis values of the first timeseries
        y1 (array): y-axis values of the first timeseries
        x2 (array): x-axis values of the second timeseries
        y2 (array): y-axis values of the second timeseries
        interp_step (float): the interpolation step. Default is the smaller of
            the two series' mean resolutions.
        start (float): where/when to start. Default is the maximum of the
            minima of the two timeseries.
        end (float): where/when to end. Default is the minimum of the maxima
            of the two timeseries.

    Returns:
        xi - the common interpolated x-axis \n
        interp_values1 - the interpolated y-values for the first timeseries \n
        interp_values2 - the interpolated y-values for the second timeseries

    """
    # Make sure that x1, y1, x2, y2 are numpy arrays
    x1 = np.array(x1, dtype='float64')
    y1 = np.array(y1, dtype='float64')
    x2 = np.array(x2, dtype='float64')
    y2 = np.array(y2, dtype='float64')

    # Find the start/end of the overlapping x-axis if not provided
    if type(start) is str:
        start = np.nanmax([np.nanmin(x1), np.nanmin(x2)])
    if type(end) is str:
        end = np.nanmin([np.nanmax(x1), np.nanmax(x2)])

    # Get the interp_step if not given
    if not interp_step:
        interp_step = np.nanmin([np.nanmean(np.diff(x1)), np.nanmean(np.diff(x2))])

    # Perform the interpolation
    xi, interp_values1 = interp(x1, y1, interp_step=interp_step, start=start, end=end)
    xi, interp_values2 = interp(x2, y2, interp_step=interp_step, start=start, end=end)

    return xi, interp_values1, interp_values2
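
# Usage sketch for `onCommonAxis` (illustrative only): two hypothetical series
# with different, overlapping x-axes placed on one grid.
def _demo_onCommonAxis():
    t1 = np.array([0.0, 1.0, 2.0, 3.0, 4.0])       # coarser series
    v1 = np.array([0.0, 1.0, 0.0, -1.0, 0.0])
    t2 = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0])  # finer, shorter series
    v2 = np.array([1.0, 2.0, 1.0, 0.0, 1.0, 2.0])
    # The common axis runs from max of the minima (0.5) to min of the maxima
    # (3.0), using the smaller of the two mean steps (here 0.5) as the step.
    xi, vi1, vi2 = onCommonAxis(t1, v1, t2, v2)
    return xi, vi1, vi2
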
def standardize(x, scale=1, axis=0, ddof=0, eps=1e-3):
    """ Centers and normalizes a given time series. Constant or nearly constant time series are not rescaled.

    Args:
        x (array): vector of (real) numbers as a time series, NaNs allowed
        scale (real): a scale factor used to scale a record to match a given variance
        axis (int or None): axis along which to operate, if None, compute over the whole array
        ddof (int): degrees of freedom correction in the calculation of the standard deviation
        eps (real): a threshold to determine if the standard deviation is too close to zero

    Returns:
        z (array): the standardized time series (z-score), Z = (X - E[X])/std(X)*scale, NaNs allowed
        mu (real): the mean of the original time series, E[X]
        sig (real): the standard deviation of the original time series, std[X]

    References:
        1. Tapio Schneider's MATLAB code: http://www.clidyn.ethz.ch/imputation/standardize.m
        2. The zscore function in SciPy: https://github.com/scipy/scipy/blob/master/scipy/stats/stats.py

    @author: fzhu
    """
    x = np.asanyarray(x)
    assert x.ndim <= 2, 'The time series x should be a vector or 2-D array!'

    mu = np.nanmean(x, axis=axis)  # the mean of the original time series
    sig = np.nanstd(x, axis=axis, ddof=ddof)  # the std of the original time series

    mu2 = np.copy(mu)  # the mean used in the calculation of zscore
    sig2 = np.copy(sig) / scale  # the std used in the calculation of zscore

    if np.any(np.abs(sig) < eps):  # check if x contains (nearly) constant time series
        warnings.warn('Constant or nearly constant time series not rescaled.')
        where_const = np.where(np.abs(sig) < eps)  # find out where we have (nearly) constant time series

        # if a vector is (nearly) constant, keep it the same as the original,
        # i.e., subtract 0 and divide by 1
        mu2[where_const] = 0
        sig2[where_const] = 1

    if axis and mu.ndim < x.ndim:
        z = (x - np.expand_dims(mu2, axis=axis)) / np.expand_dims(sig2, axis=axis)
    else:
        z = (x - mu2) / sig2

    return z, mu, sig
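
# Usage sketch for `standardize` (illustrative only): z-scoring a short
# hypothetical series and recovering the original from the returned moments.
def _demo_standardize():
    x = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
    z, mu, sig = standardize(x)
    # z has zero mean and unit standard deviation (NaNs are ignored);
    # the original series can be recovered as z * sig + mu.
    x_back = z * sig + mu
    return z, mu, sig, x_back
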
def ts2segments(ys, ts, factor):
    ''' Chop a time series into several segments based on gap detection.

    The gap-detection rule is simple: if the time interval between two
    consecutive data points is larger than `factor` times the mean time
    interval between neighbouring data points, a breaking point is assumed
    and the series is chopped into two pieces there.

    Args:
        ys (array): a time series, NaNs allowed
        ts (array): the time points
        factor (float): factor to adjust the threshold, threshold = factor * dt_mean

    Returns:
        seg_ys (list): a list of several segments with potentially different lengths
        seg_ts (list): a list of the time axes of the several segments
        n_segs (int): the number of segments

    @author: fzhu
    '''
    # Delete the NaNs if there are any
    ys_tmp = np.copy(ys)
    ys = ys[~np.isnan(ys_tmp)]
    ts = ts[~np.isnan(ys_tmp)]

    nt = np.size(ts)
    dts = np.diff(ts)
    dt_mean = np.mean(dts)

    threshold = factor * dt_mean
    n_breaks = np.size(dts[dts > threshold])  # the number of breaks
    n_segs = n_breaks + 1

    seg_ys, seg_ts = [], []  # store the segments in lists

    i_start = 0
    for i in range(nt-1):
        if dts[i] > threshold:
            i_end = i + 1
            seg_ys.append(ys[i_start:i_end])
            seg_ts.append(ts[i_start:i_end])
            i_start = i_end
    seg_ys.append(ys[i_start:nt])
    seg_ts.append(ts[i_start:nt])

    return seg_ys, seg_ts, n_segs
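
# Usage sketch for `ts2segments` (illustrative only): the sample series below
# is hypothetical; it has one gap, between t=4 and t=10, that exceeds
# factor * dt_mean, so two segments come back.
def _demo_ts2segments():
    ts = np.array([1.0, 2.0, 3.0, 4.0, 10.0, 11.0, 12.0])
    ys = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7])
    seg_ys, seg_ts, n_segs = ts2segments(ys, ts, factor=2.0)
    return seg_ys, seg_ts, n_segs  # n_segs == 2
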