# (C) Copyright IBM Corp. 2019, 2020, 2021, 2022.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import h5py
from scipy.integrate import quadrature
from typing import Union
from simulai.metrics import MemorySizeEval
from simulai.batching import batchdomain_constructor
from simulai.abstract import ROM
class HardPositivityLimiting:
def __init__(self, tol:float=1e-10) -> None:
""""Positivity limiting for avoiding negative values when they are
physically inconsistent.
It simply applies value >= tol for avoiding negative values.
:param tol: The minimum acceptable value to ensure positivity
:type tol: float
"""
self.tol = tol
    def _apply_limiting(self, data:np.ndarray=None) -> (np.ndarray, int):
        """Effectively applying the limiting
        :param data: the data to be limited
        :type data: np.ndarray
        :return: the limited data and the number of points in which the limiting was applied
        :rtype: (np.ndarray, int)
        """
        indices = np.where(data < self.tol)
        data[indices] = self.tol
        return data, indices[0].size
def __call__(self, data:np.ndarray=None,
batch_size:Union[int, MemorySizeEval]=None) -> (np.ndarray, int):
"""The main call method
:param data: the data to be processed by the limiting
:type data: np.ndarray
        :param batch_size: the size of each batch to be processed, or an object for estimating it
        :type batch_size: Union[int, MemorySizeEval]
        :return: a tuple containing the processed data and the number of points in which
            the limiting was applied
"""
        if isinstance(data, np.ndarray):
            return self._apply_limiting(data=data)
        elif isinstance(data, h5py.Dataset):
            assert batch_size, "It is necessary to provide a way for estimating the " \
                               "batch size."
            n_samples = data.shape[0]
            if isinstance(batch_size, MemorySizeEval):
                batch_size = batch_size(max_batches=n_samples, shape=data.shape[1:])
            # Constructing the batches used for processing the dataset in chunks
            batches = batchdomain_constructor([0, n_samples], batch_size)
variables_names = data.dtype.names
n_variables = len(variables_names)
total_number_of_limited = 0
        for batch_id, batch in enumerate(batches):
chunk_data = data[slice(*batch)].view(float)
limited_chunk_data, number_of_applied = self._apply_limiting(data=chunk_data)
chunk_data = np.core.records.fromarrays(np.split(limited_chunk_data, n_variables, axis=1),
names=variables_names,
formats=','.join(len(variables_names) * ['f8']))
data[slice(*batch)] = chunk_data
total_number_of_limited += number_of_applied
return data, total_number_of_limited
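# A minimal usage sketch (assumed, not part of the original module): clipping the
# negative entries of an in-memory array. For an h5py.Dataset, a batch_size (or a
# MemorySizeEval instance) must also be provided.
#
#     limiter = HardPositivityLimiting(tol=1e-10)
#     limited, n_limited = limiter(data=np.array([[-1.0, 0.5], [2.0, -3.0]]))
#     # limited == [[1e-10, 0.5], [2.0, 1e-10]] and n_limited == 2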
class TimeAveraging:
def __init__(self, batch_size:int=None, axis:int=None) -> None:
"""Time-averaging along an axis
:param batch_size: the size of the batch to be processed at each time
:type batch_size: int
        :param axis: the axis along which to apply the time-averaging
:type axis: int
:return: nothing
"""
self.batch_size = batch_size
        if axis is None:
            self.axis = 1
        else:
            self.axis = axis
    def exec(self, dataset:Union[np.ndarray, h5py.Dataset], path:str=None) -> h5py.Dataset:
"""Executing time-averaging
:param dataset: The dataset to be time-averaged
:type dataset: Union[np.ndarray, h5py.Dataset]
        :param path: the path in which the output HDF5 file will be saved
:type path: str
:return: HDF5 dataset with the time-averaged original data
:rtype: h5py.Dataset
"""
fp = h5py.File(path, 'w')
output_n_samples = int(dataset.shape[0]/self.batch_size)
other_dimensions = dataset.shape[1:]
final_shape = (output_n_samples, ) + other_dimensions
dtype = [(name, 'f8') for name in dataset.dtype.names]
fp.create_dataset('data', shape=final_shape, dtype=dtype)
output_dataset = fp.get('data')
        # The dataset is assumed to be structured (with named fields), as stored in an HDF5 file
n_variables = len(output_dataset.dtype.names)
formats = n_variables*['f8']
for ii in range(0, output_n_samples):
mini_batch = dataset[ii*self.batch_size:(ii+1)*self.batch_size]
mean_value = [mini_batch[name].mean(0) for name in dataset.dtype.names]
mean_value_structured = np.core.records.fromarrays(mean_value, names=dataset.dtype.names,
formats=formats)
output_dataset[ii] = mean_value_structured
return output_dataset
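# A minimal usage sketch (assumed): averaging a structured dataset over non-overlapping
# windows of 10 samples and writing the result to a new HDF5 file. The file names
# 'data.h5' and 'averaged.h5' are illustrative only.
#
#     with h5py.File('data.h5', 'r') as fp_in:
#         averager = TimeAveraging(batch_size=10)
#         averaged = averager.exec(dataset=fp_in.get('data'), path='averaged.h5')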
class SVDThreshold:
def __init__(self) -> None:
"""Methodology for defining a threshold for de-noising data via SVD decomposition.
:return: nothing
"""
self.limit_aspect_ratio = 0.1
    def lambda_special(self, beta:float=None) -> float:
"""Evaluating special lambda parameter
:param beta: parameter beta
:type beta: float
:return: special lambda
:rtype: float
"""
return np.sqrt(2 * (beta + 1) + 8 * beta / ((beta + 1) + math.sqrt(beta ** 2 + 14 * beta + 1)))
    def lambda_function(self, beta:float=None, shape:Union[tuple, list]=None) -> float:
        """Selecting the lambda coefficient according to the aspect ratio of the data matrix
        :param beta: parameter beta
        :type beta: float
        :param shape: shape of the data matrix
        :type shape: Union[tuple, list]
        :return: the lambda coefficient
        """
        if shape[0] == shape[1]:
            return beta
        elif shape[1]/shape[0] < self.limit_aspect_ratio:
            return self.lambda_special(beta)
        else:
            raise Exception(f"This case is not covered: the data matrix is neither square"
                            f" nor satisfies m/n < {self.limit_aspect_ratio} (m << n).")
    def beta_parameter(self, shape:Union[list, tuple]=None) -> float:
        """Evaluating the aspect-ratio parameter beta of the data matrix."""
        n, m = shape
# In case of square data matrices
if n == m:
return 4/np.sqrt(3)
elif n < m:
return n/m
else:
return m/n
    def integrand(self, t:float, beta:float=None) -> np.ndarray:
        """Marcenko-Pastur probability density evaluated at t (zero outside its support)
        :param t: evaluation point
        :type t: float
        :param beta: parameter beta
        :type beta: float
        """
        monomial_upper = float((1 + np.sqrt(beta))**2)
        monomial_lower = float((1 - np.sqrt(beta))**2)
        out = np.where((monomial_upper - t)*(t - monomial_lower) > 0,
                       np.sqrt((monomial_upper - t)*(t - monomial_lower))/(2*np.pi*beta*t),
                       0)
        return out
    def Marcenko_Pastur_integral(self, t:float, beta:float=None) -> float:
        """Marcenko-Pastur integral (upper-tail integral of the Marcenko-Pastur density)
        :param t: the lower limit of integration
        :type t: float
:param beta: parameter beta
:type beta: float
:return: evaluation of the Marcenko-Pastur integral
:rtype: float
"""
upper_lim = (1 + np.sqrt(beta)) ** 2
val, err = quadrature(self.integrand, t, upper_lim, args=(beta,))
print(f"Quadrature error: {err}")
return val
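    # NOTE: the method ``exec`` below relies on ``self.MedianMarcenkoPastur``, which is not
    # defined in this excerpt. The sketch below is an assumption about its behaviour: it
    # evaluates the median of the Marcenko-Pastur distribution by bisecting its upper-tail
    # integral (Marcenko_Pastur_integral) until the integral reaches 0.5.
    def MedianMarcenkoPastur(self, beta:float=None) -> float:
        lower = (1 - np.sqrt(beta)) ** 2
        upper = (1 + np.sqrt(beta)) ** 2
        # The median t solves integral_t^upper p(x) dx = 1/2; the integral decreases in t
        for _ in range(40):
            middle = (lower + upper) / 2
            if self.Marcenko_Pastur_integral(middle, beta=beta) > 0.5:
                lower = middle
            else:
                upper = middle
        return (lower + upper) / 2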
    def exec(self, singular_values:np.ndarray=None,
             data_shape:Union[tuple, list]=None, gamma:float=None) -> np.ndarray:
        """Truncating the singular values according to the evaluated threshold
        :param singular_values: the singular values to be filtered
        :type singular_values: np.ndarray
        :param data_shape: shape of the data matrix
        :type data_shape: Union[tuple, list]
        :param gamma: the noise level, when it is known a priori
        :type gamma: float
        :return: the truncated singular values
        :rtype: np.ndarray
        """
        print("Executing SVD filtering.")
        n, m = data_shape
        beta = self.beta_parameter(shape=data_shape)
        if gamma is not None:
            lambda_parameter = self.lambda_function(beta=beta, shape=data_shape)
            tau_parameter = lambda_parameter*np.sqrt(n)*gamma
        else:
            sigma_med = np.median(singular_values)
            lambda_parameter = self.lambda_special(beta=beta)
            mu_parameter = self.MedianMarcenkoPastur(beta)
            tau_parameter = (lambda_parameter/mu_parameter)*sigma_med
        sv = singular_values[singular_values >= tau_parameter]
        print(f"SVD threshold: {tau_parameter}."
              f" Retaining {len(sv)} of {len(singular_values)} components.")
        return sv
    def apply(self, pca:ROM=None, data_shape:Union[tuple, list]=None) -> ROM:
        """Applying the singular-value truncation to a ROM (e.g. PCA) object."""
        assert hasattr(pca, "singular_values"), f"The object {type(pca)} has no attribute singular_values."
singular_values_truncated = self.exec(singular_values=pca.singular_values, data_shape=data_shape)
pca.singular_values = singular_values_truncated
pca.modes = pca.modes[:len(singular_values_truncated), :]
return pca
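# A minimal usage sketch (assumed; `pca` stands for any fitted ROM object exposing the
# attributes `singular_values` and `modes`, e.g. a POD/PCA implementation of simulai's
# ROM interface, and `snapshots` is the corresponding data matrix):
#
#     thresholder = SVDThreshold()
#     pca = thresholder.apply(pca=pca, data_shape=snapshots.shape)
#     # pca now keeps only the singular values (and modes) above the noise threshold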
class TimeSeriesExtremes:
    """
    Getting the indices corresponding to the local extremes of time-series
    """
def __init__(self):
pass
    def _curvature_change_indicator(self, data:np.ndarray=None, index:int=None) -> np.ndarray:
        # A negative product between the previous and the next samples indicates a sign change;
        # since the input is the gradient of the series, this flags a local extremum.
        assert type(index) == int, f"index must be an integer but received {type(index)}."
        previous = data[:-2, index]
        posterior = data[2:, index]
        return previous*posterior
def _get_indices_for_extremes(self, data:np.ndarray=None, index:int=None):
z = np.zeros(data.shape[0])
z[1:-1] = self._curvature_change_indicator(data=data, index=index)
indices = np.where(z < 0)[0]
return indices
    def apply(self, gradient_input_data:np.ndarray=None, column:int=None):
        """Returning the indices where the gradient data changes sign, which correspond to the
        local extremes of the original time-series, either for a single column or for all of them."""
if column is not None:
indices = self._get_indices_for_extremes(data=gradient_input_data, index=column)
return (indices,)
else:
indices_list = list()
for cln in range(gradient_input_data.shape[1]):
indices = self._get_indices_for_extremes(data=gradient_input_data, index=cln)
indices_list.append(indices)
return tuple(indices_list)
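# A minimal usage sketch (assumed): locating the extremes of sin(t) from its numerical
# gradient.
#
#     t = np.linspace(0, 4*np.pi, 1000)
#     series = np.sin(t)[:, None]
#     grad = np.gradient(series, t, axis=0)
#     (indices,) = TimeSeriesExtremes().apply(gradient_input_data=grad, column=0)
#     # t[indices] lies close to the maxima/minima of sin(t)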