# (C) Copyright IBM Corp. 2019, 2020, 2021, 2022.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import sys
import h5py
import psutil
import numpy as np
import dask.array as da
from typing import Union
from simulai.math.integration import RK4
from simulai.abstract import DataPreparer
from simulai.batching import batchdomain_constructor
class ByPass:
name = "no_metric"
def __init__(self):
pass
class L2Norm:
name = "l2_norm"
def __init__(self, mask:Union[str, float, None]=None, do_clean_data:bool=True,
large_number:float=1e15, default_data:float=0.0) -> None:
"""It evaluates a L^2 norm comparing an approximation and its reference value
:param mask: if there are masked or missing data, it informs what kind of value is filling these gaps
:type mask: Union[str, float, None]
:param do_clean_data: Execute a data cleaning (removing large numbers and NaN) or not ?
:type do_clean_data: bool
:returns: nothing
"""
self.large_number = large_number
self.mask = mask
self.do_clean_data = do_clean_data
self.default_data = default_data
def _clean_nan_and_large_single(self, d:np.ndarray) -> np.ndarray:
""" It removes NaNs and large numbers from a single array
:param d: data to be cleaned
:type d: np.ndarray
:returns: the data cleaned
:rtype: np.ndarray
"""
if self.mask is not None:
is_mask = d == self.mask
if np.any(is_mask):
d = np.where(is_mask, self.default_data, d)
print(f"There are values equal to the mask=={self.mask} in the dataset. They will be replaced with {self.default_data}.")
isnan_data = np.isnan(d)
if np.any(isnan_data):
d = np.where(isnan_data, self.default_data, d)
print(f"There are NaN's in the dataset. They will be replaced with {self.default_data}.")
is_large = np.abs(d) >= self.large_number
if np.any(is_large):
d = np.where(is_large, self.default_data, d)
print(f"There are large numbers, i.e., abs(x) > {self.large_number}, in the dataset. They will be replaced with {self.default_data}.")
return d
def _clean_nan_and_large(self, data:np.ndarray, reference_data:np.ndarray) -> (np.ndarray, np.ndarray):
"""It removes NaNs and large number of the input and the reference dataset
:param data: the data to be evaluated in the norm
:type data: np.ndarray
:param reference_data: the data to used as comparison
:type reference_data: np.ndarray
:returns: both the datasets after have been cleaned
:rtype: (np.ndarray, np.ndarray)
"""
if self.do_clean_data:
data = self._clean_nan_and_large_single(data)
reference_data = self._clean_nan_and_large_single(reference_data)
return data, reference_data
def _batchwise_error(self, data:Union[np.ndarray, h5py.Dataset]=None,
reference_data:Union[np.ndarray, h5py.Dataset]=None,
relative_norm:bool=False, data_interval:list=None, batch_size:int=1) -> float:
"""It evaluated the error over a single batch a time
:param data: the data to be usd for assess the norm
:type data: Union[np.ndarray, h5py.Dataset]
:param reference_data: the data to use as comparison
:type data: Union[np.ndarray, h5py.Dataset]
:param relative_norm: use relative norm or not ?
:type relative_norm: bool
:param data_interval: the interval over the samples' axis to use for evaluating the norm
:type data_interval: list
:param batch_size: the maximum size of each batch
:type batch_size: int
:returns: the total norm
:rtype: float
"""
batches = batchdomain_constructor(data_interval, batch_size)
accumulated_error = 0
accumulated_ref = 0
for batch_id, batch in enumerate(batches):
chunk_array = data[slice(*batch)]
print(f"Evaluating error for the batch {batch_id+1}/{len(batches)} batch_size={chunk_array.shape[0]}")
chunk_ref_array = reference_data[slice(*(batch + data_interval[0]))]
if chunk_array.dtype.names:
chunk_array_numeric = np.concatenate([chunk_array[var]
for var in chunk_array.dtype.names], axis=1)
chunk_ref_array_numeric = np.concatenate([chunk_ref_array[var]
for var in chunk_ref_array.dtype.names], axis=1)
else:
chunk_array_numeric = chunk_array
chunk_ref_array_numeric = chunk_ref_array
chunk_array_numeric, chunk_ref_array_numeric = self._clean_nan_and_large(chunk_array_numeric,
chunk_ref_array_numeric)
accumulated_error += np.sum(np.square(chunk_array_numeric.flatten()
- chunk_ref_array_numeric.flatten()))
accumulated_ref += np.sum(np.square(chunk_ref_array_numeric.flatten()))
if relative_norm:
return np.sqrt(accumulated_error/accumulated_ref)
else:
return np.sqrt(accumulated_error)
def __call__(self, data:Union[np.ndarray, da.core.Array, h5py.Dataset]=None,
reference_data:Union[np.ndarray, da.core.Array, h5py.Dataset]=None,
relative_norm:bool=False, data_interval:list=None, batch_size:int=1, ord:int=2) -> float:
"""It evaluated the error over a single batch a time
:param data: the data to be usd for assess the norm
:type data: Union[np.ndarray, da.core.Array, h5py.Dataset]
:param reference_data: the data to use as comparison
:type data: Union[np.ndarray, da.core.Array, h5py.Dataset]
:param relative_norm: use relative norm or not ?
:type relative_norm: bool
:param data_interval: the interval over the samples' axis to use for evaluating the norm
:type data_interval: list
:param batch_size: the maximum size of each batch
:type batch_size: int
:returns: the total norm
:rtype: float
"""
# NumPy and Dask arrays have similar properties
if isinstance(data, (np.ndarray, da.core.Array)) and isinstance(reference_data, (np.ndarray, da.core.Array)):
data, reference_data = self._clean_nan_and_large(data, reference_data)
# Linear algebra engine
if isinstance(data, np.ndarray):
lin = np
elif isinstance(data, da.core.Array):
lin = da
else:
lin = np
            # The arrays are flattened so that the norm is evaluated over them as single
            # vectors (np.linalg.norm expects one- or two-dimensional arrays)
data_ = data.flatten()
reference_data_ = reference_data.flatten()
if relative_norm:
eval_ = lin.linalg.norm(data_ - reference_data_, ord=ord)/lin.linalg.norm(reference_data_, ord=ord)
else:
eval_ = lin.linalg.norm(data_-reference_data_, ord=ord)
return float(eval_)
if isinstance(data, h5py.Dataset) and isinstance(reference_data, h5py.Dataset):
            assert ord == 2, "Only the 2-norm is implemented for hdf5 input data."
            assert data_interval, "When using a h5py.Dataset it is necessary" \
                                  " to provide a data interval."
return self._batchwise_error(data=data, reference_data=reference_data,
relative_norm=relative_norm,
data_interval=data_interval, batch_size=batch_size)
        else:
            raise Exception("Data format not supported. It must be np.ndarray, da.core.Array or h5py.Dataset.")
class SampleWiseErrorNorm:
name = "samplewiseerrornorm"
    def __init__(self):
pass
def _aggregate_norm(self, norms, ord):
n = np.stack(norms, axis=0)
if ord == 1:
return np.sum(n, axis=0)
elif ord == 2:
return np.sqrt(np.sum(n*n, axis=0))
elif ord == np.inf:
return np.max(n, axis=0)
else:
raise RuntimeError(f'Norm ord={str(ord)} not supported')
def __call__(self, data=None, reference_data=None, relative_norm=False,
key=None, data_interval=None, batch_size=1, ord=2):
"""
:param data: np.ndarray
:param reference_data: np.ndarray
:param relative_norm: bool
:return: None
"""
if data_interval is None:
data_interval = (0, data.shape[0])
n_samples = data_interval[1] - data_interval[0]
if isinstance(data, np.ndarray) and isinstance(reference_data, np.ndarray):
data_ = np.reshape(data[slice(*data_interval)], [n_samples, -1])
reference_data_ = np.reshape(reference_data[slice(*data_interval)], [n_samples, -1])
norm = np.linalg.norm(data_-reference_data_, ord=ord, axis=0)
if relative_norm:
ref_norm = np.linalg.norm(reference_data_, ord=ord, axis=0)
norm = _relative(norm, ref_norm)
elif isinstance(data, h5py.Dataset) and isinstance(reference_data, h5py.Dataset):
if isinstance(batch_size, MemorySizeEval):
batch_size = batch_size(max_batches=data_interval[1] - data_interval[0], shape=data.shape[1:])
batches = batchdomain_constructor(data_interval, batch_size)
is_string_key = isinstance(key, str)
if is_string_key:
keys = [key]
else:
keys = key
norms_dict = []
ref_norms_dict = []
for ii, batch in enumerate(batches):
d = data[slice(*batch)]
print(f'Computing norm for batch {ii+1}/{len(batches)}. batch_size={d.shape[0]}')
r = reference_data[slice(*batch)]
nb = {}
rb = {}
for k in keys:
rk = r[k]
nb[k] = self.__call__(data=d[k],
reference_data=rk,
relative_norm=False,
ord=ord)
if relative_norm:
rb[k] = np.linalg.norm(np.reshape(rk, [batch[1] - batch[0], -1]), ord=ord, axis=0)
norms_dict.append(nb)
ref_norms_dict.append(rb)
norms_dict = {k: self._aggregate_norm([n[k] for n in norms_dict], ord) for k in keys}
if relative_norm:
ref_norms_dict = {k: self._aggregate_norm([n[k] for n in ref_norms_dict], ord) for k in keys}
norms_dict = {k: _relative(norms_dict[k], ref_norms_dict[k]) for k in keys}
if is_string_key:
norm = norms_dict[key]
else:
norm = norms_dict
else:
raise Exception("Data format not supported. It must be np.ndarray or"
"h5py.Dataset.")
return norm
def _relative(norm, ref_norm):
    # Element-wise relative norm: entries with a nonzero reference are divided by it;
    # entries with a zero reference become inf, unless the norm itself is zero (then 0)
ref_norm_zero = ref_norm == 0
norm_zero = norm == 0
ref_not_zero = np.logical_not(ref_norm_zero)
norm[ref_not_zero] = norm[ref_not_zero] / ref_norm[ref_not_zero]
norm[ref_norm_zero] = np.inf
norm[norm_zero] = 0
return norm
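# Illustrative sketch (assumed array names and shapes): SampleWiseErrorNorm reduces along the
# samples axis, so for a (n_samples, n_features) pair it returns one error value per feature.
def _example_samplewise_norm_usage():
    predictions = np.random.rand(200, 5)
    targets = np.random.rand(200, 5)
    metric = SampleWiseErrorNorm()
    # Shape (5,): one relative error per feature column
    return metric(data=predictions, reference_data=targets, relative_norm=True, ord=2)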
class FeatureWiseErrorNorm:
name = "featurewiseerrornorm"
    def __init__(self):
pass
def __call__(self, data=None, reference_data=None, relative_norm=False,
key=None, data_interval=None, reference_data_interval=None, batch_size=1, ord=2):
"""
:param data: np.ndarray
:param reference_data: np.ndarray
:param relative_norm: bool
:return: None
"""
if data_interval is None:
data_interval = (0, data.shape[0])
if reference_data_interval is None:
reference_data_interval = (0, reference_data.shape[0])
assert (data_interval[1]-data_interval[0]) == (reference_data_interval[1]-reference_data_interval[0])
n_samples = data_interval[1] - data_interval[0]
if isinstance(data, np.ndarray) and isinstance(reference_data, np.ndarray):
data_ = np.reshape(data[slice(*data_interval)], [n_samples, -1])
reference_data_ = np.reshape(reference_data[slice(*reference_data_interval)], [n_samples, -1])
norm = np.linalg.norm(data_-reference_data_, ord=ord, axis=1)
if relative_norm:
ref_norm = np.linalg.norm(reference_data_, ord=ord, axis=1)
norm = _relative(norm, ref_norm)
elif isinstance(data, h5py.Dataset) and isinstance(reference_data, h5py.Dataset):
if isinstance(batch_size, MemorySizeEval):
batch_size = batch_size(max_batches=data_interval[1] - data_interval[0], shape=data.shape[1:])
batches = batchdomain_constructor(data_interval, batch_size)
batches_ref = batchdomain_constructor(reference_data_interval, batch_size)
is_string_key = isinstance(key, str)
if is_string_key:
keys = [key]
else:
keys = key
norms_dict = []
ref_norms_dict = []
for ii, (batch, batch_ref) in enumerate(zip(batches, batches_ref)):
d = data[slice(*batch)]
print(f'Computing norm for batch {ii+1}/{len(batches)} batch_size={d.shape[0]}')
r = reference_data[slice(*batch_ref)]
nb = {}
rb = {}
for k in keys:
rk = r[k]
nb[k] = self.__call__(data=d[k],
reference_data=rk,
relative_norm=False,
ord=ord)
if relative_norm:
rb[k] = np.linalg.norm(np.reshape(rk, [batch[1] - batch[0], -1]), ord=ord, axis=1)
norms_dict.append(nb)
ref_norms_dict.append(rb)
norms_dict = {k: np.concatenate([n[k] for n in norms_dict], axis=0) for k in keys}
if relative_norm:
ref_norms_dict = {k: np.concatenate([n[k] for n in ref_norms_dict], axis=0) for k in keys}
norms_dict = {k: _relative(norms_dict[k], ref_norms_dict[k]) for k in keys}
if is_string_key:
norm = norms_dict[key]
else:
norm = norms_dict
else:
raise Exception("Data format not supported. It must be np.ndarray or"
"h5py.Dataset.")
return norm
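# Illustrative sketch (assumed shapes): FeatureWiseErrorNorm reduces along the features axis,
# returning one error value per sample, which can be used to inspect error versus time.
def _example_featurewise_norm_usage():
    predictions = np.random.rand(200, 5)
    targets = np.random.rand(200, 5)
    metric = FeatureWiseErrorNorm()
    # Shape (200,): one relative error per sample (e.g. per timestep)
    return metric(data=predictions, reference_data=targets, relative_norm=True, ord=2)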
class DeterminationCoeff:
def __init__(self) -> None:
pass
def __call__(self, data:np.ndarray=None, reference_data:np.ndarray=None) -> float:
self.mean = reference_data.mean(axis=0)
assert isinstance(data, np.ndarray), "Error! data is not a ndarray: {}".format(type(data))
data_ = data.flatten()
assert isinstance(reference_data, np.ndarray), \
"Error! reference_data is not a ndarray: {}".format(type(reference_data))
reference_data_ = reference_data.flatten()
_reference_data = (reference_data - self.mean).flatten()
return 1 - np.linalg.norm(data_ - reference_data_, 2)/np.linalg.norm(_reference_data, 2)
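# Illustrative sketch: DeterminationCoeff returns a determination-style score in which 1 means a
# perfect match with the reference; the arrays below are arbitrary placeholders.
def _example_determination_coeff_usage():
    predictions = np.random.rand(100, 2)
    targets = np.random.rand(100, 2)
    return DeterminationCoeff()(data=predictions, reference_data=targets)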
class RosensteinKantz:
name = 'lyapunov_exponent'
def __init__(self, epsilon:float=None) -> None:
self.ref_index = 30
self.epsilon = epsilon
self.tau_amp = 30
def _neighborhood(self, v_ref:np.ndarray, v:np.ndarray) -> np.ndarray:
v_epsilon = np.where(np.abs(v-v_ref) <= self.epsilon)
return v_epsilon
def _reference_shift(self, v:np.ndarray, ref_index:int, shift:int) -> np.ndarray:
return v[ref_index + shift]
def __call__(self, data:np.ndarray=None) -> None:
# It is expected data to be an array with shape (n_timesteps, n_variables)
n_timesteps = data.shape[0]
n_variables = data.shape[1]
for vv in range(n_variables):
var_ = data[:, vv]
S_tau = list()
for tau in range(-self.tau_amp, self.tau_amp):
s_tau_list = list()
for tt in range(self.ref_index, n_timesteps-self.ref_index-1):
var = var_[self.ref_index:n_timesteps-self.ref_index]
var_ref = self._reference_shift(var_, tt, tau)
var_copy = copy.copy(var)
var_copy = np.delete(var_copy, self.ref_index)
v_index_epsilon = self._neighborhood(var_ref, var_copy)[0]
assert not v_index_epsilon.size == 0
v_index = v_index_epsilon + tau
var_tau = var_[v_index]
s_tau = np.mean(np.abs(var_tau - var_ref))
s_tau_list.append(s_tau)
log_str = "Timestep {}".format(tt)
sys.stdout.write("\r" + log_str)
sys.stdout.flush()
s_tau_array = np.array(s_tau_list)
s_tau = s_tau_array.mean()
S_tau.append(s_tau)
print('\n')
log_str = "Tau {}".format(tau)
sys.stdout.write(log_str)
S_tau = np.array(S_tau)
print('.')
class PerturbationMethod:
def __init__(self, jacobian_evaluator:callable=None) -> None:
"""
:param jacobian_evaluator: function
"""
self.jacobian_matrix_series = None
self.global_timestep = None
self.jacobian_evaluator = jacobian_evaluator
def _definition_equation(self, z:np.ndarray) -> np.ndarray:
"""
:param z: np.ndarray
:return: np.ndarray
"""
jacobian_matrix = self.jacobian_matrix_series[self.global_timestep, :, :]
return jacobian_matrix.dot(z.T).T
def __call__(self, data:np.ndarray=None, data_residual:np.ndarray=None, step:float=None) -> float:
"""
:param data: np.ndarray
:param data_residual: np.ndarray
:param step: float
:return: float
"""
n_timesteps = data.shape[0]
self.jacobian_matrix_series = self.jacobian_evaluator(data, data_residual=data_residual)
init_state = np.array([1e-150, 1e-150, 1e-150])[None, :]
lyapunov_exponents_list = list()
integrator = RK4(self._definition_equation)
current_state = init_state
for timestep in range(n_timesteps):
self.global_timestep = timestep
variables_state, derivatives_state = integrator.step(current_state, step)
current_state = variables_state
sys.stdout.write("\rIteration {}/{}".format(timestep, n_timesteps))
sys.stdout.flush()
lyapunov_exponent_ = derivatives_state.dot(variables_state[0].T)
lyapunov_exponent = lyapunov_exponent_/variables_state[0].dot(variables_state[0].T)
lyapunov_exponents_list.append(lyapunov_exponent)
lle = np.mean(lyapunov_exponents_list)
return lle
class MeanEvaluation:
def __init__(self) -> None:
pass
def __call__(self, dataset:Union[np.ndarray, h5py.Dataset]=None,
data_interval:list=None, batch_size:int=None,
data_preparer:DataPreparer=None) -> np.ndarray:
if isinstance(batch_size, MemorySizeEval):
batch_size = batch_size(max_batches=data_interval[1]-data_interval[0], shape=dataset.shape[1:])
else:
pass
# Constructing the normalization using the reference data
batches = batchdomain_constructor(data_interval=data_interval, batch_size=batch_size)
data_size = 0
data_mean = 0
for batch_id, batch in enumerate(batches):
data = dataset[slice(*batch)]
if data_preparer is None:
                data_ = data.view(float)
                data_flatten = data_.reshape(-1, np.prod(data_.shape[1:]))
else:
data_flatten = data_preparer.prepare_input_structured_data(data)
data_mean = (data_size * data_mean +
data_flatten.shape[0] * data_flatten.mean(0)) / (data_size + data_flatten.shape[0])
data_size += data.shape[0]
return data_mean
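# Illustrative sketch (the array name and batch size are assumptions): MeanEvaluation accumulates
# a running mean over batches, which avoids loading a large dataset into memory at once.
def _example_mean_evaluation_usage():
    dataset = np.random.rand(1000, 4)
    mean_eval = MeanEvaluation()
    # Shape (4,): the feature-wise mean evaluated batch-by-batch
    return mean_eval(dataset=dataset, data_interval=[0, dataset.shape[0]], batch_size=100)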
# Evaluating minimum and maximum values for large datasets in batch-wise mode
class MinMaxEvaluation:
def __init__(self) -> None:
self.data_min_ref = np.inf
self.data_max_ref = - np.inf
self.default_axis = None
def __call__(self, dataset:Union[np.ndarray, h5py.Dataset]=None,
data_interval:list=None, batch_size:int=None,
data_preparer:DataPreparer=None, axis:int=-1) -> np.ndarray:
if isinstance(batch_size, MemorySizeEval):
batch_size = batch_size(max_batches=data_interval[1]-data_interval[0], shape=dataset.shape[1:])
else:
pass
# Constructing the normalization using the reference data
batches = batchdomain_constructor(data_interval=data_interval, batch_size=batch_size)
        if axis is not None:
            n_dims = len(dataset.shape)
            # Normalizing negative axis values (such as the default -1) before removing the axis from the list
            axis = axis % n_dims
            axes = [i for i in range(n_dims)]
            axes.remove(axis)
            axes = tuple(axes)
data_max = self.data_max_ref*np.ones(n_dims - 1)
data_min = self.data_min_ref*np.ones(n_dims - 1)
else:
axes = self.default_axis
data_max = self.data_max_ref
data_min = self.data_min_ref
for batch_id, batch in enumerate(batches):
data = dataset[slice(*batch)]
max_ = data.max(axes)
min_ = data.min(axes)
data_max = np.maximum(max_, data_max)
data_min = np.minimum(min_, data_min)
return data_max, data_min
    def eval_h5(self, dataset: h5py.Group = None,
data_interval: list = None, batch_size: int = None,
data_preparer: DataPreparer = None, axis: int = -1, keys:list=None) -> np.ndarray:
data_min_ = list()
data_max_ = list()
for k in keys:
data = dataset[k]
data_max, data_min = self.__call__(dataset=data, data_interval=data_interval, batch_size=batch_size,
data_preparer=data_preparer, axis=axis)
data_min_.append(data_min)
data_max_.append(data_max)
return np.hstack(data_min_), np.hstack(data_max_)
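# Illustrative sketch (assumed 2-D dataset): MinMaxEvaluation tracks running maxima and minima
# batch-by-batch along the chosen axis, here axis=1 (one max/min per feature column).
def _example_minmax_evaluation_usage():
    dataset = np.random.rand(1000, 4)
    minmax = MinMaxEvaluation()
    data_max, data_min = minmax(dataset=dataset, data_interval=[0, dataset.shape[0]],
                                batch_size=100, axis=1)
    return data_max, data_min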
class MemorySizeEval:
def __init__(self, memory_tol_percent:float=0.5) -> None:
self.memory_tol_percent = memory_tol_percent
self.size_default = np.array([0]).astype('float64').itemsize
self.available_memory = None
self.multiplier = 1024
@property
def available_memory_in_GB(self) -> float:
if self.available_memory is not None:
return self.available_memory/(self.multiplier**3)
else:
raise Warning("The available was not evaluated. Execute the __call__ method.")
def __call__(self, max_batches:int=None, shape:Union[tuple, list]=None) -> int:
self.available_memory = self.memory_tol_percent * psutil.virtual_memory().available
memory_size = self.size_default*np.prod(shape)
if memory_size <= self.available_memory:
possible_batch_size = max_batches
else:
possible_batch_size = np.ceil(memory_size/self.available_memory).astype(int)
return possible_batch_size
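# Illustrative sketch: MemorySizeEval instances can be passed in place of an integer batch_size;
# when called, a batch size is chosen from the currently available memory (values below are assumptions).
def _example_memory_size_eval_usage():
    batch_sizer = MemorySizeEval(memory_tol_percent=0.1)
    # Use at most 10% of the available memory when batching a dataset whose samples have shape (64, 64)
    return batch_sizer(max_batches=10000, shape=(64, 64))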
# Cumulative error norm for time-series
class CumulativeNorm:
def __init__(self):
pass
def __call__(self, data:np.ndarray=None, reference_data:np.ndarray=None,
relative_norm:bool=False, ord:int=2) -> np.ndarray:
assert len(data.shape) == len(reference_data.shape) == 2, "The data and reference_data must be two-dimensional."
        cumulative_norm = np.sqrt(np.cumsum(np.square(data - reference_data), axis=0)/np.cumsum(np.square(reference_data), axis=0))
        return np.where(np.isnan(cumulative_norm), 0, cumulative_norm)
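# Illustrative sketch (assumed shapes): CumulativeNorm returns the time-accumulated relative error,
# one curve per variable, for a pair of (n_timesteps, n_variables) series.
def _example_cumulative_norm_usage():
    approximation = np.random.rand(500, 3)
    reference = np.random.rand(500, 3)
    # Shape (500, 3): the cumulative relative error along the time axis
    return CumulativeNorm()(data=approximation, reference_data=reference)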
# Pointwise relative error for time-series
class PointwiseError:
def __init__(self):
pass
def __call__(self, data:np.ndarray=None, reference_data:np.ndarray=None,
relative_norm:bool=False, ord:int=2) -> np.ndarray:
assert len(data.shape) == len(reference_data.shape) == 2, "The data and reference_data must be two-dimensional."
        pointwise_relative_error = (data - reference_data)/reference_data
        filter_nan = np.where(np.isnan(pointwise_relative_error), 0, pointwise_relative_error)
        filter_nan_inf = np.where(np.isinf(filter_nan), 0, filter_nan)
        return filter_nan_inf
# Evaluating the number of Lyapunov units using cumulative error norm
class LyapunovUnits:
def __init__(self, lyapunov_unit:float=1, tol=0.10, time_scale=1, norm_criteria='cumulative_norm'):
self.lyapunov_unit = lyapunov_unit
self.tol = tol
self.time_scale = time_scale
if norm_criteria == 'cumulative_norm':
self.norm = CumulativeNorm()
elif norm_criteria == 'pointwise_error':
self.norm = PointwiseError()
else:
raise Exception(f"The case {norm_criteria} is not available.")
def __call__(self, data:np.ndarray=None, reference_data:np.ndarray=None,
relative_norm:bool=False, ord:int=2) -> float:
cumulative_norm = self.norm(data=data, reference_data=reference_data, relative_norm=relative_norm)
respect = cumulative_norm[cumulative_norm <= self.tol]
return respect.shape[0]*self.time_scale/self.lyapunov_unit
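# Illustrative sketch (the tolerance, time scale and Lyapunov unit are assumptions): LyapunovUnits
# counts how long the cumulative error stays below a tolerance and expresses that horizon in
# Lyapunov units.
def _example_lyapunov_units_usage():
    approximation = np.random.rand(500, 3)
    reference = np.random.rand(500, 3)
    evaluator = LyapunovUnits(lyapunov_unit=0.9, tol=0.1, time_scale=0.01)
    return evaluator(data=approximation, reference_data=reference, relative_norm=True)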