Source code for simulai.simulation

# (C) Copyright IBM Corp. 2019, 2020, 2021, 2022.

#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at

#           http://www.apache.org/licenses/LICENSE-2.0

#     Unless required by applicable law or agreed to in writing, software
#     distributed under the License is distributed on an "AS IS" BASIS,
#     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#     See the License for the specific language governing permissions and
#     limitations under the License.

import warnings
import numpy as np
import copy
import os
import inspect
import pickle
from collections import OrderedDict
import h5py

import simulai
from simulai.io import DataPreparer
from simulai.rom import ROM
from simulai.batching import batchdomain_constructor
from simulai.abstract import BaseFramework

# Prototype of the class Pipeline
[docs]class Pipeline(BaseFramework):
    """Pipeline class manages the execution of multiple operations, such as data
    pre-processing (DataPreparer), dimensionality reduction (ROM) and ML modeling (Model).
    """

    def __init__(self, stages: list = list(), channels_last: bool = False) -> None:
        """
        :param stages: list of (name, operator) tuples defining the pipeline
        :type stages: List[Tuple]
        :param channels_last:
        :type channels_last: bool
        """

        self.channels_last = channels_last

        super().__init__()

        assert isinstance(stages, list), "Error! stages is not a list"

        self.stages = OrderedDict(stages)

        self.triage_dict = {
            'data_preparer': 'prepare_input_data',
            'rom': 'fit',
            'model': 'fit',
            'integration': '__call__',
            'normalization': 'rescale'
        }

        self.wrappers_dict = {
            'data_preparer': self._data_preparer_wrapper,
            'rom': self._rom_wrapper,
            'model': self._model_wrapper,
            'normalization': self._normalization_wrapper,
            'integration': self._integration_wrapper
        }

        # Some global directives
        pipeline_layers = self.stages.keys()

        self.there_is_data_preparer = False
        self.there_is_rom = False
        self.there_is_model = False
        self.there_is_target = False
        self.there_is_reference = False
        self.is_batchwise = False

        if 'data_preparer' in pipeline_layers:
            self.there_is_data_preparer = True

        if 'rom' in pipeline_layers:
            self.there_is_rom = True

        if 'model' in pipeline_layers:
            self.there_is_model = True

        self.execution_pipeline, self.pipeline_algorithms = self._classify_op(self.stages)

        # Multiple global attributes are initialized as empty objects and
        # filled at execution time
        self._input_data = None
        self.target_data = None
        self.reference_data = None
        self.fig_kwargs = None
        self.data = None
        self.model = None
        self.input_vars_list = None
        self.target_vars_list = None
        self.output = None
        self.rom = None

        # Bypass used when no data preparer is provided
        self.data_preparer = None
        self.data_generator = None
        self.normalization = None
        self.fit_kwargs = {}
        self.slicer = None

    @property
    def input_data(self):
        return self._input_data

    @input_data.setter
    def input_data(self, v):
        assert isinstance(v, (np.ndarray, h5py.Dataset)), \
            "Error! input_data is not a numpy.ndarray or h5py.Dataset: {}".format(type(v))
        self._input_data = v

    @staticmethod
    def _construct_data_array(data, var_names_list):
        """
        :param data:
        :type data: np.ndarray
        :param var_names_list:
        :type var_names_list: List[str]
        :return:
        """

        # It considers the argument data as a structured array
        if isinstance(data, np.ndarray) and isinstance(data.dtype.names, tuple):
            return data.view(float)
        else:
            return data

    def _classify_op(self, op_dict):
        """
        Classify the type of execution method used for each operation in the pipeline

        :param op_dict:
        :type op_dict: dict
        :return:
        """

        execution_pipeline = OrderedDict()
        pipeline_algorithms = {key: {'op': list(), 'wrapper': None} for key in self.triage_dict}

        for node, op in op_dict.items():

            execution_method = self.triage_dict.get(op.purpose)
            wrapper_method = self.wrappers_dict.get(op.purpose)

            execution_pipeline[op] = execution_method
            pipeline_algorithms[op.purpose]['op'] = execution_method
            pipeline_algorithms[op.purpose]['wrapper'] = wrapper_method

        return execution_pipeline, pipeline_algorithms

    @staticmethod
    def _recover_array(data, var_names_list):
        """
        :param data:
        :param var_names_list: List[str]
        :return:
        """

        variables_list = list()

        for var in var_names_list:
            variables_list.append(data[var])

        variables_str = ','.join(var_names_list)
        formats_str = ','.join(len(var_names_list) * ['f8'])

        recovered_data = np.core.records.fromarrays(variables_list,
                                                    names=variables_str,
                                                    formats=formats_str)

        return recovered_data

    def _slice_by_interval(self, batch):
        return slice(*batch)

    def _slice_by_set(self, batch):
        return batch

    def _data_preparer_wrapper(self, data_preparer):
        """
        :param data_preparer:
        :type data_preparer: DataPreparer
        :return: None
        """

        if isinstance(data_preparer, DataPreparer):
            data_preparers = [data_preparer]
        elif isinstance(data_preparer, list):
            data_preparers = data_preparer
            assert all([isinstance(preparer, DataPreparer) for preparer in data_preparers]), \
                "Error! Not all items in the list are DataPreparer instances"
        else:
            raise Exception("Error! data_preparer is not an instance of DataPreparer")

        self.data_preparer = data_preparers

        data = getattr(self, 'input_data')

        for data_preparer in self.data_preparer:
            self.input_data = data_preparer.prepare_input_data(data)

        if isinstance(self.target_data, np.ndarray):
            target_data = getattr(self, 'target_data')
            self.target_data = data_preparer.prepare_input_data(target_data)
        else:
            pass

    def _get_operator(self, operator):

        if self.normalization:

            def wrapper(data):
                evaluation = operator(data)

                output_dict = self.normalization.apply_descaling(map_dict={'target': evaluation})
                evaluation = output_dict['target']

                output_dict = self.normalization.apply_rescaling(map_dict={'input': evaluation})
                evaluation = output_dict['input']

                return evaluation
        else:

            def wrapper(data):
                return operator(data)

        return wrapper

    def _model_wrapper(self, model):

        if model.is_this_model_rough:
            model.fit(input_data=self.input_data,
                      target_data=self.target_data,
                      **self.fit_kwargs)
        else:
            pass

        self.model = model

    @staticmethod
    def _get_kwargs(op):

        kwargs = inspect.getfullargspec(op).args
        kwargs.remove('self')

        return kwargs

    def _integration_wrapper(self, integration_op):

        # It checks whether integration_op is instantiated or not
        self.right_operator = self.model.eval

        for key, var in self.extra_kwargs.items():
            setattr(self, key, var)

        self.initial_state = self.project_data(self.initial_state, self.input_vars_list)

        if inspect.isclass(integration_op):

            # It instantiates the class integration_op
            init_kwargs_list = self._get_kwargs(integration_op.__init__)
            init_kwargs_dict = {key: getattr(self, key) for key in init_kwargs_list}

            execution_method_str = self.execution_pipeline.get(integration_op)

            postproc_op_instance = integration_op(**init_kwargs_dict)
            execution_method = getattr(postproc_op_instance, execution_method_str)

            exec_kwargs_list = self._get_kwargs(execution_method)
            exec_kwargs_dict = {key: getattr(self, key) for key in exec_kwargs_list}

            # It executes the main method of the instance
            output = execution_method(**exec_kwargs_dict)

            self.output = self.reconstruct_data(data=output)
        else:
            pass

    def _rom_wrapper(self, rom):
        """
        :param rom:
        :type rom: ROM
        :return: None
        """

        # The target data may be previously provided or constructed after the
        # dimensionality reduction

        if self.is_batchwise:
            assert rom.kind == "batchwise", "The chosen ROM is not suitable " \
                                            "for batchwise executions"
        else:
            pass

        input_data = getattr(self, 'input_data')
        target_data = getattr(self, 'target_data')

        input_data = self._construct_data_array(input_data, self.input_vars_list)

        rom.fit(data=input_data)

        # Apply the projection to the target data object when a model exists
        if self.there_is_model:

            reduced_input_data = rom.project(data=input_data)

            if isinstance(target_data, np.ndarray):
                target_data = self._construct_data_array(target_data, self.target_vars_list)
                # Apply the dimensionality reduction to the target data object when it exists
                reduced_target_data = rom.project(data=target_data)
            else:
                reduced_target_data = self.data_generator(data=reduced_input_data)

            self.target_data = reduced_target_data
            self.input_data = reduced_input_data
        else:
            pass

    def _normalization_wrapper(self, normalization_op):

        map_dict = dict()

        if isinstance(self.input_data, np.ndarray):
            map_dict.update({'input': self.input_data})
        else:
            pass

        if isinstance(self.target_data, np.ndarray):
            map_dict.update({'target': self.target_data})
        else:
            pass

        transformed_array_dict = normalization_op.rescale(map_dict=map_dict)

        if isinstance(self.input_data, np.ndarray):
            self.input_data = transformed_array_dict['input']
        else:
            pass

        if isinstance(self.target_data, np.ndarray):
            self.target_data = transformed_array_dict['target']
        else:
            pass

    def _batchwise_projection(self, data=None, variables_list=None,
                              data_interval=None, batch_size=None,
                              batch_indices=None):

        if data_interval is not None:
            n_samples = data_interval[1] - data_interval[0]
            slicer = self._slice_by_interval
        elif batch_indices is not None:
            n_samples = len(batch_indices)
            slicer = self._slice_by_set
        else:
            raise Exception("Either data_interval or batch_indices must be provided.")

        if isinstance(batch_size, simulai.metrics.MemorySizeEval):
            batch_size = batch_size(max_batches=n_samples, shape=data.shape)  # TODO data.shape[1:]
        elif batch_size == -1:
            batch_size = n_samples
        else:
            pass

        batches = batchdomain_constructor(data_interval=data_interval,
                                          batch_size=batch_size,
                                          batch_indices=batch_indices)

        batches_list = list()

        for batch_id, batch in enumerate(batches):

            chunk_array = data[slicer(batch)]

            print(f"Projecting the batch {batch_id + 1}/{len(batches)}, batch_size={chunk_array.shape[0]}")

            if self.data_preparer:
                data_ = self.data_preparer.prepare_input_structured_data(chunk_array)
            else:
                data_ = chunk_array

            data_numeric = self._construct_data_array(data_, variables_list)

            batches_list.append(self.rom.project(data_numeric))

        return np.vstack(batches_list)

    # Reconstructing using chunks of data in order to save memory
    def _batchwise_reconstruction(self, data=None, variables_list=None,
                                  data_interval=None, batch_size=None,
                                  batch_indices=None, dump_path=None):

        assert dump_path, "It is necessary to provide a path for saving the " \
                          "reconstruction output to an HDF5 file."

        if os.path.exists(dump_path):
            warnings.warn(f"Reconstruction dump_path={dump_path} exists. It will be overwritten.")

        data_file = h5py.File(dump_path, "w")

        if data_interval is not None:
            samples_dim = data_interval[1] - data_interval[0]
            slicer = self._slice_by_interval
        elif batch_indices is not None:
            samples_dim = len(batch_indices)
            slicer = self._slice_by_set
        else:
            raise Exception("Either data_interval or batch_indices must be provided.")

        # In case of using a memory limiter, it is necessary to evaluate the
        # batch_size with it
        if isinstance(batch_size, simulai.metrics.MemorySizeEval):
            batch_size = batch_size(max_batches=samples_dim,
                                    shape=(self.data_preparer.n_features,))
        elif batch_size == -1:
            batch_size = samples_dim
        else:
            pass

        # Constructing the chunk intervals
        batches = batchdomain_constructor(data_interval=data_interval,
                                          batch_size=batch_size,
                                          batch_indices=batch_indices)

        # If the data structure is a structured numpy array, a list of variables is provided
        data_shape = (samples_dim,) + self.data_preparer.collapsible_shapes

        dset = data_file.create_dataset("reconstructed_data",
                                        shape=data_shape,
                                        dtype=self.data_preparer.dtype)

        # Batchwise reconstruction loop
        for batch_id, batch in enumerate(batches):

            chunk_array = data[slicer(batch)]

            print(f"Reconstructing the batch {batch_id + 1}/{len(batches)}, batch_size={chunk_array.shape[0]}")

            data_numeric = self.rom.reconstruct(chunk_array)
            output_data_ = self.data_preparer.prepare_output_data(data_numeric)

            dset[slicer(batch)] = output_data_

        return dset
[docs]    def project_data(self, data=None, variables_list=None,
                     data_interval=None, batch_size=1, batch_indices=None):
        """
        :param data:
        :type data: np.ndarray
        :param variables_list:
        :type variables_list: List[str]
        :return:
        """

        if isinstance(data, np.ndarray):

            if variables_list:
                data_ = self.data_preparer.prepare_input_structured_data(data)
                data_numeric = self._construct_data_array(data_, variables_list)
                return self.rom.project(data_numeric)
            else:
                data_ = self.data_preparer.prepare_input_data(data)
                data_numeric = self._construct_data_array(data_, variables_list)
                return self.rom.project(data_numeric)

        elif isinstance(data, h5py.Dataset):

            assert data_interval, "When using an h5py.Dataset it is necessary " \
                                  "to provide a data interval"

            return self._batchwise_projection(data=data,
                                              variables_list=variables_list,
                                              data_interval=data_interval,
                                              batch_size=batch_size,
                                              batch_indices=batch_indices)
        else:
            raise Exception("Data format not supported. It must be np.ndarray "
                            "or h5py.Dataset.")
[docs]    def reconstruct_data(self, data=None, variables_list=None,
                         data_interval=None, batch_size=1, dump_path=None):

        if isinstance(data, np.ndarray) and not data_interval:

            print("Applying the global reconstruction strategy.")

            data_numeric = self.rom.reconstruct(data)
            return self.data_preparer.prepare_output_data(data_numeric)

        elif isinstance(data, np.ndarray) and data_interval and dump_path:

            print("Applying the batch-wise reconstruction strategy.")

            return self._batchwise_reconstruction(data=data,
                                                  variables_list=variables_list,
                                                  data_interval=data_interval,
                                                  batch_size=batch_size,
                                                  dump_path=dump_path)
        else:
            raise Exception("Data format not supported. It must be np.ndarray "
                            "or h5py.Dataset.")
[docs]    def pipeline_loop(self, input_data, target_data, reference_data, extra_kwargs):

        self.input_data = input_data
        self.target_data = target_data
        self.reference_data = reference_data
        self.extra_kwargs = extra_kwargs

        # These operations are considered already instantiated;
        # it is also necessary to handle the non-instantiated cases
        for op, method_name in self.execution_pipeline.items():

            # op can be a class instance or a class itself
            wrapper_method = self.pipeline_algorithms[op.purpose]['wrapper']
            wrapper_method(op)

            setattr(self, op.purpose, op)

            print(f"Executed operation {op.name.upper()}.")
[docs]    def batchwise_pipeline_loop(self, input_data, target_data, reference_data,
                                extra_kwargs, data_interval=None, batch_size=None,
                                batch_indices=None):

        self.input_data = input_data
        self.target_data = target_data
        self.reference_data = reference_data
        self.extra_kwargs = extra_kwargs

        if self.there_is_target:
            error_message = "The input and target dimensions are not compatible: " \
                            "{} and {} samples.".format(self.input_data.shape,
                                                        self.target_data.shape)
            assert self.input_data.shape[0] == self.target_data.shape[0], error_message
        else:
            pass

        # Checking whether a list of batches was provided or whether it is necessary to construct it
        assert (data_interval is not None or batch_indices is not None), \
            "Either data_interval or batch_indices must be provided."

        batches = batchdomain_constructor(data_interval=data_interval,
                                          batch_size=batch_size,
                                          batch_indices=batch_indices)

        for batch_id, batch in enumerate(batches):

            self.input_data = input_data[self.slicer(batch)]

            print(f"Executing the mini-batch {batch_id + 1}/{len(batches)}, batch_size={self.input_data.shape[0]}")

            if self.there_is_target:
                self.target_data = target_data[self.slicer(batch)]
            else:
                pass

            if self.there_is_reference:
                self.reference_data = reference_data[self.slicer(batch)]
            else:
                pass

            self.extra_kwargs = extra_kwargs

            # These operations are considered already instantiated;
            # it is also necessary to handle the non-instantiated cases
            for op, method_name in self.execution_pipeline.items():

                # op can be a class instance or a class itself
                wrapper_method = self.pipeline_algorithms[op.purpose]['wrapper']
                wrapper_method(op)

                setattr(self, op.purpose, op)

                print("Executed operation.")
[docs]    def exec(self, input_data=None, target_data=None, reference_data=None,
             data_generator=None, extra_kwargs=None, fit_kwargs=None,
             data_interval=None, batch_size=None, batch_indices=None):
        """
        :param input_data:
        :type input_data: np.ndarray
        :param target_data:
        :type target_data: np.ndarray
        :param reference_data:
        :param data_generator:
        :param extra_kwargs:
        :param fit_kwargs:
        :param data_interval:
        :param batch_size:
        :type batch_size: int
        :param batch_indices:
        :return:
        """

        data_format = list()

        self.fit_kwargs = fit_kwargs

        if isinstance(input_data, np.ndarray):

            if input_data.dtype.names:
                self.input_vars_list = list(input_data.dtype.names)
            else:
                pass

            data_format.append("numpy")

        elif isinstance(input_data, h5py.Dataset):

            if data_interval is not None:
                n_samples = data_interval[1] - data_interval[0]
                self.slicer = self._slice_by_interval
            elif batch_indices is not None:
                n_samples = len(batch_indices)
                self.slicer = self._slice_by_set
            else:
                raise Exception("Either data_interval or batch_indices must be provided.")

            if isinstance(batch_size, simulai.metrics.MemorySizeEval):
                batch_size = batch_size(max_batches=n_samples, shape=input_data.shape)  # TODO input_data.shape[1:]
            elif batch_size == -1:
                batch_size = n_samples
            else:
                pass

            if input_data.dtype.names:
                self.input_vars_list = list(input_data.dtype.names)
            else:
                pass

            assert batch_size, "The argument batch_size must be " \
                               "provided when using HDF5 as input"

            data_format.append("hdf5")

        else:
            data_format.append(None)
            raise Exception("This data format is not supported.")

        # When a machine-learning model (or another fitting method) is employed,
        # target data, or a method for generating it, must be provided
        if self.there_is_model:

            if isinstance(target_data, np.ndarray):
                self.target_vars_list = list(target_data.dtype.names)
                data_format.append("numpy")

            elif isinstance(target_data, h5py.Dataset):
                assert batch_size, "The argument batch_size must be " \
                                   "provided when using HDF5 as input"

                self.target_vars_list = ['var_' + str(ii) for ii in range(target_data.shape[1])]
                data_format.append("numpy")
            else:
                assert data_generator
                self.data_generator = data_generator
                data_format.append("numpy")

            self.there_is_target = True
        else:
            pass

        # This way of executing the pipeline is used when the data are ingested
        # at once as a NumPy array
        data_format = set(data_format)

        assert len(data_format) == 1, "Incompatible input and output formats"

        data_format = list(data_format)[0]

        if data_format == "numpy":

            print("Executing a global pipeline.")

            self.pipeline_loop(input_data, target_data, reference_data, extra_kwargs)

        # For ingestion via HDF5, the pipeline process is repeated for each
        # mini-batch
        elif data_format == "hdf5":

            print("Executing a batchwise pipeline.")

            self.is_batchwise = True

            self.batchwise_pipeline_loop(input_data, target_data, reference_data,
                                         extra_kwargs, data_interval=data_interval,
                                         batch_size=batch_size,
                                         batch_indices=batch_indices)
        else:
            raise Exception("The data format was not understood.")
[docs]    def eval(self, data=None, with_projection=True, with_reconstruction=True):

        if self.rom and with_projection:
            data_ = self.project_data(data, self.input_vars_list)
        else:
            data_ = data

        if self.normalization:
            output_dict = self.normalization.apply_rescaling(map_dict={'input': data_})
            data_ = output_dict['input']
        else:
            pass

        evaluation_ = self.model.eval(data_)

        if self.normalization:
            output_dict = self.normalization.apply_descaling(map_dict={'target': evaluation_})
            evaluation_ = output_dict['target']
        else:
            pass

        if with_reconstruction:
            evaluation = self.reconstruct_data(data=evaluation_)
        else:
            evaluation = evaluation_

        return evaluation
[docs]    def predict(self, post_process_op=None, extra_kwargs=None, with_reconstruction=True):

        # It checks if the post_process_op is instantiated or not
        initial_state = extra_kwargs['initial_state']
        extra_kwargs_copy = copy.copy(extra_kwargs)

        if self.rom:
            extra_kwargs_copy['initial_state'] = self.project_data(initial_state,
                                                                   self.input_vars_list)
        else:
            pass

        if self.normalization:
            initial_state = extra_kwargs_copy['initial_state']
            output_dict = self.normalization.apply_rescaling(map_dict={'input': initial_state})
            extra_kwargs_copy['initial_state'] = output_dict['input']
        else:
            pass

        recurrent_operator = self._get_operator(self.model.eval)

        op_instance = post_process_op(recurrent_operator)

        output = op_instance(**extra_kwargs_copy)

        if self.normalization:
            output_dict = self.normalization.apply_descaling(map_dict={'input': output})
            output = output_dict['input']
        else:
            pass

        if with_reconstruction:
            output = self.reconstruct_data(data=output)
        else:
            pass

        return output
    def _save(self, save_path=None, model_name=None):

        fp = open(os.path.join(save_path, model_name), 'wb')

        try:
            print(f"Trying to save {self} to a file.")
            pickle.dump(self, fp, protocol=4)
        except Exception as e:
            print('START ---- Exception message')
            print(e, e.args)
            print('END ---- Exception message')
            print(f"Object {self} is not picklable."
                  f" Trying to save it in another way.")
[docs]    def save(self, save_path=None, model_name=None):

        try:
            self._save(save_path=save_path, model_name=model_name)
        except Exception:
            assert self.model is not None, f"The model attribute of {self} is not defined."
            self.model.save(save_path, model_name)
[docs]    def test(self, metric=None, data=None):

        data = self._construct_data_array(data, self.input_vars_list)
        error = metric(data, self.output)

        return error
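
A minimal usage sketch, for orientation only: it relies solely on the Pipeline interface shown above. MyDataPreparer, MyROM and MyModel are hypothetical placeholders (not names from the simulai library) for concrete stage operators; each must expose a `purpose` attribute matching the keys of `triage_dict` and the methods invoked by the corresponding wrapper (prepare_input_data/prepare_output_data, fit/project/reconstruct, fit/eval plus the `is_this_model_rough` flag). The array shapes are likewise illustrative.

import numpy as np

from simulai.simulation import Pipeline

# Hypothetical stage operators (stand-ins, not provided by this module)
preparer = MyDataPreparer()   # purpose == 'data_preparer'
rom = MyROM()                 # purpose == 'rom'
model = MyModel()             # purpose == 'model'

pipeline = Pipeline(stages=[('data_preparer', preparer),
                            ('rom', rom),
                            ('model', model)])

# Hypothetical snapshot matrices: n_samples x n_features
input_data = np.random.rand(1000, 64)
target_data = np.random.rand(1000, 64)

# Fits the ROM, projects the data and trains the model in the reduced space
pipeline.exec(input_data=input_data, target_data=target_data, fit_kwargs={})

# Projects new data, evaluates the trained model and reconstructs the output
prediction = pipeline.eval(data=input_data)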