Source code for ninolearn.learn.fit_predict

"""
This module aims to standardize the training and evaluation procedure of machine
learning models that are used for the ENSO prediction. At the
core of the standardized training and evaluation is the split of the entire
time series into several decades, namely 1962-1971, 1972-1981, ...,2012-2018.

First, one of these decades is held out for later evaluation. Then, the remaining
data set is used in the :func:`ninolearn.learn.fit_predict.cross_fit` function to
train the model AND optimize the architecture of the model, the so-called
hyperparameter optimization. This hyperparameter optimization is currently done
using a random search algorithm. The described procedure is repeated until each
decade has been held out once for later evaluation.

With the :func:`ninolearn.learn.fit_predict.cross_predict` method a prediction for the full time series
of the ONI can be made. Here, for each decade, the model that was NOT trained on
the corresponding decade is used for the prediction.
"""


import numpy as np
import pandas as pd
import xarray as xr

from os.path import join

from ninolearn.utils import print_header, small_print_header
from ninolearn.pathes import modeldir, processeddir

# Start years of the evaluation periods. The final entry (2018) does not start
# a period of its own: cross_predict pairs consecutive entries and uses it only
# as the (exclusive) end year of the last period, 2012-01-01 till 2017-12-01.
decades = [1962, 1972, 1982, 1992, 2002, 2012, 2018]
# Number of entries in ``decades`` (boundary years, i.e. periods + 1).
n_decades = len(decades)

# Lead times (in months) for which models are evaluated.
lead_times = [0, 3, 6, 9, 12, 15]
# Number of lead times; size of the 'lead' axis of the saved forecasts.
n_lead = len(lead_times)

def cross_fit(model, pipeline, n_iter, **kwargs):
    """
    Train the model on different training sets, each time sparing one
    evaluation decade (1962-1971, 1972-1981, ..., 2012-2018) for later
    testing.

    For every lead time in ``lead_times`` and every evaluation decade, a fresh
    model is instantiated, configured via ``.set_parameters(**kwargs)``, fitted
    with ``.fit_RandomizedSearch()`` on all samples outside the spared period
    and saved to ``modeldir`` in the directory
    ``{m.name}_decade{decade}_lead{lead_time}``.

    :param model: A model that follows the guidelines how a model object
        should be set up. It is called without arguments to create a new
        instance for every fit.

    :param pipeline: A function that takes lead time as argument and returns
        the corresponding feature, label, time and persistance.

    :type n_iter: int
    :param n_iter: The number of iterations for the random search.

    :param **kwargs: Arguments that shall be passed to the .set_parameters()
        method of the provided model.
    """
    # Use the module-level lead times so that training and cross_predict
    # always agree on which models exist.
    for lead_time in lead_times:
        # The persistence forecast returned by the pipeline is not needed here.
        X, y, timey, _ = pipeline(lead_time, return_persistance=True)

        print_header(f'Lead time: {lead_time} month')

        # The last entry of ``decades`` only marks the end of the final
        # evaluation period. cross_predict never loads a model for it, so
        # fitting one would just waste a full random search per lead time.
        for decade in decades[:-1]:
            # The spared window always spans ten years, so for the last
            # period it extends beyond the range predicted by cross_predict.
            period_start = f'{decade}-01-01'
            period_end = f'{decade+9}-12-01'
            small_print_header(f'Test period: {period_start} till {period_end}')

            test_indeces = (timey >= period_start) & (timey <= period_end)
            train_indeces = np.invert(test_indeces)
            trainX, trainy = X[train_indeces, :], y[train_indeces]

            m = model()
            m.set_parameters(**kwargs)
            m.fit_RandomizedSearch(trainX, trainy, n_iter=n_iter)
            m.save(location=modeldir,
                   dir_name=f'{m.name}_decade{decade}_lead{lead_time}')

            # Drop the fitted model before the next one is created.
            del m
def cross_predict(model, pipeline, model_name):
    """
    Generate a hindcast covering all evaluation decades using the models which
    were trained by the .cross_fit() method.

    For each lead time and each evaluation period, the model saved as
    ``{model_name}_decade{start_year}_lead{lead_time}`` is loaded from
    ``modeldir`` and used to predict that period. The predictions of all
    periods are concatenated along time and finally written to
    ``{model_name}_forecasts.nc`` in ``processeddir``, with one variable per
    model output and the dimensions ``target_season`` and ``lead``.

    :param model: The considered model. It is called without arguments to
        create the instance into which a trained model is loaded.

    :param pipeline: The data pipeline that already was used before in
        .cross_fit().

    :type model_name: str
    :param model_name: Name prefix of the saved model directories and of the
        forecast file that is written.

    :raises Exception: If, for any lead time, the first predicted date is not
        the first of January of the first evaluation decade.
    """
    expected_first_date = f'{decades[0]}-01-01'
    # Consecutive boundary years form the (start, exclusive end) of a period.
    periods = list(zip(decades[:-1], decades[1:]))

    # Allocated once the length of the full time series is known.
    pred_save = None

    for lead_idx, lead_time in enumerate(lead_times):
        print_header(f'Lead time: {lead_time} months')

        # Labels and persistence are not needed to generate the hindcast.
        X, _, timey, _ = pipeline(lead_time, return_persistance=True)

        timeytrue = pd.DatetimeIndex([])
        # Allocated once the first model is loaded and n_outputs is known.
        pred_full = None

        for start_year, end_year in periods:
            period_start = f'{start_year}-01-01'
            period_end = f'{end_year-1}-12-01'
            small_print_header(f'Predict: {period_start} till {period_end}')

            # test indices
            test_indeces = (timey >= period_start) & (timey <= period_end)
            testX, testtimey = X[test_indeces, :], timey[test_indeces]

            m = model()
            m.load(location=modeldir,
                   dir_name=f'{model_name}_decade{start_year}_lead{lead_time}')

            if pred_full is None:
                n_outputs = m.n_outputs
                output_names = m.output_names
                pred_full = np.zeros((n_outputs, 0))

            # Assign into a pre-shaped array so that the prediction always
            # ends up with the layout (output, time).
            pred = np.zeros((m.n_outputs, testX.shape[0]))
            pred[:, :] = m.predict(testX)

            # make the full time series
            pred_full = np.append(pred_full, pred, axis=1)
            timeytrue = timeytrue.append(testtimey)

            del m

        if timeytrue[0] != pd.to_datetime(expected_first_date):
            got_first_date = timeytrue[0].isoformat()[:10]
            # Adjacent f-strings instead of a backslash continuation, which
            # would embed the source indentation in the message.
            raise Exception(
                f"The first predicted date for lead time {lead_time} "
                f"is {got_first_date} but expected {expected_first_date}"
            )

        if pred_save is None:
            pred_save = np.zeros((n_outputs, len(timeytrue), len(lead_times)))

        pred_save[:, :, lead_idx] = pred_full

    # Save data to a netcdf file
    save_dict = {
        output_names[k]: (['target_season', 'lead'], pred_save[k, :, :])
        for k in range(n_outputs)
    }
    print(save_dict)

    ds = xr.Dataset(save_dict, coords={'target_season': timeytrue,
                                       'lead': lead_times})
    try:
        ds.to_netcdf(join(processeddir, f'{model_name}_forecasts.nc'))
    finally:
        # Release the dataset even if writing the file fails.
        ds.close()