"""
This module aims to standardize the training and evaluation procedure of machine
learning models that are used for the ENSO prediction. At the
core of the standardized training and evaluation is the split of the entire
time series into several decades, namely 1962-1971, 1972-1981, ...,2012-2018.
At first, one of these decades is spared for later evaluation. Then, the remaining
data set is used in the :func:`ninolearn.learn.fit_predict.cross_fit` method to train the model AND optimize the
architecture of the model, the so-called hyperparameter optimization. This
hyperparameters optimization is currently done using a random search algorithm.
The described procedure is repeated until each decade was once spared for later
evaluation.
With the :func:`ninolearn.learn.fit_predict.cross_predict` method a prediction for the full time series
of the ONI can be made. Here, for each decade, the model that was NOT trained on
the corresponding decade is used for the prediction.
"""
import numpy as np
import pandas as pd
import xarray as xr
from os.path import join
from ninolearn.utils import print_header, small_print_header
from ninolearn.pathes import modeldir, processeddir
# evaluation decades
# Start years of the periods spared for evaluation; the last entry (2018)
# marks the end of the available data (decades are 1962-1971, ..., 2012-2018).
decades = [1962, 1972, 1982, 1992, 2002, 2012, 2018]
n_decades = len(decades)
# lead times for the evaluation
# Forecast lead times in months for which models are trained and evaluated.
lead_times = [0, 3, 6, 9, 12, 15]
n_lead = len(lead_times)
def cross_fit(model, pipeline, n_iter, **kwargs):
    """
    Train the model on different training sets, each time sparing the
    period corresponding to one decade out of 1962-1971, 1972-1981, ...,
    2012-2018 for later testing.

    One model per (lead time, spared decade) combination is fitted with a
    random hyperparameter search and saved to ``modeldir``.

    :param model: A model class that follows the guidelines of how a model
        object should be set up (must provide ``.set_parameters()``,
        ``.fit_RandomizedSearch()``, ``.save()`` and a ``.name`` attribute).

    :param pipeline: A function that takes the lead time as argument and
        returns the corresponding feature, label, time and persistence.

    :type n_iter: int
    :param n_iter: The number of iterations for the random search.

    :param kwargs: Keyword arguments that shall be passed to the
        ``.set_parameters()`` method of the provided model.
    """
    # use the module-level lead_times so training and prediction stay in sync
    for lead_time in lead_times:
        X, y, timey, yp = pipeline(lead_time, return_persistance=True)
        print_header(f'Lead time: {lead_time} month')

        # NOTE(review): this also iterates the final entry (2018), training a
        # model whose test period (2018-2027) lies mostly outside the data;
        # cross_predict() never loads that model. Kept for backward
        # compatibility with existing saved-model directories.
        for decade in decades:
            small_print_header(f'Test period: {decade}-01-01 till {decade+9}-12-01')

            # mask out the test decade; train on everything else
            test_indeces = (timey >= f'{decade}-01-01') & (timey <= f'{decade+9}-12-01')
            train_indeces = np.invert(test_indeces)
            trainX, trainy = X[train_indeces, :], y[train_indeces]

            # fresh model instance per fold so no weights leak between folds
            m = model()
            m.set_parameters(**kwargs)
            m.fit_RandomizedSearch(trainX, trainy, n_iter=n_iter)
            m.save(location=modeldir, dir_name=f'{m.name}_decade{decade}_lead{lead_time}')
            del m
def cross_predict(model, pipeline, model_name):
    """
    Generate a hindcast from 1962 till today using the models which were
    trained by the ``cross_fit()`` method. For each decade, the model that
    was NOT trained on the corresponding decade is used for the prediction.

    The stitched forecasts for all lead times are written to
    ``<processeddir>/<model_name>_forecasts.nc``.

    :param model: The considered model class (must provide ``.load()``,
        ``.predict()``, ``.n_outputs`` and ``.output_names``).

    :param pipeline: The data pipeline that already was used before in
        ``cross_fit()``.

    :param model_name: The name under which the trained models were saved.

    :raises Exception: If the stitched time series does not start on
        1962-01-01 for some lead time.
    """
    first_lead_loop = True

    for lead_index, lead_time in enumerate(lead_times):
        print_header(f'Lead time: {lead_time} months')
        X, y, timey, y_persistance = pipeline(lead_time, return_persistance=True)

        ytrue = np.array([])
        timeytrue = pd.DatetimeIndex([])

        first_dec_loop = True
        for j in range(n_decades - 1):
            small_print_header(f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

            # test indices for the decade that this model was NOT trained on
            test_indeces = (timey >= f'{decades[j]}-01-01') & (timey <= f'{decades[j+1]-1}-12-01')
            testX, testy, testtimey = X[test_indeces, :], y[test_indeces], timey[test_indeces]

            m = model()
            m.load(location=modeldir, dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}')

            # allocate arrays and variables for which the model must be loaded
            if first_dec_loop:
                n_outputs = m.n_outputs
                output_names = m.output_names
                pred_full = np.zeros((n_outputs, 0))
                first_dec_loop = False

            # make prediction
            pred = np.zeros((m.n_outputs, testX.shape[0]))
            pred[:, :] = m.predict(testX)

            # stitch the decade prediction onto the full time series
            pred_full = np.append(pred_full, pred, axis=1)
            ytrue = np.append(ytrue, testy)
            timeytrue = timeytrue.append(testtimey)
            del m

        if timeytrue[0] != pd.to_datetime('1962-01-01'):
            expected_first_date = '1962-01-01'
            got_first_date = timeytrue[0].isoformat()[:10]
            # implicit string concatenation: the former backslash line
            # continuation embedded a long run of indentation spaces into
            # the error message
            raise Exception(
                f"The first predicted date for lead time {lead_time} "
                f"is {got_first_date} but expected {expected_first_date}"
            )

        # allocate arrays and variables for which the full length of the time
        # series must be known
        if first_lead_loop:
            n_time = len(timeytrue)
            pred_save = np.zeros((n_outputs, n_time, n_lead))
            first_lead_loop = False

        pred_save[:, :, lead_index] = pred_full

    # Save data to a netcdf file
    save_dict = {}
    for out_index in range(n_outputs):
        save_dict[output_names[out_index]] = (['target_season', 'lead'],
                                              pred_save[out_index, :, :])

    ds = xr.Dataset(save_dict, coords={'target_season': timeytrue,
                                       'lead': lead_times})
    ds.to_netcdf(join(processeddir, f'{model_name}_forecasts.nc'))
    ds.close()