
% ------- Data Reader of : Data-driven machine learning unveils emerging properties 
%         of metabolic regimes at the scale of an entire stream network ---------%


% Developed by Pier Luigi Segatto, Tom J. Battin, and Enrico Bertuzzo

% This script loads the final datasets used for training the RFs after
% feature selection (to avoid overfitting). Each .mat file refers to one of the
% six RFs that have been trained, i.e. PAR, T, GPP, ER, GPP including T and
% PAR predictions, and ER including T and PAR predictions. 
% Within each file the full dataset is stored in a table format in which
% each row represents an observation and each column represents a feature. 
% Training and Test datasets, for both setup T and S (see main text) are
% easily derived from the full table indexed using the appropriate index
% vectors (see below). 

% RF predictions for each reach of the Ybbs river network under both
% training setups S and T are loaded as well.

% The next lines show how to load the results of all RFs and explain how
% to derive the Training and Test datasets, using only the first RF (PAR)
% as an example.

% Start from a clean session: close all open figures, remove the workspace
% variables and clear the Command Window.
% clearvars is used instead of "clear all": the latter additionally clears
% functions and MEX links from memory, which slows down subsequent code and
% is not needed to reset the variables used by this script.
close all
clearvars
clc
%%
% ---- PAR random forest: dataset and predictions ----
% Loading the file brings into the workspace the variables used below
% (data table, index vectors and network-scale predictions).
load('DataSetPAR_RF.mat');

% DataTrainingPAR_RF holds 517 days of observations for all twelve sites
% used for training (number of rows: 517*12 = 6204). The final PAR RF used
% 17 features for predicting network-scale patterns. The last column stores
% the response variable, i.e. the measured PAR.

% Print a summary of the loaded variables and their units. A more complete
% description of the variables is available in Table S1.
summary(DataTrainingPAR_RF);

% Extract the table content as a plain array (brace indexing over all rows
% and columns returns the same array as table2array).
array_DataTrainingPAR_RF = DataTrainingPAR_RF{:, :};

% Training T: only one year per site has been used for training, while the
% remaining observations have been kept aside to test the output and to
% measure the model error.
RF_PAR_Training_Dataset_Under_Training_T = ...
    array_DataTrainingPAR_RF(PAR_Training_T_idx_For_Train, :);
RF_PAR_Test_Dataset_Under_Training_T = ...
    array_DataTrainingPAR_RF(PAR_Training_T_idx_For_Test, :);

% Under Training S one forest is trained per site: each forest uses a
% different site as test dataset and all the other sites as training
% dataset. Row k of PAR_Training_S_idx_For_Train / PAR_Training_S_idx_For_Test
% selects the rows of the full dataset used by forest k.

% Sizes are derived from the loaded data rather than hard-coded (for the
% PAR dataset they evaluate to 6204-517 training rows, 517 test rows,
% 18 columns and 12 forests), so the same lines can be reused for the other
% RF datasets.
numForests = size(PAR_Training_S_idx_For_Train, 1);
numColumns = size(array_DataTrainingPAR_RF, 2);
% Indexing with the first forest gives the number of selected rows whether
% the index rows are logical masks or numeric indices. All forests must
% select the same number of rows to be stacked in a 3-D array.
numTrainRows = size(array_DataTrainingPAR_RF(PAR_Training_S_idx_For_Train(1,:),:), 1);
numTestRows = size(array_DataTrainingPAR_RF(PAR_Training_S_idx_For_Test(1,:),:), 1);

% memory allocation
RF_PAR_Training_Dataset_Under_Training_S = zeros(numTrainRows, numColumns, numForests);
RF_PAR_Test_Dataset_Under_Training_S = zeros(numTestRows, numColumns, numForests);

% 3-D array assignment: the 3rd dimension stores the training and test
% datasets of the different forests. The loop index is not called "i" to
% avoid shadowing the imaginary unit.
for iForest = 1:numForests
    RF_PAR_Training_Dataset_Under_Training_S(:,:,iForest) = array_DataTrainingPAR_RF(PAR_Training_S_idx_For_Train(iForest,:),:);
    RF_PAR_Test_Dataset_Under_Training_S(:,:,iForest) = array_DataTrainingPAR_RF(PAR_Training_S_idx_For_Test(iForest,:),:);
end

% After training, the RF predictions for each of the 292 reaches have been
% stored in the PredictedPAR_Ybbs_Training_T and PredictedPAR_Ybbs_Training_S
% arrays. Each row stores the predicted reach-scale daily time series
% derived under training T and S, respectively.

% This plot shows the RF predictions of one reach under both setups.

% Reach (row of the prediction arrays) shown in the figure; defined once so
% that the plotted data and the panel titles cannot get out of sync.
reachToPlot = 21;

fig = figure('Units', 'centimeters', ...
    'Visible','on','color','white','InvertHardcopy','on');

% Left panel: predictions under training T
subplot(1,2,1)
plot(PredictedPAR_Ybbs_Training_T(reachToPlot,:),'-r','DisplayName','Training T')
ylabel('PAR [lux]')
xlabel('days [d]')
title(sprintf('Training T Reach %d', reachToPlot),'FontWeight', 'Normal')
set(gca,'Box','off')

% Right panel: predictions under training S
subplot(1,2,2)
plot(PredictedPAR_Ybbs_Training_S(reachToPlot,:),'-b','DisplayName','Training S')
xlabel('days [d]')
title(sprintf('Training S Reach %d', reachToPlot),'FontWeight', 'Normal')
set(gca,'Box','off')


%% Load T - Dataset and Predictions
% Each of the following sections loads every variable stored in the named
% .mat file directly into the workspace; each section can be run on its own.
% NOTE(review): the files presumably follow the same layout as the PAR file
% (data table, Training T / Training S index vectors, network-scale
% predictions) - confirm the variable names with whos('-file', <filename>).
load('DataSetT_RF.mat')
%% Load GPP - Dataset and Predictions
% GPP random forest file.
load('DataSetGPP_RF.mat')
%% Load GPP including T and PAR - Dataset and Predictions
% GPP random forest that includes the T and PAR predictions.
load('DataSetGPP_with_TPAR_RF.mat')
%% Load ER - Dataset and Predictions
% ER random forest file.
load('DataSetER_RF.mat')
%% Load ER including T and PAR - Dataset and Predictions
% ER random forest that includes the T and PAR predictions.
load('DataSetER_with_TPAR_RF.mat')


