This notebook contains the code used to create the CSV files with the model targets, as well as the code for running the random forest (RF) models themselves (including the tracking of Shapley values and feature importances).

In [None]:
import numpy as np
import pickle
import pandas as pd
from functools import partial
import glob

In [None]:
# Directory names for the feature, position and fitness files.
# NOTE(review): the functions visible in this notebook hard-code their paths
# instead of reading these variables — confirm whether they are still used.
feature_loc = "features/"
position_loc = "positions/"
fitness_loc = "fitnesses/"
# NOTE(review): "OUTPUT DIR" looks like a placeholder to be replaced by a real directory.
results_loc = "OUTPUT DIR"

In [None]:
# Experiment grid: algorithm names, the ids 1..24, the 200 split points
# 50, 100, ..., 10000 and the value of `dim`.
algorithms = [
    'DifferentialEvolution',
    'ConfiguredPSO',
    'modcma',
]
fids = range(1, 25)
splits = list(range(50, 50 * 200 + 1, 50))
dim = 10

In [None]:
# One record per (a1, fid, a2, splitpoint) combination; `dim` is the same for every row.
records = [
    [a1, fid, a2, s, dim]
    for a1 in algorithms
    for fid in fids
    for a2 in algorithms
    for s in splits
]
dt_usecases = pd.DataFrame.from_records(
    records,
    columns=['a1', 'fid', 'a2', 'splitpoint', 'dim'],
)

In [None]:
def get_target_dt(fid, iid, rep, a1, type_ = 'short'):
    """Build the table of targets and features for a single run.

    The fitness array of the run is loaded and clipped from below at 1e-10.
    For every used split point the minimum fitness in the following 500
    entries is stored as 'cont'. For each entry of `algorithms`, the matching
    'res' values of the global `dt_targets` table are stored as `sw_<alg>`.
    Finally the first 91 columns of the feature file of this run are attached.

    Relies on the module-level names `splits`, `algorithms`, `feature_names`
    and `dt_targets` (the last one is not defined in the visible part of this
    notebook).

    :param fid: value used in the file names and matched against dt_targets['fid']
    :param iid: value used in the file names and matched against dt_targets['iid']
    :param rep: value used in the file names and matched against dt_targets['rep']
    :param a1: algorithm name used in the file names and matched against dt_targets['a1']
    :param type_: 'short' starts at split index 0, 'medium' at 2, anything
                  else at 4; also selects the feature sub-directory
    :return: the combined DataFrame, or None (after printing the run ids)
             when loading raises ValueError or fewer than 500 fitness values
             are available
    """
    try:
        trace = np.load(f"fitnesses/{a1}_F{fid}_I{iid}_R{rep}.npy")
        # Clip from below so that no value is smaller than 1e-10.
        trace[trace < 1e-10] = 1e-10
    except ValueError:
        print(fid, iid, rep, a1)
        return None
    if len(trace) < 500:
        print(fid, iid, rep, a1)
        return None

    first_split = 0 if type_ == 'short' else (2 if type_ == 'medium' else 4)
    # Only keep split points that leave a window of 500 entries inside the trace.
    last_split = int(np.floor((trace.shape[0] - 500) /50))
    used_splits = splits[first_split:last_split]

    dt_temp = pd.DataFrame(
        [np.min(trace[sp:sp+500]) for sp in used_splits],
        columns=['cont'],
    )
    dt_temp['sp'] = used_splits

    same_run = (
        (dt_targets['fid'] == fid)
        & (dt_targets['iid'] == iid)
        & (dt_targets['rep'] == rep)
    )
    run_targets = dt_targets[same_run]
    for alg in algorithms:
        wanted = (
            (run_targets['a1'] == a1)
            & (run_targets['a2'] == alg)
            & (run_targets['sp'].isin(used_splits))
        )
        dt_temp[f"sw_{alg}"] = np.array(run_targets[wanted]['res'])

    for column, value in (('fid', fid), ('iid', iid), ('rep', rep), ('a1', a1)):
        dt_temp[column] = value

    ela = np.load(f"/mnt/g/SPECIES/Features_ELA/{type_}/{a1}_F{fid}_I{iid}_R{rep}.npy")
    feature_frame = pd.DataFrame(ela[:len(used_splits), :91], columns=feature_names)
    return pd.concat([dt_temp, feature_frame], axis=1)

In [None]:
# Feature names; get_target_dt uses them as column labels for the feature matrix.
feature_names = np.load("All_Feature_names.npy")

In [None]:
def create_and_store_dt(type_ = 'short'):
    """Assemble, filter, normalize and store the overall ML table for one setting.

    Steps:
      1. Collect the per-run tables from `get_target_dt` for fid 1..24,
         iid 1..5, repetition 0..4 and every entry of `algorithms`; runs for
         which `get_target_dt` returns None are skipped.
      2. Drop features whose name contains 'basic', that have fewer than 3
         unique values, or that are finite in less than 90% of the rows in
         which 'basic.dim' is finite.
      3. Add one `rel_<alg>` column per algorithm:
         (1 - min(sw, cont) / max(sw, cont)), positive when sw < cont and
         negative otherwise.
      4. Remove rows in which any remaining feature is non-finite or larger
         than 1e100 in absolute value.
      5. Standardize every remaining feature (subtract mean, divide by std).
         If the largest standardized value exceeds 4, the standardized
         logarithm is used instead, unless that contains NaN.
      6. Write the result to a CSV file.

    The feature columns are addressed positionally: `get_target_dt` puts 9
    non-feature columns first, followed by the 91 feature columns, so after
    dropping features they occupy columns 9 .. 100 - len(feats_remove).

    :param type_: setting name passed on to `get_target_dt` and used in the
                  name of the output file
    :return: tuple (normalized DataFrame, index one past the last feature column)
    :raises ValueError: when no run produced a table
    """
    # Collect the frames first and concatenate once: growing a DataFrame with
    # DataFrame.append is quadratic and no longer available in pandas >= 2.0.
    frames = []
    for fid in range(1,25):
        for iid in range(1,6):
            for r in range(5):
                for a1 in algorithms:
                    run_dt = get_target_dt(fid, iid, r, a1, type_)
                    if run_dt is not None:
                        frames.append(run_dt)
    if not frames:
        raise ValueError(f"No usable runs found for type_={type_!r}")
    dt_overall = pd.concat(frames)

    # Reference count for the "finite in at least 90% of the rows" check.
    n_reference = np.sum(np.isfinite(dt_overall['basic.dim']))
    feats_remove, remaining_features = [], []
    for fname in feature_names:
        if 'basic' in fname: #Remove the features from the 'basic' set
            feats_remove.append(fname)
        elif len(np.unique(dt_overall[fname])) < 3:
            feats_remove.append(fname)
        elif np.sum(np.isfinite(dt_overall[fname])) < 0.9*n_reference:
            print(np.sum(np.isfinite(dt_overall[fname])), fname)
            feats_remove.append(fname)
        else:
            remaining_features.append(fname)

    dt_overall = dt_overall.drop(feats_remove, axis=1)
    for alg in algorithms:
        relative_vals = [(1-(min(x,y)/max(x,y)))*(2*int(x<y)-1) for x,y in zip(dt_overall[f'sw_{alg}'], dt_overall['cont'])]
        dt_overall[f'rel_{alg}'] = relative_vals

    print(dt_overall)
    feat_end = 100-len(feats_remove)
    finite_rows = np.isfinite(dt_overall.iloc[:, 9:feat_end]).all(axis=1)
    fully_filtered = dt_overall[finite_rows]
    huge_rows = (np.abs(fully_filtered.iloc[:, 9:feat_end]) > 1e100).any(axis=1)
    # Explicit copy: the columns are overwritten below, which must not be
    # done on a slice of dt_overall (SettingWithCopyWarning).
    fully_filtered = fully_filtered[~huge_rows].copy()
    print(fully_filtered)

    for fname in remaining_features:
        vals = fully_filtered[fname]
        normalized = (vals - np.mean(vals))/np.std(vals)
        if np.max(normalized) > 4:
            # Heavy right tail: try the log scale, but only keep it when it is
            # well defined for every row.
            vals = np.log(fully_filtered[fname])
            normalized_log = (vals - np.mean(vals))/np.std(vals)
            if not np.isnan(normalized_log).any():
                normalized = normalized_log
        fully_filtered[fname] = normalized
        print(np.max(normalized))

    fully_filtered.to_csv(f"/mnt/g/SPECIES/overall_ml_table_{type_}_normalized.csv")

    return fully_filtered, feat_end

In [None]:
# Build and store the normalized table for the 'long' setting.
create_and_store_dt('long')

In [None]:
# Build and store the normalized tables for the 'medium' and 'short' settings.
create_and_store_dt('medium')
create_and_store_dt('short')

In [None]:
from itertools import product
from multiprocessing import Pool, cpu_count

In [None]:
def runParallelFunction(runFunction, arguments):
    """
        Return the output of runFunction for each element of arguments,
        making use of as much parallelization as possible on this system

        :param runFunction: The function that can be executed in parallel;
                            it is called with ONE argument per task
        :param arguments:   Iterable of task inputs. Each element is passed
                            to runFunction as a single argument (tuples are
                            not unpacked)
        :return:            List with the results, in the order of arguments;
                            an empty list when there are no arguments
    """
    arguments = list(arguments)
    if not arguments:
        # Pool(0) raises ValueError, and there is nothing to run anyway.
        return []

    # The context manager shuts the pool down even when a task raises, so no
    # worker processes are left behind.
    with Pool(min(cpu_count(), len(arguments))) as pool:
        return pool.map(runFunction, arguments)


In [None]:
# Load the stored normalized table for the 'short' setting.
# NOTE(review): this reads from /mnt/d/, while the other cells visible in this
# notebook write to and read from /mnt/g/ — confirm the drive is intended.
fully_filtered = pd.read_csv("/mnt/d/SPECIES/overall_ml_table_short_normalized.csv", index_col=0)

In [None]:
import shap

In [None]:
import pickle

In [None]:
def get_rf_results(ipt):
    """Fit a random forest regressor for one (fid, a1, a2) case and store its outputs.

    Rows of the global `fully_filtered` table with a1 == `a1` are split into
    a test set (fid == `fid`) and a training set (all other fids). The
    regressor predicts the `rel_<a2>` column from the feature columns.

    Written to RF_Results_norm/<type_>/Real/ (`type_` is a module-level name):
      * Shaps_F<fid>_<a1>_<a2>.pkl - SHAP values for 100 training rows drawn
        with replacement
      * Feats_F<fid>_<a1>_<a2>.npy - the forest's feature importances
      * F<fid>_<a1>_<a2>.csv       - the test rows plus 'Predicted' and 'Real'

    Feature columns are taken positionally as columns 9 .. len(columns) - 3.
    NOTE(review): this assumes 9 leading non-feature columns and exactly three
    trailing rel_<alg> columns — confirm against the stored table.

    :param ipt: tuple (fid, a1, a2)
    :return: None
    """
    import os  # local import; only needed to create the output directory

    fid, a1, a2 = ipt

    # Create the output directory up front, so that a missing directory does
    # not abort the run only after the model has been fitted.
    out_dir = f"RF_Results_norm/{type_}/Real"
    os.makedirs(out_dir, exist_ok=True)

    ml_data_train = fully_filtered[(fully_filtered['fid'] != fid) & (fully_filtered['a1'] == a1)]
    # Copy: columns are added to the test table below, which must not happen
    # on a slice of fully_filtered (SettingWithCopyWarning).
    ml_data_test = fully_filtered[(fully_filtered['fid'] == fid) & (fully_filtered['a1'] == a1)].copy()

    rf = RandomForestRegressor(oob_score=False)

    end_idx = len(ml_data_train.columns)-3
    feats = ml_data_train.iloc[:, 9:end_idx]
    targets = ml_data_train[f'rel_{a2}']

    feats_test = ml_data_test.iloc[:, 9:end_idx]
    targets_test = ml_data_test[f'rel_{a2}']

    rf.fit(feats, targets)

    preds = rf.predict(feats_test)

    ml_data_test['Predicted'] = np.array(preds)
    ml_data_test['Real'] = np.array(targets_test)

    # SHAP values are only computed on a random sample of 100 training rows.
    idxs = np.random.choice(len(feats), 100)
    explainer = shap.Explainer(rf)
    shap_values = explainer(feats.iloc[idxs], check_additivity=False)
    with open(f"{out_dir}/Shaps_F{fid}_{a1}_{a2}.pkl", 'wb') as f:
        pickle.dump(shap_values, f)

    np.save(f"{out_dir}/Feats_F{fid}_{a1}_{a2}", np.array(rf.feature_importances_))
    ml_data_test.to_csv(f"{out_dir}/F{fid}_{a1}_{a2}.csv")

In [None]:
def get_rf_results_bool(ipt):
    """Fit a random forest classifier for one (fid, a1, a2) case and store its outputs.

    Rows of the global `fully_filtered` table with a1 == `a1` are split into
    a test set (fid == `fid`) and a training set (all other fids). The
    classifier predicts whether `rel_<a2>` is at least 0.01.

    Written to RF_Results_norm/<type_>/Bool/ (`type_` is a module-level name):
      * Shaps_F<fid>_<a1>_<a2>.pkl - SHAP values for 100 training rows drawn
        with replacement
      * Feats_F<fid>_<a1>_<a2>.npy - the forest's feature importances
      * F<fid>_<a1>_<a2>.csv       - the test rows plus 'Predicted' and 'Real'

    Feature columns are taken positionally as columns 9 .. len(columns) - 3.
    NOTE(review): this assumes 9 leading non-feature columns and exactly three
    trailing rel_<alg> columns — confirm against the stored table.

    :param ipt: tuple (fid, a1, a2)
    :return: None
    """
    import os  # local import; only needed to create the output directory

    fid, a1, a2 = ipt

    # Create the output directory up front, so that a missing directory does
    # not abort the run only after the model has been fitted.
    out_dir = f"RF_Results_norm/{type_}/Bool"
    os.makedirs(out_dir, exist_ok=True)

    ml_data_train = fully_filtered[(fully_filtered['fid'] != fid) & (fully_filtered['a1'] == a1)]
    # Copy: columns are added to the test table below, which must not happen
    # on a slice of fully_filtered (SettingWithCopyWarning).
    ml_data_test = fully_filtered[(fully_filtered['fid'] == fid) & (fully_filtered['a1'] == a1)].copy()

    rf = RandomForestClassifier(oob_score=False)
    end_idx = len(ml_data_train.columns)-3

    # Binary target: True when the relative value reaches the 0.01 threshold.
    feats = ml_data_train.iloc[:, 9:end_idx]
    targets = ml_data_train[f'rel_{a2}'] >= 0.01

    feats_test = ml_data_test.iloc[:, 9:end_idx]
    targets_test = ml_data_test[f'rel_{a2}'] >= 0.01

    rf.fit(feats, targets)

    preds = rf.predict(feats_test)

    ml_data_test['Predicted'] = np.array(preds)
    ml_data_test['Real'] = np.array(targets_test)

    # SHAP values are only computed on a random sample of 100 training rows.
    idxs = np.random.choice(len(feats), 100)
    explainer = shap.Explainer(rf)
    shap_values = explainer(feats.iloc[idxs], check_additivity=False)
    with open(f"{out_dir}/Shaps_F{fid}_{a1}_{a2}.pkl", 'wb') as f:
        pickle.dump(shap_values, f)

    np.save(f"{out_dir}/Feats_F{fid}_{a1}_{a2}", np.array(rf.feature_importances_))
    ml_data_test.to_csv(f"{out_dir}/F{fid}_{a1}_{a2}.csv")

In [None]:
def get_rf_results_cat(ipt):
    """Fit a multi-class random forest for one (fid, a1) case and store its outputs.

    Rows of the global `fully_filtered` table with a1 == `a1` are split into
    a test set (fid == `fid`) and a training set (all other fids). The class
    label of a row is the position of the largest value among the
    `rel_<alg>` columns (in the order of `algorithms`) followed by a constant
    0 column named 'stay'; ties resolve to the earliest column.

    Written to RF_Results_norm/<type_>/Cat/ (`type_` is a module-level name):
      * Shaps_F<fid>_<a1>.pkl - SHAP values for 100 training rows drawn with
        replacement
      * Feats_F<fid>_<a1>.npy - the forest's feature importances
      * F<fid>_<a1>.csv       - the test rows plus 'Predicted' and 'Real'

    Feature columns are taken positionally as columns 9 .. len(columns) - 3.
    NOTE(review): this assumes 9 leading non-feature columns and exactly three
    trailing rel_<alg> columns — confirm against the stored table.

    :param ipt: tuple (fid, a1)
    :return: None
    """
    import os  # local import; only needed to create the output directory

    fid, a1 = ipt

    # Create the output directory up front, so that a missing directory does
    # not abort the run only after the model has been fitted.
    out_dir = f"RF_Results_norm/{type_}/Cat"
    os.makedirs(out_dir, exist_ok=True)

    ml_data_train = fully_filtered[(fully_filtered['fid'] != fid) & (fully_filtered['a1'] == a1)]
    # Copy: columns are added to the test table below, which must not happen
    # on a slice of fully_filtered (SettingWithCopyWarning).
    ml_data_test = fully_filtered[(fully_filtered['fid'] == fid) & (fully_filtered['a1'] == a1)].copy()
    end_idx = len(ml_data_train.columns)-3
    rel_columns = [f'rel_{x}' for x in algorithms]

    rf = RandomForestClassifier(oob_score=False)

    # .assign returns a new frame, so the 'stay' column never touches a slice
    # of the original table.
    feats = ml_data_train.iloc[:, 9:end_idx]
    targets = np.argmax(np.array(ml_data_train[rel_columns].assign(stay=0)), axis=1)

    feats_test = ml_data_test.iloc[:, 9:end_idx]
    targets_test = np.argmax(np.array(ml_data_test[rel_columns].assign(stay=0)), axis=1)

    rf.fit(feats, targets)

    preds = rf.predict(feats_test)

    ml_data_test['Predicted'] = np.array(preds)
    ml_data_test['Real'] = np.array(targets_test)

    # SHAP values are only computed on a random sample of 100 training rows.
    idxs = np.random.choice(len(feats), 100)
    explainer = shap.Explainer(rf)
    shap_values = explainer(feats.iloc[idxs], check_additivity=False)
    with open(f"{out_dir}/Shaps_F{fid}_{a1}.pkl", 'wb') as f:
        pickle.dump(shap_values, f)

    np.save(f"{out_dir}/Feats_F{fid}_{a1}", np.array(rf.feature_importances_))
    ml_data_test.to_csv(f"{out_dir}/F{fid}_{a1}.csv")

In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from scipy.stats import kendalltau
from sklearn.model_selection import cross_val_score, cross_val_predict


In [None]:
# Same list as the `algorithms` assignment in the setup cell near the top of the notebook.
# NOTE(review): presumably repeated so that the model-fitting cells can be run on their own — confirm.
algorithms = ['DifferentialEvolution', 'ConfiguredPSO', 'modcma']

In [None]:
# For each setting: load the stored table, then fit the models in parallel.
# The worker functions read `type_` and `fully_filtered` as module-level names.
# (Instead of reading the CSV, the table can be rebuilt with create_and_store_dt(type_).)
for type_ in ['long', 'medium', 'short']:
    fully_filtered = pd.read_csv(f"/mnt/g/SPECIES/overall_ml_table_{type_}_normalized.csv", index_col=0)
    if type_ != 'long':
        # The pairwise (a1, a2) models are not run for the 'long' setting.
        for pairwise_model in (get_rf_results, get_rf_results_bool):
            args = product(range(1,25), algorithms, algorithms)
            runParallelFunction(pairwise_model, args)
    args = product(range(1,25), algorithms)
    runParallelFunction(get_rf_results_cat, args)