Job power prediction result analysis
Processing the mean power prediction results
Outputs of the script run_prediction_per_user_allmethods_mean.py.
import pandas as pd
import seaborn as sns
import os
RESULTS_PATH = "../user-power-predictions/data/total_power_mean_predictions_users_allmethods_mean/"
PRED_COLS = ["hist_pred_total_power_mean",
"LinearRegression_total_power_mean_watts",
"RandomForestRegressor_total_power_mean_watts",
"LinearSVR_total_power_mean_watts",
"SGDRegressor_total_power_mean_watts"]
result_filenames = os.listdir(RESULTS_PATH)
df_all_results = pd.concat([pd.read_csv(RESULTS_PATH+filename, low_memory=False) for filename in result_filenames])
df_all_results = df_all_results.dropna(subset=PRED_COLS)
df_all_results.to_csv('/tmp/allresults-mean.csv', index=False)
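As a quick sanity check on the concatenated table, the following sketch (reusing the variables defined above; not part of the original script) reports how much data the per-user analysis starts from:
# Sketch (not in the original script): size of the concatenated prediction table.
print(f"{len(result_filenames)} result files, "
      f"{len(df_all_results)} rows with predictions from every method, "
      f"{df_all_results['user_id'].nunique()} distinct users")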
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
lst_users = df_all_results["user_id"].drop_duplicates().to_list()
#print(lst_users)
df_results_user_group = df_all_results.groupby("user_id")
lst_stats_per_user = []
for user in lst_users:
    results_user = df_results_user_group.get_group(user)
    # MAPE of each method's prediction against the measured per-job mean power, per user
    hist_mape = mean_absolute_percentage_error(results_user["total_power_mean_watts"], results_user["hist_pred_total_power_mean"])
    LR_mape = mean_absolute_percentage_error(results_user["total_power_mean_watts"], results_user["LinearRegression_total_power_mean_watts"])
    RF_mape = mean_absolute_percentage_error(results_user["total_power_mean_watts"], results_user["RandomForestRegressor_total_power_mean_watts"])
    LSVR_mape = mean_absolute_percentage_error(results_user["total_power_mean_watts"], results_user["LinearSVR_total_power_mean_watts"])
    SGD_mape = mean_absolute_percentage_error(results_user["total_power_mean_watts"], results_user["SGDRegressor_total_power_mean_watts"])
    res = {"user_id": user,
           "hist_mape": hist_mape,
           "LinearRegression_mape": LR_mape,
           "RandomForestRegressor_mape": RF_mape,
           "LinearSVR_mape": LSVR_mape,
           "SGDRegressor_mape": SGD_mape}
    lst_stats_per_user.append(res)
    #break
df_stats_per_user = pd.DataFrame(lst_stats_per_user)
df_stats_per_user
## user_id hist_mape LinearRegression_mape RandomForestRegressor_mape LinearSVR_mape SGDRegressor_mape
## 0 1249 0.087213 1.013289e-01 0.124200 0.104198 0.112710
## 1 1494 0.075278 7.187246e-02 0.074445 0.083109 0.075146
## 2 634 0.067456 3.187002e-01 0.209108 0.247893 0.311342
## 3 694 0.270083 6.136092e+11 0.231271 0.248924 0.270858
## 4 1365 0.223117 2.038785e-01 0.208285 0.197755 0.194399
## .. ... ... ... ... ... ...
## 571 1711 0.158810 1.669925e-01 0.218634 0.131284 0.213693
## 572 718 0.059795 1.269777e-01 0.052001 0.088634 0.068901
## 573 550 0.269676 2.704648e-01 0.259030 0.295231 0.283677
## 574 359 0.119872 2.296592e-01 0.160075 0.202770 0.214655
## 575 850 0.132682 3.073521e-01 0.344460 0.349007 0.314373
##
## [576 rows x 6 columns]
COLS = ["hist_mape","LinearRegression_mape","RandomForestRegressor_mape","LinearSVR_mape","SGDRegressor_mape"]
df_stats_per_user[COLS].describe()
## hist_mape LinearRegression_mape RandomForestRegressor_mape LinearSVR_mape SGDRegressor_mape
## count 5.760000e+02 5.760000e+02 5.760000e+02 5.760000e+02 5.760000e+02
## mean 6.297430e+12 6.681627e+12 6.684230e+12 6.261852e+12 6.478381e+12
## std 7.301566e+13 7.503549e+13 7.817233e+13 7.203878e+13 7.572169e+13
## min 4.004490e-03 3.718275e-03 1.193619e-02 1.126697e-02 2.253075e-03
## 25% 7.110493e-02 7.679108e-02 7.374854e-02 7.741257e-02 7.314237e-02
## 50% 1.151113e-01 1.286967e-01 1.176980e-01 1.275947e-01 1.217893e-01
## 75% 1.800437e-01 2.326375e-01 1.857662e-01 2.062293e-01 2.028217e-01
## max 1.310127e+15 1.323483e+15 1.424301e+15 1.232220e+15 1.364989e+15
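The extremely large mean, std, and max values in the summary above are a property of MAPE, which divides each absolute error by the actual value and is therefore unbounded when a job's actual mean power is close to zero; a few such users dominate the mean, while the medians (around 0.12 for every method) are the more representative figure. A small outlier-robust complement to describe(), reusing df_stats_per_user and COLS from above (a sketch, not part of the original analysis):
# Sketch (not in the original analysis): quantile summary that is insensitive
# to the few users whose near-zero actual power inflates MAPE.
print(df_stats_per_user[COLS].quantile([0.25, 0.50, 0.75, 0.95]))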
COLS = ["hist_mape","LinearRegression_mape","RandomForestRegressor_mape","LinearSVR_mape","SGDRegressor_mape"]
df_stats_per_user_pivot = pd.melt(df_stats_per_user, id_vars="user_id")
df_stats_per_user_pivot
## user_id variable value
## 0 1249 hist_mape 0.087213
## 1 1494 hist_mape 0.075278
## 2 634 hist_mape 0.067456
## 3 694 hist_mape 0.270083
## 4 1365 hist_mape 0.223117
## ... ... ... ...
## 2875 1711 SGDRegressor_mape 0.213693
## 2876 718 SGDRegressor_mape 0.068901
## 2877 550 SGDRegressor_mape 0.283677
## 2878 359 SGDRegressor_mape 0.214655
## 2879 850 SGDRegressor_mape 0.314373
##
## [2880 rows x 3 columns]
Figure 3 (a)
import matplotlib.pyplot as plt
TINY_SIZE = 2
SMALL_SIZE = 5
MEDIUM_SIZE = 20
BIGGER_SIZE = 50
FIG_WIDTH = 40
FIG_HEIGHT = 10
#plt.rc('font', size=16) # controls default text sizes
plt.rc('font', size=20) # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE) # legend fontsize
plt.rc('figure', titlesize=MEDIUM_SIZE) # fontsize of the figure title
plt.rc('figure', figsize=(8,4))
#g = sns.boxplot(x="variable", y="value", data=df_stats_per_user_pivot, showfliers=False)
#plt.xticks(ticks=[0,1,2,3,4],labels=["History", "LinearRegression", "RandomForest", "LinearSVR", "SGDRegressor"],rotation=30)
g = sns.boxplot(y="variable", x="value", data=df_stats_per_user_pivot, showfliers=False)
plt.yticks(ticks=[0,1,2,3,4],labels=["History", "LinearRegression", "RandomForest", "LinearSVR", "SGDRegressor"],rotation=0)
## ([<matplotlib.axis.YTick object at 0x7fffa0beef90>, <matplotlib.axis.YTick object at 0x7fff9e197410>, <matplotlib.axis.YTick object at 0x7fff9d1cafd0>, <matplotlib.axis.YTick object at 0x7fff9d216bd0>, <matplotlib.axis.YTick object at 0x7fff9dc8ffd0>], [Text(0, 0, 'History'), Text(0, 1, 'LinearRegression'), Text(0, 2, 'RandomForest'), Text(0, 3, 'LinearSVR'), Text(0, 4, 'SGDRegressor')])
Processing the max power prediction results
Outputs of the script run_prediction_per_user_allmethods_max.py.
import pandas as pd
import seaborn as sns
import os
RESULTS_PATH = "../user-power-predictions/data/total_power_mean_predictions_users_allmethods_max/"
PRED_COLS = ["hist_pred_total_power_max",
"LinearRegression_total_power_max_watts",
"RandomForestRegressor_total_power_max_watts",
"LinearSVR_total_power_max_watts",
"SGDRegressor_total_power_max_watts"]
result_filenames = os.listdir(RESULTS_PATH)
df_all_results = pd.concat([pd.read_csv(RESULTS_PATH+filename, low_memory=False) for filename in result_filenames])
df_all_results = df_all_results.dropna(subset=PRED_COLS)
df_all_results.to_csv('/tmp/allresults-max.csv', index=False)
#df_all_results
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
lst_users = df_all_results["user_id"].drop_duplicates().to_list()
#print(lst_users)
df_results_user_group = df_all_results.groupby("user_id")
lst_stats_per_user = []
for user in lst_users:
    results_user = df_results_user_group.get_group(user)
    # MAPE of each method's prediction against the measured per-job max power, per user
    hist_mape = mean_absolute_percentage_error(results_user["total_power_max_watts"], results_user["hist_pred_total_power_max"])
    LR_mape = mean_absolute_percentage_error(results_user["total_power_max_watts"], results_user["LinearRegression_total_power_max_watts"])
    RF_mape = mean_absolute_percentage_error(results_user["total_power_max_watts"], results_user["RandomForestRegressor_total_power_max_watts"])
    LSVR_mape = mean_absolute_percentage_error(results_user["total_power_max_watts"], results_user["LinearSVR_total_power_max_watts"])
    SGD_mape = mean_absolute_percentage_error(results_user["total_power_max_watts"], results_user["SGDRegressor_total_power_max_watts"])
    res = {"user_id": user,
           "hist_mape": hist_mape,
           "LinearRegression_mape": LR_mape,
           "RandomForestRegressor_mape": RF_mape,
           "LinearSVR_mape": LSVR_mape,
           "SGDRegressor_mape": SGD_mape}
    lst_stats_per_user.append(res)
    #break
df_stats_per_user = pd.DataFrame(lst_stats_per_user)
#df_stats_per_user
COLS = ["hist_mape","LinearRegression_mape","RandomForestRegressor_mape","LinearSVR_mape","SGDRegressor_mape"]
df_stats_per_user[COLS].describe()
## hist_mape LinearRegression_mape RandomForestRegressor_mape LinearSVR_mape SGDRegressor_mape
## count 5.760000e+02 5.760000e+02 5.760000e+02 5.760000e+02 5.760000e+02
## mean 7.703733e+12 7.850490e+12 7.442731e+12 7.222109e+12 7.435384e+12
## std 8.970551e+13 8.518249e+13 8.367890e+13 8.352948e+13 8.543374e+13
## min 3.036269e-03 4.983389e-03 1.162791e-03 1.479376e-02 1.651753e-04
## 25% 1.040980e-01 1.174098e-01 1.088291e-01 1.136779e-01 1.144742e-01
## 50% 1.959539e-01 2.155061e-01 1.940571e-01 1.956479e-01 2.051810e-01
## 75% 3.224582e-01 3.794099e-01 3.158385e-01 3.447812e-01 3.413483e-01
## max 1.533300e+15 1.539987e+15 1.431193e+15 1.489713e+15 1.556222e+15
COLS = ["hist_mape","LinearRegression_mape","RandomForestRegressor_mape","LinearSVR_mape","SGDRegressor_mape"]
df_stats_per_user_pivot = pd.melt(df_stats_per_user, id_vars="user_id")
df_stats_per_user_pivot
## user_id variable value
## 0 1249 hist_mape 0.066269
## 1 1494 hist_mape 0.085390
## 2 634 hist_mape 0.141988
## 3 694 hist_mape 0.326237
## 4 1365 hist_mape 0.487555
## ... ... ... ...
## 2875 1711 SGDRegressor_mape 0.208962
## 2876 718 SGDRegressor_mape 0.538712
## 2877 550 SGDRegressor_mape 0.530643
## 2878 359 SGDRegressor_mape 0.442287
## 2879 850 SGDRegressor_mape 0.409482
##
## [2880 rows x 3 columns]
Figure 3 (b)
import matplotlib.pyplot as plt
TINY_SIZE = 2
SMALL_SIZE = 5
MEDIUM_SIZE = 20
BIGGER_SIZE = 50
FIG_WIDTH = 40
FIG_HEIGHT = 10
plt.rc('font', size=20) # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE) # legend fontsize
plt.rc('figure', titlesize=MEDIUM_SIZE) # fontsize of the figure title
plt.rc('figure', figsize=(8,4))
#g = sns.boxplot(x="variable", y="value", data=df_stats_per_user_pivot, showfliers=False)
#plt.xticks(ticks=[0,1,2,3,4],labels=["History", "LinearRegression", "RandomForest", "LinearSVR", "SGDRegressor"],rotation=30)
#g.set_xlabel("Prediction Method")
#g.set_ylabel("Mean Absolute Percentage Error (MAPE) ")
g = sns.boxplot(y="variable", x="value", data=df_stats_per_user_pivot, showfliers=False)
plt.yticks(ticks=[0,1,2,3,4],labels=["History", "LinearRegression", "RandomForest", "LinearSVR", "SGDRegressor"],rotation=0)
## ([<matplotlib.axis.YTick object at 0x7fff9d9d4e50>, <matplotlib.axis.YTick object at 0x7fff9f6a1f50>, <matplotlib.axis.YTick object at 0x7fff9d550910>, <matplotlib.axis.YTick object at 0x7fffa1772e10>, <matplotlib.axis.YTick object at 0x7fffa187e6d0>], [Text(0, 0, 'History'), Text(0, 1, 'LinearRegression'), Text(0, 2, 'RandomForest'), Text(0, 3, 'LinearSVR'), Text(0, 4, 'SGDRegressor')])
Getting the actual mean and max power distributions
The chunks that attach the tidyverse and read the job table into data_mean and data_max with readr::read_csv() are not echoed here; only their console output appears below.
## 4907
## ── Attaching core tidyverse packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 523204 Columns: 111
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): cpus_alloc_layout, job_state, nodes, resv_name, time_limit_str, t...
## dbl (34): index, Unnamed: 0, array_job_id, array_task_id, end_time, group_i...
## lgl (69): accrue_time, alloc_node, alloc_sid, array_max_tasks, array_task_s...
## dttm (1): submission_datetime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_mean %>% ggplot(aes(x=total_power_mean_watts)) +
geom_histogram() +
scale_y_continuous(labels = scales::label_number()) +
theme_bw(base_size=20) +
labs(
x='Total power (W)',
y='Number of jobs'
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 523204 Columns: 111
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): cpus_alloc_layout, job_state, nodes, resv_name, time_limit_str, t...
## dbl (34): index, Unnamed: 0, array_job_id, array_task_id, end_time, group_i...
## lgl (69): accrue_time, alloc_node, alloc_sid, array_max_tasks, array_task_s...
## dttm (1): submission_datetime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_max %>% ggplot(aes(x=total_power_max_watts)) +
geom_histogram() +
scale_y_continuous(labels = scales::label_number()) +
theme_bw(base_size=20) +
labs(
x='Total power (W)',
y='Number of jobs'
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.