Job power prediction result analysis

Processing the mean power prediction results

This section processes the outputs of the script run_prediction_per_user_allmethods_mean.py.

import pandas as pd
import seaborn as sns

import os

# Directory holding the result files for the mean-power predictions.
RESULTS_PATH = "../user-power-predictions/data/total_power_mean_predictions_users_allmethods_mean/"

# Columns holding the prediction of each method; a row is kept only when all
# of them are present.
PRED_COLS = ["hist_pred_total_power_mean",
            "LinearRegression_total_power_mean_watts",
            "RandomForestRegressor_total_power_mean_watts",
            "LinearSVR_total_power_mean_watts",
            "SGDRegressor_total_power_mean_watts"]


# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the concatenated table (and of everything derived from it) reproducible.
result_filenames = sorted(os.listdir(RESULTS_PATH))

# Read every result file and stack them into a single table.
df_all_results = pd.concat(
    [pd.read_csv(os.path.join(RESULTS_PATH, filename), low_memory=False)
     for filename in result_filenames]
)

# Drop rows where at least one of the prediction columns is missing.
df_all_results = df_all_results.dropna(subset=PRED_COLS)
# Persist the filtered table; it is read back further down to plot the
# distribution of total_power_mean_watts.
df_all_results.to_csv('/tmp/allresults-mean.csv', index=False)


from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# Compute, for every user, the MAPE of each prediction method against the
# actual value stored in "total_power_mean_watts".

# Name of the output statistic -> column holding the matching prediction.
mape_source_columns = {
    "hist_mape": "hist_pred_total_power_mean",
    "LinearRegression_mape": "LinearRegression_total_power_mean_watts",
    "RandomForestRegressor_mape": "RandomForestRegressor_total_power_mean_watts",
    "LinearSVR_mape": "LinearSVR_total_power_mean_watts",
    "SGDRegressor_mape": "SGDRegressor_total_power_mean_watts",
}

# Distinct user ids, in order of first appearance in the results table.
lst_users = df_all_results["user_id"].drop_duplicates().to_list()

df_results_user_group = df_all_results.groupby("user_id")

lst_stats_per_user = []

for user in lst_users:
    results_user = df_results_user_group.get_group(user)
    actual_power = results_user["total_power_mean_watts"]

    # One record per user: the id followed by one MAPE per method.
    res = {"user_id": user}
    for stat_name, pred_column in mape_source_columns.items():
        res[stat_name] = mean_absolute_percentage_error(actual_power, results_user[pred_column])
    lst_stats_per_user.append(res)

df_stats_per_user = pd.DataFrame(lst_stats_per_user)
df_stats_per_user

##      user_id  hist_mape  LinearRegression_mape  RandomForestRegressor_mape  LinearSVR_mape  SGDRegressor_mape
## 0       1249   0.087213           1.013289e-01                    0.124200        0.104198           0.112710
## 1       1494   0.075278           7.187246e-02                    0.074445        0.083109           0.075146
## 2        634   0.067456           3.187002e-01                    0.209108        0.247893           0.311342
## 3        694   0.270083           6.136092e+11                    0.231271        0.248924           0.270858
## 4       1365   0.223117           2.038785e-01                    0.208285        0.197755           0.194399
## ..       ...        ...                    ...                         ...             ...                ...
## 571     1711   0.158810           1.669925e-01                    0.218634        0.131284           0.213693
## 572      718   0.059795           1.269777e-01                    0.052001        0.088634           0.068901
## 573      550   0.269676           2.704648e-01                    0.259030        0.295231           0.283677
## 574      359   0.119872           2.296592e-01                    0.160075        0.202770           0.214655
## 575      850   0.132682           3.073521e-01                    0.344460        0.349007           0.314373
## 
## [576 rows x 6 columns]

# Summary statistics of the per-user MAPE of each prediction method.
# NOTE(review): in the recorded output the mean and max are around 1e12-1e15
# while all quartiles stay below 1; presumably a few users have (near-)zero
# actual power values, which makes the percentage error explode -- confirm.
COLS = [
    "hist_mape",
    "LinearRegression_mape",
    "RandomForestRegressor_mape",
    "LinearSVR_mape",
    "SGDRegressor_mape",
]
df_stats_per_user.loc[:, COLS].describe()

##           hist_mape  LinearRegression_mape  RandomForestRegressor_mape  LinearSVR_mape  SGDRegressor_mape
## count  5.760000e+02           5.760000e+02                5.760000e+02    5.760000e+02       5.760000e+02
## mean   6.297430e+12           6.681627e+12                6.684230e+12    6.261852e+12       6.478381e+12
## std    7.301566e+13           7.503549e+13                7.817233e+13    7.203878e+13       7.572169e+13
## min    4.004490e-03           3.718275e-03                1.193619e-02    1.126697e-02       2.253075e-03
## 25%    7.110493e-02           7.679108e-02                7.374854e-02    7.741257e-02       7.314237e-02
## 50%    1.151113e-01           1.286967e-01                1.176980e-01    1.275947e-01       1.217893e-01
## 75%    1.800437e-01           2.326375e-01                1.857662e-01    2.062293e-01       2.028217e-01
## max    1.310127e+15           1.323483e+15                1.424301e+15    1.232220e+15       1.364989e+15

COLS = [
    "hist_mape",
    "LinearRegression_mape",
    "RandomForestRegressor_mape",
    "LinearSVR_mape",
    "SGDRegressor_mape",
]
# Reshape to long format: one row per (user_id, method) pair, with the method
# name in "variable" and its MAPE in "value".
df_stats_per_user_pivot = df_stats_per_user.melt(id_vars="user_id")
df_stats_per_user_pivot

##       user_id           variable     value
## 0        1249          hist_mape  0.087213
## 1        1494          hist_mape  0.075278
## 2         634          hist_mape  0.067456
## 3         694          hist_mape  0.270083
## 4        1365          hist_mape  0.223117
## ...       ...                ...       ...
## 2875     1711  SGDRegressor_mape  0.213693
## 2876      718  SGDRegressor_mape  0.068901
## 2877      550  SGDRegressor_mape  0.283677
## 2878      359  SGDRegressor_mape  0.214655
## 2879      850  SGDRegressor_mape  0.314373
## 
## [2880 rows x 3 columns]

Figure 3 (a)

import matplotlib.pyplot as plt

TINY_SIZE = 2
SMALL_SIZE = 5
MEDIUM_SIZE = 20
BIGGER_SIZE = 50
FIG_WIDTH = 40
FIG_HEIGHT = 10

# Default text size.
plt.rc('font', size=20)
# Apply the same size to titles, axis labels, tick labels and the legend.
for rc_group, rc_option in (
    ('axes', 'titlesize'),     # axes title
    ('axes', 'labelsize'),     # x and y labels
    ('xtick', 'labelsize'),    # x tick labels
    ('ytick', 'labelsize'),    # y tick labels
    ('legend', 'fontsize'),    # legend
    ('figure', 'titlesize'),   # figure title
):
    plt.rc(rc_group, **{rc_option: MEDIUM_SIZE})
plt.rc('figure', figsize=(8,4))

# Horizontal boxplot: one box per prediction method, outliers hidden.
g = sns.boxplot(data=df_stats_per_user_pivot, x="value", y="variable", showfliers=False)

# Replace the raw "<method>_mape" category names with readable labels; the
# label order has to follow the order of the categories on the axis.
method_labels = ["History", "LinearRegression", "RandomForest", "LinearSVR", "SGDRegressor"]
plt.yticks(ticks=list(range(len(method_labels))), labels=method_labels, rotation=0)

## ([<matplotlib.axis.YTick object at 0x7fffa0beef90>, <matplotlib.axis.YTick object at 0x7fff9e197410>, <matplotlib.axis.YTick object at 0x7fff9d1cafd0>, <matplotlib.axis.YTick object at 0x7fff9d216bd0>, <matplotlib.axis.YTick object at 0x7fff9dc8ffd0>], [Text(0, 0, 'History'), Text(0, 1, 'LinearRegression'), Text(0, 2, 'RandomForest'), Text(0, 3, 'LinearSVR'), Text(0, 4, 'SGDRegressor')])


# Axis titles of the boxplot.
g.set_ylabel("Prediction Method")
g.set_xlabel("Mean Absolute Percentage Error (MAPE)     ")
# Remove the padding around the axes before exporting.
plt.tight_layout(pad=0)
# Export the figure in both vector formats.
for figure_path in ("./fig3a-pred-mape-mean-power.svg",
                    "./fig3a-pred-mape-mean-power.pdf"):
    plt.savefig(figure_path)

Processing the max power prediction results

This section processes the outputs of the script run_prediction_per_user_allmethods_max.py.

import pandas as pd
import seaborn as sns

import os

# Directory holding the result files for the max-power predictions.
RESULTS_PATH = "../user-power-predictions/data/total_power_mean_predictions_users_allmethods_max/"

# Columns holding the prediction of each method; a row is kept only when all
# of them are present.
PRED_COLS = ["hist_pred_total_power_max",
            "LinearRegression_total_power_max_watts",
            "RandomForestRegressor_total_power_max_watts",
            "LinearSVR_total_power_max_watts",
            "SGDRegressor_total_power_max_watts"]


# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the concatenated table (and of everything derived from it) reproducible.
result_filenames = sorted(os.listdir(RESULTS_PATH))

# Read every result file and stack them into a single table.
df_all_results = pd.concat(
    [pd.read_csv(os.path.join(RESULTS_PATH, filename), low_memory=False)
     for filename in result_filenames]
)

# Drop rows where at least one of the prediction columns is missing.
df_all_results = df_all_results.dropna(subset=PRED_COLS)
# Persist the filtered table; it is read back further down to plot the
# distribution of total_power_max_watts.
df_all_results.to_csv('/tmp/allresults-max.csv', index=False)


from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# Compute, for every user, the MAPE of each prediction method against the
# actual value stored in "total_power_max_watts".

# Name of the output statistic -> column holding the matching prediction.
mape_source_columns = {
    "hist_mape": "hist_pred_total_power_max",
    "LinearRegression_mape": "LinearRegression_total_power_max_watts",
    "RandomForestRegressor_mape": "RandomForestRegressor_total_power_max_watts",
    "LinearSVR_mape": "LinearSVR_total_power_max_watts",
    "SGDRegressor_mape": "SGDRegressor_total_power_max_watts",
}

# Distinct user ids, in order of first appearance in the results table.
lst_users = df_all_results["user_id"].drop_duplicates().to_list()

df_results_user_group = df_all_results.groupby("user_id")

lst_stats_per_user = []

for user in lst_users:
    results_user = df_results_user_group.get_group(user)
    actual_power = results_user["total_power_max_watts"]

    # One record per user: the id followed by one MAPE per method.
    res = {"user_id": user}
    for stat_name, pred_column in mape_source_columns.items():
        res[stat_name] = mean_absolute_percentage_error(actual_power, results_user[pred_column])
    lst_stats_per_user.append(res)

# One row per user, one MAPE column per method (not displayed here).
df_stats_per_user = pd.DataFrame(lst_stats_per_user)

# Summary statistics of the per-user MAPE of each prediction method.
# NOTE(review): in the recorded output the mean and max are around 1e12-1e15
# while all quartiles stay below 1; presumably a few users have (near-)zero
# actual power values, which makes the percentage error explode -- confirm.
COLS = [
    "hist_mape",
    "LinearRegression_mape",
    "RandomForestRegressor_mape",
    "LinearSVR_mape",
    "SGDRegressor_mape",
]
df_stats_per_user.loc[:, COLS].describe()

##           hist_mape  LinearRegression_mape  RandomForestRegressor_mape  LinearSVR_mape  SGDRegressor_mape
## count  5.760000e+02           5.760000e+02                5.760000e+02    5.760000e+02       5.760000e+02
## mean   7.703733e+12           7.850490e+12                7.442731e+12    7.222109e+12       7.435384e+12
## std    8.970551e+13           8.518249e+13                8.367890e+13    8.352948e+13       8.543374e+13
## min    3.036269e-03           4.983389e-03                1.162791e-03    1.479376e-02       1.651753e-04
## 25%    1.040980e-01           1.174098e-01                1.088291e-01    1.136779e-01       1.144742e-01
## 50%    1.959539e-01           2.155061e-01                1.940571e-01    1.956479e-01       2.051810e-01
## 75%    3.224582e-01           3.794099e-01                3.158385e-01    3.447812e-01       3.413483e-01
## max    1.533300e+15           1.539987e+15                1.431193e+15    1.489713e+15       1.556222e+15

COLS = [
    "hist_mape",
    "LinearRegression_mape",
    "RandomForestRegressor_mape",
    "LinearSVR_mape",
    "SGDRegressor_mape",
]
# Reshape to long format: one row per (user_id, method) pair, with the method
# name in "variable" and its MAPE in "value".
df_stats_per_user_pivot = df_stats_per_user.melt(id_vars="user_id")
df_stats_per_user_pivot

##       user_id           variable     value
## 0        1249          hist_mape  0.066269
## 1        1494          hist_mape  0.085390
## 2         634          hist_mape  0.141988
## 3         694          hist_mape  0.326237
## 4        1365          hist_mape  0.487555
## ...       ...                ...       ...
## 2875     1711  SGDRegressor_mape  0.208962
## 2876      718  SGDRegressor_mape  0.538712
## 2877      550  SGDRegressor_mape  0.530643
## 2878      359  SGDRegressor_mape  0.442287
## 2879      850  SGDRegressor_mape  0.409482
## 
## [2880 rows x 3 columns]

Figure 3 (b)

import matplotlib.pyplot as plt

TINY_SIZE = 2
SMALL_SIZE = 5
MEDIUM_SIZE = 20
BIGGER_SIZE = 50
FIG_WIDTH = 40
FIG_HEIGHT = 10


plt.rc('font', size=20)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)     # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)    # legend fontsize
plt.rc('figure', titlesize=MEDIUM_SIZE)  # fontsize of the figure title
plt.rc('figure', figsize=(8,4))

# Start from a fresh figure: sns.boxplot draws on the current Axes, so without
# this the boxes would be added to whatever figure is still open (for example
# the Figure 3 (a) plot when both sections run in the same Python session).
fig = plt.figure()

# Horizontal boxplot: one box per prediction method, outliers hidden.
# The category order is pinned explicitly because the tick labels below are
# assigned by position.
g = sns.boxplot(y="variable", x="value", data=df_stats_per_user_pivot, showfliers=False,
                order=["hist_mape",
                       "LinearRegression_mape",
                       "RandomForestRegressor_mape",
                       "LinearSVR_mape",
                       "SGDRegressor_mape"])
# Replace the raw "<method>_mape" category names with readable labels.
plt.yticks(ticks=[0,1,2,3,4],labels=["History", "LinearRegression", "RandomForest", "LinearSVR", "SGDRegressor"],rotation=0)

## ([<matplotlib.axis.YTick object at 0x7fff9d9d4e50>, <matplotlib.axis.YTick object at 0x7fff9f6a1f50>, <matplotlib.axis.YTick object at 0x7fff9d550910>, <matplotlib.axis.YTick object at 0x7fffa1772e10>, <matplotlib.axis.YTick object at 0x7fffa187e6d0>], [Text(0, 0, 'History'), Text(0, 1, 'LinearRegression'), Text(0, 2, 'RandomForest'), Text(0, 3, 'LinearSVR'), Text(0, 4, 'SGDRegressor')])

# Axis titles of the boxplot.
g.set_ylabel("Prediction Method")
g.set_xlabel("Mean Absolute Percentage Error (MAPE)")
# Remove the padding around the axes before exporting.
plt.tight_layout(pad=0)
# Export the figure in both vector formats.
for figure_path in ("./fig3b-pred-mape-max-power.svg",
                    "./fig3b-pred-mape-max-power.pdf"):
    plt.savefig(figure_path)

Getting the actual mean and max power distributions

# clear all Python memory
# Empty the namespace of the current module so that every name bound above
# (data frames, plot handles, imported modules, ...) is removed.
import sys
sys.modules[__name__].__dict__.clear()
# gc is imported only now: any binding created before the clear() above has
# just been removed from the namespace.
import gc
# Force a full collection pass; the call returns the number of unreachable
# objects it found.
gc.collect()

## 4907

library(tidyverse)

## ── Attaching core tidyverse packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Load the table written to /tmp/allresults-mean.csv.
data_mean <- read_csv(file = '/tmp/allresults-mean.csv')

## Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 523204 Columns: 111
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): cpus_alloc_layout, job_state, nodes, resv_name, time_limit_str, t...
## dbl  (34): index, Unnamed: 0, array_job_id, array_task_id, end_time, group_i...
## lgl  (69): accrue_time, alloc_node, alloc_sid, array_max_tasks, array_task_s...
## dttm  (1): submission_datetime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Histogram of the total_power_mean_watts column of data_mean.
# bins is set explicitly to the value ggplot2 otherwise uses by default (30):
# the figure is unchanged, but the choice is visible in the code and the
# "Pick better value with `binwidth`" message is no longer emitted.
data_mean %>% ggplot(aes(x=total_power_mean_watts)) +
  geom_histogram(bins = 30) +
  scale_y_continuous(labels = scales::label_number()) +
  theme_bw(base_size=20) +
  labs(
    x='Total power (W)',
    y='Number of jobs'
  )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Save the most recently displayed plot as PDF.
ggsave(filename = './fig2a-distrib-job-power-mean.pdf', plot = last_plot(), width = 6, height = 3)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Save the most recently displayed plot as SVG.
ggsave(filename = './fig2a-distrib-job-power-mean.svg', plot = last_plot(), width = 6, height = 3)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Remove the table from the workspace now that it is no longer used here.
rm(list = 'data_mean')

# Load the table written to /tmp/allresults-max.csv.
data_max <- read_csv(file = '/tmp/allresults-max.csv')

## Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 523204 Columns: 111
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): cpus_alloc_layout, job_state, nodes, resv_name, time_limit_str, t...
## dbl  (34): index, Unnamed: 0, array_job_id, array_task_id, end_time, group_i...
## lgl  (69): accrue_time, alloc_node, alloc_sid, array_max_tasks, array_task_s...
## dttm  (1): submission_datetime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Histogram of the total_power_max_watts column of data_max.
# bins is set explicitly to the value ggplot2 otherwise uses by default (30):
# the figure is unchanged, but the choice is visible in the code and the
# "Pick better value with `binwidth`" message is no longer emitted.
data_max %>% ggplot(aes(x=total_power_max_watts)) +
  geom_histogram(bins = 30) +
  scale_y_continuous(labels = scales::label_number()) +
  theme_bw(base_size=20) +
  labs(
    x='Total power (W)',
    y='Number of jobs'
  )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Save the most recently displayed plot as PDF.
ggsave(filename = './fig2b-distrib-job-power-max.pdf', plot = last_plot(), width = 6, height = 3)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Save the most recently displayed plot as SVG.
ggsave(filename = './fig2b-distrib-job-power-max.svg', plot = last_plot(), width = 6, height = 3)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Remove the table from the workspace now that it is no longer used here.
rm(list = 'data_max')