
# Load required libraries
library(randomForest)
library(xgboost)
library(caret)
library(tidyverse)

# NOTE(review): the original script called rm(list = ls()) here. It was removed
# because clearing the caller's workspace is a destructive side effect; run the
# script in a fresh R session if a clean environment is required.

# Read the raw data set and show its structure.
data <- read_csv("data_with_diveprofiles.csv")
glimpse(data)

# Keep only the columns used for modelling: person descriptors, air
# consumption, dive summary values, the target (bubble_grade) and the two
# ranges of per-depth-interval time columns.
data <- data %>%
  dplyr::select(
    age, gender, bmi,
    BarLiter, max_depth, bubble_grade,
    total_dive_time_min,
    after_max_depth_time_at_intervall_0_10:after_max_depth_time_at_intervall_30_40,
    total_time_at_intervall_0_10:total_time_at_intervall_30_40
  )

# Missing values in the eight depth-interval time columns are replaced by 0.
interval_suffixes <- c("0_10", "10_20", "20_30", "30_40")
interval_cols <- c(
  paste0("after_max_depth_time_at_intervall_", interval_suffixes),
  paste0("total_time_at_intervall_", interval_suffixes)
)
for (interval_col in interval_cols) {
  data[[interval_col]][is.na(data[[interval_col]])] <- 0
}

# Encode gender as a number: the factor level codes (1, 2, ...) are shifted so
# that the first level becomes 0.
data$gender <- as.numeric(as.factor(data$gender)) - 1

# Drop every row that still has a missing value in any remaining column.
data <- na.omit(data)

glimpse(data)


#### ML approaches for the full feature set

# Load necessary packages (nothing is installed here)
library(smotefamily)
library(randomForest)
library(caret)

# Split the rows into roughly 80% training and 20% test data; the seed makes
# the partition reproducible.
set.seed(44)
trainIndex <- createDataPartition(data$bubble_grade, p = .8, list = FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]

# The training frame is used unchanged.
train_numeric <- train

# The test predictors are expanded with model.matrix() (no intercept column)
# and turned back into a data frame; the target is re-attached afterwards.
test_predictors <- test[, names(test) != "bubble_grade"]
test_numeric <- data.frame(model.matrix(~ . - 1, data = test_predictors))
test_numeric$bubble_grade <- test$bubble_grade

# Oversample the training set with SMOTE, passing bubble_grade as the target.
train_features <- train_numeric[, names(train_numeric) != "bubble_grade"]
train_balanced <- SMOTE(
  train_features,
  train_numeric$bubble_grade,
  K = 1,
  dup_size = 2
)

# The oversampled rows are in $data, where the target column is named "class".
# Rename it back to bubble_grade and convert it to numeric.
train_balanced_data <- rename(train_balanced$data, bubble_grade = class)
train_balanced_data$bubble_grade <- as.numeric(train_balanced_data$bubble_grade)

# Fit a random forest on the SMOTE-balanced training data. The target is
# modelled on the log(x + 1) scale.
set.seed(124)
rf_model <- randomForest(
  log(bubble_grade + 1) ~ .,
  data = train_balanced_data,
  ntree = 1000
)

# Predictions on the test set; these are still on the log(x + 1) scale.
predictions <- predict(rf_model, test_numeric)

# Invert log(x + 1) with exp(x) - 1 so predictions are on the bubble-grade
# scale (the original used exp() alone, which shifts every value by +1).
rf_predicted_grade <- exp(predictions) - 1

plot(
  test_numeric$bubble_grade, rf_predicted_grade,
  xlab = "Observed bubble grade", ylab = "Predicted bubble grade"
)

# Printed explicitly so the value also appears when the script is source()d.
print(cor(test_numeric$bubble_grade, rf_predicted_grade, method = "spearman"))

library(xgboost)

# Build the XGBoost input matrices: all columns except bubble_grade as
# features, log(bubble_grade + 1) as the label.
dtrain <- xgb.DMatrix(
  data = as.matrix(
    train_balanced_data[, names(train_balanced_data) != "bubble_grade"]
  ),
  label = log(train_balanced_data$bubble_grade + 1)
)
dtest <- xgb.DMatrix(
  data = as.matrix(test_numeric[, names(test_numeric) != "bubble_grade"]),
  label = log(test_numeric$bubble_grade + 1)
)

# Regression with squared-error loss, evaluated by RMSE.
params <- list(
  objective = "reg:squarederror",
  eval_metric = "rmse"
)

# Train the booster.
set.seed(124)
xgb_model <- xgb.train(params = params, data = dtrain, nrounds = 100)

# Predictions on the test set; these are on the log(x + 1) scale of the label.
xgb_predictions <- predict(xgb_model, dtest)

# Invert log(x + 1) with exp(x) - 1 before comparing with observed grades.
xgb_predicted_grade <- exp(xgb_predictions) - 1

# RMSE on the bubble-grade scale. The original subtracted the log-scale
# predictions from the raw bubble grades, mixing two different scales.
rmse <- sqrt(mean((xgb_predicted_grade - test_numeric$bubble_grade)^2))
print(paste("RMSE:", rmse))

# Observed versus predicted bubble grade, both on the original scale.
results <- data.frame(
  actual = test_numeric$bubble_grade,
  predicted = xgb_predicted_grade
)
plot(results)
print(cor(results, method = "spearman"))



# Load required libraries
library(randomForest)
library(caret)
library(dplyr)

# Feature set: person descriptors only (plus the target column).
person_cols <- c("age", "gender", "bmi", "bubble_grade")
train_person <- train_balanced_data[, person_cols]
test_person <- test_numeric[, person_cols]

# Random forest on the log(x + 1)-transformed target.
set.seed(124)
rf_person <- randomForest(
  log(bubble_grade + 1) ~ .,
  data = train_person,
  ntree = 1000,
  nodesize = 5
)

# Test-set predictions (log scale).
pred_person <- predict(rf_person, test_person)

# Spearman correlation over all test rows. exp() is applied without the "- 1"
# of the exact inverse; being monotone, it does not alter a rank correlation.
spearman_all_person <- cor(
  test_person$bubble_grade,
  exp(pred_person),
  method = "spearman"
)
print(spearman_all_person)

# Same correlation restricted to test rows with a non-zero bubble grade.
test_person_non_zero <- dplyr::filter(test_person, bubble_grade != 0)
pred_person_non_zero <- predict(rf_person, test_person_non_zero)
spearman_non_zero_person <- cor(
  test_person_non_zero$bubble_grade,
  exp(pred_person_non_zero),
  method = "spearman"
)
print(spearman_non_zero_person)


# Feature set: minimal dive data (maximum depth and total dive time) plus the
# target column.
minimal_cols <- c("max_depth", "total_dive_time_min", "bubble_grade")
train_minimal <- train_balanced_data[, minimal_cols]
test_minimal <- test_numeric[, minimal_cols]

# Random forest on the log(x + 1)-transformed target.
set.seed(124)
rf_minimal <- randomForest(
  log(bubble_grade + 1) ~ .,
  data = train_minimal,
  ntree = 1000,
  nodesize = 5
)

# Test-set predictions (log scale).
pred_minimal <- predict(rf_minimal, test_minimal)

# Spearman correlation over all test rows. exp() is applied without the "- 1"
# of the exact inverse; being monotone, it does not alter a rank correlation.
spearman_all_minimal <- cor(
  test_minimal$bubble_grade,
  exp(pred_minimal),
  method = "spearman"
)
print(spearman_all_minimal)

# Same correlation restricted to test rows with a non-zero bubble grade.
test_minimal_non_zero <- dplyr::filter(test_minimal, bubble_grade != 0)
pred_minimal_non_zero <- predict(rf_minimal, test_minimal_non_zero)
spearman_non_zero_minimal <- cor(
  test_minimal_non_zero$bubble_grade,
  exp(pred_minimal_non_zero),
  method = "spearman"
)
print(spearman_non_zero_minimal)

# Feature set: person descriptors combined with the minimal dive data (plus
# the target column).
person_dive_cols <- c(
  "age", "gender", "bmi",
  "max_depth", "total_dive_time_min",
  "bubble_grade"
)
train_person_dive <- train_balanced_data[, person_dive_cols]
test_person_dive <- test_numeric[, person_dive_cols]

# Random forest on the log(x + 1)-transformed target; mtry is the floored
# square root of the five predictors.
set.seed(124)
rf_person_dive <- randomForest(
  log(bubble_grade + 1) ~ .,
  data = train_person_dive,
  ntree = 1000,
  mtry = floor(sqrt(5)),
  nodesize = 5
)

# Test-set predictions (log scale).
pred_person_dive <- predict(rf_person_dive, test_person_dive)

# Spearman correlation over all test rows. exp() is applied without the "- 1"
# of the exact inverse; being monotone, it does not alter a rank correlation.
spearman_all_person_dive <- cor(
  test_person_dive$bubble_grade,
  exp(pred_person_dive),
  method = "spearman"
)
print(spearman_all_person_dive)

# Same correlation restricted to test rows with a non-zero bubble grade.
test_person_dive_non_zero <- dplyr::filter(test_person_dive, bubble_grade != 0)
pred_person_dive_non_zero <- predict(rf_person_dive, test_person_dive_non_zero)
spearman_non_zero_person_dive <- cor(
  test_person_dive_non_zero$bubble_grade,
  exp(pred_person_dive_non_zero),
  method = "spearman"
)
print(spearman_non_zero_person_dive)

# Feature set: person descriptors, minimal dive data and air consumption
# (BarLiter), plus the target column.
person_dive_air_cols <- c(
  "age", "gender", "bmi",
  "max_depth", "total_dive_time_min", "BarLiter",
  "bubble_grade"
)
train_person_dive_air <- train_balanced_data[, person_dive_air_cols]
test_person_dive_air <- test_numeric[, person_dive_air_cols]

# Random forest on the log(x + 1)-transformed target; mtry is the floored
# square root of the six predictors.
set.seed(124)
rf_person_dive_air <- randomForest(
  log(bubble_grade + 1) ~ .,
  data = train_person_dive_air,
  ntree = 1000,
  mtry = floor(sqrt(6)),
  nodesize = 5
)

# Test-set predictions (log scale).
pred_person_dive_air <- predict(rf_person_dive_air, test_person_dive_air)

# Spearman correlation over all test rows. exp() is applied without the "- 1"
# of the exact inverse; being monotone, it does not alter a rank correlation.
spearman_all_person_dive_air <- cor(
  test_person_dive_air$bubble_grade,
  exp(pred_person_dive_air),
  method = "spearman"
)
print(spearman_all_person_dive_air)

# Same correlation restricted to test rows with a non-zero bubble grade.
test_person_dive_air_non_zero <- dplyr::filter(
  test_person_dive_air,
  bubble_grade != 0
)
pred_person_dive_air_non_zero <- predict(
  rf_person_dive_air,
  test_person_dive_air_non_zero
)
spearman_non_zero_person_dive_air <- cor(
  test_person_dive_air_non_zero$bubble_grade,
  exp(pred_person_dive_air_non_zero),
  method = "spearman"
)
print(spearman_non_zero_person_dive_air)

# Feature set: dive computer data (maximum depth, total dive time and the
# eight per-depth-interval time columns), plus the target column.
dive_computer_cols <- c(
  "max_depth", "total_dive_time_min",
  "after_max_depth_time_at_intervall_0_10",
  "after_max_depth_time_at_intervall_10_20",
  "after_max_depth_time_at_intervall_20_30",
  "after_max_depth_time_at_intervall_30_40",
  "total_time_at_intervall_0_10",
  "total_time_at_intervall_10_20",
  "total_time_at_intervall_20_30",
  "total_time_at_intervall_30_40",
  "bubble_grade"
)
train_dive_computer <- train_balanced_data[, dive_computer_cols]
test_dive_computer <- test_numeric[, dive_computer_cols]

# mtry follows the floor(sqrt(number of predictors)) rule used by the other
# models in this script. The original hard-coded sqrt(6), copied from the
# six-predictor model, although this feature set has ten predictors.
n_dive_computer_predictors <- length(dive_computer_cols) - 1

# Random forest on the log(x + 1)-transformed target.
set.seed(124)
rf_dive_computer <- randomForest(
  log(bubble_grade + 1) ~ .,
  data = train_dive_computer,
  ntree = 1000,
  mtry = floor(sqrt(n_dive_computer_predictors)),
  nodesize = 5
)

# Test-set predictions (log scale).
pred_dive_computer <- predict(rf_dive_computer, test_dive_computer)

# Spearman correlation over all test rows. exp() is applied without the "- 1"
# of the exact inverse; being monotone, it does not alter a rank correlation.
spearman_all_dive_computer <- cor(
  test_dive_computer$bubble_grade,
  exp(pred_dive_computer),
  method = "spearman"
)
print(spearman_all_dive_computer)

# Same correlation restricted to test rows with a non-zero bubble grade.
test_dive_computer_non_zero <- test_dive_computer %>%
  filter(bubble_grade != 0)
spearman_non_zero_dive_computer <- cor(
  test_dive_computer_non_zero$bubble_grade,
  exp(predict(rf_dive_computer, test_dive_computer_non_zero)),
  method = "spearman"
)
print(spearman_non_zero_dive_computer)

# Feature set: every available predictor (person descriptors, dive summary,
# air consumption and the eight per-depth-interval time columns), plus the
# target column.
all_feature_cols <- c(
  "age", "gender", "bmi", "max_depth", "total_dive_time_min", "BarLiter",
  "after_max_depth_time_at_intervall_0_10",
  "after_max_depth_time_at_intervall_10_20",
  "after_max_depth_time_at_intervall_20_30",
  "after_max_depth_time_at_intervall_30_40",
  "total_time_at_intervall_0_10",
  "total_time_at_intervall_10_20",
  "total_time_at_intervall_20_30",
  "total_time_at_intervall_30_40",
  "bubble_grade"
)
train_all <- train_balanced_data[, all_feature_cols]
test_all <- test_numeric[, all_feature_cols]

# mtry is derived from the actual number of predictors (14) instead of the
# stale hard-coded sqrt(10). Both floor to 3, so the fitted model is
# unchanged, but the value now stays correct if columns are added or removed.
n_all_predictors <- length(all_feature_cols) - 1

# Random forest on the log(x + 1)-transformed target.
set.seed(124)
rf_all <- randomForest(
  log(bubble_grade + 1) ~ .,
  data = train_all,
  ntree = 1000,
  mtry = floor(sqrt(n_all_predictors)),
  nodesize = 5
)

# Test-set predictions (log scale).
pred_all <- predict(rf_all, test_all)

# Spearman correlation over all test rows. exp() is applied without the "- 1"
# of the exact inverse; being monotone, it does not alter a rank correlation.
spearman_all_all <- cor(
  test_all$bubble_grade,
  exp(pred_all),
  method = "spearman"
)
print(spearman_all_all)

# Same correlation restricted to test rows with a non-zero bubble grade.
test_all_non_zero <- test_all %>%
  filter(bubble_grade != 0)
spearman_non_zero_all <- cor(
  test_all_non_zero$bubble_grade,
  exp(predict(rf_all, test_all_non_zero)),
  method = "spearman"
)
print(spearman_non_zero_all)


# Collect the Spearman correlations of the six random forest models into one
# table: one row per feature set, one column per evaluation subset.
feature_set_labels <- c(
  "Person specific",
  "Minimal dive data",
  "Person + dive data",
  "Person + dive + air",
  "Dive computer data",
  "All features"
)

# Correlations over all test rows.
spearman_all_values <- c(
  spearman_all_person,
  spearman_all_minimal,
  spearman_all_person_dive,
  spearman_all_person_dive_air,
  spearman_all_dive_computer,
  spearman_all_all
)

# Correlations over test rows with a non-zero bubble grade.
spearman_excl_zero_values <- c(
  spearman_non_zero_person,
  spearman_non_zero_minimal,
  spearman_non_zero_person_dive,
  spearman_non_zero_person_dive_air,
  spearman_non_zero_dive_computer,
  spearman_non_zero_all
)

results_table <- data.frame(
  Feature_Set = feature_set_labels,
  Spearman_All = spearman_all_values,
  Spearman_Excl_Zero = spearman_excl_zero_values
)

# Show the summary.
print(results_table)



