
#library(ggplot2)
library(caret)
library(data.table)
library(cluster)


# Read the non-clustered catchment table from disk (file is read as UTF-8).
catchments <- fread(
  "catchments_non_clustered.csv",
  encoding = "UTF-8"
)

# Collapse the table to one row per Gauge: every column after the first
# seven is replaced by its mean within the gauge, ignoring missing values.
# NOTE(review): the first seven columns are presumably identifiers/metadata
# that should not be averaged -- confirm against the CSV layout.
catchments_mean <- catchments[
  ,
  lapply(.SD, mean, na.rm = TRUE),
  by = "Gauge",
  .SDcols = names(catchments)[-seq_len(7L)]
]

# Columns carried into the cluster analysis: the Gauge identifier first,
# followed by the twelve descriptor columns used as clustering inputs.
vars <- c(
  "Gauge",
  "A", "E", "S",
  "Agr_8", "Glc", "Alp", "Nat", "Grl", "Lakes",
  "l_p", "l_a",
  "q"
)

# Keep only those columns, in the order listed above (the `..` prefix tells
# data.table to treat `vars` as a vector of column names).
catchments_mean_sel <- catchments_mean[, ..vars]

#### cluster analysis ####
# Everything except the first column (the Gauge identifier) is an input to
# the pre-processing model.
cluster_features <- catchments_mean_sel[, -1]

# Fit kNN imputation plus centring and scaling on the inputs.
pre <- preProcess(
  cluster_features,
  method = c("knnImpute", "center", "scale")
)

# Apply the fitted transformation and re-attach the identifier column in
# front, overwriting the table with its pre-processed version.
catchments_mean_sel <- cbind(
  catchments_mean_sel[, 1],
  predict(pre, cluster_features)
)

# Principal component analysis on every column except the first one
# (the identifier column is not a numeric input).
pca <- prcomp(catchments_mean_sel[, -1])

# Plot of the variance carried by each principal component.
plot(pca)

# Summarise the fit once: keep the object for later use and show it
# (a bare name at top level is auto-printed, exactly like the former
# stand-alone `summary(pca)` call).
summary_PCA <- summary(pca)
summary_PCA

# Principal-component scores of the rows the PCA was fitted on. prcomp()
# already stores them in `x`, so there is no need to project the same table
# again through predict() -- which also required handing it the non-numeric
# identifier column and relying on name matching to drop it.
test <- pca$x

# Partition the catchments into three groups with PAM (partitioning around
# medoids), using Euclidean distance on the first two columns of the
# principal-component score matrix.
pam_test <- pam(x = test[, 1:2], k = 3, metric = "euclidean")
plot(pam_test)


# One row per gauge together with the cluster number PAM assigned to it.
clusters <- data.table(
  Gauge = catchments_mean_sel[["Gauge"]],
  cluster = pam_test[["clustering"]]
)

# Left join of the cluster labels onto the gauge list; `sort = FALSE` keeps
# the gauge order of the left-hand table instead of sorting by the key.
catchments_clusters <- merge(
  x = catchments_mean_sel[, "Gauge"],
  y = clusters,
  by = "Gauge",
  all.x = TRUE,
  sort = FALSE
)

