# Identification of vocalization types

# Packages
library(car)
library(rgl)
library(EFAtools)
library(dplyr)
library(psych) 


# PCA

# Load data
mydata <- read.csv(file = "/vocal_id_pca.csv", header = TRUE)

# Preliminary selection of transformed variables to minimize outlier influence.
dataset <- mydata[, c("meanfreq", "sd", "freq.median", "freq.Q25", "freq.Q75_sqrt", 
                      "freq.IQR_log", "time.median_log", "time.Q25_sqrt", "time.Q75_log", 
                      "time.IQR_log", "skew_log", "kurt_log", "sp.ent_sq", "time.ent_sq", 
                      "entropy_sq", "sfm", "meandom", "mindom_sqrt", "maxdom_log", 
                      "dfrange_log", "modindx_log", "startdom", "enddom", "dfslope_sqrt", 
                      "meanpeakf", "peakf", "h1_freq_sq", "h1_width_sqrt", "h2_freq", 
                      "h2_width_sqrt", "h3_freq_sq", "h3_width", "harmonics_sq", 
                      "HNR_log", "meanfun", "minfun", "maxfun_log", "num.elms_log", 
                      "elm.duration_log", "song.duration_log", "song.rate_log", 
                      "gap.duration_sqrt")]

# Redundancy reduction: final subset selected after removing uninformative, uncorrelated, and low-KMO variables.
dataset2 <- mydata[, c("freq.median", "meanfreq", "maxdom_log", "meanpeakf", 
                       "h3_freq_sq", "peakf", "elm.duration_log", "time.median_log", 
                       "modindx_log", "entropy_sq", "dfrange_log", "dfslope_sqrt", 
                       "maxfun_log", "minfun", "h1_freq_sq", "gap.duration_sqrt", 
                       "song.rate_log")]

# Multicollinearity assessment: correlation matrix of the selected acoustic parameters.
data_matrix <- cor(dataset2, use = "complete.obs")
data_matrix

# KMO test for sampling adequacy.
EFAtools::KMO(x = data_matrix)

# Bartlett's test of sphericity for data suitability.
EFAtools::BARTLETT(x = data_matrix, N = 1616)

# Initial PCA without rotation (all components retained).
pca1 <- psych::principal(dataset2, nfactors = length(dataset2),rotate = "none")

# Scree plot for component selection.
plot(pca1$values, type = "b")

# Final PCA model (4 components, Varimax rotation).
ultimatepca <- psych::principal(r = dataset2,
                                nfactors = 4,
                                rotate = "Varimax",
                                eps = 1e-7,
                                use = "pairwise.complete.obs")

# Merging PCA scores with original data.
dataset3 <- cbind(mydata, ultimatepca$scores)

# Converting categorical variable to factor.
dataset3$type.2 <- as.factor(dataset3$type.2)

# Representative subsampling: selecting the 60 observations closest to the multivariate median for each acoustic class.
datos_filtered <- dataset3 %>%
  filter(type.2 %in% c("single", "double", "vibrato", "trill", "drumming")) %>%
  group_by(type.2) %>%
  mutate(dist_to_median = (RC1 - median(RC1))^2 + (RC2 - median(RC2))^2 + (RC3 - median(RC3))^2) %>%
  arrange(dist_to_median) %>%
  slice_head(n = 60) %>% 
  ungroup() %>%
  select(-dist_to_median)

# Plot
scatter3d(x = datos_filtered$RC1,
          y = datos_filtered$RC2,
          z = datos_filtered$RC3,
          groups = datos_filtered$type.2,
          grid = F,
          surface = TRUE,
          surface.alpha = 0.3,
          ellipsoid = F,
          ellipsoid.alpha = 0.1,
          surface.col = c("tan3","firebrick","violetred", "springgreen3", "cyan4"),
          fit = "smooth",
          axis.scales = T,
          sphere.size = 1.05 ,
          axis.col = c("black", "black", "black"))






