
1 Data preparation

Before applying fPCAs, the 854 recorded dives must be visualised to check for sensor errors or unexpected data. For obvious reasons, the 854 dive profiles are not all reproduced here; a sample of 4 dives is presented below. These profiles illustrate the variety of dive types in the dataset, with duration, depth, skewness and overall shape differing among dives.
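The plotting code is not reproduced here. A minimal sketch is given below; it assumes the raw dives are stored in the list plong used in the next chunk (time in the first column, depth in the second), and the four dive indices are arbitrary examples.

# Sketch only: display four raw dive profiles (indices are arbitrary examples)
op <- par(mfrow = c(2, 2))
for (k in c(1, 200, 500, 800)) {
  plot(plong[[k]][, 1], plong[[k]][, 2], type = "l",
       xlab = "Time", ylab = "Depth", main = paste("Dive", k))
}
par(op)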

Following Godard et al. (2020, Frontiers in Marine Science), fPCAs are performed on normalised dives. Rescaling depth and duration to a 0 to 1 range makes dive structures comparable. The corresponding code is shown below.

nbcoef=50                                 # number of B-spline basis functions
myb=create.bspline.basis(c(0,1),nbcoef)   # B-spline basis on the normalised time interval [0,1]
penMat=eval.penalty(myb,2)                # roughness penalty matrix (second derivative)
penMat=cbind(penMat,0,0)                  # border the penalty matrix with zeros so that it
penMat=rbind(penMat,0,0)                  # matches the constrained system built below

X.fd=fd(coef=NULL,basisobj=myb)           # functional data object; coefficients are filled in below
max_depth=NULL
duree=NULL
for(k in 1:nplong){
  # cat("k=",k,fill=TRUE)                 # progress printing, suppressed in this report
  tps=plong[[k]][,1]                      # time stamps of dive k
  duree=c(duree,max(tps)-min(tps))        # dive duration
  tps=1/(max(tps)-min(tps))*(tps-min(tps))# rescale time to [0,1]
  prof=plong[[k]][,2]                     # depth (negative below the surface)
  max_depth=c(max_depth,min(prof))        # maximum depth of dive k
  prof=-prof/min(prof)                    # rescale depth so that the deepest point is -1
  
  ### Compute the spline coefficients under the constraints f(0)=0 and f(1)=0
  k1=bsplineS(0,breaks=c(0,myb$params,1),nderiv=0)   # basis values at t=0
  k1=t(k1)
  k1b=bsplineS(1,breaks=c(0,myb$params,1),nderiv=0)  # basis values at t=1
  k1b=t(k1b)
  X=eval.basis(tps,myb)                              # design matrix of dive k
  # bordered least-squares system with two Lagrange multipliers for the end constraints
  Mat=rbind(cbind(t(X)%*%X,k1,k1b),cbind(t(k1),0,0),cbind(t(k1b),0,0))
  ### Solve the penalised constrained system; smoothing parameter KAPPA=1e-06
  achap=solve(Mat+0.000001*penMat)%*%rbind(t(X)%*%prof,0,0)
  X.fd$coefs=cbind(X.fd$coefs,as.matrix(achap[1:nbcoef]))   # keep the spline coefficients
}
### Keep the coefficients (drop the initial placeholder column created by fd(coef=NULL, ...))
X.fd$coefs=X.fd$coefs[,-1]

# plot one normalised dive (dive 68) for visual checking
plot(X.fd[68,])

# png(file="/Volumes/SARA_WORK/WS_data_DDU2019_SonarTag/data/lw19/output_files/figures/3D_dive_plot_all_dives/dives_not_classified_in_clusters/normalized/314a/dive_nb_3.png",width=600, height=600)

Functional PCAs are then performed.

xpca.fd=pca.fd(X.fd,3)

Three dive shapes are found, with each archetypal profile shown below.

The following plot is used to determine the number of components to keep for the classification step; it suggests that five components would be necessary here.
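The code producing this plot is not shown in the report. The sketch below indicates how the variance decomposition can be inspected from the fda output, under the assumption that the figure was built from these quantities.

# Sketch: inspect the fPCA variance decomposition (xpca.fd is computed above)
xpca.fd$varprop            # proportion of variance explained by each retained harmonic
cumsum(xpca.fd$varprop)    # cumulative proportion
plot(xpca.fd$values[1:10], type = "b",
     xlab = "Component", ylab = "Eigenvalue")   # scree plot of the leading eigenvalues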

2 Clustering methods application

To build clusters according to the shape of each dive, several methods are tested and compared. The robustness of the clusters is assessed by checking the consistency of the groups produced by each method.

Three methods were used:

  • Model-based clustering, where classification and densities are estimated by fitting a finite Gaussian mixture model,

  • K-means clustering, which assigns each observation to the cluster whose centroid is closest in Euclidean distance,

  • Hierarchical clustering, which computes pairwise dissimilarities between observations and successively fuses them into clusters along a classification tree (dendrogram).

2.1 Model based clustering

2.1.1 Cluster construction

The matrix of fPCA scores is provided to the Mclust function to perform the clustering. A summary describing the best model is available, giving the size of each cluster and the mixing probabilities (overall probability of belonging to each cluster). The suitability of candidate models is assessed with the Bayesian information criterion (BIC hereafter).

The best-suited model (highest BIC) comprises 5 components; group volume and shape vary, while orientation is shared across groups (VVE model).

The corresponding code and its output are displayed below.

mod1 <- Mclust(xpca.fd[["scores"]])
summary(mod1, parameters = TRUE) 
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 5 components: 
## 
##  log-likelihood   n df      BIC      ICL
##        2148.217 854 37 4046.686 3694.369
## 
## Clustering table:
##   1   2   3   4   5 
##  75 205 133 256 185 
## 
## Mixing probabilities:
##          1          2          3          4          5 
## 0.07898608 0.24553603 0.16480201 0.27891470 0.23176118 
## 
## Means:
##             [,1]         [,2]        [,3]        [,4]        [,5]
## [1,] -0.19172360 -0.002713458 -0.06714429 -0.03480885  0.15785198
## [2,] -0.08748832  0.067796446  0.11319588 -0.08389994 -0.02153118
## [3,] -0.07574470  0.109994251 -0.04912812  0.02487256 -0.08571618
## 
## Variances:
## [,,1]
##               [,1]          [,2]          [,3]
## [1,]  1.005868e-03 -0.0001753370 -4.015381e-05
## [2,] -1.753370e-04  0.0008215416  1.301208e-04
## [3,] -4.015381e-05  0.0001301208  1.233113e-03
## [,,2]
##               [,1]          [,2]         [,3]
## [1,]  7.146416e-03 -3.453914e-05  0.001846948
## [2,] -3.453914e-05  6.098091e-03 -0.001566044
## [3,]  1.846948e-03 -1.566044e-03  0.003644108
## [,,3]
##              [,1]          [,2]          [,3]
## [1,]  0.004820275  0.0046402045 -0.0022437787
## [2,]  0.004640204  0.0115023659 -0.0006062919
## [3,] -0.002243779 -0.0006062919  0.0051292707
## [,,4]
##              [,1]          [,2]          [,3]
## [1,]  0.008670377 -0.0031691751  0.0028940728
## [2,] -0.003169175  0.0033637318 -0.0007543304
## [3,]  0.002894073 -0.0007543304  0.0058555057
## [,,5]
##              [,1]          [,2]         [,3]
## [1,] 1.541464e-02  7.292807e-05  0.003340142
## [2,] 7.292807e-05  1.367803e-02 -0.002906007
## [3,] 3.340142e-03 -2.906007e-03  0.008964650

The classification can be plotted pairwise across the dimensions to illustrate the clustering.

plot(mod1, what ="classification")

The BIC of each model can be plotted as a function of the number of components. The following plot supports the choice of a 5-component mixture.

For complementary information, the BIC summary reports the BIC values of the top three models.

The ICL criterion can also be used to balance the bias of BIC, which targets the density approximation rather than the number of clusters as such. Here, resorting to ICL may not be relevant, since it performs well only when clusters overlap very little (a sketch is nevertheless given after the BIC code below).

BIC <- mclustBIC(xpca.fd[["scores"]])
plot(BIC) # 5-component mixture, VVE model with the top BIC value

summary(BIC) # summary of top 3 models
## Best BIC values:
##             VVE,5      VVE,6      VEV,5
## BIC      4046.686 4031.03696 4013.66700
## BIC diff    0.000  -15.64927  -33.01922
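As mentioned above, ICL values can be computed and displayed in the same way as the BIC. The following sketch uses the mclustICL function from the mclust package; it is shown only as a complement and was not used for the final analysis.

# Sketch: ICL-based model selection, for comparison with the BIC above
ICL <- mclustICL(xpca.fd[["scores"]])
summary(ICL)   # top models according to ICL
plot(ICL)      # ICL as a function of the number of mixture components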

2.1.2 Exploration of the resulting clusters

Scrucca (2010, Statistics and Computing) suggests reducing the dimension so as to project the clusters onto a suitable subspace, in order to visualise the clustering structure and its geometric characteristics. The reduction is performed with the MclustDR function. The estimated directions are linear combinations of the original features, ordered by importance; the ordering relies on the associated eigenvalues.

The summary function displays information on the estimated directions. The eigenvalue of each direction is then plotted, based on the classification obtained with the Mclust function.

drmod <- MclustDR(mod1, lambda = 1) # reduction of dimensions
summary(drmod)
## ----------------------------------------------------------------- 
## Dimension reduction for model-based clustering and classification 
## ----------------------------------------------------------------- 
## 
## Mixture model type: Mclust (VVE, 5) 
##         
## Clusters   n
##        1  75
##        2 205
##        3 133
##        4 256
##        5 185
## 
## Estimated basis vectors: 
##          Dir1     Dir2     Dir3
## [1,] -0.55279 -0.58669  0.34244
## [2,]  0.31787 -0.74461 -0.62080
## [3,]  0.77032 -0.31836  0.70523
## 
##                 Dir1     Dir2      Dir3
## Eigenvalues  0.88063  0.46734   0.21636
## Cum. %      56.29455 86.16902 100.00000
plot(drmod, what = "evalues") 

Several exploratory plots can be obtained. They are different representations of the same information. Each color represents a cluster.

Pairwise representation of the classification across the estimated directions:

plot(drmod, what = "pairs") 

Cluster scatterplot with contour depending on distance to centroid, for dimensions 1 and 2:

plot(drmod, what = "contour") 

Cluster limits according to the model, for dimensions 1 and 2:

plot(drmod, what = "classification") 

Uncertainty zones (grey) according to the model, where adjacent points belong to different clusters, for dimensions 1 and 2:

plot(drmod, what = "boundaries") 

The distribution of observations along each direction can be plotted to compare cluster positions in the estimated subspace (three directions here).

plot(drmod, dimens = 1, what = "density")

plot(drmod, dimens = 2, what = "density")

plot(drmod, dimens = 3, what = "density")

2.1.3 Result extraction

The resulting clusters can be extracted from the model. The probability of each data point belonging to its assigned cluster is needed to assess the robustness of the cluster predictions.

In the following table, the first three columns give the position of each data point along the dimension axes. The last two columns (in bold) give the cluster number to which the given dive is assigned and the uncertainty of this assignment (one minus the probability of belonging to that cluster). Only the first 6 rows are presented.

cluster = mod1$classification
uncertainty = mod1$uncertainty
clust_results = data.frame(xpca.fd[["scores"]],cluster)
results = data.frame(clust_results,uncertainty)
kable(head(results)) %>%
  kable_styling("striped") %>%
  column_spec(4:5, bold = TRUE) %>%
  add_header_above(c("Dimension axis" = 3, "Clustering results" = 2))
                 Dimension axis              Clustering results
         X1          X2          X3        cluster    uncertainty
 -0.1724743  -0.1123871  -0.0563770           1        0.0851807
 -0.0589972   0.1269479   0.0889921           2        0.0727001
 -0.1879304  -0.0415901  -0.0348726           1        0.4344803
  0.1187469  -0.0105761   0.1141133           2        0.1234015
  0.0646416   0.0971359   0.1943154           2        0.0004626
  0.0869299   0.0913429   0.2200825           2        0.0004977

Several thresholds can be tested: only dives with an uncertainty below a given threshold are considered as belonging to their assigned cluster. For each tested threshold, the resulting number of classified dives is compared.

The maximum uncertainty of a dive with respect to the cluster it was assigned to is 0.6490577.

The different thresholds tested are:

  • Uncertainty > 0.6490577, called a1 in the following code chunk,
  • Uncertainty > 0.5,
  • Uncertainty > 0.4,
  • Uncertainty > 0.3,
  • Uncertainty > 0.2,
  • Uncertainty > 0.1,
  • Uncertainty > 0.05,
  • Uncertainty > 0.04,
  • Uncertainty > 0.03,
  • Uncertainty > 0.02,
  • Uncertainty > 0.01.
a1 = max(results$uncertainty) # maximum observed uncertainty (0.6490577), used as the loosest threshold
results_thr1 = results[!(results$uncertainty>a1),]
results_thr2 = results[!(results$uncertainty>0.5),]
results_thr3 = results[!(results$uncertainty>0.4),]
results_thr4 = results[!(results$uncertainty>0.3),]
results_thr5 = results[!(results$uncertainty>0.2),]
results_thr6 = results[!(results$uncertainty>0.1),]
results_thr7 = results[!(results$uncertainty>0.05),]
results_thr8 = results[!(results$uncertainty>0.04),]
results_thr9 = results[!(results$uncertainty>0.03),]
results_thr10 = results[!(results$uncertainty>0.02),]
results_thr11 = results[!(results$uncertainty>0.01),]

thr_summary <- data.frame(var1=c(a1,0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.04,0.03, 0.02, 0.01),
                 var2=c(nrow(results_thr1),nrow(results_thr2),nrow(results_thr3),
                        nrow(results_thr4),nrow(results_thr5),nrow(results_thr6),
                        nrow(results_thr7),nrow(results_thr8),nrow(results_thr9),nrow(results_thr10),nrow(results_thr11)))

a2 <- c(a1,0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.04, 0.03, 0.02, 0.01)
clust1 <- numeric()
clust2 <- numeric()
clust3 <- numeric()
clust4 <- numeric()
clust5 <- numeric()
for(j in 1:11){ 
  k = a2[c(j)] 
  x1 = which(results$cluster == 1 & results$uncertainty<k)
  x2 = which(results$cluster == 2 & results$uncertainty<k)
  x3 = which(results$cluster == 3 & results$uncertainty<k)
  x4 = which(results$cluster == 4 & results$uncertainty<k)
  x5 = which(results$cluster == 5 & results$uncertainty<k)
clust1=rbind(clust1, length(x1))
clust2=rbind(clust2, length(x2))
clust3=rbind(clust3, length(x3))
clust4=rbind(clust4, length(x4))
clust5=rbind(clust5, length(x5))
}

thr_summary <- cbind(thr_summary,clust1,clust2,clust3,clust4,clust5)
names(thr_summary) <- c("Threshold value","Total number of dives", "Cluster 1",
                         "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5")

kable(thr_summary,align = 'c') %>%
  kable_styling("striped") %>%
  add_header_above(c(" " = 2, "Number of dives per cluster" = 5))
                                                Number of dives per cluster
Threshold value   Total number of dives   Cluster 1   Cluster 2   Cluster 3   Cluster 4   Cluster 5
      0.6490577                     854          75         205         133         255         185
      0.5000000                     832          73         200         130         247         182
      0.4000000                     741          67         181         109         214         170
      0.3000000                     656          62         168          84         177         165
      0.2000000                     545          52         146          59         138         150
      0.1000000                     411          38         114          37          84         138
      0.0500000                     294          17          93          23          37         124
      0.0400000                     258          14          86          15          26         117
      0.0300000                     210           4          79           7          10         110
      0.0200000                     171           0          66           3           0         102
      0.0100000                     143           0          50           0           0          93

2.2 K means clustering

2.2.1 Clustering construction

To perform K-means clustering, the data matrix must be normalised; the data frame is therefore centred and scaled.

df <- scale(xpca.fd[["scores"]]) 
head(df)
##            [,1]        [,2]       [,3]
## [1,] -1.2721708 -0.96809969 -0.5303446
## [2,] -0.4351637  1.09352552  0.8371591
## [3,] -1.3861756 -0.35825638 -0.3280506
## [4,]  0.8758778 -0.09110264  1.0734769
## [5,]  0.4767966  0.83672626  1.8279467
## [6,]  0.6411954  0.78682553  2.0703413

Contrary to the usual approach, the number of clusters is not chosen with the fviz_nbclust function, nor by comparing candidate numbers of clusters with the gap statistic (clusGap function); a sketch of this usual selection is given below for reference. Since the aim of K-means clustering here is to assess the robustness of the clusters obtained by model-based clustering, the same number of clusters must be used.
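For reference only, the usual selection mentioned above would look as follows (factoextra and cluster packages); this sketch is not run for the analysis, since the number of clusters is imposed.

# Sketch (not used here): usual ways of choosing the number of clusters
fviz_nbclust(df, kmeans, method = "silhouette")          # average silhouette width vs. number of clusters
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25, K.max = 10, B = 50)
fviz_gap_stat(gap_stat)                                  # gap statistic vs. number of clusters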

In the following analysis, a fixed number of clusters (5) is provided to the function. The nstart parameter is set to 150, so that the algorithm is run from 150 random starting configurations and the best solution is kept, which stabilises the result.

As the cluster plots below show, the clusters overlap considerably; the results must therefore be interpreted with caution.

km <- kmeans(df, centers = 5,nstart = 150) 
km
## K-means clustering with 5 clusters of sizes 127, 108, 218, 249, 152
## 
## Cluster means:
##          [,1]       [,2]       [,3]
## 1 -0.12993060  1.5998422 -0.2185933
## 2  1.27474350  0.2305231 -1.3047782
## 3 -0.02999682  0.2459167  1.1302084
## 4 -0.98619837 -0.4442591 -0.4587578
## 5  0.86139204 -1.1254327  0.2402807
## 
## Clustering vector:
##   [1] 4 3 4 3 3 3 5 3 3 4 4 4 3 2 2 3 4 5 1 2 5 4 3 5 3 5 5 1 5 3 5 1 2 5 4 1 3
##  [38] 3 3 2 1 4 4 1 4 3 1 4 5 5 1 4 4 5 2 4 3 5 1 3 3 4 4 1 4 3 4 4 4 4 3 5 1 3
##  [75] 1 5 5 1 3 3 5 3 1 5 4 1 5 3 5 3 4 4 5 4 1 4 4 3 1 4 1 5 5 5 3 2 3 2 4 4 3
## [112] 3 1 3 3 3 1 3 3 1 1 4 5 2 3 4 5 3 2 4 3 1 2 4 5 4 3 5 2 4 3 3 1 1 2 5 3 5
## [149] 4 4 3 4 4 3 5 3 3 3 1 3 1 2 3 1 4 3 1 3 5 3 5 4 3 4 2 1 5 4 4 3 2 1 2 2 3
## [186] 4 3 4 1 3 3 4 4 4 2 4 4 4 1 1 2 3 4 4 4 3 4 5 3 2 5 3 5 3 5 5 3 3 1 4 3 4
## [223] 4 5 3 1 1 5 5 5 3 3 3 3 2 5 5 3 1 4 3 4 3 4 5 4 5 2 5 4 4 1 5 1 5 5 2 4 2
## [260] 3 1 5 5 4 5 2 3 2 3 4 3 2 3 2 5 2 5 3 3 5 3 3 1 3 4 5 1 3 3 3 5 4 4 1 4 4
## [297] 4 3 5 4 1 4 3 1 2 5 2 4 2 1 2 1 4 3 2 3 3 5 4 3 2 4 5 5 2 5 1 1 3 2 4 1 4
## [334] 5 4 5 4 3 4 3 4 4 3 2 3 1 3 1 4 3 3 4 1 4 2 4 4 4 5 5 3 1 2 3 2 3 4 4 3 3
## [371] 3 4 1 2 4 2 3 3 1 2 4 5 5 2 4 4 3 4 4 4 3 2 4 2 3 4 3 4 3 4 2 4 2 4 5 3 5
## [408] 4 2 1 5 2 4 3 5 4 2 4 1 3 5 2 3 5 3 5 5 2 2 4 5 4 3 5 3 2 4 1 3 4 3 2 5 4
## [445] 2 5 4 4 2 4 4 5 4 1 4 3 1 3 1 3 4 4 2 3 5 4 3 1 3 2 4 3 2 4 2 4 4 2 4 4 4
## [482] 5 4 2 4 1 1 1 3 4 3 4 5 1 2 1 4 3 2 1 4 3 2 3 2 3 1 1 4 1 2 3 4 2 4 1 4 4
## [519] 4 2 4 2 4 2 5 4 1 2 3 2 5 4 4 5 1 4 4 2 5 4 2 2 4 4 3 4 3 2 4 2 4 1 4 2 4
## [556] 2 4 5 3 1 3 4 1 1 3 4 5 4 3 5 3 4 5 1 1 1 5 4 4 2 2 4 1 2 1 5 4 3 4 4 4 4
## [593] 4 4 3 3 1 5 5 3 5 1 4 4 3 3 4 4 4 1 2 3 4 1 5 4 1 3 3 5 4 4 5 2 1 4 4 4 1
## [630] 3 5 5 4 4 2 1 2 2 4 5 2 4 4 4 4 1 4 1 3 3 3 3 5 5 4 4 5 3 4 4 2 3 5 5 5 5
## [667] 3 5 3 4 3 3 5 3 3 1 3 4 1 2 4 4 5 5 4 5 4 5 3 5 3 3 2 1 5 5 1 4 5 3 5 3 4
## [704] 4 5 4 5 3 3 3 3 3 2 5 1 1 1 1 5 3 1 1 3 3 3 5 4 2 4 1 2 3 2 2 3 1 1 4 4 1
## [741] 5 1 5 5 3 3 3 1 4 3 1 3 5 4 2 4 3 5 3 5 2 3 3 1 3 3 5 5 3 3 1 3 4 4 4 1 5
## [778] 1 5 4 4 3 4 4 4 4 4 5 4 5 3 4 1 5 4 4 5 5 3 1 1 1 4 5 2 4 5 3 1 5 1 4 1 4
## [815] 3 4 2 2 5 1 4 3 3 3 4 2 5 3 3 4 3 3 4 4 3 4 3 3 1 4 4 3 3 3 5 4 5 2 4 1 3
## [852] 4 1 5
## 
## Within cluster sum of squares by cluster:
## [1] 134.2446 194.9681 202.3039 212.8665 166.5970
##  (between_SS / total_SS =  64.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
fviz_cluster(km, data = df, ellipse.type = "norm", axes = c(1, 2)) # plot the clusters on dimensions 1 and 2

fviz_cluster(km, data = df, ellipse.type = "norm", axes = c(1, 3)) # plot the clusters on dimensions 1 and 3

2.2.2 Results extraction

The number of dives per cluster is presented in the following data frame.

kmeans_results = as.data.frame(t(km$size))
names(kmeans_results) <- c("Cluster 1", "Cluster 2","Cluster 3","Cluster 4","Cluster 5")
kmeans_results
##   Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5
## 1       127       108       218       249       152

2.3 Hierarchical clustering

2.3.1 Clustering construction

As for K-means clustering, the data matrix must be normalised to perform the analysis.

The first step is to choose the linkage method. To do so, the agglomerative coefficient of each method is computed, and the method whose coefficient is closest to 1 is retained.

m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

#function to compute agglomerative coefficient
ac <- function(x) {
  agnes(df, method = x)$ac
}

#calculate agglomerative coefficient for each clustering linkage method
sapply(m, ac) # best: closest to 1 (here Ward)
##   average    single  complete      ward 
## 0.9352095 0.7961501 0.9589835 0.9923943

Ward’s method seems to be the best suited for the analysis.

Hierarchical clustering is performed using Ward's minimum variance method and a dendrogram is plotted. The dendrogram is then cut into the required number of clusters, here 5.

clust <- agnes(df, method = "ward")
pltree(clust, cex = 0.6, hang = -1, main = "Dendrogram") 

#compute distance matrix
d <- dist(df, method = "euclidean")
#perform hierarchical clustering using Ward's method
final_clust <- hclust(d, method = "ward.D2" )
#cut the dendrogram into 5 clusters
groups <- cutree(final_clust, k=5)

2.3.2 Result extraction

The number of dives per cluster is given below.

hmeans_results <- table(groups)
names(hmeans_results) <- c("Cluster 1","Cluster 2","Cluster 3","Cluster 4","Cluster 5")
hmeans_results
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 
##       150       153       239       149       163

3 Clustering results comparison

To estimate the robustness of the clusters obtained with the different methods, the agreement between the resulting classifications is quantified.

For model-based clustering, only the first threshold is used, so that all dives are classified and the comparison with hierarchical and K-means clustering is possible. After computing a distance matrix, the function cluster.stats is applied to each pair of methods.

Pairwise comparisons are made between the three clustering methods. The corrected-for-chance Rand index (corrected.rand) and the variation of information index (vi) quantify how similar two classifications are.

Comparison of model-based and hierarchical clustering:

data_dist = dist(xpca.fd[["scores"]]) # Euclidean distance matrix on the fPCA scores
comp_mh = cluster.stats(data_dist, results_thr1$cluster, alt.clustering = groups) #comparison mclust & hclust
comp_mh$corrected.rand
## [1] 0.5951515
comp_mh$vi
## [1] 1.19351

Comparison of model-based and K means clustering:

comp_mk = cluster.stats(data_dist, results_thr1$cluster, alt.clustering = km$cluster) #comparison mclust & kclust
comp_mk$corrected.rand
## [1] 0.3877119
comp_mk$vi
## [1] 1.476983

Comparison of K means and hierarchical clusterings:

comp_kh = cluster.stats(data_dist, groups, alt.clustering = km$cluster) #comparison hclust & kclust
comp_kh$corrected.rand
## [1] 0.4719402
comp_kh$vi
## [1] 1.248073


4 Characteristics of clusters

For each cluster, the mean normalised dive profile and its 2.5%, 25%, 75% and 97.5% quantiles are computed from the spline coefficients of the dives assigned to that cluster (using the 0.2 uncertainty threshold, i.e. results_thr5).

time = seq(0, 1, by=(1/49)) # 50 regularly spaced points on the normalised time scale

idx_1 <- results_thr5[results_thr5$cluster == 1,] # extract dives in cluster 1
idx_1$index <- row.names(idx_1) # get index of dives 
vec_index_1 = as.numeric(c(idx_1$index)) # create vector to index in X.fd$coefs
mean_cl1 <- data.frame(NA_col = rep(NA, 50)) # create empty df to store results
for (k in 1:50) { 
  data = mean(X.fd$coefs[k, vec_index_1 ]) # get the mean
  q_2_5 = quantile(X.fd$coefs[k, vec_index_1 ], probs = c(.025)) # get quantile = 2.5%
  q_25 = quantile(X.fd$coefs[k, vec_index_1 ], probs = c(.25)) # get quantile = 25%
  q_75 = quantile(X.fd$coefs[k, vec_index_1 ], probs = c(.75)) # get quantile = 75%
  q_97_5 = quantile(X.fd$coefs[k, vec_index_1 ], probs = c(.975)) # get quantile = 97.5%
  mean_cl1[k , 1] <- data.frame(data)
  mean_cl1[k , 2] <- data.frame(q_2_5)
  mean_cl1[k , 3] <- data.frame(q_25)
  mean_cl1[k , 4] <- data.frame(q_75)
  mean_cl1[k , 5] <- data.frame(q_97_5)
}
mean_cl1[, 6] <- time
plot(mean_cl1$NA_col)

idx_2 <- results_thr5[results_thr5$cluster == 2,]
idx_2$index <- row.names(idx_2)
vec_index_2 = as.numeric(c(idx_2$index)) # create vector to index in X.fd$coefs
mean_cl2 <- data.frame(NA_col = rep(NA, 50)) # create empty df to store results
for (k in 1:50) { # loop to get mean of data 
  data = mean(X.fd$coefs[k, vec_index_2 ])
  q_2_5 = quantile(X.fd$coefs[k, vec_index_2 ], probs = c(.025)) # get quantile = 2.5%
  q_25 = quantile(X.fd$coefs[k, vec_index_2 ], probs = c(.25)) # get quantile = 25%
  q_75 = quantile(X.fd$coefs[k, vec_index_2 ], probs = c(.75)) # get quantile = 75%
  q_97_5 = quantile(X.fd$coefs[k, vec_index_2 ], probs = c(.975)) # get quantile = 97.5%
  mean_cl2[k , 1] <- data.frame(data)
  mean_cl2[k , 2] <- data.frame(q_2_5)
  mean_cl2[k , 3] <- data.frame(q_25)
  mean_cl2[k , 4] <- data.frame(q_75)
  mean_cl2[k , 5] <- data.frame(q_97_5)}
plot(mean_cl2$q_25)

mean_cl2[, 6] <- time

idx_3 <- results_thr5[results_thr5$cluster == 3,]
idx_3$index <- row.names(idx_3)
vec_index_3 = as.numeric(c(idx_3$index)) # create vector to index in X.fd$coefs
mean_cl3 <- data.frame(NA_col = rep(NA, 50)) # create empty df to store results
for (k in 1:50) { # loop to get mean of data 
  data = mean(X.fd$coefs[k, vec_index_3 ])
  q_2_5 = quantile(X.fd$coefs[k, vec_index_3 ], probs = c(.025)) # get quantile = 2.5%
  q_25 = quantile(X.fd$coefs[k, vec_index_3 ], probs = c(.25)) # get quantile = 25%
  q_75 = quantile(X.fd$coefs[k, vec_index_3 ], probs = c(.75)) # get quantile = 75%
  q_97_5 = quantile(X.fd$coefs[k, vec_index_3 ], probs = c(.975)) # get quantile = 97.5%
  mean_cl3[k , 1] <- data.frame(data)
  mean_cl3[k , 2] <- data.frame(q_2_5)
  mean_cl3[k , 3] <- data.frame(q_25)
  mean_cl3[k , 4] <- data.frame(q_75)
  mean_cl3[k , 5] <- data.frame(q_97_5)}
plot(mean_cl3$NA_col)

mean_cl3[, 6] <- time

idx_4 <- results_thr5[results_thr5$cluster == 4,]
idx_4$index <- row.names(idx_4)
vec_index_4 = as.numeric(c(idx_4$index)) # create vector to index in X.fd$coefs
mean_cl4 <- data.frame(NA_col = rep(NA, 50)) # create empty df to store results
for (k in 1:50) { # loop to get mean of data 
  data = mean(X.fd$coefs[k, vec_index_4 ])
  q_2_5 = quantile(X.fd$coefs[k, vec_index_4 ], probs = c(.025)) # get quantile = 2.5%
  q_25 = quantile(X.fd$coefs[k, vec_index_4 ], probs = c(.25)) # get quantile = 25%
  q_75 = quantile(X.fd$coefs[k, vec_index_4 ], probs = c(.75)) # get quantile = 75%
  q_97_5 = quantile(X.fd$coefs[k, vec_index_4 ], probs = c(.975)) # get quantile = 97.5%
  mean_cl4[k , 1] <- data.frame(data)
  mean_cl4[k , 2] <- data.frame(q_2_5)
  mean_cl4[k , 3] <- data.frame(q_25)
  mean_cl4[k , 4] <- data.frame(q_75)
  mean_cl4[k , 5] <- data.frame(q_97_5)}
plot(mean_cl4$NA_col)

mean_cl4[, 6] <- time

idx_5 <- results_thr5[results_thr5$cluster == 5,]
idx_5$index <- row.names(idx_5)
vec_index_5 = as.numeric(c(idx_5$index)) # create vector to index in X.fd$coefs
mean_cl5 <- data.frame(NA_col = rep(NA, 50)) # create empty df to store results
for (k in 1:50) { # loop to get mean of data 
  data = mean(X.fd$coefs[k, vec_index_5 ])
  q_2_5 = quantile(X.fd$coefs[k, vec_index_5 ], probs = c(.025)) # get quantile = 2.5%
  q_25 = quantile(X.fd$coefs[k, vec_index_5 ], probs = c(.25)) # get quantile = 25%
  q_75 = quantile(X.fd$coefs[k, vec_index_5 ], probs = c(.75)) # get quantile = 75%
  q_97_5 = quantile(X.fd$coefs[k, vec_index_5 ], probs = c(.975)) # get quantile = 97.5%
  mean_cl5[k , 1] <- data.frame(data)
  mean_cl5[k , 2] <- data.frame(q_2_5)
  mean_cl5[k , 3] <- data.frame(q_25)
  mean_cl5[k , 4] <- data.frame(q_75)
  mean_cl5[k , 5] <- data.frame(q_97_5)}
plot(mean_cl5$NA_col)

mean_cl5[, 6] <- time
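The five blocks above repeat the same computation for each cluster. As a sketch only, the whole summary could be wrapped in a helper function; the names below are illustrative, and the content matches the data frames built above up to column names.

# Sketch of a helper wrapping the per-cluster summary computed above
cluster_summary <- function(cl) {
  idx  <- as.numeric(row.names(results_thr5[results_thr5$cluster == cl, ]))
  coef <- X.fd$coefs[, idx]   # spline coefficients of the dives assigned to cluster cl
  data.frame(mean   = apply(coef, 1, mean),
             q_2_5  = apply(coef, 1, quantile, probs = 0.025),
             q_25   = apply(coef, 1, quantile, probs = 0.25),
             q_75   = apply(coef, 1, quantile, probs = 0.75),
             q_97_5 = apply(coef, 1, quantile, probs = 0.975),
             time   = time)
}
mean_cl1_alt <- cluster_summary(1)   # same content as mean_cl1, with explicit column names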

Now that the means and quantiles have been extracted, the corresponding profiles can be plotted for each cluster.

c1 <- ggplot(mean_cl1, aes(x=time, y=NA_col)) +
  geom_line(color="red", size=2)+
  geom_line(aes(y=q_2_5),linetype="dotted", size=1) +
  geom_line(aes(y=q_25),linetype="dotted", size=1) +
  geom_line(aes(y=q_75),linetype="dotted", size=1) +
  geom_line(aes(y=q_97_5),linetype="dotted", size=1) +
  ggtitle("Cluster 1") +
  scale_y_continuous(n.breaks=3) +
  scale_x_continuous(n.breaks=3) +
  xlab("Time") + ylab("Depth") +
  theme_classic()+
  theme(axis.text.x=element_text(size=17), axis.text.y=element_text(size=17),
        axis.title = element_text(size=19),
        plot.title = element_text(size=21, face="bold"))
c1

c2 <- ggplot(mean_cl2, aes(x=time, y=NA_col)) +
 geom_line(color="red", size=2)+
  geom_line(aes(y=q_2_5),linetype="dotted", size=1) +
  geom_line(aes(y=q_25),linetype="dotted", size=1) +
  geom_line(aes(y=q_75),linetype="dotted", size=1) +
  geom_line(aes(y=q_97_5),linetype="dotted", size=1) +
  ggtitle("Cluster 2") +
  scale_y_continuous(n.breaks=3) +
  scale_x_continuous(n.breaks=3) +
  xlab("Time") + ylab("Depth") +
  theme_classic()+
  theme(axis.text.x=element_text(size=17), axis.text.y=element_text(size=17),
        axis.title = element_text(size=19),
        plot.title = element_text(size=21, face="bold"))
c2

c3 <- ggplot(mean_cl3, aes(x=time, y=NA_col)) +
 geom_line(color="red", size=2)+
  geom_line(aes(y=q_2_5),linetype="dotted", size=1) +
  geom_line(aes(y=q_25),linetype="dotted", size=1) +
  geom_line(aes(y=q_75),linetype="dotted", size=1) +
  geom_line(aes(y=q_97_5),linetype="dotted", size=1) +
  ggtitle("Cluster 3") +
  scale_y_continuous(n.breaks=3) +
  scale_x_continuous(n.breaks=3) +
  xlab("Time") + ylab("Depth") +
  theme_classic()+
  theme(axis.text.x=element_text(size=17), axis.text.y=element_text(size=17),
        axis.title = element_text(size=19),
        plot.title = element_text(size=21, face="bold"))
c3

c4 <- ggplot(mean_cl4, aes(x=time, y=NA_col)) +
 geom_line(color="red", size=2)+
  geom_line(aes(y=q_2_5),linetype="dotted", size=1) +
  geom_line(aes(y=q_25),linetype="dotted", size=1) +
  geom_line(aes(y=q_75),linetype="dotted", size=1) +
  geom_line(aes(y=q_97_5),linetype="dotted", size=1) +
  ggtitle("Cluster 4") +
  scale_y_continuous(n.breaks=3) +
  scale_x_continuous(n.breaks=3) +
  xlab("Time") + ylab("Depth") +
  theme_classic()+
  theme(axis.text.x=element_text(size=17), axis.text.y=element_text(size=17),
        axis.title = element_text(size=19),
        plot.title = element_text(size=21, face="bold"))
c4

c5 <- ggplot(mean_cl5, aes(x=time, y=NA_col)) +
 geom_line(color="red", size=2)+
  geom_line(aes(y=q_2_5),linetype="dotted", size=1) +
  geom_line(aes(y=q_25),linetype="dotted", size=1) +
  geom_line(aes(y=q_75),linetype="dotted", size=1) +
  geom_line(aes(y=q_97_5),linetype="dotted", size=1) +
  ggtitle("Cluster 5") +
  scale_y_continuous(n.breaks=3) +
  scale_x_continuous(n.breaks=3) +
  xlab("Time") + ylab("Depth") +
  theme_classic()+
  theme(axis.text.x=element_text(size=17), axis.text.y=element_text(size=17),
        axis.title = element_text(size=19),
        plot.title = element_text(size=21, face="bold"))
c5

legend <- get_legend( c1 )
prow <- plot_grid(
  c1 + theme(legend.position="none"),
  c2 + theme(legend.position="none"),
  c3 + theme(legend.position="none"),
  c4 + theme(legend.position="none"),
  c5 + theme(legend.position="none"),
  labels = c("A", "B", "C"),
  nrow = 2
)
prow

# png(file="/Users/adelieantoine/Desktop/clusters.png",
# width=600, height=350)

5 Save files

To assess the seals' behaviour within each dive group, further analyses are conducted in MATLAB. The result file, containing the cluster number of each dive and the dive index for each seal, can therefore be saved in .csv format.
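A minimal sketch of the export is given below; the file name and the exported columns are illustrative.

# Sketch: export the cluster assignments to a .csv file readable from MATLAB
export <- data.frame(dive_index  = row.names(results),
                     cluster     = results$cluster,
                     uncertainty = results$uncertainty)
write.csv(export, file = "dive_clusters.csv", row.names = FALSE)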

6 References

Godard M, Manté C, Guinet C, Picard B, Nerini D. 2020. ‘Diving Behavior of Mirounga leonina: A Functional Data Analysis Approach.’ Front Mar Sci 7:595.

Kassambara A., Mundt F. 2020. ‘factoextra: Extract and Visualize the Results of Multivariate Data Analyses’. R package version 1.0.7.

Maechler M., Rousseeuw P., Struyf A., Hubert M., Hornik K. 2019. ‘cluster: Cluster Analysis Basics and Extensions’. R package version 2.1.0.

Scrucca L. 2010. ‘Dimension reduction for model-based clustering.’ Stat Comput 20: 471–484.

Scrucca L., Fop M., Murphy T.B., Raftery A.E. 2016. ‘mclust 5: clustering, classification and density estimation using Gaussian finite mixture models.’ The R Journal, 8(1): 289–317.