Before applying the fPCA to the data, the 854 recorded dives were visualised to check for sensor errors or unexpected values. The 854 individual dive profiles are not reproduced here; a sample of four dives is presented below.
These dive profiles illustrate the variety of dive types in the studied dataset, with duration, depth, skewness and overall shape differing among dives.
Following Godard et al. (2020, in Frontiers in Marine Science), the fPCA is performed on normalised dives: rescaling both depth and duration to a 0-1 range makes dive shapes directly comparable. The code is shown below.
# Packages used throughout the analysis
library(fda)        # B-spline bases, functional PCA
library(mclust)     # model-based clustering
library(factoextra) # k-means visualisation
library(cluster)    # agnes, hierarchical clustering
library(fpc)        # cluster.stats
library(knitr)      # kable
library(kableExtra) # table styling
library(ggplot2)    # cluster profile plots
library(cowplot)    # plot_grid, get_legend
nbcoef=50
myb=create.bspline.basis(c(0,1),nbcoef)  # B-spline basis on [0, 1]
penMat=eval.penalty(myb,2)               # second-derivative roughness penalty
penMat=cbind(penMat,0,0)                 # pad for the two Lagrange multipliers
penMat=rbind(penMat,0,0)
X.fd=fd(coef=NULL,basisobj=myb)          # functional data object (starts with a zero placeholder column)
max_depth=NULL
duree=NULL
for(k in 1:nplong){
cat("k=",k,fill=TRUE)                    # progress print (output omitted below)
tps=plong[[k]][,1]
duree=c(duree,max(tps)-min(tps))         # dive duration
tps=(tps-min(tps))/(max(tps)-min(tps))   # rescale time to [0, 1]
prof=plong[[k]][,2]
max_depth=c(max_depth,min(prof))         # maximum depth (depths are negative)
prof=-prof/min(prof)                     # rescale depth so the surface is 0 and the deepest point is -1
### Compute the spline coefficients under constraints (the fitted curve is forced to 0, i.e. the surface, at t = 0 and t = 1)
k1=t(bsplineS(0,breaks=c(0,myb$params,1),nderiv=0))
k1b=t(bsplineS(1,breaks=c(0,myb$params,1),nderiv=0))
X=eval.basis(tps,myb)
Mat=rbind(cbind(t(X)%*%X,k1,k1b),cbind(t(k1),0,0),cbind(t(k1b),0,0))
### Penalised least squares for the spline coefficients, here KAPPA=1e-06
achap=solve(Mat+0.000001*penMat)%*%rbind(t(X)%*%prof,0,0)
X.fd$coefs=cbind(X.fd$coefs,as.matrix(achap[1:nbcoef]))
}
### Keep the coefficients
X.fd$coefs=X.fd$coefs[,-1] # drop the zero placeholder column created when X.fd was initialised
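As a quick check that the constrained smoothing behaves as expected, the fitted curves can be evaluated back on a regular grid; a minimal sketch, assuming the X.fd object built above:
tgrid = seq(0, 1, length.out = 101)         # regular grid on normalised time
check_curves = eval.fd(tgrid, X.fd[1:4,])   # evaluate the first four fitted dives
matplot(tgrid, check_curves, type = "l",
        xlab = "Normalised time", ylab = "Normalised depth")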
# plot one normalised dive for visualisation
plot(X.fd[68,])
The functional PCA is then performed.
xpca.fd=pca.fd(X.fd,3)
Three dive shapes are found, with each archetypal profile shown below.
The following plot is used to determine the number of components to keep for the classification; it suggests that five components are needed here.
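As a minimal sketch of such an inspection, the proportion of variance captured by each computed harmonic is stored in the varprop element returned by pca.fd (only the harmonics actually computed are available, so pca.fd would need a larger nharm argument to inspect more of them):
barplot(xpca.fd$varprop, names.arg = seq_along(xpca.fd$varprop),
        xlab = "Harmonic", ylab = "Proportion of variance explained")
cumsum(xpca.fd$varprop) # cumulative proportion of variance explained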
To build clusters according to the shape of each dive, several methods are tested and compared. Cluster robustness is assessed by estimating the consistency of the groups produced by each method.
Three methods were used:

- Model-based clustering, where classification and density are estimated with finite Gaussian mixture models;
- K-means clustering, which assigns each observation to the cluster whose centroid is closest in Euclidean distance;
- Hierarchical clustering, which computes pairwise dissimilarities between observations and successively fuses them into clusters, summarised by a classification tree (dendrogram).
The data matrix, which must be standardised, is provided to the Mclust function to perform the clustering. A summary of the best model is available, giving the sample size of each cluster and the mixing probabilities (overall probability of belonging to each cluster). Model suitability is assessed with the Bayesian information criterion (BIC hereafter).
The best-suited model, i.e. the one with the highest BIC, comprises 5 mixture components; group volume and shape vary while orientations are equal (VVE model).
The corresponding code and its output are displayed below.
mod1 <- Mclust(xpca.fd[["scores"]])
summary(mod1, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 5 components:
##
## log-likelihood n df BIC ICL
## 2148.217 854 37 4046.686 3694.369
##
## Clustering table:
## 1 2 3 4 5
## 75 205 133 256 185
##
## Mixing probabilities:
## 1 2 3 4 5
## 0.07898608 0.24553603 0.16480201 0.27891470 0.23176118
##
## Means:
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.19172360 -0.002713458 -0.06714429 -0.03480885 0.15785198
## [2,] -0.08748832 0.067796446 0.11319588 -0.08389994 -0.02153118
## [3,] -0.07574470 0.109994251 -0.04912812 0.02487256 -0.08571618
##
## Variances:
## [,,1]
## [,1] [,2] [,3]
## [1,] 1.005868e-03 -0.0001753370 -4.015381e-05
## [2,] -1.753370e-04 0.0008215416 1.301208e-04
## [3,] -4.015381e-05 0.0001301208 1.233113e-03
## [,,2]
## [,1] [,2] [,3]
## [1,] 7.146416e-03 -3.453914e-05 0.001846948
## [2,] -3.453914e-05 6.098091e-03 -0.001566044
## [3,] 1.846948e-03 -1.566044e-03 0.003644108
## [,,3]
## [,1] [,2] [,3]
## [1,] 0.004820275 0.0046402045 -0.0022437787
## [2,] 0.004640204 0.0115023659 -0.0006062919
## [3,] -0.002243779 -0.0006062919 0.0051292707
## [,,4]
## [,1] [,2] [,3]
## [1,] 0.008670377 -0.0031691751 0.0028940728
## [2,] -0.003169175 0.0033637318 -0.0007543304
## [3,] 0.002894073 -0.0007543304 0.0058555057
## [,,5]
## [,1] [,2] [,3]
## [1,] 1.541464e-02 7.292807e-05 0.003340142
## [2,] 7.292807e-05 1.367803e-02 -0.002906007
## [3,] 3.340142e-03 -2.906007e-03 0.008964650
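The reported BIC can be recomputed from the fitted model: mclust defines it as 2*log-likelihood - df*log(n), so larger values are better. A minimal sketch using the stored components of mod1:
2 * mod1$loglik - mod1$df * log(mod1$n) # reproduces the BIC reported above (about 4046.7)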
Pairwise plots of the classification across the score dimensions can be drawn to illustrate the clustering.
plot(mod1, what = "classification")
The BIC of each model can also be plotted against the number of mixture components; the following plot illustrates why a five-component mixture is retained.
For complementary information, the BIC summary reports the BIC values of the top three models.
The ICL criterion can also be used to counterbalance the bias of BIC, which approximates the density rather than the number of clusters as such. Here, resorting to ICL may not be relevant, since it performs well only when clusters overlap very little.
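For completeness, a sketch of how the ICL values could be obtained with the mclustICL function, analogous to mclustBIC below:
ICL <- mclustICL(xpca.fd[["scores"]]) # ICL for the same family of models
summary(ICL)                          # top models according to ICL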
BIC <- mclustBIC(xpca.fd[["scores"]])
plot(BIC)    # the five-component VVE mixture has the top BIC value
summary(BIC) # summary of the top 3 models
## Best BIC values:
## VVE,5 VVE,6 VEV,5
## BIC 4046.686 4031.03696 4013.66700
## BIC diff 0.000 -15.64927 -33.01922
Scrucca (2010, in Statistics and Computing) suggests reducing the dimensionality to project the clusters onto a suitable subspace, in order to visualise the clustering structure and its geometric characteristics. The reduction is performed with the MclustDR function. The estimated directions are defined as linear combinations of the original features, ordered by importance; the ordering is based on the associated eigenvalues.
The summary function displays information on the resulting directions. The eigenvalue of each retained direction is then plotted, reflecting the classification obtained with the Mclust function.
drmod <- MclustDR(mod1, lambda = 1) # reduction of dimensions
summary(drmod)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: Mclust (VVE, 5)
##
## Clusters n
## 1 75
## 2 205
## 3 133
## 4 256
## 5 185
##
## Estimated basis vectors:
## Dir1 Dir2 Dir3
## [1,] -0.55279 -0.58669 0.34244
## [2,] 0.31787 -0.74461 -0.62080
## [3,] 0.77032 -0.31836 0.70523
##
## Dir1 Dir2 Dir3
## Eigenvalues 0.88063 0.46734 0.21636
## Cum. % 56.29455 86.16902 100.00000
plot(drmod, what = "evalues")
Several exploratory plots can be obtained; they are different representations of the same information, with each colour representing a cluster.
Pairwise representation of the classification along the estimated directions:
plot(drmod, what = "pairs")
Cluster scatterplot with contours depending on the distance to the centroid, for directions 1 and 2:
plot(drmod, what = "contour")
Cluster limits according to the model, for directions 1 and 2:
plot(drmod, what = "classification")
Uncertainty zones (grey) according to the model, where adjacent points belong to different clusters, for directions 1 and 2:
plot(drmod, what = "boundaries")
The distribution of observations along each direction can also be plotted, to compare cluster positions in the reduced subspace.
plot(drmod, dimens = 1, what = "density")
plot(drmod, dimens = 2, what = "density")
plot(drmod, dimens = 3, what = "density")
The resulting clusters can be extracted from the model. Assessing the probability of each data point belonging to its cluster is needed to estimate the robustness of the cluster predictions.
In the following table, the first three columns give the position of each dive along the fPCA score dimensions. The last two columns (in bold) give the cluster to which the dive is assigned and the uncertainty of that assignment, i.e. one minus the posterior probability of belonging to the assigned cluster. Only the first six rows are shown.
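A minimal check of this definition, recomputing the uncertainty from the posterior membership matrix mod1$z:
head(1 - apply(mod1$z, 1, max)) # should match head(mod1$uncertainty)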
cluster = mod1$classification
uncertainty = mod1$uncertainty
clust_results = data.frame(xpca.fd[["scores"]],cluster)
results = data.frame(clust_results,uncertainty)
kable(head(results)) %>%
kable_styling("striped") %>%
column_spec(4:5, bold = TRUE) %>%
add_header_above(c("Dimension axis" = 3, "Clustering results" = 2))
| X1 | X2 | X3 | cluster | uncertainty |
|---|---|---|---|---|
| -0.1724743 | -0.1123871 | -0.0563770 | 1 | 0.0851807 |
| -0.0589972 | 0.1269479 | 0.0889921 | 2 | 0.0727001 |
| -0.1879304 | -0.0415901 | -0.0348726 | 1 | 0.4344803 |
| 0.1187469 | -0.0105761 | 0.1141133 | 2 | 0.1234015 |
| 0.0646416 | 0.0971359 | 0.1943154 | 2 | 0.0004626 |
| 0.0869299 | 0.0913429 | 0.2200825 | 2 | 0.0004977 |
Several thresholds can be tested: only dives whose uncertainty lies below a given threshold are considered to belong to their assigned cluster. For each tested threshold, the resulting number of classified dives is compared.
The maximum assignment uncertainty over all dives is 0.6490577.
The different thresholds tested are:
a1 = max(results$uncertainty) # maximum observed uncertainty (0.6490577), so no dive is excluded at this threshold
results_thr1 = results[!(results$uncertainty>a1),]
results_thr2 = results[!(results$uncertainty>0.5),]
results_thr3 = results[!(results$uncertainty>0.4),]
results_thr4 = results[!(results$uncertainty>0.3),]
results_thr5 = results[!(results$uncertainty>0.2),]
results_thr6 = results[!(results$uncertainty>0.1),]
results_thr7 = results[!(results$uncertainty>0.05),]
results_thr8 = results[!(results$uncertainty>0.04),]
results_thr9 = results[!(results$uncertainty>0.03),]
results_thr10 = results[!(results$uncertainty>0.02),]
results_thr11 = results[!(results$uncertainty>0.01),]
thr_summary <- data.frame(var1=c(a1,0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.04,0.03, 0.02, 0.01),
var2=c(nrow(results_thr1),nrow(results_thr2),nrow(results_thr3),
nrow(results_thr4),nrow(results_thr5),nrow(results_thr6),
nrow(results_thr7),nrow(results_thr8),nrow(results_thr9),nrow(results_thr10),nrow(results_thr11)))
a2 <- c(a1,0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.04, 0.03, 0.02, 0.01)
clust1 <- numeric()
clust2 <- numeric()
clust3 <- numeric()
clust4 <- numeric()
clust5 <- numeric()
for(j in 1:11){
k = a2[j]
# note: strict '<' here, whereas the totals above keep dives with uncertainty <= threshold
x1 = which(results$cluster == 1 & results$uncertainty<k)
x2 = which(results$cluster == 2 & results$uncertainty<k)
x3 = which(results$cluster == 3 & results$uncertainty<k)
x4 = which(results$cluster == 4 & results$uncertainty<k)
x5 = which(results$cluster == 5 & results$uncertainty<k)
clust1=rbind(clust1, length(x1))
clust2=rbind(clust2, length(x2))
clust3=rbind(clust3, length(x3))
clust4=rbind(clust4, length(x4))
clust5=rbind(clust5, length(x5))
}
thr_summary <- cbind(thr_summary,clust1,clust2,clust3,clust4,clust5)
names(thr_summary) <- c("Threshold value","Total number of dives", "Cluster 1",
"Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5")
kable(thr_summary,align = 'c') %>%
kable_styling("striped") %>%
add_header_above(c(" " = 2, "Number of dives per cluster" = 5))
| Threshold value | Total number of dives | Cluster 1 | Cluster 2 | Cluster 3 | Cluster 4 | Cluster 5 |
|---|---|---|---|---|---|---|
| 0.6490577 | 854 | 75 | 205 | 133 | 255 | 185 |
| 0.5000000 | 832 | 73 | 200 | 130 | 247 | 182 |
| 0.4000000 | 741 | 67 | 181 | 109 | 214 | 170 |
| 0.3000000 | 656 | 62 | 168 | 84 | 177 | 165 |
| 0.2000000 | 545 | 52 | 146 | 59 | 138 | 150 |
| 0.1000000 | 411 | 38 | 114 | 37 | 84 | 138 |
| 0.0500000 | 294 | 17 | 93 | 23 | 37 | 124 |
| 0.0400000 | 258 | 14 | 86 | 15 | 26 | 117 |
| 0.0300000 | 210 | 4 | 79 | 7 | 10 | 110 |
| 0.0200000 | 171 | 0 | 66 | 3 | 0 | 102 |
| 0.0100000 | 143 | 0 | 50 | 0 | 0 | 93 |
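For reference, the same counts can be tallied more compactly; a sketch assuming the results data frame built above (using '<=' throughout):
thresholds = c(max(results$uncertainty), 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.04, 0.03, 0.02, 0.01)
t(sapply(thresholds, function(thr) {
  kept = results[results$uncertainty <= thr, ]             # dives retained at this threshold
  c(Total = nrow(kept), table(factor(kept$cluster, levels = 1:5)))
}))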
To perform K-means clustering, the data matrix must be normalised; the data frame is therefore centred and scaled.
df <- scale(xpca.fd[["scores"]])
head(df)
## [,1] [,2] [,3]
## [1,] -1.2721708 -0.96809969 -0.5303446
## [2,] -0.4351637 1.09352552 0.8371591
## [3,] -1.3861756 -0.35825638 -0.3280506
## [4,] 0.8758778 -0.09110264 1.0734769
## [5,] 0.4767966 0.83672626 1.8279467
## [6,] 0.6411954 0.78682553 2.0703413
Contrary to the usual approach, the number of clusters is not determined with the fviz_nbclust function, nor by comparing cluster numbers through the gap statistic (clusGap function). Since the aim of K-means clustering here is to assess the robustness of the clusters obtained by model-based clustering, the same number of clusters must be used.
In the following analysis, a fixed number of clusters (5) is therefore provided to the function. The nstart parameter is set to 150, meaning the algorithm is run from 150 random starting configurations, which stabilises the result.
As shown in the example cluster plots, the clusters overlap considerably, so the results must be interpreted with caution.
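Because K-means starts from random centroids, fixing the random seed before the call makes the partition reproducible; a minimal sketch (the seed value is arbitrary):
set.seed(2019)                                    # arbitrary seed for reproducibility
km_check <- kmeans(df, centers = 5, nstart = 150)
table(km_check$cluster)                           # cluster sizes, stable across runs once the seed is fixed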
km <- kmeans(df, centers = 5, nstart = 150)
km
## K-means clustering with 5 clusters of sizes 127, 108, 218, 249, 152
##
## Cluster means:
## [,1] [,2] [,3]
## 1 -0.12993060 1.5998422 -0.2185933
## 2 1.27474350 0.2305231 -1.3047782
## 3 -0.02999682 0.2459167 1.1302084
## 4 -0.98619837 -0.4442591 -0.4587578
## 5 0.86139204 -1.1254327 0.2402807
##
## Clustering vector:
## [1] 4 3 4 3 3 3 5 3 3 4 4 4 3 2 2 3 4 5 1 2 5 4 3 5 3 5 5 1 5 3 5 1 2 5 4 1 3
## [38] 3 3 2 1 4 4 1 4 3 1 4 5 5 1 4 4 5 2 4 3 5 1 3 3 4 4 1 4 3 4 4 4 4 3 5 1 3
## [75] 1 5 5 1 3 3 5 3 1 5 4 1 5 3 5 3 4 4 5 4 1 4 4 3 1 4 1 5 5 5 3 2 3 2 4 4 3
## [112] 3 1 3 3 3 1 3 3 1 1 4 5 2 3 4 5 3 2 4 3 1 2 4 5 4 3 5 2 4 3 3 1 1 2 5 3 5
## [149] 4 4 3 4 4 3 5 3 3 3 1 3 1 2 3 1 4 3 1 3 5 3 5 4 3 4 2 1 5 4 4 3 2 1 2 2 3
## [186] 4 3 4 1 3 3 4 4 4 2 4 4 4 1 1 2 3 4 4 4 3 4 5 3 2 5 3 5 3 5 5 3 3 1 4 3 4
## [223] 4 5 3 1 1 5 5 5 3 3 3 3 2 5 5 3 1 4 3 4 3 4 5 4 5 2 5 4 4 1 5 1 5 5 2 4 2
## [260] 3 1 5 5 4 5 2 3 2 3 4 3 2 3 2 5 2 5 3 3 5 3 3 1 3 4 5 1 3 3 3 5 4 4 1 4 4
## [297] 4 3 5 4 1 4 3 1 2 5 2 4 2 1 2 1 4 3 2 3 3 5 4 3 2 4 5 5 2 5 1 1 3 2 4 1 4
## [334] 5 4 5 4 3 4 3 4 4 3 2 3 1 3 1 4 3 3 4 1 4 2 4 4 4 5 5 3 1 2 3 2 3 4 4 3 3
## [371] 3 4 1 2 4 2 3 3 1 2 4 5 5 2 4 4 3 4 4 4 3 2 4 2 3 4 3 4 3 4 2 4 2 4 5 3 5
## [408] 4 2 1 5 2 4 3 5 4 2 4 1 3 5 2 3 5 3 5 5 2 2 4 5 4 3 5 3 2 4 1 3 4 3 2 5 4
## [445] 2 5 4 4 2 4 4 5 4 1 4 3 1 3 1 3 4 4 2 3 5 4 3 1 3 2 4 3 2 4 2 4 4 2 4 4 4
## [482] 5 4 2 4 1 1 1 3 4 3 4 5 1 2 1 4 3 2 1 4 3 2 3 2 3 1 1 4 1 2 3 4 2 4 1 4 4
## [519] 4 2 4 2 4 2 5 4 1 2 3 2 5 4 4 5 1 4 4 2 5 4 2 2 4 4 3 4 3 2 4 2 4 1 4 2 4
## [556] 2 4 5 3 1 3 4 1 1 3 4 5 4 3 5 3 4 5 1 1 1 5 4 4 2 2 4 1 2 1 5 4 3 4 4 4 4
## [593] 4 4 3 3 1 5 5 3 5 1 4 4 3 3 4 4 4 1 2 3 4 1 5 4 1 3 3 5 4 4 5 2 1 4 4 4 1
## [630] 3 5 5 4 4 2 1 2 2 4 5 2 4 4 4 4 1 4 1 3 3 3 3 5 5 4 4 5 3 4 4 2 3 5 5 5 5
## [667] 3 5 3 4 3 3 5 3 3 1 3 4 1 2 4 4 5 5 4 5 4 5 3 5 3 3 2 1 5 5 1 4 5 3 5 3 4
## [704] 4 5 4 5 3 3 3 3 3 2 5 1 1 1 1 5 3 1 1 3 3 3 5 4 2 4 1 2 3 2 2 3 1 1 4 4 1
## [741] 5 1 5 5 3 3 3 1 4 3 1 3 5 4 2 4 3 5 3 5 2 3 3 1 3 3 5 5 3 3 1 3 4 4 4 1 5
## [778] 1 5 4 4 3 4 4 4 4 4 5 4 5 3 4 1 5 4 4 5 5 3 1 1 1 4 5 2 4 5 3 1 5 1 4 1 4
## [815] 3 4 2 2 5 1 4 3 3 3 4 2 5 3 3 4 3 3 4 4 3 4 3 3 1 4 4 3 3 3 5 4 5 2 4 1 3
## [852] 4 1 5
##
## Within cluster sum of squares by cluster:
## [1] 134.2446 194.9681 202.3039 212.8665 166.5970
## (between_SS / total_SS = 64.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(km, data = df, ellipse.type = "norm", axes = c(1, 2)) # plot clusters on dimensions 1 and 2
fviz_cluster(km, data = df, ellipse.type = "norm", axes = c(1, 3)) # plot clusters on dimensions 1 and 3
The number of dives per cluster is presented in the following data frame.
kmeans_results = as.data.frame(t(km$size))
names(kmeans_results) <- c("Cluster 1", "Cluster 2","Cluster 3","Cluster 4","Cluster 5")
kmeans_results
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5
## 1 127 108 218 249 152
As for K-means clustering, the data matrix must be normalised before the analysis.
The first step is to choose the linkage method. To do so, the agglomerative coefficient of each method is computed, and the method whose coefficient is closest to 1 is selected.
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
#function to compute agglomerative coefficient
ac <- function(x) {
agnes(df, method = x)$ac
}
#calculate agglomerative coefficient for each clustering linkage method
sapply(m, ac) # best: closest to 1 (here Ward)
## average single complete ward
## 0.9352095 0.7961501 0.9589835 0.9923943
Ward’s method appears best suited for this analysis.
Hierarchical clustering is therefore performed using Ward’s minimum-variance criterion, and a dendrogram is plotted. The dendrogram is then cut into the required number of clusters, here 5.
clust <- agnes(df, method = "ward")
pltree(clust, cex = 0.6, hang = -1, main = "Dendrogram")
# compute distance matrix
d <- dist(df, method = "euclidean")
#perform hierarchical clustering using Ward's method
final_clust <- hclust(d, method = "ward.D2" )
#cut the dendrogram into 5 clusters
groups <- cutree(final_clust, k=5)
The number of dives per cluster is given below.
hmeans_results <- table(groups)
names(hmeans_results) <- c("Cluster 1","Cluster 2","Cluster 3","Cluster 4","Cluster 5")
hmeans_results
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5
## 150 153 239 149 163
To estimate the robustness of the clusters obtained with the different methods, the partitions are compared by examining which data points they place in the same clusters.
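A simple cross-tabulation of two partitions already shows how their members intersect; a sketch comparing the hierarchical groups and the K-means assignment obtained above:
table(hierarchical = groups, kmeans = km$cluster) # rows: hierarchical clusters, columns: k-means clusters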
For model-based clustering, only the first threshold is used, so that all dives are classified and the comparison with hierarchical and K-means clustering remains possible. After computing a distance matrix, the cluster.stats function is applied to the three methods.
Pairwise comparisons are carried out between all three clustering methods. The corrected-for-chance Rand index (corrected.rand) and the variation of information (vi) quantify the agreement between two classifications.
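As a cross-check of the corrected Rand values reported below, mclust's adjustedRandIndex function can be applied to the same pair of partitions; a minimal sketch for the model-based versus hierarchical comparison:
adjustedRandIndex(results_thr1$cluster, groups) # should agree with comp_mh$corrected.rand below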
Comparison of model-based and hierarchical clustering:
data_dist = xpca.fd[["scores"]]
comp_mh = cluster.stats(data_dist, results_thr1$cluster, alt.clustering = groups) # comparison mclust & hclust
comp_mh$corrected.rand
## [1] 0.5951515
comp_mh$vi
## [1] 1.19351
Comparison of model-based and K-means clustering:
comp_mk = cluster.stats(data_dist, results_thr1$cluster, alt.clustering = km$cluster) # comparison mclust & kmeans
comp_mk$corrected.rand
## [1] 0.3877119
comp_mk$vi
## [1] 1.476983
Comparison of K-means and hierarchical clustering:
comp_kh = cluster.stats(data_dist, groups, alt.clustering = km$cluster) # comparison hclust & kmeans
comp_kh$corrected.rand
## [1] 0.4719402
comp_kh$vi
## [1] 1.248073
The mean normalised dive profile of each cluster, together with quantile envelopes, can then be reconstructed from the spline coefficients of the dives assigned to that cluster (here using the 0.2 uncertainty threshold).
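The per-cluster extraction below is written out in full for each of the five clusters; an equivalent, more compact helper could look like the following sketch (summarise_cluster is a hypothetical name, assuming results_thr5 and X.fd from above):
summarise_cluster <- function(cl, res = results_thr5, coefs = X.fd$coefs) {
  idx <- as.numeric(row.names(res[res$cluster == cl, ]))    # dives assigned to cluster cl
  out <- t(apply(coefs[, idx, drop = FALSE], 1, function(x)
    c(mean = mean(x), quantile(x, c(0.025, 0.25, 0.75, 0.975)))))
  data.frame(out, time = seq(0, 1, length.out = nrow(out))) # one row per spline coefficient
}
mean_cl1_alt <- summarise_cluster(1) # comparable to mean_cl1 computed below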
time = seq(0, 1, by=(1/49))
idx_1 <- results_thr5[results_thr5$cluster == 1,] # extract dives in cluster 1
idx_1$index <- row.names(idx_1) # get index of dives
vec_index_1 = as.numeric(c(idx_1$index)) # create vector to index in X.fd$coefs
mean_cl1 <- data.frame(NA_col = rep(NA, 50)) # create empty df to store results
for (k in 1:50) {
data = mean(X.fd$coefs[k, vec_index_1 ]) # get the mean
q_2_5 = quantile(X.fd$coefs[k, vec_index_1 ], probs = c(.025)) # get quantile = 2.5%
q_25 = quantile(X.fd$coefs[k, vec_index_1 ], probs = c(.25)) # get quantile = 25%
q_75 = quantile(X.fd$coefs[k, vec_index_1 ], probs = c(.75)) # get quantile = 75%
q_97_5 = quantile(X.fd$coefs[k, vec_index_1 ], probs = c(.975)) # get quantile = 97.5%
mean_cl1[k , 1] <- data.frame(data)
mean_cl1[k , 2] <- data.frame(q_2_5)
mean_cl1[k , 3] <- data.frame(q_25)
mean_cl1[k , 4] <- data.frame(q_75)
mean_cl1[k , 5] <- data.frame(q_97_5)
}
mean_cl1[, 6] <- time
plot(mean_cl1$NA_col)
idx_2 <- results_thr5[results_thr5$cluster == 2,]
idx_2$index <- row.names(idx_2)
vec_index_2 = as.numeric(c(idx_2$index)) # create vector to index in X.fd$coefs
mean_cl2 <- data.frame(NA_col = rep(NA, 50)) # create empty df to store results
for (k in 1:50) { # loop to get mean of data
data = mean(X.fd$coefs[k, vec_index_2 ])
mean_cl2[k , ] <- data
q_2_5 = quantile(X.fd$coefs[k, vec_index_2 ], probs = c(.025)) # get quantile = 2.5%
q_25 = quantile(X.fd$coefs[k, vec_index_2 ], probs = c(.25)) # get quantile = 25%
q_75 = quantile(X.fd$coefs[k, vec_index_2 ], probs = c(.75)) # get quantile = 75%
q_97_5 = quantile(X.fd$coefs[k, vec_index_2 ], probs = c(.975)) # get quantile = 97.5%
mean_cl2[k , 1] <- data.frame(data)
mean_cl2[k , 2] <- data.frame(q_2_5)
mean_cl2[k , 3] <- data.frame(q_25)
mean_cl2[k , 4] <- data.frame(q_75)
mean_cl2[k , 5] <- data.frame(q_97_5)}
plot(mean_cl2$NA_col)
mean_cl2[, 6] <- time
idx_3 <- results_thr5[results_thr5$cluster == 3,]
idx_3$index <- row.names(idx_3)
vec_index_3 = as.numeric(c(idx_3$index)) # create vector to index in X.fd$coefs
mean_cl3 <- data.frame(NA_col = rep(NA, 50)) # create empty df to store results
for (k in 1:50) { # loop to get mean of data
data = mean(X.fd$coefs[k, vec_index_3 ])
q_2_5 = quantile(X.fd$coefs[k, vec_index_3 ], probs = c(.025)) # get quantile = 2.5%
q_25 = quantile(X.fd$coefs[k, vec_index_3 ], probs = c(.25)) # get quantile = 25%
q_75 = quantile(X.fd$coefs[k, vec_index_3 ], probs = c(.75)) # get quantile = 75%
q_97_5 = quantile(X.fd$coefs[k, vec_index_3 ], probs = c(.975)) # get quantile = 97.5%
mean_cl3[k , 1] <- data.frame(data)
mean_cl3[k , 2] <- data.frame(q_2_5)
mean_cl3[k , 3] <- data.frame(q_25)
mean_cl3[k , 4] <- data.frame(q_75)
mean_cl3[k , 5] <- data.frame(q_97_5)}
plot(mean_cl3$NA_col)
mean_cl3[, 6] <- time
idx_4 <- results_thr5[results_thr5$cluster == 4,]
idx_4$index <- row.names(idx_4)
vec_index_4 = as.numeric(c(idx_4$index)) # create vector to index in X.fd$coefs
mean_cl4 <- data.frame(NA_col = rep(NA, 50)) # create empty df to store results
for (k in 1:50) { # loop to get mean of data
data = mean(X.fd$coefs[k, vec_index_4 ])
q_2_5 = quantile(X.fd$coefs[k, vec_index_4 ], probs = c(.025)) # get quantile = 2.5%
q_25 = quantile(X.fd$coefs[k, vec_index_4 ], probs = c(.25)) # get quantile = 25%
q_75 = quantile(X.fd$coefs[k, vec_index_4 ], probs = c(.75)) # get quantile = 75%
q_97_5 = quantile(X.fd$coefs[k, vec_index_4 ], probs = c(.975)) # get quantile = 97.5%
mean_cl4[k , 1] <- data.frame(data)
mean_cl4[k , 2] <- data.frame(q_2_5)
mean_cl4[k , 3] <- data.frame(q_25)
mean_cl4[k , 4] <- data.frame(q_75)
mean_cl4[k , 5] <- data.frame(q_97_5)}
plot(mean_cl4$NA_col)
mean_cl4[, 6] <- time
idx_5 <- results_thr5[results_thr5$cluster == 5,]
idx_5$index <- row.names(idx_5)
vec_index_5 = as.numeric(c(idx_5$index)) # create vector to index in X.fd$coefs
mean_cl5 <- data.frame(NA_col = rep(NA, 50)) # create empty df to store results
for (k in 1:50) { # loop to get mean of data
data = mean(X.fd$coefs[k, vec_index_5 ])
q_2_5 = quantile(X.fd$coefs[k, vec_index_5 ], probs = c(.025)) # get quantile = 2.5%
q_25 = quantile(X.fd$coefs[k, vec_index_5 ], probs = c(.25)) # get quantile = 25%
q_75 = quantile(X.fd$coefs[k, vec_index_5 ], probs = c(.75)) # get quantile = 75%
q_97_5 = quantile(X.fd$coefs[k, vec_index_5 ], probs = c(.975)) # get quantile = 97.5%
mean_cl5[k , 1] <- data.frame(data)
mean_cl5[k , 2] <- data.frame(q_2_5)
mean_cl5[k , 3] <- data.frame(q_25)
mean_cl5[k , 4] <- data.frame(q_75)
mean_cl5[k , 5] <- data.frame(q_97_5)}
plot(mean_cl5$NA_col)
mean_cl5[, 6] <- time
Now that the means and quantiles have been extracted, the cluster profiles can be plotted.
c1 <- ggplot(mean_cl1, aes(x=time, y=NA_col)) +
geom_line(color="red", size=2)+
geom_line(aes(y=q_2_5),linetype="dotted", size=1) +
geom_line(aes(y=q_25),linetype="dotted", size=1) +
geom_line(aes(y=q_75),linetype="dotted", size=1) +
geom_line(aes(y=q_97_5),linetype="dotted", size=1) +
ggtitle("Cluster 1") +
scale_y_continuous(n.breaks=3) +
scale_x_continuous(n.breaks=3) +
xlab("Time") + ylab("Depth") +
theme_classic()+
theme(axis.text.x=element_text(size=17), axis.text.y=element_text(size=17),
axis.title = element_text(size=19),
plot.title = element_text(size=21, face="bold"))
c1
c2 <- ggplot(mean_cl2, aes(x=time, y=NA_col)) +
geom_line(color="red", size=2)+
geom_line(aes(y=q_2_5),linetype="dotted", size=1) +
geom_line(aes(y=q_25),linetype="dotted", size=1) +
geom_line(aes(y=q_75),linetype="dotted", size=1) +
geom_line(aes(y=q_97_5),linetype="dotted", size=1) +
ggtitle("Cluster 2") +
scale_y_continuous(n.breaks=3) +
scale_x_continuous(n.breaks=3) +
xlab("Time") + ylab("Depth") +
theme_classic()+
theme(axis.text.x=element_text(size=17), axis.text.y=element_text(size=17),
axis.title = element_text(size=19),
plot.title = element_text(size=21, face="bold"))
c2
c3 <- ggplot(mean_cl3, aes(x=time, y=NA_col)) +
geom_line(color="red", size=2)+
geom_line(aes(y=q_2_5),linetype="dotted", size=1) +
geom_line(aes(y=q_25),linetype="dotted", size=1) +
geom_line(aes(y=q_75),linetype="dotted", size=1) +
geom_line(aes(y=q_97_5),linetype="dotted", size=1) +
ggtitle("Cluster 3") +
scale_y_continuous(n.breaks=3) +
scale_x_continuous(n.breaks=3) +
xlab("Time") + ylab("Depth") +
theme_classic()+
theme(axis.text.x=element_text(size=17), axis.text.y=element_text(size=17),
axis.title = element_text(size=19),
plot.title = element_text(size=21, face="bold"))
c3
c4 <- ggplot(mean_cl4, aes(x=time, y=NA_col)) +
geom_line(color="red", size=2)+
geom_line(aes(y=q_2_5),linetype="dotted", size=1) +
geom_line(aes(y=q_25),linetype="dotted", size=1) +
geom_line(aes(y=q_75),linetype="dotted", size=1) +
geom_line(aes(y=q_97_5),linetype="dotted", size=1) +
ggtitle("Cluster 4") +
scale_y_continuous(n.breaks=3) +
scale_x_continuous(n.breaks=3) +
xlab("Time") + ylab("Depth") +
theme_classic()+
theme(axis.text.x=element_text(size=17), axis.text.y=element_text(size=17),
axis.title = element_text(size=19),
plot.title = element_text(size=21, face="bold"))
c4
c5 <- ggplot(mean_cl5, aes(x=time, y=NA_col)) +
geom_line(color="red", size=2)+
geom_line(aes(y=q_2_5),linetype="dotted", size=1) +
geom_line(aes(y=q_25),linetype="dotted", size=1) +
geom_line(aes(y=q_75),linetype="dotted", size=1) +
geom_line(aes(y=q_97_5),linetype="dotted", size=1) +
ggtitle("Cluster 5") +
scale_y_continuous(n.breaks=3) +
scale_x_continuous(n.breaks=3) +
xlab("Time") + ylab("Depth") +
theme_classic()+
theme(axis.text.x=element_text(size=17), axis.text.y=element_text(size=17),
axis.title = element_text(size=19),
plot.title = element_text(size=21, face="bold"))
c5
legend <- get_legend( c1 )
prow <- plot_grid(
c1 + theme(legend.position="none"),
c2 + theme(legend.position="none"),
c3 + theme(legend.position="none"),
c4 + theme(legend.position="none"),
c5 + theme(legend.position="none"),
labels = c("A", "B", "C", "D", "E"),
nrow = 2
)
prow
To assess the seals' behaviour within each dive group, further analyses are conducted in MATLAB. The results file, containing the cluster number of each dive and the dive index for each seal, can be saved in .csv format.
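A minimal sketch of that export (the file name is hypothetical; a per-seal identifier would be added from the deployment metadata):
# save the dive index, assigned cluster and uncertainty for the MATLAB analysis
write.csv(data.frame(dive = as.numeric(row.names(results)),
                     cluster = results$cluster,
                     uncertainty = results$uncertainty),
          file = "dive_clusters.csv", row.names = FALSE)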
Godard M., Manté C., Guinet C., Picard B., Nerini D. 2020. ‘Diving Behavior of Mirounga leonina: A Functional Data Analysis Approach.’ Frontiers in Marine Science 7: 595.
Kassambara A., Mundt F. 2020. ‘factoextra: Extract and Visualize the Results of Multivariate Data Analyses.’ R package version 1.0.7.
Maechler M., Rousseeuw P., Struyf A., Hubert M., Hornik K. 2019. ‘cluster: Cluster Analysis Basics and Extensions.’ R package version 2.1.0.
Scrucca L. 2010. ‘Dimension reduction for model-based clustering.’ Statistics and Computing 20: 471–484.
Scrucca L., Fop M., Murphy T.B., Raftery A.E. 2016. ‘mclust 5: clustering, classification and density estimation using Gaussian finite mixture models.’ The R Journal 8(1): 289–317.