\pagenumbering{arabic}
Before applying fPCAs to the data, the 854 recorded dives must be visualised to make sure there is no sensor error or unexpected data. For obvious reasons, the 854 dive profiles will not be provided here. A sample of 4 dives is presented below. These dive profiles illustrate the variety of dive types in the studied dataset, with duration, depth, skewness and overall shape differing among dives.
Following Godard et al., 2019 (in Front. Mar. Sc.), fPCAs are performed on normalised dives. By reducing depth and duration to a 0 to 1 scale, dive structures are made comparable. The code is shown below.
## Normalise each dive to the unit square and fit it with a constrained
## B-spline basis on [0, 1].
## NOTE(review): `plong` (list of dive matrices, col 1 = time, col 2 = depth)
## and `nplong` (number of dives, 854) are assumed to be defined earlier in
## the document -- confirm against the preceding chunks.
nbcoef <- 50
myb <- create.bspline.basis(c(0, 1), nbcoef)
## Order-2 roughness penalty, bordered with two zero rows/columns to match
## the two endpoint constraints added to the normal equations below
penMat <- eval.penalty(myb, 2)
penMat <- cbind(penMat, 0, 0)
penMat <- rbind(penMat, 0, 0)
X.fd <- fd(coef = NULL, basisobj = myb)
max_depth <- NULL
duree <- NULL
for (k in seq_len(nplong)) {
  cat("k=", k, fill = TRUE)
  ## Rescale time to [0, 1]; store the raw duration of each dive
  tps <- plong[[k]][, 1]
  duree <- c(duree, max(tps) - min(tps))
  tps <- 1 / (max(tps) - min(tps)) * (tps - min(tps))
  ## Rescale depth to [0, 1]; store the raw maximum depth
  ## (depths are negative, so min(prof) is the deepest point)
  prof <- plong[[k]][, 2]
  max_depth <- c(max_depth, min(prof))
  prof <- -prof / min(prof)
  ### Compute the spline coefficients under constraints
  ## k1, k1b: basis values at t = 0 and t = 1, used to force the fitted
  ## curve through the surface at the start and end of the dive
  k1 <- bsplineS(0, breaks = c(0, myb$params, 1), nderiv = 0)
  k1 <- t(k1)
  k1b <- bsplineS(1, breaks = c(0, myb$params, 1), nderiv = 0)
  k1b <- t(k1b)
  X <- eval.basis(tps, myb)
  ## Bordered normal equations (Lagrange multipliers for the two constraints)
  Mat <- rbind(cbind(t(X) %*% X, k1, k1b), cbind(t(k1), 0, 0), cbind(t(k1b), 0, 0))
  ### Keep the spline coefficients, here KAPPA=1e-06
  achap <- solve(Mat + 0.000001 * penMat) %*% rbind(t(X) %*% prof, 0, 0)
  X.fd$coefs <- cbind(X.fd$coefs, as.matrix(achap[1:nbcoef]))
}
## k= 1
## k= 2
## k= 3
## k= 4
## k= 5
## k= 6
## k= 7
## k= 8
## k= 9
## k= 10
## k= 11
## k= 12
## k= 13
## k= 14
## k= 15
## k= 16
## k= 17
## k= 18
## k= 19
## k= 20
## k= 21
## k= 22
## k= 23
## k= 24
## k= 25
## k= 26
## k= 27
## k= 28
## k= 29
## k= 30
## k= 31
## k= 32
## k= 33
## k= 34
## k= 35
## k= 36
## k= 37
## k= 38
## k= 39
## k= 40
## k= 41
## k= 42
## k= 43
## k= 44
## k= 45
## k= 46
## k= 47
## k= 48
## k= 49
## k= 50
## k= 51
## k= 52
## k= 53
## k= 54
## k= 55
## k= 56
## k= 57
## k= 58
## k= 59
## k= 60
## k= 61
## k= 62
## k= 63
## k= 64
## k= 65
## k= 66
## k= 67
## k= 68
## k= 69
## k= 70
## k= 71
## k= 72
## k= 73
## k= 74
## k= 75
## k= 76
## k= 77
## k= 78
## k= 79
## k= 80
## k= 81
## k= 82
## k= 83
## k= 84
## k= 85
## k= 86
## k= 87
## k= 88
## k= 89
## k= 90
## k= 91
## k= 92
## k= 93
## k= 94
## k= 95
## k= 96
## k= 97
## k= 98
## k= 99
## k= 100
## k= 101
## k= 102
## k= 103
## k= 104
## k= 105
## k= 106
## k= 107
## k= 108
## k= 109
## k= 110
## k= 111
## k= 112
## k= 113
## k= 114
## k= 115
## k= 116
## k= 117
## k= 118
## k= 119
## k= 120
## k= 121
## k= 122
## k= 123
## k= 124
## k= 125
## k= 126
## k= 127
## k= 128
## k= 129
## k= 130
## k= 131
## k= 132
## k= 133
## k= 134
## k= 135
## k= 136
## k= 137
## k= 138
## k= 139
## k= 140
## k= 141
## k= 142
## k= 143
## k= 144
## k= 145
## k= 146
## k= 147
## k= 148
## k= 149
## k= 150
## k= 151
## k= 152
## k= 153
## k= 154
## k= 155
## k= 156
## k= 157
## k= 158
## k= 159
## k= 160
## k= 161
## k= 162
## k= 163
## k= 164
## k= 165
## k= 166
## k= 167
## k= 168
## k= 169
## k= 170
## k= 171
## k= 172
## k= 173
## k= 174
## k= 175
## k= 176
## k= 177
## k= 178
## k= 179
## k= 180
## k= 181
## k= 182
## k= 183
## k= 184
## k= 185
## k= 186
## k= 187
## k= 188
## k= 189
## k= 190
## k= 191
## k= 192
## k= 193
## k= 194
## k= 195
## k= 196
## k= 197
## k= 198
## k= 199
## k= 200
## k= 201
## k= 202
## k= 203
## k= 204
## k= 205
## k= 206
## k= 207
## k= 208
## k= 209
## k= 210
## k= 211
## k= 212
## k= 213
## k= 214
## k= 215
## k= 216
## k= 217
## k= 218
## k= 219
## k= 220
## k= 221
## k= 222
## k= 223
## k= 224
## k= 225
## k= 226
## k= 227
## k= 228
## k= 229
## k= 230
## k= 231
## k= 232
## k= 233
## k= 234
## k= 235
## k= 236
## k= 237
## k= 238
## k= 239
## k= 240
## k= 241
## k= 242
## k= 243
## k= 244
## k= 245
## k= 246
## k= 247
## k= 248
## k= 249
## k= 250
## k= 251
## k= 252
## k= 253
## k= 254
## k= 255
## k= 256
## k= 257
## k= 258
## k= 259
## k= 260
## k= 261
## k= 262
## k= 263
## k= 264
## k= 265
## k= 266
## k= 267
## k= 268
## k= 269
## k= 270
## k= 271
## k= 272
## k= 273
## k= 274
## k= 275
## k= 276
## k= 277
## k= 278
## k= 279
## k= 280
## k= 281
## k= 282
## k= 283
## k= 284
## k= 285
## k= 286
## k= 287
## k= 288
## k= 289
## k= 290
## k= 291
## k= 292
## k= 293
## k= 294
## k= 295
## k= 296
## k= 297
## k= 298
## k= 299
## k= 300
## k= 301
## k= 302
## k= 303
## k= 304
## k= 305
## k= 306
## k= 307
## k= 308
## k= 309
## k= 310
## k= 311
## k= 312
## k= 313
## k= 314
## k= 315
## k= 316
## k= 317
## k= 318
## k= 319
## k= 320
## k= 321
## k= 322
## k= 323
## k= 324
## k= 325
## k= 326
## k= 327
## k= 328
## k= 329
## k= 330
## k= 331
## k= 332
## k= 333
## k= 334
## k= 335
## k= 336
## k= 337
## k= 338
## k= 339
## k= 340
## k= 341
## k= 342
## k= 343
## k= 344
## k= 345
## k= 346
## k= 347
## k= 348
## k= 349
## k= 350
## k= 351
## k= 352
## k= 353
## k= 354
## k= 355
## k= 356
## k= 357
## k= 358
## k= 359
## k= 360
## k= 361
## k= 362
## k= 363
## k= 364
## k= 365
## k= 366
## k= 367
## k= 368
## k= 369
## k= 370
## k= 371
## k= 372
## k= 373
## k= 374
## k= 375
## k= 376
## k= 377
## k= 378
## k= 379
## k= 380
## k= 381
## k= 382
## k= 383
## k= 384
## k= 385
## k= 386
## k= 387
## k= 388
## k= 389
## k= 390
## k= 391
## k= 392
## k= 393
## k= 394
## k= 395
## k= 396
## k= 397
## k= 398
## k= 399
## k= 400
## k= 401
## k= 402
## k= 403
## k= 404
## k= 405
## k= 406
## k= 407
## k= 408
## k= 409
## k= 410
## k= 411
## k= 412
## k= 413
## k= 414
## k= 415
## k= 416
## k= 417
## k= 418
## k= 419
## k= 420
## k= 421
## k= 422
## k= 423
## k= 424
## k= 425
## k= 426
## k= 427
## k= 428
## k= 429
## k= 430
## k= 431
## k= 432
## k= 433
## k= 434
## k= 435
## k= 436
## k= 437
## k= 438
## k= 439
## k= 440
## k= 441
## k= 442
## k= 443
## k= 444
## k= 445
## k= 446
## k= 447
## k= 448
## k= 449
## k= 450
## k= 451
## k= 452
## k= 453
## k= 454
## k= 455
## k= 456
## k= 457
## k= 458
## k= 459
## k= 460
## k= 461
## k= 462
## k= 463
## k= 464
## k= 465
## k= 466
## k= 467
## k= 468
## k= 469
## k= 470
## k= 471
## k= 472
## k= 473
## k= 474
## k= 475
## k= 476
## k= 477
## k= 478
## k= 479
## k= 480
## k= 481
## k= 482
## k= 483
## k= 484
## k= 485
## k= 486
## k= 487
## k= 488
## k= 489
## k= 490
## k= 491
## k= 492
## k= 493
## k= 494
## k= 495
## k= 496
## k= 497
## k= 498
## k= 499
## k= 500
## k= 501
## k= 502
## k= 503
## k= 504
## k= 505
## k= 506
## k= 507
## k= 508
## k= 509
## k= 510
## k= 511
## k= 512
## k= 513
## k= 514
## k= 515
## k= 516
## k= 517
## k= 518
## k= 519
## k= 520
## k= 521
## k= 522
## k= 523
## k= 524
## k= 525
## k= 526
## k= 527
## k= 528
## k= 529
## k= 530
## k= 531
## k= 532
## k= 533
## k= 534
## k= 535
## k= 536
## k= 537
## k= 538
## k= 539
## k= 540
## k= 541
## k= 542
## k= 543
## k= 544
## k= 545
## k= 546
## k= 547
## k= 548
## k= 549
## k= 550
## k= 551
## k= 552
## k= 553
## k= 554
## k= 555
## k= 556
## k= 557
## k= 558
## k= 559
## k= 560
## k= 561
## k= 562
## k= 563
## k= 564
## k= 565
## k= 566
## k= 567
## k= 568
## k= 569
## k= 570
## k= 571
## k= 572
## k= 573
## k= 574
## k= 575
## k= 576
## k= 577
## k= 578
## k= 579
## k= 580
## k= 581
## k= 582
## k= 583
## k= 584
## k= 585
## k= 586
## k= 587
## k= 588
## k= 589
## k= 590
## k= 591
## k= 592
## k= 593
## k= 594
## k= 595
## k= 596
## k= 597
## k= 598
## k= 599
## k= 600
## k= 601
## k= 602
## k= 603
## k= 604
## k= 605
## k= 606
## k= 607
## k= 608
## k= 609
## k= 610
## k= 611
## k= 612
## k= 613
## k= 614
## k= 615
## k= 616
## k= 617
## k= 618
## k= 619
## k= 620
## k= 621
## k= 622
## k= 623
## k= 624
## k= 625
## k= 626
## k= 627
## k= 628
## k= 629
## k= 630
## k= 631
## k= 632
## k= 633
## k= 634
## k= 635
## k= 636
## k= 637
## k= 638
## k= 639
## k= 640
## k= 641
## k= 642
## k= 643
## k= 644
## k= 645
## k= 646
## k= 647
## k= 648
## k= 649
## k= 650
## k= 651
## k= 652
## k= 653
## k= 654
## k= 655
## k= 656
## k= 657
## k= 658
## k= 659
## k= 660
## k= 661
## k= 662
## k= 663
## k= 664
## k= 665
## k= 666
## k= 667
## k= 668
## k= 669
## k= 670
## k= 671
## k= 672
## k= 673
## k= 674
## k= 675
## k= 676
## k= 677
## k= 678
## k= 679
## k= 680
## k= 681
## k= 682
## k= 683
## k= 684
## k= 685
## k= 686
## k= 687
## k= 688
## k= 689
## k= 690
## k= 691
## k= 692
## k= 693
## k= 694
## k= 695
## k= 696
## k= 697
## k= 698
## k= 699
## k= 700
## k= 701
## k= 702
## k= 703
## k= 704
## k= 705
## k= 706
## k= 707
## k= 708
## k= 709
## k= 710
## k= 711
## k= 712
## k= 713
## k= 714
## k= 715
## k= 716
## k= 717
## k= 718
## k= 719
## k= 720
## k= 721
## k= 722
## k= 723
## k= 724
## k= 725
## k= 726
## k= 727
## k= 728
## k= 729
## k= 730
## k= 731
## k= 732
## k= 733
## k= 734
## k= 735
## k= 736
## k= 737
## k= 738
## k= 739
## k= 740
## k= 741
## k= 742
## k= 743
## k= 744
## k= 745
## k= 746
## k= 747
## k= 748
## k= 749
## k= 750
## k= 751
## k= 752
## k= 753
## k= 754
## k= 755
## k= 756
## k= 757
## k= 758
## k= 759
## k= 760
## k= 761
## k= 762
## k= 763
## k= 764
## k= 765
## k= 766
## k= 767
## k= 768
## k= 769
## k= 770
## k= 771
## k= 772
## k= 773
## k= 774
## k= 775
## k= 776
## k= 777
## k= 778
## k= 779
## k= 780
## k= 781
## k= 782
## k= 783
## k= 784
## k= 785
## k= 786
## k= 787
## k= 788
## k= 789
## k= 790
## k= 791
## k= 792
## k= 793
## k= 794
## k= 795
## k= 796
## k= 797
## k= 798
## k= 799
## k= 800
## k= 801
## k= 802
## k= 803
## k= 804
## k= 805
## k= 806
## k= 807
## k= 808
## k= 809
## k= 810
## k= 811
## k= 812
## k= 813
## k= 814
## k= 815
## k= 816
## k= 817
## k= 818
## k= 819
## k= 820
## k= 821
## k= 822
## k= 823
## k= 824
## k= 825
## k= 826
## k= 827
## k= 828
## k= 829
## k= 830
## k= 831
## k= 832
## k= 833
## k= 834
## k= 835
## k= 836
## k= 837
## k= 838
## k= 839
## k= 840
## k= 841
## k= 842
## k= 843
## k= 844
## k= 845
## k= 846
## k= 847
## k= 848
## k= 849
## k= 850
## k= 851
## k= 852
## k= 853
## k= 854
### Keep the coefficients
## Drop the initial placeholder column so only the fitted dives remain
X.fd$coefs <- X.fd$coefs[, -1]
# plot one normalised dive for visualisation
plot(X.fd[68, ])
# png(file="/Volumes/SARA_WORK/WS_data_DDU2019_SonarTag/data/lw19/output_files/figures/3D_dive_plot_all_dives/dives_not_classified_in_clusters/normalized/314a/dive_nb_3.png",width=600, height=600)
Functional PCAs are then performed.
## Functional PCA on the normalised dives, keeping 3 harmonics
xpca.fd <- pca.fd(X.fd, 3)
Three dive shapes are found, with each archetypal profile shown below.
The following plot is used to determine the number of components to keep for classification. It appears that 5 dimensions are necessary here.
To build clusters according to the shape of each dive, several methods are tested and compared. The robustness of clusters is tested by estimating consistency of groups created following each method.
Three methods were used:
Model based clustering, where classification and density are estimated based on finite Gaussian mixture modeling,
K means clustering, which assigns values to the cluster whose centroid is closest using Euclidean distance,
Hierarchical clustering, that calculates the pairwise dissimilarity between each observation in the dataset and fuses observations into clusters through a classification tree (dendrogram).
The data matrix is provided to the function Mclust
to
perform the clustering. The data matrix must be standardised. A summary
displaying information on the best model is available, giving sample
size and overall probability of belonging to each cluster. Estimation of
the suitability of models is performed using the Bayesian information
criterion (BIC hereafter).
The best suited model, with the highest BIC, comprises 5 components. Group volume and shape can vary, while orientation is equal across groups (VVE model, as shown in the output below).
The corresponding code and its evaluation is displayed below.
## Fit a Gaussian finite mixture model to the fPCA scores; model form and
## number of components are selected internally by BIC
mod1 <- Mclust(xpca.fd[["scores"]])
summary(mod1, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 5 components:
##
## log-likelihood n df BIC ICL
## 2148.217 854 37 4046.686 3694.369
##
## Clustering table:
## 1 2 3 4 5
## 75 205 133 256 185
##
## Mixing probabilities:
## 1 2 3 4 5
## 0.07898608 0.24553603 0.16480201 0.27891470 0.23176118
##
## Means:
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.19172360 -0.002713458 -0.06714429 -0.03480885 0.15785198
## [2,] -0.08748832 0.067796446 0.11319588 -0.08389994 -0.02153118
## [3,] -0.07574470 0.109994251 -0.04912812 0.02487256 -0.08571618
##
## Variances:
## [,,1]
## [,1] [,2] [,3]
## [1,] 1.005868e-03 -0.0001753370 -4.015381e-05
## [2,] -1.753370e-04 0.0008215416 1.301208e-04
## [3,] -4.015381e-05 0.0001301208 1.233113e-03
## [,,2]
## [,1] [,2] [,3]
## [1,] 7.146416e-03 -3.453914e-05 0.001846948
## [2,] -3.453914e-05 6.098091e-03 -0.001566044
## [3,] 1.846948e-03 -1.566044e-03 0.003644108
## [,,3]
## [,1] [,2] [,3]
## [1,] 0.004820275 0.0046402045 -0.0022437787
## [2,] 0.004640204 0.0115023659 -0.0006062919
## [3,] -0.002243779 -0.0006062919 0.0051292707
## [,,4]
## [,1] [,2] [,3]
## [1,] 0.008670377 -0.0031691751 0.0028940728
## [2,] -0.003169175 0.0033637318 -0.0007543304
## [3,] 0.002894073 -0.0007543304 0.0058555057
## [,,5]
## [,1] [,2] [,3]
## [1,] 1.541464e-02 7.292807e-05 0.003340142
## [2,] 7.292807e-05 1.367803e-02 -0.002906007
## [3,] 3.340142e-03 -2.906007e-03 0.008964650
Pairwise dimension clustering can be plotted, to illustrate classification.
# Pairwise scatterplots of the fitted mixture classification
plot(mod1, what = "classification")
Each model’s BIC can be plotted, depending on the number of components chosen. The following plot illustrates the necessity of a 5-dimension mixture.
The BIC summary presents BIC value for the top three models, for complementary information.
The ICL criterion can also be used, to balance bias induced by BIC that approximates the density rather than the number of clusters as such. Here, it might not be relevant to resort to the ICL criterion since it is efficient only in cases where there is very little overlap.
## BIC over candidate models / numbers of components
BIC <- mclustBIC(xpca.fd[["scores"]])
plot(BIC) # 5-component mixture; VVE model has the top BIC value (see summary)
summary(BIC) # summary of top 3 models
## Best BIC values:
## VVE,5 VVE,6 VEV,5
## BIC 4046.686 4031.03696 4013.66700
## BIC diff 0.000 -15.64927 -33.01922
Scrucca et al., 2010 (in Statistics and computing)
suggests reducing dimensions to project clusters into a suitable
subspace, in order to visualise clustering structures and geometric
characteristics. The reduction is performed using MclustDR
function. The estimated directions are defined as a set of linear
combinations of the original features, ordered by importance. The
ordering is carried out through the quantification of associated
eigenvalues.
The summary
function displays information on resulting
dimensions. Eigenvalue of each remaining dimension is plotted, showing
the classification performed with Mclust
function.
## Project the mixture model onto a reduced subspace ordered by eigenvalue
drmod <- MclustDR(mod1, lambda = 1) # reduction of dimensions
summary(drmod)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: Mclust (VVE, 5)
##
## Clusters n
## 1 75
## 2 205
## 3 133
## 4 256
## 5 185
##
## Estimated basis vectors:
## Dir1 Dir2 Dir3
## [1,] -0.55279 -0.58669 0.34244
## [2,] 0.31787 -0.74461 -0.62080
## [3,] 0.77032 -0.31836 0.70523
##
## Dir1 Dir2 Dir3
## Eigenvalues 0.88063 0.46734 0.21636
## Cum. % 56.29455 86.16902 100.00000
# Eigenvalues of each estimated direction
plot(drmod, what = "evalues")
Several exploratory plots can be obtained. They are different representations of the same information. Each color represents a cluster.
Pairwise representation of the classification, for dimensions 1 to 4:
# Pairwise scatterplots of the reduced dimensions, coloured by cluster
plot(drmod, what = "pairs")
Cluster scatterplot with contour depending on distance to centroid, for dimensions 1 and 2:
# Density contours around each cluster centroid (directions 1 and 2)
plot(drmod, what = "contour")
Cluster limits according to the model, for dimensions 1 and 2:
# Model-implied cluster boundaries in directions 1 and 2
plot(drmod, what = "classification")
Uncertainty zones (grey) according to the model, where adjacent points belong to different clusters, for dimensions 1 and 2:
# Grey uncertainty bands between neighbouring clusters (directions 1 and 2)
plot(drmod, what = "boundaries")
Distribution of observations on each dimension axis can be plotted, to compare cluster positions in the four dimensions subspace.
# Per-cluster densities along each reduced dimension axis
plot(drmod, dimens = 1, what = "density")
plot(drmod, dimens = 2, what = "density")
plot(drmod, dimens = 3, what = "density")
Resulting clusters can be extracted from the model. Assessing probability of each data point to belong to its cluster is needed to estimate robustness of cluster predictions.
In the following table, the first three columns represent the position of data points on each dimension axis. The last two columns (in bold) represent the cluster number to which the given dive belongs, and the probability that the said dive does belong to this cluster. Only the first 6 rows are presented.
## Per-dive cluster assignment and classification uncertainty from the
## fitted mixture model, joined to the fPCA scores
cluster <- mod1$classification
uncertainty <- mod1$uncertainty
clust_results <- data.frame(xpca.fd[["scores"]], cluster)
results <- data.frame(clust_results, uncertainty)
kable(head(results)) %>%
  kable_styling("striped") %>%
  column_spec(4:5, bold = TRUE) %>%
  add_header_above(c("Dimension axis" = 3, "Clustering results" = 2))
X1 | X2 | X3 | cluster | uncertainty |
---|---|---|---|---|
-0.1724743 | -0.1123871 | -0.0563770 | 1 | 0.0851807 |
-0.0589972 | 0.1269479 | 0.0889921 | 2 | 0.0727001 |
-0.1879304 | -0.0415901 | -0.0348726 | 1 | 0.4344803 |
0.1187469 | -0.0105761 | 0.1141133 | 2 | 0.1234015 |
0.0646416 | 0.0971359 | 0.1943154 | 2 | 0.0004626 |
0.0869299 | 0.0913429 | 0.2200825 | 2 | 0.0004977 |
Several threshold can be tested. Only dives with uncertainty below a given threshold are considered to belong to the corresponding cluster. For every tested threshold, the resulting number of classified dives is compared.
The maximum uncertainty of a dive belonging to the cluster it was classified in is 0.6490577.
The different thresholds tested are:
## Keep only dives whose classification uncertainty is at or below each
## tested threshold. a1 is the maximum observed uncertainty (defined
## earlier), so results_thr1 retains every dive.
results_thr1 <- results[!(results$uncertainty > a1), ]
results_thr2 <- results[!(results$uncertainty > 0.5), ]
results_thr3 <- results[!(results$uncertainty > 0.4), ]
results_thr4 <- results[!(results$uncertainty > 0.3), ]
results_thr5 <- results[!(results$uncertainty > 0.2), ]
results_thr6 <- results[!(results$uncertainty > 0.1), ]
results_thr7 <- results[!(results$uncertainty > 0.05), ]
results_thr8 <- results[!(results$uncertainty > 0.04), ]
results_thr9 <- results[!(results$uncertainty > 0.03), ]
results_thr10 <- results[!(results$uncertainty > 0.02), ]
results_thr11 <- results[!(results$uncertainty > 0.01), ]
## Total number of dives retained at each threshold
thr_summary <- data.frame(var1 = c(a1, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.04, 0.03, 0.02, 0.01),
                          var2 = c(nrow(results_thr1), nrow(results_thr2), nrow(results_thr3),
                                   nrow(results_thr4), nrow(results_thr5), nrow(results_thr6),
                                   nrow(results_thr7), nrow(results_thr8), nrow(results_thr9),
                                   nrow(results_thr10), nrow(results_thr11)))
## For each threshold, count how many dives of each cluster are retained
a2 <- c(a1, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.04, 0.03, 0.02, 0.01)
clust1 <- numeric()
clust2 <- numeric()
clust3 <- numeric()
clust4 <- numeric()
clust5 <- numeric()
for (j in seq_along(a2)) {
  k <- a2[j]
  ## NOTE(review): strict "<" here, whereas the results_thr* filters use
  ## "<=" (via !(> thr)) -- this excludes the dive at exactly a1, which is
  ## why the first row of the table below shows 255 (not 256) for cluster 4
  x1 <- which(results$cluster == 1 & results$uncertainty < k)
  x2 <- which(results$cluster == 2 & results$uncertainty < k)
  x3 <- which(results$cluster == 3 & results$uncertainty < k)
  x4 <- which(results$cluster == 4 & results$uncertainty < k)
  x5 <- which(results$cluster == 5 & results$uncertainty < k)
  clust1 <- rbind(clust1, length(x1))
  clust2 <- rbind(clust2, length(x2))
  clust3 <- rbind(clust3, length(x3))
  clust4 <- rbind(clust4, length(x4))
  clust5 <- rbind(clust5, length(x5))
}
thr_summary <- cbind(thr_summary, clust1, clust2, clust3, clust4, clust5)
names(thr_summary) <- c("Threshold value", "Total number of dives", "Cluster 1",
                        "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5")
kable(thr_summary, align = 'c') %>%
  kable_styling("striped") %>%
  add_header_above(c(" " = 2, "Number of dives per cluster" = 5))
Threshold value | Total number of dives | Cluster 1 | Cluster 2 | Cluster 3 | Cluster 4 | Cluster 5 |
---|---|---|---|---|---|---|
0.6490577 | 854 | 75 | 205 | 133 | 255 | 185 |
0.5000000 | 832 | 73 | 200 | 130 | 247 | 182 |
0.4000000 | 741 | 67 | 181 | 109 | 214 | 170 |
0.3000000 | 656 | 62 | 168 | 84 | 177 | 165 |
0.2000000 | 545 | 52 | 146 | 59 | 138 | 150 |
0.1000000 | 411 | 38 | 114 | 37 | 84 | 138 |
0.0500000 | 294 | 17 | 93 | 23 | 37 | 124 |
0.0400000 | 258 | 14 | 86 | 15 | 26 | 117 |
0.0300000 | 210 | 4 | 79 | 7 | 10 | 110 |
0.0200000 | 171 | 0 | 66 | 3 | 0 | 102 |
0.0100000 | 143 | 0 | 50 | 0 | 0 | 93 |
To perform K means clustering, the data matrix must be normalised. It is therefore necessary to reduce and center the data frame.
## Centre and scale the fPCA scores before K-means
df <- scale(xpca.fd[["scores"]])
head(df)
## [,1] [,2] [,3]
## [1,] -1.2721708 -0.96809969 -0.5303446
## [2,] -0.4351637 1.09352552 0.8371591
## [3,] -1.3861756 -0.35825638 -0.3280506
## [4,] 0.8758778 -0.09110264 1.0734769
## [5,] 0.4767966 0.83672626 1.8279467
## [6,] 0.6411954 0.78682553 2.0703413
Contrary to the usual method, the number of clusters will not be computed
through fviz_nbclust
function, nor by comparing the number
of clusters to the gap statistic (using clusGap
function).
As the aim of using K means clustering here is to compare robustness of
clusters resulting from model-based clustering, the same number of
dimensions must be used.
In the following analysis, a fixed number of clusters is provided to
the function (5 clusters). The nstart
parameter is fixed
at 150, to ensure stability of the model.
As shown in cluster plots examples, clusters overlay each other a lot, therefore results must be taken carefully.
## K-means with 5 clusters; nstart = 150 random starts for stability
km <- kmeans(df, centers = 5, nstart = 150)
km
## K-means clustering with 5 clusters of sizes 127, 108, 218, 249, 152
##
## Cluster means:
## [,1] [,2] [,3]
## 1 -0.12993060 1.5998422 -0.2185933
## 2 1.27474350 0.2305231 -1.3047782
## 3 -0.02999682 0.2459167 1.1302084
## 4 -0.98619837 -0.4442591 -0.4587578
## 5 0.86139204 -1.1254327 0.2402807
##
## Clustering vector:
## [1] 4 3 4 3 3 3 5 3 3 4 4 4 3 2 2 3 4 5 1 2 5 4 3 5 3 5 5 1 5 3 5 1 2 5 4 1 3
## [38] 3 3 2 1 4 4 1 4 3 1 4 5 5 1 4 4 5 2 4 3 5 1 3 3 4 4 1 4 3 4 4 4 4 3 5 1 3
## [75] 1 5 5 1 3 3 5 3 1 5 4 1 5 3 5 3 4 4 5 4 1 4 4 3 1 4 1 5 5 5 3 2 3 2 4 4 3
## [112] 3 1 3 3 3 1 3 3 1 1 4 5 2 3 4 5 3 2 4 3 1 2 4 5 4 3 5 2 4 3 3 1 1 2 5 3 5
## [149] 4 4 3 4 4 3 5 3 3 3 1 3 1 2 3 1 4 3 1 3 5 3 5 4 3 4 2 1 5 4 4 3 2 1 2 2 3
## [186] 4 3 4 1 3 3 4 4 4 2 4 4 4 1 1 2 3 4 4 4 3 4 5 3 2 5 3 5 3 5 5 3 3 1 4 3 4
## [223] 4 5 3 1 1 5 5 5 3 3 3 3 2 5 5 3 1 4 3 4 3 4 5 4 5 2 5 4 4 1 5 1 5 5 2 4 2
## [260] 3 1 5 5 4 5 2 3 2 3 4 3 2 3 2 5 2 5 3 3 5 3 3 1 3 4 5 1 3 3 3 5 4 4 1 4 4
## [297] 4 3 5 4 1 4 3 1 2 5 2 4 2 1 2 1 4 3 2 3 3 5 4 3 2 4 5 5 2 5 1 1 3 2 4 1 4
## [334] 5 4 5 4 3 4 3 4 4 3 2 3 1 3 1 4 3 3 4 1 4 2 4 4 4 5 5 3 1 2 3 2 3 4 4 3 3
## [371] 3 4 1 2 4 2 3 3 1 2 4 5 5 2 4 4 3 4 4 4 3 2 4 2 3 4 3 4 3 4 2 4 2 4 5 3 5
## [408] 4 2 1 5 2 4 3 5 4 2 4 1 3 5 2 3 5 3 5 5 2 2 4 5 4 3 5 3 2 4 1 3 4 3 2 5 4
## [445] 2 5 4 4 2 4 4 5 4 1 4 3 1 3 1 3 4 4 2 3 5 4 3 1 3 2 4 3 2 4 2 4 4 2 4 4 4
## [482] 5 4 2 4 1 1 1 3 4 3 4 5 1 2 1 4 3 2 1 4 3 2 3 2 3 1 1 4 1 2 3 4 2 4 1 4 4
## [519] 4 2 4 2 4 2 5 4 1 2 3 2 5 4 4 5 1 4 4 2 5 4 2 2 4 4 3 4 3 2 4 2 4 1 4 2 4
## [556] 2 4 5 3 1 3 4 1 1 3 4 5 4 3 5 3 4 5 1 1 1 5 4 4 2 2 4 1 2 1 5 4 3 4 4 4 4
## [593] 4 4 3 3 1 5 5 3 5 1 4 4 3 3 4 4 4 1 2 3 4 1 5 4 1 3 3 5 4 4 5 2 1 4 4 4 1
## [630] 3 5 5 4 4 2 1 2 2 4 5 2 4 4 4 4 1 4 1 3 3 3 3 5 5 4 4 5 3 4 4 2 3 5 5 5 5
## [667] 3 5 3 4 3 3 5 3 3 1 3 4 1 2 4 4 5 5 4 5 4 5 3 5 3 3 2 1 5 5 1 4 5 3 5 3 4
## [704] 4 5 4 5 3 3 3 3 3 2 5 1 1 1 1 5 3 1 1 3 3 3 5 4 2 4 1 2 3 2 2 3 1 1 4 4 1
## [741] 5 1 5 5 3 3 3 1 4 3 1 3 5 4 2 4 3 5 3 5 2 3 3 1 3 3 5 5 3 3 1 3 4 4 4 1 5
## [778] 1 5 4 4 3 4 4 4 4 4 5 4 5 3 4 1 5 4 4 5 5 3 1 1 1 4 5 2 4 5 3 1 5 1 4 1 4
## [815] 3 4 2 2 5 1 4 3 3 3 4 2 5 3 3 4 3 3 4 4 3 4 3 3 1 4 4 3 3 3 5 4 5 2 4 1 3
## [852] 4 1 5
##
## Within cluster sum of squares by cluster:
## [1] 134.2446 194.9681 202.3039 212.8665 166.5970
## (between_SS / total_SS = 64.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
## Cluster plots on component pairs (1,2) and (1,3). The stray trailing
## commas in the original calls passed an empty argument and are removed.
fviz_cluster(km, data = df, ellipse.type = "norm", axes = c(1, 2)) # to plot results
fviz_cluster(km, data = df, ellipse.type = "norm", axes = c(1, 3)) # to plot results
The number of dives per cluster is presented in the following data frame.
## Number of dives assigned to each K-means cluster
kmeans_results <- as.data.frame(t(km$size))
names(kmeans_results) <- c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5")
kmeans_results
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5
## 1 127 108 218 249 152
As in K means clustering, the data matrix must be normalised to perform the analysis.
The first step of the analysis is to define the linkage method. To do so, we look at the agglomerative coefficient of each method, choosing the one that is closest to 1.
## Candidate linkage methods, named so sapply labels its output
m <- c("average", "single", "complete", "ward")
names(m) <- c("average", "single", "complete", "ward")
#function to compute agglomerative coefficient
ac <- function(x) {
  agnes(df, method = x)$ac
}
#calculate agglomerative coefficient for each clustering linkage method
sapply(m, ac) # best: closest to 1 (here Ward)
## average single complete ward
## 0.9352095 0.7961501 0.9589835 0.9923943
Ward’s method seems to be the best suited for the analysis.
Hierarchical clustering is performed using Ward’s minimum variance and a dendrogram is plotted. The dendrogram is then cut into the number of clusters needed, here 5.
## Agglomerative clustering with Ward linkage, then a dendrogram
clust <- agnes(df, method = "ward")
pltree(clust, cex = 0.6, hang = -1, main = "Dendrogram")
#compute distance matrix
d <- dist(df, method = "euclidean")
#perform hierarchical clustering using Ward's method
final_clust <- hclust(d, method = "ward.D2")
#cut the dendrogram into 5 clusters
groups <- cutree(final_clust, k = 5)
The number of dives per cluster is given below.
# Number of dives per hierarchical cluster.
hmeans_results <- table(groups)
names(hmeans_results) <- c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5")
hmeans_results
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5
## 150 153 239 149 163
To estimate robustness of clusters obtained through the different methods used here, a comparison of the intersecting data points among clusters is performed.
Regarding model-based clustering, only the first threshold is tested
to ensure all dives are classified, allowing for comparison with
hierarchical and k means clustering. After computing a distance matrix,
the function cluster.stats
is applied to the three
methods.
Pairwise comparison is applied to all three clustering methods.
Corrected for chance rand index (corrected.rand
) and
variation of information index (VI
) allow for estimating
the similarity of information between the two classifications.
Comparison of model-based and hierarchical clustering:
# fPCA scores used as the data for the cluster comparison statistics.
# NOTE(review): fpc::cluster.stats() expects a dissimilarity (dist) object as
# its first argument; a dist() step may have been lost in extraction — confirm.
data_dist <- xpca.fd[["scores"]]
# Comparison of model-based (mclust, first threshold) and hierarchical clustering.
comp_mh <- cluster.stats(data_dist, results_thr1$cluster, alt.clustering = groups) #comparison mclust & hclust
comp_mh$corrected.rand
## [1] 0.5951515
comp_mh$vi
## [1] 1.19351
Comparison of model-based and K means clustering:
# Comparison of model-based (mclust, first threshold) and k-means clustering.
comp_mk <- cluster.stats(data_dist, results_thr1$cluster, alt.clustering = km$cluster) #comparison mclust & kclust
comp_mk$corrected.rand
## [1] 0.3877119
comp_mk$vi
## [1] 1.476983
Comparison of K means and hierarchical clusterings:
# Comparison of hierarchical and k-means clusterings.
comp_kh <- cluster.stats(data_dist, groups, alt.clustering = km$cluster) #comparison hclust & kclust
comp_kh$corrected.rand
## [1] 0.4719402
comp_kh$vi
## [1] 1.248073
(References used to interpret the comparison results are listed in the bibliography below.)
# Normalised time grid: 50 points on [0, 1], matching the 50 spline coefficients.
time <- seq(0, 1, by = (1/49))

# Summarise one cluster: for each of the 50 spline coefficients, compute the
# mean and the 2.5 / 25 / 75 / 97.5 % quantiles over the dives assigned to that
# cluster. Column names (NA_col, q_2_5, ...) are kept unchanged because the
# ggplot code further down references them.
# The original script repeated this loop five times, growing the data frame
# one row per iteration; consolidated here into a single vectorised helper.
# `results`, `coef_mat` and `grid` default to the objects built earlier in the
# script, but are parameters so the function can be tested in isolation.
summarise_cluster <- function(cluster_id,
                              results = results_thr5,
                              coef_mat = X.fd$coefs,
                              grid = time) {
  # Row names of the subset are the dive indices into the coefficient matrix.
  idx <- as.numeric(row.names(results[results$cluster == cluster_id, ]))
  coefs <- coef_mat[, idx, drop = FALSE] # 50 x (number of dives in cluster)
  data.frame(
    NA_col = rowMeans(coefs),                        # mean profile
    q_2_5  = apply(coefs, 1, quantile, probs = 0.025),
    q_25   = apply(coefs, 1, quantile, probs = 0.25),
    q_75   = apply(coefs, 1, quantile, probs = 0.75),
    q_97_5 = apply(coefs, 1, quantile, probs = 0.975),
    time   = grid
  )
}

# Build the per-cluster summaries (guarded so this chunk can be sourced on its
# own without the upstream mclust / fPCA objects).
if (exists("results_thr5") && exists("X.fd")) {
  mean_cl1 <- summarise_cluster(1)
  plot(mean_cl1$NA_col)

  mean_cl2 <- summarise_cluster(2)
  # NOTE(review): the original script plotted q_25 (not the mean) for cluster 2
  # only; kept as-is — confirm whether NA_col was intended.
  plot(mean_cl2$q_25)

  mean_cl3 <- summarise_cluster(3)
  plot(mean_cl3$NA_col)

  mean_cl4 <- summarise_cluster(4)
  plot(mean_cl4$NA_col)

  mean_cl5 <- summarise_cluster(5)
  plot(mean_cl5$NA_col)
}
Now that we have extracted the mean and quantiles, let’s make graphs!
# One plotting helper for all five clusters (the original repeated this ggplot
# block five times, differing only in data and title): mean profile in red with
# dotted 2.5 / 25 / 75 / 97.5 % quantile envelopes, on normalised time/depth.
plot_cluster <- function(df, title) {
  ggplot(df, aes(x = time, y = NA_col)) +
    geom_line(color = "red", size = 2) +
    geom_line(aes(y = q_2_5), linetype = "dotted", size = 1) +
    geom_line(aes(y = q_25), linetype = "dotted", size = 1) +
    geom_line(aes(y = q_75), linetype = "dotted", size = 1) +
    geom_line(aes(y = q_97_5), linetype = "dotted", size = 1) +
    ggtitle(title) +
    scale_y_continuous(n.breaks = 3) +
    scale_x_continuous(n.breaks = 3) +
    xlab("Time") + ylab("Depth") +
    theme_classic() +
    theme(axis.text.x = element_text(size = 17),
          axis.text.y = element_text(size = 17),
          axis.title = element_text(size = 19),
          plot.title = element_text(size = 21, face = "bold"))
}

c1 <- plot_cluster(mean_cl1, "Cluster 1")
c1
c2 <- plot_cluster(mean_cl2, "Cluster 2")
c2
c3 <- plot_cluster(mean_cl3, "Cluster 3")
c3
c4 <- plot_cluster(mean_cl4, "Cluster 4")
c4
c5 <- plot_cluster(mean_cl5, "Cluster 5")
c5
# Arrange the five cluster plots on a two-row grid (cowplot).
# NOTE(review): the plots have no colour/fill mapping, so get_legend() likely
# returns an empty legend and `legend` is unused below — confirm intent.
legend <- get_legend(c1)
prow <- plot_grid(
  c1 + theme(legend.position = "none"),
  c2 + theme(legend.position = "none"),
  c3 + theme(legend.position = "none"),
  c4 + theme(legend.position = "none"),
  c5 + theme(legend.position = "none"),
  # NOTE(review): only three labels for five panels, so the last two panels
  # stay unlabelled — kept as in the original.
  labels = c("A", "B", "C"),
  nrow = 2
)
prow
# png(file="/Users/adelieantoine/Desktop/clusters.png",
# width=600, height=350)
To assess the seals’ behaviour among dive groups, analyses must be conducted with MATLAB. The result file, containing the cluster number for each dive and the dive index for each seal, can be saved in .csv format.
Godard M, Manté C, Guinet C, Picard B, Nerini D. 2020. ‘Diving Behavior of Mirounga leonina: A Functional Data Analysis Approach.’ Front Mar Sci. 7:595.
Kassambara A., Mundt F.. 2020. ‘factoextra: Extract and Visualize the Results of Multivariate Data Analyses’. R package version 1.0.7.
Maechler M., Rousseeuw P., Struyf A., Hubert M., Hornik K. 2019. ‘cluster: Cluster Analysis Basics and Extensions’. R package version 2.1.0.
Scrucca, L. 2010. ‘Dimension reduction for model-based clustering.’ Stat Comput 20, 471–484.
Scrucca L., Fop M., Murphy T.B., Raftery A.E. 2016. ‘mclust 5: clustering, classification and density estimation using Gaussian finite mixture models.’ The R Journal, 8(1): 289–317.