1. Importer les données
library(HDclassif)
data(crabs)
don <- crabs[ , 4:ncol(crabs)]
summary(don)
FL RW CL CW BD
Min. : 7.20 Min. : 6.50 Min. :14.70 Min. :17.10 Min. : 6.10
1st Qu.:12.90 1st Qu.:11.00 1st Qu.:27.27 1st Qu.:31.50 1st Qu.:11.40
Median :15.55 Median :12.80 Median :32.10 Median :36.80 Median :13.90
Mean :15.58 Mean :12.74 Mean :32.11 Mean :36.41 Mean :14.03
3rd Qu.:18.05 3rd Qu.:14.30 3rd Qu.:37.23 3rd Qu.:42.00 3rd Qu.:16.60
Max. :23.10 Max. :20.20 Max. :47.60 Max. :54.60 Max. :21.60
2 Choix du modèle et du nombre de classes
library(mclust)
res.BIC <- mclustBIC(don, verbose = F)
plot(res.BIC)
summary(res.BIC)
Best BIC values:
EEV,4 VEE,6 VEE,7
BIC -2842.298 -2857.31968 -2863.98088
BIC diff 0.000 -15.02184 -21.68305
3. Construction de la classification
mod <- Mclust(don, x = res.BIC)
summary(mod)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust EEV (ellipsoidal, equal volume and shape) model with 4 components:
log.likelihood n df BIC ICL
-1241.006 200 68 -2842.298 -2854.29
Clustering table:
1 2 3 4
60 55 39 46
table(mod$classification)
1 2 3 4
60 55 39 46
4. Caractériser les classes
don.comp <- cbind.data.frame(don,classe=factor(mod$classification))
library(FactoMineR)
catdes(don.comp, num.var = 6)
Link between the cluster variable and the quantitative variables
================================================================
Eta2 P-value
FL 0.3063318 1.700502e-15
RW 0.2819780 4.806351e-14
BD 0.2791240 7.055350e-14
CL 0.2262530 6.562003e-11
CW 0.2025411 1.198917e-09
Description of each cluster by quantitative variables
=====================================================
$`1`
v.test Mean in category Overall mean sd in category Overall sd p.value
RW -4.173975 11.57833 12.7385 2.587154 2.566898 2.993307e-05
CW -5.864227 31.42833 36.4145 6.924500 7.852251 4.512294e-09
CL -6.513962 27.09667 32.1055 5.998416 7.101163 7.319396e-11
BD -7.219174 11.36000 14.0305 2.758575 3.416200 5.230442e-13
FL -7.322878 12.81833 15.5830 2.668926 3.486576 2.427082e-13
$`2`
v.test Mean in category Overall mean sd in category Overall sd p.value
BD 2.567361 15.04 14.0305 3.542654 3.4162 0.01024758
$`3`
v.test Mean in category Overall mean sd in category Overall sd p.value
CW 3.131766 39.95641 36.4145 6.160036 7.852251 0.001737583
CL 2.611915 34.77692 32.1055 5.366490 7.101163 0.009003657
$`4`
v.test Mean in category Overall mean sd in category Overall sd p.value
RW 7.275734 15.16087 12.7385 2.027901 2.566898 3.445423e-13
FL 5.113948 17.89565 15.5830 2.741545 3.486576 3.154944e-07
BD 4.179923 15.88261 14.0305 2.574567 3.416200 2.916073e-05
CL 3.385706 35.22391 32.1055 5.302919 7.101163 7.099542e-04
CW 3.287811 39.76304 36.4145 5.834133 7.852251 1.009695e-03
Pour aller plus loin
load(url("https://r-stat-sc-donnees.github.io/USPS358.Rdata")) ### charger le jeu de données sur votre ordinateur
image(matrix(t(X[5,]), ncol = 16))
res.hddc <- hddc(X, 3)
model K threshold BIC
1 AKJBKQKDK 3 0.2 -613,317.19
SELECTED: model AKJBKQKDK with 3 clusters.
Selection Criterion: BIC.
table(res.hddc$class,cls)
cls
1 2 3
1 19 2 492
2 43 549 26
3 596 5 24
adjustedRandIndex(res.hddc$class,cls)
[1] 0.8055273
LS0tDQp0aXRsZTogIk1vZMOobGVzIGRlIG3DqWxhbmdlIg0KYXV0aG9yOiAiSHVzc29uIGV0IGFsLiINCmRhdGU6ICIwOS8wOS8yMDE4Ig0Kb3V0cHV0Og0KICBodG1sX25vdGVib29rOg0KICAgIHRvYzogeWVzDQogICAgdG9jX2RlcHRoOiAzDQogICAgdG9jX2Zsb2F0OiB5ZXMNCiAgaHRtbF9kb2N1bWVudDoNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogJzMnDQogICAgdG9jX2Zsb2F0OiB5ZXMNCi0tLQ0KDQpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSwgY2FjaGUgPSBUUlVFKQ0KYGBgDQoNCiMgMS4gSW1wb3J0ZXIgbGVzIGRvbm7DqWVzDQoNCmBgYHtyLG1lc3NhZ2U9RkFMU0Usd2FybmluZz1GQUxTRX0NCmxpYnJhcnkoSERjbGFzc2lmKQ0KZGF0YShjcmFicykNCmRvbiA8LSBjcmFic1sgLCA0Om5jb2woY3JhYnMpXQ0Kc3VtbWFyeShkb24pDQpgYGANCg0KIyAyIENob2l4IGR1IG1vZMOobGUgZXQgZHUgbm9tYnJlIGRlIGNsYXNzZXMNCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShtY2x1c3QpDQpyZXMuQklDIDwtIG1jbHVzdEJJQyhkb24sIHZlcmJvc2UgPSBGKQ0KcGxvdChyZXMuQklDKQ0Kc3VtbWFyeShyZXMuQklDKQ0KYGBgDQojIDMuIENvbnN0cnVjdGlvbiBkZSBsYSBjbGFzc2lmaWNhdGlvbg0KDQpgYGB7cn0NCm1vZCA8LSBNY2x1c3QoZG9uLCB4ID0gcmVzLkJJQykNCnN1bW1hcnkobW9kKQ0KdGFibGUobW9kJGNsYXNzaWZpY2F0aW9uKQ0KYGBgDQoNCiMgNC4gQ2FyYWN0w6lyaXNlciBsZXMgY2xhc3Nlcw0KDQpgYGB7cixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpkb24uY29tcCA8LSBjYmluZC5kYXRhLmZyYW1lKGRvbixjbGFzc2U9ZmFjdG9yKG1vZCRjbGFzc2lmaWNhdGlvbikpDQpgYGANCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShGYWN0b01pbmVSKQ0KY2F0ZGVzKGRvbi5jb21wLCBudW0udmFyID0gNikNCmBgYA0KDQojIFBvdXIgYWxsZXIgcGx1cyBsb2luDQoNCmBgYHtyLG1lc3NhZ2U9RkFMU0Usd2FybmluZz1GQUxTRX0NCmxvYWQodXJsKCJodHRwczovL3Itc3RhdC1zYy1kb25uZWVzLmdpdGh1Yi5pby9VU1BTMzU4LlJkYXRhIikpICAjIyMgY2hhcmdlIGxlIGpldSBkZSBkb25uw6llcyBzdXIgdm90cmUgb3JkaW5hdGV1cg0KaW1hZ2UobWF0cml4KHQoWFs1LF0pLCBuY29sID0gMTYpKQ0KYGBgDQoNCmBgYHtyfQ0KcmVzLmhkZGMgPC0gaGRkYyhYLCAzKQ0KdGFibGUocmVzLmhkZGMkY2xhc3MsY2xzKQ0KYWRqdXN0ZWRSYW5kSW5kZXgocmVzLmhkZGMkY2xhc3MsY2xzKQ0KYGBgDQoNCg0K