1. Importer les données
# Load the spam dataset shipped with kernlab: 4601 emails described by
# 57 numeric predictors plus the response factor 'type' (nonspam/spam).
library(kernlab)
data(spam)
# Quick look at the last two numeric predictors and the response (column 58)
summary(spam[,56:58])
capitalLong capitalTotal type
Min. : 1.00 Min. : 1.0 nonspam:2788
1st Qu.: 6.00 1st Qu.: 35.0 spam :1813
Median : 15.00 Median : 95.0
Mean : 52.17 Mean : 283.3
3rd Qu.: 43.00 3rd Qu.: 266.0
Max. :9989.00 Max. :15841.0
# Fix the RNG so the train/validation split is reproducible
set.seed(5678)
# Draw 3000 of the 4601 row indices for the training set
perm <- sample(4601,3000)
app <- spam[perm,]     # training set (n = 3000)
valid <- spam[-perm,]  # validation set (remaining 1601 rows)
2. Construire et analyser une forêt aléatoire
library(randomForest)
# Seed makes the bootstrap samples and per-split variable draws reproducible
set.seed(1234)
# Default classification forest: 500 trees, mtry = 7 (see printed output)
foret <- randomForest(type~.,data=app)
foret  # auto-print: OOB error estimate and confusion matrix
Call:
randomForest(formula = type ~ ., data = app)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 7
OOB estimate of error rate: 4.73%
Confusion matrix:
nonspam spam class.error
nonspam 1764 54 0.02970297
spam 88 1094 0.07445008
3. Sélectionner les paramètres de la forêt
# Error curves as a function of the number of trees (checks convergence)
plot(foret)
# Last rows of the per-tree error matrix: OOB rate, then per-class rates
tail(foret$err.rate)
OOB nonspam spam
[495,] 0.04733333 0.02915292 0.07529611
[496,] 0.04666667 0.02915292 0.07360406
[497,] 0.04700000 0.02915292 0.07445008
[498,] 0.04733333 0.02915292 0.07529611
[499,] 0.04733333 0.02970297 0.07445008
[500,] 0.04733333 0.02970297 0.07445008
# Candidate values for mtry: 1, 4, 7, ..., 55
grille.mtry <- data.frame(mtry=seq(1,57,by=3))
library(caret)
# Tune using the forest's own out-of-bag error (no extra resampling loop)
ctrl <- trainControl(method="oob")
library(doParallel) # for parallel execution
cl <- makePSOCKcluster(4)  # start 4 worker processes
registerDoParallel(cl)
set.seed(12345)
# Grid search over mtry; caret keeps the value with the best OOB accuracy
sel.mtry <- train(type~.,data=app,method="rf",trControl=ctrl,tuneGrid=grille.mtry)
sel.mtry  # auto-print: accuracy/kappa per mtry and the selected value
Random Forest
3000 samples
57 predictor
2 classes: 'nonspam', 'spam'
No pre-processing
Resampling results across tuning parameters:
mtry Accuracy Kappa
1 0.9233333 0.8356570
4 0.9506667 0.8960131
7 0.9536667 0.9025254
10 0.9540000 0.9032122
13 0.9536667 0.9025543
16 0.9540000 0.9033846
19 0.9503333 0.8955749
22 0.9506667 0.8962295
25 0.9496667 0.8941418
28 0.9493333 0.8935198
31 0.9470000 0.8885002
34 0.9476667 0.8899682
37 0.9480000 0.8906528
40 0.9460000 0.8864809
43 0.9453333 0.8849769
46 0.9466667 0.8878157
49 0.9460000 0.8864134
52 0.9470000 0.8885665
55 0.9453333 0.8849769
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 10.
stopCluster(cl) ## fermeture des clusters
4. Faire de la prévision
set.seed(5432)
# Refit on the training set with the selected mtry = 10
foret1 <- randomForest(type~.,data=app,mtry=10)
# Predicted classes on the validation set
prev.valid <- predict(foret1,newdata=valid)
prev.valid[1:10]
2 6 9 11 15 16 18 19 21 22
spam spam spam spam spam spam spam spam nonspam spam
Levels: nonspam spam
# Predicted class probabilities on the validation set
# (one column per class; rows shown for the first 10 validation emails)
prob.valid <- predict(foret1,newdata=valid,type="prob")
prob.valid[1:10,]
nonspam spam
2 0.000 1.000
6 0.306 0.694
9 0.056 0.944
11 0.180 0.820
15 0.086 0.914
16 0.036 0.964
18 0.024 0.976
19 0.206 0.794
21 0.542 0.458
22 0.026 0.974
5. Estimer les performances de la forêt
set.seed(5432)
# Default forest; xtest/ytest make randomForest report a test-set error
# alongside the OOB one (column 58 of 'valid' is the response 'type').
# keep.forest=TRUE retains the trees so predict() can be used later.
foret2 <- randomForest(type~.,data=app,xtest=valid[,-58],ytest=valid[,58],keep.forest=TRUE)
set.seed(891)
# Same, but with the tuned mtry = 10
foret3 <- randomForest(type~.,data=app,mtry=10,xtest=valid[,-58],ytest=valid[,58],keep.forest=TRUE)
foret2  # auto-print: OOB and test-set error estimates
Call:
randomForest(formula = type ~ ., data = app, xtest = valid[, -58], ytest = valid[, 58], keep.forest = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 7
OOB estimate of error rate: 4.63%
Confusion matrix:
nonspam spam class.error
nonspam 1760 58 0.03190319
spam 81 1101 0.06852792
Test set error rate: 5.5%
Confusion matrix:
nonspam spam class.error
nonspam 941 29 0.02989691
spam 59 572 0.09350238
foret3
Call:
randomForest(formula = type ~ ., data = app, mtry = 10, xtest = valid[, -58], ytest = valid[, 58], keep.forest = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 10
OOB estimate of error rate: 4.5%
Confusion matrix:
nonspam spam class.error
nonspam 1765 53 0.02915292
spam 82 1100 0.06937394
Test set error rate: 5.68%
Confusion matrix:
nonspam spam class.error
nonspam 939 31 0.03195876
spam 60 571 0.09508716
library(pROC)
# Validation-set probability of the second class ('spam') for each forest
prev2 <- predict(foret2,newdata=valid,type="prob")[,2]
roc2 <- roc(valid$type,prev2)
prev3 <- predict(foret3,newdata=valid,type="prob")[,2]
roc3 <- roc(valid$type,prev3)
# Overlay the two ROC curves; AUC labels placed at staggered y positions
plot(roc2,print.auc=TRUE,print.auc.cex=0.5,print.auc.x=0.4,print.auc.y=0.3)
plot(roc3,add=TRUE,col="red",print.auc=TRUE,print.auc.cex=0.5,print.auc.col="red",print.auc.x=0.4,print.auc.y=0.2)
# Single classification tree as a baseline for comparison with the forests
library(rpart)
set.seed(12345)
# Grow a deep tree (very small cp); it will be pruned just below
arbre <- rpart(type~.,data=app,cp=0.0001)
library(tidyverse)
# Among the cp values attaining the minimal cross-validated error (xerror),
# keep the largest one, i.e. the simplest such tree
cp_opt <- arbre$cptable %>% as.data.frame() %>% filter(xerror==min(xerror)) %>% select(CP) %>% max() %>% as.numeric()
arbre_sel <- prune(arbre,cp=cp_opt)
# Spam probability from the pruned tree, and its ROC added to the same plot
prev.arbre <- predict(arbre_sel,newdata=valid,type="prob")[,2]
roc.arbre <- roc(valid$type,prev.arbre)
plot(roc.arbre,add=TRUE,col="blue",print.auc=TRUE,print.auc.cex=0.5,print.auc.col="blue",print.auc.x=0.4,print.auc.y=0.1)
6. Interpréter la forêt aléatoire
# Variable importance scores from foret2
# (presumably the default mean-decrease-in-Gini measure — confirm)
var.imp <- foret2$importance
ord <- order(var.imp,decreasing=TRUE)
# Bar plot of the ten most important variables, labelled by variable name
barplot(sort(var.imp,decreasing = TRUE)[1:10],names.arg=rownames(var.imp)[ord][1:10],cex.names=0.4)
LS0tDQp0aXRsZTogIkZvcsOqdHMgYWzDqWF0b2lyZXMiDQphdXRob3I6ICJIdXNzb24gZXQgYWwuIg0KZGF0ZTogIjEyIHNlcHRlbWJyZSAyMDE4Ig0Kb3V0cHV0Og0KICBodG1sX25vdGVib29rOg0KICAgIHRvYzogeWVzDQogICAgdG9jX2RlcHRoOiAzDQogICAgdG9jX2Zsb2F0OiB5ZXMNCiAgaHRtbF9kb2N1bWVudDoNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogJzMnDQogICAgdG9jX2Zsb2F0OiB5ZXMNCi0tLQ0KDQojIDEuIEltcG9ydGVyIGxlcyBkb25uw6llcw0KDQpgYGB7cixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpsaWJyYXJ5KGtlcm5sYWIpDQpkYXRhKHNwYW0pDQpzdW1tYXJ5KHNwYW1bLDU2OjU4XSkNCmBgYA0KDQpgYGB7cn0NCnNldC5zZWVkKDU2NzgpDQpwZXJtIDwtIHNhbXBsZSg0NjAxLDMwMDApDQphcHAgPC0gc3BhbVtwZXJtLF0NCnZhbGlkIDwtIHNwYW1bLXBlcm0sXQ0KYGBgDQoNCiMgMi4gQ29uc3RydWlyZSBldCBhbmFseXNlciB1bmUgZm9yw6p0IGFsw6lhdG9pcmUNCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShyYW5kb21Gb3Jlc3QpDQpzZXQuc2VlZCgxMjM0KQ0KZm9yZXQgPC0gcmFuZG9tRm9yZXN0KHR5cGV+LixkYXRhPWFwcCkNCmZvcmV0DQpgYGANCg0KIyAzLiBTw6lsZWN0aW9ubmVyIGxlcyBwYXJhbcOodHJlcyBkZSBsYSBmb3LDqnQNCg0KYGBge3J9DQpwbG90KGZvcmV0KQ0KYGBgDQoNCmBgYHtyfQ0KdGFpbChmb3JldCRlcnIucmF0ZSkNCmBgYA0KDQpgYGB7cixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpncmlsbGUubXRyeSA8LSBkYXRhLmZyYW1lKG10cnk9c2VxKDEsNTcsYnk9MykpDQpsaWJyYXJ5KGNhcmV0KQ0KY3RybCA8LSB0cmFpbkNvbnRyb2wobWV0aG9kPSJvb2IiKQ0KbGlicmFyeShkb1BhcmFsbGVsKSAgICAjIHBvdXIgcGFyYWxsw6lsaXNlcg0KY2wgPC0gbWFrZVBTT0NLY2x1c3Rlcig0KQ0KcmVnaXN0ZXJEb1BhcmFsbGVsKGNsKSAgICAgDQpzZXQuc2VlZCgxMjM0NSkNCnNlbC5tdHJ5IDwtIHRyYWluKHR5cGV+LixkYXRhPWFwcCxtZXRob2Q9InJmIix0ckNvbnRyb2w9Y3RybCx0dW5lR3JpZD1ncmlsbGUubXRyeSkNCnNlbC5tdHJ5DQpzdG9wQ2x1c3RlcihjbCkgICAgICAgICMjIGZlcm1ldHVyZSBkZXMgY2x1c3RlcnMNCmBgYA0KDQojIDQuIEZhaXJlIGRlIGxhIHByw6l2aXNpb24NCg0KYGBge3J9DQpzZXQuc2VlZCg1NDMyKQ0KZm9yZXQxIDwtIHJhbmRvbUZvcmVzdCh0eXBlfi4sZGF0YT1hcHAsbXRyeT0xMCkNCg0KcHJldi52YWxpZCA8LSBwcmVkaWN0KGZvcmV0MSxuZXdkYXRhPXZhbGlkKQ0KcHJldi52YWxpZFsxOjEwXQ0KDQpwcm9iLnZhbGlkIDwtIHByZWRpY3QoZm9yZXQxLG5ld2RhdGE9dmFsaWQsdHlwZT0icHJvYiIpDQpwcm9iLnZhbGlkWzE6MTAsXQ0KYGBgDQoNCiMgNS4gRXN0aW1lciBsZXMgcGVyZm9ybWFuY2VzIGRlIGxhIGZvcsOqdA0KDQpgYGB7cn0NCnNldC5zZWVkKDU0MzIpDQpm
b3JldDIgPC0gcmFuZG9tRm9yZXN0KHR5cGV+LixkYXRhPWFwcCx4dGVzdD12YWxpZFssLTU4XSx5dGVzdD12YWxpZFssNThdLGtlZXAuZm9yZXN0PVRSVUUpDQpzZXQuc2VlZCg4OTEpDQpmb3JldDMgPC0gcmFuZG9tRm9yZXN0KHR5cGV+LixkYXRhPWFwcCxtdHJ5PTEwLHh0ZXN0PXZhbGlkWywtNThdLHl0ZXN0PXZhbGlkWyw1OF0sa2VlcC5mb3Jlc3Q9VFJVRSkNCmZvcmV0Mg0KZm9yZXQzDQpgYGANCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFLGZpZy53aWR0aD01LGZpZy5oZWlnaHQ9NX0NCmxpYnJhcnkocFJPQykNCnByZXYyIDwtIHByZWRpY3QoZm9yZXQyLG5ld2RhdGE9dmFsaWQsdHlwZT0icHJvYiIpWywyXQ0Kcm9jMiA8LSByb2ModmFsaWQkdHlwZSxwcmV2MikNCnByZXYzIDwtIHByZWRpY3QoZm9yZXQzLG5ld2RhdGE9dmFsaWQsdHlwZT0icHJvYiIpWywyXQ0Kcm9jMyA8LSByb2ModmFsaWQkdHlwZSxwcmV2MykNCnBsb3Qocm9jMixwcmludC5hdWM9VFJVRSxwcmludC5hdWMuY2V4PTAuNSxwcmludC5hdWMueD0wLjQscHJpbnQuYXVjLnk9MC4zKQ0KcGxvdChyb2MzLGFkZD1UUlVFLGNvbD0icmVkIixwcmludC5hdWM9VFJVRSxwcmludC5hdWMuY2V4PTAuNSxwcmludC5hdWMuY29sPSJyZWQiLHByaW50LmF1Yy54PTAuNCxwcmludC5hdWMueT0wLjIpDQoNCg0KbGlicmFyeShycGFydCkNCnNldC5zZWVkKDEyMzQ1KQ0KYXJicmUgPC0gcnBhcnQodHlwZX4uLGRhdGE9YXBwLGNwPTAuMDAwMSkNCmxpYnJhcnkodGlkeXZlcnNlKQ0KY3Bfb3B0IDwtIGFyYnJlJGNwdGFibGUgJT4lIGFzLmRhdGEuZnJhbWUoKSAlPiUgZmlsdGVyKHhlcnJvcj09bWluKHhlcnJvcikpICU+JSBzZWxlY3QoQ1ApICU+JSBtYXgoKSAlPiUgYXMubnVtZXJpYygpIA0KYXJicmVfc2VsIDwtIHBydW5lKGFyYnJlLGNwPWNwX29wdCkNCnByZXYuYXJicmUgPC0gcHJlZGljdChhcmJyZV9zZWwsbmV3ZGF0YT12YWxpZCx0eXBlPSJwcm9iIilbLDJdDQpyb2MuYXJicmUgPC0gcm9jKHZhbGlkJHR5cGUscHJldi5hcmJyZSkNCnBsb3Qocm9jLmFyYnJlLGFkZD1UUlVFLGNvbD0iYmx1ZSIscHJpbnQuYXVjPVRSVUUscHJpbnQuYXVjLmNleD0wLjUscHJpbnQuYXVjLmNvbD0iYmx1ZSIscHJpbnQuYXVjLng9MC40LHByaW50LmF1Yy55PTAuMSkNCmBgYA0KDQojIDYuIEludGVycHLDqXRlciBsYSBmb3LDqnQgYWzDqWF0b2lyZQ0KDQpgYGB7cixmaWcud2lkdGg9MTAsZmlnLmhlaWdodD01fQ0KdmFyLmltcCA8LSBmb3JldDIkaW1wb3J0YW5jZQ0Kb3JkIDwtIG9yZGVyKHZhci5pbXAsZGVjcmVhc2luZz1UUlVFKQ0KYmFycGxvdChzb3J0KHZhci5pbXAsZGVjcmVhc2luZyA9IFRSVUUpWzE6MTBdLG5hbWVzLmFyZz1yb3duYW1lcyh2YXIuaW1wKVtvcmRdWzE6MTBdLGNleC5uYW1lcz0wLjQpDQpgYGANCg0K