1. Import the data

library(kernlab)
data(spam)

Recode the response for gbm (the "adaboost" and "bernoulli" losses expect a numeric 0/1 variable)

spam$type <- as.numeric(spam$type)-1
set.seed(5678)
perm <- sample(4601,3000)
app <- spam[perm,]
valid <- spam[-perm,]
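
A quick check, not in the original script, that the recoding worked and that spam and non-spam appear in similar proportions in the training and validation samples:

# Proportion of non-spam (0) and spam (1) in each sample
prop.table(table(app$type))
prop.table(table(valid$type))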

2. Build and analyze the gradient boosting algorithm

library(gbm)
set.seed(1234)
gbm(type~., data=app, distribution="adaboost", shrinkage=0.01, n.trees=3000)
gbm(formula = type ~ ., distribution = "adaboost", data = app, 
    n.trees = 3000, shrinkage = 0.01)
A gradient boosted model with adaboost loss function.
3000 iterations were performed.
There were 57 predictors of which 36 had non-zero influence.
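
The call above keeps gbm's defaults for the remaining hyperparameters, in particular interaction.depth = 1 (stumps). A hedged sketch of how the tree depth could also be varied; the value 2 is only an illustration, not a setting from the original text:

# Same model but allowing two-way interactions in each tree
set.seed(1234)
gbm(type~., data=app, distribution="adaboost", shrinkage=0.01,
    n.trees=3000, interaction.depth=2)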

3. Select the number of iterations

set.seed(1234)
mod.ada <- gbm(type~.,data=app,distribution="adaboost",cv.folds=5, shrinkage=0.01,n.trees=3000)
set.seed(567)
mod.logit <- gbm(type~.,data=app,distribution="bernoulli",cv.folds=5, shrinkage=0.05,n.trees=3000)
Mopt.ada <- gbm.perf(mod.ada,method="cv")

Mopt.ada
[1] 1740
Mopt.logit <- gbm.perf(mod.logit,method="cv")

Mopt.logit
[1] 1007
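
gbm.perf also accepts method="OOB", which picks the iteration number from the out-of-bag improvement instead of cross-validation; a minimal sketch of this alternative (not used in the original text, and known to be conservative):

# Out-of-bag selection, usually smaller than the CV choice
Mopt.ada.oob <- gbm.perf(mod.ada, method="OOB")
Mopt.ada.oob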

4. Make predictions

prev.ada <- predict(mod.ada,newdata=valid,type="response", n.trees=Mopt.ada)
head(round(prev.ada,3))
[1] 0.998 0.421 0.996 0.746 0.963 0.998
prev.logit <- predict(mod.logit,newdata=valid,type="response", n.trees=Mopt.logit)
head(round(prev.logit,3))
[1] 0.997 0.812 0.999 0.953 0.997 0.999
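
With type="response" the scores are probabilities of the class coded 1 (spam). A small sketch, not in the original, that turns them into 0/1 predictions with the usual 0.5 threshold and cross-tabulates them against the observed labels:

# Confusion matrix for the adaboost model on the validation sample
table(pred = as.numeric(prev.ada > 0.5), obs = valid$type)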

5. Estimate the performance of a gradient boosting algorithm

prev.prob <- data.frame(ada=prev.ada,logit=prev.logit,obs=valid$type)
head(round(prev.prob,3))
    ada logit obs
1 0.998 0.997   1
2 0.421 0.812   1
3 0.996 0.999   1
4 0.746 0.953   1
5 0.963 0.997   1
6 0.998 0.999   1
prev.class <- round(prev.prob)
head(prev.class)
  ada logit obs
1   1     1   1
2   0     1   1
3   1     1   1
4   1     1   1
5   1     1   1
6   1     1   1
library(tidyverse)
prev.class %>% summarise(across(c(ada, logit), ~ mean(obs != .), .names="{.col}_err")) %>% round(3)
  ada_err logit_err
1   0.069     0.062
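
The same error rates can be recovered directly in base R, a quick cross-check of the pipeline above:

# Misclassification rates computed without dplyr
mean(prev.class$ada != prev.class$obs)
mean(prev.class$logit != prev.class$obs)
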
library(plotROC)
df.roc <- prev.prob %>% gather(key=Methode,value=score,ada,logit)
ggplot(df.roc)+aes(d=obs,m=score,color=Methode)+ geom_roc()+theme_classic()

library(pROC)
df.roc %>% group_by(Methode) %>% summarize(AUC=pROC::auc(obs,score))
# A tibble: 2 x 2
  Methode   AUC
  <chr>   <dbl>
1 ada     0.978
2 logit   0.981
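
pROC can also provide a confidence interval around each AUC; a minimal sketch, assuming the default DeLong method is adequate here:

# 95% confidence interval for the AUC of the logit model
roc.logit <- pROC::roc(prev.prob$obs, prev.prob$logit)
pROC::ci.auc(roc.logit)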

6. Interpret a gradient boosting algorithm

summary(mod.logit)[1:10,]
                            var   rel.inf
charExclamation charExclamation 22.107888
charDollar           charDollar 18.328872
remove                   remove 12.903802
free                       free  7.229789
your                       your  6.039477
hp                           hp  4.981167
capitalAve           capitalAve  4.909642
capitalLong         capitalLong  4.014214
report                   report  2.208803
our                         our  2.091343
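
Beyond relative influence, plot.gbm draws partial dependence plots for individual predictors; a sketch for the most influential variable (the choice of charExclamation simply follows the table above):

# Partial dependence of the prediction on charExclamation
plot(mod.logit, i.var = "charExclamation", n.trees = Mopt.logit)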
