1. Importer les données
# Load kernlab and its spam dataset (4601 emails, 57 predictors + class),
# then inspect the last three columns (two capital-letter features + label).
library(kernlab)
data("spam")
summary(spam[, 56:58])
capitalLong capitalTotal type
Min. : 1.00 Min. : 1.0 nonspam:2788
1st Qu.: 6.00 1st Qu.: 35.0 spam :1813
Median : 15.00 Median : 95.0
Mean : 52.17 Mean : 283.3
3rd Qu.: 43.00 3rd Qu.: 266.0
Max. :9989.00 Max. :15841.0
# Reproducible split: 3000 rows for training (app), the rest for validation.
set.seed(5678)
perm  <- sample(4601, 3000)
app   <- spam[perm, ]
valid <- spam[-perm, ]
2. Construire et analyser un algorithme de SVM
# Linear-kernel SVM (C-classification) on the training set with cost C = 1.
mod.svm <- ksvm(type ~ ., data = app, kernel = "vanilladot", C = 1)
Setting default kernel parameters
# Show the fitted model: kernel, cost, number of SVs, training error.
print(mod.svm)
Support Vector Machine object of class "ksvm"
SV type: C-svc (classification)
parameter : cost C = 1
Linear (vanilla) kernel function.
Number of Support Vectors : 599
Objective Function Value : -546.7826
Training error : 0.067
3. Sélectionner les paramètres d’un SVM
# Candidate hyper-parameter values for the two grid searches below.
C      <- c(0.1, 1, 10, 100)              # cost values
degree <- c(1, 2, 3)                      # polynomial degrees
scale  <- 1                               # polynomial scale, held fixed
sigma  <- c(0.0001, 0.001, 0.01, 0.1, 1)  # RBF bandwidths

# Tune a polynomial-kernel SVM by 3-fold cross-validation with caret.
library(caret)
gr.poly <- expand.grid(C = C, degree = degree, scale = scale)
ctrl    <- trainControl(method = "cv", number = 3)
set.seed(123)
sel.poly <- train(type ~ ., data = app, method = "svmPoly",
                  trControl = ctrl, tuneGrid = gr.poly)
# Show the cross-validated accuracy/Kappa for each (C, degree) combination.
print(sel.poly)
Support Vector Machines with Polynomial Kernel
3000 samples
57 predictor
2 classes: 'nonspam', 'spam'
No pre-processing
Resampling: Cross-Validated (3 fold)
Summary of sample sizes: 2000, 2000, 2000
Resampling results across tuning parameters:
C degree Accuracy Kappa
0.1 1 0.9270000 0.8461040
0.1 2 0.9100000 0.8105364
0.1 3 0.9083333 0.8077676
1.0 1 0.9280000 0.8479742
1.0 2 0.8936667 0.7774023
1.0 3 0.8920000 0.7745405
10.0 1 0.9270000 0.8460247
10.0 2 0.8853333 0.7608912
10.0 3 0.8830000 0.7553775
100.0 1 0.9240000 0.8395595
100.0 2 0.8806667 0.7518021
100.0 3 0.8763333 0.7413648
Tuning parameter 'scale' was held constant at a value of 1
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were degree = 1, scale = 1 and C = 1.
# Tune a radial (RBF) kernel SVM over the (C, sigma) grid, same 3-fold CV.
gr.radial <- expand.grid(C = C, sigma = sigma)
set.seed(345)
sel.radial <- train(type ~ ., data = app, method = "svmRadial",
                    trControl = ctrl, tuneGrid = gr.radial)
print(sel.radial)
Support Vector Machines with Radial Basis Function Kernel
3000 samples
57 predictor
2 classes: 'nonspam', 'spam'
No pre-processing
Resampling: Cross-Validated (3 fold)
Summary of sample sizes: 2000, 2000, 2000
Resampling results across tuning parameters:
C sigma Accuracy Kappa
0.1 1e-04 0.6060000 0.0000000
0.1 1e-03 0.7663333 0.4578780
0.1 1e-02 0.8990000 0.7825103
0.1 1e-01 0.7610000 0.4447726
0.1 1e+00 0.6060000 0.0000000
1.0 1e-04 0.7723333 0.4734765
1.0 1e-03 0.9000000 0.7855698
1.0 1e-02 0.9276667 0.8470048
1.0 1e-01 0.9050000 0.7958078
1.0 1e+00 0.7576667 0.4321881
10.0 1e-04 0.8970000 0.7790195
10.0 1e-03 0.9280000 0.8477454
10.0 1e-02 0.9353333 0.8636455
10.0 1e-01 0.9040000 0.7948708
10.0 1e+00 0.7626667 0.4475051
100.0 1e-04 0.9203333 0.8315819
100.0 1e-03 0.9333333 0.8594359
100.0 1e-02 0.9283333 0.8486409
100.0 1e-01 0.8966667 0.7797074
100.0 1e+00 0.7603333 0.4430619
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were sigma = 0.01 and C = 10.
# Refit both SVMs on the training set with the CV-selected hyper-parameters;
# prob.model = TRUE fits a logistic link so probabilities can be predicted.
mod.poly <- ksvm(type ~ ., data = app, kernel = "polydot",
                 kpar = list(degree = 1, scale = 1, offset = 1),
                 C = 1, prob.model = TRUE)
mod.radial <- ksvm(type ~ ., data = app, kernel = "rbfdot",
                   kpar = list(sigma = 0.01),
                   C = 10, prob.model = TRUE)
4. Faire de la prévision
# Predicted classes on the validation set, and a preview of the first ten.
prev.class.poly   <- predict(mod.poly, newdata = valid)
prev.class.radial <- predict(mod.radial, newdata = valid)
head(prev.class.poly, 10)
[1] spam spam spam spam spam spam spam spam nonspam spam
Levels: nonspam spam
# Preview the first ten radial-kernel class predictions.
head(prev.class.radial, 10)
[1] spam spam spam spam spam spam spam spam nonspam spam
Levels: nonspam spam
# Posterior class probabilities (columns: nonspam, spam) on the validation
# set, with a rounded preview of the polynomial model's first rows.
prev.prob.poly   <- predict(mod.poly, newdata = valid, type = "probabilities")
prev.prob.radial <- predict(mod.radial, newdata = valid, type = "probabilities")
round(head(prev.prob.poly), 3)
nonspam spam
[1,] 0.056 0.944
[2,] 0.412 0.588
[3,] 0.000 1.000
[4,] 0.172 0.828
[5,] 0.005 0.995
[6,] 0.066 0.934
LS0tDQp0aXRsZTogIlNWTSINCmF1dGhvcjogIkh1c3NvbiBldCBhbC4iDQpkYXRlOiAiNiBzZXB0ZW1icmUgMjAxOCINCm91dHB1dDoNCiAgaHRtbF9ub3RlYm9vazoNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogMw0KICAgIHRvY19mbG9hdDogeWVzDQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdG9jOiB5ZXMNCiAgICB0b2NfZGVwdGg6ICczJw0KICAgIHRvY19mbG9hdDogeWVzDQotLS0NCg0KIyAxLiBJbXBvcnRlciBsZXMgZG9ubsOpZXMNCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShrZXJubGFiKQ0KZGF0YShzcGFtKQ0Kc3VtbWFyeShzcGFtWyw1Njo1OF0pDQpgYGANCg0KYGBge3J9DQpzZXQuc2VlZCg1Njc4KQ0KcGVybSA8LSBzYW1wbGUoNDYwMSwzMDAwKQ0KYXBwIDwtIHNwYW1bcGVybSxdDQp2YWxpZCA8LSBzcGFtWy1wZXJtLF0NCmBgYA0KDQojIDIuIENvbnN0cnVpcmUgZXQgYW5hbHlzZXIgdW4gYWxnb3JpdGhtZSBkZSBTVk0NCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbW9kLnN2bSA8LSBrc3ZtKHR5cGV+LixkYXRhPWFwcCxrZXJuZWw9InZhbmlsbGFkb3QiLEM9MSkNCm1vZC5zdm0NCmBgYA0KDQojIDMuIFPDqWxlY3Rpb25uZXIgbGVzIHBhcmFtw6h0cmVzIGTigJl1biBTVk0NCg0KYGBge3J9DQpDIDwtIGMoMC4xLDEsMTAsMTAwKQ0KZGVncmVlIDwtIGMoMSwyLDMpDQpzY2FsZSA8LSAxDQpzaWdtYSA8LSBjKDAuMDAwMSwwLjAwMSwwLjAxLDAuMSwxKQ0KYGBgDQoNCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShjYXJldCkNCmdyLnBvbHkgPC0gZXhwYW5kLmdyaWQoQz1DLGRlZ3JlZT1kZWdyZWUsc2NhbGU9c2NhbGUpDQpjdHJsIDwtIHRyYWluQ29udHJvbChtZXRob2Q9ImN2IixudW1iZXI9MykNCnNldC5zZWVkKDEyMykNCnNlbC5wb2x5IDwtIHRyYWluKHR5cGV+LixkYXRhPWFwcCxtZXRob2Q9InN2bVBvbHkiLHRyQ29udHJvbD1jdHJsLHR1bmVHcmlkPWdyLnBvbHkpDQpzZWwucG9seQ0KDQpnci5yYWRpYWwgPC0gZXhwYW5kLmdyaWQoQz1DLHNpZ21hPXNpZ21hKQ0Kc2V0LnNlZWQoMzQ1KQ0Kc2VsLnJhZGlhbCA8LSB0cmFpbih0eXBlfi4sZGF0YT1hcHAsbWV0aG9kPSJzdm1SYWRpYWwiLHRyQ29udHJvbD1jdHJsLHR1bmVHcmlkPWdyLnJhZGlhbCkNCnNlbC5yYWRpYWwNCmBgYA0KDQpgYGB7cn0NCm1vZC5wb2x5IDwtIGtzdm0odHlwZX4uLGRhdGE9YXBwLGtlcm5lbD0icG9seWRvdCIsa3Bhcj1saXN0KGRlZ3JlZT0xLHNjYWxlPTEsb2Zmc2V0PTEpLEM9MSxwcm9iLm1vZGVsID0gVFJVRSkNCm1vZC5yYWRpYWwgPC0ga3N2bSh0eXBlfi4sZGF0YT1hcHAsa2VybmVsPSJyYmZkb3QiLGtwYXI9bGlzdChzaWdtYT0wLjAxKSxDPTEwLHByb2IubW9kZWwgPSBUUlVFKQ0KDQpgYGANCg0KIyA0LiBGYWlyZSBkZSBsYSBwcsOpdmlzaW9uDQoNCmBgYHtyfQ0KcHJldi5jbGFzcy5wb2x5IDwtIHByZWRpY3QobW9k
LnBvbHksbmV3ZGF0YT12YWxpZCkNCnByZXYuY2xhc3MucmFkaWFsIDwtIHByZWRpY3QobW9kLnJhZGlhbCxuZXdkYXRhPXZhbGlkKQ0KcHJldi5jbGFzcy5wb2x5WzE6MTBdDQpwcmV2LmNsYXNzLnJhZGlhbFsxOjEwXQ0KYGBgDQoNCmBgYHtyfQ0KcHJldi5wcm9iLnBvbHkgPC0gcHJlZGljdChtb2QucG9seSxuZXdkYXRhPXZhbGlkLHR5cGU9InByb2JhYmlsaXRpZXMiKQ0KcHJldi5wcm9iLnJhZGlhbCA8LSBwcmVkaWN0KG1vZC5yYWRpYWwsbmV3ZGF0YT12YWxpZCx0eXBlPSJwcm9iYWJpbGl0aWVzIikNCnJvdW5kKGhlYWQocHJldi5wcm9iLnBvbHkpLDMpDQpgYGANCg0KIyA1LiBFc3RpbWVyIGxlcyBwZXJmb3JtYW5jZXMgZGUgbOKAmWFsZ29yaXRobWUNCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpwcmV2LmNsYXNzIDwtIGRhdGEuZnJhbWUocG9seT1wcmV2LmNsYXNzLnBvbHkscmFkaWFsPXByZXYuY2xhc3MucmFkaWFsLG9icz12YWxpZCR0eXBlKQ0KcHJldi5jbGFzcyAlPiUgc3VtbWFyaXNlX2FsbChmdW5zKGVycj1tZWFuKG9icyE9LikpKSAlPiUgc2VsZWN0KC1vYnNfZXJyKSAlPiUgcm91bmQoMykNCmBgYA0KDQpgYGB7cixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpsaWJyYXJ5KHBsb3RST0MpDQpwcmV2LnByb2IgPC0gZGF0YS5mcmFtZShwb2x5PXByZXYucHJvYi5wb2x5WywyXSxyYWRpYWw9cHJldi5wcm9iLnJhZGlhbFssMl0sb2JzPXZhbGlkJHR5cGUpDQpkZi5yb2MgPC0gcHJldi5wcm9iICU+JSBnYXRoZXIoa2V5PU1ldGhvZGUsdmFsdWU9c2NvcmUscG9seSxyYWRpYWwpDQpnZ3Bsb3QoZGYucm9jKSthZXMoZD1vYnMsbT1zY29yZSxjb2xvcj1NZXRob2RlKStnZW9tX3JvYygpK3RoZW1lX2NsYXNzaWMoKQ0KDQpgYGANCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShwUk9DKQ0KZGYucm9jICU+JSBncm91cF9ieShNZXRob2RlKSAlPiUgc3VtbWFyaXplKEFVQz1wUk9DOjphdWMob2JzLHNjb3JlKSkNCmBgYA0KDQo=