1. Importer les données
bank <- read.table("https://r-stat-sc-donnees.github.io/bank-additional.csv",header=TRUE,sep=";")
summary(bank)[,c(1:3,21)]
age job marital y
Min. :18.00 admin. :1012 divorced: 446 no :3668
1st Qu.:32.00 blue-collar: 884 married :2509 yes: 451
Median :38.00 technician : 691 single :1153
Mean :40.11 services : 393 unknown : 11
3rd Qu.:47.00 management : 324
Max. :88.00 retired : 166
(Other) : 649
2. Construire le modèle
set.seed(5678)
perm <- sample(nrow(bank),3000)
app <- bank[perm,]
test <- bank[-perm,]
logit_complet <- glm(y~.,data=app,family=binomial)
coef(logit_complet)[is.na(coef(logit_complet))]
loanunknown
NA
table(app$loan,app$housing)
no unknown yes
no 1169 0 1280
unknown 0 82 0
yes 188 0 281
summary(logit_complet)
Call:
glm(formula = y ~ ., family = binomial, data = app)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.1751 -0.2823 -0.1693 -0.1090 2.8013
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -7.760e+01 1.496e+02 -0.519 0.604029
age 7.731e-03 9.583e-03 0.807 0.419816
jobblue-collar 1.109e-01 3.172e-01 0.350 0.726681
jobentrepreneur -5.170e-01 5.454e-01 -0.948 0.343222
jobhousemaid 5.245e-01 5.049e-01 1.039 0.298855
jobmanagement -3.730e-01 3.478e-01 -1.072 0.283539
jobretired -8.105e-02 4.117e-01 -0.197 0.843922
jobself-employed -5.629e-01 4.957e-01 -1.136 0.256129
jobservices 2.158e-01 3.342e-01 0.646 0.518342
jobstudent -3.934e-01 5.108e-01 -0.770 0.441194
jobtechnician 3.775e-01 2.566e-01 1.471 0.141289
jobunemployed 3.758e-01 4.569e-01 0.823 0.410779
jobunknown 4.676e-01 8.207e-01 0.570 0.568830
maritalmarried 2.996e-01 2.839e-01 1.055 0.291290
maritalsingle 3.723e-01 3.233e-01 1.151 0.249569
maritalunknown 1.163e+00 1.203e+00 0.966 0.334010
educationbasic.6y 6.380e-01 4.627e-01 1.379 0.167952
educationbasic.9y 5.097e-01 3.758e-01 1.357 0.174919
educationhigh.school 4.448e-01 3.650e-01 1.219 0.222988
educationilliterate -1.132e+01 5.354e+02 -0.021 0.983138
educationprofessional.course 4.227e-01 3.947e-01 1.071 0.284122
educationuniversity.degree 6.531e-01 3.722e-01 1.755 0.079306 .
educationunknown 5.589e-01 4.525e-01 1.235 0.216751
defaultunknown 9.906e-02 2.448e-01 0.405 0.685710
defaultyes -8.551e+00 5.354e+02 -0.016 0.987257
housingunknown -2.296e-01 5.839e-01 -0.393 0.694126
housingyes -1.235e-01 1.606e-01 -0.769 0.442069
loanunknown NA NA NA NA
loanyes 4.145e-02 2.138e-01 0.194 0.846278
contacttelephone -1.054e+00 3.368e-01 -3.131 0.001745 **
monthaug 5.582e-01 4.962e-01 1.125 0.260630
monthdec 7.539e-01 8.389e-01 0.899 0.368864
monthjul 1.842e-01 4.203e-01 0.438 0.661099
monthjun 8.020e-01 5.235e-01 1.532 0.125553
monthmar 2.369e+00 6.365e-01 3.722 0.000198 ***
monthmay -3.243e-01 3.598e-01 -0.901 0.367357
monthnov -4.332e-01 4.901e-01 -0.884 0.376731
monthoct 9.492e-03 6.175e-01 0.015 0.987736
monthsep 1.114e-01 7.050e-01 0.158 0.874462
day_of_weekmon 1.620e-01 2.509e-01 0.646 0.518418
day_of_weekthu 1.193e-02 2.524e-01 0.047 0.962317
day_of_weektue 2.575e-02 2.518e-01 0.102 0.918548
day_of_weekwed 2.288e-01 2.597e-01 0.881 0.378394
duration 5.834e-03 3.267e-04 17.855 < 2e-16 ***
campaign -7.646e-02 5.128e-02 -1.491 0.135946
pdays -3.089e-04 7.467e-04 -0.414 0.679077
previous 1.526e-01 2.013e-01 0.758 0.448389
poutcomenonexistent 6.972e-01 3.498e-01 1.993 0.046244 *
poutcomesuccess 1.570e+00 7.208e-01 2.178 0.029412 *
emp.var.rate -7.577e-01 5.681e-01 -1.334 0.182289
cons.price.idx 1.014e+00 9.920e-01 1.022 0.306640
cons.conf.idx 6.823e-02 3.211e-02 2.125 0.033593 *
euribor3m 6.553e-02 4.970e-01 0.132 0.895105
nr.employed -3.984e-03 1.199e-02 -0.332 0.739640
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2128.8 on 2999 degrees of freedom
Residual deviance: 1171.9 on 2947 degrees of freedom
AIC: 1277.9
Number of Fisher Scoring iterations: 12
library(car)
Anova(logit_complet,type=3,test.statistic = "LR",singular.ok=TRUE)
Analysis of Deviance Table (Type III tests)
Response: y
LR Chisq Df Pr(>Chisq)
age 0.65 1 0.4198705
job 11.47 11 0.4046378
marital 1.91 3 0.5918195
education 4.38 7 0.7357118
default 0.18 2 0.9158242
housing 0.59 1 0.4421206
loan 0.04 1 0.8465329
contact 10.95 1 0.0009373 ***
month 44.71 9 1.043e-06 ***
day_of_week 1.32 4 0.8587956
duration 486.70 1 < 2.2e-16 ***
campaign 2.47 1 0.1162119
pdays 0.17 1 0.6812723
previous 0.59 1 0.4411588
poutcome 7.64 2 0.0219673 *
emp.var.rate 1.77 1 0.1839836
cons.price.idx 1.04 1 0.3075302
cons.conf.idx 4.57 1 0.0324572 *
euribor3m 0.02 1 0.8951030
nr.employed 0.11 1 0.7393859
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
3. Sélectionner un modèle
logit_step <- step(logit_complet,direction="backward", trace=0)
Anova(logit_step,type=3,test.statistic = "LR")
Analysis of Deviance Table (Type III tests)
Response: y
LR Chisq Df Pr(>Chisq)
contact 14.88 1 0.0001145 ***
month 51.09 9 6.713e-08 ***
duration 483.69 1 < 2.2e-16 ***
campaign 2.99 1 0.0839311 .
poutcome 37.24 2 8.186e-09 ***
emp.var.rate 109.95 1 < 2.2e-16 ***
cons.price.idx 47.58 1 5.273e-12 ***
cons.conf.idx 12.23 1 0.0004691 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
anova(logit_step,logit_complet,test="LRT")
Analysis of Deviance Table
Model 1: y ~ contact + month + duration + campaign + poutcome + emp.var.rate +
cons.price.idx + cons.conf.idx
Model 2: y ~ age + job + marital + education + default + housing + loan +
contact + month + day_of_week + duration + campaign + pdays +
previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx +
euribor3m + nr.employed
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2982 1192.2
2 2947 1171.9 35 20.213 0.9784
4. Faire de la prévision
prev_step <- predict(logit_step,newdata=test,type="response")
prev_step[1:5]
5 12 13 14 15
0.007852846 0.014818676 0.005213798 0.025447271 0.242988796
prev_prob <- data.frame(complet=predict(logit_complet,newdata=test,
type="response"),step=predict(logit_step,newdata=test,type="response"))
prediction from a rank-deficient fit may be misleading
head(round(prev_prob,3), n=3)
complet step
5 0.009 0.008
12 0.005 0.015
13 0.005 0.005
prev_class <- apply(prev_prob>=0.5,2,factor,labels=c("no","yes"))
head(prev_class, n=3)
complet step
5 "no" "no"
12 "no" "no"
13 "no" "no"
library(tidyverse)
prev_class <- data.frame(prev_class)
prev_class %>% mutate(obs=test$y) %>%
summarise_all(funs(err=mean(obs!=.))) %>% select(-obs_err) %>% round(3)
complet_err step_err
1 0.074 0.08
library(plotROC)
df_roc <- prev_prob %>% mutate(obs=test$y) %>%
gather(key=methode,value=score,complet,step)
ggplot(df_roc)+aes(d=obs,m=score,color=methode)+ geom_roc()+
theme_classic()
df_roc %>% group_by(methode) %>% summarize(AUC=pROC::auc(obs,score))
# A tibble: 2 x 2
methode AUC
<chr> <dbl>
1 complet 0.933
2 step 0.935
LS0tDQp0aXRsZTogIlJlZ3Jlc3Npb24gTG9naXN0aXF1ZSINCmF1dGhvcjogIkh1c3NvbiBldCBhbC4iDQpkYXRlOiAiMDUvMDkvMjAxOCINCm91dHB1dDoNCiAgaHRtbF9ub3RlYm9vazoNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogMw0KICAgIHRvY19mbG9hdDogeWVzDQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdG9jOiB5ZXMNCiAgICB0b2NfZGVwdGg6ICczJw0KICAgIHRvY19mbG9hdDogeWVzDQotLS0NCg0KIyAxLiBJbXBvcnRlciBsZXMgZG9ubsOpZXMNCg0KYGBge3J9DQpiYW5rIDwtIHJlYWQudGFibGUoImh0dHBzOi8vci1zdGF0LXNjLWRvbm5lZXMuZ2l0aHViLmlvL2JhbmstYWRkaXRpb25hbC5jc3YiLGhlYWRlcj1UUlVFLHNlcD0iOyIpDQpzdW1tYXJ5KGJhbmspWyxjKDE6MywyMSldDQpgYGANCg0KIyAyLiBDb25zdHJ1aXJlIGxlIG1vZMOobGUNCg0KYGBge3IgfQ0Kc2V0LnNlZWQoNTY3OCkNCnBlcm0gPC0gc2FtcGxlKG5yb3coYmFuayksMzAwMCkNCmFwcCA8LSBiYW5rW3Blcm0sXQ0KdGVzdCA8LSBiYW5rWy1wZXJtLF0NCmBgYA0KDQpgYGB7cn0NCmxvZ2l0X2NvbXBsZXQgPC0gZ2xtKHl+LixkYXRhPWFwcCxmYW1pbHk9Ymlub21pYWwpDQpgYGANCg0KYGBge3J9DQpjb2VmKGxvZ2l0X2NvbXBsZXQpW2lzLm5hKGNvZWYobG9naXRfY29tcGxldCkpXQ0KYGBgDQoNCmBgYHtyfQ0KdGFibGUoYXBwJGxvYW4sYXBwJGhvdXNpbmcpDQpgYGANCg0KYGBge3J9DQpzdW1tYXJ5KGxvZ2l0X2NvbXBsZXQpDQpgYGANCg0KYGBge3IsIG1lc3NhZ2U9RkFMU0Usd2FybmluZz1GQUxTRX0NCmxpYnJhcnkoY2FyKQ0KQW5vdmEobG9naXRfY29tcGxldCx0eXBlPTMsdGVzdC5zdGF0aXN0aWMgPSAiTFIiLHNpbmd1bGFyLm9rPVRSVUUpDQpgYGANCg0KIyAzLiBTw6lsZWN0aW9ubmVyIHVuIG1vZMOobGUNCg0KYGBge3J9DQpsb2dpdF9zdGVwIDwtIHN0ZXAobG9naXRfY29tcGxldCxkaXJlY3Rpb249ImJhY2t3YXJkIiwgdHJhY2U9MCkNCkFub3ZhKGxvZ2l0X3N0ZXAsdHlwZT0zLHRlc3Quc3RhdGlzdGljID0gIkxSIikNCmBgYA0KDQpgYGB7cn0NCmFub3ZhKGxvZ2l0X3N0ZXAsbG9naXRfY29tcGxldCx0ZXN0PSJMUlQiKQ0KYGBgDQoNCiMgNC4gRmFpcmUgZGUgbGEgcHLDqXZpc2lvbg0KDQpgYGB7cn0NCnByZXZfc3RlcCA8LSBwcmVkaWN0KGxvZ2l0X3N0ZXAsbmV3ZGF0YT10ZXN0LHR5cGU9InJlc3BvbnNlIikNCnByZXZfc3RlcFsxOjVdDQpgYGANCg0KYGBge3J9DQpwcmV2X3Byb2IgPC0gZGF0YS5mcmFtZShjb21wbGV0PXByZWRpY3QobG9naXRfY29tcGxldCxuZXdkYXRhPXRlc3QsDQp0eXBlPSJyZXNwb25zZSIpLHN0ZXA9cHJlZGljdChsb2dpdF9zdGVwLG5ld2RhdGE9dGVzdCx0eXBlPSJyZXNwb25zZSIpKQ0KaGVhZChyb3VuZChwcmV2X3Byb2IsMyksIG49MykNCmBgYA0KYGBge3J9DQpwcmV2X2NsYXNzIDwtIGFwcGx5KHByZXZfcHJvYj49MC41LDIsZmFjdG9yLGxhYmVscz1jKCJubyIsInllcyIpKQ0KaGVhZChwcmV2X2NsYXNzLCBuPTMpDQpgYGANCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpwcmV2X2NsYXNzIDwtIGRhdGEuZnJhbWUocHJldl9jbGFzcykNCnByZXZfY2xhc3MgJT4lIG11dGF0ZShvYnM9dGVzdCR5KSAlPiUNCnN1bW1hcmlzZV9hbGwoZnVucyhlcnI9bWVhbihvYnMhPS4pKSkgJT4lIHNlbGVjdCgtb2JzX2VycikgJT4lIHJvdW5kKDMpDQpgYGANCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShwbG90Uk9DKQ0KZGZfcm9jIDwtIHByZXZfcHJvYiAlPiUgbXV0YXRlKG9icz10ZXN0JHkpICU+JQ0KZ2F0aGVyKGtleT1tZXRob2RlLHZhbHVlPXNjb3JlLGNvbXBsZXQsc3RlcCkNCmdncGxvdChkZl9yb2MpK2FlcyhkPW9icyxtPXNjb3JlLGNvbG9yPW1ldGhvZGUpKyBnZW9tX3JvYygpKw0KdGhlbWVfY2xhc3NpYygpDQpgYGANCg0KYGBge3J9DQpkZl9yb2MgJT4lIGdyb3VwX2J5KG1ldGhvZGUpICU+JSBzdW1tYXJpemUoQVVDPXBST0M6OmF1YyhvYnMsc2NvcmUpKQ0KYGBgDQoNCg==