1. Importer les données

bank <- read.table("https://r-stat-sc-donnees.github.io/bank-additional.csv",header=TRUE,sep=";")
summary(bank)[,c(1:3,21)]
      age                 job           marital       y       
 Min.   :18.00   admin.     :1012   divorced: 446   no :3668  
 1st Qu.:32.00   blue-collar: 884   married :2509   yes: 451  
 Median :38.00   technician : 691   single  :1153             
 Mean   :40.11   services   : 393   unknown :  11             
 3rd Qu.:47.00   management : 324                             
 Max.   :88.00   retired    : 166                             
                 (Other)    : 649                             

2. Construire le modèle

set.seed(5678)
perm <- sample(nrow(bank),3000)
app <- bank[perm,]
test <- bank[-perm,]
logit_complet <- glm(y~.,data=app,family=binomial)
coef(logit_complet)[is.na(coef(logit_complet))]
loanunknown 
         NA 
table(app$loan,app$housing)
         
            no unknown  yes
  no      1169       0 1280
  unknown    0      82    0
  yes      188       0  281
summary(logit_complet)

Call:
glm(formula = y ~ ., family = binomial, data = app)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-5.1751  -0.2823  -0.1693  -0.1090   2.8013  

Coefficients: (1 not defined because of singularities)
                               Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -7.760e+01  1.496e+02  -0.519 0.604029    
age                           7.731e-03  9.583e-03   0.807 0.419816    
jobblue-collar                1.109e-01  3.172e-01   0.350 0.726681    
jobentrepreneur              -5.170e-01  5.454e-01  -0.948 0.343222    
jobhousemaid                  5.245e-01  5.049e-01   1.039 0.298855    
jobmanagement                -3.730e-01  3.478e-01  -1.072 0.283539    
jobretired                   -8.105e-02  4.117e-01  -0.197 0.843922    
jobself-employed             -5.629e-01  4.957e-01  -1.136 0.256129    
jobservices                   2.158e-01  3.342e-01   0.646 0.518342    
jobstudent                   -3.934e-01  5.108e-01  -0.770 0.441194    
jobtechnician                 3.775e-01  2.566e-01   1.471 0.141289    
jobunemployed                 3.758e-01  4.569e-01   0.823 0.410779    
jobunknown                    4.676e-01  8.207e-01   0.570 0.568830    
maritalmarried                2.996e-01  2.839e-01   1.055 0.291290    
maritalsingle                 3.723e-01  3.233e-01   1.151 0.249569    
maritalunknown                1.163e+00  1.203e+00   0.966 0.334010    
educationbasic.6y             6.380e-01  4.627e-01   1.379 0.167952    
educationbasic.9y             5.097e-01  3.758e-01   1.357 0.174919    
educationhigh.school          4.448e-01  3.650e-01   1.219 0.222988    
educationilliterate          -1.132e+01  5.354e+02  -0.021 0.983138    
educationprofessional.course  4.227e-01  3.947e-01   1.071 0.284122    
educationuniversity.degree    6.531e-01  3.722e-01   1.755 0.079306 .  
educationunknown              5.589e-01  4.525e-01   1.235 0.216751    
defaultunknown                9.906e-02  2.448e-01   0.405 0.685710    
defaultyes                   -8.551e+00  5.354e+02  -0.016 0.987257    
housingunknown               -2.296e-01  5.839e-01  -0.393 0.694126    
housingyes                   -1.235e-01  1.606e-01  -0.769 0.442069    
loanunknown                          NA         NA      NA       NA    
loanyes                       4.145e-02  2.138e-01   0.194 0.846278    
contacttelephone             -1.054e+00  3.368e-01  -3.131 0.001745 ** 
monthaug                      5.582e-01  4.962e-01   1.125 0.260630    
monthdec                      7.539e-01  8.389e-01   0.899 0.368864    
monthjul                      1.842e-01  4.203e-01   0.438 0.661099    
monthjun                      8.020e-01  5.235e-01   1.532 0.125553    
monthmar                      2.369e+00  6.365e-01   3.722 0.000198 ***
monthmay                     -3.243e-01  3.598e-01  -0.901 0.367357    
monthnov                     -4.332e-01  4.901e-01  -0.884 0.376731    
monthoct                      9.492e-03  6.175e-01   0.015 0.987736    
monthsep                      1.114e-01  7.050e-01   0.158 0.874462    
day_of_weekmon                1.620e-01  2.509e-01   0.646 0.518418    
day_of_weekthu                1.193e-02  2.524e-01   0.047 0.962317    
day_of_weektue                2.575e-02  2.518e-01   0.102 0.918548    
day_of_weekwed                2.288e-01  2.597e-01   0.881 0.378394    
duration                      5.834e-03  3.267e-04  17.855  < 2e-16 ***
campaign                     -7.646e-02  5.128e-02  -1.491 0.135946    
pdays                        -3.089e-04  7.467e-04  -0.414 0.679077    
previous                      1.526e-01  2.013e-01   0.758 0.448389    
poutcomenonexistent           6.972e-01  3.498e-01   1.993 0.046244 *  
poutcomesuccess               1.570e+00  7.208e-01   2.178 0.029412 *  
emp.var.rate                 -7.577e-01  5.681e-01  -1.334 0.182289    
cons.price.idx                1.014e+00  9.920e-01   1.022 0.306640    
cons.conf.idx                 6.823e-02  3.211e-02   2.125 0.033593 *  
euribor3m                     6.553e-02  4.970e-01   0.132 0.895105    
nr.employed                  -3.984e-03  1.199e-02  -0.332 0.739640    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2128.8  on 2999  degrees of freedom
Residual deviance: 1171.9  on 2947  degrees of freedom
AIC: 1277.9

Number of Fisher Scoring iterations: 12
library(car)
Anova(logit_complet,type=3,test.statistic = "LR",singular.ok=TRUE)
Analysis of Deviance Table (Type III tests)

Response: y
               LR Chisq Df Pr(>Chisq)    
age                0.65  1  0.4198705    
job               11.47 11  0.4046378    
marital            1.91  3  0.5918195    
education          4.38  7  0.7357118    
default            0.18  2  0.9158242    
housing            0.59  1  0.4421206    
loan               0.04  1  0.8465329    
contact           10.95  1  0.0009373 ***
month             44.71  9  1.043e-06 ***
day_of_week        1.32  4  0.8587956    
duration         486.70  1  < 2.2e-16 ***
campaign           2.47  1  0.1162119    
pdays              0.17  1  0.6812723    
previous           0.59  1  0.4411588    
poutcome           7.64  2  0.0219673 *  
emp.var.rate       1.77  1  0.1839836    
cons.price.idx     1.04  1  0.3075302    
cons.conf.idx      4.57  1  0.0324572 *  
euribor3m          0.02  1  0.8951030    
nr.employed        0.11  1  0.7393859    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

3. Sélectionner un modèle

logit_step <- step(logit_complet,direction="backward", trace=0)
Anova(logit_step,type=3,test.statistic = "LR")
Analysis of Deviance Table (Type III tests)

Response: y
               LR Chisq Df Pr(>Chisq)    
contact           14.88  1  0.0001145 ***
month             51.09  9  6.713e-08 ***
duration         483.69  1  < 2.2e-16 ***
campaign           2.99  1  0.0839311 .  
poutcome          37.24  2  8.186e-09 ***
emp.var.rate     109.95  1  < 2.2e-16 ***
cons.price.idx    47.58  1  5.273e-12 ***
cons.conf.idx     12.23  1  0.0004691 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
anova(logit_step,logit_complet,test="LRT")
Analysis of Deviance Table

Model 1: y ~ contact + month + duration + campaign + poutcome + emp.var.rate + 
    cons.price.idx + cons.conf.idx
Model 2: y ~ age + job + marital + education + default + housing + loan + 
    contact + month + day_of_week + duration + campaign + pdays + 
    previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + 
    euribor3m + nr.employed
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1      2982     1192.2                     
2      2947     1171.9 35   20.213   0.9784

4. Faire de la prévision

prev_step <- predict(logit_step,newdata=test,type="response")
prev_step[1:5]
          5          12          13          14          15 
0.007852846 0.014818676 0.005213798 0.025447271 0.242988796 
prev_prob <- data.frame(complet=predict(logit_complet,newdata=test,
type="response"),step=predict(logit_step,newdata=test,type="response"))
prediction from a rank-deficient fit may be misleading
head(round(prev_prob,3), n=3)
   complet  step
5    0.009 0.008
12   0.005 0.015
13   0.005 0.005
prev_class <- apply(prev_prob>=0.5,2,factor,labels=c("no","yes"))
head(prev_class, n=3)
   complet step
5  "no"    "no"
12 "no"    "no"
13 "no"    "no"
library(tidyverse)
prev_class <- data.frame(prev_class)
prev_class %>% mutate(obs=test$y) %>%
summarise_all(funs(err=mean(obs!=.))) %>% select(-obs_err) %>% round(3)
  complet_err step_err
1       0.074     0.08
library(plotROC)
df_roc <- prev_prob %>% mutate(obs=test$y) %>%
gather(key=methode,value=score,complet,step)
ggplot(df_roc)+aes(d=obs,m=score,color=methode)+ geom_roc()+
theme_classic()

df_roc %>% group_by(methode) %>% summarize(AUC=pROC::auc(obs,score))
# A tibble: 2 x 2
  methode   AUC
  <chr>   <dbl>
1 complet 0.933
2 step    0.935
LS0tDQp0aXRsZTogIlJlZ3Jlc3Npb24gTG9naXN0aXF1ZSINCmF1dGhvcjogIkh1c3NvbiBldCBhbC4iDQpkYXRlOiAiMDUvMDkvMjAxOCINCm91dHB1dDoNCiAgaHRtbF9ub3RlYm9vazoNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogMw0KICAgIHRvY19mbG9hdDogeWVzDQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdG9jOiB5ZXMNCiAgICB0b2NfZGVwdGg6ICczJw0KICAgIHRvY19mbG9hdDogeWVzDQotLS0NCg0KIyAxLiBJbXBvcnRlciBsZXMgZG9ubsOpZXMNCg0KYGBge3J9DQpiYW5rIDwtIHJlYWQudGFibGUoImh0dHBzOi8vci1zdGF0LXNjLWRvbm5lZXMuZ2l0aHViLmlvL2JhbmstYWRkaXRpb25hbC5jc3YiLGhlYWRlcj1UUlVFLHNlcD0iOyIpDQpzdW1tYXJ5KGJhbmspWyxjKDE6MywyMSldDQpgYGANCg0KIyAyLiBDb25zdHJ1aXJlIGxlIG1vZMOobGUNCg0KYGBge3IgfQ0Kc2V0LnNlZWQoNTY3OCkNCnBlcm0gPC0gc2FtcGxlKG5yb3coYmFuayksMzAwMCkNCmFwcCA8LSBiYW5rW3Blcm0sXQ0KdGVzdCA8LSBiYW5rWy1wZXJtLF0NCmBgYA0KDQpgYGB7cn0NCmxvZ2l0X2NvbXBsZXQgPC0gZ2xtKHl+LixkYXRhPWFwcCxmYW1pbHk9Ymlub21pYWwpDQpgYGANCg0KYGBge3J9DQpjb2VmKGxvZ2l0X2NvbXBsZXQpW2lzLm5hKGNvZWYobG9naXRfY29tcGxldCkpXQ0KYGBgDQoNCmBgYHtyfQ0KdGFibGUoYXBwJGxvYW4sYXBwJGhvdXNpbmcpDQpgYGANCg0KYGBge3J9DQpzdW1tYXJ5KGxvZ2l0X2NvbXBsZXQpDQpgYGANCg0KYGBge3IsIG1lc3NhZ2U9RkFMU0Usd2FybmluZz1GQUxTRX0NCmxpYnJhcnkoY2FyKQ0KQW5vdmEobG9naXRfY29tcGxldCx0eXBlPTMsdGVzdC5zdGF0aXN0aWMgPSAiTFIiLHNpbmd1bGFyLm9rPVRSVUUpDQpgYGANCg0KIyAzLiBTw6lsZWN0aW9ubmVyIHVuIG1vZMOobGUNCg0KYGBge3J9DQpsb2dpdF9zdGVwIDwtIHN0ZXAobG9naXRfY29tcGxldCxkaXJlY3Rpb249ImJhY2t3YXJkIiwgdHJhY2U9MCkNCkFub3ZhKGxvZ2l0X3N0ZXAsdHlwZT0zLHRlc3Quc3RhdGlzdGljID0gIkxSIikNCmBgYA0KDQpgYGB7cn0NCmFub3ZhKGxvZ2l0X3N0ZXAsbG9naXRfY29tcGxldCx0ZXN0PSJMUlQiKQ0KYGBgDQoNCiMgNC4gRmFpcmUgZGUgbGEgcHLDqXZpc2lvbg0KDQpgYGB7cn0NCnByZXZfc3RlcCA8LSBwcmVkaWN0KGxvZ2l0X3N0ZXAsbmV3ZGF0YT10ZXN0LHR5cGU9InJlc3BvbnNlIikNCnByZXZfc3RlcFsxOjVdDQpgYGANCg0KYGBge3J9DQpwcmV2X3Byb2IgPC0gZGF0YS5mcmFtZShjb21wbGV0PXByZWRpY3QobG9naXRfY29tcGxldCxuZXdkYXRhPXRlc3QsDQp0eXBlPSJyZXNwb25zZSIpLHN0ZXA9cHJlZGljdChsb2dpdF9zdGVwLG5ld2RhdGE9dGVzdCx0eXBlPSJyZXNwb25zZSIpKQ0KaGVhZChyb3VuZChwcmV2X3Byb2IsMyksIG49MykNCmBgYA0KYGBge3J9DQpwcmV2X2NsYXNzIDwtIGFwcGx5KHByZXZfcHJvYj49MC41LDIsZmFjdG9yLGxhYmVscz1jKCJubyIsInllcyIpKQ0KaGVhZChwcmV2X2NsYXNzLCBuPTMpDQpgYGANCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpwcmV2X2NsYXNzIDwtIGRhdGEuZnJhbWUocHJldl9jbGFzcykNCnByZXZfY2xhc3MgJT4lIG11dGF0ZShvYnM9dGVzdCR5KSAlPiUNCnN1bW1hcmlzZV9hbGwoZnVucyhlcnI9bWVhbihvYnMhPS4pKSkgJT4lIHNlbGVjdCgtb2JzX2VycikgJT4lIHJvdW5kKDMpDQpgYGANCg0KYGBge3IsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShwbG90Uk9DKQ0KZGZfcm9jIDwtIHByZXZfcHJvYiAlPiUgbXV0YXRlKG9icz10ZXN0JHkpICU+JQ0KZ2F0aGVyKGtleT1tZXRob2RlLHZhbHVlPXNjb3JlLGNvbXBsZXQsc3RlcCkNCmdncGxvdChkZl9yb2MpK2FlcyhkPW9icyxtPXNjb3JlLGNvbG9yPW1ldGhvZGUpKyBnZW9tX3JvYygpKw0KdGhlbWVfY2xhc3NpYygpDQpgYGANCg0KYGBge3J9DQpkZl9yb2MgJT4lIGdyb3VwX2J5KG1ldGhvZGUpICU+JSBzdW1tYXJpemUoQVVDPXBST0M6OmF1YyhvYnMsc2NvcmUpKQ0KYGBgDQoNCg==