5.2.1 Le package dplyr

library(tidyverse)
set.seed(1234)
tbl <- tibble(group1 = rep(c("A", "B"), 5000),
                group2 = rep(c("C", "D"), each = 5000),
                value = rnorm(10000),
                weight = sample(1:10, 10000, replace = TRUE))
 tbl
# A tibble: 10,000 x 4
   group1 group2  value weight
   <chr>  <chr>   <dbl>  <int>
 1 A      C      -1.21       1
 2 B      C       0.277      9
 3 A      C       1.08       8
 4 B      C      -2.35       7
 5 A      C       0.429      7
 6 B      C       0.506      6
 7 A      C      -0.575      6
 8 B      C      -0.547      6
 9 A      C      -0.564     10
10 B      C      -0.890      3
# ... with 9,990 more rows
tbl[1:2, ]
# A tibble: 2 x 4
  group1 group2  value weight
  <chr>  <chr>   <dbl>  <int>
1 A      C      -1.21       1
2 B      C       0.277      9
tbl[c(1,5), ]
# A tibble: 2 x 4
  group1 group2  value weight
  <chr>  <chr>   <dbl>  <int>
1 A      C      -1.21       1
2 A      C       0.429      7
tbl[, c("group1", "value")]
# A tibble: 10,000 x 2
   group1  value
   <chr>   <dbl>
 1 A      -1.21 
 2 B       0.277
 3 A       1.08 
 4 B      -2.35 
 5 A       0.429
 6 B       0.506
 7 A      -0.575
 8 B      -0.547
 9 A      -0.564
10 B      -0.890
# ... with 9,990 more rows
slice(tbl, 1:2)
# A tibble: 2 x 4
  group1 group2  value weight
  <chr>  <chr>   <dbl>  <int>
1 A      C      -1.21       1
2 B      C       0.277      9
select(tbl, group1, value)
# A tibble: 10,000 x 2
   group1  value
   <chr>   <dbl>
 1 A      -1.21 
 2 B       0.277
 3 A       1.08 
 4 B      -2.35 
 5 A       0.429
 6 B       0.506
 7 A      -0.575
 8 B      -0.547
 9 A      -0.564
10 B      -0.890
# ... with 9,990 more rows
tbl[["value"]]
pull(tbl, value)
tmp <- select(tbl, group1, weight)
filter(tmp, weight > 9)
# A tibble: 1,011 x 2
   group1 weight
   <chr>   <int>
 1 A          10
 2 B          10
 3 B          10
 4 A          10
 5 A          10
 6 B          10
 7 A          10
 8 A          10
 9 B          10
10 B          10
# ... with 1,001 more rows
filter(select(tbl, group1, weight), weight > 9)
# A tibble: 1,011 x 2
   group1 weight
   <chr>   <int>
 1 A          10
 2 B          10
 3 B          10
 4 A          10
 5 A          10
 6 B          10
 7 A          10
 8 A          10
 9 B          10
10 B          10
# ... with 1,001 more rows
tbl %>% select(group1, weight) %>% filter(weight > 9)
# A tibble: 1,011 x 2
   group1 weight
   <chr>   <int>
 1 A          10
 2 B          10
 3 B          10
 4 A          10
 5 A          10
 6 B          10
 7 A          10
 8 A          10
 9 B          10
10 B          10
# ... with 1,001 more rows

5.2.2 Manipulation de tables

tbl %>% select(mygr = group1, myval = value) %>% slice(1:2)
# A tibble: 2 x 2
  mygr   myval
  <chr>  <dbl>
1 A     -1.21 
2 B      0.277
tbl %>% mutate(tval = trunc(value)) %>% slice(1:2)
# A tibble: 2 x 5
  group1 group2  value weight  tval
  <chr>  <chr>   <dbl>  <int> <dbl>
1 A      C      -1.21       1    -1
2 B      C       0.277      9     0
tbl <- tbl %>% mutate(tvalue = trunc(value), rvalue = round(value,2))
tbl %>% mutate(tvalue = tvalue + 10)
# A tibble: 10,000 x 6
   group1 group2  value weight tvalue rvalue
   <chr>  <chr>   <dbl>  <int>  <dbl>  <dbl>
 1 A      C      -1.21       1      9 -1.21 
 2 B      C       0.277      9     10  0.28 
 3 A      C       1.08       8     11  1.08 
 4 B      C      -2.35       7      8 -2.35 
 5 A      C       0.429      7     10  0.43 
 6 B      C       0.506      6     10  0.51 
 7 A      C      -0.575      6     10 -0.570
 8 B      C      -0.547      6     10 -0.55 
 9 A      C      -0.564     10     10 -0.56 
10 B      C      -0.890      3     10 -0.89 
# ... with 9,990 more rows
tbl %>% select(-rvalue)
# A tibble: 10,000 x 5
   group1 group2  value weight tvalue
   <chr>  <chr>   <dbl>  <int>  <dbl>
 1 A      C      -1.21       1     -1
 2 B      C       0.277      9      0
 3 A      C       1.08       8      1
 4 B      C      -2.35       7     -2
 5 A      C       0.429      7      0
 6 B      C       0.506      6      0
 7 A      C      -0.575      6      0
 8 B      C      -0.547      6      0
 9 A      C      -0.564     10      0
10 B      C      -0.890      3      0
# ... with 9,990 more rows
tbl %>% mutate(rvalue = NULL)
# A tibble: 10,000 x 5
   group1 group2  value weight tvalue
   <chr>  <chr>   <dbl>  <int>  <dbl>
 1 A      C      -1.21       1     -1
 2 B      C       0.277      9      0
 3 A      C       1.08       8      1
 4 B      C      -2.35       7     -2
 5 A      C       0.429      7      0
 6 B      C       0.506      6      0
 7 A      C      -0.575      6      0
 8 B      C      -0.547      6      0
 9 A      C      -0.564     10      0
10 B      C      -0.890      3      0
# ... with 9,990 more rows
tbl %>% mutate(weight = if_else(group1 %in% "A",
                                weight * 10L, weight)) %>% slice(1:2)
# A tibble: 2 x 6
  group1 group2  value weight tvalue rvalue
  <chr>  <chr>   <dbl>  <int>  <dbl>  <dbl>
1 A      C      -1.21      10     -1  -1.21
2 B      C       0.277      9      0   0.28
tbl %>% summarize(sum(value))
# A tibble: 1 x 1
  `sum(value)`
         <dbl>
1         61.2
tbl %>% summarize(somme = sum(value), moyenne = mean(value))
# A tibble: 1 x 2
  somme moyenne
  <dbl>   <dbl>
1  61.2 0.00612
tbl %>% arrange(desc(weight)) %>% slice(1:2)
# A tibble: 2 x 6
  group1 group2  value weight tvalue rvalue
  <chr>  <chr>   <dbl>  <int>  <dbl>  <dbl>
1 A      C      -0.564     10      0  -0.56
2 B      C      -0.911     10      0  -0.91
tbl %>% group_by(group1, group2) %>% summarize(sum(value))
tbl %>% group_by(pop = group1, poids = weight > 5) %>%
  summarize(somme = sum(value))
tbl %>% group_by(group1) %>% mutate(mean_group1 = mean(value))
tbl %>% filter(weight > 5) %>% group_by(group1, group2) %>%
  summarize(N = n()) %>% arrange(desc(N))

5.2.3 Pour aller plus loin

setwd("C:/Users/husson/Dropbox/Rpourlastatetladatascience/donnees")
The working directory was changed to C:/Users/husson/Dropbox/Rpourlastatetladatascience/donnees inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the the working directory for notebook chunks.
xlsx <- "chanson-française.xlsx"
readxl::excel_sheets(xlsx)
[1] "chanteurs" "albums"   
chanteurs <- readxl::read_excel(xlsx, sheet="chanteurs")
albums <- readxl::read_excel(xlsx, sheet="albums")
chanteurs
# A tibble: 4 x 4
  prenom  nom      naissance  mort
  <chr>   <chr>        <dbl> <dbl>
1 Georges Brassens      1921  1981
2 Léo     Ferré         1916  1993
3 Jacques Brel          1929  1978
4 Renaud  Séchan        1952    NA
albums
Warning in is(as, "crayon") :
  closing unused connection 3 (https://r-stat-sc-donnees.github.io/chanson-française.xlsx)
# A tibble: 76 x 4
   titre                             annee prenom  nom     
   <chr>                             <dbl> <chr>   <chr>   
 1 La Mauvaise Réputation             1952 Georges Brassens
 2 Le Vent                            1953 Georges Brassens
 3 Les Sabots d'Hélène                1954 Georges Brassens
 4 Je me suis fait tout petit         1956 Georges Brassens
 5 Oncle Archibald                    1957 Georges Brassens
 6 Le Pornographe                     1958 Georges Brassens
 7 Les Funérailles d'antan            1960 Georges Brassens
 8 Le temps ne fait rien à l'affaire  1961 Georges Brassens
 9 Les Trompettes de la renommée      1962 Georges Brassens
10 Les Copains d'abord                1964 Georges Brassens
# ... with 66 more rows
left_join(chanteurs, albums, by=c("prenom", "nom"))
# A tibble: 65 x 6
   prenom  nom      naissance  mort titre                             annee
   <chr>   <chr>        <dbl> <dbl> <chr>                             <dbl>
 1 Georges Brassens      1921  1981 La Mauvaise Réputation             1952
 2 Georges Brassens      1921  1981 Le Vent                            1953
 3 Georges Brassens      1921  1981 Les Sabots d'Hélène                1954
 4 Georges Brassens      1921  1981 Je me suis fait tout petit         1956
 5 Georges Brassens      1921  1981 Oncle Archibald                    1957
 6 Georges Brassens      1921  1981 Le Pornographe                     1958
 7 Georges Brassens      1921  1981 Les Funérailles d'antan            1960
 8 Georges Brassens      1921  1981 Le temps ne fait rien à l'affaire  1961
 9 Georges Brassens      1921  1981 Les Trompettes de la renommée      1962
10 Georges Brassens      1921  1981 Les Copains d'abord                1964
# ... with 55 more rows
albums %>%
  filter(annee>1968) %>%
  group_by(prenom, nom) %>%
  summarise(post_soixante_huit=n()) %>%
  left_join(chanteurs, by=c("prenom", "nom")) %>%
  select(prenom, nom, naissance, mort, post_soixante_huit)
tbl %>% group_by(group1) %>% select(value, weight) %>% nest() %>%
  mutate(data = map(data, ~ map_df(., mean))) %>% unnest()
Adding missing grouping variables: `group1`
# A tibble: 2 x 3
  group1   value weight
  <chr>    <dbl>  <dbl>
1 A       0.0274   5.51
2 B      -0.0152   5.58
LS0tDQp0aXRsZTogIkxlIHBhY2thZ2UgZHBseXIgZXQgbGUgdGlkeXZlcnNlIg0KYXV0aG9yOiAiSHVzc29uIGV0IGFsLiINCmRhdGU6ICIwOS8wOS8yMDE4Ig0Kb3V0cHV0Og0KICBodG1sX25vdGVib29rOg0KICAgIHRvYzogeWVzDQogICAgdG9jX2RlcHRoOiAzDQogICAgdG9jX2Zsb2F0OiB5ZXMNCiAgaHRtbF9kb2N1bWVudDoNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogJzMnDQogICAgdG9jX2Zsb2F0OiB5ZXMNCi0tLQ0KDQpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSwgY2FjaGUgPSBUUlVFKQ0KYGBgDQoNCiMgNS4yLjEgTGUgcGFja2FnZSBkcGx5cg0KDQpgYGB7cixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCnNldC5zZWVkKDEyMzQpDQp0YmwgPC0gdGliYmxlKGdyb3VwMSA9IHJlcChjKCJBIiwgIkIiKSwgNTAwMCksDQogICAgICAgICAgICAgICAgZ3JvdXAyID0gcmVwKGMoIkMiLCAiRCIpLCBlYWNoID0gNTAwMCksDQogICAgICAgICAgICAgICAgdmFsdWUgPSBybm9ybSgxMDAwMCksDQogICAgICAgICAgICAgICAgd2VpZ2h0ID0gc2FtcGxlKDE6MTAsIDEwMDAwLCByZXBsYWNlID0gVFJVRSkpDQogdGJsDQpgYGANCg0KYGBge3J9DQp0YmxbMToyLCBdDQp0YmxbYygxLDUpLCBdDQp0YmxbLCBjKCJncm91cDEiLCAidmFsdWUiKV0NCmBgYA0KDQpgYGB7cn0NCnNsaWNlKHRibCwgMToyKQ0Kc2VsZWN0KHRibCwgZ3JvdXAxLCB2YWx1ZSkNCmBgYA0KYGBge3IscmVzdWx0cz0naGlkZSd9DQp0YmxbWyJ2YWx1ZSJdXQ0KcHVsbCh0YmwsIHZhbHVlKQ0KYGBgDQoNCmBgYHtyfQ0KdG1wIDwtIHNlbGVjdCh0YmwsIGdyb3VwMSwgd2VpZ2h0KQ0KZmlsdGVyKHRtcCwgd2VpZ2h0ID4gOSkNCmZpbHRlcihzZWxlY3QodGJsLCBncm91cDEsIHdlaWdodCksIHdlaWdodCA+IDkpDQp0YmwgJT4lIHNlbGVjdChncm91cDEsIHdlaWdodCkgJT4lIGZpbHRlcih3ZWlnaHQgPiA5KQ0KYGBgDQoNCiMgNS4yLjIgTWFuaXB1bGF0aW9uIGRlIHRhYmxlcw0KDQpgYGB7cn0NCnRibCAlPiUgc2VsZWN0KG15Z3IgPSBncm91cDEsIG15dmFsID0gdmFsdWUpICU+JSBzbGljZSgxOjIpDQp0YmwgJT4lIG11dGF0ZSh0dmFsID0gdHJ1bmModmFsdWUpKSAlPiUgc2xpY2UoMToyKQ0KdGJsIDwtIHRibCAlPiUgbXV0YXRlKHR2YWx1ZSA9IHRydW5jKHZhbHVlKSwgcnZhbHVlID0gcm91bmQodmFsdWUsMikpDQp0YmwgJT4lIG11dGF0ZSh0dmFsdWUgPSB0dmFsdWUgKyAxMCkNCnRibCAlPiUgc2VsZWN0KC1ydmFsdWUpDQp0YmwgJT4lIG11dGF0ZShydmFsdWUgPSBOVUxMKQ0KdGJsICU+JSBtdXRhdGUod2VpZ2h0ID0gaWZfZWxzZShncm91cDEgJWluJSAiQSIsDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHdlaWdodCAqIDEwTCwgd2VpZ2h0KSkgJT4lIHNsaWNlKDE6MikNCnRibCAlPiUgc3VtbWFyaXplKHN1bSh2YWx1ZSkpDQp0YmwgJT4lIHN1bW1hcml6ZShzb21tZSA9IHN1bSh2YWx1ZSksIG1veWVubmUgPSBtZWFuKHZhbHVlKSkNCnRibCAlPiUgYXJyYW5nZShkZXNjKHdlaWdodCkpICU+JSBzbGljZSgxOjIpDQp0YmwgJT4lIGdyb3VwX2J5KGdyb3VwMSwgZ3JvdXAyKSAlPiUgc3VtbWFyaXplKHN1bSh2YWx1ZSkpDQp0YmwgJT4lIGdyb3VwX2J5KHBvcCA9IGdyb3VwMSwgcG9pZHMgPSB3ZWlnaHQgPiA1KSAlPiUNCiAgc3VtbWFyaXplKHNvbW1lID0gc3VtKHZhbHVlKSkNCnRibCAlPiUgZ3JvdXBfYnkoZ3JvdXAxKSAlPiUgbXV0YXRlKG1lYW5fZ3JvdXAxID0gbWVhbih2YWx1ZSkpDQp0YmwgJT4lIGZpbHRlcih3ZWlnaHQgPiA1KSAlPiUgZ3JvdXBfYnkoZ3JvdXAxLCBncm91cDIpICU+JQ0KICBzdW1tYXJpemUoTiA9IG4oKSkgJT4lIGFycmFuZ2UoZGVzYyhOKSkNCmBgYA0KDQojIDUuMi4zIFBvdXIgYWxsZXIgcGx1cyBsb2luDQoNCmBgYHtyfQ0Kc2V0d2QoIkM6L1VzZXJzL2h1c3Nvbi9Ecm9wYm94L1Jwb3VybGFzdGF0ZXRsYWRhdGFzY2llbmNlL2Rvbm5lZXMiKQ0KeGxzeCA8LSAiY2hhbnNvbi1mcmFuw6dhaXNlLnhsc3giDQpyZWFkeGw6OmV4Y2VsX3NoZWV0cyh4bHN4KQ0KY2hhbnRldXJzIDwtIHJlYWR4bDo6cmVhZF9leGNlbCh4bHN4LCBzaGVldD0iY2hhbnRldXJzIikNCmFsYnVtcyA8LSByZWFkeGw6OnJlYWRfZXhjZWwoeGxzeCwgc2hlZXQ9ImFsYnVtcyIpDQpjaGFudGV1cnMNCmFsYnVtcw0KbGVmdF9qb2luKGNoYW50ZXVycywgYWxidW1zLCBieT1jKCJwcmVub20iLCAibm9tIikpDQpgYGANCg0KYGBge3J9DQphbGJ1bXMgJT4lDQogIGZpbHRlcihhbm5lZT4xOTY4KSAlPiUNCiAgZ3JvdXBfYnkocHJlbm9tLCBub20pICU+JQ0KICBzdW1tYXJpc2UocG9zdF9zb2l4YW50ZV9odWl0PW4oKSkgJT4lDQogIGxlZnRfam9pbihjaGFudGV1cnMsIGJ5PWMoInByZW5vbSIsICJub20iKSkgJT4lDQogIHNlbGVjdChwcmVub20sIG5vbSwgbmFpc3NhbmNlLCBtb3J0LCBwb3N0X3NvaXhhbnRlX2h1aXQpDQpgYGANCg0KYGBge3J9DQp0YmwgJT4lIGdyb3VwX2J5KGdyb3VwMSkgJT4lIHNlbGVjdCh2YWx1ZSwgd2VpZ2h0KSAlPiUgbmVzdCgpICU+JQ0KICBtdXRhdGUoZGF0YSA9IG1hcChkYXRhLCB+IG1hcF9kZiguLCBtZWFuKSkpICU+JSB1bm5lc3QoKQ0KYGBg