5.2.1 Le package dplyr
library(tidyverse)
set.seed(1234)
tbl <- tibble(group1 = rep(c("A", "B"), 5000),
group2 = rep(c("C", "D"), each = 5000),
value = rnorm(10000),
weight = sample(1:10, 10000, replace = TRUE))
tbl
# A tibble: 10,000 x 4
group1 group2 value weight
<chr> <chr> <dbl> <int>
1 A C -1.21 1
2 B C 0.277 9
3 A C 1.08 8
4 B C -2.35 7
5 A C 0.429 7
6 B C 0.506 6
7 A C -0.575 6
8 B C -0.547 6
9 A C -0.564 10
10 B C -0.890 3
# ... with 9,990 more rows
tbl[1:2, ]
# A tibble: 2 x 4
group1 group2 value weight
<chr> <chr> <dbl> <int>
1 A C -1.21 1
2 B C 0.277 9
tbl[c(1,5), ]
# A tibble: 2 x 4
group1 group2 value weight
<chr> <chr> <dbl> <int>
1 A C -1.21 1
2 A C 0.429 7
tbl[, c("group1", "value")]
# A tibble: 10,000 x 2
group1 value
<chr> <dbl>
1 A -1.21
2 B 0.277
3 A 1.08
4 B -2.35
5 A 0.429
6 B 0.506
7 A -0.575
8 B -0.547
9 A -0.564
10 B -0.890
# ... with 9,990 more rows
slice(tbl, 1:2)
# A tibble: 2 x 4
group1 group2 value weight
<chr> <chr> <dbl> <int>
1 A C -1.21 1
2 B C 0.277 9
select(tbl, group1, value)
# A tibble: 10,000 x 2
group1 value
<chr> <dbl>
1 A -1.21
2 B 0.277
3 A 1.08
4 B -2.35
5 A 0.429
6 B 0.506
7 A -0.575
8 B -0.547
9 A -0.564
10 B -0.890
# ... with 9,990 more rows
tbl[["value"]]
pull(tbl, value)
tmp <- select(tbl, group1, weight)
filter(tmp, weight > 9)
# A tibble: 1,011 x 2
group1 weight
<chr> <int>
1 A 10
2 B 10
3 B 10
4 A 10
5 A 10
6 B 10
7 A 10
8 A 10
9 B 10
10 B 10
# ... with 1,001 more rows
filter(select(tbl, group1, weight), weight > 9)
# A tibble: 1,011 x 2
group1 weight
<chr> <int>
1 A 10
2 B 10
3 B 10
4 A 10
5 A 10
6 B 10
7 A 10
8 A 10
9 B 10
10 B 10
# ... with 1,001 more rows
tbl %>% select(group1, weight) %>% filter(weight > 9)
# A tibble: 1,011 x 2
group1 weight
<chr> <int>
1 A 10
2 B 10
3 B 10
4 A 10
5 A 10
6 B 10
7 A 10
8 A 10
9 B 10
10 B 10
# ... with 1,001 more rows
5.2.2 Manipulation de tables
tbl %>% select(mygr = group1, myval = value) %>% slice(1:2)
# A tibble: 2 x 2
mygr myval
<chr> <dbl>
1 A -1.21
2 B 0.277
tbl %>% mutate(tval = trunc(value)) %>% slice(1:2)
# A tibble: 2 x 5
group1 group2 value weight tval
<chr> <chr> <dbl> <int> <dbl>
1 A C -1.21 1 -1
2 B C 0.277 9 0
tbl <- tbl %>% mutate(tvalue = trunc(value), rvalue = round(value,2))
tbl %>% mutate(tvalue = tvalue + 10)
# A tibble: 10,000 x 6
group1 group2 value weight tvalue rvalue
<chr> <chr> <dbl> <int> <dbl> <dbl>
1 A C -1.21 1 9 -1.21
2 B C 0.277 9 10 0.28
3 A C 1.08 8 11 1.08
4 B C -2.35 7 8 -2.35
5 A C 0.429 7 10 0.43
6 B C 0.506 6 10 0.51
7 A C -0.575 6 10 -0.570
8 B C -0.547 6 10 -0.55
9 A C -0.564 10 10 -0.56
10 B C -0.890 3 10 -0.89
# ... with 9,990 more rows
tbl %>% select(-rvalue)
# A tibble: 10,000 x 5
group1 group2 value weight tvalue
<chr> <chr> <dbl> <int> <dbl>
1 A C -1.21 1 -1
2 B C 0.277 9 0
3 A C 1.08 8 1
4 B C -2.35 7 -2
5 A C 0.429 7 0
6 B C 0.506 6 0
7 A C -0.575 6 0
8 B C -0.547 6 0
9 A C -0.564 10 0
10 B C -0.890 3 0
# ... with 9,990 more rows
tbl %>% mutate(rvalue = NULL)
# A tibble: 10,000 x 5
group1 group2 value weight tvalue
<chr> <chr> <dbl> <int> <dbl>
1 A C -1.21 1 -1
2 B C 0.277 9 0
3 A C 1.08 8 1
4 B C -2.35 7 -2
5 A C 0.429 7 0
6 B C 0.506 6 0
7 A C -0.575 6 0
8 B C -0.547 6 0
9 A C -0.564 10 0
10 B C -0.890 3 0
# ... with 9,990 more rows
tbl %>% mutate(weight = if_else(group1 %in% "A",
weight * 10L, weight)) %>% slice(1:2)
# A tibble: 2 x 6
group1 group2 value weight tvalue rvalue
<chr> <chr> <dbl> <int> <dbl> <dbl>
1 A C -1.21 10 -1 -1.21
2 B C 0.277 9 0 0.28
tbl %>% summarize(sum(value))
# A tibble: 1 x 1
`sum(value)`
<dbl>
1 61.2
tbl %>% summarize(somme = sum(value), moyenne = mean(value))
# A tibble: 1 x 2
somme moyenne
<dbl> <dbl>
1 61.2 0.00612
tbl %>% arrange(desc(weight)) %>% slice(1:2)
# A tibble: 2 x 6
group1 group2 value weight tvalue rvalue
<chr> <chr> <dbl> <int> <dbl> <dbl>
1 A C -0.564 10 0 -0.56
2 B C -0.911 10 0 -0.91
tbl %>% group_by(group1, group2) %>% summarize(sum(value))
tbl %>% group_by(pop = group1, poids = weight > 5) %>%
summarize(somme = sum(value))
tbl %>% group_by(group1) %>% mutate(mean_group1 = mean(value))
tbl %>% filter(weight > 5) %>% group_by(group1, group2) %>%
summarize(N = n()) %>% arrange(desc(N))
5.2.3 Pour aller plus loin
setwd("C:/Users/husson/Dropbox/Rpourlastatetladatascience/donnees")
The working directory was changed to C:/Users/husson/Dropbox/Rpourlastatetladatascience/donnees inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the the working directory for notebook chunks.
xlsx <- "chanson-française.xlsx"
readxl::excel_sheets(xlsx)
[1] "chanteurs" "albums"
chanteurs <- readxl::read_excel(xlsx, sheet="chanteurs")
albums <- readxl::read_excel(xlsx, sheet="albums")
chanteurs
# A tibble: 4 x 4
prenom nom naissance mort
<chr> <chr> <dbl> <dbl>
1 Georges Brassens 1921 1981
2 Léo Ferré 1916 1993
3 Jacques Brel 1929 1978
4 Renaud Séchan 1952 NA
albums
Warning in is(as, "crayon") :
closing unused connection 3 (https://r-stat-sc-donnees.github.io/chanson-française.xlsx)
# A tibble: 76 x 4
titre annee prenom nom
<chr> <dbl> <chr> <chr>
1 La Mauvaise Réputation 1952 Georges Brassens
2 Le Vent 1953 Georges Brassens
3 Les Sabots d'Hélène 1954 Georges Brassens
4 Je me suis fait tout petit 1956 Georges Brassens
5 Oncle Archibald 1957 Georges Brassens
6 Le Pornographe 1958 Georges Brassens
7 Les Funérailles d'antan 1960 Georges Brassens
8 Le temps ne fait rien à l'affaire 1961 Georges Brassens
9 Les Trompettes de la renommée 1962 Georges Brassens
10 Les Copains d'abord 1964 Georges Brassens
# ... with 66 more rows
left_join(chanteurs, albums, by=c("prenom", "nom"))
# A tibble: 65 x 6
prenom nom naissance mort titre annee
<chr> <chr> <dbl> <dbl> <chr> <dbl>
1 Georges Brassens 1921 1981 La Mauvaise Réputation 1952
2 Georges Brassens 1921 1981 Le Vent 1953
3 Georges Brassens 1921 1981 Les Sabots d'Hélène 1954
4 Georges Brassens 1921 1981 Je me suis fait tout petit 1956
5 Georges Brassens 1921 1981 Oncle Archibald 1957
6 Georges Brassens 1921 1981 Le Pornographe 1958
7 Georges Brassens 1921 1981 Les Funérailles d'antan 1960
8 Georges Brassens 1921 1981 Le temps ne fait rien à l'affaire 1961
9 Georges Brassens 1921 1981 Les Trompettes de la renommée 1962
10 Georges Brassens 1921 1981 Les Copains d'abord 1964
# ... with 55 more rows
albums %>%
filter(annee>1968) %>%
group_by(prenom, nom) %>%
summarise(post_soixante_huit=n()) %>%
left_join(chanteurs, by=c("prenom", "nom")) %>%
select(prenom, nom, naissance, mort, post_soixante_huit)
tbl %>% group_by(group1) %>% select(value, weight) %>% nest() %>%
mutate(data = map(data, ~ map_df(., mean))) %>% unnest()
Adding missing grouping variables: `group1`
# A tibble: 2 x 3
group1 value weight
<chr> <dbl> <dbl>
1 A 0.0274 5.51
2 B -0.0152 5.58
LS0tDQp0aXRsZTogIkxlIHBhY2thZ2UgZHBseXIgZXQgbGUgdGlkeXZlcnNlIg0KYXV0aG9yOiAiSHVzc29uIGV0IGFsLiINCmRhdGU6ICIwOS8wOS8yMDE4Ig0Kb3V0cHV0Og0KICBodG1sX25vdGVib29rOg0KICAgIHRvYzogeWVzDQogICAgdG9jX2RlcHRoOiAzDQogICAgdG9jX2Zsb2F0OiB5ZXMNCiAgaHRtbF9kb2N1bWVudDoNCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogJzMnDQogICAgdG9jX2Zsb2F0OiB5ZXMNCi0tLQ0KDQpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSwgY2FjaGUgPSBUUlVFKQ0KYGBgDQoNCiMgNS4yLjEgTGUgcGFja2FnZSBkcGx5cg0KDQpgYGB7cixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCnNldC5zZWVkKDEyMzQpDQp0YmwgPC0gdGliYmxlKGdyb3VwMSA9IHJlcChjKCJBIiwgIkIiKSwgNTAwMCksDQogICAgICAgICAgICAgICAgZ3JvdXAyID0gcmVwKGMoIkMiLCAiRCIpLCBlYWNoID0gNTAwMCksDQogICAgICAgICAgICAgICAgdmFsdWUgPSBybm9ybSgxMDAwMCksDQogICAgICAgICAgICAgICAgd2VpZ2h0ID0gc2FtcGxlKDE6MTAsIDEwMDAwLCByZXBsYWNlID0gVFJVRSkpDQogdGJsDQpgYGANCg0KYGBge3J9DQp0YmxbMToyLCBdDQp0YmxbYygxLDUpLCBdDQp0YmxbLCBjKCJncm91cDEiLCAidmFsdWUiKV0NCmBgYA0KDQpgYGB7cn0NCnNsaWNlKHRibCwgMToyKQ0Kc2VsZWN0KHRibCwgZ3JvdXAxLCB2YWx1ZSkNCmBgYA0KYGBge3IscmVzdWx0cz0naGlkZSd9DQp0YmxbWyJ2YWx1ZSJdXQ0KcHVsbCh0YmwsIHZhbHVlKQ0KYGBgDQoNCmBgYHtyfQ0KdG1wIDwtIHNlbGVjdCh0YmwsIGdyb3VwMSwgd2VpZ2h0KQ0KZmlsdGVyKHRtcCwgd2VpZ2h0ID4gOSkNCmZpbHRlcihzZWxlY3QodGJsLCBncm91cDEsIHdlaWdodCksIHdlaWdodCA+IDkpDQp0YmwgJT4lIHNlbGVjdChncm91cDEsIHdlaWdodCkgJT4lIGZpbHRlcih3ZWlnaHQgPiA5KQ0KYGBgDQoNCiMgNS4yLjIgTWFuaXB1bGF0aW9uIGRlIHRhYmxlcw0KDQpgYGB7cn0NCnRibCAlPiUgc2VsZWN0KG15Z3IgPSBncm91cDEsIG15dmFsID0gdmFsdWUpICU+JSBzbGljZSgxOjIpDQp0YmwgJT4lIG11dGF0ZSh0dmFsID0gdHJ1bmModmFsdWUpKSAlPiUgc2xpY2UoMToyKQ0KdGJsIDwtIHRibCAlPiUgbXV0YXRlKHR2YWx1ZSA9IHRydW5jKHZhbHVlKSwgcnZhbHVlID0gcm91bmQodmFsdWUsMikpDQp0YmwgJT4lIG11dGF0ZSh0dmFsdWUgPSB0dmFsdWUgKyAxMCkNCnRibCAlPiUgc2VsZWN0KC1ydmFsdWUpDQp0YmwgJT4lIG11dGF0ZShydmFsdWUgPSBOVUxMKQ0KdGJsICU+JSBtdXRhdGUod2VpZ2h0ID0gaWZfZWxzZShncm91cDEgJWluJSAiQSIsDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHdlaWdodCAqIDEwTCwgd2VpZ2h0KSkgJT4lIHNsaWNlKDE6MikNCnRibCAlPiUgc3VtbWFyaXplKHN1bSh2YWx1ZSkpDQp0YmwgJT4lIHN1bW1hcml6ZShzb21tZSA9IHN1bSh2YWx1ZSksIG1veWVubmUgPSBtZWFuKHZhbHVlKSkNCnRibCAlPiUgYXJyYW5nZShkZXNjKHdlaWdodCkpICU+JSBzbGljZSgxOjIpDQp0YmwgJT4lIGdyb3VwX2J5KGdyb3VwMSwgZ3JvdXAyKSAlPiUgc3VtbWFyaXplKHN1bSh2YWx1ZSkpDQp0YmwgJT4lIGdyb3VwX2J5KHBvcCA9IGdyb3VwMSwgcG9pZHMgPSB3ZWlnaHQgPiA1KSAlPiUNCiAgc3VtbWFyaXplKHNvbW1lID0gc3VtKHZhbHVlKSkNCnRibCAlPiUgZ3JvdXBfYnkoZ3JvdXAxKSAlPiUgbXV0YXRlKG1lYW5fZ3JvdXAxID0gbWVhbih2YWx1ZSkpDQp0YmwgJT4lIGZpbHRlcih3ZWlnaHQgPiA1KSAlPiUgZ3JvdXBfYnkoZ3JvdXAxLCBncm91cDIpICU+JQ0KICBzdW1tYXJpemUoTiA9IG4oKSkgJT4lIGFycmFuZ2UoZGVzYyhOKSkNCmBgYA0KDQojIDUuMi4zIFBvdXIgYWxsZXIgcGx1cyBsb2luDQoNCmBgYHtyfQ0Kc2V0d2QoIkM6L1VzZXJzL2h1c3Nvbi9Ecm9wYm94L1Jwb3VybGFzdGF0ZXRsYWRhdGFzY2llbmNlL2Rvbm5lZXMiKQ0KeGxzeCA8LSAiY2hhbnNvbi1mcmFuw6dhaXNlLnhsc3giDQpyZWFkeGw6OmV4Y2VsX3NoZWV0cyh4bHN4KQ0KY2hhbnRldXJzIDwtIHJlYWR4bDo6cmVhZF9leGNlbCh4bHN4LCBzaGVldD0iY2hhbnRldXJzIikNCmFsYnVtcyA8LSByZWFkeGw6OnJlYWRfZXhjZWwoeGxzeCwgc2hlZXQ9ImFsYnVtcyIpDQpjaGFudGV1cnMNCmFsYnVtcw0KbGVmdF9qb2luKGNoYW50ZXVycywgYWxidW1zLCBieT1jKCJwcmVub20iLCAibm9tIikpDQpgYGANCg0KYGBge3J9DQphbGJ1bXMgJT4lDQogIGZpbHRlcihhbm5lZT4xOTY4KSAlPiUNCiAgZ3JvdXBfYnkocHJlbm9tLCBub20pICU+JQ0KICBzdW1tYXJpc2UocG9zdF9zb2l4YW50ZV9odWl0PW4oKSkgJT4lDQogIGxlZnRfam9pbihjaGFudGV1cnMsIGJ5PWMoInByZW5vbSIsICJub20iKSkgJT4lDQogIHNlbGVjdChwcmVub20sIG5vbSwgbmFpc3NhbmNlLCBtb3J0LCBwb3N0X3NvaXhhbnRlX2h1aXQpDQpgYGANCg0KYGBge3J9DQp0YmwgJT4lIGdyb3VwX2J5KGdyb3VwMSkgJT4lIHNlbGVjdCh2YWx1ZSwgd2VpZ2h0KSAlPiUgbmVzdCgpICU+JQ0KICBtdXRhdGUoZGF0YSA9IG1hcChkYXRhLCB+IG1hcF9kZiguLCBtZWFuKSkpICU+JSB1bm5lc3QoKQ0KYGBg