1. Importation des textes et nettoyage

library(tidyverse)
ND <- read_lines("https://r-stat-sc-donnees.github.io/NotreDameDeParis.txt",
                 locale = locale(encoding = "UTF-8"))
head(ND)
library(stringi)
stringi::stri_enc_detect(paste(read_lines_raw("https://r-stat-sc-donnees.github.io/LesMiserables1.txt",
                                              n_max = 30), collapse = " "))
ND <- str_replace_all(ND, "_|--|’", " ")
ND <- str_replace_all(ND, "_|--|'", " ")
noms <- c("NotreDameDeParis", "LesMiserables1", "LesMiserables2",
          "LesMiserables3", "LesMiserables4", "LesMiserables5")
lecture <- function(nom) {
  read_lines(str_c("https://r-stat-sc-donnees.github.io/",nom, ".txt"),locale = locale(encoding = "UTF-8")) %>%
    gutenbergr::gutenberg_strip() %>% str_replace_all("_|--|'", " ")
}
hugo <- tibble(nom = noms) %>% 
  mutate(id = if_else(nom == "NotreDameDeParis","ND", str_c("M", str_sub(nom, start = -1L)))) %>%
  mutate(txt = map(nom, lecture))
hugo <- hugo %>% unnest() %>% group_by(id) %>%
  mutate(ligne = row_number()) %>% ungroup()
hugo %>% slice(75000:75003)

2. Codage en terme de sac de mots

hugo_mots <- hugo_tokens %>% group_by(mots) %>% summarize(n = n())
hugo_mots %>% slice(1000:1005)
hugo_mots %>% top_n(20, n) %>% arrange(desc(n)) %>% pull(mots)
hugo_mots <- hugo_tokens %>% group_by(mots) %>% summarize(n = n())
hugo_mots %>% slice(1000:1005)
hugo_mots %>% top_n(20, n) %>% arrange(desc(n)) %>% pull(mots)
library(stopwords)
stopwords(language = "fr")
hugo_mots <- hugo_mots %>%
  anti_join(get_stopwords("fr"), by = c("mots" = "word")) %>% filter(n>=10)
library(wordcloud)
hugo_mots %>% top_n(100, n) %>% {
  wordcloud(.[["mots"]], .[["n"]], min.freq = 10, max.words = 100,
            color = brewer.pal(8,"Dark2"), random.order=FALSE, scale=c(3,.5))
}

library(wordcloud)
hugo_mots %>% top_n(100, n) %>% {
  wordcloud(.[["mots"]], .[["n"]], min.freq = 10, max.words = 100,
            color = brewer.pal(8,"Dark2"), random.order=FALSE, scale=c(3,.5))
}

hugo_tokens <- hugo_tokens %>%
  mutate(doc_id = str_c(id, sprintf("_%03i",
                                    (ligne-1) %/% 100 + 1)))
hugo_mots_doc <- hugo_tokens %>% semi_join(hugo_mots) %>%
  group_by(doc_id, mots) %>% summarize(n = n())
hugo_dtm <- hugo_mots_doc %>% spread(mots, n, fill = 0)

3. Analyse des correspondances et visualisation

library(FactoMineR)
hugo_ca <- CA(hugo_dtm %>% as.data.frame() %>%
                  column_to_rownames("doc_id"), ncp = 1000, graph = FALSE)

4. Classification par l’algorithme des k-means

set.seed(42)
hugo_ca_kmeans <- kmeans(hugo_ca_coord, 4, nstart = 100)
hugo_ca_df <- hugo_ca_df %>%
  mutate(classe = as.factor(hugo_ca_kmeans$cluster))
ggplot(hugo_ca_df, aes(x = doc_id, y = 1, fill = classe)) +
  geom_raster() + scale_x_discrete(breaks=NULL) +
  geom_vline(xintercept =669)

hugo_ca_df %>% select(classe, livre) %>% table()
      livre
classe  M1  M2  M3  M4  M5  ND
     1 139 124 111 157 131   1
     2   1   0   0   0   0   0
     3   1   1   1   2   1   1
     4   0   0   0   0   0 211
hugo_crit_classe <- hugo_mots_doc %>%
  left_join(hugo_ca_df %>% select(doc_id, classe)) %>%
  group_by(classe, mots) %>% summarize(n = sum(n)) %>%
  mutate(prop = n / sum(n))%>%group_by(mots) %>% mutate(n_tot = sum(n)) %>%
  ungroup() %>% mutate(prop_tot = n_tot / sum(n),
                       crit = prop * log(prop / prop_tot))
hugo_top_classe <- hugo_crit_classe %>%
  group_by(classe) %>% top_n(20, crit) %>%
  arrange(desc(crit)) %>% mutate(rank = row_number()) %>%
  filter(rank <= 20) %>% ungroup()
ggplot(hugo_top_classe %>%
         unite(mots_, mots, classe, remove = FALSE) %>%
         mutate(mots = fct_reorder(mots_,crit)),
       aes(x = mots, y = crit, fill = classe)) +
  geom_col() +
  facet_wrap(~classe, scales = "free") +
  scale_x_discrete(labels = function(x) {str_match(x, "([^_]*)")}) +
  coord_flip()
ggsave("textmining_top_classe_FH.pdf",width=8,height=5)

LS0tDQp0aXRsZTogIkFuYWx5c2UgZGUgdGV4dGUiDQphdXRob3I6ICJIdXNzb24gZXQgYWwuIg0KZGF0ZTogIjA1LzA5LzIwMTgiDQpvdXRwdXQ6DQogIGh0bWxfbm90ZWJvb2s6DQogICAgdG9jOiB5ZXMNCiAgICB0b2NfZGVwdGg6IDMNCiAgICB0b2NfZmxvYXQ6IHllcw0KICBodG1sX2RvY3VtZW50Og0KICAgIHRvYzogeWVzDQogICAgdG9jX2RlcHRoOiAnMycNCiAgICB0b2NfZmxvYXQ6IHllcw0KLS0tDQoNCiMgMS4gSW1wb3J0YXRpb24gZGVzIHRleHRlcyBldCBuZXR0b3lhZ2UNCg0KYGBge3IscmVzdWx0cz0iaGlkZSIsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpORCA8LSByZWFkX2xpbmVzKCJodHRwczovL3Itc3RhdC1zYy1kb25uZWVzLmdpdGh1Yi5pby9Ob3RyZURhbWVEZVBhcmlzLnR4dCIsDQogICAgICAgICAgICAgICAgIGxvY2FsZSA9IGxvY2FsZShlbmNvZGluZyA9ICJVVEYtOCIpKQ0KaGVhZChORCkNCmBgYA0KDQoNCmBgYHtyLHJlc3VsdHM9ImhpZGUiLG1lc3NhZ2U9RkFMU0Usd2FybmluZz1GQUxTRX0NCmxpYnJhcnkoc3RyaW5naSkNCnN0cmluZ2k6OnN0cmlfZW5jX2RldGVjdChwYXN0ZShyZWFkX2xpbmVzX3JhdygiaHR0cHM6Ly9yLXN0YXQtc2MtZG9ubmVlcy5naXRodWIuaW8vTGVzTWlzZXJhYmxlczEudHh0IiwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBuX21heCA9IDMwKSwgY29sbGFwc2UgPSAiICIpKQ0KYGBgDQoNCg0KYGBge3IscmVzdWx0cz0iaGlkZSIsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShndXRlbmJlcmdyKQ0KTkQgPC0gZ3V0ZW5iZXJncjo6Z3V0ZW5iZXJnX3N0cmlwKE5EKQ0KTkQgJT4lIGhlYWQNCmBgYA0KDQoNCmBgYHtyLHJlc3VsdHM9ImhpZGUiLG1lc3NhZ2U9RkFMU0Usd2FybmluZz1GQUxTRX0NCk5EIDwtIHN0cl9yZXBsYWNlX2FsbChORCwgIl98LS18JyIsICIgIikNCmBgYA0KDQpgYGB7cixyZXN1bHRzPSJoaWRlIixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpub21zIDwtIGMoIk5vdHJlRGFtZURlUGFyaXMiLCAiTGVzTWlzZXJhYmxlczEiLCAiTGVzTWlzZXJhYmxlczIiLA0KICAgICAgICAgICJMZXNNaXNlcmFibGVzMyIsICJMZXNNaXNlcmFibGVzNCIsICJMZXNNaXNlcmFibGVzNSIpDQpsZWN0dXJlIDwtIGZ1bmN0aW9uKG5vbSkgew0KICByZWFkX2xpbmVzKHN0cl9jKCJodHRwczovL3Itc3RhdC1zYy1kb25uZWVzLmdpdGh1Yi5pby8iLG5vbSwgIi50eHQiKSxsb2NhbGUgPSBsb2NhbGUoZW5jb2RpbmcgPSAiVVRGLTgiKSkgJT4lDQogICAgZ3V0ZW5iZXJncjo6Z3V0ZW5iZXJnX3N0cmlwKCkgJT4lIHN0cl9yZXBsYWNlX2FsbCgiX3wtLXwnIiwgIiAiKQ0KfQ0KaHVnbyA8LSB0aWJibGUobm9tID0gbm9tcykgJT4lIA0KICBtdXRhdGUoaWQgPSBpZl9lbHNlKG5vbSA9PSAiTm90cmVEYW1lRGVQYXJpcyIsIk5EIiwgc3RyX2MoIk0iLCBzdHJfc3ViKG5vbSwgc3RhcnQgPSAtMUwpKSkpICU+JQ0KICBtdXRhdGUodHh0ID0gbWFwKG5vbSwgbGVjdHVyZSkpDQpgYGANCg0KYGBge3IscmVzdWx0cz0iaGlkZSIsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KaHVnbyA8LSBodWdvICU+JSB1bm5lc3QoKSAlPiUgZ3JvdXBfYnkoaWQpICU+JQ0KICBtdXRhdGUobGlnbmUgPSByb3dfbnVtYmVyKCkpICU+JSB1bmdyb3VwKCkNCmh1Z28gJT4lIHNsaWNlKDc1MDAwOjc1MDAzKQ0KYGBgDQojIDIuIENvZGFnZSBlbiB0ZXJtZSBkZSBzYWMgZGUgbW90cw0KDQpgYGB7cixyZXN1bHRzPSJoaWRlIixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpsaWJyYXJ5KHRpZHl0ZXh0KQ0KaHVnb190b2tlbnMgPC0gaHVnbyAlPiUgdW5uZXN0X3Rva2Vucyhtb3RzLCB0eHQpDQpodWdvX3Rva2VucyAlPiUgc2xpY2UoMTo1KQ0KYGBgDQoNCmBgYHtyLHJlc3VsdHM9ImhpZGUiLG1lc3NhZ2U9RkFMU0Usd2FybmluZz1GQUxTRX0NCmh1Z29fbW90cyA8LSBodWdvX3Rva2VucyAlPiUgZ3JvdXBfYnkobW90cykgJT4lIHN1bW1hcml6ZShuID0gbigpKQ0KaHVnb19tb3RzICU+JSBzbGljZSgxMDAwOjEwMDUpDQpodWdvX21vdHMgJT4lIHRvcF9uKDIwLCBuKSAlPiUgYXJyYW5nZShkZXNjKG4pKSAlPiUgcHVsbChtb3RzKQ0KYGBgDQoNCmBgYHtyLHJlc3VsdHM9ImhpZGUiLG1lc3NhZ2U9RkFMU0Usd2FybmluZz1GQUxTRX0NCmxpYnJhcnkoc3RvcHdvcmRzKQ0Kc3RvcHdvcmRzKGxhbmd1YWdlID0gImZyIikNCmBgYA0KDQpgYGB7cixyZXN1bHRzPSJoaWRlIixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpodWdvX21vdHMgPC0gaHVnb19tb3RzICU+JQ0KICBhbnRpX2pvaW4oZ2V0X3N0b3B3b3JkcygiZnIiKSwgYnkgPSBjKCJtb3RzIiA9ICJ3b3JkIikpICU+JSBmaWx0ZXIobj49MTApDQpgYGANCg0KYGBge3IscmVzdWx0cz0iaGlkZSIsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KaHVnb19tb3RzICU+JSB0b3BfbigyMCwgbikgJT4lIGFycmFuZ2UoZGVzYyhuKSkgJT4lIHB1bGwobW90cykNCmBgYA0KDQpgYGB7cixyZXN1bHRzPSJoaWRlIixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpsaWJyYXJ5KHdvcmRjbG91ZCkNCmh1Z29fbW90cyAlPiUgdG9wX24oMTAwLCBuKSAlPiUgew0KICB3b3JkY2xvdWQoLltbIm1vdHMiXV0sIC5bWyJuIl1dLCBtaW4uZnJlcSA9IDEwLCBtYXgud29yZHMgPSAxMDAsDQogICAgICAgICAgICBjb2xvciA9IGJyZXdlci5wYWwoOCwiRGFyazIiKSwgcmFuZG9tLm9yZGVyPUZBTFNFLCBzY2FsZT1jKDMsLjUpKQ0KfQ0KYGBgDQoNCmBgYHtyLHJlc3VsdHM9ImhpZGUiLG1lc3NhZ2U9RkFMU0Usd2FybmluZz1GQUxTRX0NCmh1Z29fdG9rZW5zIDwtIGh1Z29fdG9rZW5zICU+JQ0KICBtdXRhdGUoZG9jX2lkID0gc3RyX2MoaWQsIHNwcmludGYoIl8lMDNpIiwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIChsaWduZS0xKSAlLyUgMTAwICsgMSkpKQ0KYGBgDQoNCmBgYHtyLHJlc3VsdHM9ImhpZGUiLG1lc3NhZ2U9RkFMU0Usd2FybmluZz1GQUxTRX0NCmh1Z29fbW90c19kb2MgPC0gaHVnb190b2tlbnMgJT4lIHNlbWlfam9pbihodWdvX21vdHMpICU+JQ0KICBncm91cF9ieShkb2NfaWQsIG1vdHMpICU+JSBzdW1tYXJpemUobiA9IG4oKSkNCmh1Z29fZHRtIDwtIGh1Z29fbW90c19kb2MgJT4lIHNwcmVhZChtb3RzLCBuLCBmaWxsID0gMCkNCmBgYA0KIyAzLiBBbmFseXNlIGRlcyBjb3JyZXNwb25kYW5jZXMgZXQgdmlzdWFsaXNhdGlvbg0KDQpgYGB7cixmaWcud2lkdGg9NCxmaWcuaGVpZ2h0PTQsbWVzc2FnZT1GQUxTRSx3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShGYWN0b01pbmVSKQ0KaHVnb19jYSA8LSBDQShodWdvX2R0bSAlPiUgYXMuZGF0YS5mcmFtZSgpICU+JQ0KICAgICAgICAgICAgICAgICAgY29sdW1uX3RvX3Jvd25hbWVzKCJkb2NfaWQiKSwgbmNwID0gMTAwMCwgZ3JhcGggPSBGQUxTRSkNCmBgYA0KDQpgYGB7cixyZXN1bHRzPSJoaWRlIixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpodWdvX2NhX2Nvb3JkIDwtIGh1Z29fY2Ekcm93JGNvb3JkDQpodWdvX2NhX2RmIDwtIHRpYmJsZShkb2NfaWQgPSByb3cubmFtZXMoaHVnb19jYV9jb29yZCksDQogICAgICAgICAgICAgICAgICAgICBEaW0xID0gaHVnb19jYV9jb29yZFssMV0sIERpbTIgPSBodWdvX2NhX2Nvb3JkWywyXSwNCiAgICAgICAgICAgICAgICAgICAgIGxpdnJlID0gc3RyX3N1Yihkb2NfaWQsIDFMLCAyTCkpDQpnZ3Bsb3QoaHVnb19jYV9kZixhZXMoeCA9IERpbTEsIHkgPSBEaW0yLCBjb2xvciA9IGxpdnJlLCBncm91cCA9IGxpdnJlKSkgK2dlb21fcGF0aCgpICsNCiAgZ2VvbV9wb2ludCgpICsgY29vcmRfZXF1YWwoeGxpbSA9IGMoLTEsIDEpLCB5bGltID0gYygtMSwgMSkpDQpgYGANCg0KIyA0LiBDbGFzc2lmaWNhdGlvbiBwYXIgbOKAmWFsZ29yaXRobWUgZGVzIGstbWVhbnMNCmBgYHtyLHJlc3VsdHM9ImhpZGUiLG1lc3NhZ2U9RkFMU0Usd2FybmluZz1GQUxTRX0NCnNldC5zZWVkKDQyKQ0KaHVnb19jYV9rbWVhbnMgPC0ga21lYW5zKGh1Z29fY2FfY29vcmQsIDQsIG5zdGFydCA9IDEwMCkNCmh1Z29fY2FfZGYgPC0gaHVnb19jYV9kZiAlPiUNCiAgbXV0YXRlKGNsYXNzZSA9IGFzLmZhY3RvcihodWdvX2NhX2ttZWFucyRjbHVzdGVyKSkNCmdncGxvdChodWdvX2NhX2RmLCBhZXMoeCA9IGRvY19pZCwgeSA9IDEsIGZpbGwgPSBjbGFzc2UpKSArDQogIGdlb21fcmFzdGVyKCkgKyBzY2FsZV94X2Rpc2NyZXRlKGJyZWFrcz1OVUxMKSArDQogIGdlb21fdmxpbmUoeGludGVyY2VwdCA9NjY5KQ0KYGBgDQoNCg0KDQpgYGB7cixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpodWdvX2NhX2RmICU+JSBzZWxlY3QoY2xhc3NlLCBsaXZyZSkgJT4lIHRhYmxlKCkNCmBgYA0KDQpgYGB7cixyZXN1bHRzPSJoaWRlIixtZXNzYWdlPUZBTFNFLHdhcm5pbmc9RkFMU0V9DQpodWdvX2NyaXRfY2xhc3NlIDwtIGh1Z29fbW90c19kb2MgJT4lDQogIGxlZnRfam9pbihodWdvX2NhX2RmICU+JSBzZWxlY3QoZG9jX2lkLCBjbGFzc2UpKSAlPiUNCiAgZ3JvdXBfYnkoY2xhc3NlLCBtb3RzKSAlPiUgc3VtbWFyaXplKG4gPSBzdW0obikpICU+JQ0KICBtdXRhdGUocHJvcCA9IG4gLyBzdW0obikpJT4lZ3JvdXBfYnkobW90cykgJT4lIG11dGF0ZShuX3RvdCA9IHN1bShuKSkgJT4lDQogIHVuZ3JvdXAoKSAlPiUgbXV0YXRlKHByb3BfdG90ID0gbl90b3QgLyBzdW0obiksDQogICAgICAgICAgICAgICAgICAgICAgIGNyaXQgPSBwcm9wICogbG9nKHByb3AgLyBwcm9wX3RvdCkpDQpodWdvX3RvcF9jbGFzc2UgPC0gaHVnb19jcml0X2NsYXNzZSAlPiUNCiAgZ3JvdXBfYnkoY2xhc3NlKSAlPiUgdG9wX24oMjAsIGNyaXQpICU+JQ0KICBhcnJhbmdlKGRlc2MoY3JpdCkpICU+JSBtdXRhdGUocmFuayA9IHJvd19udW1iZXIoKSkgJT4lDQogIGZpbHRlcihyYW5rIDw9IDIwKSAlPiUgdW5ncm91cCgpDQpnZ3Bsb3QoaHVnb190b3BfY2xhc3NlICU+JQ0KICAgICAgICAgdW5pdGUobW90c18sIG1vdHMsIGNsYXNzZSwgcmVtb3ZlID0gRkFMU0UpICU+JQ0KICAgICAgICAgbXV0YXRlKG1vdHMgPSBmY3RfcmVvcmRlcihtb3RzXyxjcml0KSksDQogICAgICAgYWVzKHggPSBtb3RzLCB5ID0gY3JpdCwgZmlsbCA9IGNsYXNzZSkpICsNCiAgZ2VvbV9jb2woKSArDQogIGZhY2V0X3dyYXAofmNsYXNzZSwgc2NhbGVzID0gImZyZWUiKSArDQogIHNjYWxlX3hfZGlzY3JldGUobGFiZWxzID0gZnVuY3Rpb24oeCkge3N0cl9tYXRjaCh4LCAiKFteX10qKSIpfSkgKw0KICBjb29yZF9mbGlwKCkNCmBgYA0KDQo=