# Word-frequency analysis of survey variable descriptions. (This repo has no description.)
library(dplyr)     # tibble, mutate, filter, distinct, joins, count
library(tidytext)  # unnest_tokens
library(jsonlite)  # fromJSON
library(ggplot2)   # ggplot
library(SnowballC) # wordStem
library(wordcloud) # wordcloud
7
# Stop-word lexicon shipped with tidytext (has a `word` column used by the
# anti_join below).
data("stop_words")

# Nested JSON. sources[[4]] is the collection iterated below: one element
# per survey, each a character vector of variable descriptions.
# NOTE(review): the meaning of the other top-level elements is not visible
# here — confirm against data/variables.json.
sources <- fromJSON("data/variables.json")

# Preallocate one slot per element of sources[[4]] — that is what the loop
# below iterates and indexes by — so counts[[i]] lines up with
# sources[[4]][[i]]. (Previously sized by length(sources), which is the
# wrong collection.)
counts <- vector("list", length(sources[[4]]))
13
# Build a per-source tibble of stemmed word counts. Slot i of `counts`
# corresponds to element i of sources[[4]].
for (i in seq_along(sources[[4]])) {
  vars <- sources[[4]][[i]]

  # Skip sources with no variables, leaving the preallocated NULL slot in
  # place. (The previous `counts[[i]] <- NULL` *removed* the list element,
  # shifting every later slot and misaligning counts with sources[[4]].
  # It also avoided 1:length(vars) producing c(1, 0) for empty input;
  # seq_along() handles that case safely anyway.)
  if (length(vars) == 0) {
    next
  }

  # `line` records which question each word came from, since the next step
  # explodes every word of every question into its own row.
  raw_text <- tibble(line = seq_along(vars), text = vars)

  tidy_text <- raw_text |>
    unnest_tokens(word, text) |>          # one row per word per question
    anti_join(stop_words, by = "word") |> # drop stop words (explicit key)
    mutate(word_stem = wordStem(word))    # add each word's stem

  # count() yields only (word_stem, n); the inner_join re-attaches the
  # original words so each stem keeps a human-readable form.
  word_counts <- tidy_text |>
    inner_join(count(tidy_text, word_stem, sort = TRUE), by = "word_stem") |>
    filter(n > 5) |> # drop low-frequency stems
    # Keep one (arbitrary) representative word per stem that is still
    # readable (e.g. "education" instead of "educ").
    distinct(word_stem, .keep_all = TRUE)

  counts[[i]] <- word_counts
}
56
# Render a word cloud for source 14 into a PDF. Assumes counts[[14]] is
# non-NULL, i.e. that source had words surviving the n > 5 filter —
# TODO confirm against data/variables.json.
pdf("data/out.pdf")
wordcloud(words = counts[[14]]$word, freq = counts[[14]]$n, max.words = 50)
# Alternative bar-chart view of the same counts:
# ggplot(counts[[14]], aes(x = reorder(word, n), y = n)) +
#   geom_col() +
#   coord_flip() +
#   labs(title = "Most Frequent Words in 'CBMS' surveys")
dev.off()