library(dplyr) # tibble library(tidytext) # read.csv library(jsonlite) # fromJSON library(ggplot2) # ggplot library(SnowballC) # wordStem library(wordcloud) # wordcloud data("stop_words") sources <- fromJSON("data/variables.json") counts <- vector("list", length(sources)) i <- 1 for (vars in sources[[4]]) { # This skip is needed because if length(vars) == 0, then 1:length(vars) # produces a 2 element list. Then, when creating a tibble using # 1:length(vars) and vars this would crash the program since both columns # are different sizes if (length(vars) == 0) { counts[[i]] <- NULL i <- i + 1 next } # The line column is needed because in the next part we convert each word in # each question into its own row and we want to be able to track what # question each word came from. raw_text <- tibble(line = 1:length(vars), text = vars) tidy_text <- (raw_text |> unnest_tokens(word, text) # Convert each word in each question into a row |> anti_join(stop_words) # Filter out stop words |> mutate(word_stem = wordStem(word)) # Add a column with each word's stem ) # The line with the inner_join and count is needed because count produces a # tibble with only two columns (word_stem and n) and we want to preserve the # word that the word stem came from. This is done using an inner_join. word_counts <- (tidy_text # WARNING: REMOVE THE NUMBERS HERE # WARNING: REMOVE THE NUMBERS HERE # WARNING: REMOVE THE NUMBERS HERE # WARNING: REMOVE THE NUMBERS HERE |> inner_join(count(tidy_text, word_stem, sort = TRUE)) # Count words and match them to their word_stem. |> filter(n > 5) # Filter out all the low ranking words # Filtering out words that share the same stem lets us pick an # (arbitrary) representative for each word's stem that is still human # readable (like having "education" instead of "educ"). |> distinct(word_stem, .keep_all = TRUE) # Filter out words that share the same stem. ) counts[[i]] <- word_counts i <- i + 1 } pdf("data/out.pdf") wordcloud(words = counts[[14]] $ word, freq = counts[[14]] $ n, max.words = 50) # ggplot(counts[[14]], aes(x = reorder(word, n), y = n)) + # geom_col() + # coord_flip() + # labs(title = "Most Frequent Words in 'CBMS' surveys") dev.off()