# (scraped page header, kept as comments so the file parses as R)
# this repo has no description
# at main — 68 lines, 2.4 kB
1library(dplyr) # tibble 2library(tidytext) # read.csv 3library(jsonlite) # fromJSON 4library(ggplot2) # ggplot 5library(SnowballC) # wordStem 6library(wordcloud) # wordcloud 7 8data("stop_words") 9 10getwd() 11 12json_text <- readLines("/Users/mine-ja/Documents/GitHub/underrepresentation-theory/variables.json", encoding = "UTF-8") 13 14json_text <- readLines("variables.json", encoding = "UTF-8", warn = FALSE) 15json_text <- paste(json_text, collapse = "\n") 16 17# Then parse 18sources_data <- fromJSON(json_text) 19 20#sources_data <- fromJSON("variables.json") 21 22counts <- vector("list", length(sources_data)) 23 24i <- 1 25for (vars in sources[[4]]) { 26 # This skip is needed because if length(vars) == 0, then 1:length(vars) 27 # produces a 2 element list. Then, when creating a tibble using 28 # 1:length(vars) and vars this would crash the program since both columns 29 # are different sizes 30 if (length(vars) == 0) { 31 counts[[i]] <- NULL 32 i <- i + 1 33 next 34 } 35 36 # The line column is needed because in the next part we convert each word in 37 # each question into its own row and we want to be able to track what 38 # question each word came from. 39 raw_text <- tibble(line = 1:length(vars), text = vars) 40 41 tidy_text <- (raw_text 42 |> unnest_tokens(word, text) # Convert each word in each question into a row 43 |> anti_join(stop_words) # Filter out stop words 44 |> mutate(word_stem = wordStem(word)) # Add a column with each word's stem 45 ) 46 47 # The line with the inner_join and count is needed because count produces a 48 # tibble with only two columns (word_stem and n) and we want to preserve the 49 # word that the word stem came from. This is done using an inner_join. 50 word_counts <- (tidy_text 51 |> inner_join(count(tidy_text, word_stem, sort = TRUE)) # Count words and match them to their word_stem. 
52 |> filter(n > 5) # Filter out all the low ranking words 53 # Filtering out words that share the same stem lets us pick an 54 # (arbitrary) representative for each word's stem that is still human 55 # readable (like having "education" instead of "educ"). 56 |> distinct(word_stem, .keep_all = TRUE) # Filter out words that share the same stem. 57 ) 58 counts[[i]] <- word_counts 59 i <- i + 1 60} 61 62pdf("out.pdf") 63wordcloud(words = counts[[14]] $ word, freq = counts[[14]] $ n, max.words = 50) 64# ggplot(counts[[14]], aes(x = reorder(word, n), y = n)) + 65# geom_col() + 66# coord_flip() + 67# labs(title = "Most Frequent Words in 'CBMS' surveys") 68dev.off()