# (scraped page header, kept as comments so the file parses as R)
# this repo has no description
# at main — 68 lines, 2.4 kB
1library(dplyr) # tibble 2library(tidytext) # read.csv 3library(jsonlite) # fromJSON 4library(ggplot2) # ggplot 5library(SnowballC) # wordStem 6library(wordcloud) # wordcloud 7 8data("stop_words") 9 10getwd() 11 12json_text <- readLines("/Users/mine-ja/Documents/GitHub/underrepresentation-theory/variables.json", encoding = "UTF-8") 13 14json_text <- readLines("variables.json", encoding = "UTF-8", warn = FALSE) 15json_text <- paste(json_text, collapse = "\n") 16 17# Then parse 18sources_data <- fromJSON(json_text) 19 20#sources_data <- fromJSON("variables.json") 21 22counts <- vector("list", length(sources_data)) 23 24i <- 1 25for (vars in sources[[4]]) { 26 # This skip is needed because if length(vars) == 0, then 1:length(vars) 27 # produces a 2 element list. Then, when creating a tibble using 28 # 1:length(vars) and vars this would crash the program since both columns 29 # are different sizes 30 if (length(vars) == 0) { 31 counts[[i]] <- NULL 32 i <- i + 1 33 next 34 } 35 36 # The line column is needed because in the next part we convert each word in 37 # each question into its own row and we want to be able to track what 38 # question each word came from. 39 raw_text <- tibble(line = 1:length(vars), text = vars) 40 41 tidy_text <- (raw_text 42 |> unnest_tokens(word, text) # Convert each word in each question into a row 43 |> anti_join(stop_words) # Filter out stop words 44 |> mutate(word_stem = wordStem(word)) # Add a column with each word's stem 45 ) 46 47 # The line with the inner_join and count is needed because count produces a 48 # tibble with only two columns (word_stem and n) and we want to preserve the 49 # word that the word stem came from. This is done using an inner_join. 50 word_counts <- (tidy_text 51 |> inner_join(count(tidy_text, word_stem, sort = TRUE)) # Count words and match them to their word_stem. 
52 |> filter(n > 5) # Filter out all the low ranking words 53 # Filtering out words that share the same stem lets us pick an 54 # (arbitrary) representative for each word's stem that is still human 55 # readable (like having "education" instead of "educ"). 56 |> distinct(word_stem, .keep_all = TRUE) # Filter out words that share the same stem. 57 ) 58 counts[[i]] <- word_counts 59 i <- i + 1 60} 61 62pdf("out.pdf") 63wordcloud(words = counts[[14]] $ word, freq = counts[[14]] $ n, max.words = 50) 64# ggplot(counts[[14]], aes(x = reorder(word, n), y = n)) + 65# geom_col() + 66# coord_flip() + 67# labs(title = "Most Frequent Words in 'CBMS' surveys") 68dev.off()