this repo has no description

feat: Optimized data generation

Signed-off-by: Sona Tau Estrada Rivera <sona@stau.space>

+88 -38
+88 -38
code/generate_data.R
··· 63 63 out 64 64 } 65 65 66 + lgbt_tags <- list("lgbt","lgbtq","sex","identity","gender","orientation","nonbinary") |> as.character() 67 + race_ethnicity_tags <- list("race","ethnicity","african","american","black","hispanic","asian","indigenous","native","latino","latina","latine") |> as.character() 68 + women_tags <- list("woman","women","girl","feminine","femeninity","ms","mrs") |> as.character() 69 + men_tags <- list("man", "men", "boy", "male", "masculine", "masculinity", "mr") |> as.character() 70 + disabilities_tags <- list("disabilities","disabled","disability","handicap","handicapped","neurodivergent") |> as.character() 71 + tags <- c(lgbt_tags, race_ethnicity_tags, women_tags, men_tags, disabilities_tags) 72 + tag_vectors <- w2v_v(tags) 73 + tag_category <- c( 74 + rep("lgbt", length(lgbt_tags)), 75 + rep("race/ethnicity", length(race_ethnicity_tags)), 76 + rep("women", length(women_tags)), 77 + rep("men", length(men_tags)), 78 + rep("disabilities", length(disabilities_tags)) 79 + ) 80 + 66 81 word_category <- function(x, threshold = 0.3) { 67 - lgbt <- list("lgbt","lgbtq","sex","identity","gender","orientation","nonbinary") 68 - race_ethnicity <- list("race","ethnicity","african","american","black","hispanic","asian","indigenous","native","latino","latina","latine") 69 - women <- list("woman","women","girl","feminine","femeninity","ms","mrs") 70 - men <- list("man", "men", "boy", "male", "masculine", "masculinity", "mr") 71 - disabilities <- list("disabilities","disabled","disability","handicap","handicapped","neurodivergent") 72 - all <- c(lgbt, race_ethnicity, women, men, disabilities) 82 + 83 + similarity_vectors <- numeric(length = length(tag_vectors[1,])) 84 + for (i in seq_along(tag_vectors[1,])) { 85 + w <- tag_vectors[,i] 86 + similarity_vectors[i] <- word2vec_similarity(w2v(x), w, type = "cosine") 87 + } 73 88 74 89 similarities <- data.frame( 75 - sim = sim_v(x, all), 76 - word = as.character(all), 77 - word_category = c( 78 - rep("lgbt", length(lgbt)), 79 - 
rep("race/ethnicity", length(race_ethnicity)), 80 - rep("women", length(women)), 81 - rep("men", length(men)), 82 - rep("disabilities", length(disabilities)) 83 - ) 90 + sim = similarity_vectors, 91 + tag = tags, 92 + tag_category = tag_category 84 93 ) 94 + 85 95 similarities <- similarities[similarities |> complete.cases(),] 86 96 m <- max(similarities $ sim, na.rm = TRUE) 87 - if (m > threshold) similarities[m == similarities,] |> head(1) else data.frame(sim = NA, word = NA, word_category = NA) 97 + if (m > threshold) similarities[similarities $ sim == m,] |> head(1) else data.frame(sim = NA, tag = NA, tag_category = NA) 98 + } 99 + 100 + memo <- new.env(hash = TRUE, parent = emptyenv()) 101 + word_category_m <- function(x, threshold = 0.3) { 102 + key <- paste(x, threshold) 103 + if (is.null(memo[[key]])) 104 + memo[[key]] <- word_category(x, threshold) 105 + return(memo[[key]]) 88 106 } 89 107 90 - word_category_v <- function(x) { 108 + word_category_v <- function(x, threshold = 0.3) { 91 - res <- Vectorize(word_category)(x) |> t() 109 + res <- Vectorize(word_category_m)(x, threshold) |> t() 92 110 sim <- numeric(length = length(x)) 93 - word <- character(length = length(x)) 94 - word_category <- character(length = length(x)) 111 + tag <- character(length = length(x)) 112 + tag_category <- character(length = length(x)) 95 113 for (i in 1:nrow(res)) { 96 114 sim[i] <- res[,"sim"][[i]] 97 - word[i] <- res[,"word"][[i]] 98 - word_category[i] <- res[,"word_category"][[i]] 115 + tag[i] <- res[,"tag"][[i]] 116 + tag_category[i] <- res[,"tag_category"][[i]] 99 117 } 100 118 data.frame( 101 119 sim = sim, 102 - word = word, 103 - word_category = word_category 120 + tag = tag, 121 + tag_category = tag_category 104 122 ) 105 123 } 106 124 ··· 116 134 117 135 data <- data[selection,] 118 136 119 - for (row_idx in 1:nrow(data)) { 120 - raw_txt <- data[row_idx,"variables"] 121 - title <- data[row_idx,"title"] 122 - tib_txt <- tibble(line = seq_along(raw_txt[[1]]), text = raw_txt[[1]]) 123 - tmp <- unnest_tokens(tib_txt, word, text) 124 - clean_txt <- 
anti_join(tmp[!grepl("\\d", tmp $ word),], stop_words) 137 + clean_text <- function(raw_text) { 138 + tmp <- tibble( 139 + line = seq_along(raw_text), 140 + text = raw_text 141 + ) |> unnest_tokens(word, text) 142 + tmp[!grepl("\\d", tmp $ word),] |> anti_join(stop_words) 143 + } 125 144 126 - stem_txt <- mutate(clean_txt, word_stem = wordStem(word)) 127 - word_categories <- word_category_v(clean_txt $ word) 128 - sema_txt <- mutate(clean_txt, tag = word_categories $ word, word_category = word_categories $ word_category) 145 + # Word stems analysis 146 + 147 + for (row_idx in 1:nrow(data)) { 148 + stem_txt <- clean_text(data[row_idx,"variables"][[1]]) |> 149 + mutate(word_stem = wordStem(word)) 129 150 130 151 stem_count <- stem_txt |> 131 152 inner_join(count(stem_txt, word_stem)) |> 132 153 filter(n > 5) |> 133 154 distinct(word_stem, .keep_all = TRUE) 134 155 135 - sema_count <- sema_txt |> 136 - inner_join(count(sema_txt, tag)) |> 137 - distinct(tag, .keep_all = TRUE) 156 + title <- data[row_idx,"title"] 157 + save(stem_count, file = paste("data/stem_", title, ".Rda", sep = "")) 158 + } 159 + 160 + 161 + # Word semantics analysis 162 + 163 + word_semantic_analysis <- function(emb, data, model_name, threshold = 0.3) { 164 + for (row_idx in 1:nrow(data)) { 165 + clean_txt <- clean_text(data[row_idx,"variables"][[1]]) 138 166 139 - title <- data[row_idx,"title"] 140 - save(stem_count, file = paste("data/stem_", title, ".Rda", sep = "")) 141 - save(sema_count, file = paste("data/google_sema_", title, ".Rda", sep = "")) 167 + word_categories <- word_category_v(clean_txt $ word, threshold) 168 + sema_txt <- clean_txt |> mutate(tag = word_categories $ tag, word_category = word_categories $ tag_category) 169 + 170 + sema_count <- sema_txt |> 171 + inner_join(count(sema_txt, tag)) |> 172 + distinct(tag, .keep_all = TRUE) 173 + 174 + title <- data[row_idx,"title"] 175 + save(sema_count, file = paste("data/", model_name, "_", threshold, "_", title, ".Rda", sep = "")) 176 + } 142 177 } 178 179 
+ read.wordvectors("google_vecs.bin", type = "bin") |> 180 + word_semantic_analysis(data = data, model_name = "google_news", threshold = 0.216) 181 + 182 + read.wordvectors("glove.6B.300d.txt", type = "txt") |> 183 + word_semantic_analysis(data = data, model_name = "glove_300d", threshold = 0.154) 184 + 185 + read.wordvectors("glove.6B.200d.txt", type = "txt") |> 186 + word_semantic_analysis(data = data, model_name = "glove_200d", threshold = 0.182) 187 + 188 + read.wordvectors("glove.6B.100d.txt", type = "txt") |> 189 + word_semantic_analysis(data = data, model_name = "glove_100d", threshold = 0.227) 190 + 191 + read.wordvectors("glove.6B.50d.txt", type = "txt") |> 192 + word_semantic_analysis(data = data, model_name = "glove_50d", threshold = 0.270)