this repo has no description
at main 91 lines 2.5 kB view raw
# install.packages("tidyverse")
library(tibble) # tibble (comes from tidyverse)
# install.packages("word2vec")
library(word2vec) # read.wordvectors
# install.packages("reticulate")
library(reticulate) # reticulate::py_eval
# install.packages("tidytext")
library(tidytext) # data("stop_words")
# install.packages("dplyr")
library(dplyr) # anti_join
# install.packages("SnowballC")
library(SnowballC) # wordStem
# install.packages("ggplot2")
library(ggplot2) # ggplot

# Word/synonym pairs to score; the `word` and `synonym` columns are read below.
syn <- read.csv("syn.csv")

# --- shared helpers ---
#
# All helpers read the global `emb` at call time, so they are defined once and
# automatically follow whichever embedding matrix the loop below has loaded.

# Return the embedding row for one word, or an all-zero vector of the same
# width when the cleaned word is not a row name of `emb`.
w2v <- function(x) {
  y <- txt_clean_word2vec(x)
  # `%in%` never yields NA, so an NA token takes the zero-vector branch
  # instead of aborting the run with "missing value where TRUE/FALSE needed".
  if (any(y %in% rownames(emb))) emb[y, ] else numeric(ncol(emb))
}
# Element-wise variant of `w2v`; not called elsewhere in this script.
w2v_v <- Vectorize(w2v)

# Cosine similarity between the embeddings of two single words.
# NOTE(review): when either word is missing from `emb` the zero-vector
# fallback is compared; the cosine for such pairs is presumably NaN -- confirm
# against word2vec_similarity before aggregating the saved results.
sim <- function(x, y) word2vec_similarity(w2v(x), w2v(y), type = "cosine")
# Element-wise variant of `sim`, applied pairwise over two parallel vectors.
sim_v <- Vectorize(sim)

# --- embeddings to evaluate ---

out_dir <- "data"

models <- list(
  # --- google vectors ---
  list(
    path = "google_vecs.bin", type = "bin",
    out = file.path(out_dir, "google_vecs_syn.Rda")
  ),
  # --- glove 300d ---
  list(
    path = "glove.6B.300d.txt", type = "txt",
    out = file.path(out_dir, "glove_300d_syn.Rda")
  ),
  # --- glove 200d ---
  list(
    path = "glove.6B.200d.txt", type = "txt",
    out = file.path(out_dir, "glove_200d_syn.Rda")
  ),
  # --- glove 100d ---
  list(
    path = "glove.6B.100d.txt", type = "txt",
    out = file.path(out_dir, "glove_100d_syn.Rda")
  ),
  # --- glove 50d ---
  list(
    path = "glove.6B.50d.txt", type = "txt",
    out = file.path(out_dir, "glove_50d_syn.Rda")
  )
)

# Create the output directory before any embedding is loaded, so a missing
# directory cannot make `save()` fail after the expensive work is done.
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

# --- score every embedding ---

for (model in models) {
  # Rebinding the global `emb` is what switches the helpers to this model.
  emb <- read.wordvectors(model$path, type = model$type)
  data <- sim_v(syn$word, syn$synonym)
  # The object is stored under the name `data` in each .Rda file.
  save(data, file = model$out)
}