this repo has no description
1# install.packages("tidyverse")
2library(tibble) # tibble (comes from tidyverse)
3# install.packages("word2vec")
4library(word2vec) # read.wordvectors
5# install.packages("reticulate")
6library(reticulate) # reticulate::py_eval
7# install.packages("tidytext")
8library(tidytext) # data("stop_words")
9# install.packages("dplyr")
10library(dplyr) # anti_join
11# install.packages("SnowballC")
12library(SnowballC) # wordStem
# install.packages("ggplot2")
14library(ggplot2) # ggplot
15
# Word/synonym pairs to score; the script reads the `word` and `synonym`
# columns of this table.
syn <- read.csv(file = "syn.csv")
17
18# --- google vectors ---
19
# Load the word vectors from google_vecs.bin. `emb` is kept as a global and
# is read by w2v(); its row names are the vocabulary terms.
emb <- read.wordvectors("google_vecs.bin", type = "bin")

# Look up the embedding of a single word.
#
# x: one word (character scalar); it is normalised with
#    txt_clean_word2vec() before the lookup.
# Returns the row of `emb` named after the cleaned word, or an all-zero
# vector of length ncol(emb) when that word is not a row name of `emb`.
w2v <- function(x) {
  y <- txt_clean_word2vec(x)
  # `%in%` never returns NA, so an NA key falls through to the zero vector
  # instead of aborting the `if`. The length guard keeps the lookup strictly
  # one-word-at-a-time (an `==` comparison would silently recycle).
  if (length(y) == 1L && y %in% rownames(emb)) {
    emb[y, ]
  } else {
    numeric(ncol(emb))
  }
}
# Element-wise variant of w2v() for vectors of words.
w2v_v <- Vectorize(w2v)
26
# Cosine similarity between the embeddings of two single words.
# w2v() maps out-of-vocabulary words to an all-zero vector, so such pairs
# carry no real signal.
# NOTE(review): cosine against a zero vector is presumably NaN -- confirm
# against word2vec_similarity() before aggregating the scores.
sim <- function(x, y) {
  word2vec_similarity(w2v(x), w2v(y), type = "cosine")
}
# Pairwise variant: element i of the result compares x[i] with y[i].
sim_v <- Vectorize(sim)

# Score every (word, synonym) pair from `syn`.
data <- sim_v(syn$word, syn$synonym)
# save() does not create missing directories; make sure the target exists so
# the computed scores are not lost to a "cannot open file" error.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
save(data, file = "data/google_vecs_syn.Rda")
32
33# --- glove 300d ---
34
# Load the word vectors from glove.6B.300d.txt. `emb` is kept as a global and
# is read by w2v(); its row names are the vocabulary terms.
emb <- read.wordvectors("glove.6B.300d.txt", type = "txt")

# Look up the embedding of a single word.
#
# x: one word (character scalar); it is normalised with
#    txt_clean_word2vec() before the lookup.
# Returns the row of `emb` named after the cleaned word, or an all-zero
# vector of length ncol(emb) when that word is not a row name of `emb`.
w2v <- function(x) {
  y <- txt_clean_word2vec(x)
  # `%in%` never returns NA, so an NA key falls through to the zero vector
  # instead of aborting the `if`. The length guard keeps the lookup strictly
  # one-word-at-a-time (an `==` comparison would silently recycle).
  if (length(y) == 1L && y %in% rownames(emb)) {
    emb[y, ]
  } else {
    numeric(ncol(emb))
  }
}
# Element-wise variant of w2v() for vectors of words.
w2v_v <- Vectorize(w2v)
41
# Cosine similarity between the embeddings of two single words.
# w2v() maps out-of-vocabulary words to an all-zero vector, so such pairs
# carry no real signal.
# NOTE(review): cosine against a zero vector is presumably NaN -- confirm
# against word2vec_similarity() before aggregating the scores.
sim <- function(x, y) {
  word2vec_similarity(w2v(x), w2v(y), type = "cosine")
}
# Pairwise variant: element i of the result compares x[i] with y[i].
sim_v <- Vectorize(sim)

# Score every (word, synonym) pair from `syn`.
data <- sim_v(syn$word, syn$synonym)
# save() does not create missing directories; make sure the target exists so
# the computed scores are not lost to a "cannot open file" error.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
save(data, file = "data/glove_300d_syn.Rda")
47
48# --- glove 200d ---
49
# Load the word vectors from glove.6B.200d.txt. `emb` is kept as a global and
# is read by w2v(); its row names are the vocabulary terms.
emb <- read.wordvectors("glove.6B.200d.txt", type = "txt")

# Look up the embedding of a single word.
#
# x: one word (character scalar); it is normalised with
#    txt_clean_word2vec() before the lookup.
# Returns the row of `emb` named after the cleaned word, or an all-zero
# vector of length ncol(emb) when that word is not a row name of `emb`.
w2v <- function(x) {
  y <- txt_clean_word2vec(x)
  # `%in%` never returns NA, so an NA key falls through to the zero vector
  # instead of aborting the `if`. The length guard keeps the lookup strictly
  # one-word-at-a-time (an `==` comparison would silently recycle).
  if (length(y) == 1L && y %in% rownames(emb)) {
    emb[y, ]
  } else {
    numeric(ncol(emb))
  }
}
# Element-wise variant of w2v() for vectors of words.
w2v_v <- Vectorize(w2v)
56
# Cosine similarity between the embeddings of two single words.
# w2v() maps out-of-vocabulary words to an all-zero vector, so such pairs
# carry no real signal.
# NOTE(review): cosine against a zero vector is presumably NaN -- confirm
# against word2vec_similarity() before aggregating the scores.
sim <- function(x, y) {
  word2vec_similarity(w2v(x), w2v(y), type = "cosine")
}
# Pairwise variant: element i of the result compares x[i] with y[i].
sim_v <- Vectorize(sim)

# Score every (word, synonym) pair from `syn`.
data <- sim_v(syn$word, syn$synonym)
# save() does not create missing directories; make sure the target exists so
# the computed scores are not lost to a "cannot open file" error.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
save(data, file = "data/glove_200d_syn.Rda")
62
63# --- glove 100d ---
64
# Load the word vectors from glove.6B.100d.txt. `emb` is kept as a global and
# is read by w2v(); its row names are the vocabulary terms.
emb <- read.wordvectors("glove.6B.100d.txt", type = "txt")

# Look up the embedding of a single word.
#
# x: one word (character scalar); it is normalised with
#    txt_clean_word2vec() before the lookup.
# Returns the row of `emb` named after the cleaned word, or an all-zero
# vector of length ncol(emb) when that word is not a row name of `emb`.
w2v <- function(x) {
  y <- txt_clean_word2vec(x)
  # `%in%` never returns NA, so an NA key falls through to the zero vector
  # instead of aborting the `if`. The length guard keeps the lookup strictly
  # one-word-at-a-time (an `==` comparison would silently recycle).
  if (length(y) == 1L && y %in% rownames(emb)) {
    emb[y, ]
  } else {
    numeric(ncol(emb))
  }
}
# Element-wise variant of w2v() for vectors of words.
w2v_v <- Vectorize(w2v)
71
# Cosine similarity between the embeddings of two single words.
# w2v() maps out-of-vocabulary words to an all-zero vector, so such pairs
# carry no real signal.
# NOTE(review): cosine against a zero vector is presumably NaN -- confirm
# against word2vec_similarity() before aggregating the scores.
sim <- function(x, y) {
  word2vec_similarity(w2v(x), w2v(y), type = "cosine")
}
# Pairwise variant: element i of the result compares x[i] with y[i].
sim_v <- Vectorize(sim)

# Score every (word, synonym) pair from `syn`.
data <- sim_v(syn$word, syn$synonym)
# save() does not create missing directories; make sure the target exists so
# the computed scores are not lost to a "cannot open file" error.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
save(data, file = "data/glove_100d_syn.Rda")
77
78# --- glove 50d ---
79
# Load the word vectors from glove.6B.50d.txt. `emb` is kept as a global and
# is read by w2v(); its row names are the vocabulary terms.
emb <- read.wordvectors("glove.6B.50d.txt", type = "txt")

# Look up the embedding of a single word.
#
# x: one word (character scalar); it is normalised with
#    txt_clean_word2vec() before the lookup.
# Returns the row of `emb` named after the cleaned word, or an all-zero
# vector of length ncol(emb) when that word is not a row name of `emb`.
w2v <- function(x) {
  y <- txt_clean_word2vec(x)
  # `%in%` never returns NA, so an NA key falls through to the zero vector
  # instead of aborting the `if`. The length guard keeps the lookup strictly
  # one-word-at-a-time (an `==` comparison would silently recycle).
  if (length(y) == 1L && y %in% rownames(emb)) {
    emb[y, ]
  } else {
    numeric(ncol(emb))
  }
}
# Element-wise variant of w2v() for vectors of words.
w2v_v <- Vectorize(w2v)
86
# Cosine similarity between the embeddings of two single words.
# w2v() maps out-of-vocabulary words to an all-zero vector, so such pairs
# carry no real signal.
# NOTE(review): cosine against a zero vector is presumably NaN -- confirm
# against word2vec_similarity() before aggregating the scores.
sim <- function(x, y) {
  word2vec_similarity(w2v(x), w2v(y), type = "cosine")
}
# Pairwise variant: element i of the result compares x[i] with y[i].
sim_v <- Vectorize(sim)

# Score every (word, synonym) pair from `syn`.
data <- sim_v(syn$word, syn$synonym)
# save() does not create missing directories; make sure the target exists so
# the computed scores are not lost to a "cannot open file" error.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
save(data, file = "data/glove_50d_syn.Rda")