Skip to content

Commit 1e95ed0

Browse files
committed
add example "GloVe"
1 parent 0e90135 commit 1e95ed0

File tree

1 file changed

+87
-0
lines changed

1 file changed

+87
-0
lines changed

vignettes/examples/load_GloVe.R

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# This example shows how to quickly load pre-trained GloVe word vectors
2+
# and train a Keras model in R
3+
4+
library(keras)
5+
library(dplyr)
6+
7+
# Download and extract the GloVe vectors if they are not already available.
# The check is made on the extracted text file that is read further below, so
# an archive that was downloaded but never unpacked (e.g. after an interrupted
# run) still gets extracted instead of being skipped.
glove_zip <- 'glove.6B.zip'
glove_txt <- 'glove.6B.300d.txt'
if (!file.exists(glove_txt)) {
  if (!file.exists(glove_zip)) {
    # The archive is large and R's default download timeout (60 seconds) is
    # usually too short for it; raise the limit only for this download and
    # restore the previous setting afterwards, even if the download fails.
    old_opts <- options(timeout = max(3600, getOption("timeout")))
    tryCatch(
      # mode = "wb": always write the archive as binary
      download.file('http://nlp.stanford.edu/data/glove.6B.zip',
                    destfile = glove_zip, mode = "wb"),
      finally = options(old_opts)
    )
  }
  unzip(glove_zip)
}
12+
13+
# Attach text2vec and load its example dataset `movie_review`
library(text2vec)
data(movie_review)
# display the dataset as a tibble
movie_review %>% as_tibble()
17+
18+
# Load the GloVe vectors into R as a plain data.frame (one row per token).
# quote = "" switches off quote handling: the file is space-separated raw
# text in which a token may itself be a quote character, and treating such a
# token as the start of a quoted field can corrupt the parsed rows.
vectors <- data.table::fread(
  'glove.6B.300d.txt',
  data.table = FALSE,
  encoding = 'UTF-8',
  quote = ""
)
# The first column holds the token; the remaining columns are named
# dim_1, dim_2, ... according to how many were actually read.
colnames(vectors) <- c('word', paste0('dim_', seq_len(ncol(vectors) - 1L)))

# structure of the vectors
as_tibble(vectors)
24+
25+
# Parameters shared by the tokenizer and the Keras model
library(keras)
max_words <- 10000  # passed as the tokenizer's num_words and the embedding's input_dim
maxlen <- 60        # passed as maxlen to pad_sequences() and as the model input length
dim_size <- 300     # passed as the embedding layer's output_dim
30+
31+
# Create a tokenizer limited to `max_words` and fit it on the review texts
word_seqs <- fit_text_tokenizer(
  text_tokenizer(num_words = max_words),
  movie_review$review
)

# Map every review to its sequence of word indices, then pad the sequences
# using `maxlen`
x_train <- pad_sequences(
  texts_to_sequences(word_seqs, movie_review$review),
  maxlen = maxlen
)

# The target is the sentiment column, as a matrix
y_train <- movie_review$sentiment %>% as.matrix()
42+
43+
# Build the embedding weight matrix from the fitted tokenizer and GloVe.
#
# Layout: row i + 1 of `word_embeds` (R is 1-based) is the vector for
# tokenizer index i. Tokenizer indices start at 1 and index 0 is the padding
# value, so the first row stays all-zero for padding. Without this shift each
# word would be looked up with the vector of the next-ranked word.

# unlist word indices (named vector: name = word, value = tokenizer index)
word_indices <- unlist(word_seqs$word_index)

# place them into a data.frame ordered by index, keeping at most the first
# `max_words` entries
dic <- data.frame(word = names(word_indices), key = word_indices,
                  stringsAsFactors = FALSE) %>%
  arrange(key) %>%
  head(max_words)

# the GloVe table must provide exactly `dim_size` numeric columns after `word`
stopifnot(ncol(vectors) - 1L == dim_size)

# start from zeros: padding and words missing from GloVe keep a zero vector
word_embeds <- matrix(0, nrow = max_words, ncol = dim_size)

# only indices below `max_words` have a row in an embedding whose
# input_dim is `max_words` (rows correspond to indices 0 .. max_words - 1)
usable <- dic[dic$key < max_words, , drop = FALSE]

# look each word up by name; match() returns one GloVe row per word, so a
# duplicated GloVe token can never add extra rows to the matrix
glove_row <- match(usable$word, vectors$word)
has_vector <- !is.na(glove_row)

word_embeds[usable$key[has_vector] + 1, ] <-
  as.matrix(vectors[glove_row[has_vector], -1, drop = FALSE])
53+
54+
# Define the network with the Keras functional API
input <- layer_input(shape = list(maxlen), name = "input")

# Embedding initialised from `word_embeds` and excluded from training
model <- layer_embedding(
  input,
  input_dim = max_words,
  output_dim = dim_size,
  input_length = maxlen,
  weights = list(word_embeds),
  trainable = FALSE
)
# Spatial dropout, then a bidirectional GRU that returns the whole sequence
model <- layer_spatial_dropout_1d(model, rate = 0.2)
model <- bidirectional(
  model,
  layer_gru(units = 80, return_sequences = TRUE)
)

# Two global poolings over the GRU output
max_pool <- layer_global_max_pooling_1d(model)
ave_pool <- layer_global_average_pooling_1d(model)

# Concatenate both poolings and map them to a single sigmoid unit
output <- list(ave_pool, max_pool) %>%
  layer_concatenate() %>%
  layer_dense(units = 1, activation = "sigmoid")

model <- keras_model(inputs = input, outputs = output)
72+
73+
# Compile the model; the AUC metric from tf.keras is used in place of accuracy
compile(
  model,
  optimizer = "adam",
  loss = "binary_crossentropy",
  metrics = tensorflow::tf$keras$metrics$AUC()
)
79+
80+
# Train the model, holding out 20% of the data for validation
history <- keras::fit(
  model,
  x = x_train,
  y = y_train,
  epochs = 8,
  batch_size = 32,
  validation_split = 0.2
)
86+
87+

0 commit comments

Comments
 (0)