
Commit 44d19ef

Merge pull request #1025 from henry090/master
add example "GloVe"
2 parents 142ddce + 50025cc commit 44d19ef

1 file changed
Lines changed: 64 additions & 167 deletions
@@ -1,190 +1,87 @@
-#' This script loads pre-trained word embeddings (GloVe embeddings) into a
-#' frozen Keras Embedding layer, and uses it to train a text classification
-#' model on the 20 Newsgroup dataset (classication of newsgroup messages into 20
-#' different categories).
-#'
-#' GloVe embedding data can be found at:
-#' http://nlp.stanford.edu/data/glove.6B.zip (source page:
-#' http://nlp.stanford.edu/projects/glove/)
-#'
-#' 20 Newsgroup data can be found at:
-#' http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
-#'
-
-#'
-#' IMPORTANT NOTE: This example does yet work correctly. The code executes fine and
-#' appears to mimic the Python code upon which it is based however it achieves only
-#' half the training accuracy that the Python code does so there is clearly a
-#' subtle difference.
-#'
-#' We need to investigate this further before formally adding to the list of examples
-#'
-#'
+# This example shows how one can quickly load glove vectors
+# and train a Keras model in R
 
 library(keras)
+library(dplyr)
 
-GLOVE_DIR <- 'glove.6B'
-TEXT_DATA_DIR <- '20_newsgroup'
-MAX_SEQUENCE_LENGTH <- 1000
-MAX_NUM_WORDS <- 20000
-EMBEDDING_DIM <- 100
-VALIDATION_SPLIT <- 0.2
-
-# download data if necessary
-download_data <- function(data_dir, url_path, data_file) {
-  if (!dir.exists(data_dir)) {
-    download.file(paste0(url_path, data_file), data_file, mode = "wb")
-    if (tools::file_ext(data_file) == "zip")
-      unzip(data_file, exdir = tools::file_path_sans_ext(data_file))
-    else
-      untar(data_file)
-    unlink(data_file)
-  }
+# Download Glove vectors if necessary
+if (!file.exists('glove.6B.zip')) {
+  download.file('http://nlp.stanford.edu/data/glove.6B.zip', destfile = 'glove.6B.zip')
+  unzip('glove.6B.zip')
 }
-download_data(GLOVE_DIR, 'http://nlp.stanford.edu/data/', 'glove.6B.zip')
-download_data(TEXT_DATA_DIR, "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/", "news20.tar.gz")
 
-# first, build index mapping words in the embeddings set
-# to their embedding vector
+# load an example dataset from text2vec
+library(text2vec)
+data("movie_review")
+as_tibble(movie_review)
 
-cat('Indexing word vectors.\n')
+# load glove vectors into R
+vectors = data.table::fread('glove.6B.300d.txt', data.table = F, encoding = 'UTF-8')
+colnames(vectors) = c('word', paste('dim', 1:300, sep = '_'))
 
-embeddings_index <- new.env(parent = emptyenv())
-lines <- readLines(file.path(GLOVE_DIR, 'glove.6B.100d.txt'))
-for (line in lines) {
-  values <- strsplit(line, ' ', fixed = TRUE)[[1]]
-  word <- values[[1]]
-  coefs <- as.numeric(values[-1])
-  embeddings_index[[word]] <- coefs
-}
+# structure of the vectors
+as_tibble(vectors)
 
-cat(sprintf('Found %s word vectors.\n', length(embeddings_index)))
-
-# second, prepare text samples and their labels
-cat('Processing text dataset\n')
-
-texts <- character()    # text samples
-labels <- integer()     # label ids
-labels_index <- list()  # dictionary: label name to numeric id
-
-for (name in list.files(TEXT_DATA_DIR)) {
-  path <- file.path(TEXT_DATA_DIR, name)
-  if (file_test("-d", path)) {
-    label_id <- length(labels_index)
-    labels_index[[name]] <- label_id
-    for (fname in list.files(path)) {
-      if (grepl("^[0-9]+$", fname)) {
-        fpath <- file.path(path, fname)
-        t <- readLines(fpath, encoding = "latin1")
-        t <- paste(t, collapse = "\n")
-        i <- regexpr(pattern = '\n\n', t, fixed = TRUE)[[1]]
-        if (i != -1L)
-          t <- substring(t, i)
-        texts <- c(texts, t)
-        labels <- c(labels, label_id)
-      }
-    }
-  }
-}
+# define parameters of Keras model
+library(keras)
+max_words = 1e4
+maxlen = 60
+dim_size = 300
 
-cat(sprintf('Found %s texts.\n', length(texts)))
-
-# finally, vectorize the text samples into a 2D integer tensor
-tokenizer <- text_tokenizer(num_words = MAX_NUM_WORDS)
-tokenizer %>% fit_text_tokenizer(texts)
-
-# save the tokenizer in case we want to use it again
-# for prediction within another R session, see:
-# https://keras.rstudio.com/reference/save_text_tokenizer.html
-save_text_tokenizer(tokenizer, "tokenizer")
-
-sequences <- texts_to_sequences(tokenizer, texts)
-
-word_index <- tokenizer$word_index
-cat(sprintf('Found %s unique tokens.\n', length(word_index)))
-
-data <- pad_sequences(sequences, maxlen = MAX_SEQUENCE_LENGTH)
-labels <- to_categorical(labels)
-
-cat('Shape of data tensor: ', dim(data), '\n')
-cat('Shape of label tensor: ', dim(labels), '\n')
-
-# split the data into a training set and a validation set
-indices <- 1:nrow(data)
-indices <- sample(indices)
-data <- data[indices,]
-labels <- labels[indices,]
-num_validation_samples <- as.integer(VALIDATION_SPLIT * nrow(data))
-
-x_train <- data[-(1:num_validation_samples),]
-y_train <- labels[-(1:num_validation_samples),]
-x_val <- data[1:num_validation_samples,]
-y_val <- labels[1:num_validation_samples,]
-
-cat('Preparing embedding matrix.\n')
-
-# prepare embedding matrix
-num_words <- min(MAX_NUM_WORDS, length(word_index) + 1)
-prepare_embedding_matrix <- function() {
-  embedding_matrix <- matrix(0L, nrow = num_words, ncol = EMBEDDING_DIM)
-  for (word in names(word_index)) {
-    index <- word_index[[word]]
-    if (index >= MAX_NUM_WORDS)
-      next
-    embedding_vector <- embeddings_index[[word]]
-    if (!is.null(embedding_vector)) {
-      # words not found in embedding index will be all-zeros.
-      embedding_matrix[index,] <- embedding_vector
-    }
-  }
-  embedding_matrix
-}
+# tokenize the input data and then fit the created object
+word_seqs = text_tokenizer(num_words = max_words) %>%
+  fit_text_tokenizer(movie_review$review)
 
-embedding_matrix <- prepare_embedding_matrix()
+# apply tokenizer to the text and get indices instead of words
+# later pad the sequence
+x_train = texts_to_sequences(word_seqs, movie_review$review) %>%
+  pad_sequences(maxlen = maxlen)
 
-# load pre-trained word embeddings into an Embedding layer
-# note that we set trainable = False so as to keep the embeddings fixed
-embedding_layer <- layer_embedding(
-  input_dim = num_words,
-  output_dim = EMBEDDING_DIM,
-  weights = list(embedding_matrix),
-  input_length = MAX_SEQUENCE_LENGTH,
-  trainable = FALSE
-)
-
-cat('Training model\n')
+# extract the output
+y_train = as.matrix(movie_review$sentiment)
 
-# train a 1D convnet with global maxpooling
-sequence_input <- layer_input(shape = list(MAX_SEQUENCE_LENGTH), dtype = 'int32')
+# unlist word indices
+word_indices = unlist(word_seqs$word_index)
 
-preds <- sequence_input %>%
-  embedding_layer %>%
-  layer_conv_1d(filters = 128, kernel_size = 5, activation = 'relu') %>%
-  layer_max_pooling_1d(pool_size = 5) %>%
-  layer_conv_1d(filters = 128, kernel_size = 5, activation = 'relu') %>%
-  layer_max_pooling_1d(pool_size = 5) %>%
-  layer_conv_1d(filters = 128, kernel_size = 5, activation = 'relu') %>%
-  layer_max_pooling_1d(pool_size = 35) %>%
-  layer_flatten() %>%
-  layer_dense(units = 128, activation = 'relu') %>%
-  layer_dense(units = length(labels_index), activation = 'softmax')
+# then place them into data.frame
+dic = data.frame(word = names(word_indices), key = word_indices, stringsAsFactors = FALSE) %>%
+  arrange(key) %>% .[1:max_words,]
 
+# join the words with GloVe vectors and
+# if word does not exist in GloVe, then fill NA's with 0
+word_embeds = dic %>% left_join(vectors) %>% .[, 3:302] %>% replace(., is.na(.), 0) %>% as.matrix()
 
-model <- keras_model(sequence_input, preds)
+# Use Keras Functional API
+input = layer_input(shape = list(maxlen), name = "input")
 
+model = input %>%
+  layer_embedding(input_dim = max_words, output_dim = dim_size, input_length = maxlen,
+                  # put weights into list and do not allow training
+                  weights = list(word_embeds), trainable = FALSE) %>%
+  layer_spatial_dropout_1d(rate = 0.2) %>%
+  bidirectional(
+    layer_gru(units = 80, return_sequences = TRUE)
+  )
+max_pool = model %>% layer_global_max_pooling_1d()
+ave_pool = model %>% layer_global_average_pooling_1d()
+
+output = layer_concatenate(list(ave_pool, max_pool)) %>%
+  layer_dense(units = 1, activation = "sigmoid")
+
+model = keras_model(input, output)
+
+# instead of accuracy we can use "AUC" metrics from "tensorflow.keras"
 model %>% compile(
-  loss = 'categorical_crossentropy',
-  optimizer = 'rmsprop',
-  metrics = c('acc')
+  optimizer = "adam",
+  loss = "binary_crossentropy",
+  metrics = tensorflow::tf$keras$metrics$AUC()
 )
 
-model %>% fit(
+history = model %>% keras::fit(
   x_train, y_train,
-  batch_size = 128,
-  epochs = 10,
-  validation_data = list(x_val, y_val)
+  epochs = 8,
+  batch_size = 32,
+  validation_split = 0.2
 )
 
 
-
-
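The new example trains a binary sentiment classifier on the movie_review data, so a fitted model can score fresh text with the usual Keras workflow. Below is a minimal sketch, assuming the `word_seqs` tokenizer, `maxlen`, and the trained `model` from the script above are still in the R session; the review strings are made up for illustration and are not part of the committed example.

# minimal prediction sketch (illustrative only)
new_reviews = c("A wonderful, touching film", "Two hours I will never get back")

# reuse the fitted tokenizer and the same padding length as in training
x_new = texts_to_sequences(word_seqs, new_reviews) %>%
  pad_sequences(maxlen = maxlen)

# predicted probability that each review is positive
model %>% predict(x_new)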