1 | | -#' This script loads pre-trained word embeddings (GloVe embeddings) into a |
2 | | -#' frozen Keras Embedding layer, and uses it to train a text classification |
3 | | -#' model on the 20 Newsgroup dataset (classification of newsgroup messages into 20 |
4 | | -#' different categories). |
5 | | -#' |
6 | | -#' GloVe embedding data can be found at: |
7 | | -#' http://nlp.stanford.edu/data/glove.6B.zip (source page: |
8 | | -#' http://nlp.stanford.edu/projects/glove/) |
9 | | -#' |
10 | | -#' 20 Newsgroup data can be found at: |
11 | | -#' http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html |
12 | | -#' |
13 | | - |
14 | | -#' |
15 | | -#' IMPORTANT NOTE: This example does not yet work correctly. The code executes fine and |
16 | | -#' appears to mimic the Python code upon which it is based; however, it achieves only |
17 | | -#' half the training accuracy that the Python code does, so there is clearly a |
18 | | -#' subtle difference. |
19 | | -#' |
20 | | -#' We need to investigate this further before formally adding to the list of examples |
21 | | -#' |
22 | | -#' |
| 1 | +# This example shows how one can quickly load pre-trained GloVe vectors |
| 2 | +# and use them to train a Keras model in R |
23 | 3 |
24 | 4 | library(keras) |
| 5 | +library(dplyr) |
25 | 6 |
26 | | -GLOVE_DIR <- 'glove.6B' |
27 | | -TEXT_DATA_DIR <- '20_newsgroup' |
28 | | -MAX_SEQUENCE_LENGTH <- 1000 |
29 | | -MAX_NUM_WORDS <- 20000 |
30 | | -EMBEDDING_DIM <- 100 |
31 | | -VALIDATION_SPLIT <- 0.2 |
32 | | - |
33 | | -# download data if necessary |
34 | | -download_data <- function(data_dir, url_path, data_file) { |
35 | | - if (!dir.exists(data_dir)) { |
36 | | - download.file(paste0(url_path, data_file), data_file, mode = "wb") |
37 | | - if (tools::file_ext(data_file) == "zip") |
38 | | - unzip(data_file, exdir = tools::file_path_sans_ext(data_file)) |
39 | | - else |
40 | | - untar(data_file) |
41 | | - unlink(data_file) |
42 | | - } |
| 7 | +# Download GloVe vectors if necessary |
| 8 | +if (!file.exists('glove.6B.zip')) { |
| 9 | + download.file('http://nlp.stanford.edu/data/glove.6B.zip', destfile = 'glove.6B.zip') |
| 10 | + unzip('glove.6B.zip') |
43 | 11 | } |
44 | | -download_data(GLOVE_DIR, 'http://nlp.stanford.edu/data/', 'glove.6B.zip') |
45 | | -download_data(TEXT_DATA_DIR, "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/", "news20.tar.gz") |
46 | 12 |
47 | | -# first, build index mapping words in the embeddings set |
48 | | -# to their embedding vector |
| 13 | +# load an example dataset from text2vec |
| 14 | +library(text2vec) |
| 15 | +data("movie_review") |
| 16 | +as_tibble(movie_review) |
49 | 17 |
50 | | -cat('Indexing word vectors.\n') |
| 18 | +# load the 300-dimensional GloVe vectors into R |
| 19 | +vectors = data.table::fread('glove.6B.300d.txt', data.table = F, encoding = 'UTF-8') |
| 20 | +colnames(vectors) = c('word', paste('dim', 1:300, sep = '_')) |
51 | 21 |
52 | | -embeddings_index <- new.env(parent = emptyenv()) |
53 | | -lines <- readLines(file.path(GLOVE_DIR, 'glove.6B.100d.txt')) |
54 | | -for (line in lines) { |
55 | | - values <- strsplit(line, ' ', fixed = TRUE)[[1]] |
56 | | - word <- values[[1]] |
57 | | - coefs <- as.numeric(values[-1]) |
58 | | - embeddings_index[[word]] <- coefs |
59 | | -} |
| 22 | +# structure of the vectors |
| 23 | +as_tibble(vectors) |
60 | 24 |
61 | | -cat(sprintf('Found %s word vectors.\n', length(embeddings_index))) |
62 | | - |
63 | | -# second, prepare text samples and their labels |
64 | | -cat('Processing text dataset\n') |
65 | | - |
66 | | -texts <- character() # text samples |
67 | | -labels <- integer() # label ids |
68 | | -labels_index <- list() # dictionary: label name to numeric id |
69 | | - |
70 | | -for (name in list.files(TEXT_DATA_DIR)) { |
71 | | - path <- file.path(TEXT_DATA_DIR, name) |
72 | | - if (file_test("-d", path)) { |
73 | | - label_id <- length(labels_index) |
74 | | - labels_index[[name]] <- label_id |
75 | | - for (fname in list.files(path)) { |
76 | | - if (grepl("^[0-9]+$", fname)) { |
77 | | - fpath <- file.path(path, fname) |
78 | | - t <- readLines(fpath, encoding = "latin1") |
79 | | - t <- paste(t, collapse = "\n") |
80 | | - i <- regexpr(pattern = '\n\n', t, fixed = TRUE)[[1]] |
81 | | - if (i != -1L) |
82 | | - t <- substring(t, i) |
83 | | - texts <- c(texts, t) |
84 | | - labels <- c(labels, label_id) |
85 | | - } |
86 | | - } |
87 | | - } |
88 | | -} |
| 25 | +# define parameters of the Keras model |
| 26 | +library(keras) |
| 27 | +max_words = 1e4  # maximum vocabulary size used by the tokenizer |
| 28 | +maxlen = 60      # pad/truncate every review to 60 tokens |
| 29 | +dim_size = 300   # dimensionality of the GloVe embeddings |
89 | 30 |
90 | | -cat(sprintf('Found %s texts.\n', length(texts))) |
91 | | - |
92 | | -# finally, vectorize the text samples into a 2D integer tensor |
93 | | -tokenizer <- text_tokenizer(num_words=MAX_NUM_WORDS) |
94 | | -tokenizer %>% fit_text_tokenizer(texts) |
95 | | - |
96 | | -# save the tokenizer in case we want to use it again |
97 | | -# for prediction within another R session, see: |
98 | | -# https://keras.rstudio.com/reference/save_text_tokenizer.html |
99 | | -save_text_tokenizer(tokenizer, "tokenizer") |
100 | | - |
101 | | -sequences <- texts_to_sequences(tokenizer, texts) |
102 | | - |
103 | | -word_index <- tokenizer$word_index |
104 | | -cat(sprintf('Found %s unique tokens.\n', length(word_index))) |
105 | | - |
106 | | -data <- pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) |
107 | | -labels <- to_categorical(labels) |
108 | | - |
109 | | -cat('Shape of data tensor: ', dim(data), '\n') |
110 | | -cat('Shape of label tensor: ', dim(labels), '\n') |
111 | | - |
112 | | -# split the data into a training set and a validation set |
113 | | -indices <- 1:nrow(data) |
114 | | -indices <- sample(indices) |
115 | | -data <- data[indices,] |
116 | | -labels <- labels[indices,] |
117 | | -num_validation_samples <- as.integer(VALIDATION_SPLIT * nrow(data)) |
118 | | - |
119 | | -x_train <- data[-(1:num_validation_samples),] |
120 | | -y_train <- labels[-(1:num_validation_samples),] |
121 | | -x_val <- data[1:num_validation_samples,] |
122 | | -y_val <- labels[1:num_validation_samples,] |
123 | | - |
124 | | -cat('Preparing embedding matrix.\n') |
125 | | - |
126 | | -# prepare embedding matrix |
127 | | -num_words <- min(MAX_NUM_WORDS, length(word_index) + 1) |
128 | | -prepare_embedding_matrix <- function() { |
129 | | - embedding_matrix <- matrix(0L, nrow = num_words, ncol = EMBEDDING_DIM) |
130 | | - for (word in names(word_index)) { |
131 | | - index <- word_index[[word]] |
132 | | - if (index >= MAX_NUM_WORDS) |
133 | | - next |
134 | | - embedding_vector <- embeddings_index[[word]] |
135 | | - if (!is.null(embedding_vector)) { |
136 | | - # words not found in embedding index will be all-zeros. |
137 | | - embedding_matrix[index,] <- embedding_vector |
138 | | - } |
139 | | - } |
140 | | - embedding_matrix |
141 | | -} |
| 31 | +# create a tokenizer limited to max_words and fit it on the review texts |
| 32 | +word_seqs = text_tokenizer(num_words = max_words) %>% |
| 33 | + fit_text_tokenizer(movie_review$review) |
142 | 34 |
143 | | -embedding_matrix <- prepare_embedding_matrix() |
| 35 | +# apply the tokenizer to the text to get word indices instead of words, |
| 36 | +# then pad the sequences to a common length |
| 37 | +x_train = texts_to_sequences(word_seqs, movie_review$review) %>% |
| 38 | + pad_sequences(maxlen = maxlen) |
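As a small sanity check (a sketch using the objects defined above), the padded matrix should have one row per review and maxlen columns:

# inspect the shape of the padded input matrix
dim(x_train)  # number of reviews x maxlen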
144 | 39 |
145 | | -# load pre-trained word embeddings into an Embedding layer |
146 | | -# note that we set trainable = False so as to keep the embeddings fixed |
147 | | -embedding_layer <- layer_embedding( |
148 | | - input_dim = num_words, |
149 | | - output_dim = EMBEDDING_DIM, |
150 | | - weights = list(embedding_matrix), |
151 | | - input_length = MAX_SEQUENCE_LENGTH, |
152 | | - trainable = FALSE |
153 | | -) |
154 | | - |
155 | | -cat('Training model\n') |
| 40 | +# extract the labels (binary sentiment) |
| 41 | +y_train = as.matrix(movie_review$sentiment) |
156 | 42 |
157 | | -# train a 1D convnet with global maxpooling |
158 | | -sequence_input <- layer_input(shape = list(MAX_SEQUENCE_LENGTH), dtype='int32') |
| 43 | +# unlist word indices |
| 44 | +word_indices = unlist(word_seqs$word_index) |
159 | 45 |
160 | | -preds <- sequence_input %>% |
161 | | - embedding_layer %>% |
162 | | - layer_conv_1d(filters = 128, kernel_size = 5, activation = 'relu') %>% |
163 | | - layer_max_pooling_1d(pool_size = 5) %>% |
164 | | - layer_conv_1d(filters = 128, kernel_size = 5, activation = 'relu') %>% |
165 | | - layer_max_pooling_1d(pool_size = 5) %>% |
166 | | - layer_conv_1d(filters = 128, kernel_size = 5, activation = 'relu') %>% |
167 | | - layer_max_pooling_1d(pool_size = 35) %>% |
168 | | - layer_flatten() %>% |
169 | | - layer_dense(units = 128, activation = 'relu') %>% |
170 | | - layer_dense(units = length(labels_index), activation = 'softmax') |
| 46 | +# then place them into a data.frame ordered by index, keeping the top max_words |
| 47 | +dic = data.frame(word = names(word_indices), key = word_indices, stringsAsFactors = FALSE) %>% |
| 48 | + arrange(key) %>% .[1:max_words,] |
171 | 49 |
| 50 | +# join the words with the GloVe vectors; |
| 51 | +# words that do not exist in GloVe get all-zero vectors (NAs filled with 0) |
| 52 | +word_embeds = dic %>% left_join(vectors) %>% .[,3:302] %>% replace(., is.na(.), 0) %>% as.matrix() |
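An optional sanity check (a sketch, assuming word_embeds was built as above): since unmatched words were filled with zeros, counting the non-zero rows shows how much of the tokenizer vocabulary glove.6B.300d.txt actually covers.

# proportion of the top max_words tokens that have a GloVe vector
covered = sum(rowSums(abs(word_embeds)) > 0)
cat(sprintf("GloVe coverage: %d of %d words (%.1f%%)\n",
            covered, nrow(word_embeds), 100 * covered / nrow(word_embeds)))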
172 | 53 |
173 | | -model <- keras_model(sequence_input, preds) |
| 54 | +# Use Keras Functional API |
| 55 | +input = layer_input(shape = list(maxlen), name = "input") |
174 | 56 |
| 57 | +model = input %>% |
| 58 | + layer_embedding(input_dim = max_words, output_dim = dim_size, input_length = maxlen, |
| 59 | + # initialize with the GloVe matrix (passed as a list) and keep it frozen during training |
| 60 | + weights = list(word_embeds), trainable = FALSE) %>% |
| 61 | + layer_spatial_dropout_1d(rate = 0.2) %>% |
| 62 | + bidirectional( |
| 63 | + layer_gru(units = 80, return_sequences = TRUE) |
| 64 | + ) |
| 65 | +max_pool = model %>% layer_global_max_pooling_1d() |
| 66 | +ave_pool = model %>% layer_global_average_pooling_1d() |
| 67 | + |
| 68 | +output = layer_concatenate(list(ave_pool, max_pool)) %>% |
| 69 | + layer_dense(units = 1, activation = "sigmoid") |
| 70 | + |
| 71 | +model = keras_model(input, output) |
| 72 | + |
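Before compiling, one can print the assembled model to confirm the layer output shapes and that the frozen embedding weights are counted as non-trainable parameters:

# inspect the architecture of the assembled model
summary(model)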
| 73 | +# instead of accuracy we can use the AUC metric from tf.keras |
175 | 74 | model %>% compile( |
176 | | - loss = 'categorical_crossentropy', |
177 | | - optimizer = 'rmsprop', |
178 | | - metrics = c('acc') |
| 75 | + optimizer = "adam", |
| 76 | + loss = "binary_crossentropy", |
| 77 | + metrics = tensorflow::tf$keras$metrics$AUC() |
179 | 78 | ) |
180 | 79 |
181 | | -model %>% fit( |
| 80 | +history = model %>% keras::fit( |
182 | 81 | x_train, y_train, |
183 | | - batch_size = 128, |
184 | | - epochs = 10, |
185 | | - validation_data = list(x_val, y_val) |
| 82 | + epochs = 8, |
| 83 | + batch_size = 32, |
| 84 | + validation_split = 0.2 |
186 | 85 | ) |
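As a minimal usage sketch (the example sentences below are made up), new reviews can be scored by pushing them through the same tokenizer and padding before calling predict on the fitted model:

# score a couple of hypothetical reviews with the trained model
new_text = c("a wonderful, touching film", "dull plot and terrible acting")
new_seqs = texts_to_sequences(word_seqs, new_text) %>%
  pad_sequences(maxlen = maxlen)
predict(model, new_seqs)  # predicted probability of positive sentiment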
187 | 86 |
188 | 87 |
189 | | - |
190 | | - |