
Commit 0eb2015

Merge pull request #217 from tomtung/fasttext_example
Implement Example: FastText classifier on IMDb dataset
2 parents 62c57bb + 000e16b commit 0eb2015

File tree

1 file changed: +245 -0 lines changed


example/imdb_fasttext.py

Lines changed: 245 additions & 0 deletions
@@ -0,0 +1,245 @@
#!/usr/bin/env python

__doc__ = """

This demo implements FastText[1] for sentence classification. FastText is a
simple model for text classification with performance often close to
state-of-the-art, and is useful as a solid baseline.

There are some important differences between this implementation and what
is described in the paper. Instead of Hogwild! SGD[2], we use the Adam
optimizer with mini-batches. Hierarchical softmax is also not supported; if
you have a large label space, consider utilizing candidate sampling methods
provided by TensorFlow[3].

After 5 epochs, you should get test accuracy close to 90.9%.

[1] Joulin, A., Grave, E., Bojanowski, P., & Mikolov, T. (2016).
Bag of Tricks for Efficient Text Classification.
http://arxiv.org/abs/1607.01759

[2] Recht, B., Re, C., Wright, S., & Niu, F. (2011).
Hogwild: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent.
In Advances in Neural Information Processing Systems 24 (pp. 693–701).

[3] https://www.tensorflow.org/api_guides/python/nn#Candidate_Sampling

"""

import array
import hashlib
import time

import tensorflow as tf
import tensorlayer as tl
import numpy as np


# Hashed n-grams with 1 < n <= N_GRAM are included as features
# in addition to unigrams.
N_GRAM = 2

# Size of vocabulary; less frequent words will be treated as "unknown"
VOCAB_SIZE = 100000

# Number of buckets used for hashing n-grams
N_BUCKETS = 1000000

# Size of the embedding vectors
EMBEDDING_SIZE = 50

# Number of epochs for which the model is trained
N_EPOCH = 5

# Size of training mini-batches
BATCH_SIZE = 32

# Path to which to save the trained model
MODEL_FILE_PATH = 'model.npz'

class FastTextEmbeddingInputLayer(tl.layers.Layer):
    def __init__(
            self, inputs, vocabulary_size, embedding_size,
            name='fasttext_layer',
            embeddings_initializer=tf.random_uniform_initializer(-0.1, 0.1),
            embeddings_kwargs=None):
        """FastText embedding input layer for sentences.

        :param inputs: input placeholder or tensor; zeros are paddings
        :param vocabulary_size: an integer, the size of the vocabulary
        :param embedding_size: an integer, the dimension of embedding vectors
        :param name: a string, the name of the layer
        :param embeddings_initializer: the initializer of the embedding matrix
        :param embeddings_kwargs: kwargs to get embedding matrix variable
        """
        super().__init__(name=name)

        if inputs.get_shape().ndims != 2:
            raise ValueError(
                'inputs must be of size batch_size * batch_sentence_length')

        self.inputs = inputs

        print(f" [TL] FastTextEmbeddingInputLayer {self.name}:"
              f" ({vocabulary_size}, {embedding_size})")

        with tf.variable_scope(name):
            self.embeddings = tf.get_variable(
                name='embeddings',
                shape=(vocabulary_size, embedding_size),
                initializer=embeddings_initializer,
                **(embeddings_kwargs or {}),
            )
            word_embeddings = tf.nn.embedding_lookup(
                self.embeddings, self.inputs,
                name='word_embeddings',
            )

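            # Illustrative note (added comment): padding positions hold word
            # id 0, so for a toy batch such as inputs = [[4, 7, 0]] the mask
            # tf.sign(self.inputs) below is [[1, 1, 0]]; the padded position
            # contributes nothing to the embedding sum, and dividing by the
            # count of non-zero entries (2 here) gives the average embedding
            # of the real words only.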
            # Masks used to ignore padding words
            masks = tf.expand_dims(
                tf.sign(self.inputs),
                axis=-1,
                name='masks',
            )
            sum_word_embeddings = tf.reduce_sum(
                word_embeddings * tf.cast(masks, tf.float32),
                axis=1,
            )

            # Count number of non-padding words in each sentence
            # Used to compute average word embeddings in sentences
            sentence_lengths = tf.count_nonzero(
                self.inputs,
                axis=1,
                keep_dims=True,
                dtype=tf.float32,
                name='sentence_lengths',
            )

            sentence_embeddings = tf.divide(
                sum_word_embeddings,
                sentence_lengths,
                name='sentence_embeddings'
            )

        self.outputs = sentence_embeddings
        self.all_layers = [self.outputs]
        self.all_params = [self.embeddings]
        self.all_drop = {}


class FastTextClassifier(object):
    """Simple wrapper class for creating the graph of a FastText classifier."""
    def __init__(self, vocab_size, embedding_size, n_labels):
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.n_labels = n_labels

        self.inputs = tf.placeholder(
            tf.int32, shape=[None, None], name='inputs')
        self.labels = tf.placeholder(
            tf.int32, shape=[None], name='labels')

        # Network structure
        network = FastTextEmbeddingInputLayer(
            self.inputs, self.vocab_size, self.embedding_size)
        self.network = tl.layers.DenseLayer(network, self.n_labels)

        # Training operation
        cost = tl.cost.cross_entropy(
            self.network.outputs,
            self.labels,
            name='cost'
        )
        self.train_op = tf.train.AdamOptimizer().minimize(cost)
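
        # A hedged sketch (commented out; not part of the original example):
        # for a much larger label space, the module docstring suggests
        # candidate sampling instead of the full softmax above. One option is
        # tf.nn.sampled_softmax_loss with its own output-layer variables,
        # roughly:
        #
        #     softmax_w = tf.get_variable(
        #         'softmax_w', [self.n_labels, self.embedding_size])
        #     softmax_b = tf.get_variable('softmax_b', [self.n_labels])
        #     sampled_cost = tf.reduce_mean(tf.nn.sampled_softmax_loss(
        #         weights=softmax_w, biases=softmax_b,
        #         labels=tf.cast(tf.expand_dims(self.labels, -1), tf.int64),
        #         inputs=network.outputs,  # averaged sentence embeddings
        #         num_sampled=25, num_classes=self.n_labels))
        #
        # With only two labels here, the plain cross-entropy above is the
        # sensible choice.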

        # Predictions
        self.prediction_probs = tf.nn.softmax(self.network.outputs)
        self.predictions = tf.argmax(
            self.network.outputs, axis=1, output_type=tf.int32)

        # Evaluation
        are_predictions_correct = tf.equal(self.predictions, self.labels)
        self.accuracy = tf.reduce_mean(
            tf.cast(are_predictions_correct, tf.float32))

    def save(self, sess, filename):
        tl.files.save_npz(self.network.all_params, name=filename, sess=sess)

    def load(self, sess, filename):
        tl.files.load_and_assign_npz(sess, name=filename, network=self.network)

def augment_with_ngrams(unigrams, unigram_vocab_size, n_buckets, n=2):
    """Augment unigram features with hashed n-gram features."""
    def get_ngrams(n):
        return list(zip(*[
            unigrams[i:]
            for i in range(n)
        ]))

    def hash_ngram(ngram):
        bytes_ = array.array('L', ngram).tobytes()
        hash_ = int(hashlib.sha256(bytes_).hexdigest(), 16)
        return unigram_vocab_size + hash_ % n_buckets

    return unigrams + [
        hash_ngram(ngram)
        for i in range(2, n + 1)
        for ngram in get_ngrams(i)
    ]
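
# Illustrative note (added comment): augment_with_ngrams([10, 20, 30],
# VOCAB_SIZE, N_BUCKETS, n=2) returns the unigram ids [10, 20, 30] followed by
# one hashed id for each of the bigrams (10, 20) and (20, 30); every hashed id
# falls in [VOCAB_SIZE, VOCAB_SIZE + N_BUCKETS), which is why the classifier is
# later built with vocab_size=VOCAB_SIZE + N_BUCKETS.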


def load_and_preprocess_imdb_data(n_gram=None):
    """Load IMDb data and augment with hashed n-gram features."""
    X_train, y_train, X_test, y_test = \
        tl.files.load_imdb_dataset(nb_words=VOCAB_SIZE)

    if n_gram is not None:
        X_train = np.array([
            augment_with_ngrams(x, VOCAB_SIZE, N_BUCKETS, n=n_gram)
            for x in X_train
        ])
        X_test = np.array([
            augment_with_ngrams(x, VOCAB_SIZE, N_BUCKETS, n=n_gram)
            for x in X_test
        ])

    return X_train, y_train, X_test, y_test

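# Note on the data (added comment, inferred from how this script uses it):
# X_train / X_test are lists of word-index sequences and y_train / y_test are
# 0/1 sentiment labels; index 0 does not stand for a real word, which is what
# lets tl.prepro.pad_sequences pad with 0 and the embedding layer mask it out.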
def train_test_and_save_model():
    X_train, y_train, X_test, y_test = load_and_preprocess_imdb_data(N_GRAM)
    classifier = FastTextClassifier(
        vocab_size=VOCAB_SIZE + N_BUCKETS,
        embedding_size=EMBEDDING_SIZE,
        n_labels=2,
    )

    with tf.Session() as sess:
        tl.layers.initialize_global_variables(sess)

        for epoch in range(N_EPOCH):
            start_time = time.time()
            print(f'Epoch {epoch + 1}/{N_EPOCH}', end='')
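            # Note (added comment): each mini-batch is padded on its own, so
            # sentences are only padded up to the longest example in that
            # batch rather than to the longest in the whole training set.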
            for X_batch, y_batch in tl.iterate.minibatches(
                    X_train, y_train, batch_size=BATCH_SIZE, shuffle=True):
                sess.run(classifier.train_op, feed_dict={
                    classifier.inputs: tl.prepro.pad_sequences(X_batch),
                    classifier.labels: y_batch,
                })

            print(f'\t{time.time() - start_time:.2f}s')

        test_accuracy = sess.run(classifier.accuracy, feed_dict={
            classifier.inputs: tl.prepro.pad_sequences(X_test),
            classifier.labels: y_test,
        })
        print(f'Test accuracy: {test_accuracy:.5f}')

        classifier.save(sess, MODEL_FILE_PATH)


if __name__ == '__main__':
    train_test_and_save_model()
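
# Hedged usage sketch (added comment, not part of the original example): the
# saved weights could later be restored for inference along these lines, where
# `reviews` stands for a hypothetical list of preprocessed word-index sequences
# augmented with n-grams exactly as during training:
#
#     classifier = FastTextClassifier(VOCAB_SIZE + N_BUCKETS, EMBEDDING_SIZE, 2)
#     with tf.Session() as sess:
#         classifier.load(sess, MODEL_FILE_PATH)
#         probs = sess.run(classifier.prediction_probs, feed_dict={
#             classifier.inputs: tl.prepro.pad_sequences(reviews),
#         })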
