
Commit 1627cc1

Adding neural translation with words and language modeling on characters.

1 parent d6ca2b7

3 files changed: +263 -6 lines

examples/README.md

Lines changed: 14 additions & 6 deletions

```diff
@@ -1,5 +1,4 @@
-Examples of Using skflow
-========================
+# Examples of Using skflow
 
 * [Deep Neural Network Regression with Boston Data](boston.py)
 * [Convolutional Neural Networks with Digits Data](digits.py)
@@ -14,18 +13,27 @@ Examples of Using skflow
 * [Out-of-core Data Classification Using Dask](out_of_core_data_classification.py)
 
 
-Image classification
---------------------
+## Image classification
 
 * [Convolutional Neural Networks on MNIST Data](mnist.py)
 * [Deep Residual Networks on MNIST Data](resnet.py)
 
 
-Text classification
--------------------
+## Text classification
 
 * [Text Classification Using Recurrent Neural Networks on Words](text_classification.py) (See also [Simplified Version Using Built-in RNN Model](text_classification_builtin_rnn_model.py))
 * [Text Classification Using Convolutional Neural Networks on Words](text_classification_cnn.py)
 * [Text Classification Using Recurrent Neural Networks on Characters](text_classification_character_rnn.py)
 * [Text Classification Using Convolutional Neural Networks on Characters](text_classification_character_cnn.py)
 
+
+## Language modeling
+
+* [Character level language modeling](language_model.py)
+
+
+## Text sequence to sequence
+
+* [Character level neural language translation](neural_translation.py)
+* [Word level neural language translation](neural_translation_word.py)
+
```
examples/language_model.py

Lines changed: 102 additions & 0 deletions

```python
# encoding: utf-8

# Copyright 2015-present Scikit Flow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import division, print_function, absolute_import

import itertools
import math
import os

import numpy as np
import tensorflow as tf

import skflow

### Training data

CORPUS_FILENAME = "europarl-v6.fr-en.en"
MAX_DOC_LENGTH = 10


def training_data(filename):
    f = open(filename)
    for line in f:
        yield line


def iter_docs(docs):
    for doc in docs:
        n_parts = int(math.ceil(float(len(doc)) / MAX_DOC_LENGTH))
        for part in range(n_parts):
            offset_begin = part * MAX_DOC_LENGTH
            offset_end = offset_begin + MAX_DOC_LENGTH
            inp = np.zeros(MAX_DOC_LENGTH, dtype=np.int32)
            out = np.zeros(MAX_DOC_LENGTH, dtype=np.int32)
            inp[:min(offset_end - offset_begin, len(doc) - offset_begin)] = doc[offset_begin:offset_end]
            out[:min(offset_end - offset_begin, len(doc) - offset_begin - 1)] = doc[offset_begin + 1:offset_end + 1]
            yield inp, out


def unpack_xy(iter_obj):
    X, y = itertools.tee(iter_obj)
    return (item[0] for item in X), (item[1] for item in y)


byte_processor = skflow.preprocessing.ByteProcessor(
    max_document_length=MAX_DOC_LENGTH)

data = training_data(CORPUS_FILENAME)
data = byte_processor.transform(data)
X, y = unpack_xy(iter_docs(data))


### Model

HIDDEN_SIZE = 10


def seq_autoencoder(X, y):
    """Sequence auto-encoder with RNN."""
    inputs = skflow.ops.one_hot_matrix(X, 256)
    in_X, in_y, out_y = skflow.ops.seq2seq_inputs(inputs, y, MAX_DOC_LENGTH, MAX_DOC_LENGTH)
    encoder_cell = tf.nn.rnn_cell.GRUCell(HIDDEN_SIZE)
    decoder_cell = tf.nn.rnn_cell.OutputProjectionWrapper(tf.nn.rnn_cell.GRUCell(HIDDEN_SIZE), 256)
    decoding, _, sampling_decoding, _ = skflow.ops.rnn_seq2seq(in_X, in_y, encoder_cell, decoder_cell)
    return skflow.ops.sequence_classifier(decoding, out_y, sampling_decoding)


def get_language_model(hidden_size):
    """Returns a language model with given hidden size."""

    def language_model(X, y):
        inputs = skflow.ops.one_hot_matrix(X, 256)
        inputs = skflow.ops.split_squeeze(1, MAX_DOC_LENGTH, inputs)
        target = skflow.ops.split_squeeze(1, MAX_DOC_LENGTH, y)
        encoder_cell = tf.nn.rnn_cell.OutputProjectionWrapper(tf.nn.rnn_cell.GRUCell(hidden_size), 256)
        output, _ = tf.nn.rnn(encoder_cell, inputs, dtype=tf.float32)
        return skflow.ops.sequence_classifier(output, target)

    return language_model


### Training model.

estimator = skflow.TensorFlowEstimator(model_fn=get_language_model(HIDDEN_SIZE),
                                       n_classes=256,
                                       optimizer='Adam', learning_rate=0.01,
                                       steps=1000, batch_size=64, continue_training=True)

estimator.fit(X, y)
```
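To make the data framing concrete: `iter_docs` chunks each byte-encoded line into fixed-length windows and pairs every input with the same window shifted left by one character, which is what turns this into next-character language modeling. A minimal, self-contained check of that logic (plain NumPy, no skflow required; the byte ids below are made-up stand-ins for one encoded line):

```python
import math

import numpy as np

MAX_DOC_LENGTH = 10

# Hypothetical byte ids standing in for one encoded line of 13 characters.
doc = np.array([72, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100, 33, 10])

# Same chunking logic as iter_docs above.
n_parts = int(math.ceil(float(len(doc)) / MAX_DOC_LENGTH))
for part in range(n_parts):
    begin = part * MAX_DOC_LENGTH
    end = begin + MAX_DOC_LENGTH
    inp = np.zeros(MAX_DOC_LENGTH, dtype=np.int32)
    out = np.zeros(MAX_DOC_LENGTH, dtype=np.int32)
    inp[:min(end - begin, len(doc) - begin)] = doc[begin:end]
    out[:min(end - begin, len(doc) - begin - 1)] = doc[begin + 1:end + 1]
    print(inp, out)
# Each target vector is its input shifted left by one byte (zero-padded at
# the tail), so the model learns to predict the next character at every step.
```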
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# encoding: utf-8
2+
3+
# Copyright 2015-present Scikit Flow Authors. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from __future__ import division, print_function, absolute_import
18+
19+
import itertools
20+
import os
21+
import numpy as np
22+
23+
import tensorflow as tf
24+
25+
import skflow
26+
27+
# Get training data
28+
29+
# This dataset can be downloaded from http://www.statmt.org/europarl/v6/fr-en.tgz
30+
31+
ENGLISH_CORPUS = "europarl-v6.fr-en.en"
32+
FRENCH_CORPUS = "europarl-v6.fr-en.fr"
33+
34+
def read_iterator(filename, reporting=True):
35+
f = open(filename)
36+
line_count = 0
37+
for line in f:
38+
line_count += 1
39+
if reporting and line_count % 100000 == 0:
40+
print("%d lines read from %s" % (line_count, filename))
41+
yield line.strip()
42+
43+
44+
def repeated_read_iterator(filename):
45+
while True:
46+
f = open(filename)
47+
for line in f:
48+
yield line.strip()
49+
50+
51+
def split_train_test(data, partition=0.2, random_seed=42):
52+
rnd = np.random.RandomState(random_seed)
53+
for item in data:
54+
if rnd.uniform() > partition:
55+
yield (0, item)
56+
else:
57+
yield (1, item)
58+
59+
60+
def save_partitions(data, filenames):
61+
files = [open(filename, 'w') for filename in filenames]
62+
for partition, item in data:
63+
files[partition].write(item + '\n')
64+
65+
66+
def loop_iterator(data):
67+
while True:
68+
for item in data:
69+
yield item
70+
71+
72+
if not (os.path.exists('train.data') and os.path.exists('test.data')):
73+
english_data = read_iterator(ENGLISH_CORPUS)
74+
french_data = read_iterator(FRENCH_CORPUS)
75+
parallel_data = ('%s;;;%s' % (eng, fr) for eng, fr in itertools.izip(english_data, french_data))
76+
save_partitions(split_train_test(parallel_data), ['train.data', 'test.data'])
77+
78+
def Xy(data):
79+
def split_lines(data):
80+
for item in data:
81+
yield item.split(';;;')
82+
X, y = itertools.tee(split_lines(data))
83+
return (item[0] for item in X), (item[1] for item in y)
84+
85+
X_train, y_train = Xy(repeated_read_iterator('train.data'))
86+
X_test, y_test = Xy(read_iterator('test.data'))
87+
88+
# Preprocessing
89+
90+
MAX_DOCUMENT_LENGTH = 10
91+
92+
X_vocab_processor = skflow.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH,
93+
min_frequency=5)
94+
y_vocab_processor = skflow.preprocessing.VocabularyPRocessor(MAX_DOCUMENT_LENGTH,
95+
min_frequency=5)
96+
Xtrainff, ytrainff = Xy(read_iterator('train.data'))
97+
print('Fitting dictionary for English...')
98+
X_vocab_processor.fit(Xtrainff)
99+
print('Fitting dictionary for French...')
100+
y_vocab_processor.fit(ytrainff)
101+
print('Transforming...')
102+
X_train = X_vocab_processor.transform(X_train)
103+
y_train = y_vocab_processor.transform(y_train)
104+
X_test = np.array(list(X_vocab_processor.transform(X_test))[:20])
105+
y_test = list(y_test)[:20]
106+
107+
n_words = len(X_vocab_processor.vocabulary_)
108+
print('Total words: %d' % n_words)
109+
110+
# Translation model
111+
112+
HIDDEN_SIZE = 20
113+
EMBEDDING_SIZE = 20
114+
115+
def translate_model(X, y):
116+
word_vectors = skflow.ops.categorical_variable(X, n_classes=n_words,
117+
embedding_size=EMBEDDING_SIZE, name='words')
118+
in_X, in_y, out_y = skflow.ops.seq2seq_inputs(
119+
word_list, y, MAX_DOCUMENT_LENGTH, MAX_DOCUMENT_LENGTH)
120+
cell = tf.nn.rnn_cell.OutputProjectionWrapper(tf.nn.rnn_cell.GRUCell(HIDDEN_SIZE), 256)
121+
decoding, _, sampling_decoding, _ = skflow.ops.rnn_seq2seq(in_X, in_y, cell)
122+
return skflow.ops.sequence_classifier(decoding, out_y, sampling_decoding)
123+
124+
125+
PATH = '/tmp/tf_examples/ntm_words/'
126+
127+
if os.path.exists(PATH):
128+
translator = skflow.TensorFlowEstimator.restore(PATH)
129+
else:
130+
translator = skflow.TensorFlowEstimator(model_fn=translate_model,
131+
n_classes=n_words,
132+
optimizer='Adam', learning_rate=0.01, batch_size=128,
133+
continue_training=True)
134+
135+
while True:
136+
translator.fit(X_train, y_train, logdir=PATH)
137+
translator.save(PATH)
138+
139+
predictions = translator.predict(xpred, axis=2)
140+
xpred_inp = X_vocab_processor.reverse(xpred)
141+
text_outputs = y_vocab_processor.reverse(predictions)
142+
for inp_data, input_text, pred, output_text, gold in zip(xpred, xpred_inp,
143+
predictions, text_outputs, ygold):
144+
print('English: %s. French (pred): %s, French (gold): %s' %
145+
(input_text, output_text, gold.decode('utf-8')))
146+
print(inp_data, pred)
147+
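For reference, the `train.data` and `test.data` files written by `save_partitions` hold one sentence pair per line, joined by `;;;`, and `Xy` splits that back into parallel English and French streams. A minimal round trip through the same splitting logic (the two sentence pairs below are illustrative stand-ins for Europarl lines):

```python
import itertools


def Xy(data):
    """Split ';;;'-joined parallel lines into (English, French) streams."""
    def split_lines(data):
        for item in data:
            yield item.split(';;;')
    X, y = itertools.tee(split_lines(data))
    return (item[0] for item in X), (item[1] for item in y)


lines = ['Resumption of the session;;;Reprise de la session',
         'Please rise, then;;;Je vous invite a vous lever']
X, y = Xy(iter(lines))
for eng, fr in zip(X, y):
    print('%s -> %s' % (eng, fr))
# Resumption of the session -> Reprise de la session
# Please rise, then -> Je vous invite a vous lever
```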
