Skip to content

Commit 172c887

Browse files
authored
init (#9462)
1 parent faa752a commit 172c887

File tree

6 files changed

+1355
-0
lines changed

6 files changed

+1355
-0
lines changed
Lines changed: 349 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,349 @@
1+
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""seq2seq model for fluid."""
15+
from __future__ import absolute_import
16+
from __future__ import division
17+
from __future__ import print_function
18+
19+
import numpy as np
20+
import argparse
21+
import time
22+
import distutils.util
23+
24+
import paddle.v2 as paddle
25+
import paddle.fluid as fluid
26+
import paddle.fluid.core as core
27+
import paddle.fluid.framework as framework
28+
from paddle.fluid.executor import Executor
29+
30+
parser = argparse.ArgumentParser(description=__doc__)
# Command-line options, registered in the order they are listed below. Each
# entry pairs the flag with the keyword arguments given to add_argument().
for _flag, _spec in (
    ("--embedding_dim", dict(
        type=int,
        default=512,
        help="The dimension of embedding table. (default: %(default)d)")),
    ("--encoder_size", dict(
        type=int,
        default=512,
        help="The size of encoder bi-rnn unit. (default: %(default)d)")),
    ("--decoder_size", dict(
        type=int,
        default=512,
        help="The size of decoder rnn unit. (default: %(default)d)")),
    ("--batch_size", dict(
        type=int,
        default=16,
        help="The sequence number of a mini-batch data. "
        "(default: %(default)d)")),
    ("--dict_size", dict(
        type=int,
        default=30000,
        help="The dictionary capacity. Dictionaries of source sequence and "
        "target dictionary have same capacity. (default: %(default)d)")),
    ("--pass_num", dict(
        type=int,
        default=2,
        help="The pass number to train. (default: %(default)d)")),
    ("--learning_rate", dict(
        type=float,
        default=0.0002,
        help="Learning rate used to train the model. "
        "(default: %(default)f)")),
    ("--infer_only", dict(
        action='store_true',
        help="If set, run forward only.")),
    ("--beam_size", dict(
        type=int,
        default=3,
        help="The width for beam searching. (default: %(default)d)")),
    # strtobool parses the usual yes/no, true/false, on/off, 1/0 spellings.
    ("--use_gpu", dict(
        type=distutils.util.strtobool,
        default=True,
        help="Whether to use gpu. (default: %(default)d)")),
    ("--max_length", dict(
        type=int,
        default=250,
        help="The maximum length of sequence when doing generation. "
        "(default: %(default)d)")),
):
    parser.add_argument(_flag, **_spec)
# Do not leave the loop variables behind as module-level names.
del _flag, _spec
85+
86+
87+
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
    """Build one LSTM step out of fluid layers.

    Four separate fully-connected layers (each with a bias) are applied to
    ``[hidden_t_prev, x_t]``: three followed by a sigmoid (forget, input and
    output gates) and one followed by tanh (the candidate cell value).

    Args:
        x_t: Input of the current step.
        hidden_t_prev: Hidden state of the previous step.
        cell_t_prev: Cell state of the previous step.
        size: Output size of every gate's fully-connected layer.

    Returns:
        A tuple ``(hidden_t, cell_t)`` where
        ``cell_t = forget * cell_t_prev + input * candidate`` and
        ``hidden_t = output * tanh(cell_t)``.
    """
    layers = fluid.layers

    def gate(activation):
        # Every call creates its own fc layer, so the gates do not share
        # a projection.
        projected = layers.fc(
            input=[hidden_t_prev, x_t], size=size, bias_attr=True)
        return activation(x=projected)

    # Creation order is kept as forget, input, output, candidate.
    forget_gate = gate(layers.sigmoid)
    input_gate = gate(layers.sigmoid)
    output_gate = gate(layers.sigmoid)
    candidate = gate(layers.tanh)

    retained = layers.elementwise_mul(x=forget_gate, y=cell_t_prev)
    written = layers.elementwise_mul(x=input_gate, y=candidate)
    cell_t = layers.sums(input=[retained, written])

    squashed_cell = layers.tanh(x=cell_t)
    hidden_t = layers.elementwise_mul(x=output_gate, y=squashed_cell)

    return hidden_t, cell_t
106+
107+
108+
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
                   target_dict_dim, is_generating, beam_size, max_length):
    """Construct the training graph of a seq2seq network with attention.

    The source sequence is embedded and run through a forward and a reversed
    dynamic LSTM. A DynamicRNN decoder then attends over the encoder outputs
    at every step and emits a softmax over the target dictionary. The loss is
    the mean cross entropy between those predictions and the label sequence.

    Args:
        embedding_dim: Width of the source and target embedding tables.
        encoder_size: Gate size of each encoder LSTM direction.
        decoder_size: Size of the decoder state and attention projections.
        source_dict_dim: Number of rows of the source embedding table.
        target_dict_dim: Number of rows of the target embedding table and
            size of the decoder's softmax output.
        is_generating: Must be falsy; only the training network is built.
        beam_size: Accepted for interface compatibility; not used here.
        max_length: Accepted for interface compatibility; not used here.

    Returns:
        A tuple ``(avg_cost, feeding_list)``: the mean cross-entropy variable
        and the names of the data layers to feed, in the order source,
        target, label.

    Raises:
        NotImplementedError: If ``is_generating`` is truthy. Previously this
            case failed with an unbound-local error at the return statement.
    """
    if is_generating:
        # Fail before any layer is added to the default program.
        raise NotImplementedError(
            "seq_to_seq_net only builds the training network; "
            "generation (is_generating=True) is not implemented.")

    def bi_lstm_encoder(input_seq, gate_size):
        """Return the outputs of a forward and a reversed dynamic LSTM."""
        # Linear transformation part for input gate, output gate, forget gate
        # and cell activation vectors need be done outside of dynamic_lstm.
        # So the output size is 4 times of gate_size.
        input_forward_proj = fluid.layers.fc(input=input_seq,
                                             size=gate_size * 4,
                                             act=None,
                                             bias_attr=False)
        forward, _ = fluid.layers.dynamic_lstm(
            input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
        input_reversed_proj = fluid.layers.fc(input=input_seq,
                                              size=gate_size * 4,
                                              act=None,
                                              bias_attr=False)
        backward, _ = fluid.layers.dynamic_lstm(
            input=input_reversed_proj,
            size=gate_size * 4,
            is_reverse=True,
            use_peepholes=False)
        return forward, backward

    src_word_idx = fluid.layers.data(
        name='source_sequence', shape=[1], dtype='int64', lod_level=1)

    src_embedding = fluid.layers.embedding(
        input=src_word_idx,
        size=[source_dict_dim, embedding_dim],
        dtype='float32')

    src_forward, src_reversed = bi_lstm_encoder(
        input_seq=src_embedding, gate_size=encoder_size)

    encoded_vector = fluid.layers.concat(
        input=[src_forward, src_reversed], axis=1)

    encoded_proj = fluid.layers.fc(input=encoded_vector,
                                   size=decoder_size,
                                   bias_attr=False)

    # The decoder's initial hidden state is derived from the first step of
    # the reversed encoder output.
    backward_first = fluid.layers.sequence_pool(
        input=src_reversed, pool_type='first')

    decoder_boot = fluid.layers.fc(input=backward_first,
                                   size=decoder_size,
                                   bias_attr=False,
                                   act='tanh')

    def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
                                    decoder_boot, decoder_size):
        """Build the attention decoder and return its per-step softmax."""

        def simple_attention(encoder_vec, encoder_proj, decoder_state):
            """Return the attention-weighted sum of ``encoder_vec``."""
            decoder_state_proj = fluid.layers.fc(input=decoder_state,
                                                 size=decoder_size,
                                                 bias_attr=False)
            # Expand the projected decoder state along encoder_proj's
            # sequence structure so the two can be concatenated.
            decoder_state_expand = fluid.layers.sequence_expand(
                x=decoder_state_proj, y=encoder_proj)
            concated = fluid.layers.concat(
                input=[encoder_proj, decoder_state_expand], axis=1)
            attention_weights = fluid.layers.fc(input=concated,
                                                size=1,
                                                act='tanh',
                                                bias_attr=False)
            attention_weights = fluid.layers.sequence_softmax(
                input=attention_weights)
            weights_reshape = fluid.layers.reshape(
                x=attention_weights, shape=[-1])
            scaled = fluid.layers.elementwise_mul(
                x=encoder_vec, y=weights_reshape, axis=0)
            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
            return context

        rnn = fluid.layers.DynamicRNN()

        # Zero initial cell state with the same batch size as decoder_boot.
        cell_init = fluid.layers.fill_constant_batch_size_like(
            input=decoder_boot,
            value=0.0,
            shape=[-1, decoder_size],
            dtype='float32')
        cell_init.stop_gradient = False

        with rnn.block():
            current_word = rnn.step_input(target_embedding)
            encoder_vec = rnn.static_input(encoder_vec)
            encoder_proj = rnn.static_input(encoder_proj)
            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
            cell_mem = rnn.memory(init=cell_init)
            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
            decoder_inputs = fluid.layers.concat(
                input=[context, current_word], axis=1)
            h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
            rnn.update_memory(hidden_mem, h)
            rnn.update_memory(cell_mem, c)
            out = fluid.layers.fc(input=h,
                                  size=target_dict_dim,
                                  bias_attr=True,
                                  act='softmax')
            rnn.output(out)
        return rnn()

    trg_word_idx = fluid.layers.data(
        name='target_sequence', shape=[1], dtype='int64', lod_level=1)

    trg_embedding = fluid.layers.embedding(
        input=trg_word_idx,
        size=[target_dict_dim, embedding_dim],
        dtype='float32')

    prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
                                             encoded_proj, decoder_boot,
                                             decoder_size)
    label = fluid.layers.data(
        name='label_sequence', shape=[1], dtype='int64', lod_level=1)
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Names of the data layers declared above, in feeding order.
    feeding_list = ["source_sequence", "target_sequence", "label_sequence"]

    return avg_cost, feeding_list
230+
231+
232+
def to_lodtensor(data, place):
    """Pack a batch of sequences into a single level-1 LoDTensor.

    Args:
        data: Batch of sequences; each sequence must support ``len()`` and
            the batch must be accepted by ``np.concatenate``.
        place: Device place handed to ``LoDTensor.set``.

    Returns:
        A tuple ``(lod_tensor, total_length)``. ``lod_tensor`` holds all
        elements as an int64 column of shape ``[total_length, 1]``, and
        ``total_length`` is the summed length of all sequences.
    """
    # Cumulative offsets: entry i is where sequence i starts, and the last
    # entry is the total number of elements.
    offsets = [0]
    for seq in data:
        offsets.append(offsets[-1] + len(seq))

    tokens = np.concatenate(data, axis=0).astype("int64")
    tokens = tokens.reshape([len(tokens), 1])

    tensor = core.LoDTensor()
    tensor.set(tokens, place)
    tensor.set_lod([offsets])
    return tensor, offsets[-1]
245+
246+
247+
def lodtensor_to_ndarray(lod_tensor):
    """Copy the float contents of a LoDTensor into a float32 ndarray.

    Args:
        lod_tensor: Object exposing ``get_dims()`` (the tensor shape) and
            ``get_float_element(i)`` (the i-th element in flat order).

    Returns:
        A float32 ``numpy.ndarray`` of shape ``get_dims()`` filled element by
        element from the tensor.
    """
    dims = lod_tensor.get_dims()
    ndarray = np.zeros(shape=dims, dtype='float32')
    # reshape(-1) on this freshly allocated contiguous array is a view, so
    # writes through `flat` land in `ndarray`.
    flat = ndarray.reshape(-1)
    # range + ndarray.size works on Python 2 and 3 and avoids np.product,
    # which newer NumPy releases no longer provide.
    for i in range(ndarray.size):
        flat[i] = lod_tensor.get_float_element(i)
    return ndarray
253+
254+
255+
def train():
    """Train the seq2seq model on WMT14 and report the loss after each pass.

    Reads its configuration from the module-level ``args`` namespace. For
    every pass it runs all training batches, printing the training loss per
    batch, then evaluates on the test set and prints the test loss together
    with throughput (source + target words per second) and seconds per pass.
    """
    avg_cost, feeding_list = seq_to_seq_net(
        args.embedding_dim,
        args.encoder_size,
        args.decoder_size,
        args.dict_size,
        args.dict_size,
        False,
        beam_size=args.beam_size,
        max_length=args.max_length)

    # clone from default main program
    # NOTE(review): the clone is taken before minimize() is called, so it
    # presumably holds only the forward/loss ops -- confirm.
    inference_program = fluid.default_main_program().clone()

    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    optimizer.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    train_batch_generator = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
        batch_size=args.batch_size)

    test_batch_generator = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
        batch_size=args.batch_size)

    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())

    def column(batch, index):
        # Build a real list: map() is lazy on Python 3, and to_lodtensor
        # iterates its input twice (lengths first, then concatenation).
        return [sample[index] for sample in batch]

    def do_validation():
        """Return the mean loss over all test batches."""
        total_loss = 0.0
        count = 0
        for batch_id, data in enumerate(test_batch_generator()):
            src_seq = to_lodtensor(column(data, 0), place)[0]
            trg_seq = to_lodtensor(column(data, 1), place)[0]
            lbl_seq = to_lodtensor(column(data, 2), place)[0]

            fetch_outs = exe.run(inference_program,
                                 feed={
                                     feeding_list[0]: src_seq,
                                     feeding_list[1]: trg_seq,
                                     feeding_list[2]: lbl_seq
                                 },
                                 fetch_list=[avg_cost],
                                 return_numpy=False)

            total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
            count += 1

        return total_loss / count

    # range() instead of xrange(): works on both Python 2 and Python 3.
    for pass_id in range(args.pass_num):
        pass_start_time = time.time()
        words_seen = 0
        for batch_id, data in enumerate(train_batch_generator()):
            # Throughput counts source and target words, not labels.
            src_seq, word_num = to_lodtensor(column(data, 0), place)
            words_seen += word_num
            trg_seq, word_num = to_lodtensor(column(data, 1), place)
            words_seen += word_num
            lbl_seq, _ = to_lodtensor(column(data, 2), place)

            fetch_outs = exe.run(framework.default_main_program(),
                                 feed={
                                     feeding_list[0]: src_seq,
                                     feeding_list[1]: trg_seq,
                                     feeding_list[2]: lbl_seq
                                 },
                                 fetch_list=[avg_cost])

            avg_cost_val = np.array(fetch_outs[0])
            print('pass_id=%d, batch_id=%d, train_loss: %f' %
                  (pass_id, batch_id, avg_cost_val))

        # The pass time is taken before validation, so the reported
        # throughput covers training only.
        pass_end_time = time.time()
        test_loss = do_validation()
        time_consumed = pass_end_time - pass_start_time
        words_per_sec = words_seen / time_consumed
        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
              (pass_id, test_loss, words_per_sec, time_consumed))
338+
339+
340+
def infer():
    """Placeholder for inference; does no work and returns None."""
    return None
342+
343+
344+
if __name__ == '__main__':
    # `args` must be bound at module level: train() reads it as a global.
    args = parser.parse_args()
    # --infer_only selects the inference entry point, otherwise train.
    (infer if args.infer_only else train)()

0 commit comments

Comments
 (0)