Commit 29f3441: Cherry pick/fix transformer (#16620)
1 parent 89d09b8

* Imperative depth-first backward process (#16605)
* Fix bug of gradient interface
* Shrink transformer
* Right transformer
* Change from breadth-first to depth-first backward process (test=develop)
* Reverse-iterate an op's inputs (test=develop)
* Polish code
* Change the iteration direction over the ingrads map slots (test=develop)
* Polish code (test=develop)
* Cherry-pick fix for transformer in dygraph (test=develop)
* Fix transformer in dygraph (test=develop)
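
The cherry-picked change (#16605) switches the imperative autograd engine from a breadth-first to a depth-first walk of the operator graph when propagating gradients. As a rough, self-contained illustration of the difference only (the names and toy graph below are not Paddle's actual data structures), a depth-first walk pops newly discovered predecessor ops from a stack, while a breadth-first walk drains them from a queue:

from collections import deque


def backward_visit_order(loss_op, producers, depth_first=True):
    # `producers` maps an op name to the ops that produced its inputs.
    # depth_first=True  -> LIFO stack: follow one input chain all the way down
    # depth_first=False -> FIFO queue: sweep the graph level by level
    pending = deque([loss_op])
    order = []
    while pending:
        op = pending.pop() if depth_first else pending.popleft()
        order.append(op)
        pending.extend(producers.get(op, []))
    return order


# Toy graph: loss depends on matmul and bias_add; matmul on x and w; bias_add on b.
toy = {"loss": ["matmul", "bias_add"], "matmul": ["x", "w"], "bias_add": ["b"]}
print(backward_visit_order("loss", toy, depth_first=True))   # ['loss', 'bias_add', 'b', 'matmul', 'w', 'x']
print(backward_visit_order("loss", toy, depth_first=False))  # ['loss', 'matmul', 'bias_add', 'x', 'w', 'b']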

File tree: 3 files changed (18 additions, 35 deletions)

paddle/fluid/imperative/layer.cc (4 additions, 4 deletions)

@@ -122,14 +122,14 @@ class Autograd {
       std::map<std::string, std::vector<VarBase*>> input_grads =
           ready_op->ApplyGrad();

-      for (auto it : input_grads) {
-        const std::vector<VarBase*>& ingrads = it.second;
+      for (auto it = input_grads.rbegin(); it != input_grads.rend(); ++it) {
+        const std::vector<VarBase*>& ingrads = it->second;
         for (size_t i = 0; i < ingrads.size(); ++i) {
           if (!ingrads[i]) continue;
-          if (ready_op->input_vars_[it.first][i]->IsStopGradient()) {
+          if (ready_op->input_vars_[it->first][i]->IsStopGradient()) {
             continue;
           }
-          OpBase* pre_op = ready_op->pre_ops_[it.first][i];
+          OpBase* pre_op = ready_op->pre_ops_[it->first][i];
           if (!pre_op) continue;

           dep_counts[pre_op] -= 1;
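
Two things change in this hunk: the range-based `for (auto it : input_grads)` becomes an explicit reverse-iterator loop, so `it.first`/`it.second` become `it->first`/`it->second`, and the per-input gradient slots are now visited from the last map key back to the first. A rough Python analogue of the new traversal order (the slot data is an illustrative stand-in, not Paddle code):

input_grads = {"Out@GRAD": [None, "g_out_1"], "X@GRAD": ["g_x_0"]}  # stand-in slots

# std::map iterates its keys in sorted order, so rbegin()/rend() walks them in
# reverse sorted order; sorted(..., reverse=True) mimics that here.
for name in sorted(input_grads, reverse=True):
    for i, grad in enumerate(input_grads[name]):
        if grad is None:
            continue  # counterpart of `if (!ingrads[i]) continue;`
        print("slot", name, "index", i, "gradient", grad)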

python/paddle/fluid/framework.py (2 additions, 1 deletion)

@@ -493,7 +493,8 @@ def _backward(self):
         self._ivar._run_backward()

     def _gradient(self):
-        return np.array(self._ivar._grad_value())
+        new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
+        return np.array(new_ivar.value().get_tensor())

     def _clear_gradient(self):
         self._ivar._clear_gradient()
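
The old `_gradient()` wrapped `_grad_value()` directly in `np.array`, which assumes the gradient data is already readable from host memory. The new version first copies the gradient variable to `core.CPUPlace()` (the trailing `True` appears to request a blocking copy) and only then converts the copied tensor, so gradients of GPU-resident variables can be read back as well. A minimal sketch of that copy-then-convert pattern (the helper and its arguments are hypothetical, not the real Paddle API):

import numpy as np


def gradient_to_numpy(grad_ivar, cpu_place):
    # Hypothetical stand-in mirroring the new _gradient() flow:
    # 1) blocking copy of the gradient variable to host memory,
    # 2) expose the underlying tensor, 3) wrap it in a NumPy array.
    host_var = grad_ivar._copy_to(cpu_place, True)
    return np.array(host_var.value().get_tensor())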

python/paddle/fluid/tests/unittests/test_imperative_transformer.py (12 additions, 30 deletions)

@@ -302,8 +302,11 @@ def make_all_inputs(input_fields):
 # if we run sync mode
 sync = False

-# how many batches we use
-batch_num = 2
+if not core.is_compiled_with_cuda():
+    # how many batches we use
+    batch_num = 50
+else:
+    batch_num = 5

 np.random.seed = 1
 src_word_np = np.random.randint(
@@ -335,24 +338,6 @@ def make_all_inputs(input_fields):
     dtype='int64')
 lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')

-# np.random.seed = 1
-# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
-# src_pos_np = np.random.randint(
-#     1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
-# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-#                                        seq_len, seq_len).astype('float32')
-#
-# trg_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
-# trg_pos_np = np.random.randint(
-#     1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
-# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-#                                        seq_len, seq_len).astype('float32')
-# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-#                                        seq_len, seq_len).astype('float32')
-#
-# lbl_word_np = np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64')
-# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
-#
 pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
                                   ModelHyperParams.d_model)
 pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
@@ -739,7 +724,7 @@ def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
         enc_attn_output_pp = self._multihead_attention_layer2(
             pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
         enc_attn_output = self._post_process_layer2(
-            slf_attn_output, enc_attn_output_pp, self._postprocess_cmd,
+            slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
             self._prepostprcess_dropout)
         pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
                                                     self._preprocess_cmd,
@@ -1076,20 +1061,17 @@ def test_transformer_float32(self):
                 4]] = out[k]

         self.assertTrue(
-            np.allclose(static_avg_cost_value, dy_avg_cost._numpy()))
+            np.array_equal(static_avg_cost_value, dy_avg_cost._numpy()))
         self.assertTrue(
-            np.allclose(static_sum_cost_value, dy_sum_cost._numpy()))
+            np.array_equal(static_sum_cost_value, dy_sum_cost._numpy()))
         self.assertTrue(
-            np.allclose(
-                static_predict_value, dy_predict._numpy(), atol=1e-5))
+            np.array_equal(static_predict_value, dy_predict._numpy()))
         self.assertTrue(
-            np.allclose(static_token_num_value, dy_token_num._numpy()))
+            np.array_equal(static_token_num_value, dy_token_num._numpy()))
         for key, value in six.iteritems(static_param_init):
-            self.assertTrue(np.allclose(value, dy_param_init[key]))
+            self.assertTrue(np.array_equal(value, dy_param_init[key]))
         for key, value in six.iteritems(static_param_updated):
-            self.assertTrue(
-                np.allclose(
-                    value, dy_param_updated[key], atol=1e-4))
+            self.assertTrue(np.array_equal(value, dy_param_updated[key]))


 if __name__ == '__main__':
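
The last hunk tightens the static-versus-dygraph comparisons from `np.allclose` (with explicit tolerances of 1e-5 and 1e-4 in two places) to `np.array_equal`, so the two execution modes must now produce identical results. A small NumPy example of what that buys:

import numpy as np

a = np.array([1.0, 2.0, 3.0], dtype='float32')
b = a + 1e-6  # tiny numerical drift between two implementations

print(np.allclose(a, b, atol=1e-5))  # True:  differences below the tolerance still pass
print(np.array_equal(a, b))          # False: any difference at all now fails the test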
