
Commit ddc2c6e

Merge pull request #1893 from reyoung/Add_error_clipping_to_mt_demo
Add error clipping to MT demo.
2 parents (194e492 + da2adea), commit ddc2c6e

File tree

7 files changed: +120, -22 lines changed


demo/seqToseq/seqToseq_net.py

Lines changed: 18 additions & 5 deletions

@@ -69,7 +69,8 @@ def gru_encoder_decoder(data_conf,
                         encoder_size=512,
                         decoder_size=512,
                         beam_size=3,
-                        max_length=250):
+                        max_length=250,
+                        error_clipping=50):
     """
     A wrapper for an attention version of GRU Encoder-Decoder network
     is_generating: whether this config is used for generating
@@ -90,9 +91,19 @@ def gru_encoder_decoder(data_conf,
         input=src_word_id,
         size=word_vector_dim,
         param_attr=ParamAttr(name='_source_language_embedding'))
-    src_forward = simple_gru(input=src_embedding, size=encoder_size)
+    src_forward = simple_gru(
+        input=src_embedding,
+        size=encoder_size,
+        naive=True,
+        gru_layer_attr=ExtraLayerAttribute(
+            error_clipping_threshold=error_clipping))
     src_backward = simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
+        input=src_embedding,
+        size=encoder_size,
+        reverse=True,
+        naive=True,
+        gru_layer_attr=ExtraLayerAttribute(
+            error_clipping_threshold=error_clipping))
     encoded_vector = concat_layer(input=[src_forward, src_backward])

     with mixed_layer(size=decoder_size) as encoded_proj:
@@ -117,11 +128,13 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
         decoder_inputs += full_matrix_projection(input=context)
         decoder_inputs += full_matrix_projection(input=current_word)

-        gru_step = gru_step_layer(
+        gru_step = gru_step_naive_layer(
             name='gru_decoder',
             input=decoder_inputs,
             output_mem=decoder_mem,
-            size=decoder_size)
+            size=decoder_size,
+            layer_attr=ExtraLayerAttribute(
+                error_clipping_threshold=error_clipping))

         with mixed_layer(
                 size=target_dict_dim, bias_attr=True,
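
The new error_clipping argument defaults to 50, so existing translation configs keep their behaviour; a config that wants a different threshold only has to pass it through. A minimal sketch of such a call (the argument names other than error_clipping mirror the demo's existing usage and are assumptions here, not part of this commit):

    # hypothetical snippet from a translation config
    gru_encoder_decoder(
        data_conf=data_conf,      # provided by the demo config, as before
        is_generating=False,
        error_clipping=10)        # override the default threshold of 50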

paddle/gserver/tests/sequence_layer_group.conf

Lines changed: 1 addition & 2 deletions

@@ -48,8 +48,7 @@ lstm = lstmemory_group(
     size=hidden_dim,
     act=TanhActivation(),
     gate_act=SigmoidActivation(),
-    state_act=TanhActivation(),
-    lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
+    state_act=TanhActivation())

 lstm_last = last_seq(input=lstm)

paddle/gserver/tests/sequence_nest_layer_group.conf

Lines changed: 1 addition & 2 deletions

@@ -51,8 +51,7 @@ def lstm_group(lstm_group_input):
         size=hidden_dim,
         act=TanhActivation(),
         gate_act=SigmoidActivation(),
-        state_act=TanhActivation(),
-        lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
+        state_act=TanhActivation())
     return lstm_output

python/paddle/trainer_config_helpers/attrs.py

Lines changed: 9 additions & 6 deletions

@@ -208,12 +208,15 @@ def __init__(self,
                  drop_rate=None,
                  device=None):
         self.attr = dict()
-        if isinstance(error_clipping_threshold, float):
-            assert error_clipping_threshold > 0
-            self.attr["error_clipping_threshold"] = error_clipping_threshold
-
-        if isinstance(drop_rate, float):
-            assert drop_rate > 0
+        if error_clipping_threshold is not None:
+            error_clipping_threshold = float(error_clipping_threshold)
+            if error_clipping_threshold < 0:
+                raise ValueError("Error clipping must > 0")
+            self.attr['error_clipping_threshold'] = error_clipping_threshold
+        if drop_rate is not None:
+            drop_rate = float(drop_rate)
+            if drop_rate < 0:
+                raise ValueError("Dropout rate must > 0")
             self.attr["drop_rate"] = drop_rate

         if isinstance(device, int):
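
With this change ExtraLayerAttribute coerces any numeric threshold to float and rejects negative values with a ValueError, instead of silently ignoring non-float inputs as the old isinstance checks did. A small sketch of the resulting behaviour (the import path follows the file layout above and is an assumption):

    from paddle.trainer_config_helpers.attrs import ExtraLayerAttribute

    attr = ExtraLayerAttribute(error_clipping_threshold=50)  # int accepted, stored as 50.0
    print(attr.attr['error_clipping_threshold'])             # 50.0

    ExtraLayerAttribute(error_clipping_threshold=-1)         # raises ValueError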

python/paddle/trainer_config_helpers/layers.py

Lines changed: 74 additions & 1 deletion

@@ -84,6 +84,7 @@
     'GeneratedInput',
     'SubsequenceInput',
     'gru_step_layer',
+    'gru_step_naive_layer',
     'recurrent_layer',
     'BaseGeneratedInput',
     'conv_operator',
@@ -2284,7 +2285,7 @@ def img_pool_layer(input,

     type_name = pool_type.name + '-projection' \
         if (
-        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
+            isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
         else pool_type.name

     pool_size_y = pool_size if pool_size_y is None else pool_size_y
@@ -3084,6 +3085,78 @@ def gru_step_layer(input,
         activation=act)


+@wrap_bias_attr_default()
+@wrap_param_attr_default()
+@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
+@wrap_act_default(act=TanhActivation())
+@wrap_name_default('gru_step')
+@layer_support(ERROR_CLIPPING, DROPOUT)
+def gru_step_naive_layer(input,
+                         output_mem,
+                         size=None,
+                         name=None,
+                         act=None,
+                         gate_act=None,
+                         bias_attr=None,
+                         param_attr=None,
+                         layer_attr=None):
+    """
+    GRU Step Layer, but using MixedLayer to generate. It support ERROR_CLIPPING
+    and DROPOUT.
+
+    :param input:
+    :param output_mem:
+    :param size:
+    :param name:
+    :param act:
+    :param gate_act:
+    :param bias_attr:
+    :param param_attr:
+    :param layer_attr:
+    :return:
+    """
+    if input.size % 3 != 0:
+        raise ValueError("GruStep input size must be divided by 3")
+    if size is None:
+        size = input.size / 3
+
+    def __gate__(gate_name, offset):
+        with mixed_layer(
+                name=name + "_" + gate_name,
+                size=size,
+                layer_attr=layer_attr,
+                bias_attr=bias_attr,
+                act=gate_act) as gate:
+            gate += identity_projection(input=input, offset=offset)
+            gate += full_matrix_projection(
+                input=output_mem, param_attr=param_attr)
+        return gate
+
+    update_gate = __gate__("update", 0)
+    reset_gate = __gate__("reset", size)
+
+    with mixed_layer(
+            name=name + "_reset_output", bias_attr=False) as reset_output:
+        reset_output += dotmul_operator(a=output_mem, b=reset_gate)
+
+    with mixed_layer(
+            name=name + "_output_candidate",
+            size=size,
+            layer_attr=layer_attr,
+            bias_attr=bias_attr,
+            act=act) as output_candidate:
+        output_candidate += identity_projection(input=input, offset=2 * size)
+        output_candidate += full_matrix_projection(
+            input=reset_output, param_attr=param_attr)
+
+    with mixed_layer(name=name) as output:
+        output += identity_projection(output_mem)
+        output += dotmul_operator(a=output_mem, b=update_gate, scale=-1.0)
+        output += dotmul_operator(a=output_candidate, b=update_gate)
+
+    return output
+
+
 @wrap_name_default()
 @layer_support()
 def get_output_layer(input, arg_name, name=None, layer_attr=None):
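
The final mixed_layer in gru_step_naive_layer builds the standard GRU state update out of one identity projection and two dot-product operators: h_t = h_prev - z * h_prev + z * h_tilde, which is the usual (1 - z) * h_prev + z * h_tilde written as a sum of projections. A toy numpy check of that identity (illustrative only, not part of the commit):

    import numpy as np

    # Stand-ins for the previous state, the update gate, and the candidate state.
    h_prev, z, h_tilde = np.random.rand(3, 8)

    # What the three terms in the final mixed_layer add up to:
    #   identity_projection(output_mem)                       ->  h_prev
    #   dotmul_operator(output_mem, update_gate, scale=-1.0)  -> -z * h_prev
    #   dotmul_operator(output_candidate, update_gate)        ->  z * h_tilde
    mixed = h_prev - z * h_prev + z * h_tilde

    # Textbook GRU interpolation between old state and candidate.
    gru = (1.0 - z) * h_prev + z * h_tilde

    assert np.allclose(mixed, gru)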

python/paddle/trainer_config_helpers/networks.py

Lines changed: 16 additions & 6 deletions

@@ -825,7 +825,8 @@ def gru_unit(input,
              gru_param_attr=None,
              act=None,
              gate_act=None,
-             gru_layer_attr=None):
+             gru_layer_attr=None,
+             naive=False):
     """
     Define calculations that a gated recurrent unit performs in a single time
     step. This function itself is not a recurrent layer, so that it can not be
@@ -857,7 +858,12 @@ def gru_unit(input,

     out_mem = memory(name=name, size=size)

-    gru_out = gru_step_layer(
+    if naive:
+        __step__ = gru_step_naive_layer
+    else:
+        __step__ = gru_step_layer
+
+    gru_out = __step__(
         name=name,
         input=input,
         output_mem=out_mem,
@@ -879,7 +885,8 @@ def gru_group(input,
               gru_param_attr=None,
               act=None,
               gate_act=None,
-              gru_layer_attr=None):
+              gru_layer_attr=None,
+              naive=False):
     """
     gru_group is a recurrent layer group version of Gated Recurrent Unit. It
     does exactly the same calculation as the grumemory layer does. A promising
@@ -928,7 +935,8 @@ def __gru_step__(ipt):
             gru_param_attr=gru_param_attr,
             act=act,
             gate_act=gate_act,
-            gru_layer_attr=gru_layer_attr)
+            gru_layer_attr=gru_layer_attr,
+            naive=naive)

     return recurrent_group(
         name='%s_recurrent_group' % name,
@@ -949,7 +957,8 @@ def simple_gru(input,
                gru_param_attr=None,
                act=None,
                gate_act=None,
-               gru_layer_attr=None):
+               gru_layer_attr=None,
+               naive=False):
     """
     You maybe see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
     simple_gru in network.py. The reason why there are so many interfaces is
@@ -1018,7 +1027,8 @@ def simple_gru(input,
         gru_param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
-        gru_layer_attr=gru_layer_attr)
+        gru_layer_attr=gru_layer_attr,
+        naive=naive)


 @wrap_name_default('simple_gru2')
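
With the naive flag plumbed through simple_gru, gru_group and gru_unit, a config can opt into the MixedLayer-based step and therefore get error clipping (and dropout) applied inside the recurrence, as the MT demo above does. A minimal sketch of such a call (layer names and sizes are illustrative, not taken from the commit):

    # hypothetical config fragment
    word = data_layer(name='word', size=30000)
    emb = embedding_layer(input=word, size=512)

    rnn = simple_gru(
        input=emb,
        size=512,
        naive=True,  # route the step through gru_step_naive_layer
        gru_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))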

python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr

Lines changed: 1 addition & 0 deletions

@@ -320,6 +320,7 @@ layers {
     }
   }
   drop_rate: 0.5
+  error_clipping_threshold: 40.0
 }
 parameters {
   name: "___embedding_0__.w0"
