|
20 | 20 |
|
# Public API of this learning-rate-decay module.
__all__ = [
    'exponential_decay',
    'natural_exp_decay',
    'inverse_time_decay',
    'polynomial_decay',
    'piecewise_decay',
    'noam_decay',
]
|
25 | 25 | """
|
26 | 26 | When training a model, it's often useful to decay the
|
|
32 | 32 | """
|
33 | 33 |
|
34 | 34 |
|
35 |
def _decay_step_counter(begin=0):
    """Create the shared global-step counter used by the LR decay schedules.

    Args:
        begin (int): value of the counter at the first step. The default of 0
            keeps the original behavior where the first global step is zero.

    Returns:
        A float32 Variable holding the auto-incremented global step.
    """
    step = nn.autoincreased_step_counter(
        counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1)
    # Downstream schedules do float math on the step, so cast once here.
    return tensor.cast(step, 'float32')
|
41 | 41 |
|
42 | 42 |
|
def noam_decay(d_model, warmup_steps):
    """Noam learning-rate schedule from "Attention Is All You Need"
    (https://arxiv.org/pdf/1706.03762.pdf), equivalent to:

    ```python
    lr_value = np.power(d_model, -0.5) * np.min([
        np.power(current_steps, -0.5),
        np.power(warmup_steps, -1.5) * current_steps
    ])
    ```

    The rate grows linearly during warmup, then decays with the inverse
    square root of the step.

    Args:
        d_model(Variable): The dimensionality of input and output of model.
        warmup_steps(Variable): Number of warmup steps (a hyperparameter).

    Returns:
        The decayed learning rate.
    """
    # Counter starts at 1 so step**-0.5 is well-defined on the first step.
    step = _decay_step_counter(1)
    with init_on_cpu():
        decay_term = step**-0.5
        warmup_term = (warmup_steps**-1.5) * step
        lr_value = (d_model**-0.5) * ops.elementwise_min(
            decay_term, warmup_term)

    return lr_value
| 68 | + |
| 69 | + |
43 | 70 | def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
|
44 | 71 | """Applies exponential decay to the learning rate.
|
45 | 72 |
|
|
0 commit comments