|
from .Optimizer import Optimizer
from ..utils import create_param_state_as

from theano.tensor import sqrt


class AdaDelta(Optimizer):
    """
    Implements Matt Zeiler's "Adaptive Learning Rate" method, a.k.a. AdaDelta.
    The paper itself is really neat, and both very convincing and practical.

    TL;DR: 1. AdaGrad quickly anneals, AdaDelta doesn't. (No proof.)
           2. AdaGrad *is* sensitive to the learning-rate, AdaDelta not so much. (Table 1.)
           3. AdaDelta includes a 2nd-order approximation. (Section 3.2)

    The updates are:

        g²_{e+1} = ρ * g²_e + (1-ρ) * ∇p_e²
        up_{e+1} = √(d²_e / g²_{e+1}) * ∇p_e
        d²_{e+1} = ρ * d²_e + (1-ρ) * up²_{e+1}
        p_{e+1}  = p_e - up_{e+1}

    As in RMSProp, we need to add epsilons inside the square root in order to
    keep the updates numerically stable.

    It turns out that the effective learning-rate √((d²+ε) / (g²+ε)) converges
    to 1 as the gradients decrease, since both running averages then decay
    towards zero and the epsilons dominate the ratio (and thus learning grinds
    to a halt). This could be used by a specialized trainer to check for
    convergence.

    The only reason `lr` is still there is this tweet by Alec Radford:

    https://twitter.com/AlecRad/status/543518744799358977

        @kastnerkyle @ogrisel @johnmyleswhite @tcovert Adadelta raw is finicky,
        shrinking its updates by 0.5 "just works" in my experience as well.
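
    Example (a minimal sketch, not from the paper: it assumes `get_updates`
    is called directly with the hyper-parameters, matching its signature
    below, and uses ad-hoc names; Theano and NumPy must be available):

        import numpy as np
        import theano
        import theano.tensor as T

        W = theano.shared(np.random.randn(3, 2).astype(theano.config.floatX))
        x = T.matrix('x')
        cost = T.sum(T.sqr(x.dot(W)))
        grads = T.grad(cost, [W])

        updates = AdaDelta(rho=0.95).get_updates(
            [W], grads, rho=0.95, eps=1e-7, lr=0.5)
        step = theano.function([x], cost, updates=updates)

    Repeatedly calling `step` on a batch should then drive the cost down.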
| 35 | + """ |

    def __init__(self, rho, eps=1e-7, lr=1):
        Optimizer.__init__(self, rho=rho, eps=eps, lr=lr)

    def get_updates(self, params, grads, rho, eps, lr):
        updates = []

        for param, grad in zip(params, grads):
            # Per-parameter state: running averages of the squared gradients
            # and of the squared updates.
            g2_state = create_param_state_as(param, prefix='g2_')
            d2_state = create_param_state_as(param, prefix='d2_')

            # Accumulate the squared gradient, compute the update from the
            # ratio of the two running averages, then accumulate the squared
            # update itself.
            new_g2 = rho*g2_state + (1-rho)*grad*grad
            up = lr * sqrt((d2_state+eps) / (new_g2+eps)) * grad
            new_d2 = rho*d2_state + (1-rho)*up*up

            updates.append((g2_state, new_g2))
            updates.append((param, param - up))
            updates.append((d2_state, new_d2))

        return updates
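
# For reference, a single AdaDelta step on plain NumPy arrays, mirroring the
# docstring equations and the Theano graph above. This is only an illustrative
# sketch (the function and variable names are ad-hoc; nothing in the library
# uses it):
#
#     import numpy as np
#
#     def adadelta_step(p, grad, g2, d2, rho=0.95, eps=1e-7, lr=1.0):
#         g2 = rho*g2 + (1-rho)*grad*grad                # running avg. of squared gradients
#         up = lr * np.sqrt((d2+eps) / (g2+eps)) * grad  # the actual update
#         d2 = rho*d2 + (1-rho)*up*up                    # running avg. of squared updates
#         return p - up, g2, d2                          # new parameter and new state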