Commit 2730d01

Merge pull request #5 from lucasb-eyer/moar-optimizers
Moar optimizers
2 parents 0a8acf9 + 490b1cb commit 2730d01

14 files changed: +379 −5 lines

DeepFried2/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 from .layers import *
 from .containers import *
 from .criteria import *
+from .optimizers import *

DeepFried2/optimizers/AdaDelta.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+from .Optimizer import Optimizer
+from ..utils import create_param_state_as
+
+from theano.tensor import sqrt
+
+
+class AdaDelta(Optimizer):
+    """
+    Implements Matt Zeiler's "Adaptive Learning Rate" method, aka AdaDelta.
+    The paper itself is really neat, and both very convincing and practical.
+
+    TL;DR: 1. AdaGrad quickly anneals, AdaDelta doesn't. (No proof.)
+           2. AdaGrad *is* sensitive to the learning-rate, AdaDelta not so much. (Table 1.)
+           3. AdaDelta includes a 2nd-order approximation. (Section 3.2)
+
+    The updates are:
+
+        g²_{e+1} = ρ * g²_e + (1-ρ) * ∇p_e²
+        up_{e+1} = √(d²_e / g²_{e+1}) * ∇p_e
+        d²_{e+1} = ρ * d²_e + (1-ρ) * up_{e+1}²
+        p_{e+1}  = p_e - up_{e+1}
+
+    As in RMSProp, we need to add epsilons in order to create stability.
+
+    It turns out that the effective learning-rate will converge to 1 as the
+    gradients decrease (and thus learning grinds to a halt). This could be used
+    to check for convergence by a specialized trainer.
+
+    The only reason `lr` is still there is this tweet by Alec Radford:
+
+        https://twitter.com/AlecRad/status/543518744799358977
+
+        @kastnerkyle @ogrisel @johnmyleswhite @tcovert Adadelta raw is finicky,
+        shrinking its updates by 0.5 "just works" in my experience as well.
+    """
+
+    def __init__(self, rho, eps=1e-7, lr=1):
+        Optimizer.__init__(self, rho=rho, eps=eps, lr=lr)
+
+    def get_updates(self, params, grads, rho, eps, lr):
+        updates = []
+
+        for param, grad in zip(params, grads):
+            g2_state = create_param_state_as(param, prefix='g2_')
+            d2_state = create_param_state_as(param, prefix='d2_')
+
+            new_g2 = rho*g2_state + (1-rho)*grad*grad
+            up = lr * sqrt((d2_state+eps) / (new_g2+eps)) * grad
+            new_d2 = rho*d2_state + (1-rho)*up*up
+
+            updates.append((g2_state, new_g2))
+            updates.append((param, param - up))
+            updates.append((d2_state, new_d2))
+
+        return updates
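
To make the update concrete, here is a minimal NumPy sketch of a single AdaDelta step on made-up numbers (the values are illustrative only, not part of this diff):

import numpy as np

rho, eps, lr = 0.95, 1e-7, 1.0
p    = np.array([2.0, -3.0])   # current parameters
grad = np.array([0.4, -0.6])   # their current gradient
g2   = np.zeros_like(p)        # running average of squared gradients
d2   = np.zeros_like(p)        # running average of squared updates

# One step, following get_updates() above:
g2 = rho*g2 + (1-rho)*grad*grad
up = lr * np.sqrt((d2 + eps) / (g2 + eps)) * grad
d2 = rho*d2 + (1-rho)*up*up
p  = p - up

print(p, up)   # the very first update is tiny because d2 starts at zero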

DeepFried2/optimizers/AdaGrad.py

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+from .Optimizer import Optimizer
+from ..utils import create_param_state_as
+
+from theano.tensor import sqrt
+
+
+class AdaGrad(Optimizer):
+    """
+    Implements Duchi's "Adaptive Subgradient" method, aka AdaGrad.
+    Chris Dyer's "Notes on AdaGrad" are pretty awesome for practical purposes.
+
+    TL;DR: AdaGrad doesn't need additional parameters (a lie) and makes the
+           optimization much less sensitive to the learning-rate!
+
+    In reality, it pioneered fixing slowly-learning features by adapting each
+    feature's own learning-rate using an estimate of its raw 2nd moment, but
+    its ideas have since flowed into the superior AdaDelta and Adam.
+
+    The updates are:
+
+        g²_{e+1} = g²_e + ∇p_e²
+        p_{e+1}  = p_e - (lr / √g²_{e+1}) * ∇p_e
+
+    that is, divide the learning-rate by the square root of a running sum of squared gradients.
+
+    Note that this would lead to division by 0 in the beginning for those
+    weights which don't receive a gradient (might be many with ReLUs), so we
+    initialize g² with a small value.
+    """
+
+    def __init__(self, lr, eps=1e-7):
+        Optimizer.__init__(self, lr=lr)
+
+        # eps is only needed as a numeric value for initializing the state,
+        # and it's not possible to initialize state using symbolic variables.
+        self.eps = eps
+
+    def get_updates(self, params, grads, lr):
+        updates = []
+
+        for param, grad in zip(params, grads):
+            g2_state = create_param_state_as(param, initial_value=self.eps)
+            new_g2 = g2_state + grad*grad
+            updates.append((g2_state, new_g2))
+            updates.append((param, param - lr/sqrt(new_g2) * grad))
+
+        return updates
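
The same AdaGrad step as a minimal NumPy sketch (the toy objective and constants are illustrative only), making the annealing of the effective per-weight learning-rate lr/√g² visible:

import numpy as np

lr, eps = 0.1, 1e-7
p  = np.array([2.0, -3.0])   # toy parameters of the objective 0.5*||p||^2
g2 = np.full_like(p, eps)    # initialized with eps to avoid division by zero

for step in range(5):
    grad = p                 # gradient of 0.5*||p||^2 is simply p
    g2 = g2 + grad*grad
    p  = p - lr/np.sqrt(g2) * grad
    print(step, lr/np.sqrt(g2))   # the effective learning-rate only ever shrinks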

DeepFried2/optimizers/Momentum.py

Lines changed: 11 additions & 0 deletions
@@ -3,6 +3,17 @@


 class Momentum(Optimizer):
+    """
+    Implementation of the "Classical Momentum" (CM) which is explained in
+    further detail in
+
+        "On the importance of initialization and momentum in deep learning"
+
+    The updates are:
+
+        v_{e+1} = mom * v_e - lr * ∇p_e
+        p_{e+1} = p_e + v_{e+1}
+    """

     def __init__(self, lr, momentum):
         Optimizer.__init__(self, lr=lr, momentum=momentum)
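
The CM update written out as a minimal NumPy sketch on a toy quadratic (constants and objective are illustrative only):

import numpy as np

lr, mom = 0.1, 0.9
p = np.array([2.0, -3.0])    # toy parameters of the objective 0.5*||p||^2
v = np.zeros_like(p)         # the velocity state, one entry per parameter

for step in range(200):
    grad = p                 # gradient of 0.5*||p||^2 is simply p
    v = mom*v - lr*grad
    p = p + v

print(p)                     # spirals into the minimum at [0, 0]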

DeepFried2/optimizers/Nesterov.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+from .Optimizer import Optimizer
+from ..utils import create_param_state_as
+
+
+class Nesterov(Optimizer):
+    """
+    Implementation of "Nesterov's Accelerated Gradient" (NAG) which is explained
+    in further detail in
+
+        "On the importance of initialization and momentum in deep learning"
+
+    But the equation for NAG has been reshuffled by Nicolas Boulanger in
+
+        https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617
+
+    for easier implementation in Theano. The updates are:
+
+        v_{e+1} = mom * v_e - lr * ∇p_e
+        p_{e+1} = p_e + mom * v_{e+1} - lr * ∇p_e
+    """
+
+    def __init__(self, lr, momentum):
+        Optimizer.__init__(self, lr=lr, momentum=momentum)
+
+    def get_updates(self, params, grads, lr, momentum):
+        updates = []
+
+        for param, grad in zip(params, grads):
+            param_mom = create_param_state_as(param)
+            v = momentum * param_mom - lr * grad
+            updates.append((param_mom, v))
+            updates.append((param, param + momentum * v - lr * grad))
+
+        return updates
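
Since the reshuffling is the whole point of this implementation, here is a minimal NumPy check (toy gradient and constants are made up) that Boulanger's form follows the same trajectory as the usual look-ahead formulation of NAG, with the reshuffled variable tracking p + mom*v:

import numpy as np

def grad(x):
    return x    # gradient of the toy objective 0.5*||x||^2

lr, mom = 0.1, 0.9
p, v = np.array([2.0, -3.0]), np.zeros(2)   # look-ahead (Sutskever-style) NAG
q, w = p.copy(), v.copy()                   # Boulanger's reshuffled form

for step in range(50):
    # Look-ahead form: evaluate the gradient at p + mom*v.
    v = mom*v - lr*grad(p + mom*v)
    p = p + v

    # Reshuffled form, exactly as in get_updates() above.
    g = grad(q)
    w = mom*w - lr*g
    q = q + mom*w - lr*g

print(np.allclose(q, p + mom*v))   # True: q is the look-ahead point of the same trajectory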

DeepFried2/optimizers/Optimizer.py

Lines changed: 3 additions & 0 deletions
@@ -24,3 +24,6 @@ def update_parameters(self, model):

     def get_updates(self, params, grads):
         raise NotImplementedError
+
+    def __repr__(self):
+        return type(self).__name__ + "(" + ", ".join(k + "=" + str(v) for k, v in self.hyperparams.items()) + ")"
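
A standalone sketch of what the added __repr__ produces, assuming self.hyperparams maps hyper-parameter names to printable values (the _Fake class below is made up purely for illustration):

class _Fake:
    def __init__(self, **hyperparams):
        self.hyperparams = hyperparams

    def __repr__(self):
        return type(self).__name__ + "(" + ", ".join(k + "=" + str(v) for k, v in self.hyperparams.items()) + ")"

print(_Fake(lr=0.1, momentum=0.9))   # -> _Fake(lr=0.1, momentum=0.9)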

DeepFried2/optimizers/RMSProp.py

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+from .Optimizer import Optimizer
+from ..utils import create_param_state_as
+
+from theano.tensor import sqrt
+
+
+class RMSProp(Optimizer):
+    """
+    Implements Hinton's "RMSProp" method presented in his Coursera lecture 6.5.
+    Essentially, it sits right in-between AdaGrad and AdaDelta by being a
+    windowed version of AdaGrad.
+
+    The updates are:
+
+        g²_{e+1} = ρ * g²_e + (1-ρ) * ∇p_e²
+        p_{e+1}  = p_e - (lr / √g²_{e+1}) * ∇p_e
+
+    Note that in this case just initializing with epsilon is not enough anymore,
+    as we might get a zero gradient for some units for long enough to completely
+    fill the window.
+    """
+
+    def __init__(self, lr, rho, eps=1e-7):
+        Optimizer.__init__(self, lr=lr, rho=rho, eps=eps)
+
+    def get_updates(self, params, grads, lr, rho, eps):
+        updates = []
+
+        for param, grad in zip(params, grads):
+            g2_state = create_param_state_as(param)
+            new_g2 = rho*g2_state + (1-rho)*grad*grad
+            updates.append((g2_state, new_g2))
+            updates.append((param, param - lr/sqrt(new_g2+eps) * grad))
+
+        return updates
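
The same RMSProp step as a minimal NumPy sketch (constants and toy objective are illustrative only), with eps inside the square root exactly as in the code above:

import numpy as np

lr, rho, eps = 0.01, 0.9, 1e-7
p  = np.array([2.0, -3.0])   # toy parameters of the objective 0.5*||p||^2
g2 = np.zeros_like(p)        # windowed average of squared gradients

for step in range(500):
    grad = p                 # gradient of 0.5*||p||^2 is simply p
    g2 = rho*g2 + (1-rho)*grad*grad
    p  = p - lr/np.sqrt(g2 + eps) * grad

print(p)                     # near the minimum, up to wiggles on the order of lr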

DeepFried2/optimizers/__init__.py

Lines changed: 7 additions & 3 deletions
@@ -1,3 +1,7 @@
-from .Optimizer import *
-from .Momentum import *
-from .SGD import *
+from .Optimizer import Optimizer
+from .SGD import SGD
+from .Momentum import Momentum
+from .Nesterov import Nesterov
+from .AdaGrad import AdaGrad
+from .RMSProp import RMSProp
+from .AdaDelta import AdaDelta
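
With the matching star-import added to DeepFried2/__init__.py above, the optimizers can now be reached straight from the package root. A hypothetical usage sketch (only constructor signatures visible in this diff are used; the alias df is an assumption):

import DeepFried2 as df

mom = df.Momentum(lr=0.01, momentum=0.9)
ada = df.AdaDelta(rho=0.95)

print(mom)   # the new Optimizer.__repr__ prints the optimizer's hyper-parameters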

DeepFried2/utils.py

Lines changed: 2 additions & 2 deletions
@@ -14,8 +14,8 @@ def create_param_and_grad(shape, init, fan=None, name=None, type=_th.config.floatX):
     return param, grad_param


-def create_param_state_as(other, initial_value=0):
+def create_param_state_as(other, initial_value=0, prefix='state_for_'):
     return _th.shared(other.get_value()*0 + initial_value,
                       broadcastable=other.broadcastable,
-                      name='state_for_' + str(other.name)
+                      name=prefix + str(other.name)
     )
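
A small sketch of what the new prefix argument buys (Theano calls as in this file; the example parameter W is made up): optimizers that keep several state tensors per parameter, such as AdaDelta above, can now give each one a distinguishable name:

import numpy as np
import theano as _th
from DeepFried2.utils import create_param_state_as

W = _th.shared(np.zeros((3, 2), dtype=_th.config.floatX), name='W')

g2 = create_param_state_as(W, prefix='g2_')   # zero-initialized state named 'g2_W'
d2 = create_param_state_as(W, prefix='d2_')   # zero-initialized state named 'd2_W'
print(g2.name, d2.name)                       # -> g2_W d2_W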

examples/Optimizers/mnist.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+import os
+import gzip
+import pickle
+import sys
+
+# Python 2/3 compatibility.
+try:
+    from urllib.request import urlretrieve
+except ImportError:
+    from urllib import urlretrieve
+
+
+'''Adapted from the Theano tutorial.'''
+
+
+def load_mnist(data_file=os.path.join(os.path.dirname(__file__), 'mnist.pkl.gz')):
+
+    if not os.path.exists(data_file):
+        origin = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
+        print('Downloading data from {}'.format(origin))
+        urlretrieve(origin, data_file)
+
+    print('... loading data')
+
+    with gzip.open(data_file, 'rb') as f:
+        if sys.version_info[0] == 3:
+            return pickle.load(f, encoding='latin1')
+        else:
+            return pickle.load(f)
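
A hypothetical usage sketch, assuming the standard LISA-lab mnist.pkl.gz layout of three (inputs, labels) pairs and that the script is run from this example directory:

from mnist import load_mnist

(train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_mnist()
print(train_x.shape, valid_x.shape, test_x.shape)   # e.g. (50000, 784) (10000, 784) (10000, 784)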
