forked from mwong009/mtltrajet
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoptimizers.py
More file actions
224 lines (196 loc) · 7.48 KB
/
optimizers.py
File metadata and controls
224 lines (196 loc) · 7.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import theano
import numpy as np
import theano.tensor as T
DTYPE_FLOATX = theano.config.floatX
class Optimizers(object):
    """Collection of gradient-based update rules for Theano models.

    Each ``*_updates`` method takes matched lists of shared parameters and
    their symbolic gradients and returns a list of
    ``(variable, update expression)`` pairs suitable for the ``updates``
    argument of ``theano.function``.
    """

    def __init__(self, name=None):
        # Optional label for this optimizer instance (e.g. for logging).
        self.name = name

    @staticmethod
    def _zeros_like(param, name):
        """Return a zero-filled shared variable shaped like ``param``.

        Reads the shape via ``get_value(borrow=True)`` — no data copy and
        no graph compilation, unlike ``param.eval()`` /
        ``param.shape.eval()`` which the original code used inconsistently.
        """
        shape = param.get_value(borrow=True).shape
        return theano.shared(
            value=np.zeros(shape, dtype=DTYPE_FLOATX),
            name=name
        )

    def sgd_updates(self, params, grads, learning_rate=1e-3):
        """
        Stochastic Gradient Descent (SGD)

        Generates update expressions of the form:
            param := param - learning_rate * gradient

        Parameters
        ----------
        params : `list` of shared variables
            The variables to generate update expressions for
        grads : `list` of shared variables
            The gradient expressions for each variable
        learning_rate : `float` or symbolic scalar
            The learning rate controlling the size of update steps

        Returns
        -------
        updates:
            Specify how to update the parameters of the model as a list of
            (variable, update expression)
        """
        # zip pairs each parameter with its gradient:
        # params=[p1, p2], grads=[g1, g2] -> [(p1, g1), (p2, g2)]
        return [(param, param - learning_rate * grad)
                for param, grad in zip(params, grads)]

    def rmsprop_updates(self, params, grads, learning_rate=1e-3, rho=0.9,
                        e=1e-8):
        """
        RMSProp

        Divide the gradient by a running average of its recent magnitude:
            accu_new := rho * accu + (1 - rho) * gradient^2
            rmsprop  := - learning_rate * grad / sqrt(accu_new + e)
            param    := param + rmsprop

        Parameters
        ----------
        params : `list` of shared variables
            The variables to generate update expressions for
        grads : `list` of shared variables
            The gradient expressions for each variable
        learning_rate : `float` or symbolic scalar
            The learning rate controlling the size of update steps
        rho : `float` or symbolic scalar
            Gradient moving average decay factor
        e : `float` or symbolic scalar
            Small value added for numerical stability

        Returns
        -------
        updates:
            Specify how to update the parameters of the model as a list of
            (variable, update expression)
        """
        updates = []
        for param, grad in zip(params, grads):
            # accumulator of squared gradients (running average)
            accu = self._zeros_like(param, 'accu')
            accu_new = rho * accu + (1. - rho) * T.sqr(grad)
            updates.append((accu, accu_new))
            # Denominator is clamped at `e` rather than the more common
            # sqrt(accu_new + e); kept as-is to preserve behavior.
            rmsgrad = T.maximum(T.sqrt(accu_new), e)
            updates.append((param, param - learning_rate * grad / rmsgrad))
        return updates

    def momentum_updates(self, params, grads, learning_rate=1e-3, m=0.9,
                         nesterov=True):
        """
        SGD with classical or Nesterov Accelerated Gradient (NAG) momentum

            v_new := m * v - lr * grad
            param := param + v_new                       (classical)
            param := param + m * v_new - lr * grad       (Nesterov)

        Parameters
        ----------
        params : `list` of shared variables
            The variables to generate update expressions for
        grads : `list` of shared variables
            The gradient expressions for each variable
        learning_rate : `float` or symbolic scalar
            The learning rate controlling the size of update steps
        m : `float` or symbolic scalar
            The momentum rate of the velocity vector
        nesterov : `boolean`
            Whether to apply the Nesterov Accelerated Gradient correction

        Returns
        -------
        updates:
            Specify how to update the parameters of the model as a list of
            (variable, update expression)
        """
        updates = []
        # Dampened step size: scales lr down as momentum grows so the
        # effective step magnitude stays comparable across values of m.
        lr = learning_rate * (1 - m)
        for param, grad in zip(params, grads):
            # momentum velocity buffer, initialized to zero
            velocity = self._zeros_like(param, 'v_0')
            v_new = m * velocity - lr * grad
            updates.append((velocity, v_new))
            if nesterov:
                # look-ahead correction: step along m*v_new - lr*grad
                step = m * v_new - lr * grad
            else:
                step = v_new
            updates.append((param, param + step))
        return updates

    def adam_updates(self, params, grads, lr=1e-3, b1=0.9, b2=0.999, e=1e-8,
                     amsgrad=True):
        """
        Adam optimizer, optionally with the AMSGrad variant.

        Notes
        -----
        Adam - A Method for Stochastic Optimization
        (http://arxiv.org/abs/1412.6980v8)
        AMSGrad modification
        On the Convergence of Adam and Beyond
        (https://openreview.net/forum?id=ryQu7f-RZ)

        Parameters
        ----------
        params : `list` of shared variables
            The variables to generate update expressions for
        grads : `list` of shared variables
            The gradient expressions for each variable
        lr : `float` or symbolic scalar
            The learning rate controlling the size of update steps
        b1 : `float` or symbolic scalar
            Exponential decay rate for the first moment estimates
        b2 : `float` or symbolic scalar
            Exponential decay rate for the second moment estimates
        e : `float` or symbolic scalar
            Constant for numerical stability
        amsgrad : `boolean`
            Whether to apply the AMSGrad variant of Adam

        Returns
        -------
        updates:
            Specify how to update the parameters of the model as a list of
            (variable, update expression)
        """
        updates = []
        one = T.constant(1)
        # Timestep counter; stored in the configured float dtype so the
        # whole graph stays in DTYPE_FLOATX (was hard-coded float32).
        i = theano.shared(np.asarray(0., dtype=DTYPE_FLOATX))
        i_t = i + 1.
        # Bias-corrected learning rate at this timestep
        a_t = lr * T.sqrt(one - b2 ** i_t) / (one - b1 ** i_t)
        for param, grad in zip(params, grads):
            # 1st and 2nd moment vectors, initialized to zero
            m_prev = self._zeros_like(param, 'm_0')
            v_prev = self._zeros_like(param, 'v_0')
            # exponential moving averages of the gradient and its square
            m_t = b1 * m_prev + (one - b1) * grad
            v_t = b2 * v_prev + (one - b2) * grad ** 2
            updates.append((v_prev, v_t))
            updates.append((m_prev, m_t))
            if amsgrad:
                # AMSGrad: use the running maximum of v_t so the effective
                # learning rate never increases
                vhat = self._zeros_like(param, 'v_hat')
                vhat_t = T.maximum(vhat, v_t)
                updates.append((vhat, vhat_t))
                grad_t = m_t / (T.sqrt(vhat_t) + e)
            else:
                grad_t = m_t / (T.sqrt(v_t) + e)
            # Adam update rule
            updates.append((param, param - a_t * grad_t))
        updates.append((i, i_t))
        return updates