+@deprecate ADAM Adam
+@deprecate NADAM NAdam
+@deprecate ADAMW AdamW
+@deprecate RADAM RAdam
+@deprecate OADAM OAdam
+@deprecate ADAGrad AdaGrad
+@deprecate ADADelta AdaDelta
+
 """
     Descent(η = 1f-1)
 
@@ -110,9 +118,9 @@ function apply!(o::RMSProp, state, x, dx)
 end
 
 """
-    ADAM(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
+    Adam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
 
-[ADAM](https://arxiv.org/abs/1412.6980) optimiser.
+[Adam](https://arxiv.org/abs/1412.6980) optimiser.
 
 # Parameters
 - Learning rate (`η`): Amount by which gradients are discounted before updating
@@ -122,16 +130,18 @@ end
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct ADAM{T}
+struct Adam{T}
   eta::T
   beta::Tuple{T, T}
   epsilon::T
 end
-ADAM(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η))) = ADAM{typeof(η)}(η, β, ϵ)
+Adam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η))) = Adam{typeof(η)}(η, β, ϵ)
+
+const Adam = Adam
 
-init(o::ADAM, x::AbstractArray) = (zero(x), zero(x), o.beta)
+init(o::Adam, x::AbstractArray) = (zero(x), zero(x), o.beta)
 
-function apply!(o::ADAM, state, x, dx)
+function apply!(o::Adam, state, x, dx)
   η, β, ϵ = o.eta, o.beta, o.epsilon
   mt, vt, βt = state
 
@@ -143,9 +153,9 @@ function apply!(o::ADAM, state, x, dx)
 end
 
 """
-    RADAM(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
+    RAdam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
 
-[Rectified ADAM](https://arxiv.org/abs/1908.03265) optimizer.
+[Rectified Adam](https://arxiv.org/abs/1908.03265) optimizer.
 
 # Parameters
 - Learning rate (`η`): Amount by which gradients are discounted before updating
@@ -155,16 +165,16 @@ end
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct RADAM{T}
+struct RAdam{T}
   eta::T
   beta::Tuple{T, T}
   epsilon::T
 end
-RADAM(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η))) = RADAM{typeof(η)}(η, β, ϵ)
+RAdam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η))) = RAdam{typeof(η)}(η, β, ϵ)
 
-init(o::RADAM, x::AbstractArray) = (zero(x), zero(x), o.beta, 1)
+init(o::RAdam, x::AbstractArray) = (zero(x), zero(x), o.beta, 1)
 
-function apply!(o::RADAM, state, x, dx)
+function apply!(o::RAdam, state, x, dx)
   η, β, ϵ = o.eta, o.beta, o.epsilon
   ρ∞ = 2/(1-β[2])-1
 
@@ -186,7 +196,7 @@
 """
     AdaMax(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
 
-[AdaMax](https://arxiv.org/abs/1412.6980) is a variant of ADAM based on the ∞-norm.
+[AdaMax](https://arxiv.org/abs/1412.6980) is a variant of Adam based on the ∞-norm.
 
 # Parameters
 - Learning rate (`η`): Amount by which gradients are discounted before updating
@@ -217,10 +227,10 @@ function apply!(o::AdaMax, state, x, dx)
 end
 
 """
-    OADAM(η = 1f-3, β = (5f-1, 9f-1), ϵ = eps(typeof(η)))
+    OAdam(η = 1f-3, β = (5f-1, 9f-1), ϵ = eps(typeof(η)))
 
-[OADAM](https://arxiv.org/abs/1711.00141) (Optimistic ADAM)
-is a variant of ADAM adding an "optimistic" term suitable for adversarial training.
+[OAdam](https://arxiv.org/abs/1711.00141) (Optimistic Adam)
+is a variant of Adam adding an "optimistic" term suitable for adversarial training.
 
 # Parameters
 - Learning rate (`η`): Amount by which gradients are discounted before updating
@@ -230,16 +240,16 @@ is a variant of ADAM adding an "optimistic" term suitable for adversarial training.
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct OADAM{T}
+struct OAdam{T}
   eta::T
   beta::Tuple{T, T}
   epsilon::T
 end
-OADAM(η = 1f-3, β = (5f-1, 9f-1), ϵ = eps(typeof(η))) = OADAM{typeof(η)}(η, β, ϵ)
+OAdam(η = 1f-3, β = (5f-1, 9f-1), ϵ = eps(typeof(η))) = OAdam{typeof(η)}(η, β, ϵ)
 
-init(o::OADAM, x::AbstractArray) = (zero(x), zero(x), o.beta, zero(x))
+init(o::OAdam, x::AbstractArray) = (zero(x), zero(x), o.beta, zero(x))
 
-function apply!(o::OADAM, state, x, dx)
+function apply!(o::OAdam, state, x, dx)
   η, β, ϵ = o.eta, o.beta, o.epsilon
   mt, vt, βt, term = state
 
@@ -253,9 +263,9 @@ function apply!(o::OADAM, state, x, dx)
 end
 
 """
-    ADAGrad(η = 1f-1, ϵ = eps(typeof(η)))
+    AdaGrad(η = 1f-1, ϵ = eps(typeof(η)))
 
-[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has
+[AdaGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has
 parameter specific learning rates based on how frequently it is updated.
 Parameters don't need tuning.
 
@@ -265,15 +275,15 @@ Parameters don't need tuning.
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct ADAGrad{T}
+struct AdaGrad{T}
   eta::T
   epsilon::T
 end
-ADAGrad(η = 1f-1, ϵ = eps(typeof(η))) = ADAGrad{typeof(η)}(η, ϵ)
+AdaGrad(η = 1f-1, ϵ = eps(typeof(η))) = AdaGrad{typeof(η)}(η, ϵ)
 
-init(o::ADAGrad, x::AbstractArray) = onevalue(o.epsilon, x)
+init(o::AdaGrad, x::AbstractArray) = onevalue(o.epsilon, x)
 
-function apply!(o::ADAGrad, state, x, dx)
+function apply!(o::AdaGrad, state, x, dx)
   η, ϵ = o.eta, o.epsilon
   acc = state
 
@@ -284,9 +294,9 @@ function apply!(o::ADAGrad, state, x, dx)
 end
 
 """
-    ADADelta(ρ = 9f-1, ϵ = eps(typeof(ρ)))
+    AdaDelta(ρ = 9f-1, ϵ = eps(typeof(ρ)))
 
-[ADADelta](https://arxiv.org/abs/1212.5701) is a version of ADAGrad adapting its learning
+[AdaDelta](https://arxiv.org/abs/1212.5701) is a version of AdaGrad adapting its learning
 rate based on a window of past gradient updates.
 Parameters don't need tuning.
 
@@ -295,15 +305,15 @@ Parameters don't need tuning.
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct ADADelta{T}
+struct AdaDelta{T}
   rho::T
   epsilon::T
 end
-ADADelta(ρ = 9f-1, ϵ = eps(typeof(ρ))) = ADADelta{typeof(ρ)}(ρ, ϵ)
+AdaDelta(ρ = 9f-1, ϵ = eps(typeof(ρ))) = AdaDelta{typeof(ρ)}(ρ, ϵ)
 
-init(o::ADADelta, x::AbstractArray) = (zero(x), zero(x))
+init(o::AdaDelta, x::AbstractArray) = (zero(x), zero(x))
 
-function apply!(o::ADADelta, state, x, dx)
+function apply!(o::AdaDelta, state, x, dx)
   ρ, ϵ = o.rho, o.epsilon
   acc, Δacc = state
 
@@ -318,7 +328,7 @@
 """
     AMSGrad(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
 
-The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM
+The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the Adam
 optimiser. Parameters don't need tuning.
 
 # Parameters
@@ -352,9 +362,9 @@ function apply!(o::AMSGrad, state, x, dx)
 end
 
 """
-    NADAM(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
+    NAdam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
 
-[NADAM](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of ADAM.
+[NAdam](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of Adam.
 Parameters don't need tuning.
 
 # Parameters
@@ -365,16 +375,16 @@ Parameters don't need tuning.
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-struct NADAM{T}
+struct NAdam{T}
   eta::T
   beta::Tuple{T, T}
   epsilon::T
 end
-NADAM(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η))) = NADAM{typeof(η)}(η, β, ϵ)
+NAdam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η))) = NAdam{typeof(η)}(η, β, ϵ)
 
-init(o::NADAM, x::AbstractArray) = (zero(x), zero(x), o.beta)
+init(o::NAdam, x::AbstractArray) = (zero(x), zero(x), o.beta)
 
-function apply!(o::NADAM, state, x, dx)
+function apply!(o::NAdam, state, x, dx)
   η, β, ϵ = o.eta, o.beta, o.epsilon
 
   mt, vt, βt = state
@@ -388,9 +398,9 @@ function apply!(o::NADAM, state, x, dx)
 end
 
 """
-    ADAMW(η = 1f-3, β = (9f-1, 9.99f-1), γ = 0, ϵ = eps(typeof(η)))
+    AdamW(η = 1f-3, β = (9f-1, 9.99f-1), γ = 0, ϵ = eps(typeof(η)))
 
-[ADAMW](https://arxiv.org/abs/1711.05101) is a variant of ADAM fixing (as in repairing) its
+[AdamW](https://arxiv.org/abs/1711.05101) is a variant of Adam fixing (as in repairing) its
 weight decay regularization.
 
 # Parameters
@@ -402,14 +412,14 @@ weight decay regularization.
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-ADAMW(η = 1f-3, β = (9f-1, 9.99f-1), γ = 0, ϵ = eps(typeof(η))) =
-  OptimiserChain(ADAM{typeof(η)}(η, β, ϵ), WeightDecay{typeof(η)}(γ))
+AdamW(η = 1f-3, β = (9f-1, 9.99f-1), γ = 0, ϵ = eps(typeof(η))) =
+  OptimiserChain(Adam{typeof(η)}(η, β, ϵ), WeightDecay{typeof(η)}(γ))
 
 """
     AdaBelief(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = 1e-16)
 
 The [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser is a variant of the well-known
-ADAM optimiser.
+Adam optimiser.
 
 # Parameters
 - Learning rate (`η`): Amount by which gradients are discounted before updating
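For context, here is a minimal usage sketch of the renamed rules, assuming a build of Optimisers.jl that includes this diff. It drives a single Adam step through the package's public `setup`/`update` entry points; the parameter array and the toy gradient below are invented purely for illustration.

using Optimisers   # assumes the renamed rules and the `@deprecate` fallbacks above are available

x = randn(Float32, 4)              # toy parameter array (illustrative only)
opt = Adam(1f-3)                   # new spelling introduced by this diff
state = Optimisers.setup(opt, x)   # build the optimiser state for `x`

g = 2 .* x                         # gradient of the toy loss sum(abs2, x)
state, x = Optimisers.update(state, x, g)   # one Adam step (non-mutating API)

ADAM(1f-3)   # old spelling still constructs an Adam, now with a deprecation warning

Here `update` returns the new optimiser state together with the updated array; the deprecated constructors simply forward to the new rule types, so existing user code keeps working while emitting a warning.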