@@ -230,248 +230,36 @@ Citations
 
 `AdamP <https://scholar.googleusercontent.com/scholar.bib?q=info:SfSq5UFS71wJ:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0YevydU:AAGBfm0AAAAAYxCp0dVqrS10vvLfEDcY31SdH8ZRpeB4&scisig=AAGBfm0AAAAAYxCp0bLEn4nNd2Gmpb64J-nsN62Hq19N&scisf=4&ct=citation&cd=-1&hl=en>`__
 
+`Adaptive Gradient Clipping (AGC) <https://scholar.googleusercontent.com/scholar.bib?q=info:G6OwKvfrhU4J:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0YesC_0:AAGBfm0AAAAAYxCqE_3u1oAcHorMaAJ_SR7Xo5PvdxIC&scisig=AAGBfm0AAAAAYxCqEz7D8y15Q5sJL5QUdbpTMdFHGSMi&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-.. [AGC]
+`Chebyshev LR Schedules <https://scholar.googleusercontent.com/scholar.bib?q=info:5bxSTRao5pUJ:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0YesV7g:AAGBfm0AAAAAYxCqT7jEP6cOz39vHjSXD71OiD_WHNeu&scisig=AAGBfm0AAAAAYxCqTxBAT7yBvhGW1KZopv6tYDL6fjhq&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-    @article{brock2021high,
-        author={Andrew Brock and Soham De and Samuel L. Smith and Karen Simonyan},
-        title={High-Performance Large-Scale Image Recognition Without Normalization},
-        journal={arXiv preprint arXiv:2102.06171},
-        year={2021}
-    }
-
-.. [Chebyshev-LR-Schedules]
-
-    @article{agarwal2021acceleration,
-        title={Acceleration via Fractal Learning Rate Schedules},
-        author={Agarwal, Naman and Goel, Surbhi and Zhang, Cyril},
-        journal={arXiv preprint arXiv:2103.01338},
-        year={2021}
-    }
-
-
-.. raw:: html
-
-    </details>
-
-.. raw:: html
-
-    <details>
-    <summary><a>Chebyshev LR Schedules: Acceleration via Fractal Learning Rate Schedules</a></summary>
-
-::
-
-    @article{agarwal2021acceleration,
-        title={Acceleration via Fractal Learning Rate Schedules},
-        author={Agarwal, Naman and Goel, Surbhi and Zhang, Cyril},
-        journal={arXiv preprint arXiv:2103.01338},
-        year={2021}
-    }
+`Gradient Centralization (GC) <https://scholar.googleusercontent.com/scholar.bib?q=info:MQDRtwz4RekJ:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0YeskLw:AAGBfm0AAAAAYxCqiLx6z7Lo-Fag54T6c22UyMxC3uKU&scisig=AAGBfm0AAAAAYxCqiDzweYqjl8tPPjAVYv4y42-amW04&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-.. raw:: html
+`Lookahead <https://scholar.googleusercontent.com/scholar.bib?q=info:A1J2Cn9LEyQJ:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0Yest68:AAGBfm0AAAAAYxCqr68LW2mC6SXXXXIEv17IH1VfVwTU&scisig=AAGBfm0AAAAAYxCqr0ZQGEPcASa4BcFlRIMYfC_ELoH3&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-    </details>
+`RAdam <https://scholar.googleusercontent.com/scholar.bib?q=info:tTLLKZi0NB4J:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0Yes-Kc:AAGBfm0AAAAAYxCq4KdbtBaCrCnPM3teTRbkG2ke4zu1&scisig=AAGBfm0AAAAAYxCq4DKANM54ZoMqj8sYTKjhrrWTYZJv&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-.. raw:: html
+`Norm Loss <https://scholar.googleusercontent.com/scholar.bib?q=info:cgudi9fC610J:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0YetGG8:AAGBfm0AAAAAYxCrAG8mPyX5faDy-Orn0sNT3laCqhCX&scisig=AAGBfm0AAAAAYxCrAPhudmT6SGj0XyHAGuBIgn4iP9UM&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-    <details>
-    <summary><a>Gradient Centralization (GC)</a></summary>
+`Positive-Negative Momentum <https://scholar.googleusercontent.com/scholar.bib?q=info:EU4LbWCU44UJ:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0YetNIE:AAGBfm0AAAAAYxCrLIFD4YhCP2b755xkmgM9ekT5z2I3&scisig=AAGBfm0AAAAAYxCrLA0s6cI4xGBVGFOpGDBJkD4jW45M&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-::
+`Explore-Exploit Learning Rate Schedule <https://scholar.googleusercontent.com/scholar.bib?q=info:-Z0_Ot7wtzsJ:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0YetRPU:AAGBfm0AAAAAYxCrXPVjSJKqfwDN1V1KDkX--4xZuQ3d&scisig=AAGBfm0AAAAAYxCrXLMftLTqnC4BUjTH8TEDoeg8Xn0P&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-    @inproceedings{yong2020gradient,
-        title={Gradient centralization: A new optimization technique for deep neural networks},
-        author={Yong, Hongwei and Huang, Jianqiang and Hua, Xiansheng and Zhang, Lei},
-        booktitle={European Conference on Computer Vision},
-        pages={635--652},
-        year={2020},
-        organization={Springer}
-    }
+`On the adequacy of untuned warmup for adaptive optimization <https://scholar.googleusercontent.com/scholar.bib?q=info:_xl7KQ5GS8wJ:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0Yetb_s:AAGBfm0AAAAAYxCrd_t2aLAHKkunOI588UJkaMygzX7V&scisig=AAGBfm0AAAAAYxCrd4xDt7wmBQYV2J88Dv1klVIEEldW&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-.. raw:: html
+`Stable weight decay regularization <https://scholar.googleusercontent.com/scholar.bib?q=info:braJqOHCLpcJ:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0Yetu34:AAGBfm0AAAAAYxCro36JSgGOwWVwx8K21_sJaiJCi_tc&scisig=AAGBfm0AAAAAYxCro42f96rMxskixD8vZdyLuRCv9hzp&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-    </details>
+`Softplus transformation <https://scholar.googleusercontent.com/scholar.bib?q=info:_V_Tt16gXUsJ:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0Yet3gY:AAGBfm0AAAAAYxCrxgbrSUaRQqStYNBuVBPS3TMRgH7f&scisig=AAGBfm0AAAAAYxCrxqnu8UQn70pqZWxbBoJaz05eCgsj&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-.. raw:: html
+`MADGRAD <https://scholar.googleusercontent.com/scholar.bib?q=info:WnYNAExj8yEJ:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0Yet6g8:AAGBfm0AAAAAYxCr8g-OAPHACQZtBVamCAXY3mUPO7qR&scisig=AAGBfm0AAAAAYxCr8iVTWljaTOsxZ9ZHce61Uh5rYWdB&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-    <details>
-    <summary><a>Lookahead: k steps forward, 1 step back</a></summary>
-
-::
+`AdaHessian <https://scholar.googleusercontent.com/scholar.bib?q=info:NVTf2oQp6YoJ:scholar.google.com/&output=citation&scisdr=CgX1Wk9EELXN0YeqDj8:AAGBfm0AAAAAYxCsFj89NAaxz72Tc2BaFva6FGFHuzjO&scisig=AAGBfm0AAAAAYxCsFm7SeFVY6NaIy5w0BOLAVGM4oy-z&scisf=4&ct=citation&cd=-1&hl=en>`__
 
-    @article{zhang2019lookahead,
-        title={Lookahead optimizer: k steps forward, 1 step back},
-        author={Zhang, Michael R and Lucas, James and Hinton, Geoffrey and Ba, Jimmy},
-        journal={arXiv preprint arXiv:1907.08610},
-        year={2019}
-    }
-
-.. raw:: html
-
-    </details>
+` <>`__
 
-.. raw:: html
-
-    <details>
-    <summary><a>RAdam: On the Variance of the Adaptive Learning Rate and Beyond</a></summary>
-
-::
-
-    @inproceedings{liu2019radam,
-        author = {Liu, Liyuan and Jiang, Haoming and He, Pengcheng and Chen, Weizhu and Liu, Xiaodong and Gao, Jianfeng and Han, Jiawei},
-        booktitle = {Proceedings of the Eighth International Conference on Learning Representations (ICLR 2020)},
-        month = {April},
-        title = {On the Variance of the Adaptive Learning Rate and Beyond},
-        year = {2020}
-    }
-
-.. raw:: html
-
-    </details>
-
-.. raw:: html
-
-    <details>
-    <summary><a>Norm Loss: An efficient yet effective regularization method for deep neural networks</a></summary>
-
-::
-
-    @inproceedings{georgiou2021norm,
-        title={Norm Loss: An efficient yet effective regularization method for deep neural networks},
-        author={Georgiou, Theodoros and Schmitt, Sebastian and B{\"a}ck, Thomas and Chen, Wei and Lew, Michael},
-        booktitle={2020 25th International Conference on Pattern Recognition (ICPR)},
-        pages={8812--8818},
-        year={2021},
-        organization={IEEE}
-    }
-
-.. raw:: html
-
-    </details>
-
-.. raw:: html
-
-    <details>
-    <summary><a>Positive-Negative Momentum: Manipulating Stochastic Gradient Noise to Improve Generalization</a></summary>
-
-::
-
-    @article{xie2021positive,
-        title={Positive-Negative Momentum: Manipulating Stochastic Gradient Noise to Improve Generalization},
-        author={Xie, Zeke and Yuan, Li and Zhu, Zhanxing and Sugiyama, Masashi},
-        journal={arXiv preprint arXiv:2103.17182},
-        year={2021}
-    }
-
-.. raw:: html
-
-    </details>
+` <>`__
 
-.. raw:: html
-
-    <details>
-    <summary><a>Wide-minima Density Hypothesis and the Explore-Exploit Learning Rate Schedule</a></summary>
-
-::
-
-    @article{iyer2020wide,
-        title={Wide-minima Density Hypothesis and the Explore-Exploit Learning Rate Schedule},
-        author={Iyer, Nikhil and Thejas, V and Kwatra, Nipun and Ramjee, Ramachandran and Sivathanu, Muthian},
-        journal={arXiv preprint arXiv:2003.03977},
-        year={2020}
-    }
-
-.. raw:: html
-
-    </details>
-
-.. raw:: html
-
-    <details>
-    <summary><a>On the adequacy of untuned warmup for adaptive optimization</a></summary>
-
-::
-
-    @article{ma2019adequacy,
-        title={On the adequacy of untuned warmup for adaptive optimization},
-        author={Ma, Jerry and Yarats, Denis},
-        journal={arXiv preprint arXiv:1910.04209},
-        volume={7},
-        year={2019}
-    }
-
-.. raw:: html
-
-    </details>
-
-.. raw:: html
-
-    <details>
-    <summary><a>Stable weight decay regularization</a></summary>
-
-::
-
-    @article{xie2020stable,
-        title={Stable weight decay regularization},
-        author={Xie, Zeke and Sato, Issei and Sugiyama, Masashi},
-        journal={arXiv preprint arXiv:2011.11152},
-        year={2020}
-    }
-
-.. raw:: html
-
-    </details>
-
-.. raw:: html
-
-    <details>
-    <summary><a>Softplus transformation</a></summary>
-
-::
-
-    @article{tong2019calibrating,
-        title={Calibrating the adaptive learning rate to improve convergence of adam},
-        author={Tong, Qianqian and Liang, Guannan and Bi, Jinbo},
-        journal={arXiv preprint arXiv:1908.00700},
-        year={2019}
-    }
-
-.. raw:: html
-
-    </details>
-
-.. raw:: html
-
-    <details>
-    <summary><a>MADGRAD: a momentumized, adaptive, dual averaged gradient method for stochastic optimization</a></summary>
-
-::
-
-    @article{defazio2021adaptivity,
-        title={Adaptivity without compromise: a momentumized, adaptive, dual averaged gradient method for stochastic optimization},
-        author={Defazio, Aaron and Jelassi, Samy},
-        journal={arXiv preprint arXiv:2101.11075},
-        year={2021}
-    }
-
-.. raw:: html
-
-    </details>
-
-
-.. raw:: html
-
-    <details>
-    <summary><a>AdaHessian: An adaptive second order optimizer for machine learning</a></summary>
-
-::
-
-    @article{yao2020adahessian,
-        title={ADAHESSIAN: An adaptive second order optimizer for machine learning},
-        author={Yao, Zhewei and Gholami, Amir and Shen, Sheng and Mustafa, Mustafa and Keutzer, Kurt and Mahoney, Michael W},
-        journal={arXiv preprint arXiv:2006.00719},
-        year={2020}
-    }
 
 .. raw:: html
 