|
45 | 45 | DAdaptSGD, |
46 | 46 | DiffGrad, |
47 | 47 | Fromage, |
| 48 | + GaLore, |
48 | 49 | Gravity, |
49 | 50 | Lamb, |
50 | 51 | Lion, |
|
401 | 402 | (CAME, {'lr': 7.5e-1, 'weight_decay': 1e-3}, 75), |
402 | 403 | (CAME, {'lr': 7.5e-1, 'weight_decay': 1e-3, 'ams_bound': True}, 75), |
403 | 404 | (Aida, {'lr': 1e0, 'weight_decay': 1e-3, 'ams_bound': True}, 5), |
| 405 | + ( |
| 406 | + GaLore, |
| 407 | + {'lr': 1e0, 'weight_decay': 1e-3, 'rank': 2, 'scale': 1.0, 'update_proj_gap': 1, 'projection_type': 'std'}, |
| 408 | + 5, |
| 409 | + ), |
| 410 | + ( |
| 411 | + GaLore, |
| 412 | + { |
| 413 | + 'lr': 1e0, |
| 414 | + 'weight_decay': 1e-3, |
| 415 | + 'rank': 2, |
| 416 | + 'scale': 1.0, |
| 417 | + 'update_proj_gap': 1, |
| 418 | + 'projection_type': 'reverse_std', |
| 419 | + }, |
| 420 | + 5, |
| 421 | + ), |
| 422 | + ( |
| 423 | + GaLore, |
| 424 | + {'lr': 5e-1, 'weight_decay': 1e-3, 'rank': 2, 'scale': 1.0, 'update_proj_gap': 2, 'projection_type': 'left'}, |
| 425 | + 5, |
| 426 | + ), |
| 427 | + ( |
| 428 | + GaLore, |
| 429 | + {'lr': 1e0, 'weight_decay': 1e-3, 'rank': 2, 'scale': 1.0, 'update_proj_gap': 1, 'projection_type': 'right'}, |
| 430 | + 5, |
| 431 | + ), |
| 432 | + ( |
| 433 | + GaLore, |
| 434 | + {'lr': 5e-1, 'weight_decay': 1e-3, 'rank': 2, 'scale': 1.0, 'update_proj_gap': 2, 'projection_type': 'full'}, |
| 435 | + 5, |
| 436 | + ), |
404 | 437 | ] |
405 | 438 | ADANORM_SUPPORTED_OPTIMIZERS: List[Tuple[Any, Dict[str, Union[float, bool, int]], int]] = [ |
406 | 439 | (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'adanorm': True}, 10), |
|
0 commit comments