optimizer.py
import torch


def get_optimizer(
        params,
        optimizer_type='adam',
        learning_rate=1e-6,
        optimizer_params=None
):
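    """Build and return an optimizer for the given parameters.

    optimizer_type selects the implementation (torch built-ins, bitsandbytes
    8-bit variants, dadaptation, prodigy, or toolkit optimizers); any extra
    keyword arguments in optimizer_params are forwarded to the constructor.
    """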
    if optimizer_params is None:
        optimizer_params = {}
    lower_type = optimizer_type.lower()
    if lower_type.startswith("dadaptation"):
        # dadaptation optimizers do not use a standard learning rate; 1.0 is the usual value
        import dadaptation
        print("Using dadaptation optimizer")
        use_lr = learning_rate
        if use_lr < 0.1:
            # dadaptation expects a relative lr, typically in the 0.1 to 1.0 range; default to 1.0
            use_lr = 1.0
        if lower_type.endswith('lion'):
            optimizer = dadaptation.DAdaptLion(params, eps=1e-8, lr=use_lr, **optimizer_params)
        elif lower_type.endswith('adam'):
            optimizer = dadaptation.DAdaptAdam(params, eps=1e-8, lr=use_lr, **optimizer_params)
        elif lower_type == 'dadaptation':
            # backwards compatibility
            optimizer = dadaptation.DAdaptAdam(params, eps=1e-8, lr=use_lr, **optimizer_params)
            # warn user that the bare dadaptation type is deprecated
            print("WARNING: Dadaptation optimizer type has been changed to DadaptationAdam. Please update your config.")
        else:
            raise ValueError(f'Unknown optimizer type {optimizer_type}')
elif lower_type.startswith("prodigy8bit"):
from toolkit.optimizers.prodigy_8bit import Prodigy8bit
print("Using Prodigy optimizer")
use_lr = learning_rate
if use_lr < 0.1:
# dadaptation uses different lr that is values of 0.1 to 1.0. default to 1.0
use_lr = 1.0
print(f"Using lr {use_lr}")
# let net be the neural network you want to train
# you can choose weight decay value based on your problem, 0 by default
optimizer = Prodigy8bit(params, lr=use_lr, eps=1e-8, **optimizer_params)
elif lower_type.startswith("adamw_fp8"):
from toolkit.optimizers.adamw_fp8 import AdamWFP8
print("Using adamw_fp8")
use_lr = learning_rate
optimizer = AdamWFP8(params, lr=use_lr, eps=1e-8, **optimizer_params)
elif lower_type.startswith("adamw_bf16"):
from toolkit.optimizers.adamw_bf16 import AdamWBF16
print("Using adamw_bf16")
use_lr = learning_rate
optimizer = AdamWBF16(params, lr=use_lr, eps=1e-8, **optimizer_params)
elif lower_type.startswith("prodigy"):
from prodigyopt import Prodigy
print("Using Prodigy optimizer")
use_lr = learning_rate
if use_lr < 0.1:
# dadaptation uses different lr that is values of 0.1 to 1.0. default to 1.0
use_lr = 1.0
print(f"Using lr {use_lr}")
# let net be the neural network you want to train
# you can choose weight decay value based on your problem, 0 by default
optimizer = Prodigy(params, lr=use_lr, eps=1e-8, use_bias_correction=True, d0=5e-5, d_coef=1.0, safeguard_warmup=True, **optimizer_params)
elif lower_type == "adam8":
from toolkit.optimizers.adam8bit import Adam8bit
optimizer = Adam8bit(params, lr=learning_rate, eps=1e-8, **optimizer_params)
elif lower_type == "adamw8":
from toolkit.optimizers.adam8bit import Adam8bit
optimizer = Adam8bit(params, lr=learning_rate, eps=1e-8, decouple=True, **optimizer_params)
elif lower_type.endswith("8bit"):
import bitsandbytes
if lower_type == "adam8bit":
return bitsandbytes.optim.Adam8bit(params, lr=learning_rate, eps=1e-8, **optimizer_params)
if lower_type == "ademamix8bit":
return bitsandbytes.optim.AdEMAMix8bit(params, lr=learning_rate, eps=1e-8, **optimizer_params)
elif lower_type == "adamw8bit":
return bitsandbytes.optim.AdamW8bit(params, lr=learning_rate, eps=1e-8, **optimizer_params)
elif lower_type == "lion8bit":
return bitsandbytes.optim.Lion8bit(params, lr=learning_rate, **optimizer_params)
else:
raise ValueError(f'Unknown optimizer type {optimizer_type}')
    elif lower_type == 'adam':
        optimizer = torch.optim.Adam(params, lr=float(learning_rate), eps=1e-8, **optimizer_params)
    elif lower_type == 'adamw':
        optimizer = torch.optim.AdamW(params, lr=float(learning_rate), eps=1e-8, **optimizer_params)
    elif lower_type == 'lion':
        try:
            from lion_pytorch import Lion
            return Lion(params, lr=learning_rate, **optimizer_params)
        except ImportError:
            raise ImportError("Please install lion_pytorch to use Lion optimizer -> pip install lion-pytorch")
    elif lower_type == 'adagrad':
        optimizer = torch.optim.Adagrad(params, lr=float(learning_rate), **optimizer_params)
    elif lower_type == 'adafactor':
        from toolkit.optimizers.adafactor import Adafactor
        if 'relative_step' not in optimizer_params:
            optimizer_params['relative_step'] = False
        if 'scale_parameter' not in optimizer_params:
            optimizer_params['scale_parameter'] = False
        if 'warmup_init' not in optimizer_params:
            optimizer_params['warmup_init'] = False
        optimizer = Adafactor(params, lr=float(learning_rate), **optimizer_params)
    elif lower_type == 'automagic':
        from toolkit.optimizers.automagic import Automagic
        optimizer = Automagic(params, lr=float(learning_rate), **optimizer_params)
    else:
        raise ValueError(f'Unknown optimizer type {optimizer_type}')
    return optimizer
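

# Minimal usage sketch (illustration only, not part of the original module).
# The tiny linear model and the hyperparameter values below are assumptions
# chosen purely to show how get_optimizer is typically called.
if __name__ == "__main__":
    model = torch.nn.Linear(4, 2)
    optim = get_optimizer(
        model.parameters(),
        optimizer_type='adamw',
        learning_rate=1e-4,
        optimizer_params={'weight_decay': 0.01},
    )
    print(type(optim).__name__)  # -> AdamW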