|
129 | 129 | (Ranger, {'lr': 5e-1, 'weight_decay': 1e-3}, 150), |
130 | 130 | (Ranger21, {'lr': 5e-1, 'weight_decay': 1e-3, 'num_iterations': 500}, 200), |
131 | 131 | (Shampoo, {'lr': 5e-1, 'weight_decay': 1e-3, 'momentum': 0.1}, 10), |
132 | | - (ScalableShampoo, {'lr': 1e-1, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'graft_type': 0}, 10), |
133 | | - (ScalableShampoo, {'lr': 1e-1, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'graft_type': 1}, 10), |
134 | | - (ScalableShampoo, {'lr': 1e-1, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'graft_type': 2}, 10), |
135 | | - (ScalableShampoo, {'lr': 1e-2, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'graft_type': 3}, 10), |
136 | | - (ScalableShampoo, {'lr': 1e-1, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'graft_type': 4}, 10), |
137 | 132 | ( |
138 | 133 | ScalableShampoo, |
139 | | - {'lr': 1e-1, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'pre_conditioner_type': 0}, |
| 134 | + { |
| 135 | + 'lr': 1e-1, |
| 136 | + 'weight_decay': 1e-3, |
| 137 | + 'start_preconditioning_step': 9, |
| 138 | + 'preconditioning_compute_steps': 10, |
| 139 | + 'graft_type': 0, |
| 140 | + }, |
140 | 141 | 10, |
141 | 142 | ), |
142 | 143 | ( |
143 | 144 | ScalableShampoo, |
144 | | - {'lr': 1e-1, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'pre_conditioner_type': 1}, |
| 145 | + { |
| 146 | + 'lr': 1e-1, |
| 147 | + 'weight_decay': 1e-3, |
| 148 | + 'start_preconditioning_step': 9, |
| 149 | + 'preconditioning_compute_steps': 10, |
| 150 | + 'graft_type': 1, |
| 151 | + }, |
145 | 152 | 10, |
146 | 153 | ), |
147 | 154 | ( |
148 | 155 | ScalableShampoo, |
149 | | - {'lr': 1e-1, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'pre_conditioner_type': 2}, |
| 156 | + { |
| 157 | + 'lr': 1e-1, |
| 158 | + 'weight_decay': 1e-3, |
| 159 | + 'start_preconditioning_step': 9, |
| 160 | + 'preconditioning_compute_steps': 10, |
| 161 | + 'graft_type': 2, |
| 162 | + }, |
150 | 163 | 10, |
151 | 164 | ), |
152 | 165 | ( |
153 | 166 | ScalableShampoo, |
154 | | - {'lr': 1e-1, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'inverse_exponent_override': 1}, |
| 167 | + { |
| 168 | + 'lr': 1e-2, |
| 169 | + 'weight_decay': 1e-3, |
| 170 | + 'start_preconditioning_step': 9, |
| 171 | + 'preconditioning_compute_steps': 10, |
| 172 | + 'graft_type': 3, |
| 173 | + }, |
155 | 174 | 10, |
156 | 175 | ), |
157 | | - (ScalableShampoo, {'lr': 1e-1, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'nesterov': False}, 10), |
158 | 176 | ( |
159 | 177 | ScalableShampoo, |
160 | | - {'lr': 1e-1, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'decoupled_weight_decay': True}, |
| 178 | + { |
| 179 | + 'lr': 1e-1, |
| 180 | + 'weight_decay': 1e-3, |
| 181 | + 'start_preconditioning_step': 9, |
| 182 | + 'preconditioning_compute_steps': 10, |
| 183 | + 'graft_type': 4, |
| 184 | + }, |
161 | 185 | 10, |
162 | 186 | ), |
163 | 187 | ( |
164 | 188 | ScalableShampoo, |
165 | | - {'lr': 1e-0, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'decoupled_learning_rate': False}, |
| 189 | + { |
| 190 | + 'lr': 1e-1, |
| 191 | + 'weight_decay': 1e-3, |
| 192 | + 'start_preconditioning_step': 9, |
| 193 | + 'preconditioning_compute_steps': 10, |
| 194 | + 'pre_conditioner_type': 0, |
| 195 | + }, |
166 | 196 | 10, |
167 | 197 | ), |
168 | 198 | ( |
169 | 199 | ScalableShampoo, |
170 | | - {'lr': 1e-1, 'weight_decay': 1e-3, 'start_preconditioning_step': 9, 'moving_average_for_momentum': True}, |
| 200 | + { |
| 201 | + 'lr': 1e-1, |
| 202 | + 'weight_decay': 1e-3, |
| 203 | + 'start_preconditioning_step': 9, |
| 204 | + 'preconditioning_compute_steps': 10, |
| 205 | + 'pre_conditioner_type': 1, |
| 206 | + }, |
| 207 | + 10, |
| 208 | + ), |
| 209 | + ( |
| 210 | + ScalableShampoo, |
| 211 | + { |
| 212 | + 'lr': 1e-1, |
| 213 | + 'weight_decay': 1e-3, |
| 214 | + 'start_preconditioning_step': 9, |
| 215 | + 'preconditioning_compute_steps': 10, |
| 216 | + 'pre_conditioner_type': 2, |
| 217 | + }, |
| 218 | + 10, |
| 219 | + ), |
| 220 | + ( |
| 221 | + ScalableShampoo, |
| 222 | + { |
| 223 | + 'lr': 1e-1, |
| 224 | + 'weight_decay': 1e-3, |
| 225 | + 'start_preconditioning_step': 9, |
| 226 | + 'preconditioning_compute_steps': 10, |
| 227 | + 'inverse_exponent_override': 1, |
| 228 | + }, |
| 229 | + 10, |
| 230 | + ), |
| 231 | + ( |
| 232 | + ScalableShampoo, |
| 233 | + { |
| 234 | + 'lr': 1e-1, |
| 235 | + 'weight_decay': 1e-3, |
| 236 | + 'start_preconditioning_step': 9, |
| 237 | + 'preconditioning_compute_steps': 10, |
| 238 | + 'nesterov': False, |
| 239 | + }, |
| 240 | + 10, |
| 241 | + ), |
| 242 | + ( |
| 243 | + ScalableShampoo, |
| 244 | + { |
| 245 | + 'lr': 1e-1, |
| 246 | + 'weight_decay': 1e-3, |
| 247 | + 'start_preconditioning_step': 9, |
| 248 | + 'preconditioning_compute_steps': 10, |
| 249 | + 'decoupled_weight_decay': True, |
| 250 | + }, |
| 251 | + 10, |
| 252 | + ), |
| 253 | + ( |
| 254 | + ScalableShampoo, |
| 255 | + { |
| 256 | + 'lr': 1e-0, |
| 257 | + 'weight_decay': 1e-3, |
| 258 | + 'start_preconditioning_step': 9, |
| 259 | + 'preconditioning_compute_steps': 10, |
| 260 | + 'decoupled_learning_rate': False, |
| 261 | + }, |
| 262 | + 10, |
| 263 | + ), |
| 264 | + ( |
| 265 | + ScalableShampoo, |
| 266 | + { |
| 267 | + 'lr': 1e-1, |
| 268 | + 'weight_decay': 1e-3, |
| 269 | + 'start_preconditioning_step': 9, |
| 270 | + 'preconditioning_compute_steps': 10, |
| 271 | + 'moving_average_for_momentum': True, |
| 272 | + }, |
171 | 273 | 10, |
172 | 274 | ), |
173 | 275 | (PNM, {'lr': 3e-1}, 50), |
|
0 commit comments