@@ -233,219 +233,6 @@ def testTP2Sharding4(self):
             np.testing.assert_allclose(res[0], res[1], self.rtol)
 
 
-@pytest.mark.xdist_group(name="UC")
-class TestUnifiedCheckpointFull(TestUnifiedCheckpointBase):
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testTP8(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["TP8"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testTP4DP2(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["TP4DP2"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testTP4Sharding2(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["TP4Sharding2"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testTP2PP4(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["TP2PP4"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testPP8(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["PP8"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testPP4DP2(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["PP4DP2"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testPP4Sharding2(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["PP4Sharding2"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testSharding8S1(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["Sharding8S1"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testSharding8S2(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["Sharding8S2"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testSharding4S1DP2(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["Sharding4S1DP2"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testSharding4S2DP2(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["Sharding4S2DP2"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testSharding2S1DP4(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["Sharding2S1DP4"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testSharding2S2DP4(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["Sharding2S2DP4"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-    @skip_for_none_ce_case
-    @require_paddle_at_least_8_gpu
-    def testDP8(self):
-        remove_logs()
-        remove_ckpt(pretrain_arguments["output_dir"])
-
-        train_args = self.configs["DP8"]
-        self.runfirst(train_args)
-        self.rerun(train_args)
-
-        if self.need_allclose:
-            res = check_acc()
-            assert len(res) == 2
-            np.testing.assert_allclose(res[0], res[1], self.rtol)
-
-
 @pytest.mark.skipif(True, reason="Skip for None CE")
 class TestUnifiedCheckpointOnN2C4(TestUnifiedCheckpointBase):
     def setUp(self):
@@ -460,28 +247,6 @@ def rerun(self, train_args):
         self.run_n2c4(self.run_pretrain_file, **train_args)
 
 
-# Test Unified Checkpoint Hybrid Parallel Strategy Convert on N1C8
-@pytest.mark.skipif(True, reason="Skip for failed")
-class TestUnifiedCheckpointOnN1C8Dynamic(TestUnifiedCheckpointFull):
-    def setUp(self):
-        super().setUp()
-        self.need_allclose = False
-        self.rtol = 1e-4
-        self.k = MAX_CONVERT_CONFIGS  # max: 16, min: 1
-
-    def runfirst(self, train_args):
-        self.run_n1c8(self.run_pretrain_file, **train_args)
-
-    def rerun(self, train_args):
-        configs = random_sample(self.configs.keys(), k=self.k)
-        for config_name in configs:
-            print(f"Rerun using {config_name}")
-            config = self.configs[config_name]
-            self.run_n1c8(self.run_pretrain_file, **config)
-            res = check_acc()
-            np.testing.assert_allclose(res[0], res[-1], rtol=self.rtol)
-
-
 # Test Unified Checkpoint Hybrid Parallel Strategy Convert on N2C4
 @pytest.mark.skipif(True, reason="Skip for failed")
 class TestUnifiedCheckpointOnN2C4Dynamic(TestUnifiedCheckpointBase):
@@ -1132,42 +897,3 @@ def rerun(self, train_args):
             self.run_n1c8(self.run_pretrain_file, **config)
             res = check_acc()
             np.testing.assert_allclose(res[0], res[-1], rtol=self.rtol)
-
-
-@pytest.mark.skipif(True, reason="Skip for None CE")
-class TestUnifiedCheckpointOnN1C8SaveLoadSpeed(TestUnifiedCheckpointFull):
-    def setUp(self):
-        super().setUp()
-        for config_key in self.configs:
-            self.configs[config_key]["skip_profile_timer"] = 0
-            self.configs[config_key]["unified_checkpoint"] = 1
-            self.configs[config_key]["save_steps"] = 6
-            self.configs[config_key]["unified_checkpoint_config"] = "skip_save_model_weight master_weight_compatible"
-
-        self.need_allclose = False
-        self.rtol = 1e-7
-
-    def runfirst(self, train_args):
-        self.run_n1c8(self.run_pretrain_file, log_dir="log_uc", **train_args)
-
-    def rerun(self, train_args):
-        self.run_n1c8(self.run_pretrain_file, log_dir="log_uc", **train_args)
-
-
-@pytest.mark.skipif(True, reason="Skip for None CE")
-class TestPaddleCheckpointOnN1C8SaveLoadSpeed(TestUnifiedCheckpointFull):
-    def setUp(self):
-        super().setUp()
-        for config_key in self.configs:
-            self.configs[config_key]["skip_profile_timer"] = 0
-            self.configs[config_key]["unified_checkpoint"] = 0
-            self.configs[config_key]["save_steps"] = 6
-
-        self.need_allclose = False
-        self.rtol = 1e-7
-
-    def runfirst(self, train_args):
-        self.run_n1c8(self.run_pretrain_file, log_dir="log_pd", **train_args)
-
-    def rerun(self, train_args):
-        self.run_n1c8(self.run_pretrain_file, log_dir="log_pd", **train_args)
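Side note on the scheduling marker seen in the first hunk (illustrative only, not part of this PR): the removed TestUnifiedCheckpointFull class carried @pytest.mark.xdist_group(name="UC"). With pytest-xdist (2.5 or newer) run under --dist loadgroup, all tests sharing a group name are dispatched to the same worker, which matters when every test in the group needs the full set of 8 GPUs. A minimal sketch of that behavior, assuming pytest and pytest-xdist are installed and the suite is launched with `pytest -n 2 --dist loadgroup`:

    import pytest


    @pytest.mark.xdist_group(name="UC")
    class TestRunsOnOneWorker:
        # Both tests share the "UC" group, so the loadgroup scheduler keeps
        # them on a single xdist worker instead of spreading them across
        # workers that would otherwise compete for the same GPUs.
        def test_first(self):
            assert True

        def test_second(self):
            assert True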