@@ -44,7 +44,7 @@ def test_lora_ddp(self, temp_dir):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -58,7 +58,7 @@ def test_lora_ddp(self, temp_dir):
                 "max_steps": 2,
                 "micro_batch_size": 4,
                 "gradient_accumulation_steps": 4,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_8bit",
@@ -108,7 +108,7 @@ def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -122,7 +122,7 @@ def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
                 "max_steps": 2,
                 "micro_batch_size": 1,
                 "gradient_accumulation_steps": gradient_accumulation_steps,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_8bit",
@@ -169,7 +169,7 @@ def test_dpo_lora_ddp(self, temp_dir):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -195,7 +195,7 @@ def test_dpo_lora_ddp(self, temp_dir):
                 "max_steps": 2,
                 "micro_batch_size": 4,
                 "gradient_accumulation_steps": 4,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "warmup_steps": 0,
                 "learning_rate": 0.00001,
@@ -247,7 +247,7 @@ def test_dpo_qlora_ddp(self, temp_dir):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -273,7 +273,7 @@ def test_dpo_qlora_ddp(self, temp_dir):
                 "max_steps": 2,
                 "micro_batch_size": 2,
                 "gradient_accumulation_steps": 4,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "warmup_steps": 0,
                 "learning_rate": 0.00001,
@@ -334,7 +334,7 @@ def test_fsdp(self, temp_dir, gradient_accumulation_steps):
                 "max_steps": 2,
                 "micro_batch_size": 2,
                 "gradient_accumulation_steps": gradient_accumulation_steps,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
@@ -391,7 +391,7 @@ def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -405,7 +405,7 @@ def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
                 "max_steps": 2,
                 "micro_batch_size": 4,
                 "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
@@ -470,7 +470,7 @@ def test_fsdp_qlora_prequant_packed(self, temp_dir):
                 "eval_sample_packing": False,
                 "pad_to_sequence_len": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -485,7 +485,7 @@ def test_fsdp_qlora_prequant_packed(self, temp_dir):
                 "max_steps": 2,
                 "micro_batch_size": 4,
                 "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
@@ -567,7 +567,7 @@ def test_ds_zero3_packed(
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -640,7 +640,7 @@ def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -713,7 +713,7 @@ def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -788,7 +788,7 @@ def test_fix_untrained_tokens(self, temp_dir):
                 "max_steps": 2,
                 "micro_batch_size": 1,
                 "gradient_accumulation_steps": 1,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",