
Commit cfb5d65

move dsv3 from paddlenlp-dsve-sft
1 parent a103c70 commit cfb5d65

File tree

15 files changed: +1165 / -138 lines changed

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
{
    "model_name_or_path": "/root/paddlejob/tmpspace/huggingface_model/huggingface/deepseek-ai/DeepSeek-V3-bf16/",
    "dataset_name_or_path": "./data",
    "output_dir": "./checkpoints/sft_ckpts",
    "train_dataset_path": "/root/paddlejob/tmpspace/chenzhichao/PaddleNLP-SFT/llm/en_data/train.json",
    "train_dataset_prob": "1.0",
    "train_dataset_type": "erniekit",
    "eval_dataset_path": "/root/paddlejob/tmpspace/chenzhichao/PaddleNLP-SFT/llm/en_data/dev.json",
    "eval_dataset_prob": "1.0",
    "eval_dataset_type": "erniekit",
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "per_device_eval_batch_size": 1,
    "eval_accumulation_steps": 1,
    "num_train_epochs": 1,
    "max_steps": 20,
    "learning_rate": 2.2e-04,
    "warmup_steps": 30,
    "logging_steps": 1,
    "evaluation_strategy": "no",
    "save_strategy": "no",
    "max_seq_len": 131072,
    "bf16": true,
    "fp16_opt_level": "O2",
    "do_train": true,
    "do_eval": false,
    "disable_tqdm": true,
    "use_expert_parallel": true,
    "expert_parallel_degree": 16,
    "continue_training": false,
    "pipeline_parallel_config": "enable_delay_scale_loss disable_partial_send_recv disable_batch_p2p_comm",
    "tensor_parallel_config": "enable_delay_scale_loss",
    "load_best_model_at_end": true,
    "eval_with_do_generation": false,
    "metric_for_best_model": "loss",
    "recompute": true,
    "recompute_use_reentrant": true,
    "recompute_granularity": "full",
    "save_total_limit": 1,
    "tensor_parallel_degree": 8,
    "pipeline_parallel_degree": 8,
    "sharding_parallel_degree": 2,
    "sharding": "stage1",
    "zero_padding": true,
    "unified_checkpoint": true,
    "use_flash_attention": true,
    "flash_mask": true,
    "using_fake_gate": true,
    "using_flex_token": true,
    "use_fused_rms_norm": true,
    "moe_subbatch_token_num": 1024,
    "pre_alloc_memory": 60,
    "tensorwise_offload_optimizer": true,
    "sequence_parallel": true,
    "tensor_parallel_output": true,
    "amp_master_grad": true,
    "sharding_parallel_config": "split_param",
    "num_nextn_predict_layers": 1,
    "convert_from_hf": true
}
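A quick way to read the distributed layout above: tensor_parallel_degree 8 x pipeline_parallel_degree 8 x sharding_parallel_degree 2 = 128 ranks, so this recipe is laid out for a 128-GPU job on 131072-token sequences (assuming no additional pure data-parallel replicas), with expert_parallel_degree 16 further spreading the MoE experts over those ranks. The sketch below is only a sanity check, not part of the commit; the file name sft_argument.json is a placeholder.

import json

# Load one of the SFT configs above (placeholder file name) and multiply the
# distributed degrees to see how many ranks the job expects.
with open("sft_argument.json") as f:
    cfg = json.load(f)

tp = cfg.get("tensor_parallel_degree", 1)
pp = cfg.get("pipeline_parallel_degree", 1)
sd = cfg.get("sharding_parallel_degree", 1)

# Paddle-style hybrid parallelism: world size = sharding/data x tensor x pipeline.
print(f"tp={tp} pp={pp} sharding={sd} -> expected world size {tp * pp * sd}")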
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
{
    "model_name_or_path": "/root/paddlejob/tmpspace/huggingface_model/huggingface/deepseek-ai/DeepSeek-V3-bf16/",
    "dataset_name_or_path": "/root/paddlejob/tmpspace/chenzhichao/PaddleNLP-SFT/llm/en_data",
    "output_dir": "./checkpoints/sft_ckpts",
    "train_dataset_path": "/root/paddlejob/tmpspace/chenzhichao/PaddleNLP-SFT/llm/en_data/train.json",
    "train_dataset_prob": "1.0",
    "train_dataset_type": "erniekit",
    "eval_dataset_path": "/root/paddlejob/tmpspace/chenzhichao/PaddleNLP-SFT/llm/en_data/dev.json",
    "eval_dataset_prob": "1.0",
    "eval_dataset_type": "erniekit",
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "per_device_eval_batch_size": 1,
    "eval_accumulation_steps": 1,
    "learning_rate": 2.2e-04,
    "warmup_steps": 30,
    "max_seq_len": 4096,
    "logging_steps": 1,
    "evaluation_strategy": "no",
    "save_strategy": "no",
    "bf16": true,
    "amp_master_grad": true,
    "fp16_opt_level": "O2",
    "do_train": true,
    "do_eval": false,
    "disable_tqdm": true,
    "use_expert_parallel": true,
    "expert_parallel_degree": 8,
    "continue_training": false,
    "pipeline_parallel_config": "enable_delay_scale_loss disable_partial_send_recv disable_batch_p2p_comm",
    "tensor_parallel_config": "enable_delay_scale_loss",
    "load_best_model_at_end": false,
    "eval_with_do_generation": false,
    "metric_for_best_model": "loss",
    "recompute": true,
    "recompute_use_reentrant": true,
    "recompute_granularity": "full",
    "save_total_limit": 1,
    "tensor_parallel_degree": 1,
    "sequence_parallel": false,
    "pipeline_parallel_degree": 16,
    "sharding_parallel_degree": 8,
    "sharding": "stage1",
    "zero_padding": true,
    "unified_checkpoint": false,
    "save_sharded_model": false,
    "save_steps": 15,
    "use_flash_attention": true,
    "flash_mask": true,
    "using_fake_gate": false,
    "using_flex_token": true,
    "pre_alloc_memory": 60,
    "tensorwise_offload_optimizer": true,
    "use_fused_rms_norm": true,
    "max_steps": 9,
    "sharding_parallel_config": "split_param",
    "tensor_parallel_output": true,
    "num_nextn_predict_layers": 1,
    "convert_from_hf": true
}
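Compared with the first recipe (131072-token sequences, tensor parallel 8 x pipeline parallel 8 x sharding 2), this one keeps the same 128-rank budget but trades tensor parallelism for a deeper pipeline: tensor parallel 1 x pipeline parallel 16 x sharding 8, expert parallel 8, a 4096-token sequence length, and sequence_parallel/unified_checkpoint switched off. A small sketch (not part of the commit) that lists exactly which keys differ between two such configs; the file names are placeholders, since the real ones were lost in extraction.

import json

def load(path):
    with open(path) as f:
        return json.load(f)

# Placeholder names for the two JSON recipes shown above.
long_seq = load("sft_128k.json")
short_seq = load("sft_4k.json")

# Print every key whose value differs between the two recipes.
for key in sorted(long_seq.keys() | short_seq.keys()):
    if long_seq.get(key) != short_seq.get(key):
        print(f"{key}: {long_seq.get(key)!r} -> {short_seq.get(key)!r}")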

examples/run_finetune.py

Lines changed: 30 additions & 1 deletion
@@ -32,6 +32,10 @@
     get_last_checkpoint,
     set_seed,
 )
+from paddleformers.trainer.trainer_callback import (
+    MoECorrectionBiasAdjustCallback,
+    MoeExpertsGradScaleCallback,
+)
 from paddleformers.transformers import (
     AutoConfig,
     AutoModelForCausalLM,
@@ -49,6 +53,19 @@
 os.environ["USE_CASUAL_MASK"] = "False"


+def mock_offload_optimizer():
+    """
+    mock offload optimizer
+    """
+    try:
+        from paddleformers.trainer.utils.offload_optimizer import hack_offload_optimizer
+
+        hack_offload_optimizer()
+        logger.warning("hack_offload_optimizer called.")
+    except ImportError:
+        logger.warning("hack_offload_optimizer is not imported")
+
+
 def main():
     parser = PdArgumentParser((ModelConfig, DataConfig, SFTConfig))
     if len(sys.argv) >= 2 and sys.argv[1].endswith(".json"):
@@ -60,6 +77,9 @@ def main():
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()

+    if training_args.tensorwise_offload_optimizer:
+        mock_offload_optimizer()
+
     training_args.print_config(model_args, "Model")
     training_args.print_config(data_args, "Data")

@@ -141,6 +161,7 @@ def main():
     model_config.num_nextn_predict_layers = model_args.num_nextn_predict_layers
     model_config._attn_implementation = model_args.attn_impl
     model_config.moe_subbatch_token_num = model_args.moe_subbatch_token_num
+    model_config.gradient_accumulation_steps = training_args.gradient_accumulation_steps
     logger.info(f"Final model config: {model_config}")
     logger.info("Creating model")

@@ -151,6 +172,11 @@

         model_class = AutoModelForCausalLMPipe

+    model_config.using_flex_token = model_args.using_flex_token
+    model_config.using_fake_gate = model_args.using_fake_gate
+    model_config.moe_subbatch_token_num = model_args.moe_subbatch_token_num
+    model_config.aux_loss_alpha = model_args.aux_loss_alpha
+
     if model_args.continue_training and not training_args.autotuner_benchmark:
         model = model_class.from_pretrained(
             model_args.model_name_or_path,
@@ -279,8 +305,10 @@ def neft_post_hook(module, input, output):
         training_args.logging_steps = int(training_args.max_steps / training_args.num_train_epochs)

     callbacks = []
+
     if getattr(model_config, "topk_method", None) == "noaux_tc":
-        callbacks += [MoECorrectionBiasAdjustCallback(lr=0)]
+        # deepseek_v3 finetune do not update the bias, so set lr to 0.0
+        callbacks += [MoECorrectionBiasAdjustCallback(lr=0.0)]

     if training_args.use_expert_parallel:
         callbacks += [MoeExpertsGradScaleCallback(training_args)]
@@ -296,6 +324,7 @@ def neft_post_hook(module, input, output):
         data_collator=data_collator,
         do_generation=data_args.eval_with_do_generation,
         data_args=data_args,
+        callbacks=callbacks,
     )
     trainable_parameters = [
         p for p in model.parameters() if not p.stop_gradient or ("quantization_linear" in p.name and "w_1" in p.name)
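A note on the callback change above. DeepSeek-V3's noaux_tc routing keeps a per-expert bias that pretraining nudges toward balanced expert load; the diff constructs MoECorrectionBiasAdjustCallback with lr=0.0 so that finetuning leaves this bias frozen, as the added comment states. The sketch below only illustrates that sign-based update rule with a zero step size; it is not the paddleformers implementation, and the function and numbers are invented for the example.

import numpy as np

def adjust_bias(bias, tokens_per_expert, update_rate):
    """Sign-based load-balancing update: raise the bias of under-loaded experts
    and lower it for over-loaded ones; update_rate=0.0 leaves the bias untouched."""
    mean_load = tokens_per_expert.mean()
    return bias + update_rate * np.sign(mean_load - tokens_per_expert)

bias = np.zeros(4)
load = np.array([10.0, 30.0, 25.0, 15.0])  # toy per-expert token counts
print(adjust_bias(bias, load, update_rate=1e-3))  # pretraining-style nudging
print(adjust_bias(bias, load, update_rate=0.0))   # finetuning: stays all zeros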

0 commit comments
