Commit 5bb3018

fix some typo
1 parent cfb5d65 commit 5bb3018

5 files changed: +9 lines, -10 lines

examples/config/deepseek_v3/sft_128k_argument_dsv3.json

Lines changed: 1 addition & 2 deletions

@@ -27,7 +27,7 @@
 "disable_tqdm": true,
 "use_expert_parallel": true,
 "expert_parallel_degree": 16,
-"continue_training": false,
+"continue_training": true,
 "pipeline_parallel_config": "enable_delay_scale_loss disable_partial_send_recv disable_batch_p2p_comm",
 "tensor_parallel_config": "enable_delay_scale_loss",
 "load_best_model_at_end": true,
@@ -41,7 +41,6 @@
 "pipeline_parallel_degree": 8,
 "sharding_parallel_degree": 2,
 "sharding": "stage1",
-"zero_padding": true,
 "unified_checkpoint": true,
 "use_flash_attention": true,
 "flash_mask": true,

examples/config/deepseek_v3/sft_4k_argument_dsv3.json

Lines changed: 0 additions & 1 deletion

@@ -41,7 +41,6 @@
 "pipeline_parallel_degree": 16,
 "sharding_parallel_degree": 8,
 "sharding": "stage1",
-"zero_padding": true,
 "unified_checkpoint": false,
 "save_sharded_model": false,
 "save_steps": 15,

examples/run_finetune.py

Lines changed: 6 additions & 6 deletions

@@ -162,6 +162,10 @@ def main():
 model_config._attn_implementation = model_args.attn_impl
 model_config.moe_subbatch_token_num = model_args.moe_subbatch_token_num
 model_config.gradient_accumulation_steps = training_args.gradient_accumulation_steps
+model_config.using_flex_token = model_args.using_flex_token
+model_config.using_fake_gate = model_args.using_fake_gate
+model_config.moe_subbatch_token_num = model_args.moe_subbatch_token_num
+model_config.aux_loss_alpha = model_args.aux_loss_alpha
 logger.info(f"Final model config: {model_config}")
 logger.info("Creating model")
 
@@ -172,11 +176,6 @@ def main():
 
 model_class = AutoModelForCausalLMPipe
 
-model_config.using_flex_token = model_args.using_flex_token
-model_config.using_fake_gate = model_args.using_fake_gate
-model_config.moe_subbatch_token_num = model_args.moe_subbatch_token_num
-model_config.aux_loss_alpha = model_args.aux_loss_alpha
-
 if model_args.continue_training and not training_args.autotuner_benchmark:
     model = model_class.from_pretrained(
         model_args.model_name_or_path,
@@ -313,7 +312,8 @@ def neft_post_hook(module, input, output):
 if training_args.use_expert_parallel:
     callbacks += [MoeExpertsGradScaleCallback(training_args)]
 
-print("callbacks:", callbacks, flush=True)
+logger.info("callbacks:", callbacks, flush=True)
+
 trainer = SFTTrainer(
     model=model,
     args=training_args,
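
Two things happen in this file: the four model_config assignments are hoisted above the "Final model config" log so they are applied (and logged) on both the pipeline and non-pipeline code paths, and the print call becomes a logger call. As a side note, Python's standard logging.Logger.info does not take flush= or extra positional objects the way print does; whether the project's logger wrapper accepts the print-style call is not visible from this diff. A stdlib-safe version of the same logging would look like the sketch below (logger name and callback list are placeholders).

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("run_finetune")

callbacks = ["MoeExpertsGradScaleCallback"]  # placeholder list for illustration

# Stdlib logging formats lazily; pass the list through %r instead of extra args.
logger.info("callbacks: %r", callbacks)

# Flushing, if needed, is done on the handlers rather than per call.
for handler in logger.handlers or logging.getLogger().handlers:
    handler.flush()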

paddleformers/transformers/deepseek_v2/modeling.py

Lines changed: 1 addition & 1 deletion

@@ -2277,7 +2277,7 @@ def forward(self, hidden_states, tensor_parallel_output=None):
 # Enable tensor_parallel_output when using sequence and tensor parallelism, or loss will be wrong.
 if self.config.sequence_parallel and self.config.tensor_parallel_degree > 1:
     assert (
-        self.config.tensor_parallel_output is False
+        self.config.tensor_parallel_output is True
     ), "tensor_parallel_output must be true when using sequence_parallel and tensor_parallel"
 
 if get_env_device() == "xpu" and self.xpu_parallel_matmul is not None:
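
The flipped condition now matches the invariant the error message already stated: with sequence parallelism plus tensor parallelism, the LM head output must stay tensor-parallel or the loss is computed on the wrong shards. A standalone sketch of the same check follows; the attribute names mirror the diff, while the config class itself is only illustrative.

from dataclasses import dataclass

@dataclass
class ParallelConfig:
    sequence_parallel: bool
    tensor_parallel_degree: int
    tensor_parallel_output: bool

def check_lm_head_config(config: ParallelConfig) -> None:
    # Mirrors the corrected assert: tensor_parallel_output must be True in this case.
    if config.sequence_parallel and config.tensor_parallel_degree > 1:
        assert config.tensor_parallel_output is True, (
            "tensor_parallel_output must be true when using sequence_parallel and tensor_parallel"
        )

check_lm_head_config(ParallelConfig(True, 8, True))     # passes
# check_lm_head_config(ParallelConfig(True, 8, False))  # would raise AssertionError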

paddleformers/transformers/deepseek_v2/modeling_pp.py

Lines changed: 1 addition & 0 deletions

@@ -295,6 +295,7 @@ def forward(self, args):
 if moelayer_use_subbatch_recompute:
     hidden_states = super().subbatch_recompute_forward(
         hidden_states,
+        inputs_embeds_cur_depth,
         position_ids=position_ids,
         attention_mask=attention_mask,
         attn_mask_startend_row_indices=attn_mask_startend_row_indices,
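
The added positional argument matters because the remaining arguments are passed by keyword: if the parent's subbatch_recompute_forward expects inputs_embeds_cur_depth right after hidden_states, omitting it fails with a TypeError before the MoE subbatch-recompute path even runs. A reduced sketch under that assumed signature is shown below; the signature and class names are assumptions for illustration, not taken from the repository.

class Base:
    def subbatch_recompute_forward(self, hidden_states, inputs_embeds_cur_depth,
                                   position_ids=None, attention_mask=None,
                                   attn_mask_startend_row_indices=None):
        # Placeholder body; the real method recomputes the MoE layer in sub-batches.
        return hidden_states

class DecoderLayerPipe(Base):
    def forward(self, hidden_states, inputs_embeds_cur_depth):
        # Before the fix the second positional argument was missing, so Python raised:
        # TypeError: ... missing 1 required positional argument: 'inputs_embeds_cur_depth'
        return super().subbatch_recompute_forward(
            hidden_states,
            inputs_embeds_cur_depth,
            position_ids=None,
            attention_mask=None,
            attn_mask_startend_row_indices=None,
        )

print(DecoderLayerPipe().forward("h", "e"))  # -> "h"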
