
Commit b2b8c33

add dsv3
1 parent 03b533c commit b2b8c33

20 files changed: +1283 additions, -2444 deletions

examples/run_finetune.py

Lines changed: 8 additions & 2 deletions
@@ -140,6 +140,8 @@ def main():
     model_config.max_sequence_length = training_args.max_seq_len
     model_config.num_nextn_predict_layers = model_args.num_nextn_predict_layers
     model_config._attn_implementation = model_args.attn_impl
+    model_config.using_fake_gate = model_args.using_fake_gate
+    model_config.aux_loss_alpha = model_args.aux_loss_alpha
     logger.info(f"Final model config: {model_config}")
     logger.info("Creating model")
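Note: using_fake_gate and aux_loss_alpha are simply forwarded from model_args into the model config here; aux_loss_alpha is the usual weight on the MoE router's load-balancing auxiliary loss. A hedged sketch of the kind of term such a coefficient typically scales (illustrative only, not necessarily this repo's exact formulation):

# Sketch of a standard MoE load-balancing auxiliary loss; names are
# illustrative, not the actual paddleformers implementation.
import paddle

def aux_balance_loss(router_probs, expert_mask, aux_loss_alpha):
    # router_probs: [tokens, num_experts] softmax gate probabilities
    # expert_mask:  [tokens, num_experts] one-hot dispatch of each token
    num_experts = router_probs.shape[-1]
    frac_tokens = expert_mask.astype("float32").mean(axis=0)  # f_i: fraction of tokens sent to expert i
    mean_probs = router_probs.mean(axis=0)                     # P_i: mean gate probability of expert i
    return aux_loss_alpha * num_experts * paddle.sum(frac_tokens * mean_probs)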

@@ -278,13 +280,16 @@ def neft_post_hook(module, input, output):
     training_args.logging_steps = int(training_args.max_steps / training_args.num_train_epochs)
 
     callbacks = []
+
     if getattr(model_config, "topk_method", None) == "noaux_tc":
-        callbacks += [MoECorrectionBiasAdjustCallback(lr=0)]
+        # deepseek_v3 finetune do not update the bias, so set lr to 0.0
+        callbacks += [MoECorrectionBiasAdjustCallback(lr=0.0)]
 
     if training_args.use_expert_parallel:
         callbacks += [MoeExpertsGradScaleCallback(training_args)]
 
-    print("callbacks:", callbacks, flush=True)
+    logger.info(f"callbacks: {callbacks}")
+
     trainer = SFTTrainer(
         model=model,
         args=training_args,
@@ -295,6 +300,7 @@ def neft_post_hook(module, input, output):
         data_collator=data_collator,
         do_generation=data_args.eval_with_do_generation,
         data_args=data_args,
+        callbacks=callbacks,
     )
     trainable_parameters = [
         p for p in model.parameters() if not p.stop_gradient or ("quantization_linear" in p.name and "w_1" in p.name)
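The lr=0.0 above leans on the shape of DeepSeek-V3's auxiliary-loss-free balancing: the per-expert correction bias added to the gate scores before top-k routing is nudged by a fixed step toward balanced expert load, so a zero step size simply freezes the bias learned during pretraining. A hedged sketch of that update rule (illustrative, not the callback's actual code):

# Sketch of an aux-loss-free correction-bias update; names are illustrative,
# not MoECorrectionBiasAdjustCallback's real attributes.
import paddle

def adjust_correction_bias(bias, expert_load, lr):
    # bias:        [num_experts] correction bias added to gate scores before top-k
    # expert_load: [num_experts] tokens recently routed to each expert
    target = expert_load.astype("float32").mean()     # balanced load per expert
    err = target - expert_load.astype("float32")      # > 0 means the expert is under-loaded
    return bias + lr * paddle.sign(err)               # with lr=0.0 the bias never changes

When use_expert_parallel is set, MoeExpertsGradScaleCallback is registered as well (by its name, a gradient-scale adjustment for expert parameters under expert parallelism), and both callbacks now actually reach the trainer through the new callbacks=callbacks argument.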

paddleformers/nn/pp_model.py

Lines changed: 31 additions & 7 deletions
@@ -508,12 +508,28 @@ class GeneralModelForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
     _embed_cls = None
     _rotary_emb_cls = None
     _norm_cls = "rms_norm"
+    _mtp_layer_pipe_cls = None
+    _embedding_pipe_cls = None
+    _decoder_layer_pipe_cls = None
+    _criterion_pipe_cls = None
+    _lmhead_pipe_cls = None
+    _rms_norm_pipe_cls = None
 
     def __init__(self, config: PretrainedConfig, **kwargs):
         # dynamic inherit DecoderLayer
         if self._decoder_layer_cls is None:
             raise ValueError("_decoder_layer_cls must be set before init.")
-        DecoderLayerPipe = make_decoder_layer_pipe(self._decoder_layer_cls)
+
+        EmbeddingPipeCls = self._embedding_pipe_cls if self._embedding_pipe_cls is not None else Embedding
+
+        if self._decoder_layer_pipe_cls is None:
+            DecoderLayerPipe = make_decoder_layer_pipe(self._decoder_layer_cls)
+        else:
+            DecoderLayerPipe = self._decoder_layer_pipe_cls
+
+        LMHeadPipeCls = self._lmhead_pipe_cls if self._lmhead_pipe_cls is not None else LMHeadPipe
+        MTPLayerPipeCls = self._mtp_layer_pipe_cls if self._mtp_layer_pipe_cls is not None else None
+        RMSNormPipeCls = self._rms_norm_pipe_cls if self._rms_norm_pipe_cls is not None else RMSNormPipe
 
         new_initializer_range = math.sqrt(0.3333 / config.hidden_size)
         logger.info(f"change initializer-range from {config.initializer_range} to {new_initializer_range}")
@@ -560,7 +576,7 @@ def __init__(self, config: PretrainedConfig, **kwargs):
         else:
             self.add_sequential_layer(
                 LayerDesc(
-                    EmbeddingPipe, config=config, embed_cls=self._embed_cls, rotary_emb_cls=self._rotary_emb_cls
+                    EmbeddingPipeCls, config=config, embed_cls=self._embed_cls, rotary_emb_cls=self._rotary_emb_cls
                 ),
                 "model",
             )
@@ -574,6 +590,12 @@ def __init__(self, config: PretrainedConfig, **kwargs):
                 ),
                 f"model.layers.{i}",
             )
+        for i in range(config.num_nextn_predict_layers):
+            if MTPLayerPipeCls is not None:
+                self.add_sequential_layer(
+                    LayerDesc(MTPLayerPipeCls, config=config, layer_idx=config.num_hidden_layers + i),
+                    f"model.layers.{config.num_hidden_layers + i}",
+                )
         for i in range(config.add_tail_layers):
             self.add_sequential_layer(
                 LayerDesc(
@@ -583,22 +605,22 @@ def __init__(self, config: PretrainedConfig, **kwargs):
             )
 
         self.add_sequential_layer(
-            LayerDesc(RMSNormPipe if self._norm_cls == "rms_norm" else LayerNormPipe, config=config),
+            LayerDesc(RMSNormPipeCls if self._norm_cls == "rms_norm" else LayerNormPipe, config=config),
             "model.norm",
         )
 
         if config.tie_word_embeddings:
             self.add_sequential_layer(
                 SharedLayerDesc(
                     "model_shared_weight",
-                    LMHeadPipe,
+                    LMHeadPipeCls,
                     shared_weight_attr="embedding_weight",
                     config=config,
                 ),
                 "lm_head",
             )
         else:
-            self.add_sequential_layer(LayerDesc(LMHeadPipe, config=config), "lm_head")
+            self.add_sequential_layer(LayerDesc(LMHeadPipeCls, config=config), "lm_head")
         recompute_interval = 0
 
         seg_method = config.pp_seg_method if hasattr(config, "pp_seg_method") else "layer:DecoderLayer|EmptyLayer"
@@ -631,10 +653,12 @@ def __init__(self, config: PretrainedConfig, **kwargs):
         )
 
     def get_loss_fn(self, config):
+        CriterionPipeCls = self._criterion_pipe_cls if self._criterion_pipe_cls is not None else CriterionLayerPipe
+
         if config.get("dpo_config", None) is not None:
-            loss_fn = CriterionLayerPipe(config, use_infohub=True)
+            loss_fn = CriterionPipeCls(config, use_infohub=True)
         else:
-            loss_fn = CriterionLayerPipe(config)
+            loss_fn = CriterionPipeCls(config)
 
         return loss_fn
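The new _*_pipe_cls class attributes turn GeneralModelForCausalLMPipe into a template: a subclass can swap in its own pipeline-parallel building blocks instead of the generic embedding, LMHeadPipe, RMSNormPipe, and CriterionLayerPipe classes, and opt into MTP layers. A hypothetical wiring for a DeepSeek-V3 pipeline class (the DeepseekV3* names below are illustrative placeholders, not necessarily the classes this commit adds elsewhere):

# Hypothetical subclass showing how the new hooks are meant to be overridden;
# every DeepseekV3* name here is a placeholder.
class DeepseekV3ForCausalLMPipe(GeneralModelForCausalLMPipe):
    _decoder_layer_cls = DeepseekV3DecoderLayer            # still required, as before
    _decoder_layer_pipe_cls = DeepseekV3DecoderLayerPipe   # bypasses make_decoder_layer_pipe()
    _embedding_pipe_cls = DeepseekV3EmbeddingPipe          # replaces the default embedding pipe layer
    _mtp_layer_pipe_cls = DeepseekV3MTPLayerPipe           # enables the extra MTP layers in __init__
    _lmhead_pipe_cls = DeepseekV3LMHeadPipe                # replaces LMHeadPipe
    _rms_norm_pipe_cls = DeepseekV3RMSNormPipe             # replaces RMSNormPipe for the final norm
    _criterion_pipe_cls = DeepseekV3CriterionPipe          # replaces CriterionLayerPipe in get_loss_fn()

With _mtp_layer_pipe_cls set, __init__ appends config.num_nextn_predict_layers multi-token-prediction layers after the regular decoder stack, registered as model.layers.{num_hidden_layers + i}, so the MTP head takes part in the pipeline schedule and keeps the model.layers.* prefix in its parameter names.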
