10 changes: 8 additions & 2 deletions examples/run_finetune.py
@@ -140,6 +140,8 @@ def main():
model_config.max_sequence_length = training_args.max_seq_len
model_config.num_nextn_predict_layers = model_args.num_nextn_predict_layers
model_config._attn_implementation = model_args.attn_impl
model_config.using_fake_gate = model_args.using_fake_gate
model_config.aux_loss_alpha = model_args.aux_loss_alpha
logger.info(f"Final model config: {model_config}")
logger.info("Creating model")

@@ -278,13 +280,16 @@ def neft_post_hook(module, input, output):
training_args.logging_steps = int(training_args.max_steps / training_args.num_train_epochs)

callbacks = []

if getattr(model_config, "topk_method", None) == "noaux_tc":
callbacks += [MoECorrectionBiasAdjustCallback(lr=0)]
# deepseek_v3 finetune does not update the bias, so set lr to 0.0
Collaborator (review comment): glm4.5 will also use this; do not name a specific model in the comment.

callbacks += [MoECorrectionBiasAdjustCallback(lr=0.0)]

if training_args.use_expert_parallel:
callbacks += [MoeExpertsGradScaleCallback(training_args)]

print("callbacks:", callbacks, flush=True)
logger.info(f"callbacks: {callbacks}")

trainer = SFTTrainer(
model=model,
args=training_args,
@@ -295,6 +300,7 @@ def neft_post_hook(module, input, output):
data_collator=data_collator,
do_generation=data_args.eval_with_do_generation,
data_args=data_args,
callbacks=callbacks,
)
trainable_parameters = [
p for p in model.parameters() if not p.stop_gradient or ("quantization_linear" in p.name and "w_1" in p.name)
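For context on why the callback is registered with lr=0.0: with "noaux_tc" routing the per-expert correction bias is nudged by a fixed step at each adjustment, so a zero step size leaves the bias exactly as it was loaded, which is the desired behaviour when finetuning. The sketch below illustrates that update rule; the sign-of-load-error form is an assumption borrowed from DeepSeek-V3's aux-loss-free balancing, not the actual body of MoECorrectionBiasAdjustCallback.

# Hypothetical bias-adjustment step; the update rule is assumed, not taken from the callback.
def adjust_correction_bias(bias, expert_load, lr):
    # Underloaded experts get their bias raised, overloaded experts lowered.
    mean_load = sum(expert_load) / len(expert_load)
    return [
        b + lr * (1.0 if load < mean_load else -1.0 if load > mean_load else 0.0)
        for b, load in zip(bias, expert_load)
    ]

bias = [0.0, 0.0, 0.0, 0.0]
expert_load = [10.0, 30.0, 40.0, 20.0]  # tokens routed to each expert in one step

print(adjust_correction_bias(bias, expert_load, lr=1e-3))  # bias drifts toward balance
print(adjust_correction_bias(bias, expert_load, lr=0.0))   # unchanged: the bias stays frozen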
38 changes: 31 additions & 7 deletions paddleformers/nn/pp_model.py
@@ -508,12 +508,28 @@ class GeneralModelForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
_embed_cls = None
_rotary_emb_cls = None
_norm_cls = "rms_norm"
_mtp_layer_pipe_cls = None
_embedding_pipe_cls = None
_decoder_layer_pipe_cls = None
_criterion_pipe_cls = None
_lmhead_pipe_cls = None
_rms_norm_pipe_cls = None

def __init__(self, config: PretrainedConfig, **kwargs):
# dynamic inherit DecoderLayer
if self._decoder_layer_cls is None:
raise ValueError("_decoder_layer_cls must be set before init.")
DecoderLayerPipe = make_decoder_layer_pipe(self._decoder_layer_cls)

EmbeddingPipeCls = self._embedding_pipe_cls if self._embedding_pipe_cls is not None else Embedding

if self._decoder_layer_pipe_cls is None:
DecoderLayerPipe = make_decoder_layer_pipe(self._decoder_layer_cls)
else:
DecoderLayerPipe = self._decoder_layer_pipe_cls

LMHeadPipeCls = self._lmhead_pipe_cls if self._lmhead_pipe_cls is not None else LMHeadPipe
MTPLayerPipeCls = self._mtp_layer_pipe_cls if self._mtp_layer_pipe_cls is not None else None
RMSNormPipeCls = self._rms_norm_pipe_cls if self._rms_norm_pipe_cls is not None else RMSNormPipe

new_initializer_range = math.sqrt(0.3333 / config.hidden_size)
logger.info(f"change initializer-range from {config.initializer_range} to {new_initializer_range}")
@@ -560,7 +576,7 @@ def __init__(self, config: PretrainedConfig, **kwargs):
else:
self.add_sequential_layer(
LayerDesc(
EmbeddingPipe, config=config, embed_cls=self._embed_cls, rotary_emb_cls=self._rotary_emb_cls
EmbeddingPipeCls, config=config, embed_cls=self._embed_cls, rotary_emb_cls=self._rotary_emb_cls
),
"model",
)
@@ -574,6 +590,12 @@
),
f"model.layers.{i}",
)
for i in range(config.num_nextn_predict_layers):
if MTPLayerPipeCls is not None:
self.add_sequential_layer(
LayerDesc(MTPLayerPipeCls, config=config, layer_idx=config.num_hidden_layers + i),
f"model.layers.{config.num_hidden_layers + i}",
)
for i in range(config.add_tail_layers):
self.add_sequential_layer(
LayerDesc(
)

self.add_sequential_layer(
LayerDesc(RMSNormPipe if self._norm_cls == "rms_norm" else LayerNormPipe, config=config),
LayerDesc(RMSNormPipeCls if self._norm_cls == "rms_norm" else LayerNormPipe, config=config),
"model.norm",
)

if config.tie_word_embeddings:
self.add_sequential_layer(
SharedLayerDesc(
"model_shared_weight",
LMHeadPipe,
LMHeadPipeCls,
shared_weight_attr="embedding_weight",
config=config,
),
"lm_head",
)
else:
self.add_sequential_layer(LayerDesc(LMHeadPipe, config=config), "lm_head")
self.add_sequential_layer(LayerDesc(LMHeadPipeCls, config=config), "lm_head")
recompute_interval = 0

seg_method = config.pp_seg_method if hasattr(config, "pp_seg_method") else "layer:DecoderLayer|EmptyLayer"
@@ -631,10 +653,12 @@ def __init__(self, config: PretrainedConfig, **kwargs):
)

def get_loss_fn(self, config):
CriterionPipeCls = self._criterion_pipe_cls if self._criterion_pipe_cls is not None else CriterionLayerPipe

if config.get("dpo_config", None) is not None:
loss_fn = CriterionLayerPipe(config, use_infohub=True)
loss_fn = CriterionPipeCls(config, use_infohub=True)
else:
loss_fn = CriterionLayerPipe(config)
loss_fn = CriterionPipeCls(config)

return loss_fn

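The new _*_pipe_cls class attributes let a subclass swap in model-specific pipeline pieces (embedding, decoder layer, MTP layer, final norm, LM head, criterion) without overriding __init__; anything left as None falls back to the shared defaults (Embedding, make_decoder_layer_pipe(_decoder_layer_cls), RMSNormPipe, LMHeadPipe, CriterionLayerPipe). A hypothetical subclass sketch follows; the "My*" class names are invented for illustration and only the attribute names come from this diff.

# Hypothetical subclass; MyMoEDecoderLayer, MyMTPLayerPipe and MyCriterionPipe
# are placeholder names, not real paddleformers classes.
from paddleformers.nn.pp_model import GeneralModelForCausalLMPipe

class MyMoEForCausalLMPipe(GeneralModelForCausalLMPipe):
    _decoder_layer_cls = MyMoEDecoderLayer  # still required, as before
    _mtp_layer_pipe_cls = MyMTPLayerPipe    # appended after the decoder stack when num_nextn_predict_layers > 0
    _criterion_pipe_cls = MyCriterionPipe   # used by get_loss_fn() instead of CriterionLayerPipe
    # _embedding_pipe_cls, _decoder_layer_pipe_cls, _lmhead_pipe_cls and
    # _rms_norm_pipe_cls stay None here, so the shared defaults are used.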