
Commit 5b36686

fix general pipeline model
1 parent 524f2a2 commit 5b36686

File tree: 14 files changed, +258 −126 lines changed
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
{
    "model_name_or_path": "baidu/ERNIE-4.5-0.3B-PT",
    "train_dataset_path": "./data/train.json",
    "train_dataset_prob": "1.0",
    "train_dataset_type": "erniekit",
    "eval_dataset_path": "./data/dev.json",
    "eval_dataset_prob": "1.0",
    "eval_dataset_type": "erniekit",
    "packing": true,
    "mix_strategy": "random",
    "output_dir": "./checkpoints/ernie4_5_paddle_sft_ckpts",
    "max_seq_len": 8192,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "per_device_eval_batch_size": 8,
    "eval_accumulation_steps": 16,
    "num_train_epochs": 1,
    "learning_rate": 3e-05,
    "warmup_steps": 10,
    "logging_steps": 1,
    "max_steps": 100,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "src_length": 1024,
    "max_length": 2048,
    "bf16": true,
    "fp16_opt_level": "O2",
    "do_train": true,
    "do_eval": true,
    "disable_tqdm": true,
    "load_best_model_at_end": true,
    "eval_with_do_generation": false,
    "metric_for_best_model": "accuracy",
    "recompute": true,
    "save_total_limit": 1,
    "tensor_parallel_degree": 2,
    "pipeline_parallel_degree": 2,
    "sharding": "stage2",
    "zero_padding": true,
    "flash_mask": true,
    "unified_checkpoint": true,
    "use_flash_attention": true,
    "sequence_parallel": true,
    "report_to": "none",
    "convert_from_hf": true,
    "pp_seg_method": "layer:DecoderLayer|EmptyLayer"
}
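A quick sanity check on the parallel layout implied by this config (illustrative Python, not part of the commit; the file name is hypothetical). One model replica spans tensor_parallel_degree × pipeline_parallel_degree devices, and any extra ranks would typically become sharding/data-parallel replicas under "sharding": "stage2".

# Illustrative only: load an SFT config shaped like the one above (hypothetical path)
# and work out how many GPUs a single TP x PP model replica needs.
import json

with open("sft_argument.json") as f:   # hypothetical file name
    cfg = json.load(f)

gpus_per_replica = cfg["tensor_parallel_degree"] * cfg["pipeline_parallel_degree"]
print(gpus_per_replica)                # 2 * 2 = 4 for the 0.3B config above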
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
{
    "model_name_or_path": "baidu/ERNIE-4.5-21B-A3B-PT",
    "train_dataset_path": "./data/train.json",
    "train_dataset_prob": "1.0",
    "train_dataset_type": "erniekit",
    "eval_dataset_path": "./data/dev.json",
    "eval_dataset_prob": "1.0",
    "eval_dataset_type": "erniekit",
    "packing": true,
    "mix_strategy": "random",
    "output_dir": "./checkpoints/ernie4_5_paddle_sft_ckpts",
    "max_seq_len": 8192,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "per_device_eval_batch_size": 8,
    "eval_accumulation_steps": 16,
    "num_train_epochs": 1,
    "learning_rate": 3e-05,
    "warmup_steps": 10,
    "logging_steps": 1,
    "max_steps": 100,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "src_length": 1024,
    "max_length": 2048,
    "bf16": true,
    "fp16_opt_level": "O2",
    "do_train": true,
    "do_eval": true,
    "disable_tqdm": true,
    "load_best_model_at_end": true,
    "eval_with_do_generation": false,
    "metric_for_best_model": "accuracy",
    "recompute": true,
    "save_total_limit": 1,
    "tensor_parallel_degree": 4,
    "pipeline_parallel_degree": 2,
    "sharding": "stage2",
    "zero_padding": true,
    "flash_mask": true,
    "unified_checkpoint": true,
    "use_flash_attention": true,
    "sequence_parallel": true,
    "report_to": "none",
    "convert_from_hf": true,
    "pp_seg_method": "layer:DecoderLayer|EmptyLayer"
}
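This 21B-A3B config is otherwise identical to the 0.3B one; only the checkpoint name and the tensor-parallel degree change, which doubles the devices one replica spans. A small illustrative comparison, assuming both JSON files are saved locally under hypothetical names:

# Illustrative only: compare the two configs above field by field (hypothetical file names).
import json

cfg_03b = json.load(open("ernie45_0p3b_sft.json"))
cfg_21b = json.load(open("ernie45_21b_a3b_sft.json"))

diff = {k: (cfg_03b[k], cfg_21b[k]) for k in cfg_03b if cfg_03b[k] != cfg_21b[k]}
print(diff)  # expected: model_name_or_path and tensor_parallel_degree (2 -> 4) only
print(cfg_21b["tensor_parallel_degree"] * cfg_21b["pipeline_parallel_degree"])  # 8 GPUs per replica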

examples/run_finetune.py

Lines changed: 12 additions & 3 deletions
@@ -31,6 +31,10 @@
     DeepseekV2ForCausalLMPipe,
     DeepseekV3ForCausalLM,
     DeepseekV3ForCausalLMPipe,
+    Ernie4_5_MoeForCausalLM,
+    Ernie4_5_MoeForCausalLMPipe,
+    Ernie4_5ForCausalLM,
+    Ernie4_5ForCausalLMPipe,
     Llama3Tokenizer,
     LlamaForCausalLM,
     LlamaForCausalLMPipe,
@@ -53,6 +57,10 @@
     DeepseekV2ForCausalLMPipe,
     DeepseekV3ForCausalLM,
     DeepseekV3ForCausalLMPipe,
+    Ernie4_5ForCausalLM,
+    Ernie4_5ForCausalLMPipe,
+    Ernie4_5_MoeForCausalLM,
+    Ernie4_5_MoeForCausalLMPipe,
     LlamaForCausalLM,
     LlamaForCausalLMPipe,
     Qwen2ForCausalLM,
@@ -138,11 +146,11 @@ def main():
         model_config.fuse_attention_qkv = model_args.fuse_attention_qkv
     if model_args.fuse_attention_ffn is not None:
         model_config.fuse_attention_ffn = model_args.fuse_attention_ffn
-
+    model_config.pp_seg_method = training_args.pp_seg_method
     model_config.seq_length = data_args.max_length
+    model_config.max_sequence_length = training_args.max_seq_length
     model_config.num_nextn_predict_layers = model_args.num_nextn_predict_layers
     logger.info(f"Final model config: {model_config}")
-
     logger.info("Creating model")
 
     model_class = AutoModelForCausalLM
@@ -157,7 +165,7 @@ def main():
             model_args.model_name_or_path,
             config=model_config,
             download_hub=model_args.download_hub,
-            convert_from_hf=False,  # run paddle weights
+            convert_from_hf=training_args.convert_from_hf,  # run paddle weights
         )
     else:
         model = model_class.from_config(model_config, dtype=dtype)
@@ -166,6 +174,7 @@ def main():
         logger.warning("`flash_mask` must use with zero padding and flash attention.")
         data_args.zero_padding = True
         model.config.use_flash_attention = True
+        model.config._attn_implementation = "flashmask"
 
     if model_args.flash_mask and not any(isinstance(model, cls) for cls in flash_mask_support_list):
         raise NotImplementedError(f"{model.__class__} not support flash mask.")

paddleformers/datasets/finetuning.py

Lines changed: 1 addition & 1 deletion
@@ -698,4 +698,4 @@ def gen_attn_mask_startend_row_indices(batch_token_ids: List[List[int]], max_seq_len
     if offset < max_seq_len:
         attn_mask_startend_row_indices.extend(list(range(offset, max_seq_len)))
     # NOTE(hehuang): The dtype of attn_mask_startend_row_indices must be np.int32
-    return np.array(attn_mask_startend_row_indices, dtype=np.int32)[None, None]
+    return np.array(attn_mask_startend_row_indices, dtype=np.int32)[None, None, ..., None]  # add a trailing dimension
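A minimal shape check of the change above (illustrative, not part of the commit): the extra trailing axis turns the returned indices into a 4-D array, which lines up with the [batch, heads, seq_len, 1] startend_row_indices layout used on the FlashMask path touched elsewhere in this commit.

# Illustrative only: the effect of the extra trailing axis added above.
import numpy as np

attn_mask_startend_row_indices = list(range(8))   # e.g. one packed sequence of length 8

before = np.array(attn_mask_startend_row_indices, dtype=np.int32)[None, None]
after = np.array(attn_mask_startend_row_indices, dtype=np.int32)[None, None, ..., None]
print(before.shape, after.shape)                  # (1, 1, 8) (1, 1, 8, 1)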

paddleformers/nn/attention/flashmask_attention.py

Lines changed: 3 additions & 3 deletions
@@ -26,7 +26,7 @@ def flashmask_attention_forward(
     query: paddle.Tensor,
     key: paddle.Tensor,
     value: paddle.Tensor,
-    attn_mask_start_row_indices: paddle.Tensor,
+    attn_mask_startend_row_indices: paddle.Tensor,
     dropout: float = 0.0,
     sink: Optional[paddle.Tensor] = None,
     scaling: Optional[float] = None,
@@ -39,7 +39,7 @@ def flashmask_attention_forward(
             query,
             key,
             value,
-            startend_row_indices=attn_mask_start_row_indices,
+            startend_row_indices=attn_mask_startend_row_indices,
             causal=True,
         )
     else:
@@ -48,7 +48,7 @@ def flashmask_attention_forward(
             key,
             value,
             sink,
-            startend_row_indices=attn_mask_start_row_indices,
+            startend_row_indices=attn_mask_startend_row_indices,
             dropout_p=dropout,
             softmax_scale=scaling,
             causal=is_causal,

paddleformers/nn/attention/sdpa_attention.py

Lines changed: 4 additions & 4 deletions
@@ -27,19 +27,19 @@ def sdpa_attention_forward(
     key: paddle.Tensor,
     value: paddle.Tensor,
     attention_mask: Optional[paddle.Tensor] = None,
-    attn_mask_start_row_indices=None,
+    attn_mask_startend_row_indices=None,
     dropout: float = 0.0,
     sink: Optional[paddle.Tensor] = None,
     scaling: Optional[float] = None,
     is_causal: Optional[bool] = None,
     **kwargs,
 ):
     # query: b l h d
-    if is_causal is None and attn_mask_start_row_indices is None:
+    if is_causal is None and attn_mask_startend_row_indices is None:
         is_causal = query.shape[1] > 1 and attention_mask is None and getattr(module, "is_causal", True)
-    elif attn_mask_start_row_indices is not None:
+    elif attn_mask_startend_row_indices is not None:
         is_causal = False
-        attention_mask = _gen_from_sparse_attn_mask_indices(attn_mask_start_row_indices, query.dtype)
+        attention_mask = _gen_from_sparse_attn_mask_indices(attn_mask_startend_row_indices, query.dtype)
 
     if sink is None:
         attn_output = nn.functional.scaled_dot_product_attention(

paddleformers/nn/pp_model.py

Lines changed: 25 additions & 17 deletions
@@ -254,7 +254,7 @@ def forward(self, args):
         emb = self.embed_tokens(input_ids).astype(self.embed_tokens.weight.dtype)
         if position_ids is None and not self.config.fuse_rope:
             position_ids = (
-                paddle.range(
+                paddle.arange(
                     0,
                     input_ids.shape[1],
                     dtype="int64",
@@ -410,13 +410,13 @@ def forward(self, args):
         max_seq_len = hidden_states.shape[0] * self.config.tensor_parallel_degree
         if attention_mask is None:
             tgt_mask = None
-            attn_mask_start_row_indices = None
+            attn_mask_startend_row_indices = None
         elif attention_mask.dtype == paddle.int32:
             tgt_mask = None
-            attn_mask_start_row_indices = attention_mask[:, :, :max_seq_len]
+            attn_mask_startend_row_indices = attention_mask[:, :, :max_seq_len]
         else:
             tgt_mask = attention_mask[:, :, :max_seq_len, :max_seq_len]
-            attn_mask_start_row_indices = None
+            attn_mask_startend_row_indices = None
             assert len(tgt_mask.shape) == 4, f"Attention mask should be 4D tensor, but got {tgt_mask.shape}."
 
         position_ids_decoder = None
@@ -436,7 +436,7 @@ def forward(self, args):
             self,
             hidden_states,
             attention_mask=tgt_mask,
-            attn_mask_start_row_indices=attn_mask_start_row_indices,
+            attn_mask_startend_row_indices=attn_mask_startend_row_indices,
             position_ids=position_ids_decoder,
             position_embeddings=tuple_position_embeddings,
             use_reentrant=self.config.recompute_use_reentrant,
@@ -446,7 +446,7 @@ def forward(self, args):
             self,
             hidden_states=hidden_states,
             attention_mask=tgt_mask,
-            attn_mask_start_row_indices=attn_mask_start_row_indices,
+            attn_mask_startend_row_indices=attn_mask_startend_row_indices,
             position_ids=position_ids_decoder,
             position_embeddings=tuple_position_embeddings,
         )
@@ -492,36 +492,44 @@ def forward(self, logits, labels):
 
 
 class GeneralModelForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
+    _decoder_layer_cls = None
+    _get_tensor_parallel_mappings = None
+    _init_weights = None
+    _keep_in_fp32_modules = None
     _tied_weights_keys = ["lm_head.weight"]
+    config_class = PretrainedConfig
+    transpose_weight_keys = None
 
-    def __init__(self, config: PretrainedConfig, decoder_layer, **kwargs):
+    def __init__(self, config: PretrainedConfig, **kwargs):
         # dynamic inherit DecoderLayer
-        DecoderLayerPipe = make_decoder_layer_pipe(decoder_layer)
+        if self._decoder_layer_cls is None:
+            raise ValueError("_decoder_layer_cls must be set before init.")
+        DecoderLayerPipe = make_decoder_layer_pipe(self._decoder_layer_cls)
+
         new_initializer_range = math.sqrt(0.3333 / config.hidden_size)
         logger.info(f"change initializer-range from {config.initializer_range} to {new_initializer_range}")
         config.initializer_range = new_initializer_range
 
-        if config.get("moe_group", "") == "mp":
+        moe_group = config.get("moe_group", "dummy")
+        if moe_group == "mp":
             assert config.sequence_parallel
 
-        if config.moe_group in {"mp", "model", "tp", "mpdp"}:
+        if moe_group in {"mp", "model", "tp", "mpdp"}:
             assert config.sequence_parallel
-            logger.info(f"disable FFN tensor model parallel, moe-group={config.moe_group}")
+            logger.info(f"disable FFN tensor model parallel, moe-group={moe_group}")
             config.disable_ffn_model_parallel = True
 
-        config.moe_group_origin = config.moe_group
-        config.moe_group = _parse_moe_group(config.moe_group)
+        config.moe_group_origin = moe_group
+        config.moe_group = _parse_moe_group(moe_group)
         config.moe_world_size = dist.get_world_size(config.moe_group)
         if config.moe_world_size < 0:
            config.moe_world_size = 1
         config.moe_rank = dist.get_rank(config.moe_group)
 
         self.config = config
-
         hcg = get_hcg()
         tensor_parallel_degree = max(hcg.get_model_parallel_world_size(), 1)
         tensor_parallel_rank = max(hcg.get_model_parallel_rank(), 0)
-
         config.tensor_parallel_degree = tensor_parallel_degree
         config.tensor_parallel_rank = tensor_parallel_rank
 
@@ -607,7 +615,7 @@ def __init__(self, config: PretrainedConfig, decoder_layer, **kwargs):
         )
 
     def get_loss_fn(self, config):
-        if config.dpo_config is not None:
+        if config.get("dpo_config", None) is not None:
             loss_fn = CriterionLayerPipe(config, use_infohub=True)
         else:
             loss_fn = CriterionLayerPipe(config)
@@ -633,7 +641,7 @@ def register_cls_attr(cls, config_class=None, pretrained_model_class=None):
     def _prepare_pipeline_inputs_func(cls, inputs):
         first_stage_keys = [
             "input_ids",
-            "attn_mask_start_row_indices",
+            "attn_mask_startend_row_indices",
             "position_ids",
             "nbatch_pack_offset",
         ]
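The reworked constructor above means concrete pipeline models now declare their decoder layer through the `_decoder_layer_cls` class attribute instead of passing it to `__init__`. A minimal sketch of the expected subclass pattern follows; this is an assumption about how the Ernie4_5 pipeline class added in this commit is wired, not its actual definition, which is outside this excerpt.

# Sketch only, assuming the class-attribute wiring introduced above; the real
# Ernie4_5ForCausalLMPipe may set additional attributes.
class Ernie4_5ForCausalLMPipe(GeneralModelForCausalLMPipe):
    config_class = Ernie4_5Config               # pretrained config type for this model
    _decoder_layer_cls = Ernie4_5DecoderLayer   # consumed by make_decoder_layer_pipe(...)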

paddleformers/trainer/training_args.py

Lines changed: 1 addition & 0 deletions
@@ -1088,6 +1088,7 @@ class TrainingArguments:
         default=False,
         metadata={"help": "Save model to HuggingFace safetensors."},
     )
+    pp_seg_method: Optional[str] = field(default=None, metadata={"help": "PP Segmentation Method"})
 
     def __post_init__(self):
         world_size = paddle.distributed.get_world_size()
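This new `pp_seg_method` training argument is what the SFT configs above set to "layer:DecoderLayer|EmptyLayer" and what `run_finetune.py` now copies onto the model config. A small illustrative parse of such a value, under the assumption that it names a segmentation strategy followed by layer-class patterns at which pipeline stages are split:

# Illustrative only: decompose a pp_seg_method value like the one used in the configs above.
pp_seg_method = "layer:DecoderLayer|EmptyLayer"
strategy, _, patterns = pp_seg_method.partition(":")
print(strategy, patterns.split("|"))   # 'layer' ['DecoderLayer', 'EmptyLayer']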

paddleformers/transformers/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -169,10 +169,10 @@
     ],
     "deepseek_v3.modeling_pp": ["DeepseekV3ForCausalLMPipe"],
     "ernie4_5.configuration": ["Ernie4_5Config"],
-    "ernie4_5.modeling": ["Ernie4_5Model", "Ernie4_5ForCausalLM"],
+    "ernie4_5.modeling": ["Ernie4_5Model", "Ernie4_5ForCausalLM", "Ernie4_5ForCausalLMPipe"],
     "ernie4_5.tokenizer": ["Ernie4_5Tokenizer"],
     "ernie4_5_moe.configuration": ["Ernie4_5_MoeConfig"],
-    "ernie4_5_moe.modeling": ["Ernie4_5_MoeModel", "Ernie4_5_MoeForCausalLM"],
+    "ernie4_5_moe.modeling": ["Ernie4_5_MoeModel", "Ernie4_5_MoeForCausalLM", "Ernie4_5_MoeForCausalLMPipe"],
     "export": ["export_model"],
     "llama.configuration": [
         "LLAMA_PRETRAINED_INIT_CONFIGURATION",

paddleformers/transformers/ernie4_5/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
 import_structure = {
     "tokenizer": ["Ernie4_5Tokenizer"],
     "configuration": ["Ernie4_5Config"],
-    "modeling": ["Ernie4_5DecoderLayer", "Ernie4_5Model", "Ernie4_5ForCausalLM"],
+    "modeling": ["Ernie4_5DecoderLayer", "Ernie4_5Model", "Ernie4_5ForCausalLM", "Ernie4_5ForCausalLMPipe"],
 }
 
 if TYPE_CHECKING:
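With the `import_structure` entries above, the new pipeline classes become importable from the public package namespaces touched by this commit. A usage sketch, assuming an installed build that includes this change:

# Usage sketch: imports that the registrations above are meant to enable.
from paddleformers.transformers import Ernie4_5ForCausalLM, Ernie4_5ForCausalLMPipe
from paddleformers.transformers import Ernie4_5_MoeForCausalLM, Ernie4_5_MoeForCausalLMPipe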
