23 changes: 22 additions & 1 deletion examples/README.md
@@ -20,18 +20,39 @@
wget https://bj.bcebos.com/paddlenlp/datasets/examples/alpaca_demo.gz
tar -xvf alpaca_demo.gz
```
### Model Download

Reviewer (Collaborator): There is no need to add a download section here; the models can be downloaded directly via `from_pretrained` later.

```bash
# PaddleNLP/Qwen2-0.5B-Instruct
aistudio download --model PaddleNLP/Qwen2-0.5B-Instruct --local_dir PaddleNLP/Qwen2-0.5B-Instruct

# baidu/ERNIE-4.5-0.3B-PT
aistudio download --model PaddlePaddle/ERNIE-4.5-0.3B-PT --local_dir baidu/ERNIE-4.5-0.3B-PT

# baidu/ERNIE-4.5-21B-A3B-PT
aistudio download --model PaddlePaddle/ERNIE-4.5-21B-A3B-PT --local_dir baidu/ERNIE-4.5-21B-A3B-PT
```
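As the reviewer suggests, the download step may be skippable: weights can be fetched on first use. Below is a minimal sketch of that path, assuming `paddleformers.transformers` exposes the usual `AutoModelForCausalLM` / `AutoTokenizer` entry points (an assumption, not part of this PR).

```python
# Minimal sketch (assumed API): weights are downloaded on first use via
# from_pretrained, so no separate `aistudio download` step is needed.
from paddleformers.transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "PaddleNLP/Qwen2-0.5B-Instruct"  # any of the model ids listed above
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
```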

### Full-Parameter Fine-Tuning: SFT

Single GPU
```bash
# Fine-tuning Qwen2-0.5B-Instruct requires about 12 GB of GPU memory
python -u run_finetune.py ./config/qwen/sft_argument_qwen2_0p5b.json

# Fine-tune ERNIE-4.5-0.3B-PT
python -u run_finetune.py ./config/ernie4_5/sft_argument_ernie4_5_0p3b.json
```

Reviewer (Collaborator): Don't add ERNIE 4.5 here for now; first make sure it can be trained in ERNIEKit.

Reviewer (Collaborator): The documentation can be left unchanged.

Multi-GPU
```bash
# SFT Qwen2-0.5B-Instruct
python -u -m paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" run_finetune.py ./config/qwen/sft_argument_qwen2_0p5b.json

# SFT ERNIE-4.5-0.3B-PT
python -u -m paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" run_finetune.py ./config/ernie4_5/sft_argument_ernie4_5_0p3b.json

# SFT ERNIE-4.5-21B-A3B-PT
python -u -m paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" run_finetune.py ./config/ernie4_5_moe/sft_argument_ernie4_5_21b_a3b.json
```

### LoRA
10 changes: 5 additions & 5 deletions examples/config/ernie4_5/sft_argument_ernie4_5_0p3b.json
@@ -21,8 +21,6 @@
"max_steps": 100,
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
"src_length": 1024,
"max_length": 2048,
"bf16": true,
"fp16_opt_level": "O2",
"do_train": true,
@@ -33,15 +31,17 @@
"metric_for_best_model": "accuracy",
"recompute": true,
"save_total_limit": 1,
"tensor_parallel_degree": 2,
"pipeline_parallel_degree": 2,
"tensor_parallel_degree": 1,
"pipeline_parallel_degree": 1,
"sharding": "stage2",
"zero_padding": true,
"flash_mask": true,
"unified_checkpoint": true,
"use_flash_attention": true,
"sequence_parallel": true,
"use_attn_mask_startend_row_indices": true,
"sequence_parallel": false,
"report_to": "none",
"convert_from_hf": true,
"save_to_hf": true,
"pp_seg_method": "layer:DecoderLayer|EmptyLayer"
}
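For context on the parallelism changes above, here is a minimal arithmetic sketch (an assumption, not part of the PR) of how devices are used when both degrees drop to 1 under the 8-GPU launch command from the README.

```python
# With tp = pp = 1, every device launched by paddle.distributed.launch
# becomes a stage-2 sharding data-parallel rank (assumed convention).
num_devices = 8
tensor_parallel_degree = 1
pipeline_parallel_degree = 1
sharding_parallel_degree = num_devices // (tensor_parallel_degree * pipeline_parallel_degree)
print(sharding_parallel_degree)  # 8
```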
Changes to a second SFT config file (file name not shown in this view)
@@ -21,8 +21,6 @@
"max_steps": 100,
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
"src_length": 1024,
"max_length": 2048,
"bf16": true,
"fp16_opt_level": "O2",
"do_train": true,
@@ -43,5 +41,6 @@
"sequence_parallel": true,
"report_to": "none",
"convert_from_hf": true,
"save_to_hf": true,
"pp_seg_method": "layer:DecoderLayer|EmptyLayer"
}
8 changes: 7 additions & 1 deletion examples/run_finetune.py
@@ -153,6 +153,13 @@ def main():
logger.info(f"Final model config: {model_config}")
logger.info("Creating model")

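# Choose the attention implementation from the flash_mask / use_attn_mask_startend_row_indices flags:
# flashmask when both are set, SDPA when only flash_mask is enabled, eager otherwise.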
if model_args.flash_mask and model_args.use_attn_mask_startend_row_indices:
model_config._attn_implementation = "flashmask"
elif model_args.flash_mask and not model_args.use_attn_mask_startend_row_indices:
model_config._attn_implementation = "sdpa"
else:
model_config._attn_implementation = "eager"

model_class = AutoModelForCausalLM
if training_args.pipeline_parallel_degree > 1:
if data_args.eval_with_do_generation and training_args.do_eval:
@@ -174,7 +181,6 @@
logger.warning("`flash_mask` must be used with zero padding and flash attention.")
data_args.zero_padding = True
model.config.use_flash_attention = True
model.config._attn_implementation = "flashmask"

if model_args.flash_mask and not any(isinstance(model, cls) for cls in flash_mask_support_list):
raise NotImplementedError(f"{model.__class__} does not support flash mask.")
3 changes: 2 additions & 1 deletion paddleformers/transformers/ernie4_5/modeling.py
@@ -834,7 +834,8 @@ def forward(

# Pretrain & Eval must have labels
assert labels is not None
return self.criterion(logits, labels, loss_mask)
loss, _ = self.criterion(logits, labels, loss_mask)
return loss, logits


class Ernie4_5ForCausalLMPipe(GeneralModelForCausalLMPipe):
3 changes: 2 additions & 1 deletion paddleformers/transformers/ernie4_5_moe/modeling.py
@@ -1157,7 +1157,8 @@ def forward(
# Pretrain & Eval must have labels
assert labels is not None

return self.criterion(logits, labels, loss_mask, router_loss=router_loss, mtp_logits=mtp_logits)
loss, _ = self.criterion(logits, labels, loss_mask, router_loss=router_loss, mtp_logits=mtp_logits)
Reviewer (Collaborator): Why is this?
return loss, logits


class Ernie4_5_MoeForCausalLMPipe(GeneralModelForCausalLMPipe):
4 changes: 1 addition & 3 deletions paddleformers/transformers/llama/fusion_ops.py
@@ -248,16 +248,14 @@ def fusion_flash_attention(
else:
if attn_mask_startend_row_indices is not None:
assert alibi is None, "flashmask_attention and flash_attention_with_sparse_mask do not support alibi"
if len(attn_mask_startend_row_indices.shape) == 2:
attn_mask_startend_row_indices = paddle.unsqueeze(attn_mask_startend_row_indices, axis=1)

if hasattr(F, "flashmask_attention"):
attn_output = no_recompute(
F.flashmask_attention,
query_states,
key_states,
value_states,
startend_row_indices=attn_mask_startend_row_indices.unsqueeze(-1),
startend_row_indices=attn_mask_startend_row_indices,
causal=True,
enable=skip_recompute,
)
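For reference, here is a hedged sketch of the call pattern used above: the updated fusion op now passes a 4-D `startend_row_indices` tensor straight through to `F.flashmask_attention`. Tensor layouts are assumptions (q/k/v as `[batch, seq_len, num_heads, head_dim]`, indices as `[batch, 1, seq_len, 1]` int32 lower-triangle start rows), and running it requires a GPU build of Paddle with FlashMask support.

```python
# Hedged sketch, not part of the PR: plain causal attention expressed as
# FlashMask start-row indices and passed directly (already 4-D, no unsqueeze).
import paddle
import paddle.nn.functional as F

batch, seq_len, num_heads, head_dim = 1, 8, 2, 64
q = paddle.randn([batch, seq_len, num_heads, head_dim]).astype("bfloat16")
k = paddle.randn([batch, seq_len, num_heads, head_dim]).astype("bfloat16")
v = paddle.randn([batch, seq_len, num_heads, head_dim]).astype("bfloat16")

# Start index == seq_len for every key column -> nothing masked beyond the causal triangle.
startend_row_indices = paddle.full([batch, 1, seq_len, 1], seq_len, dtype="int32")

out = F.flashmask_attention(q, k, v, startend_row_indices=startend_row_indices, causal=True)
print(out.shape)  # expected: [1, 8, 2, 64], same layout as q
```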
4 changes: 2 additions & 2 deletions paddleformers/utils/masking_utils.py
@@ -31,9 +31,9 @@ def _gen_from_sparse_attn_mask_indices(attn_mask_start_row_indices, dtype):
Returns:
paddle.Tensor: The dense attention mask recovered from attn_mask_start_row_indices.
"""
batch_size, _, max_seq_len = attn_mask_start_row_indices.shape
batch_size, _, max_seq_len, _ = attn_mask_start_row_indices.shape
base = paddle.arange(max_seq_len, dtype="int32").unsqueeze(1).expand([batch_size, -1, max_seq_len]).unsqueeze(1)
mask_indices = attn_mask_start_row_indices.unsqueeze(1)
mask_indices = attn_mask_start_row_indices

tril = paddle.tril(
paddle.ones([max_seq_len, max_seq_len], dtype="bool").expand([batch_size, 1, max_seq_len, max_seq_len])