Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ env:
jobs:
Lint:
name: Lint
runs-on: [self-hosted, ernie-cpu]
runs-on: [self-hosted, ernie-cpu-01]
permissions:
pull-requests: write
contents: read
Expand Down
4 changes: 3 additions & 1 deletion erniekit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from .version.env import VERSION
from .version import commit
from .utils.process import terminate_process_tree, detect_device, set_ascend_environment
from .hparams import get_env_args

script_dir = Path(__file__).parent.resolve()
parent_dir = script_dir.parent
Expand Down Expand Up @@ -139,7 +140,8 @@ def main():
os.environ["FLAGS_dataloader_use_file_descriptor"] = "False"

if current_device == "xpu":
os.environ["FLAGS_use_stride_kernel"] = "1"
args = get_env_args()
os.environ["FLAGS_use_stride_kernel"] = str(args.FLAGS_use_stride_kernel)
os.environ["XPU_PADDLE_L3_SIZE"] = "0"
os.environ["XPUAPI_DEFAULT_SIZE"] = "2205258752"

Expand Down
10 changes: 9 additions & 1 deletion erniekit/hparams/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,14 @@
from .finetuning_args import FinetuningArguments
from .generating_args import GeneratingArguments
from .model_args import ModelArguments
from .parser import get_eval_args, get_export_args, get_server_args, get_train_args, read_args
from .parser import (
get_eval_args,
get_export_args,
get_server_args,
get_train_args,
get_env_args,
read_args,
)
from .server_args import ServerArguments

__all__ = [
Expand All @@ -31,5 +38,6 @@
"get_eval_args",
"get_server_args",
"get_export_args",
"get_env_args",
"read_args",
]
27 changes: 27 additions & 0 deletions erniekit/hparams/env_args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass, field


@dataclass
class EnvConfigArguments:
    """Environment-variable style flags that the CLI exports before launch.

    Each field maps one-to-one onto an environment variable of the same
    name; the CLI reads the parsed value and writes it into ``os.environ``.
    """

    # Stride kernels may regress performance on XPU, hence disabled by default.
    FLAGS_use_stride_kernel: bool = field(
        metadata={
            "help": "Controls whether the Stride mechanism is enabled. Currently, enabling this mechanism on XPU may cause performance degradation, so it is disabled by default."
        },
        default=False,
    )
10 changes: 10 additions & 0 deletions erniekit/hparams/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from .model_args import ModelArguments
from .server_args import ServerArguments
from .preprocess_args import End2EndProcessorArguments
from .env_args import EnvConfigArguments

_TRAIN_ARGS = [
ModelArguments,
Expand Down Expand Up @@ -291,3 +292,12 @@ def get_export_args(
_parse_export_args(args)
)
return model_args, data_args, generating_args, finetuning_args, export_args


def get_env_args(
    args: Optional[Union[dict[str, Any], list[str]]] = None
) -> EnvConfigArguments:
    """Parse environment-configuration options into an ``EnvConfigArguments``.

    Args:
        args: Optional pre-collected arguments (mapping or argv-style list);
            when ``None`` the parser falls back to its default source.

    Returns:
        The single parsed ``EnvConfigArguments`` instance.
    """
    # Honour the same escape hatch the other get_*_args helpers use for
    # tolerating unknown keys in the incoming config.
    tolerate_extras = is_env_enabled("ALLOW_EXTRA_ARGS")
    (env_args,) = _parse_args(
        PdArgumentParser(EnvConfigArguments),
        args=args,
        allow_extra_keys=tolerate_extras,
    )
    return env_args
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Full-parameter SFT example config for ERNIE-4.5-VL-28B-A3B-Thinking (multimodal).
stage: VL-SFT

# model
model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking/
multimodal: true
fuse_linear: true
fuse_rms_norm: false
use_flash_attention: 1
use_moe: true
fine_tuning: Full
use_sparse_head_and_loss_fn: true
use_recompute_loss_fn: true
moe_group: "mp"
moe_use_aux_free_update_coef: 0.0
moe_aux_loss_lambda: 0.0
moe_use_aux_free: true
moe_use_hard_gate: true
moe_multimodal_dispatch_use_allgather: v2-alltoall-unpad-text
pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer

# data
train_dataset_path: "examples/data/sft_vl-train_demo1.jsonl"
train_dataset_prob: "1.0"
text_dataset_path: ""
text_dataset_prob: ""
max_seq_len: 32768
num_samples_each_epoch: 10000000
modality_ratio: "[1,1]"

# preprocess
variable_resolution: 1
pad_to_max_seqlen: 32768
render_timestamp: true
serialize_output: false
one_sample_in_one_seq: true
chat_template: "ernie_vl_thinking"

# dataloader
dataloader_num_workers: 1

# train
do_train: true
batch_size: 1
prefetch_factor: 10
seed: 42
gradient_accumulation_steps: 4
max_steps: 8000
save_steps: 10000
logging_steps: 1
weight_decay: 0.1
warmup_steps: 100
output_dir: ./output
add_sys_token: true
same_data: true
freeze_config: "freeze_vision"
trigger_data_prob: 1.0
from_scratch: 0
gc_interval: 100000
drop_history_with_k: true
overwrite_output_dir: true

# optim
lr_scheduler_type: "cosine"
learning_rate: 1.0e-05
min_lr: 1.0e-06
moe_gate_lr_ratio: 0.01
visual_ld: 0.9
vit_lr_ratio: 0.9
adam_beta2: 0.95
adam_beta1: 0.9
adam_epsilon: 1.0e-08
scale_loss: 4096

# performance
sequence_parallel: 1
use_sp_callback: true
tensor_parallel_degree: 4
pipeline_parallel_degree: 2
pp_need_data: true
pp_need_data_degree: 2
virtual_pp_degree: 1
tensor_parallel_config: "sync_param sync_grad sync_moment"
pipeline_parallel_config: "enable_offload_queue enable_delay_scale_loss enable_overlap_p2p_comm best_unbalanced_scheduler"
disable_pipeline_warmup: false
sharding: "stage1"
sharding_parallel_config: "split_param enable_fuse_optimizer_states"
sharding_comm_buffer_size_MB: 2048
save_sharding_stage1_model_include_freeze_params: true
offload_optim: false
tensorwise_offload_optimizer: false
unified_checkpoint_config: ignore_merge_optimizer
recompute: true
recompute_granularity: full
refined_recompute: "global:2"
pre_alloc_memory: 60

# amp
bf16: true
fp16_opt_level: "O2"
amp_master_grad: 1

# checkpoint
unified_checkpoint: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# LoRA SFT example config for ERNIE-4.5-VL-28B-A3B-Thinking (multimodal).
stage: VL-SFT

# model
model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking/
multimodal: true
fuse_linear: true
fuse_rms_norm: false
use_flash_attention: 1
use_moe: true
fine_tuning: LoRA
lora_rank: 32
use_sparse_head_and_loss_fn: true
use_recompute_loss_fn: true
moe_group: "mp"
moe_use_aux_free_update_coef: 0.0
moe_aux_loss_lambda: 0.0
moe_use_aux_free: true
moe_use_hard_gate: true
moe_multimodal_dispatch_use_allgather: v2-alltoall-unpad-text
pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer

# data
train_dataset_path: "examples/data/sft_vl-train_demo1.jsonl"
train_dataset_prob: "1.0"
text_dataset_path: ""
text_dataset_prob: ""
max_seq_len: 32768
num_samples_each_epoch: 10000000
modality_ratio: "[1,1]"

# preprocess
variable_resolution: 1
pad_to_max_seqlen: 32768
render_timestamp: true
serialize_output: false
one_sample_in_one_seq: true
chat_template: "ernie_vl_thinking"

# dataloader
dataloader_num_workers: 1

# train
do_train: true
batch_size: 1
prefetch_factor: 10
seed: 42
gradient_accumulation_steps: 4
max_steps: 8000
save_steps: 10000
logging_steps: 1
weight_decay: 0.1
warmup_steps: 100
output_dir: ./output
add_sys_token: true
same_data: true
freeze_config: "freeze_vision"
trigger_data_prob: 1.0
from_scratch: 0
gc_interval: 100000
drop_history_with_k: true
overwrite_output_dir: true

# optim
lr_scheduler_type: "cosine"
learning_rate: 3.0e-04
min_lr: 1.0e-06
moe_gate_lr_ratio: 0.01
visual_ld: 0.9
vit_lr_ratio: 0.9
adam_beta2: 0.95
adam_beta1: 0.9
adam_epsilon: 1.0e-08
scale_loss: 4096

# performance
sequence_parallel: 1
use_sp_callback: true
tensor_parallel_degree: 2
pipeline_parallel_degree: 2
pp_need_data: true
pp_need_data_degree: 2
virtual_pp_degree: 1
tensor_parallel_config: "sync_param sync_grad sync_moment"
pipeline_parallel_config: "enable_offload_queue enable_delay_scale_loss enable_overlap_p2p_comm best_unbalanced_scheduler"
disable_pipeline_warmup: false
sharding: "stage1"
sharding_parallel_config: "split_param enable_fuse_optimizer_states"
sharding_comm_buffer_size_MB: 2048
save_sharding_stage1_model_include_freeze_params: true
offload_optim: false
tensorwise_offload_optimizer: false
unified_checkpoint_config: ignore_merge_optimizer
recompute: true
recompute_granularity: full
refined_recompute: "global:2"
pre_alloc_memory: 60

# amp
bf16: true
fp16_opt_level: "O2"
amp_master_grad: 1

# checkpoint
unified_checkpoint: true
Loading