add xpu case

plusNew001 · plusNew001 · commit 9f77043ad62d · 2026-02-04T07:25:02.000Z
diff --git a/scripts/xpu_ci/base_value/ernie_28b_thinking_sft_loss.json b/scripts/xpu_ci/base_value/ernie_28b_thinking_sft_loss.json
@@ -0,0 +1,12 @@
+{
+    "1": 11.43915844,
+    "2": 10.98821735,
+    "3": 10.11469746,
+    "4": 9.73008347,
+    "5": 8.18760586,
+    "6": 8.02382469,
+    "7": 7.94480753,
+    "8": 7.78190613,
+    "9": 7.66679621,
+    "10": 7.5971694
+}
diff --git a/scripts/xpu_ci/config/ernie_vl_28b_sft.yaml b/scripts/xpu_ci/config/ernie_vl_28b_sft.yaml
@@ -0,0 +1,74 @@
+### data
+train_dataset_type: messages
+train_dataset_path: tests/fixtures/dummy/sft-vl/thinking_safety_demo.jsonl
+train_dataset_prob: "1.0"
+max_seq_len: 32768
+packing: true
+mix_strategy: concat
+template_backend: custom
+template: ernie_vl
+random_shuffle: false
+dataloader_num_workers: 4
+
+### model
+model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking
+attn_impl: flashmask
+num_hidden_layers: 4
+
+### finetuning
+# base
+stage: VL-SFT
+fine_tuning: full
+seed: 23
+do_train: true
+do_eval: false
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: 10
+save_steps: 10000
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 1
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/ernie-vl-thinking-sft-full
+disable_tqdm: true
+
+# train
+warmup_steps: 0
+learning_rate: 1.0e-5
+
+# performance
+tensor_model_parallel_size: 2
+pipeline_model_parallel_size: 2
+sharding: stage1
+use_sparse_head_and_loss_fn: true
+bf16: true
+fp16_opt_level: O2
+save_checkpoint_format: "flex_checkpoint"
+load_checkpoint_format: "flex_checkpoint"
+freeze_config: freeze_vision
+
+# recompute
+recompute: true
+recompute_granularity: full
+recompute_method: uniform
+recompute_num_layers: 1
+recompute_modules: ["loss_fn"]
+recompute_use_reentrant: true
+
+use_flash_attention: true
+sequence_parallel: true
+pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer
+offload_queue: true
+pp_delay_scale_loss: true
+overlap_p2p_comm: true
+best_unbalanced_scheduler: true
+sharding_comm_buffer_size_MB: 2048
+save_sharding_stage1_model_include_freeze_params: true
+offload_optim: false
+tensorwise_offload_optimizer: false
+unified_checkpoint_config: ignore_merge_optimizer
+pre_alloc_memory: 60
+amp_master_grad: 1
+
+device: xpu
diff --git a/scripts/xpu_ci/test_ernie_28b_thinking_sft.py b/scripts/xpu_ci/test_ernie_28b_thinking_sft.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from conftest import run_command_and_validate
+
+
+def test_ernie_28b_thinking_sft_training(project_root, base_value_dir, log_file):
+    """Test ERNIE-28B-thinking SFT training loss values.
+
+    This test runs the following shell command:
+        paddleformers-cli train scripts/xpu_ci/config/ernie_vl_28b_sft.yaml
+
+    Then validates that loss values match the baseline within tolerance of 1e-6.
+    """
+    # Define the exact shell command to execute
+    cmd = "paddleformers-cli train scripts/xpu_ci/config/ernie_vl_28b_sft.yaml"
+
+    # Execute command and validate results
+    passed, error_msg = run_command_and_validate(
+        cmd=cmd,
+        baseline_path=base_value_dir / "ernie_21b_sft_loss.json",
+        log_file=log_file,
+        working_dir=project_root,
+        tolerance=1e-6,
+        timeout=3600,
+    )
+
+    if not passed:
+        pytest.fail(error_msg)