Skip to content

Commit 72bac53

Browse files
authored
【CI】add npu cases (#1475)
* add npu cases
* debug npu
* debug run
* more cases
* more cases
* set gpu_per_node
* debug ep8
* fix path
* add gpu vs npu case
* update threshold
* update threshold
* ready to PR
* remove tags when resume
* add ignore in ut
1 parent 402a6c6 commit 72bac53

15 files changed

+743
-9
lines changed
Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,57 @@
1+
name: ete_test_npu
2+
3+
permissions:
4+
contents: write
5+
pages: write
6+
7+
on:
8+
workflow_dispatch:
9+
inputs:
10+
repo_org:
11+
required: false
12+
description: 'Tested repository organization name. Default is InternLM'
13+
type: string
14+
default: 'InternLM/xtuner'
15+
repo_ref:
16+
required: false
17+
description: 'Set branch or tag or commit id. Default is "main"'
18+
type: string
19+
default: 'main'
20+
schedule:
21+
- cron: '00 21 * * 0-4'
22+
23+
jobs:
24+
ete_test:
25+
if: ${{!cancelled() }}
26+
runs-on: [d_cluster]
27+
steps:
28+
- name: Clean workdir
29+
run: sudo git clean -ffdx
30+
- name: Clone repository
31+
uses: actions/checkout@v2
32+
with:
33+
repository: ${{ github.event.inputs.repo_org || 'InternLM/xtuner' }}
34+
ref: ${{github.event.inputs.repo_ref || 'main'}}
35+
- name: run-test
36+
run: |
37+
source activate npuci
38+
unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
39+
export DEVICE=npu && pytest autotest/test_all.py -m all -n 1 -vv --run_id ${{ github.run_id }}
40+
41+
- name: Upload Artifacts
42+
if: ${{ !cancelled() }}
43+
uses: actions/upload-artifact@v4
44+
with:
45+
path: ${{ github.workspace }}/${{ github.run_id }}
46+
if-no-files-found: ignore
47+
retention-days: 7
48+
name: npu-e2e-${{ github.run_id }}
49+
50+
- name: Deploy to GitHub Pages
51+
if: ${{ !cancelled() }}
52+
uses: JamesIves/github-pages-deploy-action@v4
53+
with:
54+
token: ${{ github.token }}
55+
branch: gh-pages
56+
folder: ./${{ github.run_id }}
57+
target-folder: ${{ github.run_id }}

.github/workflows/unit_test.yaml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,8 @@ on:
88
- "docs/**"
99
- "**.md"
1010
- "autotest/**"
11-
- ".github/workflows/e2e_test.yaml "
11+
- ".github/workflows/e2e_test.yaml"
12+
- ".github/workflows/e2e_test_npu.yaml"
1213
- ".github/workflows/lint.yml"
1314
env:
1415
WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-5)

autotest/cluster/clusterx.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ def execute_task(self, task_config: Dict[str, Any]):
3535
all_command.append(f"export {env}")
3636

3737
all_command.append(command)
38-
run_command = "\n".join(all_command)
38+
run_command = "; ".join(all_command)
3939

4040
try:
4141
job_name = "-".join([task_config["type"], task_config["case_name"], task_config["run_id"]])

autotest/config-npu.yaml

Lines changed: 205 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,205 @@
1+
base_path:
2+
base_output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
3+
base_baseline_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_baseline
4+
5+
default_config:
6+
train:
7+
resource:
8+
gpus_per_task: 16
9+
cpus_per_task: 256
10+
memory_per_task: 1920
11+
image: registry2.d.pjlab.org.cn/ccr-yehaochen/910c:xtuner_rc2-20251011-2
12+
envs:
13+
- HF_HUB_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/models/hf_hub
14+
eval:
15+
resource:
16+
gpus_per_task: 0
17+
cpus_per_task: 16
18+
memory_per_task: 128
19+
image: registry.h.pjlab.org.cn/ailab-puyu/auto-eval:ld_0101_oc_8ee07ac_v3
20+
envs:
21+
- HF_HUB_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/models/hf_hub
22+
- HF_DATASETS_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/hf_cache
23+
- COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache
24+
- HF_DATASETS_OFFLINE=1
25+
- HF_HUB_OFFLINE=1
26+
27+
case:
28+
npu-qwen3-sft:
29+
-
30+
type: sft
31+
parameters:
32+
config: autotest/config/npu_qwen3.py
33+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
34+
resource:
35+
num_nodes: 1
36+
envs:
37+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
38+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
39+
- XTUNER_DETERMINISTIC=true
40+
- TORCH_NPU_USE_HCCL=1
41+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
42+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
43+
- RANK=0
44+
assert_info:
45+
base_metric: npu-qwen3-sft/812c1021/tracker.jsonl
46+
check_metrics:
47+
grad_norm: 0.000001
48+
loss/reduced_llm_loss: 0.000001
49+
lr: 0
50+
memory/max_memory_GB: 0.2
51+
runtime_info/tgs: 0.05
52+
runtime_info/text_tokens: 0
53+
timeout: 10800
54+
55+
npu-qwen3-sft-ep8:
56+
-
57+
type: sft
58+
parameters:
59+
config: autotest/config/npu_qwen3_moe_30BA3_ep8.py
60+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
61+
resource:
62+
num_nodes: 1
63+
envs:
64+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
65+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
66+
- XTUNER_DETERMINISTIC=true
67+
- TORCH_NPU_USE_HCCL=1
68+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
69+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
70+
assert_info:
71+
base_metric: npu-qwen3-sft-ep8/812c1021/tracker.jsonl
72+
check_metrics:
73+
grad_norm: 0.000001
74+
loss/reduced_llm_loss: 0.000001
75+
lr: 0
76+
memory/max_memory_GB: 0.2
77+
runtime_info/tgs: 0.5
78+
runtime_info/text_tokens: 0
79+
timeout: 10800
80+
-
81+
type: sft
82+
pre_action:
83+
command: 'python ./autotest/utils/update_meta.py'
84+
parameters:
85+
config: autotest/config/npu_qwen3_moe_30BA3_ep8_resume.py
86+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
87+
resource:
88+
num_nodes: 1
89+
cpus_per_task: 80
90+
envs:
91+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
92+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
93+
- XTUNER_DETERMINISTIC=true
94+
- TORCH_NPU_USE_HCCL=1
95+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
96+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
97+
assert_info:
98+
base_metric: npu-qwen3-sft-ep8/812c1021_resume/tracker.jsonl
99+
check_metrics:
100+
grad_norm: 0.000001
101+
loss/reduced_llm_loss: 0.000001
102+
lr: 0
103+
memory/max_memory_GB: 0.2
104+
runtime_info/text_tokens: 0
105+
timeout: 10800
106+
107+
npu-qwen3-sft-tp2:
108+
-
109+
type: sft
110+
parameters:
111+
config: autotest/config/npu_qwen3_moe_30BA3_tp2.py
112+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
113+
resource:
114+
envs:
115+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
116+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
117+
- XTUNER_DETERMINISTIC=true
118+
- XTUNER_USE_FA3=1
119+
- TORCH_NPU_USE_HCCL=1
120+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
121+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
122+
assert_info:
123+
base_metric: npu-qwen3-sft-tp2/812c1021/tracker.jsonl
124+
check_metrics:
125+
grad_norm: 0.000001
126+
loss/reduced_llm_loss: 0.000001
127+
lr: 0
128+
memory/max_memory_GB: 0.2
129+
runtime_info/tgs: 0.05
130+
runtime_info/text_tokens: 0
131+
timeout: 10800
132+
133+
npu-qwen3-sft-recompute:
134+
-
135+
type: sft
136+
parameters:
137+
config: autotest/config/npu_qwen3_recompute.py
138+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
139+
resource:
140+
num_nodes: 2
141+
cpus_per_task: 256
142+
envs:
143+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
144+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
145+
- XTUNER_DETERMINISTIC=true
146+
- XTUNER_ACTIVATION_OFFLOAD=1
147+
- TORCH_NPU_USE_HCCL=1
148+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
149+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
150+
assert_info:
151+
base_metric: npu-qwen3-sft-recompute/812c1021/tracker.jsonl
152+
check_metrics:
153+
grad_norm: 0.000001
154+
loss/reduced_llm_loss: 0.000001
155+
lr: 0
156+
memory/max_memory_GB: 0.2
157+
runtime_info/tgs: 0.05
158+
runtime_info/text_tokens: 0
159+
timeout: 10800
160+
161+
npu-qwen3-sft-16nums:
162+
-
163+
type: sft
164+
parameters:
165+
config: autotest/config/npu_qwen3_16nums.py
166+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
167+
resource:
168+
num_nodes: 2
169+
envs:
170+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
171+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
172+
- XTUNER_DETERMINISTIC=true
173+
- TORCH_NPU_USE_HCCL=1
174+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
175+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
176+
assert_info:
177+
base_metric: npu-qwen3-sft/812c1021/tracker.jsonl
178+
check_metrics:
179+
grad_norm: 0.000001
180+
loss/reduced_llm_loss: 0.000001
181+
lr: 0
182+
timeout: 10800
183+
184+
npu-qwen3-sft-celoss-vs-gpu:
185+
-
186+
type: sft
187+
parameters:
188+
config: autotest/config/npu_qwen3_sft_celoss.py
189+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
190+
resource:
191+
num_nodes: 1
192+
envs:
193+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
194+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
195+
- XTUNER_DETERMINISTIC=true
196+
- TORCH_NPU_USE_HCCL=1
197+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
198+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
199+
assert_info:
200+
base_metric: npu-qwen3-sft-celoss-vs-gpu/812c1021/tracker.jsonl
201+
check_metrics:
202+
grad_norm: 0.02
203+
loss/reduced_llm_loss: 0.01
204+
lr: 0.01
205+
timeout: 10800

autotest/config.yaml

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,6 @@ case:
9191
loss/reduced_llm_loss: 0.000001
9292
lr: 0
9393
memory/max_memory_GB: 0.2
94-
runtime_info/tgs: 0.05
9594
runtime_info/text_tokens: 0
9695
timeout: 10800
9796

@@ -186,5 +185,26 @@ case:
186185
grad_norm: 1
187186
loss/reduced_llm_loss: 0.02
188187
lr: 0
188+
timeout: 10800
189+
190+
qwen3-sft-celoss:
191+
-
192+
type: sft
193+
parameters:
194+
config: autotest/config/qwen3_sft_celoss.py
195+
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
196+
resource:
197+
envs:
198+
- QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3-30B-A3B
199+
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
200+
- XTUNER_DETERMINISTIC=true
201+
assert_info:
202+
base_metric: qwen3-sft-celoss/812c1021/tracker.jsonl
203+
check_metrics:
204+
grad_norm: 0.000001
205+
loss/reduced_llm_loss: 0.000001
206+
lr: 0
207+
memory/max_memory_GB: 0.2
189208
runtime_info/tgs: 0.05
209+
runtime_info/text_tokens: 0
190210
timeout: 10800

autotest/config/npu_qwen3.py

Lines changed: 55 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,55 @@
1+
import os
2+
3+
from xtuner.v1.config import (
4+
AdamWConfig,
5+
FSDPConfig,
6+
LRConfig,
7+
)
8+
from xtuner.v1.datasets import FTDPTokenizeFnConfig
9+
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
10+
from xtuner.v1.loss.ce_loss import CELossConfig
11+
from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config
12+
from xtuner.v1.train import TrainerConfig
13+
14+
15+
QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
16+
ALPACA_PATH = os.environ["ALPACA_PATH"]
17+
18+
19+
moe_cfg = Qwen3MoE30BA3Config()
20+
optim_cfg = AdamWConfig(lr=6e-05)
21+
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
22+
fsdp_cfg = FSDPConfig(
23+
torch_compile=False,
24+
cpu_offload=False,
25+
ep_size=moe_cfg.ep_size,
26+
)
27+
28+
dataset_config = [
29+
{
30+
"dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
31+
"tokenize_fn": FTDPTokenizeFnConfig(max_length=16384),
32+
},
33+
]
34+
35+
dataloader_config = DataloaderConfig(pack_max_length=16384)
36+
37+
loss_cfg = CELossConfig(mode="chunk", chunk_size=1024) # CELossConfig()
38+
39+
40+
trainer = TrainerConfig(
41+
load_from=QWEN3_MOE_PATH,
42+
model_cfg=moe_cfg,
43+
optim_cfg=optim_cfg,
44+
fsdp_cfg=fsdp_cfg,
45+
dataset_cfg=dataset_config,
46+
dataloader_cfg=dataloader_config,
47+
lr_cfg=lr_cfg,
48+
loss_cfg=loss_cfg,
49+
tokenizer_path=QWEN3_MOE_PATH,
50+
global_batch_size=64,
51+
total_epoch=1,
52+
work_dir=f"/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output/{os.environ['GITHUB_RUN_ID']}/npu-qwen3-sft/sft",
53+
seed=0,
54+
dist_backend="npu:hccl",
55+
)

0 commit comments

Comments (0)