Add workflow for torchtune

xuedinge233 · xuedinge233 · commit 53a8d960264b · 2025-06-30T16:21:31.000+08:00
diff --git a/.github/workflows/_ascend_npu_torchtune.yml b/.github/workflows/_ascend_npu_torchtune.yml
@@ -0,0 +1,128 @@
+name: "_ascend_npu_torchtune"
+
+on:  # reusable workflow: the only trigger is workflow_call
+  workflow_call:
+    inputs:
+      runner:
+        required: true
+        type: string
+        description: "The runner selected to run on"
+      image:
+        required: true
+        type: string
+        description: "The docker image which will be loaded"
+      device:
+        required: false  # no step in this workflow reads it, and the torchtune caller job does not pass it
+        type: string
+        description: "The device selected to run on"
+      torch-artifact:
+        required: false  # when empty, the torch download/install steps are skipped
+        type: string
+        description: "The distribution artifact name of torch"
+      torch-npu-artifact:
+        required: true
+        type: string
+        description: "The distribution artifact name of torch_npu"
+
+defaults:
+  run:
+    shell: bash -el {0}  # default for every run step: bash with -e (exit on error) and -l (login shell)
+
+jobs:
+  torchtune:
+    name: run torchtune for torch_npu
+    runs-on: ${{ inputs.runner }}  # runner label supplied by the calling workflow
+    container:
+      image: ${{ inputs.image }}  # docker image supplied by the calling workflow
+      env:
+        HF_ENDPOINT: https://hf-mirror.com  # the same value is exported again in the model download step
+
+    steps:
+      - name: Show NPU info  # runs `npu-smi info` so its output lands in the job log
+        run: |
+          npu-smi info
+
+      - name: Config mirrors  # rewrites apt sources and the pip index to mirrors.tuna.tsinghua.edu.cn
+        run: |
+          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+      - name: Install system dependencies  # refreshes the apt index, then installs git, compilers, build tools and shared libraries
+        run: |
+          apt-get update
+          apt-get install -y \
+              git gcc g++ make cmake ninja-build curl \
+              libgl1 libglib2.0-0 libsndfile1
+
+      # See: https://github.com/actions/checkout/issues/363#issuecomment-1915075699
+      # See: https://github.com/hunshcn/gh-proxy/issues/28#issuecomment-773769630
+      - name: Config git  # marks the workspace as a git safe.directory and routes github.com URLs through gh-proxy.test.osinfra.cn
+        run: |
+          git config --global --add safe.directory "$GITHUB_WORKSPACE"
+          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
+
+      - name: Checkout  # no `with:` inputs, so the action's defaults apply
+        uses: actions/checkout@v4
+
+      - name: Checkout torchtune  # second checkout, placed in the `torchtune` sub-directory
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/torchtune  # no `ref` is given, so no specific revision is pinned
+          path: torchtune
+
+      - name: Install torchtune  # editable install from the checkout above
+        working-directory: torchtune
+        run: |
+          pip install -e .
+
+      - name: Download torch artifact  # runs only when the caller supplied a torch artifact name
+        if: ${{ inputs.torch-artifact }}
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch-artifact }}
+
+      - name: Install torch  # same condition as the download step above
+        if: ${{ inputs.torch-artifact }}
+        run: |
+          pip install ${{ inputs.torch-artifact }}
+
+      - name: Install torch_npu dependencies  # fallback: runs only when no torch artifact was supplied
+        if: ${{ !inputs.torch-artifact }}
+        run: |
+          pip install -r https://raw.githubusercontent.com/Ascend/pytorch/refs/heads/master/requirements.txt
+
+      - name: List torch version  # publishes the step output `torch-version`
+        id: list-torch-version
+        shell: bash  # overrides the workflow-level default `bash -el {0}`
+        run: |
+          torch_version=$(python -c "import torch; print(torch.__version__)")
+          echo "torch-version=${torch_version}" >> "$GITHUB_OUTPUT"
+
+      - name: Download torch_npu artifact  # always runs: torch-npu-artifact is a required input
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch-npu-artifact }}
+          path: ascend_npu
+
+      - name: Install torch_npu  # runs inside `ascend_npu`, the directory the artifact was downloaded to
+        working-directory: ascend_npu
+        run: |
+          pip install ${{ inputs.torch-npu-artifact }}
+
+      - name: Show environment info  # lists installed packages in the job log
+        run: |
+          pip list
+
+      - name: Download Qwen2.5 model  # stores Qwen/Qwen2.5-0.5B-Instruct under /tmp/Qwen2.5-0.5B-Instruct
+        run: |
+          export HF_ENDPOINT=https://hf-mirror.com
+          huggingface-cli download --resume-download Qwen/Qwen2.5-0.5B-Instruct \
+            --local-dir /tmp/Qwen2.5-0.5B-Instruct
+
+      - name: Run torchtune with lora finetune  # NOTE(review): no `device=` override is passed; confirm the recipe config's default device suits the NPU runner
+        run: |
+          tune run lora_finetune_single_device --config qwen2_5/0.5B_lora_single_device
+
+      - name: Run torchtune with full finetune  # NOTE(review): same device question as the lora run above
+        run: |
+          tune run full_finetune_single_device --config qwen2_5/0.5B_full_single_device
diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml
@@ -11,6 +11,7 @@ on:
       - ".github/workflows/_ascend_npu_ut.yml"
       - ".github/workflows/_ascend_npu_benchmark.yml"
       - ".github/workflows/_ascend_npu_torchtitan.yml"
+      - ".github/workflows/_ascend_npu_torchtune.yml"
       - ".ci/**"
       - "ascend_npu/**"
       - "src/**"
@@ -25,6 +26,7 @@ on:
       - ".github/workflows/_ascend_npu_ut.yml"
       - ".github/workflows/_ascend_npu_benchmark.yml"
       - ".github/workflows/_ascend_npu_torchtitan.yml"
+      - ".github/workflows/_ascend_npu_torchtune.yml"
       - ".ci/**"
       - "ascend_npu/**"
       - "src/**"
@@ -120,6 +122,41 @@ jobs:
       image: ${{ needs.prepare.outputs.image }}
       torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
 
+  test:  # calls the unit-test reusable workflow once prepare, build-torch and build have finished
+    name: Test torch_npu
+    needs:
+      - prepare
+      - build-torch
+      - build
+    if: |
+      !cancelled() && github.event_name != 'repository_dispatch' &&
+      (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success'))
+    uses: ./.github/workflows/_ascend_npu_ut.yml  # reusable workflow stored in this repository
+    with:
+      runner: ${{ needs.prepare.outputs.runner }}
+      image: ${{ needs.prepare.outputs.image }}
+      device: ${{ needs.prepare.outputs.device }}
+      torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}  # output of build-torch, which the `if` above allows to be skipped
+      torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
+
+  benchmark:  # calls the benchmark reusable workflow once prepare, build-torch and build have finished
+    name: Run benchmarks
+    needs:
+      - prepare
+      - build-torch
+      - build
+    if: |
+      !cancelled() && github.event_name != 'repository_dispatch' &&
+      (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success'))
+    uses: ./.github/workflows/_ascend_npu_benchmark.yml  # reusable workflow stored in this repository
+    with:
+      runner: ${{ needs.prepare.outputs.runner }}
+      image: ${{ needs.prepare.outputs.image }}
+      torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}  # output of build-torch, which the `if` above allows to be skipped
+      torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
+    secrets:
+      pr-token: ${{ secrets.COSDT_BOT_TOKEN }}  # repository secret forwarded as the called workflow's `pr-token`
+
   torchtitan:
     name: Run torchtitan
     needs:
@@ -136,3 +173,19 @@ jobs:
       torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
       torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
 
+  torchtune:  # calls the torchtune reusable workflow once prepare, build-torch and build have finished
+    name: Run torchtune for torch_npu
+    needs:
+      - prepare
+      - build-torch
+      - build
+    if: |
+      !cancelled() && github.event_name != 'repository_dispatch' &&
+      (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success'))
+    uses: ./.github/workflows/_ascend_npu_torchtune.yml  # reusable workflow stored in this repository
+    with:
+      runner: ${{ needs.prepare.outputs.runner }}
+      image: ${{ needs.prepare.outputs.image }}
+      torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}  # output of build-torch, which the `if` above allows to be skipped
+      torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
+