From cf2cc601671ed01300b4974e9912cd3198f16339 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 9 Jul 2025 15:24:57 +0000 Subject: [PATCH 1/9] distributed_weekly test --- .github/actions/linux-testenv/action.yml | 4 ++-- .github/actions/linux-uttest/action.yml | 7 +++++-- .github/scripts/build.sh | 1 - .github/workflows/_linux_build.yml | 4 ++-- .github/workflows/_linux_ut.yml | 12 +++++++++--- .github/workflows/pull.yml | 8 ++++---- 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 0ca7b8fb2..da469d929 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -3,11 +3,11 @@ name: Setup Test Environment inputs: pytorch: type: string - default: 'main' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' + default: 'daisyden/distributed_2.9' description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: type: string diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index c6ac2eb2b..0b2fa0619 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -143,7 +143,7 @@ runs: tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log - name: xpu_distributed - shell: timeout 3600 bash -xeu -o pipefail {0} + shell: bash -xeu -o pipefail {0} if: ${{ inputs.ut_name == 'xpu_distributed' }} run: | xpu-smi topology -m @@ -152,12 +152,15 @@ runs: cat ${{ github.workspace }}/ptrace_scope.bk echo "0" |sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/xpu_distributed + pip install pytest pytest-timeout xmlrunner unittest-xml-reporting zstandard transformers + cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/ut_result_check.sh ut_log/xpu_distributed/ cd pytorch/third_party/torch-xpu-ops/test/xpu XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then echo -e "[ERROR] XCCL is not enabled" exit 1 fi - python run_distributed.py \ + python run_distributed_local.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log + cp *.xml ${{ github.workspace }}/ut_log diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index b4f526297..b0b7f17b2 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -44,7 +44,6 @@ git remote -v && git branch && git show -s # Pre Build cd ${WORKSPACE}/pytorch python -m pip install requests -python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py git submodule sync && git submodule update --init --recursive python -m pip install -r requirements.txt python -m pip install mkl-static==2025.2.0 mkl-include==2025.2.0 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 286d124b7..41f546924 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -10,11 +10,11 @@ on: description: Runner label pytorch: type: string - default: 'main' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' + default: 'daisyden/distributed_2.9' description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin triton: required: false diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 146db6c72..dbe562312 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -9,11 +9,11 @@ on: description: Runner label pytorch: type: string - default: 'main' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' + default: 'daisyden/distributed_2.9' description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: type: string @@ -88,6 +88,13 @@ jobs: if [ -e ut_failure_list.csv ];then cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv || true fi + - name: UT Test Results Summary + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + pip install junitparser + python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml \ + 2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_summary.log | \ + >> $GITHUB_STEP_SUMMARY || true - name: Upload Inductor XPU UT Log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 @@ -195,7 +202,6 @@ jobs: else ut_list="${{ inputs.ut }}" fi - cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ for ut_name in ${ut_list} do cp Known_issue.log.tmp Known_issue.log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index bfb2d913a..bd5ed87d1 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -99,8 +99,8 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: - runner: pvc_rolling - pytorch: main + pytorch: distributed_2.9 + runner: PVC-7358 linux-ut: needs: [conditions-filter, linux-build] @@ -124,8 +124,8 @@ jobs: ut_name: [xpu_distributed] uses: ./.github/workflows/_linux_ut.yml with: - runner: pvc_rolling - pytorch: main + runner: PVC-7358 + pytorch: distributed_2.9 ut: ${{ matrix.ut_name }} linux-e2e: From 3294a0cd3bab8000d3870a624a651c580925682b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Thu, 10 Jul 2025 23:30:10 +0000 Subject: [PATCH 2/9] update --- .github/workflows/_linux_ut.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index dbe562312..8db8a5024 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -92,7 +92,8 @@ jobs: run: | source activate xpu_op_${ZE_AFFINITY_MASK} pip install junitparser - python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml \ + cd ${{ github.workspace }}/ut_log/ + python check-ut.py ${{ github.workspace }}/ut_log/*.xml \ 2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_summary.log | \ >> $GITHUB_STEP_SUMMARY || true - name: Upload Inductor XPU UT Log From 407f1d7b75886242d460a25cffba916da31d9e38 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 28 Jul 2025 17:23:17 +0800 Subject: [PATCH 3/9] update --- .github/workflows/pull.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index bd5ed87d1..03065c08b 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -99,8 +99,13 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: +<<<<<<< HEAD pytorch: distributed_2.9 runner: PVC-7358 +======= + pytorch: distributed_2.8 + runner: pvc_e2e +>>>>>>> 680c2cce (update) linux-ut: needs: [conditions-filter, linux-build] From 56b36c31c8ff32f5f94830e10dc7e2c8508f7f0f Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 28 Jul 2025 23:18:35 +0800 Subject: [PATCH 4/9] update --- .github/workflows/pull.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 03065c08b..bd5ed87d1 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -99,13 +99,8 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: -<<<<<<< HEAD pytorch: distributed_2.9 runner: PVC-7358 -======= - pytorch: distributed_2.8 - runner: pvc_e2e ->>>>>>> 680c2cce (update) linux-ut: needs: [conditions-filter, linux-build] From b88965d972ef3aeb50fbf4ad702afe37b15791a6 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 29 Aug 2025 17:15:27 +0800 Subject: [PATCH 5/9] update --- .github/actions/linux-testenv/action.yml | 4 ++-- .github/scripts/build.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index da469d929..a8257a992 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -69,9 +69,9 @@ runs: fi TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then - PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" + PYTORCH_REPO="https://github.com/daisyden/pytorch.git" else - PYTORCH_REPO="https://github.com/pytorch/pytorch.git" + PYTORCH_REPO="https://github.com/daisyden/pytorch.git" fi git clone ${PYTORCH_REPO} pytorch cd pytorch diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index b0b7f17b2..44ae14a35 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -19,7 +19,7 @@ done # Set pytorch rm -rf ${WORKSPACE}/pytorch -git clone ${PYTORCH_REPO} ${WORKSPACE}/pytorch +git clone https://github.com/daisyden/pytorch.git ${WORKSPACE}/pytorch cd ${WORKSPACE}/pytorch git checkout ${PYTORCH_COMMIT} git remote -v && git branch && git show -s From 1e208785fe352b86591e364fabff211a0def223b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 29 Aug 2025 22:34:14 +0800 Subject: [PATCH 6/9] update --- .github/actions/linux-testenv/action.yml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index a8257a992..679a0b878 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -101,14 +101,9 @@ runs: TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi fi - if [ "${{ github.event_name }}" == "pull_request" ];then - cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops - cd third_party/torch-xpu-ops - else - git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops - cd third_party/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_COMMIT} - fi + git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops + cd third_party/torch-xpu-ops + git checkout ${TORCH_XPU_OPS_COMMIT} git status && git diff && git show -s - name: Install E2E Requirements shell: bash -xe {0} From 9266fcf757a058624a674a0652691f0c3539fab8 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 29 Aug 2025 23:29:04 +0800 Subject: [PATCH 7/9] update --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 8db8a5024..7d24af128 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -126,7 +126,7 @@ jobs: GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread -n 1 + PYTEST_ADDOPTS: -v steps: - name: Init test run: | From d23c3b1fa5bdab45eb65913c72887da0e364c87d Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Sat, 30 Aug 2025 12:49:51 +0800 Subject: [PATCH 8/9] Update _linux_ut.yml --- .github/workflows/_linux_ut.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 7d24af128..f5c763603 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -120,6 +120,7 @@ jobs: host: needs: runner runs-on: ${{ needs.runner.outputs.runner_id }} + timeout-minutes: 1200 if: ${{ contains(inputs.ut, 'distributed') }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool From fb632db2abe8999ca94373e14f99ffb6d4427358 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Sun, 31 Aug 2025 02:26:49 +0800 Subject: [PATCH 9/9] update --- .github/actions/linux-uttest/action.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 0b2fa0619..25b18509f 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -160,7 +160,8 @@ runs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi + source /opt/intel/oneapi/ccl/latest/env/vars.sh python run_distributed_local.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log - cp *.xml ${{ github.workspace }}/ut_log + cp *.xml ${{ github.workspace }}/ut_log/xpu_distributed