[Nightly] Enable bisect search (#1849)

mengfei25 · web-flow · commit 1b2c53909bbb · 2025-08-14T08:44:20.000Z
disable_all
diff --git a/.github/scripts/bisect_search.sh b/.github/scripts/bisect_search.sh
@@ -0,0 +1,135 @@
+#!/bin/bash
+# Build one pytorch / torch-xpu-ops commit pair and run a single test command
+# against it. The exit status makes the script usable with `git bisect run`.
+#
+# Arguments, each passed as --NAME=VALUE and exported as NAME:
+#   WORKSPACE              scratch directory, emptied on every run (default /tmp)
+#   PYTORCH_VERSION        pytorch ref; "search" means HEAD of the current repo
+#   TORCH_XPU_OPS_VERSION  torch-xpu-ops ref; "search" means HEAD of the current repo
+# Environment:
+#   GITHUB_WORKSPACE   directory holding gs-logs/, where logs are written
+#   SEARCH_CHECK       accuracy | performance | ut_regressions | ut_extended |
+#                      ut_xpu; any other value runs SEARCH_CASE where it is
+#   SEARCH_CASE        command line to evaluate
+#   SEARCH_CRITERIA    tolerated relative drop for the performance check
+#   SEARCH_GOOD_VALUE  baseline value for the performance check (default 0.00001)
+# Exit status: 0 when the check passes, non-zero when the build or check fails.
+set -xe
+export GIT_PAGER=cat
+
+# Init params
+WORKSPACE="$(realpath "${WORKSPACE:-/tmp}")"
+PYTORCH_VERSION="${PYTORCH_VERSION:-main}"
+TORCH_XPU_OPS_VERSION="${TORCH_XPU_OPS_VERSION:-main}"
+# Export every --NAME=VALUE argument as NAME='VALUE'.
+# NOTE(review): the arguments pass through eval; only trusted callers may set them.
+for var; do
+    eval "export $(echo ${var@Q} |sed "s/^'-*//g;s/=/='/")"
+done
+
+# "search" selects the commit checked out in the current directory
+if [ "${PYTORCH_VERSION}" == "search" ];then
+    PYTORCH_VERSION="$(git rev-parse HEAD)"
+fi
+if [ "${TORCH_XPU_OPS_VERSION}" == "search" ];then
+    TORCH_XPU_OPS_VERSION="$(git rev-parse HEAD)"
+fi
+
+# Per-run log files, named after the commit pair under test
+log_dir="${GITHUB_WORKSPACE}/gs-logs"
+build_log="${log_dir}/build-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log"
+test_log="${log_dir}/test-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log"
+
+# Clean WORKSPACE
+mkdir -p "${WORKSPACE}"
+rm -rf "${WORKSPACE:?}/"* || sudo rm -rf "${WORKSPACE:?}/"*
+
+# Build pytorch
+pip uninstall -y torch
+script_dir="$(dirname "$(realpath "$0")")"
+source "${script_dir}/env.sh" 2> /dev/null
+# Record the build status without tripping `set -e`, so the log tail can still
+# be printed when the build fails.
+build_status=0
+"${script_dir}/build.sh" \
+    --WORKSPACE="${WORKSPACE}" \
+    --PYTORCH_VERSION="${PYTORCH_VERSION}" \
+    --TORCH_XPU_OPS_VERSION="${TORCH_XPU_OPS_VERSION}" \
+    > "${build_log}" 2>&1 || build_status=$?
+if [ "${build_status}" -ne 0 ];then
+    tail -n 100 "${build_log}"
+    echo "Build got failed"
+    # NOTE(review): `git bisect run` reads exit 1 as "bad"; exit 125 would make
+    # it skip commits that cannot be built. Kept as 1 because callers compare
+    # this status.
+    exit 1
+fi
+pip list |grep torch
+
+# Test
+# test_result stays 1 (failed) unless the selected check passes.
+test_result=1
+acc_result=""
+perf_result=""
+xpu_ops_dir="${WORKSPACE}/pytorch/third_party/torch-xpu-ops"
+case_cmd="${SEARCH_CASE}"
+case "${SEARCH_CHECK}" in
+    accuracy|performance)
+        cd "${WORKSPACE}/pytorch"
+        # NOTE(review): presumably removed so `import torch` resolves to the
+        # installed build instead of the source directory - confirm.
+        rm -rf torch
+        case_cmd="${SEARCH_CASE} --output=${WORKSPACE}/tmp.csv"
+        ;;
+    ut_regressions)
+        cd "${xpu_ops_dir}/test/regressions"
+        ;;
+    ut_extended)
+        cd "${xpu_ops_dir}/test/xpu/extended"
+        ;;
+    ut_xpu)
+        cd "${xpu_ops_dir}/test/xpu"
+        ;;
+esac
+# Run the case in a subshell so nothing it does (cd, exit, ...) leaks into this
+# script; its status is recorded instead of aborting under `set -e`.
+test_status=0
+( eval "${case_cmd}" ) > "${test_log}" 2>&1 || test_status=$?
+if [ "${test_status}" -eq 0 ];then
+    case "${SEARCH_CHECK}" in
+        accuracy)
+            # Verdict is the 4th column of the last CSV row
+            acc_result=$(tail -n 1 "${WORKSPACE}/tmp.csv" |awk -F, '{print $4}')
+            if [[ "${acc_result}" == "pass"* ]];then
+                test_result=0
+            fi
+            ;;
+        performance)
+            # Measured value is the 5th column of the last CSV row
+            perf_result=$(tail -n 1 "${WORKSPACE}/tmp.csv" |awk -F, '{print $5}')
+            # Pass when measured/baseline exceeds 1 - SEARCH_CRITERIA. A zero or
+            # non-numeric baseline counts as a failure rather than letting awk
+            # abort on a division by zero.
+            test_result=$(awk \
+                -v measured="${perf_result}" \
+                -v baseline="${SEARCH_GOOD_VALUE:-0.00001}" \
+                -v criteria="${SEARCH_CRITERIA}" \
+                'BEGIN {
+                    if (baseline + 0 != 0 && measured / baseline > (1 - criteria)) {
+                        print "0"
+                    } else {
+                        print "1"
+                    }
+                }')
+            ;;
+        *)
+            test_result=0
+            ;;
+    esac
+fi
+
+# Test result
+cat "${test_log}"
+echo "${test_result},${acc_result},${perf_result},${PYTORCH_VERSION},${TORCH_XPU_OPS_VERSION}" |\
+    tee -a "${log_dir}/summary.csv" |tee -a "${WORKSPACE}/result.csv"
+exit "${test_result}"
diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh
@@ -50,7 +50,6 @@ git submodule sync && git submodule update --init --recursive
 python -m pip install -r requirements.txt
 python -m pip install mkl-static mkl-include
 export USE_STATIC_MKL=1
-export USE_XCCL=1
 export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \
     intel-cmplr-lib-rt==2025.1.1 | \
     intel-cmplr-lib-ur==2025.1.1 | \
diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh
@@ -1,9 +1,15 @@
 #!/bin/bash
 
-source /opt/intel/oneapi/compiler/latest/env/vars.sh
-source /opt/intel/oneapi/pti/latest/env/vars.sh
-source /opt/intel/oneapi/umf/latest/env/vars.sh
-source /opt/intel/oneapi/ccl/latest/env/vars.sh
-source /opt/intel/oneapi/mpi/latest/env/vars.sh
+# Root of the oneAPI installation. Export XPU_ONEAPI_PATH before sourcing this
+# file to use an install outside the default location.
+XPU_ONEAPI_PATH="${XPU_ONEAPI_PATH:-/opt/intel/oneapi}"
+
+# Load the environment of each oneAPI component from that root. The paths are
+# quoted so an install directory containing spaces still works.
+source "${XPU_ONEAPI_PATH}/compiler/latest/env/vars.sh"
+source "${XPU_ONEAPI_PATH}/pti/latest/env/vars.sh"
+source "${XPU_ONEAPI_PATH}/umf/latest/env/vars.sh"
+source "${XPU_ONEAPI_PATH}/ccl/latest/env/vars.sh"
+source "${XPU_ONEAPI_PATH}/mpi/latest/env/vars.sh"
 icpx --version
 sycl-ls
diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml
@@ -124,6 +124,7 @@ jobs:
           source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh
           # gcc 11
           source /opt/rh/gcc-toolset-11/enable
+          export USE_XCCL=1
           ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \
             --WORKSPACE="${{ github.workspace }}" \
             --PYTORCH_REPO="${PYTORCH_REPO}" \
diff --git a/.github/workflows/bisect_search.yml b/.github/workflows/bisect_search.yml
@@ -0,0 +1,268 @@
+# Manually triggered workflow that bisects a regression between two commits of
+# pytorch and/or torch-xpu-ops: each candidate commit is built and checked by
+# running one test command through .github/scripts/bisect_search.sh.
+name: Bisect Search
+
+on:
+  workflow_dispatch:
+    inputs:
+      runner:
+        required: true
+        type: string
+        default: 'pvc_rolling'
+        description: Test node
+      search_commits:
+        required: true
+        type: string
+        default: ''
+        description: Target commits, such as 'pytorch=old/new,xpu-ops=old/new'
+      search_check:
+        type: string
+        default: ''
+        description: Test case type, 'performance, accuracy, <ut_regressions/ut_extended/ut_xpu> or others'
+      search_case:
+        required: true
+        type: string
+        default: ''
+        description: Test case, such as 'python xxx.py or pytest -k xxx'
+      search_criteria:
+        type: string
+        default: '0.1'
+        description: Criteria for performance check, default is 10%
+      oneapi:
+        type: string
+        default: '2025.1.3'
+        description: Installed oneAPI DLE on host by default, fill offline.sh url if needed
+      python:
+        type: string
+        default: '3.10'
+        description: Python version
+
+permissions: read-all
+
+jobs:
+  # Reports the name of the runner that picked up this job plus the uid/gid to
+  # run the container with, then clears docker state and the workspace there.
+  get_runner:
+    runs-on: ${{ inputs.runner }}
+    outputs:
+      test_host: ${{ steps.runner-info.outputs.test_host }}
+      test_user: ${{ steps.runner-info.outputs.test_user }}
+      test_group: ${{ steps.runner-info.outputs.test_group }}
+    steps:
+      - name: Get runner info
+        id: runner-info
+        run: |
+          # get test runner
+          echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT}
+          echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT}
+          echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT}
+          # show host info
+          cat /etc/os-release
+          uname -a
+          source /opt/intel/oneapi/setvars.sh
+          sycl-ls
+          dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev'
+      - name: Cleanup workspace
+        if: ${{ always() }}
+        run: |
+          # clean docker cache
+          docker stop $(docker ps -aq) || true
+          docker system prune -af || true
+          # clean files
+          ls -al
+          sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
+
+  # Runs the actual search inside a container on the host chosen by get_runner.
+  # NOTE(review): the job id is spelled "biisect-search"; left unchanged because
+  # renaming it would change the name under which the job is reported.
+  biisect-search:
+    needs: get_runner
+    runs-on: ${{ needs.get_runner.outputs.test_host }}
+    container:
+      image: mengfeili/intel-pvc-driver:1146-1136
+      volumes:
+        - ${{ github.workspace }}:${{ github.workspace }}
+      options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g
+              -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }}
+      env:
+        AGENT_TOOLSDIRECTORY: /tmp/_tools
+        HF_HOME: /tmp/.cache/huggingface
+        TORCH_HOME: /tmp/.cache/torch
+        GH_TOKEN: ${{ github.token }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        # Workflow inputs are exposed as environment variables so that shell
+        # steps and bisect_search.sh read them as data, not as script text.
+        SEARCH_COMMITS: ${{ inputs.search_commits }}
+        SEARCH_CHECK: ${{ inputs.search_check }}
+        SEARCH_CASE: ${{ inputs.search_case }}
+        SEARCH_CRITERIA: ${{ inputs.search_criteria }}
+        TORCH_XPU_ARCH_LIST: pvc
+        USE_XCCL: 0
+        USE_KINETO: 0
+    defaults:
+      run:
+        shell: bash -xe {0}
+    steps:
+      - name: Check runner
+        run: |
+          ls -al
+          sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
+          sudo rm -rf /tmp/_tools
+      - name: Setup python-${{ inputs.python }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ inputs.python }}
+      - name: Check runner
+        run: |
+          hostname && whoami && id
+          clinfo --list
+          gcc -v && g++ -v
+          which python && which pip
+          python -V
+          pip install -U pip wheel setuptools
+          pip list
+          uname -a
+          dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev'
+          pip install cmake ninja pandas psutil scipy requests pybind11
+          mkdir gs-logs gs-search
+          echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv
+      - name: Install oneAPI DLE
+        if: ${{ inputs.oneapi != 'installed' }}
+        run: |
+          rm -rf ~/intel ~/.intel /tmp/intel
+          if [ "${{ inputs.oneapi }}" == "2025.1.3" ];then
+            ONEAPI_URL="https://registrationcenter-download.intel.com/akdlm/IRC_NAS/3435dc45-055e-4f7a-86b1-779931772404/intel-deep-learning-essentials-2025.1.3.7_offline.sh"
+          elif [ "${{ inputs.oneapi }}" == "2025.2.0" ];then
+            ONEAPI_URL="https://registrationcenter-download.intel.com/akdlm/IRC_NAS/49d38360-b403-4b06-9104-86fa8d886e6d/intel-deep-learning-essentials-2025.2.0.558_offline.sh"
+          else
+            ONEAPI_URL="${{ inputs.oneapi }}"
+          fi
+          wget -q -O oneapi.sh "${ONEAPI_URL}"
+          bash oneapi.sh -a -s --eula accept --action install --install-dir /tmp/intel/oneapi
+          echo "XPU_ONEAPI_PATH=/tmp/intel/oneapi" >> ${GITHUB_ENV}
+      - name: Checkout torch-xpu-ops
+        uses: actions/checkout@v4
+        with:
+          path: gs-scripts
+      - name: Prepare source code
+        run: |
+          git clone https://github.com/pytorch/pytorch gs-pytorch
+          cd gs-pytorch
+          LATEST_PT_COMMIT="$(git rev-parse HEAD)"
+          cd ..
+          git clone https://github.com/intel/torch-xpu-ops gs-torch-xpu-ops
+          cd gs-torch-xpu-ops
+          LATEST_XPU_COMMIT="$(git rev-parse HEAD)"
+          cd ..
+          echo "LATEST_PT_COMMIT=${LATEST_PT_COMMIT}" >> ${GITHUB_ENV}
+          echo "LATEST_XPU_COMMIT=${LATEST_XPU_COMMIT}" >> ${GITHUB_ENV}
+      - name: Prepare test env
+        run: |
+          pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu
+          # Match on ${SEARCH_CASE} rather than expanding the input inline, so a
+          # test command containing quotes cannot break this script.
+          if [[ "${SEARCH_CASE}" == *"benchmarks/dynamo/huggingface.py"* ]];then
+            pip install transformers==4.44.2
+          elif [[ "${SEARCH_CASE}" == *"benchmarks/dynamo/timm_models.py"* ]];then
+            pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14
+            pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch)
+          elif [[ "${SEARCH_CASE}" == *"benchmarks/dynamo/torchbench.py"* ]];then
+            model_name="$(echo "${SEARCH_CASE}" |sed 's+.*\--only *++;s/ .*//')"
+            git clone https://github.com/pytorch/benchmark gs-benchmark
+            cd gs-benchmark
+            echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV}
+            python install.py ${model_name}
+          else
+            pip install -r gs-pytorch/.ci/docker/requirements-ci.txt
+          fi
+          pip uninstall -y torch && pip uninstall -y torch
+      - name: Bisect search pytorch
+        if: ${{ contains(inputs.search_commits, 'pytorch') }}
+        run: |
+          pytorch_commits="$(echo "${SEARCH_COMMITS}" |sed 's+.*pytorch=++;s+,.*++')"
+          old_commit="$(echo ${pytorch_commits} |awk -F '/' '{print $1}')"
+          new_commit="$(echo ${pytorch_commits} |awk -F '/' '{print $2}')"
+          # Test both endpoints first; bisect only when their results differ.
+          old_status="$(${{ github.workspace }}/gs-scripts/.github/scripts/bisect_search.sh \
+                  --WORKSPACE="${{ github.workspace }}/gs-search" \
+                  --PYTORCH_VERSION="${old_commit}" \
+                  --TORCH_XPU_OPS_VERSION="${LATEST_XPU_COMMIT}" \
+                  > ${{ github.workspace }}/gs-logs/search-${old_commit}-${LATEST_XPU_COMMIT}.log 2>&1 && echo $? || echo $?)"
+          old_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
+          # Third result column is the perf value of the old commit; it becomes
+          # the baseline for the performance check.
+          export SEARCH_GOOD_VALUE="$(echo ${old_result} |awk -F, '{print $3}')"
+          new_status="$(${{ github.workspace }}/gs-scripts/.github/scripts/bisect_search.sh \
+                  --WORKSPACE="${{ github.workspace }}/gs-search" \
+                  --PYTORCH_VERSION="${new_commit}" \
+                  --TORCH_XPU_OPS_VERSION="${LATEST_XPU_COMMIT}" \
+                  > ${{ github.workspace }}/gs-logs/search-${new_commit}-${LATEST_XPU_COMMIT}.log 2>&1 && echo $? || echo $?)"
+          new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
+          if [ "${old_status}" != "${new_status}" ];then
+            cd gs-pytorch
+            git reset --hard
+            # Copy the scripts into the repo being bisected so `git bisect run`
+            # can call them by a path relative to the repo.
+            rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/
+            git bisect start ${new_commit} ${old_commit}
+            git bisect run ./gs-scripts/.github/scripts/bisect_search.sh \
+                    --WORKSPACE="${{ github.workspace }}/gs-search" \
+                    --PYTORCH_VERSION="search" \
+                    --TORCH_XPU_OPS_VERSION="${LATEST_XPU_COMMIT}" \
+                    2>&1 |tee ${{ github.workspace }}/gs-logs/bisect-pytorch.log
+            git bisect log |tee ${{ github.workspace }}/gs-logs/result-pytorch.log
+          else
+            echo "Checked and no regression !"
+          fi
+      - name: Bisect search torch-xpu-ops
+        if: ${{ contains(inputs.search_commits, 'xpu-ops') }}
+        run: |
+          xpu_ops_commits="$(echo "${SEARCH_COMMITS}" |sed 's+.*xpu-ops=++;s+,.*++')"
+          old_commit="$(echo ${xpu_ops_commits} |awk -F '/' '{print $1}')"
+          new_commit="$(echo ${xpu_ops_commits} |awk -F '/' '{print $2}')"
+          # Test both endpoints with pytorch pinned; bisect only when they differ.
+          old_status="$(${{ github.workspace }}/gs-scripts/.github/scripts/bisect_search.sh \
+                  --WORKSPACE="${{ github.workspace }}/gs-search" \
+                  --PYTORCH_VERSION="${LATEST_PT_COMMIT}" \
+                  --TORCH_XPU_OPS_VERSION="${old_commit}" \
+                  > ${{ github.workspace }}/gs-logs/search-${LATEST_PT_COMMIT}-${old_commit}.log 2>&1 && echo $? || echo $?)"
+          old_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
+          export SEARCH_GOOD_VALUE="$(echo ${old_result} |awk -F, '{print $3}')"
+          new_status="$(${{ github.workspace }}/gs-scripts/.github/scripts/bisect_search.sh \
+                  --WORKSPACE="${{ github.workspace }}/gs-search" \
+                  --PYTORCH_VERSION="${LATEST_PT_COMMIT}" \
+                  --TORCH_XPU_OPS_VERSION="${new_commit}" \
+                  > ${{ github.workspace }}/gs-logs/search-${LATEST_PT_COMMIT}-${new_commit}.log 2>&1 && echo $? || echo $?)"
+          new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
+          if [ "${old_status}" != "${new_status}" ];then
+            # The commits being bisected are torch-xpu-ops commits, so the
+            # bisect has to run inside the torch-xpu-ops clone.
+            cd gs-torch-xpu-ops
+            git reset --hard
+            rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/
+            git bisect start ${new_commit} ${old_commit}
+            git bisect run ./gs-scripts/.github/scripts/bisect_search.sh \
+                    --WORKSPACE="${{ github.workspace }}/gs-search" \
+                    --PYTORCH_VERSION="${LATEST_PT_COMMIT}" \
+                    --TORCH_XPU_OPS_VERSION="search" \
+                    2>&1 |tee ${{ github.workspace }}/gs-logs/bisect-torch-xpu-ops.log
+            git bisect log |tee ${{ github.workspace }}/gs-logs/result-torch-xpu-ops.log
+          else
+            echo "Checked and no regression !"
+          fi
+      - name: Summary
+        run: |
+          # Publish the per-run summary and every bisect log to the job summary.
+          cat gs-logs/summary.csv |tee -a ${GITHUB_STEP_SUMMARY}
+          for result_log in $(find  gs-logs -name "result-*.log")
+          do
+            echo -e "\n\n\n${result_log}" |tee -a ${GITHUB_STEP_SUMMARY}
+            cat ${result_log} |tee -a ${GITHUB_STEP_SUMMARY}
+          done
+      - name: Upload Logs
+        if: ${{ ! cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: bisect-search
+          path: ${{ github.workspace }}/gs-logs