From d3f32fa0d235a0e6f249fc707098eae4d9506df8 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 16 Jul 2025 09:02:26 +0800 Subject: [PATCH 001/160] modify build --- .github/scripts/build.sh | 64 +++++++++++++++-------------- .github/scripts/env.sh | 12 +++--- .github/workflows/_linux_build.yml | 65 +++++++++++++----------------- 3 files changed, 67 insertions(+), 74 deletions(-) diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index 9dcd170aa1..f10f095934 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -1,18 +1,18 @@ #!/bin/bash # Usage: # ./build.sh --WORKSPACE= \ -# --PYTORCH_REPO= --PYTORCH_VERSION= \ +# --PYTORCH_REPO= --PYTORCH_COMMIT= \ # --TORCH_XPU_OPS_REPO= \ -# --TORCH_XPU_OPS_VERSION= +# --TORCH_XPU_OPS_COMMIT= set -xe export GIT_PAGER=cat # Init params WORKSPACE=$(realpath ${WORKSPACE:-"/tmp"}) PYTORCH_REPO=${PYTORCH_REPO:-"https://github.com/pytorch/pytorch.git"} -PYTORCH_VERSION=${PYTORCH_VERSION:-"main"} +PYTORCH_COMMIT=${PYTORCH_COMMIT:-"main"} TORCH_XPU_OPS_REPO=${TORCH_XPU_OPS_REPO:-"https://github.com/intel/torch-xpu-ops.git"} -TORCH_XPU_OPS_VERSION=${TORCH_XPU_OPS_VERSION:-"main"} +TORCH_XPU_OPS_COMMIT=${TORCH_XPU_OPS_COMMIT:-"main"} for var; do eval "export $(echo ${var@Q} |sed "s/^'-*//g;s/=/='/")" done @@ -21,20 +21,20 @@ done rm -rf ${WORKSPACE}/pytorch git clone ${PYTORCH_REPO} ${WORKSPACE}/pytorch cd ${WORKSPACE}/pytorch -git checkout ${PYTORCH_VERSION} +git checkout ${PYTORCH_COMMIT} git remote -v && git branch && git show -s git rev-parse HEAD > ${WORKSPACE}/pytorch.commit # Set torch-xpu-ops -if [ "${TORCH_XPU_OPS_VERSION,,}" == "pinned" ];then +if [ "${TORCH_XPU_OPS_COMMIT,,}" == "pinned" ];then TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" - TORCH_XPU_OPS_VERSION="$(cat ${WORKSPACE}/pytorch/third_party/xpu.txt)" + TORCH_XPU_OPS_COMMIT="$(cat ${WORKSPACE}/pytorch/third_party/xpu.txt)" fi -if [ "${TORCH_XPU_OPS_VERSION,,}" != "cicd" ];then +if [ "${TORCH_XPU_OPS_COMMIT,,}" != "cicd" ];then rm -rf ${WORKSPACE}/torch-xpu-ops git clone ${TORCH_XPU_OPS_REPO} ${WORKSPACE}/torch-xpu-ops cd ${WORKSPACE}/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_VERSION} + git checkout ${TORCH_XPU_OPS_COMMIT} fi cd ${WORKSPACE}/torch-xpu-ops git remote -v && git branch && git show -s @@ -51,28 +51,30 @@ python -m pip install -r requirements.txt python -m pip install mkl-static mkl-include export USE_STATIC_MKL=1 export USE_XCCL=1 -export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ - intel-cmplr-lib-rt==2025.1.1 | \ - intel-cmplr-lib-ur==2025.1.1 | \ - intel-cmplr-lic-rt==2025.1.1 | \ - intel-sycl-rt==2025.1.1 | \ - oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - onemkl-sycl-blas==2025.1.0 | \ - onemkl-sycl-dft==2025.1.0 | \ - onemkl-sycl-lapack==2025.1.0 | \ - onemkl-sycl-rng==2025.1.0 | \ - onemkl-sycl-sparse==2025.1.0 | \ - dpcpp-cpp-rt==2025.1.1 | \ - intel-opencl-rt==2025.1.1 | \ - mkl==2025.1.0 | \ - intel-openmp==2025.1.1 | \ - tbb==2022.1.0 | \ - tcmlib==1.3.0 | \ - umf==0.10.0 | \ - intel-pti==0.12.3 -" +if [ "${XPU_ONEAPI_PATH}" != "" ];then + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ + intel-cmplr-lib-rt==2025.1.1 | \ + intel-cmplr-lib-ur==2025.1.1 | \ + intel-cmplr-lic-rt==2025.1.1 | \ + intel-sycl-rt==2025.1.1 | \ + oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ 
+ oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ + impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | \ + onemkl-sycl-blas==2025.1.0 | \ + onemkl-sycl-dft==2025.1.0 | \ + onemkl-sycl-lapack==2025.1.0 | \ + onemkl-sycl-rng==2025.1.0 | \ + onemkl-sycl-sparse==2025.1.0 | \ + dpcpp-cpp-rt==2025.1.1 | \ + intel-opencl-rt==2025.1.1 | \ + mkl==2025.1.0 | \ + intel-openmp==2025.1.1 | \ + tbb==2022.1.0 | \ + tcmlib==1.3.0 | \ + umf==0.10.0 | \ + intel-pti==0.12.3 + " +fi # Build sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh index 831864d6d4..d0f7cfd338 100644 --- a/.github/scripts/env.sh +++ b/.github/scripts/env.sh @@ -1,9 +1,11 @@ #!/bin/bash -source /opt/intel/oneapi/compiler/latest/env/vars.sh -source /opt/intel/oneapi/pti/latest/env/vars.sh -source /opt/intel/oneapi/umf/latest/env/vars.sh -source /opt/intel/oneapi/ccl/latest/env/vars.sh -source /opt/intel/oneapi/mpi/latest/env/vars.sh +XPU_ONEAPI_PATH="${XPU_ONEAPI_PATH:-"/opt/intel/oneapi"}" + +source ${XPU_ONEAPI_PATH}/compiler/latest/env/vars.sh +source ${XPU_ONEAPI_PATH}/pti/latest/env/vars.sh +source ${XPU_ONEAPI_PATH}/umf/latest/env/vars.sh +source ${XPU_ONEAPI_PATH}/ccl/latest/env/vars.sh +source ${XPU_ONEAPI_PATH}/mpi/latest/env/vars.sh icpx --version sycl-ls diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 60dd2c49b6..89f29ac34f 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -4,34 +4,26 @@ on: workflow_call: inputs: pytorch: - required: true type: string default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false - type: string - default: 'false' - description: Keep torch-xpu-ops pin. 
`true` means use pined commit - driver: - required: false + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: type: string - default: 'lts' - description: Driver lts/rolling + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: - required: false type: string default: '3.10' description: Python version + oneapi: + type: string + default: 'host' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed runner: required: true type: string - default: 'linux.idc.xpu' + default: 'pvc_rolling' description: Runner label - outputs: - torch_commit_id: - description: The commit id of the torch build - value: ${{ jobs.build.outputs.TORCH_COMMIT_ID }} permissions: read-all @@ -44,18 +36,13 @@ jobs: - ${{ github.workspace }}:${{ github.workspace }} env: PATH: /opt/xpu-build/bin:/usr/share/Modules/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - commit_issue: 1280 GH_TOKEN: ${{ github.token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - outputs: - TORCH_COMMIT_ID: ${{ steps.build_version.outputs.TORCH_COMMIT_ID }} timeout-minutes: 300 steps: - name: Setup based env run: | # Cleanup workspace - rm -rf ${{ github.workspace }}/* + rm -rf ./* # Install gh dnf install 'dnf-command(config-manager)' dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo @@ -73,35 +60,37 @@ jobs: - name: Build Pytorch XPU run: | set -xe -o pipefail - if [ "${{ inputs.driver }}" == "lts" ]; then - export TORCH_XPU_ARCH_LIST='pvc' - fi if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" - PYTORCH_VERSION="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" + PYTORCH_COMMIT="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" else PYTORCH_REPO="https://github.com/pytorch/pytorch.git" - PYTORCH_VERSION="${{ inputs.pytorch }}" + PYTORCH_COMMIT="${{ inputs.pytorch }}" fi - if [[ "${{ inputs.keep_torch_xpu_ops }}" == *"https://"* ]];then - TORCH_XPU_OPS_REPO="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/@.*//')" - TORCH_XPU_OPS_VERSION="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/.*@//')" - elif [ "${{ inputs.keep_torch_xpu_ops }}" == "true" ];then - TORCH_XPU_OPS_VERSION="pinned" + if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then + TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" + TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" else - TORCH_XPU_OPS_VERSION="cicd" + TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" + TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi # oneAPI DLE + if [ "${{ inputs.oneapi }}" != "host" ];then + rm -rf ~/intel ~/.intel /opt/intel + wget -q -O oneapi.sh "${{ inputs.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + export XPU_ONEAPI_PATH="${HOME}/intel/oneapi" + fi source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh # gcc 11 source /opt/rh/gcc-toolset-11/enable ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ --WORKSPACE="${{ github.workspace }}" \ --PYTORCH_REPO="${PYTORCH_REPO}" \ - --PYTORCH_VERSION="${PYTORCH_VERSION}" \ + --PYTORCH_COMMIT="${PYTORCH_COMMIT}" \ --TORCH_XPU_OPS_REPO="${TORCH_XPU_OPS_REPO}" \ - --TORCH_XPU_OPS_VERSION="${TORCH_XPU_OPS_VERSION}" \ - 2>&1 |tee ${{ 
github.workspace }}/pytorch_build_${PYTORCH_VERSION//\//-}.log + --TORCH_XPU_OPS_COMMIT="${TORCH_XPU_OPS_COMMIT}" \ + 2>&1 |tee ${{ github.workspace }}/pytorch_build_${PYTORCH_COMMIT//\//-}.log - name: Torch Config run: | python -c "import torch; print(torch.__config__.show())" @@ -128,4 +117,4 @@ jobs: if: always() run: | chmod 777 . -R - rm -rf pytorch torch-xpu-ops pytorch_*.log torch*.whl + rm -rf ./* From bdc58d75c4854b480755e583523f04955c59eb46 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 16 Jul 2025 09:03:36 +0800 Subject: [PATCH 002/160] modify ut --- .github/workflows/_linux_ut.yml | 39 ++++++++++++++------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index d43fe6b809..82ef33df1a 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -7,17 +7,26 @@ on: required: false type: string default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: required: false type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin triton: required: false type: string - default: '' - description: Triton commit. Use pytorch pined commit by default + default: 'pinned' + description: Triton pinned by pytorch by default, or 'commit/branch', or 'repo@commit/repo@branch' + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + python: + required: false + type: string + default: '3.10' + description: Python version ut: required: true type: string @@ -28,21 +37,11 @@ on: type: string default: '' description: List disabled tests, such as disable_ut or disable_distributed - python: - required: false - type: string - default: '3.10' - description: Python version runner: required: true type: string default: 'linux.idc.xpu' description: Runner label - driver: - required: false - type: string - default: 'lts' - description: Driver lts/rolling permissions: read-all @@ -53,9 +52,6 @@ jobs: timeout-minutes: 300 env: GH_TOKEN: ${{ github.token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - ut_skip_issue: 1624 steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -263,7 +259,7 @@ jobs: eval $test_cmd 2>${{ github.workspace }}/ut_log/torch_xpu/torch_xpu_test_error.log | \ tee ${{ github.workspace }}/ut_log/torch_xpu/torch_xpu_test.log - name: Run Torch XPU Profile UT - if: ${{ inputs.driver == 'rolling' && contains(inputs.ut, 'xpu_profiling') }} + if: ${{ contains(inputs.ut, 'xpu_profiling') }} run: | source activate xpu_op_${ZE_AFFINITY_MASK} mkdir -p ${{ github.workspace }}/ut_log/profile_test @@ -379,9 +375,6 @@ jobs: timeout-minutes: 60 env: GH_TOKEN: ${{ github.token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - ut_skip_issue: 1624 steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 From b07b490c5ec579959524150975f07e6b719c9f0e Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 16 Jul 2025 09:03:46 +0800 Subject: [PATCH 003/160] modify e2e --- .github/workflows/nightly_ondemand.yml | 247 
+++++++++++-------------- 1 file changed, 112 insertions(+), 135 deletions(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 25c3af0245..48b6521829 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -3,31 +3,42 @@ name: Nightly-OnDemand Tests on: schedule: # GMT+8 21:00 every workday - - cron: '0 13 * * 0-4' - # GMT+8 0:00 Saturday - - cron: '0 16 * * 5' + - cron: '10 13 * * 0-4' # build from source + - cron: '30 13 * * 0-4' # nightly wheel + # GMT+8 00:00 Saturday + - cron: '10 16 * * 5' # build from source + - cron: '30 16 * * 5' # nightly wheel workflow_dispatch: inputs: pytorch: required: false type: string default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: required: false type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - ut: + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + triton: required: false type: string - default: 'torch_xpu' - description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,torch_xpu,xpu_profiling`. Delimiter is comma - triton: + default: 'pinned' + description: Triton pinned by pytorch by default, or 'commit/branch', or 'repo@commit/repo@branch' + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + python: required: false type: string - default: '' - description: Triton commit. Use pytorch pined commit by default + default: '3.10' + description: Python version + ut: + required: false + type: string + default: 'op_regression' + description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_profiling,xpu_distributed`. Delimiter is comma suite: required: true type: string @@ -53,75 +64,120 @@ on: type: string default: '' description: Model. 
Will only run this one mode if set - python: - required: false - type: string - default: '3.10' - description: Python version permissions: read-all -concurrency: - group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.ut }}-${{ inputs.triton }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: ${{ github.event_name != 'schedule' }} - jobs: - Linux-Nightly-Ondemand-Build: + Conditions-Filter: + name: conditions-filter if: ${{ github.repository_owner == 'intel' }} - name: linux-nightly-ondemand + runs-on: ubuntu-latest + timeout-minutes: 3 + outputs: + test_type: ${{ steps.inputs-check.outputs.test_type }} + pytorch: ${{ steps.inputs-check.outputs.pytorch }} + torch_xpu_ops: ${{ steps.inputs-check.outputs.torch_xpu_ops }} + steps: + - name: Inputs check + id: inputs-check + run: | + if [ "${{ github.event_name }}" == "schedule" ];then + if [ "${{ github.event.schedule }}" == "10 13 * * 0-4" ];then + test_type="build-nightly" + pytorch="main" + torch_xpu_ops="main" + elif [ "${{ github.event.schedule }}" == "30 13 * * 0-4" ];then + test_type="wheel-nightly" + pytorch="nightly_wheel" + torch_xpu_ops="pinned" + elif [ "${{ github.event.schedule }}" == "10 16 * * 5" ];then + test_type="build-weekly" + pytorch="main" + torch_xpu_ops="main" + elif [ "${{ github.event.schedule }}" == "30 16 * * 5" ];then + test_type="wheel-weekly" + pytorch="nightly_wheel" + torch_xpu_ops="pinned" + else + test_type="unknown" + pytorch="main" + torch_xpu_ops="main" + fi + else + if [ "${{ inputs.pytorch }}" == "nightly_wheel" ] || [ "${{ inputs.pytorch }}" == "release_wheel" ];then + test_type="wheel-ondemand" + pytorch="${{ inputs.pytorch }}" + torch_xpu_ops="pinned" + else + test_type="build-ondemand" + pytorch="${{ inputs.pytorch }}" + torch_xpu_ops="${{ inputs.torch_xpu_ops }}" + fi + fi + echo "test_type=${test_type}" >> ${GITHUB_OUTPUT} + echo "pytorch=${pytorch}" >> ${GITHUB_OUTPUT} + echo "torch_xpu_ops=${torch_xpu_ops}" >> ${GITHUB_OUTPUT} + + Linux-Nightly-Ondemand-Build: + if: ${{ needs.Conditions-Filter.outputs.pytorch != 'nightly_wheel' && needs.Conditions-Filter.outputs.pytorch != 'release_wheel' }} + needs: [Conditions-Filter] + name: linux secrets: inherit uses: ./.github/workflows/_linux_build.yml with: - pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }} - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} + pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} + oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - runner: pvc_e2e + runner: pvc_rolling Linux-Nightly-Ondemand-UT-Tests: - if: ${{ github.event_name == 'schedule' || inputs.ut != '' }} - name: linux-nightly-ondemand - needs: Linux-Nightly-Ondemand-Build + if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'p') }} + name: linux + needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_ut.yml with: - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} - pytorch: ${{ 
needs.Linux-Nightly-Ondemand-Build.outputs.torch_commit_id }} + pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} + torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.torch_xpu_ops }} + triton: ${{ github.event_name == 'schedule' && 'pinned' || inputs.triton }} + oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }} + ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} runner: linux.idc.xpu Linux-Nightly-Ondemand-E2E-Tests: - runs-on: pvc_e2e - name: linux-nightly-ondemand / e2e_test - needs: Linux-Nightly-Ondemand-Build + runs-on: pvc_rolling + name: linux / e2e_test + needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] timeout-minutes: 3600 permissions: issues: write + container: + image: 'xpu:test' + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem --device=/dev/dri --privileged --shm-size=8g env: + AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" GH_TOKEN: ${{ github.token }} reference_issue: 1645 - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build.outputs.torch_commit_id }} - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} + pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} + triton: ${{ github.event_name == 'schedule' && 'pinned' || inputs.triton }} + oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - run_type: ${{ (github.event_name == 'schedule' && (github.event.schedule == '0 16 * * 5' && 'weekly' || 'nightly')) || 'on-demand' }} - outputs: - TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }} - TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }} - DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} - KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} - BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} - OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} - GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} - TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} - TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} - TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} - TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} - TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} - TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }} - TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }} steps: + - name: Cleanup workspace + run: | + rm -rf ./* || sudo rm -rf ./* - name: Checkout torch-xpu-ops uses: actions/checkout@v4 + with: + path: torch-xpu-ops + - name: Setup python ${{ env.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.python }} - name: Prepare Conda ENV run: | which conda && conda clean -ay @@ -357,88 +413,9 @@ jobs: name: Windows-nightly-ondemand uses: ./.github/workflows/_windows_ut.yml with: - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} + torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.torch_xpu_ops }} ut: ${{ github.event_name == 'schedule' && 'op_extended,torch_xpu' || 
inputs.ut }} python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} src_changed: false has_label: true runner: Windows_CI - - Tests-Failure-And-Report: - if: ${{ ! cancelled() }} - runs-on: [ self-hosted, Linux ] - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - needs: Linux-Nightly-Ondemand-E2E-Tests - steps: - - name: Report github issue for XPU OPS nightly - if: github.repository_owner == 'intel' - run: | - set -xe - # Test env - build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - repo="${{ github.repository }}" - TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCH_BRANCH_ID }}" - TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCH_COMMIT_ID }}" - DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.DRIVER_VERSION }}" - KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.KERNEL_VERSION }}" - BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.BUNDLE_VERSION }}" - OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.OS_PRETTY_NAME }}" - GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.GCC_VERSION }}" - TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHBENCH_COMMIT_ID }}" - TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHVISION_COMMIT_ID }}" - TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHAUDIO_COMMIT_ID }}" - TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRANSFORMERS_VERSION }}" - TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TIMM_COMMIT_ID }}" - TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRITON_COMMIT_ID }}" - TIMEOUT_MODELS="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TIMEOUT_MODELS }}" - # Test status - if [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests.result }}" == "success" ];then - test_status=Success - elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests.result }}" == "failure" ];then - test_status=Failure - cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}" - else - test_status=None - exit 0 - fi - # Test Type - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_type="On-demand" - test_issue_id=426 - cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}" - elif [ "${{ github.event.schedule }}" == "0 16 * * 5" ];then - test_type="Weekly" - test_issue_id=432 - else - test_type="Nightly" - test_issue_id=432 - fi - # Test report - echo -e "**${test_status}** $test_type Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt - printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${GITHUB_WORKFLOW_SHA:0:7} on ${GITHUB_REF_NAME} | " >> ${{ github.workspace }}/report.txt - printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt - echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - printf "[${TRANSFORMERS_VERSION:0:7}](https://github.com/huggingface/transformers/commit/${TRANSFORMERS_VERSION:0:7}) | " >> ${{ github.workspace }}/report.txt - printf 
"[${TIMM_COMMIT_ID:0:7}](https://github.com/huggingface/pytorch-image-models/commit/${TIMM_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION | $KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" - if [ "${{ inputs.model }}" != "" ];then - test_scope+="; model=${{ inputs.model }}" - fi - echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt - fi - echo "$TIMEOUT_MODELS" |awk '{printf("%s\\n", $0)}' >> ${{ github.workspace }}/report.txt - echo "$cc_comment" >> ${{ github.workspace }}/report.txt - # Report - report_txt=$(cat ${{ github.workspace }}/report.txt) - gh --repo $repo issue comment $test_issue_id --body "$report_txt" From 7b4582bc48ea9222c1ef416ecdfefa362d46fc06 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 16 Jul 2025 17:06:50 +0800 Subject: [PATCH 004/160] update --- .../actions/inductor-xpu-e2e-test/action.yml | 53 ++--- .github/workflows/_linux_build.yml | 66 ++++-- .github/workflows/_linux_ut.yml | 47 +---- .github/workflows/nightly_ondemand.yml | 197 +++++++++--------- 4 files changed, 167 insertions(+), 196 deletions(-) diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml index 1631f399f2..8f9b90780f 100644 --- a/.github/actions/inductor-xpu-e2e-test/action.yml +++ b/.github/actions/inductor-xpu-e2e-test/action.yml @@ -29,9 +29,6 @@ inputs: type: string default: 'all' description: which cards can be used in the test - hf_token: - required: false - description: HUGGING_FACE_HUB_TOKEN for torchbench test pytorch: required: false type: string @@ -50,23 +47,16 @@ runs: if: ${{ inputs.env_prepare }} shell: bash run: | - source activate e2e_ci if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd ../ && rm -rf audio && git clone --single-branch -b main https://github.com/pytorch/audio.git - cd audio && git checkout $TORCHAUDIO_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchaudio -y && pip install dist/*.whl - cd ../ && rm -rf vision && git clone --single-branch -b main https://github.com/pytorch/vision.git - cd vision && git checkout $TORCHVISION_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchvision -y && pip install dist/*.whl - fi - cd ../ && python -c "import torch, torchvision, torchaudio" - rm -rf benchmark && git clone https://github.com/pytorch/benchmark.git - cd benchmark && git checkout $TORCHBENCH_COMMIT_ID + python -c "import torch, torchvision, torchaudio" + cd ./pytorch + TORCHBENCH_COMMIT_ID=$(cat .github/ci_commit_pins/torchbench.txt) + git clone https://github.com/pytorch/benchmark.git xpu-benchmark 
+ cd xpu-benchmark && git checkout $TORCHBENCH_COMMIT_ID # remove deps which will reinstall torch pip install --no-deps accelerate - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@$TIMM_COMMIT_ID - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/$TIMM_COMMIT_ID/requirements.txt | grep -vE torch) + pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 + pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) pip install -U transformers==4.44.2 sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g' requirements.txt git status && git diff @@ -81,28 +71,17 @@ runs: pip install -U transformers==4.44.2 fi if [[ ${{ inputs.suite }} == *"timm_models"* ]]; then - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd ../ && rm -rf vision && git clone --single-branch -b main https://github.com/pytorch/vision.git - cd vision && git checkout $TORCHVISION_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchvision -y && pip install dist/*.whl - fi # install timm without dependencies - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@$TIMM_COMMIT_ID + pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 # install timm dependencies without torch and torchvision - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/$TIMM_COMMIT_ID/requirements.txt | grep -vE torch) + pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) fi - pip install numpy==1.26.4 + pip list |grep -E 'intel|torch' - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - env: - HUGGING_FACE_HUB_TOKEN: ${{ inputs.hf_token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} shell: bash run: | - source activate e2e_ci - cp .github/scripts/inductor_xpu_test.sh ../pytorch - cd ../pytorch - + cp ./.github/scripts/inductor_xpu_test.sh ./pytorch + cd ./pytorch # check param function contains() { contains_status="echo 'Start $2 ...'" @@ -164,18 +143,16 @@ runs: HUGGING_FACE_HUB_TOKEN: ${{ inputs.hf_token }} shell: bash run: | - cd ../pytorch + cd ./pytorch rm -f inductor_log/summary_accuracy.csv for var in $(find inductor_log/ -name "inductor_*_xpu_accuracy.csv") do sed -i "s/$/,$(basename $var)/" $var cat $var >> inductor_log/summary_accuracy.csv done - - source activate e2e_ci cd ${{ github.workspace }} - cp .github/scripts/inductor_summary.py ../pytorch - cd ../pytorch + cp ./.github/scripts/inductor_summary.py ./pytorch + cd ./pytorch pip install styleFrame scipy pandas set -xe dt=$(echo ${{ inputs.dt }} |sed 's/,/ /g') diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 89f29ac34f..d22b63955a 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -3,6 +3,10 @@ name: Linux PyTorch XPU Build on: workflow_call: inputs: + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel pytorch: type: string default: 'main' @@ -11,14 +15,19 @@ on: type: string default: 'main' description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin - 
python: + triton: + required: false type: string - default: '3.10' - description: Python version + default: 'pinned' + description: Triton pinned by pytorch by default, or 'commit/branch', or 'repo@commit/repo@branch' oneapi: type: string - default: 'host' + default: 'installed' description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + python: + type: string + default: '3.10' + description: Python version runner: required: true type: string @@ -29,6 +38,7 @@ permissions: read-all jobs: build: + if: ${{ ! contains(inputs.test_type, 'wheel') }} runs-on: ${{ inputs.runner }} container: image: 'pytorch/manylinux2_28-builder:xpu-main' @@ -75,7 +85,7 @@ jobs: TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi # oneAPI DLE - if [ "${{ inputs.oneapi }}" != "host" ];then + if [ "${{ inputs.oneapi }}" != "installed" ];then rm -rf ~/intel ~/.intel /opt/intel wget -q -O oneapi.sh "${{ inputs.oneapi }}" bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi @@ -90,29 +100,61 @@ jobs: --PYTORCH_COMMIT="${PYTORCH_COMMIT}" \ --TORCH_XPU_OPS_REPO="${TORCH_XPU_OPS_REPO}" \ --TORCH_XPU_OPS_COMMIT="${TORCH_XPU_OPS_COMMIT}" \ - 2>&1 |tee ${{ github.workspace }}/pytorch_build_${PYTORCH_COMMIT//\//-}.log + 2>&1 |tee ${{ github.workspace }}/build_pytorch_${PYTORCH_COMMIT//\//-}.log + - name: Build Triton + run: | + cd ./pytorch + pip install cmake ninja pybind11 + rm -rf pytorch_triton_xpu-*.whl + if [ "${{ inputs.triton }}" != "pinned" ];then + TRITON_COMMIT_ID="${{ inputs.triton }}" + else + TRITON_COMMIT_ID="$(cat .ci/docker/ci_commit_pins/triton-xpu.txt)" + fi + TRITON_VERSION_NAME="$( + curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ + grep '__version__' |head -n 1 |awk -F "'" '{print $2}' + )" + python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} \ + 2>&1 |tee ${{ github.workspace }}/build_triton_${TRITON_COMMIT_ID}.log + pip install pytorch_triton_xpu-*.whl + cp pytorch_triton_xpu-*.whl ${{ github.workspace }} + - name: Build Torchvision and Torchaudio + run: | + cd ./pytorch + TORCHVISION_COMMIT_ID="$(cat .github/ci_commit_pins/vision.txt)" + TORCHAUDIO_COMMIT_ID="$(cat .github/ci_commit_pins/audio.txt)" + git clone --single-branch -b main https://github.com/pytorch/vision.git xpu-vision + cd xpu-vision && git checkout ${TORCHVISION_COMMIT_ID} + python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_vision_${TRITON_COMMIT_ID}.log + pip install dist/*.whl + cp dist/*.whl ${{ github.workspace }} + git clone --single-branch -b main https://github.com/pytorch/audio.git xpu-audio + cd xpu-audio && git checkout ${TORCHAUDIO_COMMIT_ID} + python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_audio_${TRITON_COMMIT_ID}.log + pip install dist/*.whl + cp dist/*.whl ${{ github.workspace }} - name: Torch Config run: | python -c "import torch; print(torch.__config__.show())" python -c "import torch; print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import triton; print(triton.__version__)" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" python pytorch/torch/utils/collect_env.py - - name: Identify Build version - id: build_version - run: | - echo "TORCH_COMMIT_ID=$(python -c 'import torch; 
print(torch.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" - name: Upload Torch XPU Wheel if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/torch*.whl + path: ${{ github.workspace }}/*.whl - name: Upload Build Log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/pytorch_*.log + path: ${{ github.workspace }}/build_*.log - name: Cleanup if: always() run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 82ef33df1a..a5d543fd9c 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -13,11 +13,6 @@ on: type: string default: 'main' description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin - triton: - required: false - type: string - default: 'pinned' - description: Triton pinned by pytorch by default, or 'commit/branch', or 'repo@commit/repo@branch' oneapi: type: string default: 'installed' @@ -75,7 +70,7 @@ jobs: git clone https://github.com/pytorch/pytorch pytorch source activate xpu_op_${ZE_AFFINITY_MASK} if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install --force-reinstall ${{ github.workspace }}/torch*.whl + pip install --force-reinstall ${{ github.workspace }}/*.whl TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') cd ./pytorch git checkout ${TORCH_COMMIT_ID} @@ -103,25 +98,6 @@ jobs: cd third_party/torch-xpu-ops git checkout ${TORCH_XPU_OPS_COMMIT} fi - - name: Triton Installation - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../pytorch - if [ -z ${{ inputs.triton }} ]; then - TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" - else - TRITON_COMMIT_ID="${{ inputs.triton }}" - fi - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - pip install pytorch_triton_xpu-*.whl - fi - name: Torch Config run: | source activate xpu_op_${ZE_AFFINITY_MASK} @@ -398,7 +374,7 @@ jobs: git clone https://github.com/pytorch/pytorch pytorch source activate xpu_op_${ZE_AFFINITY_MASK} if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install --force-reinstall ${{ github.workspace }}/torch*.whl + pip install --force-reinstall ${{ github.workspace }}/*.whl TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') cd ./pytorch git checkout ${TORCH_COMMIT_ID} @@ -426,25 +402,6 @@ jobs: cd third_party/torch-xpu-ops git checkout ${TORCH_XPU_OPS_COMMIT} fi - - name: Triton Installation - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../pytorch - if [ -z ${{ inputs.triton }} ]; then - TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" - else - TRITON_COMMIT_ID="${{ inputs.triton }}" - fi - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL 
https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - pip install pytorch_triton_xpu-*.whl - fi - name: Torch Config run: | source activate xpu_op_${ZE_AFFINITY_MASK} diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 48b6521829..edc86e7c7f 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -104,7 +104,7 @@ jobs: torch_xpu_ops="main" fi else - if [ "${{ inputs.pytorch }}" == "nightly_wheel" ] || [ "${{ inputs.pytorch }}" == "release_wheel" ];then + if [["${{ inputs.pytorch }}" == *"_wheel"]];then test_type="wheel-ondemand" pytorch="${{ inputs.pytorch }}" torch_xpu_ops="pinned" @@ -119,14 +119,15 @@ jobs: echo "torch_xpu_ops=${torch_xpu_ops}" >> ${GITHUB_OUTPUT} Linux-Nightly-Ondemand-Build: - if: ${{ needs.Conditions-Filter.outputs.pytorch != 'nightly_wheel' && needs.Conditions-Filter.outputs.pytorch != 'release_wheel' }} needs: [Conditions-Filter] name: linux secrets: inherit uses: ./.github/workflows/_linux_build.yml with: + test_type: ${{ needs.Conditions-Filter.outputs.test_type }} pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} + triton: ${{ github.event_name == 'schedule' && 'pinned' || inputs.triton }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} runner: pvc_rolling @@ -137,9 +138,9 @@ jobs: needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_ut.yml with: + test_type: ${{ needs.Conditions-Filter.outputs.test_type }} pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.torch_xpu_ops }} - triton: ${{ github.event_name == 'schedule' && 'pinned' || inputs.triton }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} @@ -160,43 +161,51 @@ jobs: env: AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" GH_TOKEN: ${{ github.token }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} reference_issue: 1645 + test_type: ${{ needs.Conditions-Filter.outputs.test_type }} pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} - torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} - triton: ${{ github.event_name == 'schedule' && 'pinned' || inputs.triton }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} steps: - name: Cleanup workspace run: | - rm -rf ./* || sudo rm -rf ./* + rm -rf ~/.triton /tmp ./* || sudo rm -rf ~/.triton /tmp ./* + mkdir -m 777 /tmp || sudo mkdir -m 777 /tmp - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - with: - path: torch-xpu-ops - name: Setup python ${{ env.python }} uses: actions/setup-python@v5 with: python-version: ${{ env.python }} - name: Prepare Conda ENV run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname 
${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=${{ env.python }} cmake ninja -y - source activate e2e_ci + which python + pip list pip install pandas scipy psutil requests + - name: Install oneAPI DLE + if: ${{ env.oneapi != 'installed' }} + run: | + rm -rf ~/intel ~/.intel /opt/intel + wget -q -O oneapi.sh "${{ env.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} + if: ${{ ! contains(env.test_type, 'wheel') }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + pattern: Torch-XPU-Wheel-* - name: Prepare Stock Pytorch run: | - pwd - cd ../ - rm -rf pytorch || sudo rm -rf pytorch - source activate e2e_ci - pip install --force-reinstall ${{ github.workspace }}/torch*.whl + if [ "${{ env.pytorch }}" == "release_wheel" ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + elif [ "${{ env.pytorch }}" == "test_wheel" ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu + elif [ "${{ env.pytorch }}" == "nightly_wheel" ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + else + pip install --force-reinstall ${{ github.workspace }}/*.whl + fi TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') git clone https://github.com/pytorch/pytorch pytorch cd pytorch @@ -204,141 +213,128 @@ jobs: # apply extra PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py git status && git diff && git show -s - - name: Identify pinned versions - id: pinned + - name: Install deps run: | - source .github/scripts/env.sh - cd ../pytorch - if [ -z ${{ inputs.triton }} ]; then - echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - else - echo "TRITON_COMMIT_ID=${{ inputs.triton }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then + python -c "import torch, torchvision, torchaudio" + cd pytorch + TORCHBENCH_COMMIT_ID=$(cat .github/ci_commit_pins/torchbench.txt) + git clone https://github.com/pytorch/benchmark.git xpu-benchmark + cd xpu-benchmark && git checkout $TORCHBENCH_COMMIT_ID + # remove deps which will reinstall torch + pip install --no-deps accelerate + pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 + pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) + pip install -U transformers==4.44.2 + sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g' requirements.txt + git status && git diff + pip install -r requirements.txt + python install.py --continue_on_fail + # deps for torchrec_dlrm + pip install pyre_extensions + pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu + pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec fi - echo "TORCH_BRANCH_ID=${{ inputs.pytorch }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> 
"${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - . /etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo ${GITHUB_ENV} - - name: Triton Installation - run: | - source activate e2e_ci - cd ../pytorch - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - pip install pytorch_triton_xpu-*.whl - - name: Show GITHUB_ENV + if [[ ${{ inputs.suite }} == *"huggingface"* ]]; then + pip install -U transformers==4.44.2 + fi + if [[ ${{ inputs.suite }} == *"timm_models"* ]]; then + pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 + pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) + fi + - name: Torch Config run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache + printenv + python -c "import torch; print(torch.__config__.show())" + python -c "import torch; print(torch.__config__.parallel_info())" + python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import triton; print(triton.__version__)" + python pytorch/torch/utils/collect_env.py + pip list |grep -E 'torch|intel' + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + source /opt/intel/oneapi/setvars.sh + sycl-ls # Nihglty launch - - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} + - name: Nightly Huggingface Full Test + if: ${{ contains(env.test_type, 'nightly') }} uses: ./.github/actions/inductor-xpu-e2e-test with: - suite: huggingface env_prepare: true + suite: huggingface dt: float32,bfloat16,float16,amp_bf16,amp_fp16 mode: inference,training - scenario: accuracy - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Torchbench BF16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} + scenario: accuracy,performance + - name: Nightly Torchbench BF16 Training Test + if: ${{ contains(env.test_type, 'nightly') }} uses: 
./.github/actions/inductor-xpu-e2e-test with: + env_prepare: true suite: torchbench dt: bfloat16 mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Timm_models FP16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} + scenario: accuracy,performance + - name: Nightly Timm_models FP16 Training Test + if: ${{ contains(env.test_type, 'nightly') }} uses: ./.github/actions/inductor-xpu-e2e-test with: + env_prepare: true suite: timm_models dt: float16 mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + scenario: accuracy,performance - name: Nightly PT2E Full Test - if: ${{ env.run_type == 'nightly' }} + if: ${{ contains(env.test_type, 'nightly') }} uses: ./.github/actions/pt2e with: dt: float32,int8 scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} # Weekly launch - - name: Weekly Huggingface Full Test - if: ${{ env.run_type == 'weekly' }} + - name: Nightly Huggingface Full Test + if: ${{ contains(env.test_type, 'weekly') }} uses: ./.github/actions/inductor-xpu-e2e-test with: - suite: huggingface env_prepare: true + suite: huggingface dt: float32,bfloat16,float16,amp_bf16,amp_fp16 mode: inference,training scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly Torchbench Full Test - if: ${{ env.run_type == 'weekly' }} + - name: Nightly Torchbench BF16 Training Test + if: ${{ contains(env.test_type, 'weekly') }} uses: ./.github/actions/inductor-xpu-e2e-test with: - suite: torchbench env_prepare: true + suite: torchbench dt: float32,bfloat16,float16,amp_bf16,amp_fp16 mode: inference,training scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly Timm_models Full Test - if: ${{ env.run_type == 'weekly' }} + - name: Nightly Timm_models FP16 Training Test + if: ${{ contains(env.test_type, 'weekly') }} uses: ./.github/actions/inductor-xpu-e2e-test with: - suite: timm_models env_prepare: true + suite: timm_models dt: float32,bfloat16,float16,amp_bf16,amp_fp16 mode: inference,training scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly PT2E Full Test - if: ${{ env.run_type == 'weekly' }} + - name: Nightly PT2E Full Test + if: ${{ contains(env.test_type, 'weekly') }} uses: ./.github/actions/pt2e with: - env_prepare: true dt: float32,int8 scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} # On-demand launch - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) if: ${{ github.event_name != 'schedule' && inputs.suite != 'pt2e' }} uses: ./.github/actions/inductor-xpu-e2e-test with: - suite: ${{ inputs.suite }} env_prepare: true + suite: ${{ inputs.suite }} dt: ${{ inputs.dt }} mode: ${{ inputs.mode }} scenario: ${{ inputs.scenario }} - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) if: ${{ github.event_name != 'schedule' && contains(inputs.suite, 'pt2e') }} uses: ./.github/actions/pt2e @@ -346,7 +342,6 @@ jobs: env_prepare: true dt: ${{ inputs.dt }} scenario: ${{ inputs.scenario }} - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - name: Download Reference Artifact id: reference_id @@ -354,10 +349,10 @@ jobs: set -xe source activate e2e_ci conda install gh --channel conda-forge -y - if [ "${{ env.run_type 
}}" == "on-demand" ];then + if [ "${{ env.pytorch }}" == "on-demand" ];then artifact_type="weekly" else - artifact_type="${{ env.run_type }}" + artifact_type="${{ env.pytorch }}" fi REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ --json body -q .body |grep "Inductor-${artifact_type}-LTS-XPU-E2E" |sed 's/.*: *//')" @@ -399,13 +394,13 @@ jobs: if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-${{ env.run_type }}-LTS-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} + name: Inductor-${{ env.pytorch }}-LTS-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/upload_files - name: Upload Reference Run ID - if: ${{ env.run_type != 'on-demand' }} + if: ${{ env.pytorch != 'on-demand' }} run: | gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ - sed "s/Inductor-${{ env.run_type }}-LTS-XPU-E2E:.*/Inductor-${{ env.run_type }}-LTS-XPU-E2E: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt + sed "s/Inductor-${{ env.pytorch }}-LTS-XPU-E2E:.*/Inductor-${{ env.pytorch }}-LTS-XPU-E2E: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt Windows-Nightly-Ondemand-UT-Tests: From 3ae4b09be641601bec791dc5bcd1e87d2d53dc29 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 16 Jul 2025 17:12:16 +0800 Subject: [PATCH 005/160] update --- .github/workflows/nightly_ondemand.yml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index edc86e7c7f..2c52d87dca 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -29,11 +29,6 @@ on: type: string default: 'installed' description: Installed oneAPI DLE on host by default, fill offline.sh url if needed - python: - required: false - type: string - default: '3.10' - description: Python version ut: required: false type: string @@ -129,7 +124,7 @@ jobs: torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} triton: ${{ github.event_name == 'schedule' && 'pinned' || inputs.triton }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} runner: pvc_rolling Linux-Nightly-Ondemand-UT-Tests: @@ -142,7 +137,7 @@ jobs: pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.torch_xpu_ops }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} runner: linux.idc.xpu @@ -166,7 +161,7 @@ jobs: test_type: ${{ needs.Conditions-Filter.outputs.test_type }} pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} steps: - name: Cleanup workspace run: | @@ -410,7 +405,7 @@ jobs: with: torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' 
|| inputs.torch_xpu_ops }} ut: ${{ github.event_name == 'schedule' && 'op_extended,torch_xpu' || inputs.ut }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} src_changed: false has_label: true runner: Windows_CI From fe06ca3aad64f9617e8aa76b156775320d5ccc32 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 16 Jul 2025 18:01:09 +0800 Subject: [PATCH 006/160] update --- .github/workflows/_linux_build.yml | 10 +- .github/workflows/_linux_e2e.yml | 277 +++++++++++++++++++++++++ .github/workflows/_linux_ut.yml | 14 +- .github/workflows/nightly_ondemand.yml | 273 ++---------------------- 4 files changed, 303 insertions(+), 271 deletions(-) create mode 100644 .github/workflows/_linux_e2e.yml diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index d22b63955a..481c9527b2 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -3,6 +3,11 @@ name: Linux PyTorch XPU Build on: workflow_call: inputs: + runner: + required: true + type: string + default: 'pvc_rolling' + description: Runner label test_type: type: string default: 'build-from-source' @@ -28,11 +33,6 @@ on: type: string default: '3.10' description: Python version - runner: - required: true - type: string - default: 'pvc_rolling' - description: Runner label permissions: read-all diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml new file mode 100644 index 0000000000..fb6405cf48 --- /dev/null +++ b/.github/workflows/_linux_e2e.yml @@ -0,0 +1,277 @@ +name: Linux PyTorch XPU Build + +on: + workflow_call: + inputs: + runner: + required: true + type: string + default: 'pvc_rolling' + description: Runner label + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel + pytorch: + type: string + default: 'main' + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + python: + type: string + default: '3.10' + description: Python version + suite: + required: true + type: string + default: 'huggingface' + description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma + dt: + required: true + type: string + default: 'float32' + description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma + mode: + required: true + type: string + default: 'inference' + description: Test mode. `inference,training`. Delimiter is comma + scenario: + required: true + type: string + default: 'accuracy' + description: Test scenario. `accuracy,performance`. Delimiter is comma + model: + required: false + type: string + default: '' + description: Model. 
Will only run this one mode if set + +permissions: read-all + +jobs: + e2e: + runs-on: ${{ inputs.runner }} + timeout-minutes: 3600 + permissions: + issues: write + container: + image: 'xpu:test' + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem --device=/dev/dri --privileged --shm-size=8g + env: + AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" + GH_TOKEN: ${{ github.token }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + reference_issue: 1645 + steps: + - name: Cleanup workspace + run: | + rm -rf ~/.triton /tmp ./* || sudo rm -rf ~/.triton /tmp ./* + mkdir -m 777 /tmp || sudo mkdir -m 777 /tmp + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Setup python ${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} + - name: Prepare Conda ENV + run: | + which python + pip list + pip install pandas scipy psutil requests + - name: Install oneAPI DLE + if: ${{ inputs.oneapi != 'installed' }} + run: | + rm -rf ~/intel ~/.intel /opt/intel + wget -q -O oneapi.sh "${{ inputs.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} + - name: Download Pytorch wheel + if: ${{ ! contains(inputs.test_type, 'wheel') }} + uses: actions/download-artifact@v4 + with: + pattern: Torch-XPU-Wheel-* + - name: Prepare Stock Pytorch + run: | + if [ "${{ inputs.pytorch }}" == "release_wheel" ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + elif [ "${{ inputs.pytorch }}" == "test_wheel" ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu + elif [ "${{ inputs.pytorch }}" == "nightly_wheel" ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + else + pip install --force-reinstall ${{ github.workspace }}/*.whl + fi + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + git clone https://github.com/pytorch/pytorch pytorch + cd pytorch + git checkout ${TORCH_COMMIT_ID} + # apply extra PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git diff && git show -s + - name: Torch Config + run: | + printenv + python -c "import torch; print(torch.__config__.show())" + python -c "import torch; print(torch.__config__.parallel_info())" + python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import triton; print(triton.__version__)" + python pytorch/torch/utils/collect_env.py + pip list |grep -E 'torch|intel' + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + source /opt/intel/oneapi/setvars.sh + sycl-ls + + # Nihglty launch + - name: Nightly Huggingface Full Test + if: ${{ contains(inputs.test_type, 'nightly') }} + uses: ./.github/actions/inductor-xpu-e2e-test + with: + env_prepare: true + suite: huggingface + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + - name: Nightly Torchbench BF16 Training Test + if: ${{ contains(inputs.test_type, 'nightly') }} + uses: ./.github/actions/inductor-xpu-e2e-test + with: + env_prepare: true + suite: torchbench + dt: bfloat16 + mode: training + scenario: accuracy,performance + - name: Nightly Timm_models FP16 Training Test + if: ${{ contains(inputs.test_type, 'nightly') }} + uses: 
./.github/actions/inductor-xpu-e2e-test + with: + env_prepare: true + suite: timm_models + dt: float16 + mode: training + scenario: accuracy,performance + - name: Nightly PT2E Full Test + if: ${{ contains(inputs.test_type, 'nightly') }} + uses: ./.github/actions/pt2e + with: + dt: float32,int8 + scenario: accuracy,performance + + # Weekly launch + - name: Nightly Huggingface Full Test + if: ${{ contains(inputs.test_type, 'weekly') }} + uses: ./.github/actions/inductor-xpu-e2e-test + with: + env_prepare: true + suite: huggingface + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + - name: Nightly Torchbench BF16 Training Test + if: ${{ contains(inputs.test_type, 'weekly') }} + uses: ./.github/actions/inductor-xpu-e2e-test + with: + env_prepare: true + suite: torchbench + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + - name: Nightly Timm_models FP16 Training Test + if: ${{ contains(inputs.test_type, 'weekly') }} + uses: ./.github/actions/inductor-xpu-e2e-test + with: + env_prepare: true + suite: timm_models + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + - name: Nightly PT2E Full Test + if: ${{ contains(inputs.test_type, 'weekly') }} + uses: ./.github/actions/pt2e + with: + dt: float32,int8 + scenario: accuracy,performance + + # On-demand launch + - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) + if: ${{ contains(inputs.test_type, 'ondemand') && inputs.suite != 'pt2e' }} + uses: ./.github/actions/inductor-xpu-e2e-test + with: + env_prepare: true + suite: ${{ inputs.suite }} + dt: ${{ inputs.dt }} + mode: ${{ inputs.mode }} + scenario: ${{ inputs.scenario }} + - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) + if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'pt2e') }} + uses: ./.github/actions/pt2e + with: + env_prepare: true + dt: ${{ inputs.dt }} + scenario: ${{ inputs.scenario }} + + - name: Download Reference Artifact + id: reference_id + run: | + set -xe + source activate e2e_ci + conda install gh --channel conda-forge -y + artifact_type="$(echo ${{ inputs.test_type }} |sed 's/ondemand/weekly/')" + REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ + --json body -q .body |grep "Inductor-${artifact_type}-LTS2" |sed 's/.*: *//' || echo 'n/a')" + gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" && \ + rm -rf reference && mv Inductor-*-XPU-E2E-* reference || echo 'No reference' + - name: Summarize archieve files + id: summary + if: ${{ ! cancelled() }} + run: | + set -x -e -o pipefail + rm -rf ${{ github.workspace }}/upload_files + cp -r ${{ github.workspace }}/pytorch/inductor_log ${{ github.workspace }}/upload_files + mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ + find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days + tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . 
# backup logs + # Print summary + if [ "${{ inputs.suite }}" != 'pt2e' ];then + source activate e2e_ci + bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ + ${{ github.workspace }}/upload_files \ + ${{ github.workspace }}/reference \ + >> ${GITHUB_STEP_SUMMARY} + exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) + if [ ${exit_label} -ne 0 ];then + grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 + echo "There are ${exit_label} cases that need look into!!! Please check them" + exit ${exit_label} + fi + fi + pt2e_summary_csv="$(find ${{ github.workspace }}/upload_files/ -name "summary.csv")" + if [ -f "${pt2e_summary_csv}" ];then + cat ${pt2e_summary_csv} + failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) + if [ ${failed_num} -ne 0 ];then + echo "[Warning] PT2E has failures!" + fi + fi + - name: Upload Inductor XPU E2E Data + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} + path: ${{ github.workspace }}/upload_files + - name: Upload Reference Run ID + if: ${{ ! contains(inputs.test_type, 'ondemand') }} + run: | + gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body 2>&1 |tee new_body.txt 2>&1 + has_or_not="$(grep 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt |wc -l)" + if [ ${has_or_not} -ne 0 ];then + sed -i "s/Inductor-${{ inputs.test_type }}-LTS2:.*/Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}/" new_body.txt + else + echo "Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}" |tee -a new_body.txt + fi + gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index a5d543fd9c..1511d686f4 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -3,6 +3,15 @@ name: Linux UT Test on: workflow_call: inputs: + runner: + required: true + type: string + default: 'linux.idc.xpu' + description: Runner label + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel pytorch: required: false type: string @@ -32,11 +41,6 @@ on: type: string default: '' description: List disabled tests, such as disable_ut or disable_distributed - runner: - required: true - type: string - default: 'linux.idc.xpu' - description: Runner label permissions: read-all diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 2c52d87dca..b1a3754dcf 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -11,17 +11,14 @@ on: workflow_dispatch: inputs: pytorch: - required: false type: string default: 'main' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: - required: false type: string default: 'main' description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin triton: - required: false type: string default: 'pinned' description: Triton pinned by pytorch by default, or 'commit/branch', or 'repo@commit/repo@branch' @@ -30,32 +27,26 @@ on: default: 'installed' description: Installed oneAPI DLE on host by default, fill offline.sh url if needed ut: - required: false type: string default: 'op_regression' description: UT scope. 
`op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_profiling,xpu_distributed`. Delimiter is comma suite: - required: true type: string default: 'huggingface' description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma dt: - required: true type: string default: 'float32' description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma mode: - required: true type: string default: 'inference' description: Test mode. `inference,training`. Delimiter is comma scenario: - required: true type: string default: 'accuracy' description: Test scenario. `accuracy,performance`. Delimiter is comma model: - required: false type: string default: '' description: Model. Will only run this one mode if set @@ -119,13 +110,13 @@ jobs: secrets: inherit uses: ./.github/workflows/_linux_build.yml with: + runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} triton: ${{ github.event_name == 'schedule' && 'pinned' || inputs.triton }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} - runner: pvc_rolling Linux-Nightly-Ondemand-UT-Tests: if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'p') }} @@ -133,270 +124,30 @@ jobs: needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_ut.yml with: + runner: linux.idc.xpu test_type: ${{ needs.Conditions-Filter.outputs.test_type }} pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.torch_xpu_ops }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} - runner: linux.idc.xpu Linux-Nightly-Ondemand-E2E-Tests: - runs-on: pvc_rolling - name: linux / e2e_test + if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} + name: linux needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] - timeout-minutes: 3600 - permissions: - issues: write - container: - image: 'xpu:test' - volumes: - - ${{ github.workspace }}:${{ github.workspace }} - options: --device=/dev/mem --device=/dev/dri --privileged --shm-size=8g - env: - AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" - GH_TOKEN: ${{ github.token }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - reference_issue: 1645 + uses: ./.github/workflows/_linux_e2e.yml + with: + runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} - steps: - - name: Cleanup workspace - run: | - rm -rf ~/.triton /tmp ./* || sudo rm -rf ~/.triton /tmp ./* - mkdir -m 777 /tmp || sudo mkdir -m 777 /tmp - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Setup python ${{ env.python }} - uses: actions/setup-python@v5 - with: - python-version: ${{ env.python }} - - name: Prepare Conda ENV - run: | - which python - pip list - pip install pandas scipy psutil requests - - name: Install oneAPI DLE - if: 
${{ env.oneapi != 'installed' }} - run: | - rm -rf ~/intel ~/.intel /opt/intel - wget -q -O oneapi.sh "${{ env.oneapi }}" - bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi - echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} - - name: Download Pytorch wheel - if: ${{ ! contains(env.test_type, 'wheel') }} - uses: actions/download-artifact@v4 - with: - pattern: Torch-XPU-Wheel-* - - name: Prepare Stock Pytorch - run: | - if [ "${{ env.pytorch }}" == "release_wheel" ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu - elif [ "${{ env.pytorch }}" == "test_wheel" ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu - elif [ "${{ env.pytorch }}" == "nightly_wheel" ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - else - pip install --force-reinstall ${{ github.workspace }}/*.whl - fi - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch - git checkout ${TORCH_COMMIT_ID} - # apply extra PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git diff && git show -s - - name: Install deps - run: | - if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then - python -c "import torch, torchvision, torchaudio" - cd pytorch - TORCHBENCH_COMMIT_ID=$(cat .github/ci_commit_pins/torchbench.txt) - git clone https://github.com/pytorch/benchmark.git xpu-benchmark - cd xpu-benchmark && git checkout $TORCHBENCH_COMMIT_ID - # remove deps which will reinstall torch - pip install --no-deps accelerate - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) - pip install -U transformers==4.44.2 - sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g' requirements.txt - git status && git diff - pip install -r requirements.txt - python install.py --continue_on_fail - # deps for torchrec_dlrm - pip install pyre_extensions - pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu - pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec - fi - if [[ ${{ inputs.suite }} == *"huggingface"* ]]; then - pip install -U transformers==4.44.2 - fi - if [[ ${{ inputs.suite }} == *"timm_models"* ]]; then - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) - fi - - name: Torch Config - run: | - printenv - python -c "import torch; print(torch.__config__.show())" - python -c "import torch; print(torch.__config__.parallel_info())" - python -c "import torch; print(torch.__config__.torch.xpu.device_count())" - python -c "import triton; print(triton.__version__)" - python pytorch/torch/utils/collect_env.py - pip list |grep -E 'torch|intel' - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - source /opt/intel/oneapi/setvars.sh - sycl-ls - - # Nihglty launch - - name: Nightly Huggingface Full Test - if: ${{ contains(env.test_type, 'nightly') }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - env_prepare: true - suite: huggingface - dt: 
float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - - name: Nightly Torchbench BF16 Training Test - if: ${{ contains(env.test_type, 'nightly') }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - env_prepare: true - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy,performance - - name: Nightly Timm_models FP16 Training Test - if: ${{ contains(env.test_type, 'nightly') }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - env_prepare: true - suite: timm_models - dt: float16 - mode: training - scenario: accuracy,performance - - name: Nightly PT2E Full Test - if: ${{ contains(env.test_type, 'nightly') }} - uses: ./.github/actions/pt2e - with: - dt: float32,int8 - scenario: accuracy,performance - - # Weekly launch - - name: Nightly Huggingface Full Test - if: ${{ contains(env.test_type, 'weekly') }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - env_prepare: true - suite: huggingface - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - - name: Nightly Torchbench BF16 Training Test - if: ${{ contains(env.test_type, 'weekly') }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - env_prepare: true - suite: torchbench - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - - name: Nightly Timm_models FP16 Training Test - if: ${{ contains(env.test_type, 'weekly') }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - env_prepare: true - suite: timm_models - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - - name: Nightly PT2E Full Test - if: ${{ contains(env.test_type, 'weekly') }} - uses: ./.github/actions/pt2e - with: - dt: float32,int8 - scenario: accuracy,performance - - # On-demand launch - - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && inputs.suite != 'pt2e' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - env_prepare: true - suite: ${{ inputs.suite }} - dt: ${{ inputs.dt }} - mode: ${{ inputs.mode }} - scenario: ${{ inputs.scenario }} - - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && contains(inputs.suite, 'pt2e') }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: ${{ inputs.dt }} - scenario: ${{ inputs.scenario }} - - - name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - if [ "${{ env.pytorch }}" == "on-demand" ];then - artifact_type="weekly" - else - artifact_type="${{ env.pytorch }}" - fi - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-${artifact_type}-LTS-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - id: summary - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ - find . 
-type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days - tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . # backup logs - # Print summary - if [ "${{ inputs.suite }}" != 'pt2e' ];then - source activate e2e_ci - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} - fi - fi - pt2e_summary_csv="$(find ${{ github.workspace }}/upload_files/ -name "summary.csv")" - if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} - failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) - if [ ${failed_num} -ne 0 ];then - echo "[Warning] PT2E has failures!" - fi - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-${{ env.pytorch }}-LTS-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files - - name: Upload Reference Run ID - if: ${{ env.pytorch != 'on-demand' }} - run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ - sed "s/Inductor-${{ env.pytorch }}-LTS-XPU-E2E:.*/Inductor-${{ env.pytorch }}-LTS-XPU-E2E: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt + suite: ${{ github.event_name == 'schedule' && 'huggingface' || inputs.suite }} + dt: ${{ github.event_name == 'schedule' && 'float32' || inputs.dt }} + mode: ${{ github.event_name == 'schedule' && 'inference' || inputs.mode }} + scenario: ${{ github.event_name == 'schedule' && 'accuracy' || inputs.scenario }} + model: ${{ github.event_name == 'schedule' && '' || inputs.model }} Windows-Nightly-Ondemand-UT-Tests: if: ${{ github.event_name == 'schedule' || inputs.ut != '' }} From be531f753c8eff4eab8943420eb347dc55e2f80e Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 16 Jul 2025 20:57:53 +0800 Subject: [PATCH 007/160] Update nightly_ondemand.yml --- .github/workflows/nightly_ondemand.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index b1a3754dcf..c87e7fae33 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -135,6 +135,7 @@ jobs: Linux-Nightly-Ondemand-E2E-Tests: if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} name: linux + secrets: inherit needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_e2e.yml with: From 1df6138ed15387d8a2c3e220b0708617068dd96f Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 17 Jul 2025 09:03:34 +0800 Subject: [PATCH 008/160] update --- .github/workflows/nightly_ondemand.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index c87e7fae33..b83268f6ec 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -107,7 +107,6 @@ jobs: Linux-Nightly-Ondemand-Build: needs: [Conditions-Filter] name: linux - secrets: 
inherit uses: ./.github/workflows/_linux_build.yml with: runner: pvc_rolling @@ -135,7 +134,8 @@ jobs: Linux-Nightly-Ondemand-E2E-Tests: if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} name: linux - secrets: inherit + permissions: + issues: write needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_e2e.yml with: From 9fe4dcba262ae44359c5b0a2c8af3a58789eac99 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 17 Jul 2025 13:50:51 +0800 Subject: [PATCH 009/160] update --- .github/workflows/nightly_ondemand.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index b83268f6ec..3e461ef82b 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -107,6 +107,7 @@ jobs: Linux-Nightly-Ondemand-Build: needs: [Conditions-Filter] name: linux + secrets: inherit uses: ./.github/workflows/_linux_build.yml with: runner: pvc_rolling From ef919846b3d6c966e57e54c08b275c444155d292 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 17 Jul 2025 17:05:47 +0800 Subject: [PATCH 010/160] update --- .github/workflows/_linux_build.yml | 7 +++++++ .github/workflows/_linux_e2e.yml | 15 +++++++++------ .github/workflows/_windows_ut.yml | 4 ++-- .github/workflows/nightly_ondemand.yml | 9 ++++----- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 481c9527b2..91efb43611 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -37,6 +37,13 @@ on: permissions: read-all jobs: + wheel: + if: ${{ contains(inputs.test_type, 'wheel') }} + name: ${{ inputs.pytorch }} + runs-on: ubuntu-latest + steps: + - name: Use ${{ inputs.pytorch }} + run: echo 'Use ${{ inputs.pytorch }}' build: if: ${{ ! 
contains(inputs.test_type, 'wheel') }} runs-on: ${{ inputs.runner }} diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index fb6405cf48..80a609bc9b 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -1,4 +1,4 @@ -name: Linux PyTorch XPU Build +name: Linux E2E Test on: workflow_call: @@ -53,24 +53,26 @@ on: permissions: read-all jobs: - e2e: + e2e_test: runs-on: ${{ inputs.runner }} timeout-minutes: 3600 permissions: issues: write container: - image: 'xpu:test' + image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} - options: --device=/dev/mem --device=/dev/dri --privileged --shm-size=8g + - /etc/group:/etc/group + options: --device=/dev/mem --device=/dev/dri --privileged --shm-size=8g -u jenkins env: - AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" + AGENT_TOOLSDIRECTORY: "/opt/_tools" GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} reference_issue: 1645 steps: - name: Cleanup workspace run: | + whoami rm -rf ~/.triton /tmp ./* || sudo rm -rf ~/.triton /tmp ./* mkdir -m 777 /tmp || sudo mkdir -m 777 /tmp - name: Checkout torch-xpu-ops @@ -79,9 +81,10 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ inputs.python }} - - name: Prepare Conda ENV + - name: Check python run: | which python + python -V pip list pip install pandas scipy psutil requests - name: Install oneAPI DLE diff --git a/.github/workflows/_windows_ut.yml b/.github/workflows/_windows_ut.yml index ee628792f0..9ca7f7eb8d 100644 --- a/.github/workflows/_windows_ut.yml +++ b/.github/workflows/_windows_ut.yml @@ -8,7 +8,7 @@ on: type: string default: 'main' description: Pytorch branch/commit - keep_torch_xpu_ops: + torch_xpu_ops: required: false type: string default: 'false' @@ -89,7 +89,7 @@ jobs: git status git show -s git submodule sync && git submodule update --init --recursive - if ${{ inputs.keep_torch_xpu_ops }} == 'true' ( + if ${{ inputs.torch_xpu_ops }} == 'pinned' ( echo "Don't replace torch-xpu-ops!" ) else ( echo "Replace torch-xpu-ops!" 
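A note on the bash syntax tightened in the nightly_ondemand.yml hunk below: '[[' is a shell keyword, so it and its operands must be separated by whitespace or bash never evaluates the conditional at all. A minimal standalone sketch (illustrative only, not taken from any workflow in this series; the variable name and value are assumed):

    #!/bin/bash
    pytorch="nightly_wheel"    # assumed example value
    # Correct form: spaces around [[ , the operands, and ]] let bash parse the glob match
    if [[ "${pytorch}" == *"_wheel" ]]; then
        echo "wheel-based run"
    fi
    # Broken form: if [["${pytorch}" == *"_wheel"]];then
    # After expansion bash looks for a command literally named '[[nightly_wheel'
    # and the step fails with "command not found" instead of evaluating the test.
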
diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 3e461ef82b..7df0cfe763 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -90,7 +90,7 @@ jobs: torch_xpu_ops="main" fi else - if [["${{ inputs.pytorch }}" == *"_wheel"]];then + if [[ "${{ inputs.pytorch }}" == *"_wheel" ]];then test_type="wheel-ondemand" pytorch="${{ inputs.pytorch }}" torch_xpu_ops="pinned" @@ -124,7 +124,7 @@ jobs: needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_ut.yml with: - runner: linux.idc.xpu + runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.torch_xpu_ops }} @@ -135,8 +135,7 @@ jobs: Linux-Nightly-Ondemand-E2E-Tests: if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} name: linux - permissions: - issues: write + permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_e2e.yml with: @@ -152,7 +151,7 @@ jobs: model: ${{ github.event_name == 'schedule' && '' || inputs.model }} Windows-Nightly-Ondemand-UT-Tests: - if: ${{ github.event_name == 'schedule' || inputs.ut != '' }} + if: ${{ github.event_name == 'schedule' }} name: Windows-nightly-ondemand uses: ./.github/workflows/_windows_ut.yml with: From 01fbe460f240863e83747470476e8e0c7bc9fd2e Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 17 Jul 2025 17:15:51 +0800 Subject: [PATCH 011/160] update --- .github/workflows/nightly_ondemand.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 7df0cfe763..57800e2eb4 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -28,23 +28,23 @@ on: description: Installed oneAPI DLE on host by default, fill offline.sh url if needed ut: type: string - default: 'op_regression' + default: '' description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_profiling,xpu_distributed`. Delimiter is comma suite: type: string - default: 'huggingface' + default: '' description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma dt: type: string - default: 'float32' + default: '' description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma mode: type: string - default: 'inference' + default: '' description: Test mode. `inference,training`. Delimiter is comma scenario: type: string - default: 'accuracy' + default: '' description: Test scenario. `accuracy,performance`. Delimiter is comma model: type: string From 191b5c046ca68fd2cf35e6231826dae51d0d40c5 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 17 Jul 2025 17:21:39 +0800 Subject: [PATCH 012/160] update --- .github/workflows/_linux_e2e.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 80a609bc9b..4dbe713f0c 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -73,6 +73,7 @@ jobs: - name: Cleanup workspace run: | whoami + sudo chmod 777 . 
-R rm -rf ~/.triton /tmp ./* || sudo rm -rf ~/.triton /tmp ./* mkdir -m 777 /tmp || sudo mkdir -m 777 /tmp - name: Checkout torch-xpu-ops From f313b856074882518689f78f23fbd1578fa69a30 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 09:38:51 +0800 Subject: [PATCH 013/160] update --- .github/workflows/_linux_e2e.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 4dbe713f0c..168b843446 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -72,8 +72,8 @@ jobs: steps: - name: Cleanup workspace run: | - whoami - sudo chmod 777 . -R + hostname && whoami + chmod 777 . -R || sudo chmod 777 . -R rm -rf ~/.triton /tmp ./* || sudo rm -rf ~/.triton /tmp ./* mkdir -m 777 /tmp || sudo mkdir -m 777 /tmp - name: Checkout torch-xpu-ops @@ -102,11 +102,12 @@ jobs: pattern: Torch-XPU-Wheel-* - name: Prepare Stock Pytorch run: | - if [ "${{ inputs.pytorch }}" == "release_wheel" ];then + # install pytorch + if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu - elif [ "${{ inputs.pytorch }}" == "test_wheel" ];then + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" |wc -l) -ne 0 ];then pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu - elif [ "${{ inputs.pytorch }}" == "nightly_wheel" ];then + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" |wc -l) -ne 0 ];then pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu else pip install --force-reinstall ${{ github.workspace }}/*.whl @@ -128,7 +129,7 @@ jobs: python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - source /opt/intel/oneapi/setvars.sh + source ${{ github.workspace }}/.github/scripts/env.sh sycl-ls # Nihglty launch From 7d4488b79aa970c9d72008b3d4af7cdee015f268 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 09:41:37 +0800 Subject: [PATCH 014/160] update --- .github/workflows/_linux_e2e.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 168b843446..ee36264e8e 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -73,9 +73,9 @@ jobs: - name: Cleanup workspace run: | hostname && whoami - chmod 777 . -R || sudo chmod 777 . -R - rm -rf ~/.triton /tmp ./* || sudo rm -rf ~/.triton /tmp ./* - mkdir -m 777 /tmp || sudo mkdir -m 777 /tmp + sudo chmod 777 . -R + sudo rm -rf ~/.triton /tmp ./* + sudo mkdir -m 777 /tmp - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Setup python ${{ inputs.python }} From 66e28dafd114c1c51b51b05b45367cdb360bf05f Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 09:53:52 +0800 Subject: [PATCH 015/160] update --- .github/workflows/_linux_e2e.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index ee36264e8e..4a82ec2eb1 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -73,9 +73,7 @@ jobs: - name: Cleanup workspace run: | hostname && whoami - sudo chmod 777 . 
-R - sudo rm -rf ~/.triton /tmp ./* - sudo mkdir -m 777 /tmp + sudo find ./ /tmp ${HOME}/.triton |grep -vE "^(./|/tmp|${HOME}/.triton)$" |xargs sudo rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Setup python ${{ inputs.python }} From 8b224186ca6486a7c67947c3278c9daf7b171226 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 10:01:17 +0800 Subject: [PATCH 016/160] update --- .github/workflows/_linux_e2e.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 4a82ec2eb1..148f725358 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -65,7 +65,7 @@ jobs: - /etc/group:/etc/group options: --device=/dev/mem --device=/dev/dri --privileged --shm-size=8g -u jenkins env: - AGENT_TOOLSDIRECTORY: "/opt/_tools" + AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} reference_issue: 1645 @@ -115,7 +115,7 @@ jobs: cd pytorch git checkout ${TORCH_COMMIT_ID} # apply extra PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + python ../.github/scripts/apply_torch_pr.py git status && git diff && git show -s - name: Torch Config run: | @@ -233,7 +233,7 @@ jobs: id: summary if: ${{ ! cancelled() }} run: | - set -x -e -o pipefail + set -xe -o pipefail rm -rf ${{ github.workspace }}/upload_files cp -r ${{ github.workspace }}/pytorch/inductor_log ${{ github.workspace }}/upload_files mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ From acf94d148fc4e8a58362e75e20aac6bd0089d904 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 10:28:21 +0800 Subject: [PATCH 017/160] update --- .github/scripts/env.sh | 0 .github/workflows/_linux_build.yml | 2 +- .github/workflows/_linux_e2e.yml | 14 ++++---------- .github/workflows/pull.yml | 2 +- 4 files changed, 6 insertions(+), 12 deletions(-) mode change 100644 => 100755 .github/scripts/env.sh diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh old mode 100644 new mode 100755 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 91efb43611..c4b1b25ef3 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -98,7 +98,7 @@ jobs: bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi export XPU_ONEAPI_PATH="${HOME}/intel/oneapi" fi - source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh + ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh # gcc 11 source /opt/rh/gcc-toolset-11/enable ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 148f725358..04cde983f6 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -69,6 +69,9 @@ jobs: GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} reference_issue: 1645 + defaults: + run: + shell: bash -xe steps: - name: Cleanup workspace run: | @@ -127,7 +130,7 @@ jobs: python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - source ${{ github.workspace }}/.github/scripts/env.sh + ${{ github.workspace }}/.github/scripts/env.sh sycl-ls # Nihglty launch @@ -221,9 +224,6 @@ jobs: - name: Download Reference Artifact id: 
reference_id run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y artifact_type="$(echo ${{ inputs.test_type }} |sed 's/ondemand/weekly/')" REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ --json body -q .body |grep "Inductor-${artifact_type}-LTS2" |sed 's/.*: *//' || echo 'n/a')" @@ -233,15 +233,9 @@ jobs: id: summary if: ${{ ! cancelled() }} run: | - set -xe -o pipefail rm -rf ${{ github.workspace }}/upload_files cp -r ${{ github.workspace }}/pytorch/inductor_log ${{ github.workspace }}/upload_files - mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ - find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days - tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . # backup logs - # Print summary if [ "${{ inputs.suite }}" != 'pt2e' ];then - source activate e2e_ci bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ ${{ github.workspace }}/upload_files \ ${{ github.workspace }}/reference \ diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3f3b1c1b58..5b539f800d 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -169,7 +169,7 @@ jobs: . /etc/os-release echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - source ../torch-xpu-ops/.github/scripts/env.sh + ../torch-xpu-ops/.github/scripts/env.sh echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" From a6fa7da12b08d087fa9dcd312f30c60e154287a0 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 10:47:27 +0800 Subject: [PATCH 018/160] update --- .github/workflows/_linux_e2e.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 04cde983f6..c3bc6d011b 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -71,7 +71,7 @@ jobs: reference_issue: 1645 defaults: run: - shell: bash -xe + shell: bash steps: - name: Cleanup workspace run: | From 053bed35ff20a4accb5dfc6558ebf03052cd11a4 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 10:51:32 +0800 Subject: [PATCH 019/160] update --- .github/workflows/_linux_e2e.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index c3bc6d011b..59806f22da 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -76,7 +76,8 @@ jobs: - name: Cleanup workspace run: | hostname && whoami - sudo find ./ /tmp ${HOME}/.triton |grep -vE "^(./|/tmp|${HOME}/.triton)$" |xargs sudo rm -rf + sudo find ./ /tmp |grep -vE "^(./|/tmp)$" |xargs sudo rm -rf + sudo rm -rf ~/.triton ${HOME}/.triton - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Setup python ${{ inputs.python }} From 7ee8d4bf76e94beb914bfdc01770a0ac2fcdc545 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 10:59:54 +0800 Subject: [PATCH 020/160] update --- .github/workflows/_linux_e2e.yml | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 59806f22da..f162dc590a 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -71,13 +71,14 @@ jobs: reference_issue: 1645 defaults: run: - shell: bash + shell: bash -xe {0} steps: - name: Cleanup workspace run: | hostname && whoami sudo find ./ /tmp |grep -vE "^(./|/tmp)$" |xargs sudo rm -rf sudo rm -rf ~/.triton ${HOME}/.triton + clinfo --list - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Setup python ${{ inputs.python }} From 428e4832f825db449f3930c2e3bc8a9f81d7f0d5 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 11:01:58 +0800 Subject: [PATCH 021/160] update --- .github/workflows/_linux_e2e.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index f162dc590a..34861ce551 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -75,9 +75,10 @@ jobs: steps: - name: Cleanup workspace run: | - hostname && whoami + hostname && whoami && id sudo find ./ /tmp |grep -vE "^(./|/tmp)$" |xargs sudo rm -rf sudo rm -rf ~/.triton ${HOME}/.triton + sudo chmod . 777 -R clinfo --list - name: Checkout torch-xpu-ops uses: actions/checkout@v4 From c483968b50d109fdfbb368fed6a9b5d8b22d26ee Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 11:20:00 +0800 Subject: [PATCH 022/160] update --- .github/workflows/_linux_e2e.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 34861ce551..6a296801e4 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -62,8 +62,7 @@ jobs: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} - - /etc/group:/etc/group - options: --device=/dev/mem --device=/dev/dri --privileged --shm-size=8g -u jenkins + options: --device=/dev/mem --device=/dev/dri --group-add video --group-add 109 --privileged --shm-size=8g -u jenkins:109 env: AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" GH_TOKEN: ${{ github.token }} @@ -78,7 +77,7 @@ jobs: hostname && whoami && id sudo find ./ /tmp |grep -vE "^(./|/tmp)$" |xargs sudo rm -rf sudo rm -rf ~/.triton ${HOME}/.triton - sudo chmod . 777 -R + sudo chmod 777 . -R clinfo --list - name: Checkout torch-xpu-ops uses: actions/checkout@v4 From 32474f8efb83454326f117ada9e42cce472bc278 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 11:24:14 +0800 Subject: [PATCH 023/160] update --- .github/workflows/_linux_e2e.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 6a296801e4..5cd6d1504f 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -77,7 +77,7 @@ jobs: hostname && whoami && id sudo find ./ /tmp |grep -vE "^(./|/tmp)$" |xargs sudo rm -rf sudo rm -rf ~/.triton ${HOME}/.triton - sudo chmod 777 . -R + sudo chmod 777 . 
/__w -R clinfo --list - name: Checkout torch-xpu-ops uses: actions/checkout@v4 From 59760992ee195fd3fa208a4fb65914eedc311d6b Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 11:24:39 +0800 Subject: [PATCH 024/160] update --- .github/workflows/_linux_e2e.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 5cd6d1504f..2c02abeed8 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -77,7 +77,7 @@ jobs: hostname && whoami && id sudo find ./ /tmp |grep -vE "^(./|/tmp)$" |xargs sudo rm -rf sudo rm -rf ~/.triton ${HOME}/.triton - sudo chmod 777 . /__w -R + sudo chmod 777 . /__w -R || true clinfo --list - name: Checkout torch-xpu-ops uses: actions/checkout@v4 From 28e53b287110e7c088f986e43ed9e55bc6b65f4d Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 11:42:38 +0800 Subject: [PATCH 025/160] update --- .github/workflows/_linux_e2e.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 2c02abeed8..f237125867 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -62,7 +62,8 @@ jobs: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} - options: --device=/dev/mem --device=/dev/dri --group-add video --group-add 109 --privileged --shm-size=8g -u jenkins:109 + options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g \ + -u $(id -u):$(getent group render | cut -d ':' -f3) env: AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" GH_TOKEN: ${{ github.token }} From 2e9921e655a1d6cfb4480dcba391dccfc3a63c28 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 11:45:55 +0800 Subject: [PATCH 026/160] update --- .github/workflows/_linux_e2e.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index f237125867..4b224a920c 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -62,8 +62,9 @@ jobs: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} - options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g \ - -u $(id -u):$(getent group render | cut -d ':' -f3) + options: --device=/dev/mem --device=/dev/dri --group-add video + --privileged --shm-size=8g + -u $(id -u):$(getent group render | cut -d ':' -f3) env: AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" GH_TOKEN: ${{ github.token }} From b058b1a605a79cbb45723c43cc0683060c43e42f Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 12:12:26 +0800 Subject: [PATCH 027/160] update --- .github/workflows/_linux_e2e.yml | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 4b224a920c..9e3d77c9b6 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -53,8 +53,22 @@ on: permissions: read-all jobs: - e2e_test: + get_runner: runs-on: ${{ inputs.runner }} + outputs: + test_host: ${{ steps.runner-info.outputs.test_host }} + test_user: ${{ steps.runner-info.outputs.test_user }} + test_group: ${{ steps.runner-info.outputs.test_group }} + steps: + - name: Get runner info + id: runner-info + run: | + echo "test_host=${RUNNER_NAME}" >> 
${GITHUB_OUTPUT} + echo "test_user=$(id -u)" >> ${GITHUB_OUTPUT} + echo "test_group=$(getent group render |cut -d: -f3)" >> ${GITHUB_OUTPUT} + e2e_test: + runs-on: ${{ needs.get_runner.outputs.test_host }} + needs: [get_runner] timeout-minutes: 3600 permissions: issues: write @@ -62,9 +76,8 @@ jobs: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} - options: --device=/dev/mem --device=/dev/dri --group-add video - --privileged --shm-size=8g - -u $(id -u):$(getent group render | cut -d ':' -f3) + options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g + -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} env: AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" GH_TOKEN: ${{ github.token }} @@ -77,9 +90,8 @@ jobs: - name: Cleanup workspace run: | hostname && whoami && id - sudo find ./ /tmp |grep -vE "^(./|/tmp)$" |xargs sudo rm -rf - sudo rm -rf ~/.triton ${HOME}/.triton - sudo chmod 777 . /__w -R || true + find ./ |grep -v "^\./$" |xargs rm -rf + rm -rf ~/.triton ${HOME}/.triton clinfo --list - name: Checkout torch-xpu-ops uses: actions/checkout@v4 From 93e5444128d9422f2c80c0c29135c7248b9e1ecb Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 12:36:05 +0800 Subject: [PATCH 028/160] update --- .github/workflows/_linux_e2e.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 9e3d77c9b6..27ea4d365f 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -55,6 +55,7 @@ permissions: read-all jobs: get_runner: runs-on: ${{ inputs.runner }} + name: ${{ github.runner.name }} outputs: test_host: ${{ steps.runner-info.outputs.test_host }} test_user: ${{ steps.runner-info.outputs.test_user }} @@ -66,6 +67,14 @@ jobs: echo "test_host=${RUNNER_NAME}" >> ${GITHUB_OUTPUT} echo "test_user=$(id -u)" >> ${GITHUB_OUTPUT} echo "test_group=$(getent group render |cut -d: -f3)" >> ${GITHUB_OUTPUT} + + # show host info + cat /etc/os-release + uname -a + source /opt/intel/oneapi/setvars.sh + sycl-ls + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + e2e_test: runs-on: ${{ needs.get_runner.outputs.test_host }} needs: [get_runner] @@ -146,8 +155,6 @@ jobs: python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - ${{ github.workspace }}/.github/scripts/env.sh - sycl-ls # Nihglty launch - name: Nightly Huggingface Full Test From 8baec841fcbb14d3ac09056bd96d748c39cbecee Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 15:25:27 +0800 Subject: [PATCH 029/160] update --- .github/workflows/_linux_e2e.yml | 115 ++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 39 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 27ea4d365f..648ac304af 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -53,9 +53,8 @@ on: permissions: read-all jobs: - get_runner: + get_e2e_runner: runs-on: ${{ inputs.runner }} - name: ${{ github.runner.name }} outputs: test_host: ${{ steps.runner-info.outputs.test_host }} test_user: ${{ steps.runner-info.outputs.test_user }} @@ -64,54 +63,57 @@ jobs: - name: Get runner info id: runner-info run: | + # get test runner echo "test_host=${RUNNER_NAME}" >> ${GITHUB_OUTPUT} echo "test_user=$(id -u)" >> ${GITHUB_OUTPUT} echo "test_group=$(getent group 
render |cut -d: -f3)" >> ${GITHUB_OUTPUT} - # show host info cat /etc/os-release uname -a source /opt/intel/oneapi/setvars.sh sycl-ls dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + - name: Cleanup workspace + if: ${{ always() }} + run: | + # clean docker cache + docker stop $(docker ps -aq) || true + docker system prune -af || true + # clean files + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf e2e_test: - runs-on: ${{ needs.get_runner.outputs.test_host }} - needs: [get_runner] + runs-on: ${{ needs.get_e2e_runner.outputs.test_host }} + needs: get_e2e_runner timeout-minutes: 3600 - permissions: - issues: write container: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} + -u ${{ needs.get_e2e_runner.outputs.test_user }}:${{ needs.get_e2e_runner.outputs.test_group }} env: AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - reference_issue: 1645 defaults: run: shell: bash -xe {0} steps: - - name: Cleanup workspace - run: | - hostname && whoami && id - find ./ |grep -v "^\./$" |xargs rm -rf - rm -rf ~/.triton ${HOME}/.triton - clinfo --list - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Setup python ${{ inputs.python }} + - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: python-version: ${{ inputs.python }} - - name: Check python + - name: Check runner run: | - which python + hostname && whoami && id + clinfo --list + gcc -v && g++ -v + which python && which pip python -V + pip install -U pip wheel setuptools pip list pip install pandas scipy psutil requests - name: Install oneAPI DLE @@ -244,24 +246,61 @@ jobs: dt: ${{ inputs.dt }} scenario: ${{ inputs.scenario }} - - name: Download Reference Artifact - id: reference_id + - name: Get archieve files + if: ${{ ! cancelled() }} + run: | + rm -rf ${{ github.workspace }}/upload_files + cp -r ${{ github.workspace }}/pytorch/inductor_log ${{ github.workspace }}/upload_files + - name: Upload Inductor XPU E2E Data + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} + path: ${{ github.workspace }}/upload_files + + e2e_summary: + runs-on: ubuntu-latest + if: ${{ always() }} + needs: e2e_test + permissions: + issues: write + env: + GH_TOKEN: ${{ github.token }} + REFERENCE_ISSUE_ID: 1645 + steps: + - name: Install gh + run: | + apt-get update + apt-get install gh rsync -y + find ./ |grep -v "^\./$" |xargs rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Download Target Artifact + run: | + target_dir="Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}" + gh --repo intel/torch-xpu-ops run download ${GITHUB_RUN_ID} -n "${target_dir}" + if [ -d "${target_dir}" ];then + rsync -avzq --delete ${target_dir}/ target/ + rm -rf ${target_dir}/ + fi + - name: Download Baseline Artifact run: | artifact_type="$(echo ${{ inputs.test_type }} |sed 's/ondemand/weekly/')" - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ + REFERENCE_RUN_ID="$(gh --repo intel/torch-xpu-ops issue view ${REFERENCE_ISSUE_ID} \ --json body -q .body |grep "Inductor-${artifact_type}-LTS2" |sed 's/.*: *//' || echo 'n/a')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" && \ - rm -rf reference && mv Inductor-*-XPU-E2E-* reference || echo 'No reference' - - name: Summarize archieve files - id: summary + gh --repo intel/torch-xpu-ops run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" || true + baseline_dir="$(find . -name 'Inductor-*-XPU-E2E-*')" + if [ -d "${baseline_dir}" ];then + rsync -avzq --delete ${baseline_dir}/ baseline/ + rm -rf ${baseline_dir}/ + fi + - name: Get summary if: ${{ ! cancelled() }} run: | - rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/pytorch/inductor_log ${{ github.workspace }}/upload_files if [ "${{ inputs.suite }}" != 'pt2e' ];then bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ + ${{ github.workspace }}/target \ + ${{ github.workspace }}/baseline \ >> ${GITHUB_STEP_SUMMARY} exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) if [ ${exit_label} -ne 0 ];then @@ -270,7 +309,7 @@ jobs: exit ${exit_label} fi fi - pt2e_summary_csv="$(find ${{ github.workspace }}/upload_files/ -name "summary.csv")" + pt2e_summary_csv="$(find ${{ github.workspace }}/target/ -name "summary.csv")" if [ -f "${pt2e_summary_csv}" ];then cat ${pt2e_summary_csv} failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) @@ -278,20 +317,18 @@ jobs: echo "[Warning] PT2E has failures!" fi fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files - name: Upload Reference Run ID - if: ${{ ! contains(inputs.test_type, 'ondemand') }} + if: ${{ ! 
contains(inputs.test_type, 'ondemand') && github.repository_owner == 'intel' }} run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body 2>&1 |tee new_body.txt 2>&1 + gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee new_body.txt 2>&1 has_or_not="$(grep 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt |wc -l)" if [ ${has_or_not} -ne 0 ];then sed -i "s/Inductor-${{ inputs.test_type }}-LTS2:.*/Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}/" new_body.txt else echo "Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}" |tee -a new_body.txt fi - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt + gh --repo ${GITHUB_REPOSITORY} issue edit ${REFERENCE_ISSUE_ID} --body-file new_body.txt + - name: Cleanup workspace + if: ${{ always() }} + run: | + find ./ |grep -v "^\./$" |xargs rm -rf From d4c78aa4992c0ef8338c01641d6e10ec0b441dc5 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 15:32:27 +0800 Subject: [PATCH 030/160] update --- .../actions/inductor-xpu-e2e-test/action.yml | 26 +++---------------- .github/workflows/_linux_e2e.yml | 1 + 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml index 8f9b90780f..68a0e5f5a1 100644 --- a/.github/actions/inductor-xpu-e2e-test/action.yml +++ b/.github/actions/inductor-xpu-e2e-test/action.yml @@ -1,14 +1,14 @@ name: inductor-xpu-e2e-test inputs: + env_prepare: + required: false + description: If set to any value, will prepare suite test env suite: required: true type: string default: 'huggingface' description: Dynamo benchmarks test suite. huggingface,timm_models,torchbench. Delimiter is comma - env_prepare: - required: false - description: If set to any value, will prepare suite test env dt: required: true type: string @@ -24,28 +24,12 @@ inputs: type: string default: 'accuracy' description: accuracy,performance. 
Delimiter is comma - cards: - required: false - type: string - default: 'all' - description: which cards can be used in the test - pytorch: - required: false - type: string - default: 'main' - description: Pytorch branch/commit - driver: - required: false - type: string - default: 'lts' - description: Driver lts/rolling runs: using: composite steps: - name: Prepare ENV if: ${{ inputs.env_prepare }} - shell: bash run: | if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then python -c "import torch, torchvision, torchaudio" @@ -78,7 +62,6 @@ runs: fi pip list |grep -E 'intel|torch' - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - shell: bash run: | cp ./.github/scripts/inductor_xpu_test.sh ./pytorch cd ./pytorch @@ -139,9 +122,6 @@ runs: done - name: Summary E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - env: - HUGGING_FACE_HUB_TOKEN: ${{ inputs.hf_token }} - shell: bash run: | cd ./pytorch rm -f inductor_log/summary_accuracy.csv diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 648ac304af..b501baaf3c 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -96,6 +96,7 @@ jobs: AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + MODEL_ONLY_NAME: ${{ inputs.model }} defaults: run: shell: bash -xe {0} From a8154f14d41a762d9b29735fa3309dcdc8cb9c5a Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 15:42:49 +0800 Subject: [PATCH 031/160] update --- .github/workflows/_linux_e2e.yml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index b501baaf3c..45d560c0c1 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -64,9 +64,9 @@ jobs: id: runner-info run: | # get test runner - echo "test_host=${RUNNER_NAME}" >> ${GITHUB_OUTPUT} - echo "test_user=$(id -u)" >> ${GITHUB_OUTPUT} - echo "test_group=$(getent group render |cut -d: -f3)" >> ${GITHUB_OUTPUT} + echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT} + echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} + echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} # show host info cat /etc/os-release uname -a @@ -80,6 +80,7 @@ jobs: docker stop $(docker ps -aq) || true docker system prune -af || true # clean files + ls -al sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf e2e_test: @@ -93,7 +94,7 @@ jobs: options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g -u ${{ needs.get_e2e_runner.outputs.test_user }}:${{ needs.get_e2e_runner.outputs.test_group }} env: - AGENT_TOOLSDIRECTORY: "${{ github.workspace }}/_tools" + AGENT_TOOLSDIRECTORY: /opt/_tools GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} MODEL_ONLY_NAME: ${{ inputs.model }} @@ -101,8 +102,6 @@ jobs: run: shell: bash -xe {0} steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: @@ -117,6 +116,10 @@ jobs: pip install -U pip wheel setuptools pip list pip install pandas scipy psutil requests + ls -al + find ./ |grep -v "^\./$" |xargs rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 - name: Install oneAPI DLE if: ${{ inputs.oneapi != 'installed' }} run: | From 
f25ecfeeec280d77f222f774e15f081633dc31fb Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 15:52:32 +0800 Subject: [PATCH 032/160] update --- .github/actions/inductor-xpu-e2e-test/action.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml index 68a0e5f5a1..904c6c8840 100644 --- a/.github/actions/inductor-xpu-e2e-test/action.yml +++ b/.github/actions/inductor-xpu-e2e-test/action.yml @@ -27,6 +27,9 @@ inputs: runs: using: composite + defaults: + run: + shell: bash -xe {0} steps: - name: Prepare ENV if: ${{ inputs.env_prepare }} @@ -75,7 +78,6 @@ runs: contains_status="continue" } } - set -xe xpu_num=$(clinfo --list |awk 'BEGIN{gpu=0;}{if(gpu==1 && $0~/Platform/){gpu=0;}; if(gpu==1){print $0;}; if($0~/Platform.*Graphics/){gpu=1;}}' |wc -l) cores_per_instance="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk -v i="${xpu_num}" 'BEGIN{sum=1}{sum*=$NF}END{print sum/i}')" export OMP_NUM_THREADS=${cores_per_instance} @@ -134,7 +136,6 @@ runs: cp ./.github/scripts/inductor_summary.py ./pytorch cd ./pytorch pip install styleFrame scipy pandas - set -xe dt=$(echo ${{ inputs.dt }} |sed 's/,/ /g') mode=$(echo ${{ inputs.mode }} |sed 's/,/ /g') suite=$(echo ${{ inputs.suite }} |sed 's/,/ /g') From e06e1bd1c1e5feb1ffc55cd50f83329d5f92e6e7 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 16:01:17 +0800 Subject: [PATCH 033/160] update --- .github/actions/inductor-xpu-e2e-test/action.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml index 904c6c8840..7f7b4f3165 100644 --- a/.github/actions/inductor-xpu-e2e-test/action.yml +++ b/.github/actions/inductor-xpu-e2e-test/action.yml @@ -25,11 +25,12 @@ inputs: default: 'accuracy' description: accuracy,performance. Delimiter is comma +defaults: + run: + shell: bash -xe {0} + runs: using: composite - defaults: - run: - shell: bash -xe {0} steps: - name: Prepare ENV if: ${{ inputs.env_prepare }} From c437f29b1a2adfd32397323390d2fdd535c8a1d4 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 16:15:24 +0800 Subject: [PATCH 034/160] update --- .../actions/inductor-xpu-e2e-test/action.yml | 7 +++--- .github/actions/pt2e/action.yml | 23 ++++--------------- .github/workflows/_linux_e2e.yml | 2 ++ 3 files changed, 9 insertions(+), 23 deletions(-) diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml index 7f7b4f3165..d269ce6d12 100644 --- a/.github/actions/inductor-xpu-e2e-test/action.yml +++ b/.github/actions/inductor-xpu-e2e-test/action.yml @@ -25,15 +25,12 @@ inputs: default: 'accuracy' description: accuracy,performance. 
Delimiter is comma -defaults: - run: - shell: bash -xe {0} - runs: using: composite steps: - name: Prepare ENV if: ${{ inputs.env_prepare }} + shell: bash -xe {0} run: | if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then python -c "import torch, torchvision, torchaudio" @@ -66,6 +63,7 @@ runs: fi pip list |grep -E 'intel|torch' - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) + shell: bash -xe {0} run: | cp ./.github/scripts/inductor_xpu_test.sh ./pytorch cd ./pytorch @@ -125,6 +123,7 @@ runs: done - name: Summary E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) + shell: bash -xe {0} run: | cd ./pytorch rm -f inductor_log/summary_accuracy.csv diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index ac4067e7ce..7343913e7a 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -14,28 +14,19 @@ inputs: type: string default: 'accuracy' description: accuracy,performance. Delimiter is comma - hf_token: - required: false - description: HUGGING_FACE_HUB_TOKEN for torchbench test pytorch: required: false type: string default: 'main' description: Pytorch branch/commit - driver: - required: false - type: string - default: 'lts' - description: Driver lts/rolling runs: using: composite steps: - name: Prepare ENV if: ${{ inputs.env_prepare }} - shell: bash + shell: bash -xe {0} run: | - source activate e2e_ci # accuracy code if [[ "${{ inputs.scenario }}" == *"accuracy"* ]];then rm -rf pt2e-accuracy @@ -65,8 +56,8 @@ runs: cd pt2e-performance # remove deps which will reinstall torch pip install --no-deps accelerate - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@$TIMM_COMMIT_ID - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/$TIMM_COMMIT_ID/requirements.txt | grep -vE torch) + pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 + pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) pip install -U transformers==4.44.2 sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g' requirements.txt git status && git diff @@ -89,14 +80,8 @@ runs: bash valprep.sh fi - name: PT2E Test (${{ inputs.dt }} ${{ inputs.scenario }}) - env: - HUGGING_FACE_HUB_TOKEN: ${{ inputs.hf_token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - shell: bash + shell: bash -xe {0} run: | - source activate e2e_ci - set -xe pt2e_logs_dir="${{ github.workspace }}/../pytorch/inductor_log/pt2e" rm -rf "${pt2e_logs_dir}" && mkdir -p "${pt2e_logs_dir}" echo "Mode,Model,Dtype,Result" |tee ${pt2e_logs_dir}/summary.csv diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 45d560c0c1..5b56b7ded0 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -194,6 +194,7 @@ jobs: if: ${{ contains(inputs.test_type, 'nightly') }} uses: ./.github/actions/pt2e with: + env_prepare: true dt: float32,int8 scenario: accuracy,performance @@ -229,6 +230,7 @@ jobs: if: ${{ contains(inputs.test_type, 'weekly') }} uses: ./.github/actions/pt2e with: + env_prepare: true dt: float32,int8 scenario: accuracy,performance From 0ae0bb1f85fb68f53fa14f83badf096d373ed228 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 16:19:26 +0800 Subject: 
[PATCH 035/160] update --- .github/actions/pt2e/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index 7343913e7a..65fde6a03b 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -92,14 +92,14 @@ runs: do if [[ "${{ inputs.dt }}" == *"float32"* ]];then ${cmd_line} --model_list ${model_name} --is_fp32 2>&1 |tee "${pt2e_logs_dir}/accuracy-float32-${model_name}.log" || true - grep -i 'Acc.1.*Acc.5' "${pt2e_logs_dir}/accuracy-float32-${model_name}.log" |tail -n 1 |awk -v m="${model_name}" ' + (grep -i 'Acc.1.*Acc.5' "${pt2e_logs_dir}/accuracy-float32-${model_name}.log" || echo "failed a failed") 2>&1 |tail -n 1 |awk -v m="${model_name}" ' BEGIN{acc1 = "failed"; acc5 = "failed";} {acc1 = $(NF - 2); acc5 = $NF;} END{printf("Accuracy,%s,float32,%s,%s\n", m, acc1, acc5) }' |tee -a ${pt2e_logs_dir}/summary.csv fi if [[ "${{ inputs.dt }}" == *"int8"* ]];then ${cmd_line} --model_list ${model_name} 2>&1 |tee "${pt2e_logs_dir}/accuracy-int8-${model_name}.log" || true - grep -i 'Acc.1.*Acc.5' "${pt2e_logs_dir}/accuracy-int8-${model_name}.log" |tail -n 1 |awk -v m="${model_name}" ' + (grep -i 'Acc.1.*Acc.5' "${pt2e_logs_dir}/accuracy-int8-${model_name}.log" || echo "failed a failed") 2>&1 |tail -n 1 |awk -v m="${model_name}" ' BEGIN{acc1 = "failed"; acc5 = "failed";} {acc1 = $(NF - 2); acc5 = $NF;} END{printf("Accuracy,%s,int8,%s,%s\n", m, acc1, acc5) }' |tee -a ${pt2e_logs_dir}/summary.csv From d4da95d9593323be1d2e807183fae747135f58ae Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 17:07:57 +0800 Subject: [PATCH 036/160] update --- .github/workflows/_linux_e2e.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 5b56b7ded0..89d002da98 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -276,8 +276,8 @@ jobs: steps: - name: Install gh run: | - apt-get update - apt-get install gh rsync -y + sudo apt-get update + sudo apt-get install gh rsync -y find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 From db17d7db6a8d2c6620dee410fc234f97b12dc9f1 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 17:37:18 +0800 Subject: [PATCH 037/160] update --- .github/workflows/_linux_e2e.yml | 48 +++++++++++++++++++------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 89d002da98..50d9ddc96e 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -265,40 +265,54 @@ jobs: path: ${{ github.workspace }}/upload_files e2e_summary: - runs-on: ubuntu-latest + runs-on: [self-hosted, Linux, X64] if: ${{ always() }} needs: e2e_test permissions: issues: write - env: - GH_TOKEN: ${{ github.token }} - REFERENCE_ISSUE_ID: 1645 + container: + image: ubuntu:24.04 + env: + GH_TOKEN: ${{ github.token }} + REFERENCE_ISSUE_ID: 1645 + defaults: + run: + shell: bash -xe {0} steps: - name: Install gh run: | - sudo apt-get update - sudo apt-get install gh rsync -y - find ./ |grep -v "^\./$" |xargs rm -rf + apt-get update + apt-get install gh rsync -y - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Download Target Artifact run: | target_dir="Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}" - gh --repo intel/torch-xpu-ops 
run download ${GITHUB_RUN_ID} -n "${target_dir}" + gh --repo ${GITHUB_REPOSITORY} run download ${GITHUB_RUN_ID} -n "${target_dir}" if [ -d "${target_dir}" ];then rsync -avzq --delete ${target_dir}/ target/ + ls -al target/ rm -rf ${target_dir}/ + else + echo "No artifacts!" + exit 1 fi - name: Download Baseline Artifact run: | artifact_type="$(echo ${{ inputs.test_type }} |sed 's/ondemand/weekly/')" - REFERENCE_RUN_ID="$(gh --repo intel/torch-xpu-ops issue view ${REFERENCE_ISSUE_ID} \ - --json body -q .body |grep "Inductor-${artifact_type}-LTS2" |sed 's/.*: *//' || echo 'n/a')" - gh --repo intel/torch-xpu-ops run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" || true - baseline_dir="$(find . -name 'Inductor-*-XPU-E2E-*')" - if [ -d "${baseline_dir}" ];then - rsync -avzq --delete ${baseline_dir}/ baseline/ - rm -rf ${baseline_dir}/ + gh --repo intel/torch-xpu-ops issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee body.txt + REFERENCE_RUN_ID="$(cat body.txt |grep "Inductor-${artifact_type}-LTS2" |sed 's/.*: *//' || echo '')" + if [ "${REFERENCE_RUN_ID}" != "" ];then + gh --repo intel/torch-xpu-ops run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" + baseline_dir="$(find . -name 'Inductor-*-XPU-E2E-*' -type d)" + if [ -d "${baseline_dir}" ];then + rsync -avzq --delete ${baseline_dir}/ baseline/ + ls -al baseline/ + rm -rf ${baseline_dir}/ + fi + else + echo "No reference!" + mkdir -p baseline fi - name: Get summary if: ${{ ! cancelled() }} @@ -334,7 +348,3 @@ jobs: echo "Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}" |tee -a new_body.txt fi gh --repo ${GITHUB_REPOSITORY} issue edit ${REFERENCE_ISSUE_ID} --body-file new_body.txt - - name: Cleanup workspace - if: ${{ always() }} - run: | - find ./ |grep -v "^\./$" |xargs rm -rf From b9c247ab80d7665e55998da63943697fc603fb08 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 18:07:43 +0800 Subject: [PATCH 038/160] update --- .github/workflows/_linux_e2e.yml | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 50d9ddc96e..5da82c05b1 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -282,23 +282,18 @@ jobs: - name: Install gh run: | apt-get update - apt-get install gh rsync -y + apt-get install gh rsync ca-certificates -y - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Download Target Artifact run: | + mkdir target/ + cd target/ target_dir="Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}" gh --repo ${GITHUB_REPOSITORY} run download ${GITHUB_RUN_ID} -n "${target_dir}" - if [ -d "${target_dir}" ];then - rsync -avzq --delete ${target_dir}/ target/ - ls -al target/ - rm -rf ${target_dir}/ - else - echo "No artifacts!" - exit 1 - fi - name: Download Baseline Artifact run: | + mkdir baseline/ artifact_type="$(echo ${{ inputs.test_type }} |sed 's/ondemand/weekly/')" gh --repo intel/torch-xpu-ops issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee body.txt REFERENCE_RUN_ID="$(cat body.txt |grep "Inductor-${artifact_type}-LTS2" |sed 's/.*: *//' || echo '')" @@ -310,18 +305,12 @@ jobs: ls -al baseline/ rm -rf ${baseline_dir}/ fi - else - echo "No reference!" - mkdir -p baseline fi - name: Get summary if: ${{ ! 
cancelled() }} run: | if [ "${{ inputs.suite }}" != 'pt2e' ];then - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/target \ - ${{ github.workspace }}/baseline \ - >> ${GITHUB_STEP_SUMMARY} + bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) if [ ${exit_label} -ne 0 ];then grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 @@ -329,7 +318,7 @@ jobs: exit ${exit_label} fi fi - pt2e_summary_csv="$(find ${{ github.workspace }}/target/ -name "summary.csv")" + pt2e_summary_csv="$(find ./target/ -name "summary.csv")" if [ -f "${pt2e_summary_csv}" ];then cat ${pt2e_summary_csv} failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) From 9ae98ea3f46f7f343e60da632294c113bbeb04b9 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 18:31:38 +0800 Subject: [PATCH 039/160] update --- .github/workflows/_linux_build.yml | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 4774e7e327..776c03a59e 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -74,30 +74,6 @@ jobs: uses: actions/checkout@v4 with: path: torch-xpu-ops - - name: Build Triton XPU - run: | - # gcc 13 - dnf install -y gcc-toolset-13-gcc-c++ - source /opt/rh/gcc-toolset-13/enable - dnf install -y zlib-devel - cd ../ && rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch - if [ -z ${{ inputs.triton }} ]; then - TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" - else - TRITON_COMMIT_ID="${{ inputs.triton }}" - fi - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - cp pytorch_triton_xpu-*.whl ${{ github.workspace }} - fi - name: Build Pytorch XPU run: | set -xe -o pipefail @@ -122,7 +98,7 @@ jobs: bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi export XPU_ONEAPI_PATH="${HOME}/intel/oneapi" fi - ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh + source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh # gcc 11 source /opt/rh/gcc-toolset-11/enable ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ @@ -134,6 +110,9 @@ jobs: 2>&1 |tee ${{ github.workspace }}/build_pytorch_${PYTORCH_COMMIT//\//-}.log - name: Build Triton run: | + # gcc 13 + dnf install -y gcc-toolset-13-gcc-c++ zlib-devel + source /opt/rh/gcc-toolset-13/enable cd ./pytorch pip install cmake ninja pybind11 rm -rf pytorch_triton_xpu-*.whl From c06f1eea46da0b703d40e8bfa140f329f957f91e Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 18 Jul 2025 18:42:37 +0800 Subject: [PATCH 040/160] update --- .github/workflows/_linux_ut.yml | 84 ++++++++++++--------------------- 1 file changed, 30 insertions(+), 54 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 29292b6285..1a10201e40 100644 --- 
a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -3,46 +3,38 @@ name: Linux UT Test on: workflow_call: inputs: + runner: + required: true + type: string + description: Runner label + test_type: + required: true + type: string + description: Test scope pytorch: - required: false type: string default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - triton: - required: false + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + oneapi: type: string - default: '' - description: Triton commit. Use pytorch pined commit by default + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + python: + type: string + default: '3.10' + description: Python version ut: required: true type: string - default: '' description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,torch_xpu` Delimiter is comma disabled_tests: - required: false type: string default: '' description: List disabled tests, such as disable_ut or disable_distributed - python: - required: false - type: string - default: '3.10' - description: Python version - runner: - required: true - type: string - default: 'linux.idc.xpu' - description: Runner label - driver: - required: false - type: string - default: 'lts' - description: Driver lts/rolling permissions: read-all @@ -53,8 +45,6 @@ jobs: timeout-minutes: 300 env: GH_TOKEN: ${{ github.token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} ut_skip_issue: 1624 strategy: fail-fast: false @@ -151,7 +141,7 @@ jobs: additional_steps: | pip install pytest pytest-timeout - name: 'xpu_profiling' - condition: ${{ inputs.driver == 'rolling' && contains(inputs.ut, 'xpu_profiling') }} + condition: ${{ contains(inputs.ut, 'xpu_profiling') }} command_script: | # RN50 Test PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0 @@ -200,11 +190,18 @@ jobs: rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME conda create -n $CONDA_ENV_NAME python=${{ inputs.python }} cmake ninja -y source activate $CONDA_ENV_NAME + - name: Install oneAPI DLE + if: ${{ inputs.oneapi != 'installed' }} + run: | + rm -rf ~/intel ~/.intel /opt/intel + wget -q -O oneapi.sh "${{ inputs.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} + if: ${{ ! 
contains(inputs.test_type, 'wheel') }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + pattern: Torch-XPU-Wheel-* - name: Prepare Stock Pytorch run: | cd ../ @@ -344,7 +341,7 @@ jobs: - name: 'torch_xpu' condition: ${{ contains(inputs.ut, 'torch_xpu') }} - name: 'xpu_profiling' - condition: ${{ inputs.driver == 'rolling' && contains(inputs.ut, 'xpu_profiling') }} + condition: ${{ contains(inputs.ut, 'xpu_profiling') }} steps: - name: Get matrix UT value run: | @@ -396,8 +393,6 @@ jobs: timeout-minutes: 60 env: GH_TOKEN: ${{ github.token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} ut_skip_issue: 1624 steps: - name: Checkout torch-xpu-ops @@ -450,25 +445,6 @@ jobs: cd third_party/torch-xpu-ops git checkout ${TORCH_XPU_OPS_COMMIT} fi - - name: Triton Installation - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../pytorch - if [ -z ${{ inputs.triton }} ]; then - TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" - else - TRITON_COMMIT_ID="${{ inputs.triton }}" - fi - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - pip install pytorch_triton_xpu-*.whl - fi - name: Torch Config run: | source activate xpu_op_${ZE_AFFINITY_MASK} From 6e14f8b58306859a4de90ddf107f470f92b2a581 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 21 Jul 2025 10:52:41 +0800 Subject: [PATCH 041/160] update --- .github/workflows/_linux_e2e.yml | 16 +- .github/workflows/_linux_ut.yml | 331 ++++++++++++++++++------------- 2 files changed, 204 insertions(+), 143 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 5da82c05b1..021f906d09 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -108,6 +108,8 @@ jobs: python-version: ${{ inputs.python }} - name: Check runner run: | + ls -al + find ./ |grep -v "^\./$" |xargs rm -rf hostname && whoami && id clinfo --list gcc -v && g++ -v @@ -115,15 +117,14 @@ jobs: python -V pip install -U pip wheel setuptools pip list - pip install pandas scipy psutil requests - ls -al - find ./ |grep -v "^\./$" |xargs rm -rf + uname -a + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Install oneAPI DLE if: ${{ inputs.oneapi != 'installed' }} run: | - rm -rf ~/intel ~/.intel /opt/intel + rm -rf ~/intel ~/.intel wget -q -O oneapi.sh "${{ inputs.oneapi }}" bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} @@ -145,7 +146,12 @@ jobs: pip install --force-reinstall ${{ github.workspace }}/*.whl fi TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - git clone https://github.com/pytorch/pytorch pytorch + if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then + PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" + else + PYTORCH_REPO="https://github.com/pytorch/pytorch.git" + fi + git clone ${PYTORCH_REPO} 
pytorch cd pytorch git checkout ${TORCH_COMMIT_ID} # apply extra PRs for stock pytorch diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 1a10201e40..b905428035 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -39,60 +39,93 @@ on: permissions: read-all jobs: + get_ut_runner: + runs-on: ${{ inputs.runner }} + outputs: + test_host: ${{ steps.runner-info.outputs.test_host }} + test_user: ${{ steps.runner-info.outputs.test_user }} + test_group: ${{ steps.runner-info.outputs.test_group }} + steps: + - name: Get runner info + id: runner-info + run: | + # get test runner + echo "test_host=${NODE_LABEL}" |tee -a ${GITHUB_OUTPUT} + echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} + echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} + # show host info + cat /etc/os-release + uname -a + source /opt/intel/oneapi/setvars.sh + sycl-ls + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + - name: Cleanup workspace + if: ${{ always() }} + run: | + # clean docker cache + # docker stop $(docker ps -aq) || true + docker system prune -af || true + # clean files + ls -al + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf ut_test: - runs-on: ${{ matrix.test.runner || inputs.runner }} + needs: get_ut_runner + runs-on: ${{ matrix.test.runner || needs.get_ut_runner.outputs.test_host }} if: ${{ inputs.ut != 'xpu_distributed' && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 300 + container: + image: mengfeili/intel-pvc-driver:1146-1136 + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g + -u ${{ needs.get_ut_runner.outputs.test_user }}:${{ needs.get_ut_runner.outputs.test_group }} + env: + AGENT_TOOLSDIRECTORY: /opt/_tools + GH_TOKEN: ${{ github.token }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} env: - GH_TOKEN: ${{ github.token }} - ut_skip_issue: 1624 + UT_NAME: ${{ matrix.test.name }} + defaults: + run: + shell: bash -xe {0} strategy: fail-fast: false matrix: test: - name: 'op_regression' condition: ${{ contains(inputs.ut, 'op_regression') }} - directory: 'test/regressions' + directory: 'pytorch/third_party/torch-xpu-ops/test/regressions' command: 'pytest --timeout 600 -v --junit-xml=../../ut_log/op_regression.xml' log_prefix: 'op_regression' - timeout: 8000 - additional_steps: | - clinfo --list - pip install pytest pytest-timeout + timeout: 3600 - name: 'op_regression_dev1' condition: ${{ contains(inputs.ut, 'op_regression_dev1') }} - directory: 'test/regressions' + directory: 'pytorch/third_party/torch-xpu-ops/test/regressions' command: 'pytest --timeout 600 -v test_operation_on_device_1.py --junit-xml=$GITHUB_WORKSPACE/ut_log/op_regression_dev1.xml' log_prefix: 'op_regression_dev1' - timeout: 8000 - additional_steps: | - clinfo --list - unset ZE_AFFINITY_MASK - pip install pytest pytest-timeout - runner: 'pvc_e2e' + timeout: 300 + runner: 'pvc_rolling' - name: 'op_transformers' condition: ${{ contains(inputs.ut, 'op_transformers') }} - directory: '../pytorch' + directory: 'pytorch' command: 'pytest --timeout 600 -v test/test_transformers.py -k xpu --junit-xml=$GITHUB_WORKSPACE/ut_log/op_transformers.xml' log_prefix: 'op_transformers' timeout: 3600 additional_steps: | - pip install pytest pytest-timeout export PYTORCH_TEST_WITH_SLOW=1 - name: 'op_extended' condition: ${{ contains(inputs.ut, 'op_extended') }} - directory: 
'../pytorch/third_party/torch-xpu-ops/test/xpu/extended/' + directory: 'pytorch/third_party/torch-xpu-ops/test/xpu/extended/' command: 'python run_test_with_skip.py' log_prefix: 'op_extended' - timeout: 10000 + timeout: 3600 additional_steps: | - pip install pytest pytest-timeout export PYTORCH_TEST_WITH_SLOW=1 xml_post_processing: | cp op_extended.xml $GITHUB_WORKSPACE/ut_log - name: 'op_ut' condition: ${{ contains(inputs.ut, 'op_ut') }} - directory: '../pytorch/third_party/torch-xpu-ops/test/xpu' + directory: 'pytorch/third_party/torch-xpu-ops/test/xpu' log_prefix: 'op_ut' command_script: | export PYTORCH_ENABLE_XPU_FALLBACK=1 @@ -122,11 +155,9 @@ jobs: 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test_error.log | \ tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test.log cp op_ut_with_only.xml $GITHUB_WORKSPACE/ut_log - additional_steps: | - pip install pytest pytest-timeout - name: 'torch_xpu' condition: ${{ contains(inputs.ut, 'torch_xpu') }} - directory: '../pytorch' + directory: 'pytorch' command_script: | export PYTORCH_TEST_WITH_SLOW=1 export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" @@ -138,11 +169,10 @@ jobs: tee $GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test.log log_prefix: 'torch_xpu' timeout: 10000 - additional_steps: | - pip install pytest pytest-timeout - name: 'xpu_profiling' condition: ${{ contains(inputs.ut, 'xpu_profiling') }} command_script: | + cd torch-xpu-ops # RN50 Test PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0 cp profiling.fp32.train.pt $GITHUB_WORKSPACE/ut_log/profile_test @@ -170,30 +200,36 @@ jobs: python -m pytest --timeout 600 -vs test_profiler_tree.py | \ tee $GITHUB_WORKSPACE/ut_log/profile_test/test_profiler_tree.log additional_steps: | - pip install pytest pytest-timeout mkdir -p ut_log/profile_test/issue_reproduce outputs: ut_name: ${{ steps.set-output.outputs.UT_NAME || '' }} steps: + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} + - name: Check runner + run: | + ls -al + find ./ |grep -v "^\./$" |xargs rm -rf + hostname && whoami && id + clinfo --list + gcc -v && g++ -v + which python && which pip + python -V + pip install -U pip wheel setuptools + pip list + pip install pytest pytest-timeout + uname -a + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Create unique workspace - run: | - # Create unique conda env for each UT test - random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs) - echo "CONDA_ENV_NAME=xpu_op_${ZE_AFFINITY_MASK}_${{ matrix.test.name }}_${random}" >> $GITHUB_ENV - - name: Create Conda Env - run: | - pwd - which conda - conda remove --all -y -n $CONDA_ENV_NAME || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - conda create -n $CONDA_ENV_NAME python=${{ inputs.python }} cmake ninja -y - source activate $CONDA_ENV_NAME + with: + path: torch-xpu-ops - name: Install oneAPI DLE if: ${{ inputs.oneapi != 'installed' }} run: | - rm -rf ~/intel ~/.intel /opt/intel + rm -rf ~/intel ~/.intel wget -q -O oneapi.sh "${{ inputs.oneapi }}" bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} @@ -204,75 +240,71 @@ jobs: pattern: Torch-XPU-Wheel-* - name: Prepare Stock Pytorch run: | - cd ../ - rm -rf ./pytorch || sudo rm -rf ./pytorch - git clone https://github.com/pytorch/pytorch pytorch - source activate $CONDA_ENV_NAME 
- if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ./pytorch - git checkout ${TORCH_COMMIT_ID} - rm -rf vision || sudo rm -rf vision - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. - else + # install pytorch + if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" |wc -l) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" |wc -l) -ne 0 ];then pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ./pytorch - git checkout ${TORCH_COMMIT_ID} + else + pip install --force-reinstall ${{ github.workspace }}/*.whl fi - pip install requests - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git show -s && git status && git diff + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then + PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" + else + PYTORCH_REPO="https://github.com/pytorch/pytorch.git" + fi + git clone ${PYTORCH_REPO} pytorch + cd pytorch + git checkout ${TORCH_COMMIT_ID} pip install -r .ci/docker/requirements-ci.txt + # apply extra PRs for stock pytorch + python ../.github/scripts/apply_torch_pr.py + git status && git diff && git show -s - name: Prepare Torch-xpu-ops run: | - cd ../pytorch + cd pytorch rm -rf third_party/torch-xpu-ops - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cp -r ${{ github.workspace }} third_party + if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then + TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" + TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" else - TORCH_XPU_OPS_COMMIT=$(> "${GITHUB_ENV}" + pip list |grep -E 'torch|intel' - name: Run XPU UT Test if: ${{ matrix.test.condition }} run: | set -e mkdir -p ${{ github.workspace }}/ut_log mkdir -p ${{ github.workspace }}/ut_log/${{ matrix.test.name }} - source activate $CONDA_ENV_NAME echo "Running ${{ matrix.test.name }}" echo "Directory: ${{ matrix.test.directory }}" ${{ matrix.test.additional_steps }} - cd ${{ matrix.test.directory }} - if [[ "${{ matrix.test.name }}" == "op_ut" ]] || [[ "${{ matrix.test.name }}" == "xpu_profiling" ]] || [[ "${{ matrix.test.name }}" == "torch_xpu" ]]; then bash << "SCRIPT" set -e @@ -287,18 +319,11 @@ jobs: - name: UT Test Results Summary if: ${{ matrix.test.condition }} run: | - source activate $CONDA_ENV_NAME pip install junitparser - python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true + python torch-xpu-ops/.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true if [ -e "ut_failure_list.csv" ];then cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv fi - - name: Clean up - if: ${{ always() }} - run: | - if [ -n "$CONDA_ENV_NAME" ]; then - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - fi - name: 
Upload Inductor XPU UT Log if: ${{ matrix.test.condition }} uses: actions/upload-artifact@v4 @@ -323,7 +348,7 @@ jobs: timeout-minutes: 30 env: GH_TOKEN: ${{ github.token }} - ut_skip_issue: 1624 + UT_SKIP_ISSUE: 1624 strategy: fail-fast: false matrix: @@ -370,7 +395,7 @@ jobs: } set -xe cd ${{ github.workspace }}/ut_log/${{ matrix.test.name }} - gh --repo $repo issue view $ut_skip_issue --json body -q .body | sed '/^$/d' > Known_issue.log + gh --repo $repo issue view $UT_SKIP_ISSUE --json body -q .body | sed '/^$/d' > Known_issue.log gh api "repos/${{ github.repository }}/issues?labels=skipped" \ --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' \ > issues.log @@ -393,69 +418,101 @@ jobs: timeout-minutes: 60 env: GH_TOKEN: ${{ github.token }} - ut_skip_issue: 1624 + AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/_tools steps: + - name: Check runner + run: | + ls -al + find ./ |grep -v "^\./$" |xargs rm -rf + sudo find /tmp/ |grep -v "^/tmp/$" |xargs sudo rm -rf + rm -rf ~/.triton ~/.torch + hostname && whoami && id + xpu-smi topology -m + gcc -v && g++ -v + uname -a + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Create Conda Env + with: + path: torch-xpu-ops + - name: Install oneAPI DLE + if: ${{ inputs.oneapi != 'installed' }} run: | - pwd - which conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} + rm -rf ~/intel ~/.intel + wget -q -O oneapi.sh "${{ inputs.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} + if: ${{ ! contains(inputs.test_type, 'wheel') }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + pattern: Torch-XPU-Wheel-* - name: Prepare Stock Pytorch run: | - cd ../ - rm -rf ./pytorch || sudo rm -rf ./pytorch - git clone https://github.com/pytorch/pytorch pytorch - source activate xpu_op_${ZE_AFFINITY_MASK} - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ./pytorch - git checkout ${TORCH_COMMIT_ID} - rm -rf vision || sudo rm -rf vision - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
- else + which python && which pip + python -V + pip install -U pip wheel setuptools + pip list + pip install pytest pytest-timeout + # install pytorch + if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" |wc -l) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" |wc -l) -ne 0 ];then pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ./pytorch - git checkout ${TORCH_COMMIT_ID} + else + pip install --force-reinstall ${{ github.workspace }}/*.whl + fi + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then + PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" + else + PYTORCH_REPO="https://github.com/pytorch/pytorch.git" fi - pip install requests - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git show -s && git status && git diff + git clone ${PYTORCH_REPO} pytorch + cd pytorch + git checkout ${TORCH_COMMIT_ID} pip install -r .ci/docker/requirements-ci.txt + # apply extra PRs for stock pytorch + python ../.github/scripts/apply_torch_pr.py + git status && git diff && git show -s - name: Prepare Torch-xpu-ops run: | - cd ../pytorch + cd pytorch rm -rf third_party/torch-xpu-ops - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cp -r ${{ github.workspace }} third_party + if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then + TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" + TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" else - TORCH_XPU_OPS_COMMIT=$(> "${GITHUB_ENV}" cd ${{ github.workspace }}/ut_log/xpu_distributed - gh --repo $repo issue view $ut_skip_issue --json body -q .body | sed '/^$/d' > Known_issue.log + gh --repo $repo issue view $UT_SKIP_ISSUE --json body -q .body | sed '/^$/d' > Known_issue.log gh api "repos/${{ github.repository }}/issues?labels=skipped" \ --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' \ > issues.log From 9a621c526c5635db786c62fb176361b169a99fc4 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 21 Jul 2025 11:20:12 +0800 Subject: [PATCH 042/160] update --- .github/workflows/_linux_e2e.yml | 39 +++++++- .github/workflows/_linux_ut.yml | 14 ++- .github/workflows/pull.yml | 149 ++----------------------------- 3 files changed, 56 insertions(+), 146 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 021f906d09..c9ae30614e 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -155,7 +155,11 @@ jobs: cd pytorch git checkout ${TORCH_COMMIT_ID} # apply extra PRs for stock pytorch - python ../.github/scripts/apply_torch_pr.py + if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then + python ../.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 + else + python ../.github/scripts/apply_torch_pr.py + fi git status && git diff && git show -s - name: Torch Config run: | @@ -168,6 +172,35 @@ jobs: pip list |grep -E 'torch|intel' dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + # CICD launch + - name: 
Nightly Huggingface BF16 & FP16 Training Test + if: ${{ contains(inputs.test_type, 'cicd') }} + uses: ./.github/actions/inductor-xpu-e2e-test + with: + env_prepare: true + suite: huggingface + dt: bfloat16,float16 + mode: training + scenario: accuracy,performance + - name: Nightly Torchbench BF16 Training Test + if: ${{ contains(inputs.test_type, 'cicd') }} + uses: ./.github/actions/inductor-xpu-e2e-test + with: + env_prepare: true + suite: torchbench + dt: bfloat16 + mode: training + scenario: accuracy,performance + - name: Nightly Timm_models BF16 Training Test + if: ${{ contains(inputs.test_type, 'cicd') }} + uses: ./.github/actions/inductor-xpu-e2e-test + with: + env_prepare: true + suite: timm_models + dt: bfloat16 + mode: training + scenario: accuracy,performance + # Nihglty launch - name: Nightly Huggingface Full Test if: ${{ contains(inputs.test_type, 'nightly') }} @@ -300,7 +333,7 @@ jobs: - name: Download Baseline Artifact run: | mkdir baseline/ - artifact_type="$(echo ${{ inputs.test_type }} |sed 's/ondemand/weekly/')" + artifact_type="$(echo ${{ inputs.test_type }} |sed 's/ondemand/weekly/;s/cicd/weekly/')" gh --repo intel/torch-xpu-ops issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee body.txt REFERENCE_RUN_ID="$(cat body.txt |grep "Inductor-${artifact_type}-LTS2" |sed 's/.*: *//' || echo '')" if [ "${REFERENCE_RUN_ID}" != "" ];then @@ -333,7 +366,7 @@ jobs: fi fi - name: Upload Reference Run ID - if: ${{ ! contains(inputs.test_type, 'ondemand') && github.repository_owner == 'intel' }} + if: ${{ ! (contains(inputs.test_type, 'ondemand') && contains(inputs.test_type, 'cicd')) && github.repository_owner == 'intel' }} run: | gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee new_body.txt 2>&1 has_or_not="$(grep 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt |wc -l)" diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index b905428035..4f3cf67baf 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -45,6 +45,7 @@ jobs: test_host: ${{ steps.runner-info.outputs.test_host }} test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} + ZE_AFFINITY_MASK: ${{ steps.runner-info.outputs.ZE_AFFINITY_MASK }} steps: - name: Get runner info id: runner-info @@ -53,6 +54,7 @@ jobs: echo "test_host=${NODE_LABEL}" |tee -a ${GITHUB_OUTPUT} echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} + echo "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" |tee -a ${GITHUB_OUTPUT} # show host info cat /etc/os-release uname -a @@ -83,6 +85,7 @@ jobs: AGENT_TOOLSDIRECTORY: /opt/_tools GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + ZE_AFFINITY_MASK: ${{ needs.get_ut_runner.outputs.ZE_AFFINITY_MASK }} env: UT_NAME: ${{ matrix.test.name }} defaults: @@ -104,7 +107,8 @@ jobs: command: 'pytest --timeout 600 -v test_operation_on_device_1.py --junit-xml=$GITHUB_WORKSPACE/ut_log/op_regression_dev1.xml' log_prefix: 'op_regression_dev1' timeout: 300 - runner: 'pvc_rolling' + additional_steps: | + unset ZE_AFFINITY_MASK - name: 'op_transformers' condition: ${{ contains(inputs.ut, 'op_transformers') }} directory: 'pytorch' @@ -261,7 +265,11 @@ jobs: git checkout ${TORCH_COMMIT_ID} pip install -r .ci/docker/requirements-ci.txt # apply extra PRs for stock pytorch - python ../.github/scripts/apply_torch_pr.py + if [[ "${{ inputs.test_type 
}}" == *"cicd"* ]];then + python ../.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 + else + python ../.github/scripts/apply_torch_pr.py + fi git status && git diff && git show -s - name: Prepare Torch-xpu-ops run: | @@ -278,7 +286,7 @@ jobs: TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi fi - if [ "${{ inputs.torch_xpu_ops }}" == "cicd" ];then + if [ "${{ inputs.test_type }}" == "cicd" ];then cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops else git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 5b539f800d..2b8e8b803d 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -107,152 +107,21 @@ jobs: needs: [preci-conditions-filter, preci-linux-build] uses: ./.github/workflows/_linux_ut.yml with: - disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }} - ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + test_type: build-cicd + pytorch: main + ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed + disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }} preci-linux-e2e: if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }} name: preci-linux / e2e_test needs: [preci-conditions-filter, preci-linux-build] - runs-on: pvc_e2e - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1645 - timeout-minutes: 300 - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=3.10 cmake ninja -y - source activate e2e_ci - pip install pandas scipy psutil requests - - name: Download Pytorch wheel - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number }} - - name: Install Pytorch XPU - run: | - source activate e2e_ci - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ../ - rm -rf pytorch || sudo rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout ${TORCH_COMMIT_ID} - # apply PRs for stock pytorch - # https://github.com/pytorch/pytorch/pull/152940 internal use only for subset model list - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 - git show -s && git status && git diff - - name: Triton Installation - run: | - source activate e2e_ci - cd ../pytorch - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - python .github/scripts/build_triton_wheel.py --device xpu - pip install pytorch_triton_xpu-*.whl - - name: Identify pinned versions - run: | - cd ../pytorch - echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}" - echo 
"TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}" - . /etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - ../torch-xpu-ops/.github/scripts/env.sh - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - - name: Torch Config - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache - cd .. - source activate e2e_ci - python -c "import triton; print(triton.__version__)" - python pytorch/torch/utils/collect_env.py - - name: Huggingface BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: bfloat16 - mode: training - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Huggingface FP16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: float16 - mode: training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Timm_models BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: bfloat16 - mode: training - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Torchbench BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-weekly-LTS-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files || sudo rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - # Print summary - source activate e2e_ci - export IS_PR=1 - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! 
Please check them" - exit ${exit_label} - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-CI-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files + uses: ./.github/workflows/_linux_e2e.yml + with: + runner: pvc_rolling + test_type: build-cicd + pytorch: main preci-windows: name: preci-windows From bb17babef15542bfbf9e5802ee4f6e2e7b27dc91 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 21 Jul 2025 11:22:13 +0800 Subject: [PATCH 043/160] update --- .github/workflows/_linux_e2e.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index c9ae30614e..b314eacd1c 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -25,22 +25,18 @@ on: default: '3.10' description: Python version suite: - required: true type: string default: 'huggingface' description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma dt: - required: true type: string default: 'float32' description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma mode: - required: true type: string default: 'inference' description: Test mode. `inference,training`. Delimiter is comma scenario: - required: true type: string default: 'accuracy' description: Test scenario. `accuracy,performance`. Delimiter is comma From 981c7442b8c993e451cab560bbaca346ef5ba483 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 21 Jul 2025 11:23:19 +0800 Subject: [PATCH 044/160] update --- .github/workflows/pull.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 2b8e8b803d..90a11b020e 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -116,6 +116,7 @@ jobs: preci-linux-e2e: if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }} name: preci-linux / e2e_test + permissions: write-all needs: [preci-conditions-filter, preci-linux-build] uses: ./.github/workflows/_linux_e2e.yml with: From 6482077c2c82da713d0c55e8f03f25af12fb6d7c Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 21 Jul 2025 11:42:33 +0800 Subject: [PATCH 045/160] update --- .github/scripts/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index f10f095934..4545fc3bfa 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -51,7 +51,7 @@ python -m pip install -r requirements.txt python -m pip install mkl-static mkl-include export USE_STATIC_MKL=1 export USE_XCCL=1 -if [ "${XPU_ONEAPI_PATH}" != "" ];then +if [ "${XPU_ONEAPI_PATH}" == "" ];then export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ intel-cmplr-lib-rt==2025.1.1 | \ intel-cmplr-lib-ur==2025.1.1 | \ From ec0c1f231688c1ba68636af8f80bcc1f30196b6e Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 21 Jul 2025 11:53:52 +0800 Subject: [PATCH 046/160] update --- .github/workflows/_linux_ut.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 4f3cf67baf..ece9c2647b 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -45,7 +45,6 @@ jobs: test_host: ${{ steps.runner-info.outputs.test_host }} test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ 
steps.runner-info.outputs.test_group }} - ZE_AFFINITY_MASK: ${{ steps.runner-info.outputs.ZE_AFFINITY_MASK }} steps: - name: Get runner info id: runner-info @@ -54,7 +53,6 @@ jobs: echo "test_host=${NODE_LABEL}" |tee -a ${GITHUB_OUTPUT} echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} - echo "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" |tee -a ${GITHUB_OUTPUT} # show host info cat /etc/os-release uname -a @@ -72,7 +70,7 @@ jobs: sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf ut_test: needs: get_ut_runner - runs-on: ${{ matrix.test.runner || needs.get_ut_runner.outputs.test_host }} + runs-on: ${{ needs.get_ut_runner.outputs.test_host }} if: ${{ inputs.ut != 'xpu_distributed' && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 300 container: @@ -85,7 +83,7 @@ jobs: AGENT_TOOLSDIRECTORY: /opt/_tools GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - ZE_AFFINITY_MASK: ${{ needs.get_ut_runner.outputs.ZE_AFFINITY_MASK }} + ZE_AFFINITY_MASK: ${{ env.ZE_AFFINITY_MASK }} env: UT_NAME: ${{ matrix.test.name }} defaults: From 1cc986eb2db3d1eaecc1f4d70e8ed7959d7ec773 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 21 Jul 2025 12:39:06 +0800 Subject: [PATCH 047/160] update --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index ece9c2647b..8435ab972e 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -79,11 +79,11 @@ jobs: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g -u ${{ needs.get_ut_runner.outputs.test_user }}:${{ needs.get_ut_runner.outputs.test_group }} + -e ZE_AFFINITY_MASK env: AGENT_TOOLSDIRECTORY: /opt/_tools GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - ZE_AFFINITY_MASK: ${{ env.ZE_AFFINITY_MASK }} env: UT_NAME: ${{ matrix.test.name }} defaults: From b2b48c54efd38140ceb35d4cb503344c2b1487cf Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 21 Jul 2025 14:07:58 +0800 Subject: [PATCH 048/160] update --- .github/workflows/_linux_e2e.yml | 1 + .github/workflows/_linux_ut.yml | 1 + .github/workflows/nightly_ondemand.yml | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index b314eacd1c..8d71a75eeb 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -115,6 +115,7 @@ jobs: pip list uname -a dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + pip install pandas psutil scipy requests - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Install oneAPI DLE diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 8435ab972e..150580452a 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -224,6 +224,7 @@ jobs: pip install pytest pytest-timeout uname -a dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + pip install pandas psutil scipy requests - name: Checkout torch-xpu-ops uses: actions/checkout@v4 with: diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 57800e2eb4..7b1c0bb685 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -124,7 +124,7 @@ jobs: needs: [Conditions-Filter, 
Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_ut.yml with: - runner: pvc_rolling + runner: linux.idc.xpu test_type: ${{ needs.Conditions-Filter.outputs.test_type }} pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.torch_xpu_ops }} From bccec9320ee8db8824010fc5d6d7f0345a9a57e5 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 21 Jul 2025 15:10:31 +0800 Subject: [PATCH 049/160] update --- .github/workflows/_linux_e2e.yml | 5 +++++ .github/workflows/_linux_ut.yml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 8d71a75eeb..1a3f729504 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -309,6 +309,7 @@ jobs: container: image: ubuntu:24.04 env: + AGENT_TOOLSDIRECTORY: /tmp/_tools GH_TOKEN: ${{ github.token }} REFERENCE_ISSUE_ID: 1645 defaults: @@ -319,6 +320,10 @@ jobs: run: | apt-get update apt-get install gh rsync ca-certificates -y + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Download Target Artifact diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 150580452a..0f6dbf3dcb 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -50,7 +50,7 @@ jobs: id: runner-info run: | # get test runner - echo "test_host=${NODE_LABEL}" |tee -a ${GITHUB_OUTPUT} + echo "test_host=${RUNNER_NAME%-*}" |tee -a ${GITHUB_OUTPUT} echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} # show host info From 9f604a77c3912c486f7941b4dc44ed146fe5efe1 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 21 Jul 2025 16:00:55 +0800 Subject: [PATCH 050/160] update --- .github/workflows/_linux_e2e.yml | 3 ++- .github/workflows/_linux_ut.yml | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 1a3f729504..0980acd111 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -307,7 +307,7 @@ jobs: permissions: issues: write container: - image: ubuntu:24.04 + image: ubuntu:latest env: AGENT_TOOLSDIRECTORY: /tmp/_tools GH_TOKEN: ${{ github.token }} @@ -350,6 +350,7 @@ jobs: - name: Get summary if: ${{ ! 
cancelled() }} run: | + pip install pandas requests if [ "${{ inputs.suite }}" != 'pt2e' ];then bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 0f6dbf3dcb..260203acfd 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -265,9 +265,9 @@ jobs: pip install -r .ci/docker/requirements-ci.txt # apply extra PRs for stock pytorch if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then - python ../.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 else - python ../.github/scripts/apply_torch_pr.py + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py fi git status && git diff && git show -s - name: Prepare Torch-xpu-ops @@ -486,7 +486,7 @@ jobs: git checkout ${TORCH_COMMIT_ID} pip install -r .ci/docker/requirements-ci.txt # apply extra PRs for stock pytorch - python ../.github/scripts/apply_torch_pr.py + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py git status && git diff && git show -s - name: Prepare Torch-xpu-ops run: | From 8a78c7c0ca929d4fd107867a6eaa9dd159ee7490 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 08:42:36 +0800 Subject: [PATCH 051/160] update --- .../workflows/nightly_ondemand_rolling.yml | 460 ------------------ .github/workflows/nightly_ondemand_whl.yml | 396 --------------- 2 files changed, 856 deletions(-) delete mode 100644 .github/workflows/nightly_ondemand_rolling.yml delete mode 100644 .github/workflows/nightly_ondemand_whl.yml diff --git a/.github/workflows/nightly_ondemand_rolling.yml b/.github/workflows/nightly_ondemand_rolling.yml deleted file mode 100644 index 03101ebf3a..0000000000 --- a/.github/workflows/nightly_ondemand_rolling.yml +++ /dev/null @@ -1,460 +0,0 @@ -name: Nightly-OnDemand Tests Rolling - -on: - schedule: - # GMT+8 21:30 every workday - - cron: '30 13 * * 0-4' - # GMT+8 0:30 Saturday - - cron: '30 16 * * 5' - workflow_dispatch: - inputs: - pytorch: - required: false - type: string - default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false - type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - ut: - required: false - type: string - default: 'torch_xpu' - description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,torch_xpu,xpu_profiling`. Delimiter is comma - triton: - required: false - type: string - default: '' - description: Triton commit. Use pytorch pined commit by default - suite: - required: true - type: string - default: 'huggingface' - description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma - dt: - required: true - type: string - default: 'float32' - description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma - mode: - required: true - type: string - default: 'inference' - description: Test mode. `inference,training`. Delimiter is comma - scenario: - required: true - type: string - default: 'accuracy' - description: Test scenario. `accuracy,performance`. Delimiter is comma - model: - required: false - type: string - default: '' - description: Model. 
Will only run this one mode if set - python: - required: false - type: string - default: '3.10' - description: Python version - -permissions: read-all - -concurrency: - group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.ut }}-${{ inputs.triton }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: ${{ github.event_name != 'schedule' }} - -jobs: - Linux-Nightly-Ondemand-Build-Rolling: - if: ${{ github.repository_owner == 'intel' }} - name: linux-nightly-ondemand-rolling - secrets: inherit - uses: ./.github/workflows/_linux_build.yml - with: - pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }} - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - driver: rolling - runner: pvc_rolling - - Linux-Nightly-Ondemand-UT-Tests-Rolling: - if: ${{ github.event_name == 'schedule' || inputs.ut != '' }} - name: linux-nightly-ondemand-rolling - needs: Linux-Nightly-Ondemand-Build-Rolling - uses: ./.github/workflows/_linux_ut.yml - with: - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-Rolling.outputs.torch_commit_id }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }} - driver: rolling - runner: pvc_rolling - - Linux-Nightly-Ondemand-OP-Microbench-Tests-Rolling: - name: linux-nightly-ondemand-rolling / Op_microbench - permissions: - issues: write - needs: Linux-Nightly-Ondemand-Build-Rolling - uses: ./.github/workflows/_linux_op_benchmark.yml - with: - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-Rolling.outputs.torch_commit_id }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }} - driver: rolling - runner: pvc_rolling - - Linux-Nightly-Ondemand-E2E-Tests-Rolling: - runs-on: pvc_rolling - name: linux-nightly-ondemand-rolling / e2e_test - needs: Linux-Nightly-Ondemand-Build-Rolling - timeout-minutes: 3600 - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1645 - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-Rolling.outputs.torch_commit_id }} - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - NEOReadDebugKeys: 1 - DisableScratchPages: 1 - run_type: ${{ (github.event_name == 'schedule' && (github.event.schedule == '30 16 * * 5' && 'weekly' || 'nightly')) || 'on-demand' }} - outputs: - TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }} - TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }} - DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} - KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} - BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} - OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} - GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} 
- TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} - TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} - TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} - TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} - TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} - TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }} - TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }} - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=${{ env.python }} cmake ninja -y - source activate e2e_ci - pip install pandas scipy psutil requests - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - - name: Prepare Stock Pytorch - run: | - pwd - cd ../ - rm -rf pytorch || sudo rm -rf pytorch - source activate e2e_ci - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch - git checkout ${TORCH_COMMIT_ID} - # apply extra PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git diff && git show -s - - name: Identify pinned versions - id: pinned - run: | - source .github/scripts/env.sh - cd ../pytorch - if [ -z ${{ inputs.triton }} ]; then - echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - else - echo "TRITON_COMMIT_ID=${{ inputs.triton }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - fi - echo "TORCH_BRANCH_ID=${{ inputs.pytorch }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - . 
/etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo ${GITHUB_ENV} - - name: Triton Installation - run: | - source activate e2e_ci - cd ../pytorch - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - pip install pytorch_triton_xpu-*.whl - - name: Show GITHUB_ENV - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache - - # Nihglty launch - - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16 - mode: inference,training - scenario: accuracy - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Nightly Torchbench BF16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Nightly Timm_models FP16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: float16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Nightly PT2E Full Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/pt2e - with: - dt: float32,int8 - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - # Weekly launch - - name: Weekly Huggingface Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Weekly Torchbench Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Weekly Timm_models Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Weekly PT2E Accuracy Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: float32,int8 - scenario: accuracy,performance - hf_token: ${{ 
secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - # On-demand launch - - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && inputs.suite != 'pt2e' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: ${{ inputs.suite }} - env_prepare: true - dt: ${{ inputs.dt }} - mode: ${{ inputs.mode }} - scenario: ${{ inputs.scenario }} - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && contains(inputs.suite, 'pt2e') }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: ${{ inputs.dt }} - scenario: ${{ inputs.scenario }} - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - if [ "${{ env.run_type }}" == "on-demand" ];then - artifact_type="weekly" - else - artifact_type="${{ env.run_type }}" - fi - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-${artifact_type}-Rolling-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - id: summary - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ - find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days - tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . # backup logs - # Print summary - if [ "${{ inputs.suite }}" != 'pt2e' ];then - source activate e2e_ci - export LTS_OR_ROLLING='rolling' - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} - fi - fi - pt2e_summary_csv="$(find ${{ github.workspace }}/upload_files/ -name "summary.csv")" - if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} - failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) - if [ ${failed_num} -ne 0 ];then - echo "[Warning] PT2E has failures!" - fi - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! 
cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-${{ env.run_type }}-Rolling-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files - - name: Upload Reference Run ID - if: ${{ env.run_type != 'on-demand' }} - run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ - sed "s/Inductor-${{ env.run_type }}-Rolling-XPU-E2E:.*/Inductor-${{ env.run_type }}-Rolling-XPU-E2E: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt - - Tests-Failure-And-Report: - if: ${{ ! cancelled() }} - runs-on: [ self-hosted, Linux ] - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - needs: Linux-Nightly-Ondemand-E2E-Tests-Rolling - steps: - - name: Report github issue for XPU OPS nightly - if: github.repository_owner == 'intel' - run: | - set -xe - # Test env - build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - repo="${{ github.repository }}" - TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCH_BRANCH_ID }}" - TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCH_COMMIT_ID }}" - KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.KERNEL_VERSION }}" - DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.DRIVER_VERSION }}" - BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.BUNDLE_VERSION }}" - OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.OS_PRETTY_NAME }}" - GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.GCC_VERSION }}" - TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCHBENCH_COMMIT_ID }}" - TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCHVISION_COMMIT_ID }}" - TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCHAUDIO_COMMIT_ID }}" - TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TRANSFORMERS_VERSION }}" - TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TIMM_COMMIT_ID }}" - TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TRITON_COMMIT_ID }}" - TIMEOUT_MODELS="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TIMEOUT_MODELS }}" - # Test status - if [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.result }}" == "success" ];then - test_status=Success - elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.result }}" == "failure" ];then - test_status=Failure - cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}" - else - test_status=None - exit 0 - fi - # Test Type - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_type="On-demand" - test_issue_id=426 - cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}" - elif [ "${{ github.event.schedule }}" == "30 16 * * 5" ];then - test_type="Weekly" - test_issue_id=432 - else - test_type="Nightly" - test_issue_id=432 - fi - # Test report - echo -e "**${test_status}** $test_type Rolling Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt - printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${GITHUB_WORKFLOW_SHA:0:7} on ${GITHUB_REF_NAME} | " >> ${{ github.workspace }}/report.txt - printf 
"[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt - echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - printf "[${TRANSFORMERS_VERSION:0:7}](https://github.com/huggingface/transformers/commit/${TRANSFORMERS_VERSION:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TIMM_COMMIT_ID:0:7}](https://github.com/huggingface/pytorch-image-models/commit/${TIMM_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | rolling-$DRIVER_VERSION |$KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" - if [ "${{ inputs.model }}" != "" ];then - test_scope+="; model=${{ inputs.model }}" - fi - echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt - fi - echo "$TIMEOUT_MODELS" |awk '{printf("%s\\n", $0)}' >> ${{ github.workspace }}/report.txt - echo "$cc_comment" >> ${{ github.workspace }}/report.txt - # Report - report_txt=$(cat ${{ github.workspace }}/report.txt) - gh --repo $repo issue comment $test_issue_id --body "$report_txt" diff --git a/.github/workflows/nightly_ondemand_whl.yml b/.github/workflows/nightly_ondemand_whl.yml deleted file mode 100644 index 23f5456f28..0000000000 --- a/.github/workflows/nightly_ondemand_whl.yml +++ /dev/null @@ -1,396 +0,0 @@ -name: Torch Nightly WHL Tests - -on: - schedule: - # GMT+8 21:00 every workday - - cron: '0 14 * * 0-4' - # GMT+8 0:00 Saturday - - cron: '0 17 * * 5' - workflow_dispatch: - inputs: - pytorch: - required: false - type: string - default: 'nightly' - description: Pytorch branch/commit - ut: - required: false - type: string - default: 'torch_xpu' - description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,torch_xpu,xpu_profiling`. Delimiter is comma - suite: - required: true - type: string - default: 'huggingface' - description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma - dt: - required: true - type: string - default: 'float32' - description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma - mode: - required: true - type: string - default: 'inference' - description: Test mode. `inference,training`. Delimiter is comma - scenario: - required: true - type: string - default: 'accuracy' - description: Test scenario. `accuracy,performance`. 
Delimiter is comma - model: - required: false - type: string - default: '' - description: Model. Will only run this one mode if set - python: - required: false - type: string - default: '3.10' - description: Python version - -permissions: read-all - -concurrency: - group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.ut }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: ${{ github.event_name != 'schedule' }} - -jobs: - Linux-Nightly-Ondemand-UT-WHL-Tests: - if: ${{ (github.event_name == 'schedule' || inputs.ut != '') && github.repository_owner == 'intel' }} - uses: ./.github/workflows/_linux_ut.yml - with: - ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - pytorch: nightly_wheel - runner: linux.idc.xpu - - Linux-Nightly-Ondemand-E2E-WHL-Tests: - runs-on: pvc_e2e - if: ${{ github.repository_owner == 'intel' }} - timeout-minutes: 3600 - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1645 - pytorch: ${{ github.event_name == 'schedule' && 'nightly' || inputs.pytorch }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - run_type: ${{ (github.event_name == 'schedule' && (github.event.schedule == '0 17 * * 5' && 'weekly' || 'nightly')) || 'on-demand' }} - outputs: - TORCH_BRANCH_ID: ${{ steps.installed.outputs.TORCH_BRANCH_ID }} - TORCH_COMMIT_ID: ${{ steps.installed.outputs.TORCH_COMMIT_ID }} - TORCH_XPU_OPS_COMMIT: ${{ steps.installed.outputs.TORCH_XPU_OPS_COMMIT }} - TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} - TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} - TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} - TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} - TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} - TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }} - DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} - KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} - BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} - OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} - GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} - TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }} - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=${{ env.python }} cmake ninja -y - source activate e2e_ci - pip install pandas scipy psutil requests - - name: Prepare Stock Pytorch - id: installed - run: | - pwd - cd ../ - source activate e2e_ci - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - echo "TORCH_BRANCH_ID=$(python -c 'import torch; print(torch.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=${TORCH_COMMIT_ID}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - rm -rf pytorch || sudo rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout ${TORCH_COMMIT_ID} - # apply PRs for 
stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git diff && git show -s - - name: Identify pinned versions - id: pinned - run: | - source activate e2e_ci - source .github/scripts/env.sh - echo "TORCHVISION_COMMIT_ID=$(python -c 'import torchvision; print(torchvision.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(python -c 'import torchaudio; print(torchaudio.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRITON_COMMIT_ID=$(python -c 'import triton; print(triton.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - cd ../pytorch - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(pip list |grep cmplr |head -n 1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - . /etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo ${GITHUB_ENV} - - name: Show GITHUB_ENV - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache - - # Nihglty launch - - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Torchbench BF16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy - pytorch: nightly_wheel - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Timm_models FP16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: float16 - mode: training - scenario: accuracy - pytorch: nightly_wheel - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly PT2E Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/pt2e - with: - dt: float32,int8 - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - env_prepare: true - - # Weekly launch - - name: Weekly Huggingface Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - pytorch: 
nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly Torchbench Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly Timm_models Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly PT2E Accuracy Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: float32,int8 - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - # On-demand launch - - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && inputs.suite != 'pt2e' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: ${{ inputs.suite }} - env_prepare: true - dt: ${{ inputs.dt }} - mode: ${{ inputs.mode }} - scenario: ${{ inputs.scenario }} - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && contains(inputs.suite, 'pt2e') }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: ${{ inputs.dt }} - scenario: ${{ inputs.scenario }} - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - if [ "${{ env.run_type }}" == "on-demand" ];then - artifact_type="weekly" - else - artifact_type="${{ env.run_type }}" - fi - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-${artifact_type}-Pre-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - id: summary - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ - find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days - tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . 
# backup logs - # Print summary - if [ "${{ inputs.suite }}" != 'pt2e' ];then - source activate e2e_ci - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} - fi - fi - pt2e_summary_csv="$(find ${{ github.workspace }}/upload_files/ -name "summary.csv")" - if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} - failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) - if [ ${failed_num} -ne 0 ];then - echo "[Warning] PT2E has failures!" - fi - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-${{ env.run_type }}-Pre-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files - - name: Upload Reference Run ID - if: ${{ env.run_type != 'on-demand' }} - run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ - sed "s/Inductor-${{ env.run_type }}-Pre-XPU-E2E:.*/Inductor-${{ env.run_type }}-Pre-XPU-E2E: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt - - Tests-Failure-And-Report: - if: ${{ ! cancelled() }} - runs-on: [ self-hosted, Linux ] - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - needs: Linux-Nightly-Ondemand-E2E-WHL-Tests - steps: - - name: Report github issue for XPU OPS nightly - if: github.repository_owner == 'intel' - run: | - set -xe - # Test env - build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - repo="${{ github.repository }}" - TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_BRANCH_ID }}" - TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_COMMIT_ID }}" - TORCH_XPU_OPS_COMMIT="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_XPU_OPS_COMMIT }}" - DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.DRIVER_VERSION }}" - KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.KERNEL_VERSION }}" - BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.BUNDLE_VERSION }}" - OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.OS_PRETTY_NAME }}" - GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.GCC_VERSION }}" - TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHBENCH_COMMIT_ID }}" - TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHVISION_COMMIT_ID }}" - TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHAUDIO_COMMIT_ID }}" - TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TRANSFORMERS_VERSION }}" - TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TIMM_COMMIT_ID }}" - TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TRITON_COMMIT_ID }}" - TIMEOUT_MODELS="${{ 
needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TIMEOUT_MODELS }}" - # Test status - if [ "${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.result }}" == "success" ];then - test_status=Success - elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.result }}" == "failure" ];then - test_status=Failure - cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}" - else - test_status=None - exit 0 - fi - # Test Type - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_type="On-demand" - test_issue_id=426 - cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}" - elif [ "${{ github.event.schedule }}" == "0 17 * * 5" ];then - test_type="Weekly" - test_issue_id=432 - else - test_type="Nightly" - test_issue_id=432 - fi - # Test report - echo -e "**${test_status}** $test_type WHL Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt - printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${TORCH_XPU_OPS_COMMIT:0:7} on pinned | " >> ${{ github.workspace }}/report.txt - printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt - echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - printf "[${TRANSFORMERS_VERSION:0:7}](https://github.com/huggingface/transformers/commit/${TRANSFORMERS_VERSION:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TIMM_COMMIT_ID:0:7}](https://github.com/huggingface/pytorch-image-models/commit/${TIMM_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION |$KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" - if [ "${{ inputs.model }}" != "" ];then - test_scope+="; model=${{ inputs.model }}" - fi - echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt - fi - echo "$TIMEOUT_MODELS" |awk '{printf("%s\\n", $0)}' >> ${{ github.workspace }}/report.txt - echo "$cc_comment" >> ${{ github.workspace }}/report.txt - # Report - report_txt=$(cat ${{ github.workspace }}/report.txt) - gh --repo $repo issue comment $test_issue_id --body "$report_txt" From 46d00c8cc3761b9207a39dff4d583a709422bc0c Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 09:16:44 +0800 Subject: [PATCH 052/160] update --- .github/workflows/_linux_build.yml | 42 ++++- .github/workflows/_linux_e2e.yml | 3 +- .github/workflows/_linux_op_benchmark.yml | 204 +++++++++++++--------- .github/workflows/_linux_ut.yml | 5 
+- .github/workflows/nightly_ondemand.yml | 18 +- .github/workflows/pull.yml | 4 +- 6 files changed, 183 insertions(+), 93 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 776c03a59e..7fd29a97b4 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -44,15 +44,46 @@ jobs: steps: - name: Use ${{ inputs.pytorch }} run: echo 'Use ${{ inputs.pytorch }}' - build: + get_build_runner: if: ${{ ! contains(inputs.test_type, 'wheel') }} runs-on: ${{ inputs.runner }} + outputs: + test_host: ${{ steps.runner-info.outputs.test_host }} + test_user: ${{ steps.runner-info.outputs.test_user }} + test_group: ${{ steps.runner-info.outputs.test_group }} + steps: + - name: Get runner info + id: runner-info + run: | + # get test runner + echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT} + echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} + echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} + # show host info + cat /etc/os-release + uname -a + source /opt/intel/oneapi/setvars.sh + sycl-ls + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + - name: Cleanup workspace + if: ${{ always() }} + run: | + # clean docker cache + docker stop $(docker ps -aq) || true + docker system prune -af || true + # clean files + ls -al + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + build: + needs: get_build_runner + runs-on: ${{ needs.get_build_runner.outputs.test_host }} container: image: 'pytorch/manylinux2_28-builder:xpu-main' volumes: - ${{ github.workspace }}:${{ github.workspace }} + options: -u ${{ needs.get_build_runner.outputs.test_user }}:${{ needs.get_build_runner.outputs.test_group }} env: - PATH: /opt/xpu-build/bin:/usr/share/Modules/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + PATH: /tmp/xpu-build/bin:/usr/share/Modules/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin GH_TOKEN: ${{ github.token }} timeout-minutes: 300 steps: @@ -67,7 +98,7 @@ jobs: dnf install gh --repo gh-cli -y # Setup python local_python=$(echo ${{ inputs.python }} |awk -F. '{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') - /opt/python/${local_python}/bin/python -m venv /opt/xpu-build + /opt/python/${local_python}/bin/python -m venv /tmp/xpu-build which python && python -V && pip list pip install -U pip wheel setuptools - name: Checkout torch-xpu-ops @@ -93,7 +124,7 @@ jobs: fi # oneAPI DLE if [ "${{ inputs.oneapi }}" != "installed" ];then - rm -rf ~/intel ~/.intel /opt/intel + rm -rf ${HOME}/intel ${HOME}/.intel wget -q -O oneapi.sh "${{ inputs.oneapi }}" bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi export XPU_ONEAPI_PATH="${HOME}/intel/oneapi" @@ -146,6 +177,7 @@ jobs: cp dist/*.whl ${{ github.workspace }} - name: Torch Config run: | + printenv python -c "import torch; print(torch.__config__.show())" python -c "import torch; print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" @@ -153,6 +185,8 @@ jobs: python -c "import torchvision; print(torchvision.__version__)" python -c "import torchaudio; print(torchaudio.__version__)" python pytorch/torch/utils/collect_env.py + pip list |grep -E 'torch|intel' + - name: Upload Torch XPU Wheel if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 0980acd111..47c8218e39 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -90,7 +90,7 @@ jobs: options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g -u ${{ needs.get_e2e_runner.outputs.test_user }}:${{ needs.get_e2e_runner.outputs.test_group }} env: - AGENT_TOOLSDIRECTORY: /opt/_tools + AGENT_TOOLSDIRECTORY: /tmp/_tools GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} MODEL_ONLY_NAME: ${{ inputs.model }} @@ -167,7 +167,6 @@ jobs: python -c "import triton; print(triton.__version__)" python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' # CICD launch - name: Nightly Huggingface BF16 & FP16 Training Test diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 144bf04fb7..0588acec27 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -3,112 +3,157 @@ name: Linux OP Benchmark Test on: workflow_call: inputs: + runner: + required: true + type: string + default: 'pvc_rolling' + description: Runner label + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel pytorch: - required: false type: string default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - triton: - required: false + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + oneapi: type: string - default: '' - description: Triton commit. 
Use pytorch pined commit by default + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed python: - required: false type: string default: '3.10' description: Python version - runner: - required: true - type: string - default: 'linux.idc.xpu' - description: Runner label - driver: - required: false - type: string - default: 'rolling' - description: Driver lts/rolling -permissions: - issues: write +permissions: read-all jobs: + get_op_runner: + runs-on: ${{ inputs.runner }} + outputs: + test_host: ${{ steps.runner-info.outputs.test_host }} + test_user: ${{ steps.runner-info.outputs.test_user }} + test_group: ${{ steps.runner-info.outputs.test_group }} + steps: + - name: Get runner info + id: runner-info + run: | + # get test runner + echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT} + echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} + echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} + # show host info + cat /etc/os-release + uname -a + source /opt/intel/oneapi/setvars.sh + sycl-ls + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + - name: Cleanup workspace + if: ${{ always() }} + run: | + # clean docker cache + docker stop $(docker ps -aq) || true + docker system prune -af || true + # clean files + ls -al + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf op_benchmark_test: - runs-on: ${{ inputs.runner }} + needs: get_op_runner + runs-on: ${{ needs.get_op_runner.outputs.test_host }} + permissions: + issues: write timeout-minutes: 900 - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1689 - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + container: + image: mengfeili/intel-pvc-driver:1146-1136 + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g + -u ${{ needs.get_op_runner.outputs.test_user }}:${{ needs.get_op_runner.outputs.test_group }} + env: + AGENT_TOOLSDIRECTORY: /opt/_tools + GH_TOKEN: ${{ github.token }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + REFERENCE_ISSUE: 1689 + defaults: + run: + shell: bash -xe {0} steps: + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} + - name: Check runner + run: | + ls -al + find ./ |grep -v "^\./$" |xargs rm -rf + hostname && whoami && id + clinfo --list + gcc -v && g++ -v + which python && which pip + python -V + pip install -U pip wheel setuptools + pip list + uname -a + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + pip install pandas psutil scipy requests - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Prepare Stock Pytorch + - name: Install oneAPI DLE + if: ${{ inputs.oneapi != 'installed' }} run: | - pwd - which conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch - pip install requests - git clone https://github.com/pytorch/pytorch pytorch - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s 
-            git submodule sync && git submodule update --init --recursive
-            if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
-              echo "Don't replace torch-xpu-ops!"
-            else
-              rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
-              # Workaround for torch-xpu-ops ci test
-              sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
-            fi
-          fi
+          rm -rf ~/intel ~/.intel
+          wget -q -O oneapi.sh "${{ inputs.oneapi }}"
+          bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi
+          echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV}
       - name: Download Pytorch wheel
-        if: ${{ inputs.pytorch != 'nightly_wheel' }}
+        if: ${{ ! contains(inputs.test_type, 'wheel') }}
         uses: actions/download-artifact@v4
         with:
-          name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}
-          path: ${{ github.workspace }}
-      - name: Install Pytorch XPU
+          pattern: Torch-XPU-Wheel-*
+      - name: Prepare Stock Pytorch
         run: |
-          source activate xpu_op_${ZE_AFFINITY_MASK}
-          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
-            cd ../pytorch
-            export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
-            pip install -r requirements.txt
-            pip install --force-reinstall ${{ github.workspace }}/torch*.whl
-            git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
-          else
+          # install pytorch
+          if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then
+            pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu
+          elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" |wc -l) -ne 0 ];then
+            pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu
+          elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" |wc -l) -ne 0 ];then
             pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
-            TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
+          else
+            pip install --force-reinstall ${{ github.workspace }}/*.whl
           fi
-          pip install -r .ci/docker/requirements-ci.txt
+          TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
+          if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then
+            PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')"
+          else
+            PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+          fi
+          git clone ${PYTORCH_REPO} pytorch
+          cd pytorch
+          git checkout ${TORCH_COMMIT_ID}
+          # apply extra PRs for stock pytorch
+          if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then
+            python ../.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940
+          else
+            python ../.github/scripts/apply_torch_pr.py
+          fi
+          git status && git diff && git show -s
       - name: Torch Config
         run: |
-          source activate xpu_op_${ZE_AFFINITY_MASK}
+          printenv
           python -c "import torch; print(torch.__config__.show())"
           python -c "import torch; print(torch.__config__.parallel_info())"
           python -c "import torch; print(torch.__config__.torch.xpu.device_count())"
-
-          cd ..
+ python -c "import triton; print(triton.__version__)" python pytorch/torch/utils/collect_env.py - rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache + pip list |grep -E 'torch|intel' + - name: Run Torch XPU Op Benchmark - if: ${{ inputs.driver == 'rolling' }} run: | - source activate xpu_op_${ZE_AFFINITY_MASK} mkdir -p ${{ github.workspace }}/op_benchmark cd test/microbench filename=$(find -- *.py) @@ -124,15 +169,12 @@ jobs: continue-on-error: true id: reference_id run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - conda install gh --channel conda-forge -y - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ + REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE} \ --json body -q .body |grep "Inductor-XPU-OP-Benchmark-Data" |sed 's/.*: *//')" gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-XPU-OP-Benchmark-Data-*" rm -rf ${GITHUB_WORKSPACE:-"/tmp"}/reference mkdir ${GITHUB_WORKSPACE:-"/tmp"}/reference mv Inductor-XPU-OP-Benchmark-Data-*/* ${GITHUB_WORKSPACE:-"/tmp"}/reference - mkdir ${{ github.workspace }}/baseline if [[ -f "${GITHUB_WORKSPACE:-"/tmp"}/reference/new_baseline/baseline_forward_op_summary.csv" ]]; then cp ${GITHUB_WORKSPACE:-"/tmp"}/reference/new_baseline/baseline_forward_op_summary.csv ${{ github.workspace }}/baseline @@ -143,7 +185,6 @@ jobs: fi - name: Check the OP Regression run: | - source activate xpu_op_${ZE_AFFINITY_MASK} pip install tabulate # Compare forward op python ${{ github.workspace }}/.github/scripts/op_perf_comparison.py --xpu_file ${{ github.workspace }}/op_benchmark/forward_op_summary.csv --baseline_file ${{ github.workspace }}/baseline/baseline_forward_op_summary.csv @@ -151,7 +192,6 @@ jobs: python ${{ github.workspace }}/.github/scripts/op_perf_comparison.py --xpu_file ${{ github.workspace }}/op_benchmark/backward_op_summary.csv --baseline_file ${{ github.workspace }}/baseline/baseline_backward_op_summary.csv - name: Update OP Baseline run: | - source activate xpu_op_${ZE_AFFINITY_MASK} mkdir ${{ github.workspace }}/new_baseline cp ${{ github.workspace }}/baseline/baseline*.csv ${{ github.workspace }}/new_baseline # Update forward op @@ -167,6 +207,6 @@ jobs: path: ${{ github.workspace }}/op_benchmark - name: Upload Reference Run ID run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ + gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE} --json body -q .body | \ sed "s/Inductor-XPU-OP-Benchmark-Data:.*/Inductor-XPU-OP-Benchmark-Data: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt + gh --repo ${GITHUB_REPOSITORY} issue edit ${REFERENCE_ISSUE} --body-file new_body.txt diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 260203acfd..56b8da5901 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -81,7 +81,7 @@ jobs: -u ${{ needs.get_ut_runner.outputs.test_user }}:${{ needs.get_ut_runner.outputs.test_group }} -e ZE_AFFINITY_MASK env: - AGENT_TOOLSDIRECTORY: /opt/_tools + AGENT_TOOLSDIRECTORY: /tmp/_tools GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} env: @@ -302,6 +302,7 @@ jobs: python -c "import triton; print(triton.__version__)" python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' + - name: Run XPU UT Test if: ${{ matrix.test.condition }} run: | @@ -503,7 +504,7 @@ jobs: 
TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi fi - if [ "${{ inputs.torch_xpu_ops }}" == "cicd" ];then + if [ "${{ inputs.test_type }}" == "cicd" ];then cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops else git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 7b1c0bb685..4dacb40ba7 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -127,7 +127,7 @@ jobs: runner: linux.idc.xpu test_type: ${{ needs.Conditions-Filter.outputs.test_type }} pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} - torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.torch_xpu_ops }} + torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} @@ -150,12 +150,26 @@ jobs: scenario: ${{ github.event_name == 'schedule' && 'accuracy' || inputs.scenario }} model: ${{ github.event_name == 'schedule' && '' || inputs.model }} + Linux-Nightly-Ondemand-OP-Microbench-Tests-Rolling: + if: ${{ github.event_name == 'schedule' }} + name: linux-nightly-ondemand-rolling / Op_microbench + permissions: write-all + needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] + uses: ./.github/workflows/_linux_op_benchmark.yml + with: + runner: pvc_rolling + test_type: ${{ needs.Conditions-Filter.outputs.test_type }} + pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} + oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} + Windows-Nightly-Ondemand-UT-Tests: if: ${{ github.event_name == 'schedule' }} name: Windows-nightly-ondemand uses: ./.github/workflows/_windows_ut.yml with: - torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.torch_xpu_ops }} + torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} ut: ${{ github.event_name == 'schedule' && 'op_extended,torch_xpu' || inputs.ut }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} src_changed: false diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 90a11b020e..5387eb415e 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -99,8 +99,10 @@ jobs: secrets: inherit uses: ./.github/workflows/_linux_build.yml with: + runner: pvc_rolling + test_type: build-cicd pytorch: main - runner: pvc_e2e + torch_xpu_ops: cicd preci-linux-ut: name: preci-linux From e3949d81221d98517bfcc2c8698d27122bee7308 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 10:22:53 +0800 Subject: [PATCH 053/160] update --- .github/workflows/_linux_e2e.yml | 2 ++ .github/workflows/_linux_op_benchmark.yml | 23 ++--------------------- .github/workflows/_linux_ut.yml | 5 +++++ .github/workflows/nightly_ondemand.yml | 1 - 4 files changed, 9 insertions(+), 22 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 47c8218e39..a89c01d8b0 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -164,6 +164,8 @@ jobs: python -c "import torch; print(torch.__config__.show())" python -c "import torch; 
print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" python -c "import triton; print(triton.__version__)" python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 0588acec27..763d358677 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -16,10 +16,6 @@ on: type: string default: 'main' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' - torch_xpu_ops: - type: string - default: 'main' - description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin oneapi: type: string default: 'installed' @@ -126,30 +122,15 @@ jobs: else pip install --force-reinstall ${{ github.workspace }}/*.whl fi - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then - PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" - else - PYTORCH_REPO="https://github.com/pytorch/pytorch.git" - fi - git clone ${PYTORCH_REPO} pytorch - cd pytorch - git checkout ${TORCH_COMMIT_ID} - # apply extra PRs for stock pytorch - if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then - python ../.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 - else - python ../.github/scripts/apply_torch_pr.py - fi - git status && git diff && git show -s - name: Torch Config run: | printenv python -c "import torch; print(torch.__config__.show())" python -c "import torch; print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" python -c "import triton; print(triton.__version__)" - python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' - name: Run Torch XPU Op Benchmark diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 56b8da5901..195fd48b96 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -299,6 +299,8 @@ jobs: python -c "import torch; print(torch.__config__.show())" python -c "import torch; print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" python -c "import triton; print(triton.__version__)" python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' @@ -518,9 +520,12 @@ jobs: python -c "import torch; print(torch.__config__.show())" python -c "import torch; print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" python -c "import triton; print(triton.__version__)" python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' + - name: Set Ptrace_scope if: ${{ always() }} run: | diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 4dacb40ba7..226828471e 100644 --- 
a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -169,7 +169,6 @@ jobs: name: Windows-nightly-ondemand uses: ./.github/workflows/_windows_ut.yml with: - torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} ut: ${{ github.event_name == 'schedule' && 'op_extended,torch_xpu' || inputs.ut }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} src_changed: false From 3f692136fc6af6225a0f3d6c5ced7738956cc7c2 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 13:34:39 +0800 Subject: [PATCH 054/160] update --- .github/workflows/_linux_build.yml | 43 +++++++++++------------ .github/workflows/_linux_e2e.yml | 1 + .github/workflows/_linux_op_benchmark.yml | 2 ++ .github/workflows/_linux_ut.yml | 2 ++ 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 7fd29a97b4..00f12c003b 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -44,6 +44,7 @@ jobs: steps: - name: Use ${{ inputs.pytorch }} run: echo 'Use ${{ inputs.pytorch }}' + get_build_runner: if: ${{ ! contains(inputs.test_type, 'wheel') }} runs-on: ${{ inputs.runner }} @@ -74,7 +75,9 @@ jobs: # clean files ls -al sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - build: + sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} + + build_torch: needs: get_build_runner runs-on: ${{ needs.get_build_runner.outputs.test_host }} container: @@ -83,22 +86,27 @@ jobs: - ${{ github.workspace }}:${{ github.workspace }} options: -u ${{ needs.get_build_runner.outputs.test_user }}:${{ needs.get_build_runner.outputs.test_group }} env: - PATH: /tmp/xpu-build/bin:/usr/share/Modules/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin GH_TOKEN: ${{ github.token }} + AGENT_TOOLSDIRECTORY: /tmp/_tools timeout-minutes: 300 + defaults: + run: + shell: bash -xe {0} steps: - - name: Setup based env + - name: Setup gh + uses: actions4gh/setup-gh@v1 + - name: Setup gcc + uses: Dup4/actions-setup-gcc@v1 + with: + version: 11 + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} + - name: Clean workspace run: | # Cleanup workspace - rm -rf ./* - # Install gh - dnf install 'dnf-command(config-manager)' - dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo - dnf autoremove -y git236* && dnf install -y git - dnf install gh --repo gh-cli -y - # Setup python - local_python=$(echo ${{ inputs.python }} |awk -F. 
'{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') - /opt/python/${local_python}/bin/python -m venv /tmp/xpu-build + find ./ |grep -v "^\./$" |xargs rm -rf which python && python -V && pip list pip install -U pip wheel setuptools - name: Checkout torch-xpu-ops @@ -107,7 +115,6 @@ jobs: path: torch-xpu-ops - name: Build Pytorch XPU run: | - set -xe -o pipefail if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" PYTORCH_COMMIT="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" @@ -130,8 +137,6 @@ jobs: export XPU_ONEAPI_PATH="${HOME}/intel/oneapi" fi source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh - # gcc 11 - source /opt/rh/gcc-toolset-11/enable ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ --WORKSPACE="${{ github.workspace }}" \ --PYTORCH_REPO="${PYTORCH_REPO}" \ @@ -141,9 +146,6 @@ jobs: 2>&1 |tee ${{ github.workspace }}/build_pytorch_${PYTORCH_COMMIT//\//-}.log - name: Build Triton run: | - # gcc 13 - dnf install -y gcc-toolset-13-gcc-c++ zlib-devel - source /opt/rh/gcc-toolset-13/enable cd ./pytorch pip install cmake ninja pybind11 rm -rf pytorch_triton_xpu-*.whl @@ -199,8 +201,3 @@ jobs: with: name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/build_*.log - - name: Cleanup - if: always() - run: | - chmod 777 . -R - rm -rf ./* diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index a89c01d8b0..e3861703bf 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -78,6 +78,7 @@ jobs: # clean files ls -al sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} e2e_test: runs-on: ${{ needs.get_e2e_runner.outputs.test_host }} diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 763d358677..89a42fbbfe 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -57,6 +57,8 @@ jobs: # clean files ls -al sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} + op_benchmark_test: needs: get_op_runner runs-on: ${{ needs.get_op_runner.outputs.test_host }} diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 195fd48b96..8a3a27c017 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -68,6 +68,8 @@ jobs: # clean files ls -al sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} + ut_test: needs: get_ut_runner runs-on: ${{ needs.get_ut_runner.outputs.test_host }} From e8b015a7eb47e94e94b025ef2a8d251570f499fb Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 13:50:18 +0800 Subject: [PATCH 055/160] update --- .github/workflows/_linux_build.yml | 10 +++--- .github/workflows/_linux_e2e.yml | 10 +++--- .github/workflows/_linux_op_benchmark.yml | 10 +++--- .github/workflows/_linux_ut.yml | 16 ++++----- .github/workflows/pull.yml | 40 +++++++++++------------ 5 files changed, 43 insertions(+), 43 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 00f12c003b..9a403caa97 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -45,7 +45,7 @@ jobs: - name: Use ${{ inputs.pytorch }} run: echo 'Use ${{ inputs.pytorch }}' - get_build_runner: + get_runner: if: ${{ ! 
contains(inputs.test_type, 'wheel') }} runs-on: ${{ inputs.runner }} outputs: @@ -77,14 +77,14 @@ jobs: sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} - build_torch: - needs: get_build_runner - runs-on: ${{ needs.get_build_runner.outputs.test_host }} + build: + needs: get_runner + runs-on: ${{ needs.get_runner.outputs.test_host }} container: image: 'pytorch/manylinux2_28-builder:xpu-main' volumes: - ${{ github.workspace }}:${{ github.workspace }} - options: -u ${{ needs.get_build_runner.outputs.test_user }}:${{ needs.get_build_runner.outputs.test_group }} + options: -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} env: GH_TOKEN: ${{ github.token }} AGENT_TOOLSDIRECTORY: /tmp/_tools diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index e3861703bf..3b56454362 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -49,7 +49,7 @@ on: permissions: read-all jobs: - get_e2e_runner: + get_runner: runs-on: ${{ inputs.runner }} outputs: test_host: ${{ steps.runner-info.outputs.test_host }} @@ -81,15 +81,15 @@ jobs: sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} e2e_test: - runs-on: ${{ needs.get_e2e_runner.outputs.test_host }} - needs: get_e2e_runner + runs-on: ${{ needs.get_runner.outputs.test_host }} + needs: get_runner timeout-minutes: 3600 container: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.get_e2e_runner.outputs.test_user }}:${{ needs.get_e2e_runner.outputs.test_group }} + -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} env: AGENT_TOOLSDIRECTORY: /tmp/_tools GH_TOKEN: ${{ github.token }} @@ -302,7 +302,7 @@ jobs: name: Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/upload_files - e2e_summary: + summary: runs-on: [self-hosted, Linux, X64] if: ${{ always() }} needs: e2e_test diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 89a42fbbfe..8ae2f349c9 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -28,7 +28,7 @@ on: permissions: read-all jobs: - get_op_runner: + get_runner: runs-on: ${{ inputs.runner }} outputs: test_host: ${{ steps.runner-info.outputs.test_host }} @@ -59,9 +59,9 @@ jobs: sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} - op_benchmark_test: - needs: get_op_runner - runs-on: ${{ needs.get_op_runner.outputs.test_host }} + op_benchmark: + needs: get_runner + runs-on: ${{ needs.get_runner.outputs.test_host }} permissions: issues: write timeout-minutes: 900 @@ -70,7 +70,7 @@ jobs: volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.get_op_runner.outputs.test_user }}:${{ needs.get_op_runner.outputs.test_group }} + -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} env: AGENT_TOOLSDIRECTORY: /opt/_tools GH_TOKEN: ${{ github.token }} diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 8a3a27c017..76db53209c 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -39,7 +39,7 @@ on: 
permissions: read-all jobs: - get_ut_runner: + get_runner: runs-on: ${{ inputs.runner }} outputs: test_host: ${{ steps.runner-info.outputs.test_host }} @@ -71,8 +71,8 @@ jobs: sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} ut_test: - needs: get_ut_runner - runs-on: ${{ needs.get_ut_runner.outputs.test_host }} + needs: get_runner + runs-on: ${{ needs.get_runner.outputs.test_host }} if: ${{ inputs.ut != 'xpu_distributed' && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 300 container: @@ -80,7 +80,7 @@ jobs: volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.get_ut_runner.outputs.test_user }}:${{ needs.get_ut_runner.outputs.test_group }} + -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} -e ZE_AFFINITY_MASK env: AGENT_TOOLSDIRECTORY: /tmp/_tools @@ -354,7 +354,7 @@ jobs: run: | echo "UT_NAME=${{ matrix.test.name }}" >> $GITHUB_OUTPUT - ut_test_results_check: + ut_summary: needs: ut_test runs-on: ubuntu-22.04 timeout-minutes: 30 @@ -424,7 +424,7 @@ jobs: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }}-checked path: ${{ github.workspace }}/ut_log - distributed_ut_test: + distributed: runs-on: pytorch-06 if: ${{ contains(inputs.ut, 'xpu_distributed') && !contains(inputs.disabled_tests, 'disable_distribute') }} timeout-minutes: 60 @@ -562,8 +562,8 @@ jobs: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed path: ${{ github.workspace }}/ut_log - distributed_ut_test_results_check: - needs: distributed_ut_test + distributed_summary: + needs: distributed runs-on: ubuntu-22.04 timeout-minutes: 30 env: diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 5387eb415e..9e804a27ee 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -51,8 +51,8 @@ jobs: export CLANG=1 bash third_party/torch-xpu-ops/.github/scripts/lintrunner.sh - preci-conditions-filter: - name: preci-conditions-filter + conditions-filter: + name: conditions-filter if: ${{ github.event.pull_request.draft == false }} needs: [preci-lint-check] runs-on: ubuntu-22.04 @@ -92,10 +92,10 @@ jobs: disabled_tests="$(awk '/disable_/{printf("%s ", $0)}' pr-info.txt)" echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}" - preci-linux-build: - name: preci-linux - if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all')}} - needs: [preci-conditions-filter] + linux-build: + name: linux + if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all')}} + needs: [conditions-filter] secrets: inherit uses: ./.github/workflows/_linux_build.yml with: @@ -104,35 +104,35 @@ jobs: pytorch: main torch_xpu_ops: cicd - preci-linux-ut: - name: preci-linux - needs: [preci-conditions-filter, preci-linux-build] + linux-ut: + name: linux + needs: [conditions-filter, linux-build] uses: ./.github/workflows/_linux_ut.yml with: runner: linux.idc.xpu test_type: build-cicd pytorch: main ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed - disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }} + disabled_tests: ${{ needs.conditions-filter.outputs.disabled_tests }} - preci-linux-e2e: - if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }} - name: preci-linux / e2e_test + linux-e2e: + if: ${{ 
!contains(needs.conditions-filter.outputs.disabled_tests, 'disable_e2e') }} + name: linux permissions: write-all - needs: [preci-conditions-filter, preci-linux-build] + needs: [conditions-filter, linux-build] uses: ./.github/workflows/_linux_e2e.yml with: runner: pvc_rolling test_type: build-cicd pytorch: main - preci-windows: - name: preci-windows - if: ${{ !(contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_win')) }} - needs: [preci-conditions-filter] + windows: + name: windows + if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_win')) }} + needs: [conditions-filter] uses: ./.github/workflows/_windows_ut.yml with: ut: op_extended,torch_xpu runner: Windows_CI - src_changed: ${{ needs.preci-conditions-filter.outputs.src_changed }} - has_label: ${{ needs.preci-conditions-filter.outputs.has_label }} + src_changed: ${{ needs.conditions-filter.outputs.src_changed }} + has_label: ${{ needs.conditions-filter.outputs.has_label }} From bbd82cdefbd61ca2495f329b293a4b29ade68b66 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 14:09:03 +0800 Subject: [PATCH 056/160] update --- .github/workflows/_linux_build.yml | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 9a403caa97..9e341222f1 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -37,16 +37,7 @@ on: permissions: read-all jobs: - wheel: - if: ${{ contains(inputs.test_type, 'wheel') }} - name: ${{ inputs.pytorch }} - runs-on: ubuntu-latest - steps: - - name: Use ${{ inputs.pytorch }} - run: echo 'Use ${{ inputs.pytorch }}' - get_runner: - if: ${{ ! contains(inputs.test_type, 'wheel') }} runs-on: ${{ inputs.runner }} outputs: test_host: ${{ steps.runner-info.outputs.test_host }} @@ -78,7 +69,9 @@ jobs: sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} build: + name: ${{ contains(inputs.test_type, 'wheel') && inputs.pytorch || 'build' }} needs: get_runner + if: ${{ ! 
contains(inputs.test_type, 'wheel') }} runs-on: ${{ needs.get_runner.outputs.test_host }} container: image: 'pytorch/manylinux2_28-builder:xpu-main' From c144babd219575fd5c064235a992648fc8bae1b0 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 14:29:44 +0800 Subject: [PATCH 057/160] get runner --- .github/actions/get-runner/action.yml | 39 +++++++++++++++++++++++ .github/workflows/_linux_build.yml | 24 ++------------ .github/workflows/_linux_e2e.yml | 24 ++------------ .github/workflows/_linux_op_benchmark.yml | 24 ++------------ .github/workflows/_linux_ut.yml | 24 ++------------ 5 files changed, 47 insertions(+), 88 deletions(-) create mode 100644 .github/actions/get-runner/action.yml diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml new file mode 100644 index 0000000000..853f30e6bd --- /dev/null +++ b/.github/actions/get-runner/action.yml @@ -0,0 +1,39 @@ +name: Get Runner Infos + +on: + workflow_call: + +permissions: read-all + +runs: + using: composite + steps: + - name: Get runner + shell: bash -xe {0} + run: | + # get test runner + echo "test_host=$(echo ${RUNNER_NAME} |sed 's/\-[0-9]$//')" |tee -a ${GITHUB_OUTPUT} + echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} + echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} + # show host info + lscpu + free -h + cat /etc/os-release + uname -a + gcc -v && g++ -v + source /opt/intel/oneapi/setvars.sh + sycl-ls + icpx -v + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq -c + df -h + - name: Cleanup host + if: ${{ always() }} + shell: bash -xe {0} + run: | + # clean docker cache + docker system prune -af || true + # clean files + ls -al + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 9e341222f1..b226fb9371 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -44,29 +44,9 @@ jobs: test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} steps: - - name: Get runner info + - name: Get runner id: runner-info - run: | - # get test runner - echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT} - echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} - echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} - # show host info - cat /etc/os-release - uname -a - source /opt/intel/oneapi/setvars.sh - sycl-ls - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - - name: Cleanup workspace - if: ${{ always() }} - run: | - # clean docker cache - docker stop $(docker ps -aq) || true - docker system prune -af || true - # clean files - ls -al - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} + uses: .github/actions/get-runner build: name: ${{ contains(inputs.test_type, 'wheel') && inputs.pytorch || 'build' }} diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 3b56454362..682d22565b 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -56,29 +56,9 @@ jobs: test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} steps: - - name: Get runner info + - name: Get runner id: runner-info - run: | - # get test runner - echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT} - 
echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} - echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} - # show host info - cat /etc/os-release - uname -a - source /opt/intel/oneapi/setvars.sh - sycl-ls - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - - name: Cleanup workspace - if: ${{ always() }} - run: | - # clean docker cache - docker stop $(docker ps -aq) || true - docker system prune -af || true - # clean files - ls -al - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} + uses: .github/actions/get-runner e2e_test: runs-on: ${{ needs.get_runner.outputs.test_host }} diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 8ae2f349c9..50ed2efc63 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -35,29 +35,9 @@ jobs: test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} steps: - - name: Get runner info + - name: Get runner id: runner-info - run: | - # get test runner - echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT} - echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} - echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} - # show host info - cat /etc/os-release - uname -a - source /opt/intel/oneapi/setvars.sh - sycl-ls - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - - name: Cleanup workspace - if: ${{ always() }} - run: | - # clean docker cache - docker stop $(docker ps -aq) || true - docker system prune -af || true - # clean files - ls -al - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} + uses: .github/actions/get-runner op_benchmark: needs: get_runner diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 76db53209c..6692c3fad1 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -46,29 +46,9 @@ jobs: test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} steps: - - name: Get runner info + - name: Get runner id: runner-info - run: | - # get test runner - echo "test_host=${RUNNER_NAME%-*}" |tee -a ${GITHUB_OUTPUT} - echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} - echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} - # show host info - cat /etc/os-release - uname -a - source /opt/intel/oneapi/setvars.sh - sycl-ls - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - - name: Cleanup workspace - if: ${{ always() }} - run: | - # clean docker cache - # docker stop $(docker ps -aq) || true - docker system prune -af || true - # clean files - ls -al - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} + uses: .github/actions/get-runner ut_test: needs: get_runner From 40180c0021033539435e0dfd1751e1ab0887cea6 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 14:52:44 +0800 Subject: [PATCH 058/160] test env --- .github/actions/get-runner/action.yml | 10 +- .github/actions/setup-testenv/action.yml | 136 +++++++++++++++ .github/workflows/_linux_e2e.yml | 77 +-------- .github/workflows/_linux_op_benchmark.yml | 60 +------ .github/workflows/_linux_ut.yml | 202 ++-------------------- 5 files changed, 168 insertions(+), 317 deletions(-) create mode 100644 .github/actions/setup-testenv/action.yml diff --git a/.github/actions/get-runner/action.yml 
b/.github/actions/get-runner/action.yml index 853f30e6bd..89dc1764d2 100644 --- a/.github/actions/get-runner/action.yml +++ b/.github/actions/get-runner/action.yml @@ -17,16 +17,10 @@ runs: echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} # show host info lscpu + lshw -C display free -h - cat /etc/os-release - uname -a - gcc -v && g++ -v - source /opt/intel/oneapi/setvars.sh - sycl-ls - icpx -v - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq -c df -h + cat /etc/os-release - name: Cleanup host if: ${{ always() }} shell: bash -xe {0} diff --git a/.github/actions/setup-testenv/action.yml b/.github/actions/setup-testenv/action.yml new file mode 100644 index 0000000000..a51b8beb6f --- /dev/null +++ b/.github/actions/setup-testenv/action.yml @@ -0,0 +1,136 @@ +name: Get Runner Infos + +on: + workflow_call: + inputs: + test_type: + required: true + type: string + description: Test scope + pytorch: + type: string + default: 'main' + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: + type: string + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + python: + type: string + default: '3.10' + description: Python version + +permissions: read-all + +runs: + using: composite + steps: + - name: Setup gh + uses: actions4gh/setup-gh@v1 + - name: Setup gcc + uses: Dup4/actions-setup-gcc@v1 + with: + version: 11 + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} + - name: Check runner + run: | + ls -al + find ./ |grep -v "^\./$" |xargs rm -rf + hostname && whoami && id + cat /etc/os-release + gcc -v && g++ -v + which python && which pip + python -V + pip install -U pip wheel setuptools + pip list + uname -a + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + clinfo --list + cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq -c + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + with: + path: torch-xpu-ops + - name: Install oneAPI DLE + if: ${{ inputs.oneapi != 'installed' }} + run: | + rm -rf ~/intel ~/.intel + wget -q -O oneapi.sh "${{ inputs.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} + - name: Download Pytorch wheel + if: ${{ ! 
contains(inputs.test_type, 'wheel') }} + uses: actions/download-artifact@v4 + with: + pattern: Torch-XPU-Wheel-* + - name: Prepare Stock Pytorch + run: | + # install pytorch + if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" |wc -l) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" |wc -l) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + else + pip install --force-reinstall ${{ github.workspace }}/*.whl + fi + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then + PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" + else + PYTORCH_REPO="https://github.com/pytorch/pytorch.git" + fi + git clone ${PYTORCH_REPO} pytorch + cd pytorch + git checkout ${TORCH_COMMIT_ID} + pip install -r .ci/docker/requirements-ci.txt + # apply extra PRs for stock pytorch + if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 + else + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + fi + git status && git diff && git show -s + - name: Prepare Torch-xpu-ops + if: ${{ inputs.torch_xpu_ops != 'skipped' }} + run: | + cd pytorch + rm -rf third_party/torch-xpu-ops + if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then + TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" + TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" + else + TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" + if [ "${{ inputs.torch_xpu_ops }}" == "pinned" ];then + TORCH_XPU_OPS_COMMIT="$(cat third_party/xpu.txt)" + else + TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" + fi + fi + if [ "${{ inputs.test_type }}" == "cicd" ];then + cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops + else + git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops + fi + cd third_party/torch-xpu-ops + git checkout ${TORCH_XPU_OPS_COMMIT} + git status && git diff && git show -s + - name: Torch Config + run: | + printenv + python -c "import torch; print(torch.__config__.show())" + python -c "import torch; print(torch.__config__.parallel_info())" + python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" + python -c "import triton; print(triton.__version__)" + python pytorch/torch/utils/collect_env.py + pip list |grep -E 'torch|intel' diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 682d22565b..48e0dd915e 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -79,77 +79,14 @@ jobs: run: shell: bash -xe {0} steps: - - name: Setup python-${{ inputs.python }} - uses: actions/setup-python@v5 - with: - python-version: ${{ inputs.python }} - - name: Check runner - run: | - ls -al - find ./ |grep -v "^\./$" |xargs rm -rf - hostname && whoami && id - clinfo --list - gcc -v && g++ -v - which python && which pip - python -V - pip install -U pip wheel setuptools - pip list - uname -a - 
dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - pip install pandas psutil scipy requests - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Install oneAPI DLE - if: ${{ inputs.oneapi != 'installed' }} - run: | - rm -rf ~/intel ~/.intel - wget -q -O oneapi.sh "${{ inputs.oneapi }}" - bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi - echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} - - name: Download Pytorch wheel - if: ${{ ! contains(inputs.test_type, 'wheel') }} - uses: actions/download-artifact@v4 + - name: Setup Test Env + uses: .github/actions/setup-testenv with: - pattern: Torch-XPU-Wheel-* - - name: Prepare Stock Pytorch - run: | - # install pytorch - if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - else - pip install --force-reinstall ${{ github.workspace }}/*.whl - fi - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then - PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" - else - PYTORCH_REPO="https://github.com/pytorch/pytorch.git" - fi - git clone ${PYTORCH_REPO} pytorch - cd pytorch - git checkout ${TORCH_COMMIT_ID} - # apply extra PRs for stock pytorch - if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then - python ../.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 - else - python ../.github/scripts/apply_torch_pr.py - fi - git status && git diff && git show -s - - name: Torch Config - run: | - printenv - python -c "import torch; print(torch.__config__.show())" - python -c "import torch; print(torch.__config__.parallel_info())" - python -c "import torch; print(torch.__config__.torch.xpu.device_count())" - python -c "import torchvision; print(torchvision.__version__)" - python -c "import torchaudio; print(torchaudio.__version__)" - python -c "import triton; print(triton.__version__)" - python pytorch/torch/utils/collect_env.py - pip list |grep -E 'torch|intel' + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: skipped + oneapi: ${{ inputs.oneapi }} + python: ${{ inputs.python }} # CICD launch - name: Nightly Huggingface BF16 & FP16 Training Test diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 50ed2efc63..582251fb80 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -60,60 +60,14 @@ jobs: run: shell: bash -xe {0} steps: - - name: Setup python-${{ inputs.python }} - uses: actions/setup-python@v5 + - name: Setup Test Env + uses: .github/actions/setup-testenv with: - python-version: ${{ inputs.python }} - - name: Check runner - run: | - ls -al - find ./ |grep -v "^\./$" |xargs rm -rf - hostname && whoami && id - clinfo --list - gcc -v && g++ -v - which python && which pip - python -V - pip install -U pip wheel setuptools - pip list - uname -a - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - pip install pandas psutil scipy 
requests - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Install oneAPI DLE - if: ${{ inputs.oneapi != 'installed' }} - run: | - rm -rf ~/intel ~/.intel - wget -q -O oneapi.sh "${{ inputs.oneapi }}" - bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi - echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} - - name: Download Pytorch wheel - if: ${{ ! contains(inputs.test_type, 'wheel') }} - uses: actions/download-artifact@v4 - with: - pattern: Torch-XPU-Wheel-* - - name: Prepare Stock Pytorch - run: | - # install pytorch - if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - else - pip install --force-reinstall ${{ github.workspace }}/*.whl - fi - - name: Torch Config - run: | - printenv - python -c "import torch; print(torch.__config__.show())" - python -c "import torch; print(torch.__config__.parallel_info())" - python -c "import torch; print(torch.__config__.torch.xpu.device_count())" - python -c "import torchvision; print(torchvision.__version__)" - python -c "import torchaudio; print(torchaudio.__version__)" - python -c "import triton; print(triton.__version__)" - pip list |grep -E 'torch|intel' + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: skipped + oneapi: ${{ inputs.oneapi }} + python: ${{ inputs.python }} - name: Run Torch XPU Op Benchmark run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 6692c3fad1..57371e75ce 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -188,104 +188,14 @@ jobs: outputs: ut_name: ${{ steps.set-output.outputs.UT_NAME || '' }} steps: - - name: Setup python-${{ inputs.python }} - uses: actions/setup-python@v5 + - name: Setup Test Env + uses: .github/actions/setup-testenv with: - python-version: ${{ inputs.python }} - - name: Check runner - run: | - ls -al - find ./ |grep -v "^\./$" |xargs rm -rf - hostname && whoami && id - clinfo --list - gcc -v && g++ -v - which python && which pip - python -V - pip install -U pip wheel setuptools - pip list - pip install pytest pytest-timeout - uname -a - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - pip install pandas psutil scipy requests - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - with: - path: torch-xpu-ops - - name: Install oneAPI DLE - if: ${{ inputs.oneapi != 'installed' }} - run: | - rm -rf ~/intel ~/.intel - wget -q -O oneapi.sh "${{ inputs.oneapi }}" - bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi - echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} - - name: Download Pytorch wheel - if: ${{ ! 
contains(inputs.test_type, 'wheel') }} - uses: actions/download-artifact@v4 - with: - pattern: Torch-XPU-Wheel-* - - name: Prepare Stock Pytorch - run: | - # install pytorch - if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - else - pip install --force-reinstall ${{ github.workspace }}/*.whl - fi - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then - PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" - else - PYTORCH_REPO="https://github.com/pytorch/pytorch.git" - fi - git clone ${PYTORCH_REPO} pytorch - cd pytorch - git checkout ${TORCH_COMMIT_ID} - pip install -r .ci/docker/requirements-ci.txt - # apply extra PRs for stock pytorch - if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 - else - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - fi - git status && git diff && git show -s - - name: Prepare Torch-xpu-ops - run: | - cd pytorch - rm -rf third_party/torch-xpu-ops - if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then - TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" - TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" - else - TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" - if [ "${{ inputs.torch_xpu_ops }}" == "pinned" ];then - TORCH_XPU_OPS_COMMIT="$(cat third_party/xpu.txt)" - else - TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" - fi - fi - if [ "${{ inputs.test_type }}" == "cicd" ];then - cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops - else - git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops - fi - cd third_party/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_COMMIT} - git status && git diff && git show -s - - name: Torch Config - run: | - printenv - python -c "import torch; print(torch.__config__.show())" - python -c "import torch; print(torch.__config__.parallel_info())" - python -c "import torch; print(torch.__config__.torch.xpu.device_count())" - python -c "import torchvision; print(torchvision.__version__)" - python -c "import torchaudio; print(torchaudio.__version__)" - python -c "import triton; print(triton.__version__)" - python pytorch/torch/utils/collect_env.py - pip list |grep -E 'torch|intel' + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: ${{ inputs.torch_xpu_ops }} + oneapi: ${{ inputs.oneapi }} + python: ${{ inputs.python }} - name: Run XPU UT Test if: ${{ matrix.test.condition }} @@ -415,98 +325,18 @@ jobs: - name: Check runner run: | ls -al - find ./ |grep -v "^\./$" |xargs rm -rf + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf sudo find /tmp/ |grep -v "^/tmp/$" |xargs sudo rm -rf - rm -rf ~/.triton ~/.torch - hostname && whoami && id + sudo rm -rf ~/.triton ~/.torch xpu-smi topology -m - gcc -v && g++ -v - uname -a - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - - name: Setup python-${{ 
inputs.python }} - uses: actions/setup-python@v5 - with: - python-version: ${{ inputs.python }} - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 + - name: Setup Test Env + uses: .github/actions/setup-testenv with: - path: torch-xpu-ops - - name: Install oneAPI DLE - if: ${{ inputs.oneapi != 'installed' }} - run: | - rm -rf ~/intel ~/.intel - wget -q -O oneapi.sh "${{ inputs.oneapi }}" - bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi - echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} - - name: Download Pytorch wheel - if: ${{ ! contains(inputs.test_type, 'wheel') }} - uses: actions/download-artifact@v4 - with: - pattern: Torch-XPU-Wheel-* - - name: Prepare Stock Pytorch - run: | - which python && which pip - python -V - pip install -U pip wheel setuptools - pip list - pip install pytest pytest-timeout - # install pytorch - if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - else - pip install --force-reinstall ${{ github.workspace }}/*.whl - fi - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then - PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" - else - PYTORCH_REPO="https://github.com/pytorch/pytorch.git" - fi - git clone ${PYTORCH_REPO} pytorch - cd pytorch - git checkout ${TORCH_COMMIT_ID} - pip install -r .ci/docker/requirements-ci.txt - # apply extra PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git diff && git show -s - - name: Prepare Torch-xpu-ops - run: | - cd pytorch - rm -rf third_party/torch-xpu-ops - if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then - TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" - TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" - else - TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" - if [ "${{ inputs.torch_xpu_ops }}" == "pinned" ];then - TORCH_XPU_OPS_COMMIT="$(cat third_party/xpu.txt)" - else - TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" - fi - fi - if [ "${{ inputs.test_type }}" == "cicd" ];then - cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops - else - git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops - fi - cd third_party/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_COMMIT} - git status && git diff && git show -s - - name: Torch Config - run: | - printenv - python -c "import torch; print(torch.__config__.show())" - python -c "import torch; print(torch.__config__.parallel_info())" - python -c "import torch; print(torch.__config__.torch.xpu.device_count())" - python -c "import torchvision; print(torchvision.__version__)" - python -c "import torchaudio; print(torchaudio.__version__)" - python -c "import triton; print(triton.__version__)" - python pytorch/torch/utils/collect_env.py - pip list |grep -E 'torch|intel' + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: ${{ inputs.torch_xpu_ops }} + oneapi: 
${{ inputs.oneapi }} + python: ${{ inputs.python }} - name: Set Ptrace_scope if: ${{ always() }} From 54ea2f0c5b37eba5c54d9f0f0a516d6539861db8 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 14:56:41 +0800 Subject: [PATCH 059/160] update --- .github/workflows/_linux_e2e.yml | 28 +++++++++++++------------- .github/workflows/nightly_ondemand.yml | 10 ++++----- .github/workflows/pull.yml | 8 ++++---- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 48e0dd915e..a9b36c2765 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -91,7 +91,7 @@ jobs: # CICD launch - name: Nightly Huggingface BF16 & FP16 Training Test if: ${{ contains(inputs.test_type, 'cicd') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: .github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: huggingface @@ -100,7 +100,7 @@ jobs: scenario: accuracy,performance - name: Nightly Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'cicd') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: .github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: torchbench @@ -109,7 +109,7 @@ jobs: scenario: accuracy,performance - name: Nightly Timm_models BF16 Training Test if: ${{ contains(inputs.test_type, 'cicd') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: .github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: timm_models @@ -120,7 +120,7 @@ jobs: # Nihglty launch - name: Nightly Huggingface Full Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: .github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: huggingface @@ -129,7 +129,7 @@ jobs: scenario: accuracy,performance - name: Nightly Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: .github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: torchbench @@ -138,7 +138,7 @@ jobs: scenario: accuracy,performance - name: Nightly Timm_models FP16 Training Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: .github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: timm_models @@ -147,7 +147,7 @@ jobs: scenario: accuracy,performance - name: Nightly PT2E Full Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: ./.github/actions/pt2e + uses: .github/actions/pt2e with: env_prepare: true dt: float32,int8 @@ -156,7 +156,7 @@ jobs: # Weekly launch - name: Nightly Huggingface Full Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: .github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: huggingface @@ -165,7 +165,7 @@ jobs: scenario: accuracy,performance - name: Nightly Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: .github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: torchbench @@ -174,7 +174,7 @@ jobs: scenario: accuracy,performance - name: Nightly Timm_models FP16 Training Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: .github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: timm_models @@ -183,7 +183,7 @@ jobs: scenario: accuracy,performance - name: Nightly PT2E Full Test if: ${{ 
contains(inputs.test_type, 'weekly') }} - uses: ./.github/actions/pt2e + uses: .github/actions/pt2e with: env_prepare: true dt: float32,int8 @@ -192,7 +192,7 @@ jobs: # On-demand launch - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) if: ${{ contains(inputs.test_type, 'ondemand') && inputs.suite != 'pt2e' }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: .github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: ${{ inputs.suite }} @@ -201,7 +201,7 @@ jobs: scenario: ${{ inputs.scenario }} - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'pt2e') }} - uses: ./.github/actions/pt2e + uses: .github/actions/pt2e with: env_prepare: true dt: ${{ inputs.dt }} @@ -271,7 +271,7 @@ jobs: run: | pip install pandas requests if [ "${{ inputs.suite }}" != 'pt2e' ];then - bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} + bash .github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) if [ ${exit_label} -ne 0 ];then grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 226828471e..3bedac1c96 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -108,7 +108,7 @@ jobs: needs: [Conditions-Filter] name: linux secrets: inherit - uses: ./.github/workflows/_linux_build.yml + uses: .github/workflows/_linux_build.yml with: runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} @@ -122,7 +122,7 @@ jobs: if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'p') }} name: linux needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] - uses: ./.github/workflows/_linux_ut.yml + uses: .github/workflows/_linux_ut.yml with: runner: linux.idc.xpu test_type: ${{ needs.Conditions-Filter.outputs.test_type }} @@ -137,7 +137,7 @@ jobs: name: linux permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] - uses: ./.github/workflows/_linux_e2e.yml + uses: .github/workflows/_linux_e2e.yml with: runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} @@ -155,7 +155,7 @@ jobs: name: linux-nightly-ondemand-rolling / Op_microbench permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] - uses: ./.github/workflows/_linux_op_benchmark.yml + uses: .github/workflows/_linux_op_benchmark.yml with: runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} @@ -167,7 +167,7 @@ jobs: Windows-Nightly-Ondemand-UT-Tests: if: ${{ github.event_name == 'schedule' }} name: Windows-nightly-ondemand - uses: ./.github/workflows/_windows_ut.yml + uses: .github/workflows/_windows_ut.yml with: ut: ${{ github.event_name == 'schedule' && 'op_extended,torch_xpu' || inputs.ut }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 9e804a27ee..b17252b41a 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -97,7 +97,7 @@ jobs: if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all')}} needs: [conditions-filter] secrets: inherit - uses: 
./.github/workflows/_linux_build.yml + uses: .github/workflows/_linux_build.yml with: runner: pvc_rolling test_type: build-cicd @@ -107,7 +107,7 @@ jobs: linux-ut: name: linux needs: [conditions-filter, linux-build] - uses: ./.github/workflows/_linux_ut.yml + uses: .github/workflows/_linux_ut.yml with: runner: linux.idc.xpu test_type: build-cicd @@ -120,7 +120,7 @@ jobs: name: linux permissions: write-all needs: [conditions-filter, linux-build] - uses: ./.github/workflows/_linux_e2e.yml + uses: .github/workflows/_linux_e2e.yml with: runner: pvc_rolling test_type: build-cicd @@ -130,7 +130,7 @@ jobs: name: windows if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_win')) }} needs: [conditions-filter] - uses: ./.github/workflows/_windows_ut.yml + uses: .github/workflows/_windows_ut.yml with: ut: op_extended,torch_xpu runner: Windows_CI From 9b660b96d72b9359a63094c3830451965b0c3c55 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 14:59:38 +0800 Subject: [PATCH 060/160] Revert "update" This reverts commit 54ea2f0c5b37eba5c54d9f0f0a516d6539861db8. --- .github/workflows/_linux_e2e.yml | 28 +++++++++++++------------- .github/workflows/nightly_ondemand.yml | 10 ++++----- .github/workflows/pull.yml | 8 ++++---- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index a9b36c2765..48e0dd915e 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -91,7 +91,7 @@ jobs: # CICD launch - name: Nightly Huggingface BF16 & FP16 Training Test if: ${{ contains(inputs.test_type, 'cicd') }} - uses: .github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: huggingface @@ -100,7 +100,7 @@ jobs: scenario: accuracy,performance - name: Nightly Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'cicd') }} - uses: .github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: torchbench @@ -109,7 +109,7 @@ jobs: scenario: accuracy,performance - name: Nightly Timm_models BF16 Training Test if: ${{ contains(inputs.test_type, 'cicd') }} - uses: .github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: timm_models @@ -120,7 +120,7 @@ jobs: # Nihglty launch - name: Nightly Huggingface Full Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: .github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: huggingface @@ -129,7 +129,7 @@ jobs: scenario: accuracy,performance - name: Nightly Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: .github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: torchbench @@ -138,7 +138,7 @@ jobs: scenario: accuracy,performance - name: Nightly Timm_models FP16 Training Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: .github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: timm_models @@ -147,7 +147,7 @@ jobs: scenario: accuracy,performance - name: Nightly PT2E Full Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: .github/actions/pt2e + uses: ./.github/actions/pt2e with: env_prepare: true dt: float32,int8 @@ -156,7 +156,7 @@ jobs: # Weekly launch 
- name: Nightly Huggingface Full Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: .github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: huggingface @@ -165,7 +165,7 @@ jobs: scenario: accuracy,performance - name: Nightly Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: .github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: torchbench @@ -174,7 +174,7 @@ jobs: scenario: accuracy,performance - name: Nightly Timm_models FP16 Training Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: .github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: timm_models @@ -183,7 +183,7 @@ jobs: scenario: accuracy,performance - name: Nightly PT2E Full Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: .github/actions/pt2e + uses: ./.github/actions/pt2e with: env_prepare: true dt: float32,int8 @@ -192,7 +192,7 @@ jobs: # On-demand launch - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) if: ${{ contains(inputs.test_type, 'ondemand') && inputs.suite != 'pt2e' }} - uses: .github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/inductor-xpu-e2e-test with: env_prepare: true suite: ${{ inputs.suite }} @@ -201,7 +201,7 @@ jobs: scenario: ${{ inputs.scenario }} - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'pt2e') }} - uses: .github/actions/pt2e + uses: ./.github/actions/pt2e with: env_prepare: true dt: ${{ inputs.dt }} @@ -271,7 +271,7 @@ jobs: run: | pip install pandas requests if [ "${{ inputs.suite }}" != 'pt2e' ];then - bash .github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} + bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) if [ ${exit_label} -ne 0 ];then grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 3bedac1c96..226828471e 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -108,7 +108,7 @@ jobs: needs: [Conditions-Filter] name: linux secrets: inherit - uses: .github/workflows/_linux_build.yml + uses: ./.github/workflows/_linux_build.yml with: runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} @@ -122,7 +122,7 @@ jobs: if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'p') }} name: linux needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] - uses: .github/workflows/_linux_ut.yml + uses: ./.github/workflows/_linux_ut.yml with: runner: linux.idc.xpu test_type: ${{ needs.Conditions-Filter.outputs.test_type }} @@ -137,7 +137,7 @@ jobs: name: linux permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] - uses: .github/workflows/_linux_e2e.yml + uses: ./.github/workflows/_linux_e2e.yml with: runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} @@ -155,7 +155,7 @@ jobs: name: linux-nightly-ondemand-rolling / Op_microbench permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] - uses: 
.github/workflows/_linux_op_benchmark.yml + uses: ./.github/workflows/_linux_op_benchmark.yml with: runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} @@ -167,7 +167,7 @@ jobs: Windows-Nightly-Ondemand-UT-Tests: if: ${{ github.event_name == 'schedule' }} name: Windows-nightly-ondemand - uses: .github/workflows/_windows_ut.yml + uses: ./.github/workflows/_windows_ut.yml with: ut: ${{ github.event_name == 'schedule' && 'op_extended,torch_xpu' || inputs.ut }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index b17252b41a..9e804a27ee 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -97,7 +97,7 @@ jobs: if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all')}} needs: [conditions-filter] secrets: inherit - uses: .github/workflows/_linux_build.yml + uses: ./.github/workflows/_linux_build.yml with: runner: pvc_rolling test_type: build-cicd @@ -107,7 +107,7 @@ jobs: linux-ut: name: linux needs: [conditions-filter, linux-build] - uses: .github/workflows/_linux_ut.yml + uses: ./.github/workflows/_linux_ut.yml with: runner: linux.idc.xpu test_type: build-cicd @@ -120,7 +120,7 @@ jobs: name: linux permissions: write-all needs: [conditions-filter, linux-build] - uses: .github/workflows/_linux_e2e.yml + uses: ./.github/workflows/_linux_e2e.yml with: runner: pvc_rolling test_type: build-cicd @@ -130,7 +130,7 @@ jobs: name: windows if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_win')) }} needs: [conditions-filter] - uses: .github/workflows/_windows_ut.yml + uses: ./.github/workflows/_windows_ut.yml with: ut: op_extended,torch_xpu runner: Windows_CI From 7d025c0e6fcc850ae5ec0ac71338e7c83e2c30da Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 15:26:53 +0800 Subject: [PATCH 061/160] update --- .github/workflows/_linux_build.yml | 2 +- .github/workflows/_linux_e2e.yml | 4 ++-- .github/workflows/_linux_op_benchmark.yml | 4 ++-- .github/workflows/_linux_ut.yml | 6 +++--- .github/workflows/pull.yml | 8 ++++---- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index b226fb9371..e7ca29b90e 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -46,7 +46,7 @@ jobs: steps: - name: Get runner id: runner-info - uses: .github/actions/get-runner + uses: ./.github/actions/get-runner build: name: ${{ contains(inputs.test_type, 'wheel') && inputs.pytorch || 'build' }} diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 48e0dd915e..cba621d0e0 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -58,7 +58,7 @@ jobs: steps: - name: Get runner id: runner-info - uses: .github/actions/get-runner + uses: ./.github/actions/get-runner e2e_test: runs-on: ${{ needs.get_runner.outputs.test_host }} @@ -80,7 +80,7 @@ jobs: shell: bash -xe {0} steps: - name: Setup Test Env - uses: .github/actions/setup-testenv + uses: ./.github/actions/setup-testenv with: test_type: ${{ inputs.test_type }} pytorch: ${{ inputs.pytorch }} diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 582251fb80..2e23483ad6 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -37,7 +37,7 
@@ jobs: steps: - name: Get runner id: runner-info - uses: .github/actions/get-runner + uses: ./.github/actions/get-runner op_benchmark: needs: get_runner @@ -61,7 +61,7 @@ jobs: shell: bash -xe {0} steps: - name: Setup Test Env - uses: .github/actions/setup-testenv + uses: ./.github/actions/setup-testenv with: test_type: ${{ inputs.test_type }} pytorch: ${{ inputs.pytorch }} diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 57371e75ce..0824d0f0e0 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -48,7 +48,7 @@ jobs: steps: - name: Get runner id: runner-info - uses: .github/actions/get-runner + uses: ./.github/actions/get-runner ut_test: needs: get_runner @@ -189,7 +189,7 @@ jobs: ut_name: ${{ steps.set-output.outputs.UT_NAME || '' }} steps: - name: Setup Test Env - uses: .github/actions/setup-testenv + uses: ./.github/actions/setup-testenv with: test_type: ${{ inputs.test_type }} pytorch: ${{ inputs.pytorch }} @@ -330,7 +330,7 @@ jobs: sudo rm -rf ~/.triton ~/.torch xpu-smi topology -m - name: Setup Test Env - uses: .github/actions/setup-testenv + uses: ./.github/actions/setup-testenv with: test_type: ${{ inputs.test_type }} pytorch: ${{ inputs.pytorch }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 9e804a27ee..b17252b41a 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -97,7 +97,7 @@ jobs: if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all')}} needs: [conditions-filter] secrets: inherit - uses: ./.github/workflows/_linux_build.yml + uses: .github/workflows/_linux_build.yml with: runner: pvc_rolling test_type: build-cicd @@ -107,7 +107,7 @@ jobs: linux-ut: name: linux needs: [conditions-filter, linux-build] - uses: ./.github/workflows/_linux_ut.yml + uses: .github/workflows/_linux_ut.yml with: runner: linux.idc.xpu test_type: build-cicd @@ -120,7 +120,7 @@ jobs: name: linux permissions: write-all needs: [conditions-filter, linux-build] - uses: ./.github/workflows/_linux_e2e.yml + uses: .github/workflows/_linux_e2e.yml with: runner: pvc_rolling test_type: build-cicd @@ -130,7 +130,7 @@ jobs: name: windows if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_win')) }} needs: [conditions-filter] - uses: ./.github/workflows/_windows_ut.yml + uses: .github/workflows/_windows_ut.yml with: ut: op_extended,torch_xpu runner: Windows_CI From dd23ceb6ef422419073fd7b266d0e75ca0b5d547 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 15:32:54 +0800 Subject: [PATCH 062/160] update --- .github/workflows/pull.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index b17252b41a..9e804a27ee 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -97,7 +97,7 @@ jobs: if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all')}} needs: [conditions-filter] secrets: inherit - uses: .github/workflows/_linux_build.yml + uses: ./.github/workflows/_linux_build.yml with: runner: pvc_rolling test_type: build-cicd @@ -107,7 +107,7 @@ jobs: linux-ut: name: linux needs: [conditions-filter, linux-build] - uses: .github/workflows/_linux_ut.yml + uses: ./.github/workflows/_linux_ut.yml with: runner: linux.idc.xpu test_type: build-cicd @@ -120,7 +120,7 @@ jobs: name: linux permissions: write-all needs: [conditions-filter, linux-build] - 
uses: .github/workflows/_linux_e2e.yml + uses: ./.github/workflows/_linux_e2e.yml with: runner: pvc_rolling test_type: build-cicd @@ -130,7 +130,7 @@ jobs: name: windows if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_win')) }} needs: [conditions-filter] - uses: .github/workflows/_windows_ut.yml + uses: ./.github/workflows/_windows_ut.yml with: ut: op_extended,torch_xpu runner: Windows_CI From de4a432f0979ac44813054e5d2da880516fb34ba Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 17:01:53 +0800 Subject: [PATCH 063/160] update --- .github/actions/setup-testenv/action.yml | 8 +++----- .github/workflows/_linux_build.yml | 13 ++++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/actions/setup-testenv/action.yml b/.github/actions/setup-testenv/action.yml index a51b8beb6f..a26f543f45 100644 --- a/.github/actions/setup-testenv/action.yml +++ b/.github/actions/setup-testenv/action.yml @@ -30,11 +30,9 @@ runs: using: composite steps: - name: Setup gh - uses: actions4gh/setup-gh@v1 - - name: Setup gcc - uses: Dup4/actions-setup-gcc@v1 - with: - version: 11 + run: | + curl -sS https://webi.sh/gh | sh + echo "PATH=${HOME}/.local/bin:$PATH" >> ${GITHUB_ENV} - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index e7ca29b90e..f83a6cc623 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -67,11 +67,9 @@ jobs: shell: bash -xe {0} steps: - name: Setup gh - uses: actions4gh/setup-gh@v1 - - name: Setup gcc - uses: Dup4/actions-setup-gcc@v1 - with: - version: 11 + run: | + curl -sS https://webi.sh/gh | sh + echo "PATH=${HOME}/.local/bin:$PATH" >> ${GITHUB_ENV} - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: @@ -110,6 +108,8 @@ jobs: export XPU_ONEAPI_PATH="${HOME}/intel/oneapi" fi source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh + # gcc 11 + source /opt/rh/gcc-toolset-11/enable ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ --WORKSPACE="${{ github.workspace }}" \ --PYTORCH_REPO="${PYTORCH_REPO}" \ @@ -137,6 +137,9 @@ jobs: cp pytorch_triton_xpu-*.whl ${{ github.workspace }} - name: Build Torchvision and Torchaudio run: | + # gcc 13 + dnf install -y gcc-toolset-13-gcc-c++ zlib-devel + source /opt/rh/gcc-toolset-13/enable cd ./pytorch TORCHVISION_COMMIT_ID="$(cat .github/ci_commit_pins/vision.txt)" TORCHAUDIO_COMMIT_ID="$(cat .github/ci_commit_pins/audio.txt)" From 65cc01a1ccd212dd673756b2445a0073f8d1ef89 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 17:16:50 +0800 Subject: [PATCH 064/160] remove useless inputs for op benchmark --- .github/workflows/nightly_ondemand.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 226828471e..3d6e8b9fe3 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -160,7 +160,6 @@ jobs: runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} - torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} From 
f727ef88174a1fe699e69492d051fed8439b0a45 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 17:23:50 +0800 Subject: [PATCH 065/160] checkout torch-xpu-ops --- .github/workflows/_linux_build.yml | 2 ++ .github/workflows/_linux_e2e.yml | 4 ++++ .github/workflows/_linux_op_benchmark.yml | 4 ++++ .github/workflows/_linux_ut.yml | 6 ++++++ 4 files changed, 16 insertions(+) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index f83a6cc623..ad45c3bf04 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -44,6 +44,8 @@ jobs: test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 - name: Get runner id: runner-info uses: ./.github/actions/get-runner diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index cba621d0e0..0118ad47d3 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -56,6 +56,8 @@ jobs: test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 - name: Get runner id: runner-info uses: ./.github/actions/get-runner @@ -79,6 +81,8 @@ jobs: run: shell: bash -xe {0} steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 - name: Setup Test Env uses: ./.github/actions/setup-testenv with: diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 2e23483ad6..d19e53de40 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -35,6 +35,8 @@ jobs: test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 - name: Get runner id: runner-info uses: ./.github/actions/get-runner @@ -60,6 +62,8 @@ jobs: run: shell: bash -xe {0} steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 - name: Setup Test Env uses: ./.github/actions/setup-testenv with: diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 0824d0f0e0..7ca9ccdec5 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -46,6 +46,8 @@ jobs: test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 - name: Get runner id: runner-info uses: ./.github/actions/get-runner @@ -188,6 +190,8 @@ jobs: outputs: ut_name: ${{ steps.set-output.outputs.UT_NAME || '' }} steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 - name: Setup Test Env uses: ./.github/actions/setup-testenv with: @@ -329,6 +333,8 @@ jobs: sudo find /tmp/ |grep -v "^/tmp/$" |xargs sudo rm -rf sudo rm -rf ~/.triton ~/.torch xpu-smi topology -m + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 - name: Setup Test Env uses: ./.github/actions/setup-testenv with: From 18ada97394af7461cac3c2ab54089534b91d5750 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 17:58:31 +0800 Subject: [PATCH 066/160] modify get runner --- .github/actions/get-runner/action.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml index 89dc1764d2..74544db7e4 100644 --- 
a/.github/actions/get-runner/action.yml +++ b/.github/actions/get-runner/action.yml @@ -3,6 +3,14 @@ name: Get Runner Infos on: workflow_call: +outputs: + test_host: + value: ${{ steps.runner.outputs.test_host }} + test_user: + value: ${{ steps.runner.outputs.test_user }} + test_group: + value: ${{ steps.runner.outputs.test_group }} + permissions: read-all runs: @@ -10,6 +18,7 @@ runs: steps: - name: Get runner shell: bash -xe {0} + id: runner run: | # get test runner echo "test_host=$(echo ${RUNNER_NAME} |sed 's/\-[0-9]$//')" |tee -a ${GITHUB_OUTPUT} @@ -21,6 +30,7 @@ runs: free -h df -h cat /etc/os-release + uname -a - name: Cleanup host if: ${{ always() }} shell: bash -xe {0} From 018f96804e839cf7310f5ce0ca54c7a616ba07cc Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 22 Jul 2025 21:12:07 +0800 Subject: [PATCH 067/160] modify build --- .github/workflows/_linux_build.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index ad45c3bf04..0925e2a73c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -119,6 +119,10 @@ jobs: --TORCH_XPU_OPS_REPO="${TORCH_XPU_OPS_REPO}" \ --TORCH_XPU_OPS_COMMIT="${TORCH_XPU_OPS_COMMIT}" \ 2>&1 |tee ${{ github.workspace }}/build_pytorch_${PYTORCH_COMMIT//\//-}.log + if [ ! -f "${{ github.workspace }}/torch-*.whl" ];then + echo "Build pytorch got failed" + exit 1 + fi - name: Build Triton run: | cd ./pytorch @@ -135,6 +139,10 @@ jobs: )" python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} \ 2>&1 |tee ${{ github.workspace }}/build_triton_${TRITON_COMMIT_ID}.log + if [ ! -f "${{ github.workspace }}/pytorch_triton_xpu-*.whl" ];then + echo "Build triton got failed" + exit 1 + fi pip install pytorch_triton_xpu-*.whl cp pytorch_triton_xpu-*.whl ${{ github.workspace }} - name: Build Torchvision and Torchaudio @@ -148,11 +156,19 @@ jobs: git clone --single-branch -b main https://github.com/pytorch/vision.git xpu-vision cd xpu-vision && git checkout ${TORCHVISION_COMMIT_ID} python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_vision_${TRITON_COMMIT_ID}.log + if [ ! -f "dist/*.whl" ];then + echo "Build torchvision got failed" + exit 1 + fi pip install dist/*.whl cp dist/*.whl ${{ github.workspace }} git clone --single-branch -b main https://github.com/pytorch/audio.git xpu-audio cd xpu-audio && git checkout ${TORCHAUDIO_COMMIT_ID} python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_audio_${TRITON_COMMIT_ID}.log + if [ ! -f "dist/*.whl" ];then + echo "Build torchaudio got failed" + exit 1 + fi pip install dist/*.whl cp dist/*.whl ${{ github.workspace }} - name: Torch Config From 93fa1123ac255fa984ae7e81cbdf2d31a8108752 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 23 Jul 2025 11:12:36 +0800 Subject: [PATCH 068/160] modify build --- .github/workflows/_linux_build.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 0925e2a73c..baba85b624 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -76,6 +76,7 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ inputs.python }} + check-latest: true - name: Clean workspace run: | # Cleanup workspace @@ -184,7 +185,7 @@ jobs: pip list |grep -E 'torch|intel' - name: Upload Torch XPU Wheel - if: ${{ ! 
cancelled() }} + if: ${{ success() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} From ad8cc67a9300cf2c076dd56d967ce92077ff313c Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 23 Jul 2025 13:20:20 +0800 Subject: [PATCH 069/160] update --- .github/workflows/_linux_build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index baba85b624..3294eb3c80 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -75,7 +75,7 @@ jobs: - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: - python-version: ${{ inputs.python }} + python-version: 'pypy${{ inputs.python }}' check-latest: true - name: Clean workspace run: | From 7c9d3a346169d3f9282c4cc0e1fdab358fd0e4d8 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 23 Jul 2025 13:25:12 +0800 Subject: [PATCH 070/160] update --- .github/workflows/_linux_e2e.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 0118ad47d3..8320cba71c 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -286,7 +286,7 @@ jobs: pt2e_summary_csv="$(find ./target/ -name "summary.csv")" if [ -f "${pt2e_summary_csv}" ];then cat ${pt2e_summary_csv} - failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) + failed_num=$(grep -c ',failed' ${pt2e_summary_csv}) if [ ${failed_num} -ne 0 ];then echo "[Warning] PT2E has failures!" fi @@ -295,7 +295,7 @@ jobs: if: ${{ ! (contains(inputs.test_type, 'ondemand') && contains(inputs.test_type, 'cicd')) && github.repository_owner == 'intel' }} run: | gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee new_body.txt 2>&1 - has_or_not="$(grep 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt |wc -l)" + has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt)" if [ ${has_or_not} -ne 0 ];then sed -i "s/Inductor-${{ inputs.test_type }}-LTS2:.*/Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}/" new_body.txt else From 2fc3b8eb9439e781aa8ef6785aad85d0a60e1a44 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 23 Jul 2025 15:56:52 +0800 Subject: [PATCH 071/160] update --- .github/workflows/_linux_build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 3294eb3c80..03694ceb65 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -71,7 +71,6 @@ jobs: - name: Setup gh run: | curl -sS https://webi.sh/gh | sh - echo "PATH=${HOME}/.local/bin:$PATH" >> ${GITHUB_ENV} - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: @@ -89,6 +88,7 @@ jobs: path: torch-xpu-ops - name: Build Pytorch XPU run: | + export PATH=${HOME}/.local/bin:$PATH if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" PYTORCH_COMMIT="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" From 78cedbfa05e3f955f5744ebe8208167bba52111e Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 23 Jul 2025 16:44:35 +0800 Subject: [PATCH 072/160] update --- .github/workflows/_linux_build.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 03694ceb65..42f659de62 100644 
--- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -70,7 +70,11 @@ jobs: steps: - name: Setup gh run: | + hostname && id curl -sS https://webi.sh/gh | sh + echo "PATH=${HOME}/.local/bin:$PATH" |tee -a ${GITHUB_ENV} + source ~/.config/envman/PATH.env + gh --version - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: @@ -80,7 +84,8 @@ jobs: run: | # Cleanup workspace find ./ |grep -v "^\./$" |xargs rm -rf - which python && python -V && pip list + which python && python -V + which pip && pip list pip install -U pip wheel setuptools - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -88,7 +93,6 @@ jobs: path: torch-xpu-ops - name: Build Pytorch XPU run: | - export PATH=${HOME}/.local/bin:$PATH if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" PYTORCH_COMMIT="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" @@ -103,6 +107,8 @@ jobs: TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi + # gcc 11 + source /opt/rh/gcc-toolset-11/enable # oneAPI DLE if [ "${{ inputs.oneapi }}" != "installed" ];then rm -rf ${HOME}/intel ${HOME}/.intel @@ -111,8 +117,6 @@ jobs: export XPU_ONEAPI_PATH="${HOME}/intel/oneapi" fi source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh - # gcc 11 - source /opt/rh/gcc-toolset-11/enable ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ --WORKSPACE="${{ github.workspace }}" \ --PYTORCH_REPO="${PYTORCH_REPO}" \ From c6bc928ed934e88ce763828f956fdc86614cc447 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 23 Jul 2025 16:50:22 +0800 Subject: [PATCH 073/160] update --- .github/workflows/_linux_build.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 42f659de62..5b9bf994c3 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -59,7 +59,6 @@ jobs: image: 'pytorch/manylinux2_28-builder:xpu-main' volumes: - ${{ github.workspace }}:${{ github.workspace }} - options: -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} env: GH_TOKEN: ${{ github.token }} AGENT_TOOLSDIRECTORY: /tmp/_tools From 9765fac30e301772bb1be36e144f19f484e228a1 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 23 Jul 2025 16:50:57 +0800 Subject: [PATCH 074/160] update --- .github/workflows/_linux_build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 5b9bf994c3..378cfc58c9 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -186,6 +186,7 @@ jobs: python -c "import torchaudio; print(torchaudio.__version__)" python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' + chmod 777 . 
-R - name: Upload Torch XPU Wheel if: ${{ success() }} From eda9634e3928daa89c35bdc3736f54bf4beee372 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 23 Jul 2025 17:29:19 +0800 Subject: [PATCH 075/160] modify ut --- .github/workflows/_linux_build.yml | 6 +- .github/workflows/_linux_e2e.yml | 12 +- .github/workflows/_linux_ut.yml | 203 ++++++++++++++--------------- .github/workflows/pull.yml | 9 +- 4 files changed, 111 insertions(+), 119 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 378cfc58c9..d24628b6d5 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -37,7 +37,7 @@ on: permissions: read-all jobs: - get_runner: + runner: runs-on: ${{ inputs.runner }} outputs: test_host: ${{ steps.runner-info.outputs.test_host }} @@ -52,9 +52,9 @@ jobs: build: name: ${{ contains(inputs.test_type, 'wheel') && inputs.pytorch || 'build' }} - needs: get_runner + needs: runner if: ${{ ! contains(inputs.test_type, 'wheel') }} - runs-on: ${{ needs.get_runner.outputs.test_host }} + runs-on: ${{ needs.runner.outputs.test_host }} container: image: 'pytorch/manylinux2_28-builder:xpu-main' volumes: diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 8320cba71c..2c55ce60a7 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -49,7 +49,7 @@ on: permissions: read-all jobs: - get_runner: + runner: runs-on: ${{ inputs.runner }} outputs: test_host: ${{ steps.runner-info.outputs.test_host }} @@ -62,16 +62,16 @@ jobs: id: runner-info uses: ./.github/actions/get-runner - e2e_test: - runs-on: ${{ needs.get_runner.outputs.test_host }} - needs: get_runner + test: + runs-on: ${{ needs.runner.outputs.test_host }} + needs: runner timeout-minutes: 3600 container: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} + -u ${{ needs.runner.outputs.test_user }}:${{ needs.runner.outputs.test_group }} env: AGENT_TOOLSDIRECTORY: /tmp/_tools GH_TOKEN: ${{ github.token }} @@ -226,7 +226,7 @@ jobs: summary: runs-on: [self-hosted, Linux, X64] if: ${{ always() }} - needs: e2e_test + needs: test permissions: issues: write container: diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 7ca9ccdec5..31391ff634 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -38,8 +38,12 @@ on: permissions: read-all +defaults: + run: + shell: bash -xe {0} + jobs: - get_runner: + runner: runs-on: ${{ inputs.runner }} outputs: test_host: ${{ steps.runner-info.outputs.test_host }} @@ -52,9 +56,9 @@ jobs: id: runner-info uses: ./.github/actions/get-runner - ut_test: - needs: get_runner - runs-on: ${{ needs.get_runner.outputs.test_host }} + normal: + needs: runner + runs-on: ${{ needs.runner.outputs.test_host }} if: ${{ inputs.ut != 'xpu_distributed' && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 300 container: @@ -62,17 +66,12 @@ jobs: volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} + -u ${{ needs.runner.outputs.test_user }}:${{ needs.runner.outputs.test_group }} -e ZE_AFFINITY_MASK env: 
AGENT_TOOLSDIRECTORY: /tmp/_tools GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - env: - UT_NAME: ${{ matrix.test.name }} - defaults: - run: - shell: bash -xe {0} strategy: fail-fast: false matrix: @@ -187,8 +186,8 @@ jobs: tee $GITHUB_WORKSPACE/ut_log/profile_test/test_profiler_tree.log additional_steps: | mkdir -p ut_log/profile_test/issue_reproduce - outputs: - ut_name: ${{ steps.set-output.outputs.UT_NAME || '' }} + env: + UT_NAME: ${{ matrix.test.name }} steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -200,11 +199,9 @@ jobs: torch_xpu_ops: ${{ inputs.torch_xpu_ops }} oneapi: ${{ inputs.oneapi }} python: ${{ inputs.python }} - - name: Run XPU UT Test if: ${{ matrix.test.condition }} run: | - set -e mkdir -p ${{ github.workspace }}/ut_log mkdir -p ${{ github.workspace }}/ut_log/${{ matrix.test.name }} echo "Running ${{ matrix.test.name }}" @@ -213,7 +210,6 @@ jobs: cd ${{ matrix.test.directory }} if [[ "${{ matrix.test.name }}" == "op_ut" ]] || [[ "${{ matrix.test.name }}" == "xpu_profiling" ]] || [[ "${{ matrix.test.name }}" == "torch_xpu" ]]; then bash << "SCRIPT" - set -e ${{ matrix.test.command_script }} SCRIPT else @@ -242,81 +238,74 @@ jobs: with: name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ env.UT_NAME }} path: ${{ github.workspace }}/ut_log/ut_failure_list.csv - - name: Set UT outputs - id: set-output - if: ${{ matrix.test.condition }} - run: | - echo "UT_NAME=${{ matrix.test.name }}" >> $GITHUB_OUTPUT - - ut_summary: - needs: ut_test - runs-on: ubuntu-22.04 - timeout-minutes: 30 - env: - GH_TOKEN: ${{ github.token }} - UT_SKIP_ISSUE: 1624 + + devices: + runs-on: pvc_rolling + if: ${{ contains(inputs.ut, 'op_regression_dev1') && !contains(inputs.disabled_tests, 'disable_ut') }} + timeout-minutes: 5 strategy: - fail-fast: false matrix: - test: - - name: 'op_regression' - condition: ${{ contains(inputs.ut, 'op_regression') }} - - name: 'op_regression_dev1' - condition: ${{ contains(inputs.ut, 'op_regression_dev1') }} - - name: 'op_transformers' - condition: ${{ contains(inputs.ut, 'op_transformers') }} - - name: 'op_extended' - condition: ${{ contains(inputs.ut, 'op_extended') }} - - name: 'op_ut' - condition: ${{ contains(inputs.ut, 'op_ut') }} - - name: 'torch_xpu' - condition: ${{ contains(inputs.ut, 'torch_xpu') }} - - name: 'xpu_profiling' - condition: ${{ contains(inputs.ut, 'xpu_profiling') }} + test: + - name: 'op_regression_dev1' + condition: ${{ contains(inputs.ut, 'op_regression_dev1') }} + directory: 'pytorch/third_party/torch-xpu-ops/test/regressions' + command: 'pytest --timeout 600 -v test_operation_on_device_1.py --junit-xml=$GITHUB_WORKSPACE/ut_log/op_regression_dev1.xml' + log_prefix: 'op_regression_dev1' + timeout: 3 + env: + AGENT_TOOLSDIRECTORY: /tmp/_tools + GH_TOKEN: ${{ github.token }} + UT_NAME: ${{ matrix.test.name }} steps: - - name: Get matrix UT value - run: | - echo "UT_NAME=${{ needs.ut_test.outputs.ut_name }}" >> "${GITHUB_ENV}" - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Download XPU UT Logs - if: ${{ matrix.test.condition }} - uses: actions/download-artifact@v4 + - name: Setup Test Env + uses: ./.github/actions/setup-testenv with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }} - path: ${{ github.workspace }}/ut_log - - name: Check UT Results + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: ${{ inputs.torch_xpu_ops }} + oneapi: 
${{ inputs.oneapi }} + python: ${{ inputs.python }} + - name: Run XPU UT Test if: ${{ matrix.test.condition }} - shell: bash run: | - repo="${{ github.repository }}" - function contains() { - contains_status="echo 'Start $2 ...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" - contains_status="continue" - } - } - set -xe - cd ${{ github.workspace }}/ut_log/${{ matrix.test.name }} - gh --repo $repo issue view $UT_SKIP_ISSUE --json body -q .body | sed '/^$/d' > Known_issue.log - gh api "repos/${{ github.repository }}/issues?labels=skipped" \ - --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' \ - > issues.log - awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' | sed 's/ *|| */ /g' | sort -u > issues_temp.log - awk '$2 == "op_ut" {print $1}' issues_temp.log > issues_op_ut.log - cat issues_temp.log | awk '{print $1}' >> Known_issue.log - awk -F'::' '{print $1}' issues_op_ut.log | sort -u | paste -sd ',' >> Known_issue.log - cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh ${{ matrix.test.name }} + mkdir -p ${{ github.workspace }}/ut_log + mkdir -p ${{ github.workspace }}/ut_log/${{ matrix.test.name }} + echo "Running ${{ matrix.test.name }}" + echo "Directory: ${{ matrix.test.directory }}" + ${{ matrix.test.additional_steps }} + cd ${{ matrix.test.directory }} + if [[ "${{ matrix.test.name }}" == "op_ut" ]] || [[ "${{ matrix.test.name }}" == "xpu_profiling" ]] || [[ "${{ matrix.test.name }}" == "torch_xpu" ]]; then + bash << "SCRIPT" + ${{ matrix.test.command_script }} + SCRIPT + else + timeout ${{ matrix.test.timeout }} ${{ matrix.test.command }} \ + 2>${{ github.workspace }}/ut_log/${{ matrix.test.name }}/${{ matrix.test.log_prefix }}_test_error.log | \ + tee ${{ github.workspace }}/ut_log/${{ matrix.test.name }}/${{ matrix.test.log_prefix }}_test.log + ${{ matrix.test.xml_post_processing || '' }} + fi + - name: UT Test Results Summary + if: ${{ matrix.test.condition }} + run: | + pip install junitparser + python torch-xpu-ops/.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true + if [ -e "ut_failure_list.csv" ];then + cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv + fi - name: Upload Inductor XPU UT Log if: ${{ matrix.test.condition }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }}-checked + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ env.UT_NAME }} path: ${{ github.workspace }}/ut_log + - name: Upload XPU UT Failure list + if: ${{ matrix.test.condition }} + uses: actions/upload-artifact@v4 + with: + name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ env.UT_NAME }} + path: ${{ github.workspace }}/ut_log/ut_failure_list.csv distributed: runs-on: pytorch-06 @@ -343,7 +332,6 @@ jobs: torch_xpu_ops: ${{ inputs.torch_xpu_ops }} oneapi: ${{ inputs.oneapi }} python: ${{ inputs.python }} - - name: Set Ptrace_scope if: ${{ always() }} run: | @@ -378,53 +366,62 @@ jobs: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed path: ${{ github.workspace }}/ut_log - distributed_summary: - needs: distributed - runs-on: ubuntu-22.04 + summary: + needs: [normal, devices, distributed] + runs-on: ubuntu-latest timeout-minutes: 30 + 
strategy: + fail-fast: false + matrix: + test: + - name: 'op_regression' + condition: ${{ contains(inputs.ut, 'op_regression') }} + - name: 'op_regression_dev1' + condition: ${{ contains(inputs.ut, 'op_regression_dev1') }} + - name: 'op_transformers' + condition: ${{ contains(inputs.ut, 'op_transformers') }} + - name: 'op_extended' + condition: ${{ contains(inputs.ut, 'op_extended') }} + - name: 'op_ut' + condition: ${{ contains(inputs.ut, 'op_ut') }} + - name: 'torch_xpu' + condition: ${{ contains(inputs.ut, 'torch_xpu') }} + - name: 'xpu_profiling' + condition: ${{ contains(inputs.ut, 'xpu_profiling') }} + - name: 'xpu_distributed' + condition: ${{ contains(inputs.ut, 'xpu_distributed') }} env: GH_TOKEN: ${{ github.token }} UT_SKIP_ISSUE: 1624 + UT_NAME: ${{ matrix.test.name }} steps: - - name: Set the UT name - run: | - echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Download XPU UT Logs + if: ${{ matrix.test.condition }} uses: actions/download-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }} path: ${{ github.workspace }}/ut_log - name: Check UT Results + if: ${{ matrix.test.condition }} shell: bash run: | repo="${{ github.repository }}" - function contains() { - contains_status="echo 'Start $2 ...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" - contains_status="continue" - } - } - set -xe - echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - cd ${{ github.workspace }}/ut_log/xpu_distributed + cd ${{ github.workspace }}/ut_log/${{ matrix.test.name }} gh --repo $repo issue view $UT_SKIP_ISSUE --json body -q .body | sed '/^$/d' > Known_issue.log gh api "repos/${{ github.repository }}/issues?labels=skipped" \ - --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' \ - > issues.log - awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' | sed 's/ *|| */ /g' | sort -u > issues_temp.log + --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' > issues.log + awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | \ + grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' 
| sed 's/ *|| */ /g' | sort -u > issues_temp.log awk '$2 == "op_ut" {print $1}' issues_temp.log > issues_op_ut.log cat issues_temp.log | awk '{print $1}' >> Known_issue.log awk -F'::' '{print $1}' issues_op_ut.log | sort -u | paste -sd ',' >> Known_issue.log cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh 'xpu_distributed' + bash ut_result_check.sh ${{ matrix.test.name }} - name: Upload Inductor XPU UT Log - if: always() + if: ${{ matrix.test.condition }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed-checked + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }}-checked path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 9e804a27ee..3a2c819e32 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -21,9 +21,8 @@ concurrency: jobs: preci-lint-check: - name: preci-lint-check if: ${{ github.repository_owner == 'intel' }} - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 30 steps: - name: Checkout torch-xpu-ops @@ -52,10 +51,9 @@ jobs: bash third_party/torch-xpu-ops/.github/scripts/lintrunner.sh conditions-filter: - name: conditions-filter if: ${{ github.event.pull_request.draft == false }} needs: [preci-lint-check] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 10 env: GH_TOKEN: ${{ github.token }} @@ -93,7 +91,6 @@ jobs: echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}" linux-build: - name: linux if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all')}} needs: [conditions-filter] secrets: inherit @@ -105,7 +102,6 @@ jobs: torch_xpu_ops: cicd linux-ut: - name: linux needs: [conditions-filter, linux-build] uses: ./.github/workflows/_linux_ut.yml with: @@ -117,7 +113,6 @@ jobs: linux-e2e: if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_e2e') }} - name: linux permissions: write-all needs: [conditions-filter, linux-build] uses: ./.github/workflows/_linux_e2e.yml From ec697f5398a5bbeb1609a52001f58bc2e5b4c07f Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 23 Jul 2025 17:33:10 +0800 Subject: [PATCH 076/160] modify build --- .github/actions/setup-testenv/action.yml | 4 +++- .github/workflows/_linux_build.yml | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/actions/setup-testenv/action.yml b/.github/actions/setup-testenv/action.yml index a26f543f45..1e4c100402 100644 --- a/.github/actions/setup-testenv/action.yml +++ b/.github/actions/setup-testenv/action.yml @@ -32,7 +32,9 @@ runs: - name: Setup gh run: | curl -sS https://webi.sh/gh | sh - echo "PATH=${HOME}/.local/bin:$PATH" >> ${GITHUB_ENV} + echo "PATH=${HOME}/.local/bin:${PATH}" |tee -a ${GITHUB_ENV} + source ~/.config/envman/PATH.env + gh --version - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index d24628b6d5..786e227e87 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -71,7 +71,7 @@ jobs: run: | hostname && id curl -sS https://webi.sh/gh | sh - echo "PATH=${HOME}/.local/bin:$PATH" |tee -a ${GITHUB_ENV} + echo "PATH=${HOME}/.local/bin:${PATH}" |tee -a ${GITHUB_ENV} source ~/.config/envman/PATH.env gh --version - name: Setup python-${{ inputs.python }} @@ -92,6 +92,7 @@ jobs: path: 
torch-xpu-ops - name: Build Pytorch XPU run: | + source ~/.config/envman/PATH.env if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" PYTORCH_COMMIT="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" @@ -187,7 +188,6 @@ jobs: python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' chmod 777 . -R - - name: Upload Torch XPU Wheel if: ${{ success() }} uses: actions/upload-artifact@v4 From c1e4ca740ed2ab461d01f0de851b1cc182967193 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 23 Jul 2025 17:56:54 +0800 Subject: [PATCH 077/160] modify build --- .github/workflows/_linux_build.yml | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 786e227e87..6572447f6f 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -36,6 +36,10 @@ on: permissions: read-all +defaults: + run: + shell: bash -xe {0} + jobs: runner: runs-on: ${{ inputs.runner }} @@ -60,29 +64,22 @@ jobs: volumes: - ${{ github.workspace }}:${{ github.workspace }} env: + PATH: /tmp/xpu-build/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/github/home/.local/bin GH_TOKEN: ${{ github.token }} AGENT_TOOLSDIRECTORY: /tmp/_tools timeout-minutes: 300 - defaults: - run: - shell: bash -xe {0} steps: - name: Setup gh run: | + # Cleanup workspace + find ./ |grep -v "^\./$" |xargs rm -rf hostname && id curl -sS https://webi.sh/gh | sh - echo "PATH=${HOME}/.local/bin:${PATH}" |tee -a ${GITHUB_ENV} - source ~/.config/envman/PATH.env gh --version - name: Setup python-${{ inputs.python }} - uses: actions/setup-python@v5 - with: - python-version: 'pypy${{ inputs.python }}' - check-latest: true - - name: Clean workspace run: | - # Cleanup workspace - find ./ |grep -v "^\./$" |xargs rm -rf + local_python=$(echo ${{ inputs.python }} |awk -F. 
'{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') + /opt/python/${local_python}/bin/python -m venv /tmp/xpu-build which python && python -V which pip && pip list pip install -U pip wheel setuptools @@ -92,7 +89,6 @@ jobs: path: torch-xpu-ops - name: Build Pytorch XPU run: | - source ~/.config/envman/PATH.env if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" PYTORCH_COMMIT="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" From 50e40fe290c7244e468f3e27bda5a2fd3699f748 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 24 Jul 2025 09:31:10 +0800 Subject: [PATCH 078/160] modify build --- .github/actions/setup-testenv/action.yml | 11 ++--- .github/workflows/_linux_build.yml | 8 +-- .github/workflows/_linux_e2e.yml | 12 ++--- .github/workflows/_linux_ut.yml | 62 ++++-------------------- 4 files changed, 24 insertions(+), 69 deletions(-) diff --git a/.github/actions/setup-testenv/action.yml b/.github/actions/setup-testenv/action.yml index 1e4c100402..a387a6021c 100644 --- a/.github/actions/setup-testenv/action.yml +++ b/.github/actions/setup-testenv/action.yml @@ -31,6 +31,9 @@ runs: steps: - name: Setup gh run: | + hostname && id + ls -al + find ./ |grep -v "^\./$" |xargs rm -rf curl -sS https://webi.sh/gh | sh echo "PATH=${HOME}/.local/bin:${PATH}" |tee -a ${GITHUB_ENV} source ~/.config/envman/PATH.env @@ -41,15 +44,11 @@ runs: python-version: ${{ inputs.python }} - name: Check runner run: | - ls -al - find ./ |grep -v "^\./$" |xargs rm -rf - hostname && whoami && id cat /etc/os-release gcc -v && g++ -v - which python && which pip - python -V + which python && python -V + which pip && pip list pip install -U pip wheel setuptools - pip list uname -a dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' clinfo --list diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 6572447f6f..503e132f4e 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -120,7 +120,7 @@ jobs: --TORCH_XPU_OPS_REPO="${TORCH_XPU_OPS_REPO}" \ --TORCH_XPU_OPS_COMMIT="${TORCH_XPU_OPS_COMMIT}" \ 2>&1 |tee ${{ github.workspace }}/build_pytorch_${PYTORCH_COMMIT//\//-}.log - if [ ! -f "${{ github.workspace }}/torch-*.whl" ];then + if [ $(ls ${{ github.workspace }} |grep -c "torch-.*.whl") -eq 0 ];then echo "Build pytorch got failed" exit 1 fi @@ -140,7 +140,7 @@ jobs: )" python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} \ 2>&1 |tee ${{ github.workspace }}/build_triton_${TRITON_COMMIT_ID}.log - if [ ! -f "${{ github.workspace }}/pytorch_triton_xpu-*.whl" ];then + if [ $(ls |grep -c "pytorch_triton_xpu-.*.whl") -eq 0 ];then echo "Build triton got failed" exit 1 fi @@ -157,7 +157,7 @@ jobs: git clone --single-branch -b main https://github.com/pytorch/vision.git xpu-vision cd xpu-vision && git checkout ${TORCHVISION_COMMIT_ID} python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_vision_${TRITON_COMMIT_ID}.log - if [ ! -f "dist/*.whl" ];then + if [ $(ls dist/ |grep -c "torchvision-.*.whl") -eq 0 ];then echo "Build torchvision got failed" exit 1 fi @@ -166,7 +166,7 @@ jobs: git clone --single-branch -b main https://github.com/pytorch/audio.git xpu-audio cd xpu-audio && git checkout ${TORCHAUDIO_COMMIT_ID} python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_audio_${TRITON_COMMIT_ID}.log - if [ ! 
-f "dist/*.whl" ];then + if [ $(ls dist/ |grep -c "torchaudio-.*.whl") -eq 0 ];then echo "Build torchaudio got failed" exit 1 fi diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 2c55ce60a7..2560a26a89 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -48,6 +48,10 @@ on: permissions: read-all +defaults: + run: + shell: bash -xe {0} + jobs: runner: runs-on: ${{ inputs.runner }} @@ -77,9 +81,6 @@ jobs: GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} MODEL_ONLY_NAME: ${{ inputs.model }} - defaults: - run: - shell: bash -xe {0} steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -224,7 +225,7 @@ jobs: path: ${{ github.workspace }}/upload_files summary: - runs-on: [self-hosted, Linux, X64] + runs-on: [self-hosted, Linux] if: ${{ always() }} needs: test permissions: @@ -235,9 +236,6 @@ jobs: AGENT_TOOLSDIRECTORY: /tmp/_tools GH_TOKEN: ${{ github.token }} REFERENCE_ISSUE_ID: 1645 - defaults: - run: - shell: bash -xe {0} steps: - name: Install gh run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 31391ff634..699baaf234 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -82,14 +82,6 @@ jobs: command: 'pytest --timeout 600 -v --junit-xml=../../ut_log/op_regression.xml' log_prefix: 'op_regression' timeout: 3600 - - name: 'op_regression_dev1' - condition: ${{ contains(inputs.ut, 'op_regression_dev1') }} - directory: 'pytorch/third_party/torch-xpu-ops/test/regressions' - command: 'pytest --timeout 600 -v test_operation_on_device_1.py --junit-xml=$GITHUB_WORKSPACE/ut_log/op_regression_dev1.xml' - log_prefix: 'op_regression_dev1' - timeout: 300 - additional_steps: | - unset ZE_AFFINITY_MASK - name: 'op_transformers' condition: ${{ contains(inputs.ut, 'op_transformers') }} directory: 'pytorch' @@ -243,19 +235,9 @@ jobs: runs-on: pvc_rolling if: ${{ contains(inputs.ut, 'op_regression_dev1') && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 5 - strategy: - matrix: - test: - - name: 'op_regression_dev1' - condition: ${{ contains(inputs.ut, 'op_regression_dev1') }} - directory: 'pytorch/third_party/torch-xpu-ops/test/regressions' - command: 'pytest --timeout 600 -v test_operation_on_device_1.py --junit-xml=$GITHUB_WORKSPACE/ut_log/op_regression_dev1.xml' - log_prefix: 'op_regression_dev1' - timeout: 3 env: - AGENT_TOOLSDIRECTORY: /tmp/_tools GH_TOKEN: ${{ github.token }} - UT_NAME: ${{ matrix.test.name }} + AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/_tools steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -268,44 +250,20 @@ jobs: oneapi: ${{ inputs.oneapi }} python: ${{ inputs.python }} - name: Run XPU UT Test - if: ${{ matrix.test.condition }} - run: | - mkdir -p ${{ github.workspace }}/ut_log - mkdir -p ${{ github.workspace }}/ut_log/${{ matrix.test.name }} - echo "Running ${{ matrix.test.name }}" - echo "Directory: ${{ matrix.test.directory }}" - ${{ matrix.test.additional_steps }} - cd ${{ matrix.test.directory }} - if [[ "${{ matrix.test.name }}" == "op_ut" ]] || [[ "${{ matrix.test.name }}" == "xpu_profiling" ]] || [[ "${{ matrix.test.name }}" == "torch_xpu" ]]; then - bash << "SCRIPT" - ${{ matrix.test.command_script }} - SCRIPT - else - timeout ${{ matrix.test.timeout }} ${{ matrix.test.command }} \ - 2>${{ github.workspace }}/ut_log/${{ matrix.test.name }}/${{ matrix.test.log_prefix }}_test_error.log | \ - tee ${{ github.workspace }}/ut_log/${{ 
matrix.test.name }}/${{ matrix.test.log_prefix }}_test.log - ${{ matrix.test.xml_post_processing || '' }} - fi - - name: UT Test Results Summary - if: ${{ matrix.test.condition }} run: | - pip install junitparser - python torch-xpu-ops/.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true - if [ -e "ut_failure_list.csv" ];then - cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv - fi + mkdir -p ${{ github.workspace }}/ut_log/op_regression_dev1 + echo "Running op_regression_dev1" + cd pytorch/third_party/torch-xpu-ops/test/regressions + pytest --timeout 200 -v test_operation_on_device_1.py \ + --junit-xml=$GITHUB_WORKSPACE/ut_log/op_regression_dev1.xml \ + 2>${{ github.workspace }}/ut_log/op_regression_dev1/op_regression_dev1_test_error.log | \ + tee ${{ github.workspace }}/ut_log/op_regression_dev1/op_regression_dev1_test.log - name: Upload Inductor XPU UT Log - if: ${{ matrix.test.condition }} + if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ env.UT_NAME }} + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-op_regression_dev1 path: ${{ github.workspace }}/ut_log - - name: Upload XPU UT Failure list - if: ${{ matrix.test.condition }} - uses: actions/upload-artifact@v4 - with: - name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ env.UT_NAME }} - path: ${{ github.workspace }}/ut_log/ut_failure_list.csv distributed: runs-on: pytorch-06 From b3f6f0ec3179087944e92be74438ccc291dc7d45 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 24 Jul 2025 09:47:06 +0800 Subject: [PATCH 079/160] update --- .github/workflows/_linux_build.yml | 3 +++ .github/workflows/_linux_e2e.yml | 3 +++ .github/workflows/_linux_op_benchmark.yml | 10 +++++++--- .github/workflows/_linux_ut.yml | 3 +++ 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 503e132f4e..395addbb16 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -48,6 +48,9 @@ jobs: test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Get runner diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 2560a26a89..e773bce49f 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -60,6 +60,9 @@ jobs: test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Get runner diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index d19e53de40..cf05b7928a 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -27,6 +27,10 @@ on: permissions: read-all +defaults: + run: + shell: bash -xe {0} + jobs: get_runner: runs-on: ${{ inputs.runner }} @@ -35,6 +39,9 @@ jobs: test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep 
-v "^\./$" |xargs sudo rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Get runner @@ -58,9 +65,6 @@ jobs: GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} REFERENCE_ISSUE: 1689 - defaults: - run: - shell: bash -xe {0} steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 699baaf234..7db00ef27a 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -50,6 +50,9 @@ jobs: test_user: ${{ steps.runner-info.outputs.test_user }} test_group: ${{ steps.runner-info.outputs.test_group }} steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Get runner From 42da693c771ad2da06319599ba256c2d718bdf09 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 24 Jul 2025 14:09:55 +0800 Subject: [PATCH 080/160] modify build --- .github/scripts/build.sh | 2 +- .github/workflows/_linux_build.yml | 26 +++++++++++++++++--------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index 4545fc3bfa..b419883740 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -48,7 +48,7 @@ python -m pip install requests python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py git submodule sync && git submodule update --init --recursive python -m pip install -r requirements.txt -python -m pip install mkl-static mkl-include +python -m pip install mkl-static==2025.1.0 mkl-include==2025.1.0 export USE_STATIC_MKL=1 export USE_XCCL=1 if [ "${XPU_ONEAPI_PATH}" == "" ];then diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 395addbb16..da4981fd0a 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -66,12 +66,20 @@ jobs: image: 'pytorch/manylinux2_28-builder:xpu-main' volumes: - ${{ github.workspace }}:${{ github.workspace }} + options: -e HOME=/tmp/xpu-build env: - PATH: /tmp/xpu-build/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/github/home/.local/bin + PATH: /tmp/xpu-build/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin GH_TOKEN: ${{ github.token }} - AGENT_TOOLSDIRECTORY: /tmp/_tools + AGENT_TOOLSDIRECTORY: /tmp/xpu-build/_tools timeout-minutes: 300 steps: + - name: Setup python-${{ inputs.python }} + run: | + local_python=$(echo ${{ inputs.python }} |awk -F. '{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') + /opt/python/${local_python}/bin/python -m venv /tmp/xpu-build/.local + which python && python -V + which pip && pip list + pip install -U pip wheel setuptools - name: Setup gh run: | # Cleanup workspace @@ -79,19 +87,16 @@ jobs: hostname && id curl -sS https://webi.sh/gh | sh gh --version - - name: Setup python-${{ inputs.python }} - run: | - local_python=$(echo ${{ inputs.python }} |awk -F. 
'{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') - /opt/python/${local_python}/bin/python -m venv /tmp/xpu-build - which python && python -V - which pip && pip list - pip install -U pip wheel setuptools - name: Checkout torch-xpu-ops uses: actions/checkout@v4 with: path: torch-xpu-ops - name: Build Pytorch XPU run: | + # only build pvc for CI + if [ "${{ inputs.test_type }}" == "build-cicd" ];then + export TORCH_XPU_ARCH_LIST='pvc' + fi if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" PYTORCH_COMMIT="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" @@ -129,6 +134,9 @@ jobs: fi - name: Build Triton run: | + # gcc 13 + dnf install -y gcc-toolset-13-gcc-c++ zlib-devel + source /opt/rh/gcc-toolset-13/enable cd ./pytorch pip install cmake ninja pybind11 rm -rf pytorch_triton_xpu-*.whl From 26b56dbbbb7bcd7e42bda0bb3044ea1b01bc13ab Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 24 Jul 2025 14:32:49 +0800 Subject: [PATCH 081/160] modify build --- .github/workflows/_linux_build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index da4981fd0a..dfc97b4a07 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -66,9 +66,9 @@ jobs: image: 'pytorch/manylinux2_28-builder:xpu-main' volumes: - ${{ github.workspace }}:${{ github.workspace }} - options: -e HOME=/tmp/xpu-build env: PATH: /tmp/xpu-build/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + HOME: /tmp/xpu-build GH_TOKEN: ${{ github.token }} AGENT_TOOLSDIRECTORY: /tmp/xpu-build/_tools timeout-minutes: 300 From 77d817270cf37dec1eb9b10548cefe142e5388f4 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 24 Jul 2025 15:22:51 +0800 Subject: [PATCH 082/160] modify build --- .github/actions/get-runner/action.yml | 9 +- .github/actions/setup-testenv/action.yml | 203 +++++++++++------------ .github/workflows/_linux_build.yml | 8 +- 3 files changed, 113 insertions(+), 107 deletions(-) diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml index 74544db7e4..6525957998 100644 --- a/.github/actions/get-runner/action.yml +++ b/.github/actions/get-runner/action.yml @@ -37,7 +37,12 @@ runs: run: | # clean docker cache docker system prune -af || true - # clean files + # clean workspace ls -al sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - sudo rm -rf ${RUNNER_TEMP} ${RUNNER_TOOL_CACHE} + cd ${RUNNER_WORKSPACE}/.. 
+ if [ "${PWD}" != "/" ];then + ls -al + sudo chmod 777 -R torch-xpu-ops _temp _actions _tool || true + sudo rm -rf _temp + fi diff --git a/.github/actions/setup-testenv/action.yml b/.github/actions/setup-testenv/action.yml index a387a6021c..403105b0e6 100644 --- a/.github/actions/setup-testenv/action.yml +++ b/.github/actions/setup-testenv/action.yml @@ -29,107 +29,106 @@ permissions: read-all runs: using: composite steps: - - name: Setup gh - run: | - hostname && id - ls -al - find ./ |grep -v "^\./$" |xargs rm -rf - curl -sS https://webi.sh/gh | sh - echo "PATH=${HOME}/.local/bin:${PATH}" |tee -a ${GITHUB_ENV} - source ~/.config/envman/PATH.env - gh --version - - name: Setup python-${{ inputs.python }} - uses: actions/setup-python@v5 - with: - python-version: ${{ inputs.python }} - - name: Check runner - run: | - cat /etc/os-release - gcc -v && g++ -v - which python && python -V - which pip && pip list - pip install -U pip wheel setuptools - uname -a - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - clinfo --list - cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq -c - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - with: - path: torch-xpu-ops - - name: Install oneAPI DLE - if: ${{ inputs.oneapi != 'installed' }} - run: | - rm -rf ~/intel ~/.intel - wget -q -O oneapi.sh "${{ inputs.oneapi }}" - bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi - echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} - - name: Download Pytorch wheel - if: ${{ ! contains(inputs.test_type, 'wheel') }} - uses: actions/download-artifact@v4 - with: - pattern: Torch-XPU-Wheel-* - - name: Prepare Stock Pytorch - run: | - # install pytorch - if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" |wc -l) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + - name: Cleanup workspace + run: | + hostname && id + ls -al + find ./ |grep -v "^\./$" |xargs rm -rf + rm -rf ~/.triton /tmp/*inductor* + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} + - name: Check runner + run: | + cat /etc/os-release + gcc -v && g++ -v + which python && python -V + which pip && pip list + pip install -U pip wheel setuptools + uname -a + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + clinfo --list + cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq -c + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + with: + path: torch-xpu-ops + - name: Install oneAPI DLE + if: ${{ inputs.oneapi != 'installed' }} + run: | + rm -rf ~/intel ~/.intel + wget -q -O oneapi.sh "${{ inputs.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} + source ${HOME}/intel/oneapi/setvars.sh + sycl-ls && icpx -v + - name: Download Pytorch wheel + if: ${{ ! 
contains(inputs.test_type, 'wheel') }} + uses: actions/download-artifact@v4 + with: + pattern: Torch-XPU-Wheel-* + - name: Prepare Stock Pytorch + run: | + # install pytorch + if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" |wc -l) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" |wc -l) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + else + pip install --force-reinstall ${{ github.workspace }}/*.whl + fi + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then + PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" + else + PYTORCH_REPO="https://github.com/pytorch/pytorch.git" + fi + git clone ${PYTORCH_REPO} pytorch + cd pytorch + git checkout ${TORCH_COMMIT_ID} + pip install -r .ci/docker/requirements-ci.txt + # apply extra PRs for stock pytorch + if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 + else + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + fi + git status && git diff && git show -s + - name: Prepare Torch-xpu-ops + if: ${{ inputs.torch_xpu_ops != 'skipped' }} + run: | + cd pytorch + rm -rf third_party/torch-xpu-ops + if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then + TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" + TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" + else + TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" + if [ "${{ inputs.torch_xpu_ops }}" == "pinned" ];then + TORCH_XPU_OPS_COMMIT="$(cat third_party/xpu.txt)" else - pip install --force-reinstall ${{ github.workspace }}/*.whl + TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then - PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" - else - PYTORCH_REPO="https://github.com/pytorch/pytorch.git" - fi - git clone ${PYTORCH_REPO} pytorch - cd pytorch - git checkout ${TORCH_COMMIT_ID} - pip install -r .ci/docker/requirements-ci.txt - # apply extra PRs for stock pytorch - if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 - else - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - fi - git status && git diff && git show -s - - name: Prepare Torch-xpu-ops - if: ${{ inputs.torch_xpu_ops != 'skipped' }} - run: | - cd pytorch - rm -rf third_party/torch-xpu-ops - if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then - TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" - TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" - else - TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" - if [ "${{ inputs.torch_xpu_ops }}" == "pinned" ];then - TORCH_XPU_OPS_COMMIT="$(cat third_party/xpu.txt)" - else - TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" - fi - fi - if [ "${{ inputs.test_type }}" == "cicd" ];then - cp -r ${{ 
github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops - else - git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops - fi - cd third_party/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_COMMIT} - git status && git diff && git show -s - - name: Torch Config - run: | - printenv - python -c "import torch; print(torch.__config__.show())" - python -c "import torch; print(torch.__config__.parallel_info())" - python -c "import torch; print(torch.__config__.torch.xpu.device_count())" - python -c "import torchvision; print(torchvision.__version__)" - python -c "import torchaudio; print(torchaudio.__version__)" - python -c "import triton; print(triton.__version__)" - python pytorch/torch/utils/collect_env.py - pip list |grep -E 'torch|intel' + fi + if [ "${{ inputs.test_type }}" == "cicd" ];then + cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops + else + git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops + fi + cd third_party/torch-xpu-ops + git checkout ${TORCH_XPU_OPS_COMMIT} + git status && git diff && git show -s + - name: Torch Config + run: | + printenv + python -c "import torch; print(torch.__config__.show())" + python -c "import torch; print(torch.__config__.parallel_info())" + python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" + python -c "import triton; print(triton.__version__)" + python pytorch/torch/utils/collect_env.py + pip list |grep -E 'torch|intel' diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index dfc97b4a07..035b4f136e 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -68,15 +68,15 @@ jobs: - ${{ github.workspace }}:${{ github.workspace }} env: PATH: /tmp/xpu-build/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - HOME: /tmp/xpu-build GH_TOKEN: ${{ github.token }} - AGENT_TOOLSDIRECTORY: /tmp/xpu-build/_tools + AGENT_TOOLSDIRECTORY: /tmp/xpu-build timeout-minutes: 300 steps: - name: Setup python-${{ inputs.python }} run: | + rm -rf ${AGENT_TOOLSDIRECTORY} local_python=$(echo ${{ inputs.python }} |awk -F. '{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') - /opt/python/${local_python}/bin/python -m venv /tmp/xpu-build/.local + /opt/python/${local_python}/bin/python -m venv ${AGENT_TOOLSDIRECTORY}/.local which python && python -V which pip && pip list pip install -U pip wheel setuptools @@ -85,6 +85,8 @@ jobs: # Cleanup workspace find ./ |grep -v "^\./$" |xargs rm -rf hostname && id + # install gh + export HOME=${AGENT_TOOLSDIRECTORY} curl -sS https://webi.sh/gh | sh gh --version - name: Checkout torch-xpu-ops From e9d551a7fd60624c87556dcf9d13ebcbe582b05c Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 24 Jul 2025 15:29:36 +0800 Subject: [PATCH 083/160] update --- .github/workflows/_linux_build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 035b4f136e..63c92205cf 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -196,7 +196,7 @@ jobs: python -c "import torchaudio; print(torchaudio.__version__)" python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' - chmod 777 . 
-R + chmod 777 /__w -R - name: Upload Torch XPU Wheel if: ${{ success() }} uses: actions/upload-artifact@v4 From 9649dfd94733d1f387d95cb287ffe643c1e665cd Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 24 Jul 2025 17:40:45 +0800 Subject: [PATCH 084/160] update --- .github/actions/get-runner/action.yml | 21 ++++++------ .github/actions/setup-testenv/action.yml | 2 +- .github/workflows/_linux_build.yml | 39 ++++++++++++----------- .github/workflows/_linux_e2e.yml | 13 ++++---- .github/workflows/_linux_op_benchmark.yml | 17 +++++----- .github/workflows/_linux_ut.yml | 32 +++++++++++++------ 6 files changed, 73 insertions(+), 51 deletions(-) diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml index 6525957998..513c185bd3 100644 --- a/.github/actions/get-runner/action.yml +++ b/.github/actions/get-runner/action.yml @@ -4,12 +4,14 @@ on: workflow_call: outputs: - test_host: - value: ${{ steps.runner.outputs.test_host }} - test_user: - value: ${{ steps.runner.outputs.test_user }} - test_group: - value: ${{ steps.runner.outputs.test_group }} + runner_id: + value: ${{ steps.runner.outputs.runner_id }} + user_id: + value: ${{ steps.runner.outputs.user_id }} + render_id: + value: ${{ steps.runner.outputs.render_id }} + hostname: + value: ${{ steps.runner.outputs.hostname }} permissions: read-all @@ -21,9 +23,10 @@ runs: id: runner run: | # get test runner - echo "test_host=$(echo ${RUNNER_NAME} |sed 's/\-[0-9]$//')" |tee -a ${GITHUB_OUTPUT} - echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} - echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} + echo "runner_id=$(echo ${RUNNER_NAME} |sed 's/\-[0-9]$//')" |tee -a ${GITHUB_OUTPUT} + echo "user_id=$(id -u)" |tee -a ${GITHUB_OUTPUT} + echo "render_id=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} + echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT} # show host info lscpu lshw -C display diff --git a/.github/actions/setup-testenv/action.yml b/.github/actions/setup-testenv/action.yml index 403105b0e6..2df6313461 100644 --- a/.github/actions/setup-testenv/action.yml +++ b/.github/actions/setup-testenv/action.yml @@ -31,7 +31,6 @@ runs: steps: - name: Cleanup workspace run: | - hostname && id ls -al find ./ |grep -v "^\./$" |xargs rm -rf rm -rf ~/.triton /tmp/*inductor* @@ -41,6 +40,7 @@ runs: python-version: ${{ inputs.python }} - name: Check runner run: | + hostname && id cat /etc/os-release gcc -v && g++ -v which python && python -V diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 63c92205cf..53fdc621a0 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -44,9 +44,10 @@ jobs: runner: runs-on: ${{ inputs.runner }} outputs: - test_host: ${{ steps.runner-info.outputs.test_host }} - test_user: ${{ steps.runner-info.outputs.test_user }} - test_group: ${{ steps.runner-info.outputs.test_group }} + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} steps: - name: Cleanup workspace run: | @@ -61,39 +62,41 @@ jobs: name: ${{ contains(inputs.test_type, 'wheel') && inputs.pytorch || 'build' }} needs: runner if: ${{ ! 
contains(inputs.test_type, 'wheel') }} - runs-on: ${{ needs.runner.outputs.test_host }} + runs-on: ${{ needs.runner.outputs.runner_id }} container: image: 'pytorch/manylinux2_28-builder:xpu-main' volumes: - ${{ github.workspace }}:${{ github.workspace }} env: - PATH: /tmp/xpu-build/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + PATH: /tmp/xpu-venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin GH_TOKEN: ${{ github.token }} - AGENT_TOOLSDIRECTORY: /tmp/xpu-build + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool timeout-minutes: 300 steps: + - name: Install gh-cli + run: | + cat /etc/os-release + hostname && id + # Cleanup workspace + find ./ |grep -v "^\./$" |xargs rm -rf + # install gh + dnf install -y 'dnf-command(config-manager)' + dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo + dnf install -y gh --repo gh-cli + gh --version - name: Setup python-${{ inputs.python }} run: | - rm -rf ${AGENT_TOOLSDIRECTORY} + rm -rf /tmp/xpu-venv local_python=$(echo ${{ inputs.python }} |awk -F. '{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') - /opt/python/${local_python}/bin/python -m venv ${AGENT_TOOLSDIRECTORY}/.local + /opt/python/${local_python}/bin/python -m venv /tmp/xpu-venv which python && python -V which pip && pip list pip install -U pip wheel setuptools - - name: Setup gh - run: | - # Cleanup workspace - find ./ |grep -v "^\./$" |xargs rm -rf - hostname && id - # install gh - export HOME=${AGENT_TOOLSDIRECTORY} - curl -sS https://webi.sh/gh | sh - gh --version - name: Checkout torch-xpu-ops uses: actions/checkout@v4 with: path: torch-xpu-ops - - name: Build Pytorch XPU + - name: Build Pytorch on ${{ needs.runner.outputs.hostname }} run: | # only build pvc for CI if [ "${{ inputs.test_type }}" == "build-cicd" ];then diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index e773bce49f..26e87b8031 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -56,9 +56,10 @@ jobs: runner: runs-on: ${{ inputs.runner }} outputs: - test_host: ${{ steps.runner-info.outputs.test_host }} - test_user: ${{ steps.runner-info.outputs.test_user }} - test_group: ${{ steps.runner-info.outputs.test_group }} + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} steps: - name: Cleanup workspace run: | @@ -70,7 +71,7 @@ jobs: uses: ./.github/actions/get-runner test: - runs-on: ${{ needs.runner.outputs.test_host }} + runs-on: ${{ needs.runner.outputs.runner_id }} needs: runner timeout-minutes: 3600 container: @@ -78,7 +79,7 @@ jobs: volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.runner.outputs.test_user }}:${{ needs.runner.outputs.test_group }} + -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} env: AGENT_TOOLSDIRECTORY: /tmp/_tools GH_TOKEN: ${{ github.token }} @@ -87,7 +88,7 @@ jobs: steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Setup Test Env + - name: Launch Test on ${{ needs.runner.outputs.hostname }} uses: ./.github/actions/setup-testenv with: test_type: ${{ inputs.test_type }} diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index cf05b7928a..352b3a81e1 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ 
b/.github/workflows/_linux_op_benchmark.yml @@ -32,12 +32,13 @@ defaults: shell: bash -xe {0} jobs: - get_runner: + runner: runs-on: ${{ inputs.runner }} outputs: - test_host: ${{ steps.runner-info.outputs.test_host }} - test_user: ${{ steps.runner-info.outputs.test_user }} - test_group: ${{ steps.runner-info.outputs.test_group }} + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} steps: - name: Cleanup workspace run: | @@ -49,8 +50,8 @@ jobs: uses: ./.github/actions/get-runner op_benchmark: - needs: get_runner - runs-on: ${{ needs.get_runner.outputs.test_host }} + needs: runner + runs-on: ${{ needs.runner.outputs.runner_id }} permissions: issues: write timeout-minutes: 900 @@ -59,7 +60,7 @@ jobs: volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} + -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} env: AGENT_TOOLSDIRECTORY: /opt/_tools GH_TOKEN: ${{ github.token }} @@ -68,7 +69,7 @@ jobs: steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Setup Test Env + - name: Launch Test on ${{ needs.runner.outputs.hostname }} uses: ./.github/actions/setup-testenv with: test_type: ${{ inputs.test_type }} diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 7db00ef27a..3530d765ea 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -46,9 +46,10 @@ jobs: runner: runs-on: ${{ inputs.runner }} outputs: - test_host: ${{ steps.runner-info.outputs.test_host }} - test_user: ${{ steps.runner-info.outputs.test_user }} - test_group: ${{ steps.runner-info.outputs.test_group }} + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} steps: - name: Cleanup workspace run: | @@ -61,7 +62,7 @@ jobs: normal: needs: runner - runs-on: ${{ needs.runner.outputs.test_host }} + runs-on: ${{ needs.runner.outputs.runner_id }} if: ${{ inputs.ut != 'xpu_distributed' && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 300 container: @@ -69,7 +70,7 @@ jobs: volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.runner.outputs.test_user }}:${{ needs.runner.outputs.test_group }} + -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} -e ZE_AFFINITY_MASK env: AGENT_TOOLSDIRECTORY: /tmp/_tools @@ -186,7 +187,7 @@ jobs: steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Setup Test Env + - name: Launch Test on ${{ needs.runner.outputs.hostname }} uses: ./.github/actions/setup-testenv with: test_type: ${{ inputs.test_type }} @@ -242,9 +243,19 @@ jobs: GH_TOKEN: ${{ github.token }} AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/_tools steps: + - name: Cleanup workspace + id: cleanup + run: | + cat /etc/os-release + echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT} + ls -al + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + sudo find /tmp/ |grep -v "^/tmp/$" |xargs sudo rm -rf + sudo rm -rf ~/.triton ~/.torch + xpu-smi discovery - name: Checkout 
torch-xpu-ops uses: actions/checkout@v4 - - name: Setup Test Env + - name: Launch Test on ${{ steps.cleanup.outputs.hostname }} uses: ./.github/actions/setup-testenv with: test_type: ${{ inputs.test_type }} @@ -276,8 +287,11 @@ jobs: GH_TOKEN: ${{ github.token }} AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/_tools steps: - - name: Check runner + - name: Cleanup workspace + id: cleanup run: | + cat /etc/os-release + echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT} ls -al sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf sudo find /tmp/ |grep -v "^/tmp/$" |xargs sudo rm -rf @@ -285,7 +299,7 @@ jobs: xpu-smi topology -m - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Setup Test Env + - name: Launch Test on ${{ steps.cleanup.outputs.hostname }} uses: ./.github/actions/setup-testenv with: test_type: ${{ inputs.test_type }} From 84a513213bb146723781da6377dad8c51c16be3d Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 24 Jul 2025 17:43:09 +0800 Subject: [PATCH 085/160] update --- .github/workflows/_linux_e2e.yml | 2 +- .github/workflows/_linux_op_benchmark.yml | 2 +- .github/workflows/_linux_ut.yml | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 26e87b8031..af909adee4 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -81,7 +81,7 @@ jobs: options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} env: - AGENT_TOOLSDIRECTORY: /tmp/_tools + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} MODEL_ONLY_NAME: ${{ inputs.model }} diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 352b3a81e1..a3fa6e32d3 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -62,7 +62,7 @@ jobs: options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} env: - AGENT_TOOLSDIRECTORY: /opt/_tools + AGENT_TOOLSDIRECTORY: /opt/xpu-tool GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} REFERENCE_ISSUE: 1689 diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 3530d765ea..f1cf802fbd 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -73,7 +73,7 @@ jobs: -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} -e ZE_AFFINITY_MASK env: - AGENT_TOOLSDIRECTORY: /tmp/_tools + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} strategy: @@ -241,7 +241,7 @@ jobs: timeout-minutes: 5 env: GH_TOKEN: ${{ github.token }} - AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/_tools + AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/xpu-tool steps: - name: Cleanup workspace id: cleanup @@ -285,7 +285,7 @@ jobs: timeout-minutes: 60 env: GH_TOKEN: ${{ github.token }} - AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/_tools + AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/xpu-tool steps: - name: Cleanup workspace id: cleanup From ee18a1c4f7515a969d34f894a956c317ede96a43 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 24 Jul 2025 17:49:57 +0800 Subject: [PATCH 086/160] update --- 
.github/actions/setup-testenv/action.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/actions/setup-testenv/action.yml b/.github/actions/setup-testenv/action.yml index 2df6313461..ba429457b2 100644 --- a/.github/actions/setup-testenv/action.yml +++ b/.github/actions/setup-testenv/action.yml @@ -30,6 +30,7 @@ runs: using: composite steps: - name: Cleanup workspace + shell: bash -xe {0} run: | ls -al find ./ |grep -v "^\./$" |xargs rm -rf @@ -39,6 +40,7 @@ runs: with: python-version: ${{ inputs.python }} - name: Check runner + shell: bash -xe {0} run: | hostname && id cat /etc/os-release @@ -55,6 +57,7 @@ runs: with: path: torch-xpu-ops - name: Install oneAPI DLE + shell: bash -xe {0} if: ${{ inputs.oneapi != 'installed' }} run: | rm -rf ~/intel ~/.intel @@ -69,6 +72,7 @@ runs: with: pattern: Torch-XPU-Wheel-* - name: Prepare Stock Pytorch + shell: bash -xe {0} run: | # install pytorch if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then @@ -98,6 +102,7 @@ runs: fi git status && git diff && git show -s - name: Prepare Torch-xpu-ops + shell: bash -xe {0} if: ${{ inputs.torch_xpu_ops != 'skipped' }} run: | cd pytorch @@ -122,6 +127,7 @@ runs: git checkout ${TORCH_XPU_OPS_COMMIT} git status && git diff && git show -s - name: Torch Config + shell: bash -xe {0} run: | printenv python -c "import torch; print(torch.__config__.show())" From 43fee42afdac60c3bcd03a85a7c33df060a5e086 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 24 Jul 2025 17:53:05 +0800 Subject: [PATCH 087/160] update --- .github/workflows/_linux_e2e.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index af909adee4..79d07bf925 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -237,7 +237,7 @@ jobs: container: image: ubuntu:latest env: - AGENT_TOOLSDIRECTORY: /tmp/_tools + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool GH_TOKEN: ${{ github.token }} REFERENCE_ISSUE_ID: 1645 steps: From e8f1c0dbbe74ca6a83864eb789b62ec91742744c Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 24 Jul 2025 17:57:50 +0800 Subject: [PATCH 088/160] update --- .github/workflows/nightly_ondemand.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 3d6e8b9fe3..d4c12de348 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -106,7 +106,7 @@ jobs: Linux-Nightly-Ondemand-Build: needs: [Conditions-Filter] - name: linux + name: linux-build secrets: inherit uses: ./.github/workflows/_linux_build.yml with: @@ -120,7 +120,7 @@ jobs: Linux-Nightly-Ondemand-UT-Tests: if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'p') }} - name: linux + name: linux-ut needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_ut.yml with: @@ -134,7 +134,7 @@ jobs: Linux-Nightly-Ondemand-E2E-Tests: if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} - name: linux + name: linux-e2e permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_e2e.yml @@ -152,7 +152,7 @@ jobs: Linux-Nightly-Ondemand-OP-Microbench-Tests-Rolling: if: ${{ github.event_name == 'schedule' }} - name: linux-nightly-ondemand-rolling / Op_microbench + name: linux-microbench permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: 
./.github/workflows/_linux_op_benchmark.yml @@ -165,7 +165,7 @@ jobs: Windows-Nightly-Ondemand-UT-Tests: if: ${{ github.event_name == 'schedule' }} - name: Windows-nightly-ondemand + name: windows uses: ./.github/workflows/_windows_ut.yml with: ut: ${{ github.event_name == 'schedule' && 'op_extended,torch_xpu' || inputs.ut }} From 517b08121eec2efefc5420a77a7cc4c1cd9019d6 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 09:23:59 +0800 Subject: [PATCH 089/160] update --- .github/actions/setup-testenv/action.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/actions/setup-testenv/action.yml b/.github/actions/setup-testenv/action.yml index ba429457b2..a056168ab3 100644 --- a/.github/actions/setup-testenv/action.yml +++ b/.github/actions/setup-testenv/action.yml @@ -75,15 +75,16 @@ runs: shell: bash -xe {0} run: | # install pytorch - if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" |wc -l) -ne 0 ];then + if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" -c) -ne 0 ];then pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" |wc -l) -ne 0 ];then + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" -c) -ne 0 ];then pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" |wc -l) -ne 0 ];then + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" -c) -ne 0 ];then pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu else - pip install --force-reinstall ${{ github.workspace }}/*.whl + pip install --force-reinstall $(find ${{ github.workspace }}/ -name "*torch*.whl") fi + pip list |grep torch TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" From ddecdf96ae1d32ccbbf5c146a587d596bb3e2c7b Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 09:58:07 +0800 Subject: [PATCH 090/160] update --- .github/actions/setup-testenv/action.yml | 7 +------ .github/workflows/_linux_e2e.yml | 4 ++++ .github/workflows/_linux_op_benchmark.yml | 3 +++ .github/workflows/_linux_ut.yml | 6 ++++++ 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.github/actions/setup-testenv/action.yml b/.github/actions/setup-testenv/action.yml index a056168ab3..ae5068924d 100644 --- a/.github/actions/setup-testenv/action.yml +++ b/.github/actions/setup-testenv/action.yml @@ -29,12 +29,6 @@ permissions: read-all runs: using: composite steps: - - name: Cleanup workspace - shell: bash -xe {0} - run: | - ls -al - find ./ |grep -v "^\./$" |xargs rm -rf - rm -rf ~/.triton /tmp/*inductor* - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: @@ -52,6 +46,7 @@ runs: dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' clinfo --list cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq -c + rm -rf ~/.triton /tmp/*inductor* - name: Checkout torch-xpu-ops uses: actions/checkout@v4 with: diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 79d07bf925..4088b30f9e 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -86,6 +86,9 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} MODEL_ONLY_NAME: ${{ inputs.model 
}} steps: + - name: Cleanup workspace + run: | + find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Launch Test on ${{ needs.runner.outputs.hostname }} @@ -245,6 +248,7 @@ jobs: run: | apt-get update apt-get install gh rsync ca-certificates -y + find ./ |grep -v "^\./$" |xargs rm -rf - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index a3fa6e32d3..bd6c1adc70 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -67,6 +67,9 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} REFERENCE_ISSUE: 1689 steps: + - name: Cleanup workspace + run: | + find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Launch Test on ${{ needs.runner.outputs.hostname }} diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index f1cf802fbd..172d0e793b 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -185,6 +185,9 @@ jobs: env: UT_NAME: ${{ matrix.test.name }} steps: + - name: Cleanup workspace + run: | + find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Launch Test on ${{ needs.runner.outputs.hostname }} @@ -370,6 +373,9 @@ jobs: UT_SKIP_ISSUE: 1624 UT_NAME: ${{ matrix.test.name }} steps: + - name: Cleanup workspace + run: | + find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Download XPU UT Logs From d1bf4cf827c28c4fc5a57f086e46837aca1f765b Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 10:37:33 +0800 Subject: [PATCH 091/160] modify ut --- .github/actions/linux-ut/action.yml | 138 ++++++++++++++++++ .github/scripts/check-ut.py | 4 +- .github/scripts/ut_result_check.sh | 2 +- .github/workflows/_linux_ut.yml | 193 +++---------------------- .github/workflows/_windows_ut.yml | 2 +- .github/workflows/nightly_ondemand.yml | 4 +- .github/workflows/pull.yml | 2 +- 7 files changed, 167 insertions(+), 178 deletions(-) create mode 100644 .github/actions/linux-ut/action.yml diff --git a/.github/actions/linux-ut/action.yml b/.github/actions/linux-ut/action.yml new file mode 100644 index 0000000000..4ef45eb2fe --- /dev/null +++ b/.github/actions/linux-ut/action.yml @@ -0,0 +1,138 @@ +name: Linux Unit Test + +on: + workflow_call: + inputs: + test_type: + required: true + type: string + description: Test scope + +permissions: read-all + +runs: + using: composite + steps: + - name: op_regression + shell: bash -xe {0} + if: ${{ inputs.test_type == 'op_regression' }} + run: | + cd pytorch/third_party/torch-xpu-ops/test/regressions + pytest --timeout 600 -v --junit-xml=../../ut_log/op_regression.xml + - name: op_transformers + shell: bash -xe {0} + if: ${{ inputs.test_type == 'op_transformers' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + cd pytorch + pytest --timeout 600 -v test/test_transformers.py -k xpu \ + --junit-xml=$GITHUB_WORKSPACE/ut_log/op_transformers.xml + - name: op_extended + shell: bash -xe {0} + if: ${{ inputs.test_type == 'op_extended' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + cd pytorch/third_party/torch-xpu-ops/test/xpu/extended + timeout 3600 python run_test_with_skip.py + cp op_extended.xml $GITHUB_WORKSPACE/ut_log + - name: op_ut + shell: bash -xe {0} + if: ${{ inputs.test_type == 'op_ut' }} + run: | + export 
PYTORCH_TEST_WITH_SLOW=1
+        export PYTORCH_ENABLE_XPU_FALLBACK=1
+        cd pytorch/third_party/torch-xpu-ops/test/xpu
+        timeout 10000 python run_test_with_skip.py \
+          2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test_error.log | \
+          tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test.log
+        cp *.xml $GITHUB_WORKSPACE/ut_log
+        find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c '
+          dir_path=$(dirname "$1");
+          case "$dir_path" in
+            *"op_ut_with_skip_quantization/core"*)
+              dir_name="op_ut_with_skip_quantization_core";;
+            *)
+              dir_name=$(basename "$dir_path");;
+          esac;
+          mv "$1" "$dir_path/${dir_name}_$(basename "$1")"
+        ' _ {} \;
+        cp op_ut_with_skip_nn/*.xml $GITHUB_WORKSPACE/ut_log
+        cp op_ut_with_skip_quantization/core/*.xml $GITHUB_WORKSPACE/ut_log
+        # Cases run with an on-demand white list, since some suites are too
+        # slow to go through all operators on CPU. So add cases on-demand
+        # when XPU implementation is done.
+        # test_foreach, test_decomp
+        # Run with only
+        timeout 10000 python run_test_with_only.py \
+          2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test_error.log | \
+          tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test.log
+        cp op_ut_with_only.xml $GITHUB_WORKSPACE/ut_log
+    - name: torch_xpu
+      shell: bash -xe {0}
+      if: ${{ inputs.test_type == 'torch_xpu' }}
+      run: |
+        export PYTORCH_TEST_WITH_SLOW=1
+        export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
+        cd pytorch
+        test_cmd="python test/run_test.py --include "
+        for test in $(ls test/inductor | grep test); do test_cmd="${test_cmd} inductor/$test"; done
+        for test in $(ls test/xpu | grep test); do test_cmd="${test_cmd} xpu/$test"; done
+        if [ -f "test/test_xpu.py" ]; then test_cmd="${test_cmd} test_xpu.py"; fi
+        eval $test_cmd 2>$GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test_error.log | \
+          tee $GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test.log
+    - name: xpu_profiling
+      shell: bash -xe {0}
+      if: ${{ inputs.test_type == 'xpu_profiling' }}
+      run: |
+        mkdir -p ut_log/profile_test/issue_reproduce
+        cd pytorch/third_party/torch-xpu-ops
+        # RN50 Test
+        PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0
+        cp profiling.fp32.train.pt $GITHUB_WORKSPACE/ut_log/profile_test
+        # All Issue Reproduce UT
+        python -u test/profiling/correlation_id_mixed.py | \
+          tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/correlation_id_mixed.log
+        python -u test/profiling/reproducer.missing.gpu.kernel.time.py | \
+          tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/reproducer.missing.gpu.kernel.time.log
+        python -u test/profiling/time_precision_in_profile.py | \
+          tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/time_precision_in_profile.log
+        python -u test/profiling/profile_partial_runtime_ops.py | \
+          tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/profile_partial_runtime_ops.log
+        python -u test/profiling/triton_xpu_ops_time.py | \
+          tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/triton_xpu_ops_time.log
+        # All xpu ut under test/profiler
+        cd ../pytorch/test/profiler
+        python -m pytest --timeout 600 -vs test_cpp_thread.py | \
+          tee $GITHUB_WORKSPACE/ut_log/profile_test/test_cpp_thread.log
+        python -m pytest --timeout 600 -vs test_execution_trace.py | \
+          tee $GITHUB_WORKSPACE/ut_log/profile_test/test_execution_trace.log
+        python -m pytest --timeout 600 -vs test_memory_profiler.py | \
+          tee $GITHUB_WORKSPACE/ut_log/profile_test/test_memory_profiler.log
+        python -m pytest --timeout 600 -vs test_profiler_tree.py | \
+          tee $GITHUB_WORKSPACE/ut_log/profile_test/test_profiler_tree.log
+
+    - name: op_dev1
+      shell: bash -xe {0}
+      if: ${{ inputs.test_type == 'op_dev1' }}
+      run: |
+        mkdir -p ut_log/op_dev1
+        cd pytorch/third_party/torch-xpu-ops/test/regressions
+        pytest --timeout 200 -v test_operation_on_device_1.py \
+          --junit-xml=$GITHUB_WORKSPACE/ut_log/op_dev1.xml \
+          2>${{ github.workspace }}/ut_log/op_dev1/op_dev1_test_error.log | \
+          tee ${{ github.workspace }}/ut_log/op_dev1/op_dev1_test.log
+
+    - name: xpu_distributed
+      shell: bash -x -e -o pipefail {0}
+      if: ${{ inputs.test_type == 'xpu_distributed' }}
+      run: |
+        mkdir -p ut_log/xpu_distributed
+        cd pytorch/third_party/torch-xpu-ops/test/xpu
+        XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())")
+        if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then
+          echo -e "[ERROR] XCCL is not enabled"
+          exit 1
+        fi
+        timeout 1800 python run_distributed.py \
+          2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
+          tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py
index 5d4b189e88..be84a64793 100644
--- a/.github/scripts/check-ut.py
+++ b/.github/scripts/check-ut.py
@@ -188,8 +188,8 @@ def parse_log_file(log_file):
 def determine_category(ut):
     if ut == 'op_regression':
         return 'op_regression'
-    elif ut == 'op_regression_dev1':
-        return 'op_regression_dev1'
+    elif ut == 'op_dev1':
+        return 'op_dev1'
     elif ut == 'op_extended':
         return 'op_extended'
     elif 'op_ut' in ut:
diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh
index a6d94ce41a..0ad52580f7 100644
--- a/.github/scripts/ut_result_check.sh
+++ b/.github/scripts/ut_result_check.sh
@@ -44,7 +44,7 @@ compare_and_filter_logs() {
   fi
 }
 
-if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' || "${ut_suite}" == 'op_extended' || "${ut_suite}" == 'op_transformers' ]]; then
+if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_dev1' || "${ut_suite}" == 'op_extended' || "${ut_suite}" == 'op_transformers' ]]; then
   grep -E "FAILED" "${ut_suite}"_test.log | awk '{print $1}' | grep -v "FAILED" > ./"${ut_suite}"_failed.log
   grep -E "have failures" "${ut_suite}"_test.log | awk '{print $1}' >> ./"${ut_suite}"_failed.log
   grep "PASSED" "${ut_suite}"_test.log | awk '{print $1}' > ./"${ut_suite}"_passed.log
diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml
index 31391ff634..699baaf234 100644
--- a/.github/workflows/_linux_ut.yml
+++ b/.github/workflows/_linux_ut.yml
@@ -30,7 +30,7 @@ on:
       ut:
         required: true
         type: string
-        description: UT scope. 
`op_regression,op_dev1,op_transformers,op_extended,op_ut,torch_xpu` Delimiter is comma disabled_tests: type: string default: '' @@ -79,111 +79,7 @@ jobs: strategy: fail-fast: false matrix: - test: - - name: 'op_regression' - condition: ${{ contains(inputs.ut, 'op_regression') }} - directory: 'pytorch/third_party/torch-xpu-ops/test/regressions' - command: 'pytest --timeout 600 -v --junit-xml=../../ut_log/op_regression.xml' - log_prefix: 'op_regression' - timeout: 3600 - - name: 'op_transformers' - condition: ${{ contains(inputs.ut, 'op_transformers') }} - directory: 'pytorch' - command: 'pytest --timeout 600 -v test/test_transformers.py -k xpu --junit-xml=$GITHUB_WORKSPACE/ut_log/op_transformers.xml' - log_prefix: 'op_transformers' - timeout: 3600 - additional_steps: | - export PYTORCH_TEST_WITH_SLOW=1 - - name: 'op_extended' - condition: ${{ contains(inputs.ut, 'op_extended') }} - directory: 'pytorch/third_party/torch-xpu-ops/test/xpu/extended/' - command: 'python run_test_with_skip.py' - log_prefix: 'op_extended' - timeout: 3600 - additional_steps: | - export PYTORCH_TEST_WITH_SLOW=1 - xml_post_processing: | - cp op_extended.xml $GITHUB_WORKSPACE/ut_log - - name: 'op_ut' - condition: ${{ contains(inputs.ut, 'op_ut') }} - directory: 'pytorch/third_party/torch-xpu-ops/test/xpu' - log_prefix: 'op_ut' - command_script: | - export PYTORCH_ENABLE_XPU_FALLBACK=1 - export PYTORCH_TEST_WITH_SLOW=1 - timeout 10000 python run_test_with_skip.py \ - 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test.log - cp *.xml $GITHUB_WORKSPACE/ut_log - find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c ' - dir_path=$(dirname "$1"); - case "$dir_path" in - *"op_ut_with_skip_quantization/core"*) - dir_name="op_ut_with_skip_quantization_core";; - *) - dir_name=$(basename "$dir_path");; - esac; - mv "$1" "$dir_path/${dir_name}_$(basename "$1")" - ' _ {} \; - cp op_ut_with_skip_nn/*.xml $GITHUB_WORKSPACE/ut_log - cp op_ut_with_skip_quantization/core/*.xml $GITHUB_WORKSPACE/ut_log - # Cases run with a on-demand white list, since some suites are too - # slow to go through all operators on CPU. So add cases on-demand - # when XPU implementatoin is done. 
- # test_foreach, test_decomp - # Run with only - timeout 10000 python run_test_with_only.py \ - 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test.log - cp op_ut_with_only.xml $GITHUB_WORKSPACE/ut_log - - name: 'torch_xpu' - condition: ${{ contains(inputs.ut, 'torch_xpu') }} - directory: 'pytorch' - command_script: | - export PYTORCH_TEST_WITH_SLOW=1 - export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" - test_cmd="python test/run_test.py --include " - for test in $(ls test/inductor | grep test); do test_cmd="${test_cmd} inductor/$test"; done - for test in $(ls test/xpu | grep test); do test_cmd="${test_cmd} xpu/$test"; done - if [ -f "test/test_xpu.py" ]; then test_cmd="${test_cmd} test_xpu.py"; fi - eval $test_cmd 2>$GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test.log - log_prefix: 'torch_xpu' - timeout: 10000 - - name: 'xpu_profiling' - condition: ${{ contains(inputs.ut, 'xpu_profiling') }} - command_script: | - cd torch-xpu-ops - # RN50 Test - PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0 - cp profiling.fp32.train.pt $GITHUB_WORKSPACE/ut_log/profile_test - - # All Issue Reproduce UT - python -u test/profiling/correlation_id_mixed.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/correlation_id_mixed.log - python -u test/profiling/reproducer.missing.gpu.kernel.time.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/reproducer.missing.gpu.kernel.time.log - python -u test/profiling/time_precision_in_profile.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/time_precision_in_profile.log - python -u test/profiling/profile_partial_runtime_ops.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/profile_partial_runtime_ops.log - python -u test/profiling/triton_xpu_ops_time.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/triton_xpu_ops_time.log - - # All xpu ut under test/profiler - cd ../pytorch/test/profiler - python -m pytest --timeout 600 -vs test_cpp_thread.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/test_cpp_thread.log - python -m pytest --timeout 600 -vs test_execution_trace.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/test_execution_trace.log - python -m pytest --timeout 600 -vs test_memory_profiler.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/test_memory_profiler.log - python -m pytest --timeout 600 -vs test_profiler_tree.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/test_profiler_tree.log - additional_steps: | - mkdir -p ut_log/profile_test/issue_reproduce - env: - UT_NAME: ${{ matrix.test.name }} + test: [op_regression, op_transformers, op_extended, op_ut, torch_xpu, xpu_profiling] steps: - name: Cleanup workspace run: | @@ -199,29 +95,13 @@ jobs: oneapi: ${{ inputs.oneapi }} python: ${{ inputs.python }} - name: Run XPU UT Test - if: ${{ matrix.test.condition }} - run: | - mkdir -p ${{ github.workspace }}/ut_log - mkdir -p ${{ github.workspace }}/ut_log/${{ matrix.test.name }} - echo "Running ${{ matrix.test.name }}" - echo "Directory: ${{ matrix.test.directory }}" - ${{ matrix.test.additional_steps }} - cd ${{ matrix.test.directory }} - if [[ "${{ matrix.test.name }}" == "op_ut" ]] || [[ "${{ matrix.test.name }}" == "xpu_profiling" ]] || [[ "${{ matrix.test.name }}" == "torch_xpu" ]]; then - bash << "SCRIPT" - ${{ matrix.test.command_script }} - SCRIPT - else - timeout ${{ matrix.test.timeout 
}} ${{ matrix.test.command }} \ - 2>${{ github.workspace }}/ut_log/${{ matrix.test.name }}/${{ matrix.test.log_prefix }}_test_error.log | \ - tee ${{ github.workspace }}/ut_log/${{ matrix.test.name }}/${{ matrix.test.log_prefix }}_test.log - ${{ matrix.test.xml_post_processing || '' }} - fi + uses: ./.github/actions/linux-ut + with: + test_type: ${{ matrix.test }} - name: UT Test Results Summary - if: ${{ matrix.test.condition }} run: | pip install junitparser - python torch-xpu-ops/.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true + python ./.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true if [ -e "ut_failure_list.csv" ];then cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv fi @@ -229,18 +109,18 @@ jobs: if: ${{ matrix.test.condition }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ env.UT_NAME }} + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} path: ${{ github.workspace }}/ut_log - name: Upload XPU UT Failure list if: ${{ matrix.test.condition }} uses: actions/upload-artifact@v4 with: - name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ env.UT_NAME }} + name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} path: ${{ github.workspace }}/ut_log/ut_failure_list.csv devices: runs-on: pvc_rolling - if: ${{ contains(inputs.ut, 'op_regression_dev1') && !contains(inputs.disabled_tests, 'disable_ut') }} + if: ${{ contains(inputs.ut, 'op_dev1') && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 5 env: GH_TOKEN: ${{ github.token }} @@ -267,19 +147,14 @@ jobs: oneapi: ${{ inputs.oneapi }} python: ${{ inputs.python }} - name: Run XPU UT Test - run: | - mkdir -p ${{ github.workspace }}/ut_log/op_regression_dev1 - echo "Running op_regression_dev1" - cd pytorch/third_party/torch-xpu-ops/test/regressions - pytest --timeout 200 -v test_operation_on_device_1.py \ - --junit-xml=$GITHUB_WORKSPACE/ut_log/op_regression_dev1.xml \ - 2>${{ github.workspace }}/ut_log/op_regression_dev1/op_regression_dev1_test_error.log | \ - tee ${{ github.workspace }}/ut_log/op_regression_dev1/op_regression_dev1_test.log + uses: ./.github/actions/linux-ut + with: + test_type: op_dev1 - name: Upload Inductor XPU UT Log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-op_regression_dev1 + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-op_dev1 path: ${{ github.workspace }}/ut_log distributed: @@ -319,18 +194,9 @@ jobs: cat ptrace_scope.bk echo "0" |sudo tee /proc/sys/kernel/yama/ptrace_scope - name: Run Torch XPU Distributed UT - run: | - set -x -e -o pipefail - mkdir -p ut_log/xpu_distributed - cd ../pytorch/third_party/torch-xpu-ops/test/xpu - XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") - if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then - echo -e "[ERROR] XCCL is not enabled" - exit 1 - fi - timeout 1800 python run_distributed.py \ - 2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ - tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log + uses: ./.github/actions/linux-ut + with: + test_type: xpu_distributed - name: Reset Ptrace_scope if: ${{ always() }} run: | @@ -351,41 +217,26 @@ jobs: strategy: fail-fast: false matrix: - test: - - name: 'op_regression' - condition: ${{ contains(inputs.ut, 'op_regression') }} - - name: 'op_regression_dev1' - condition: ${{ contains(inputs.ut, 'op_regression_dev1') }} - - name: 'op_transformers' - condition: ${{ contains(inputs.ut, 'op_transformers') }} - - name: 'op_extended' - condition: ${{ contains(inputs.ut, 'op_extended') }} - - name: 'op_ut' - condition: ${{ contains(inputs.ut, 'op_ut') }} - - name: 'torch_xpu' - condition: ${{ contains(inputs.ut, 'torch_xpu') }} - - name: 'xpu_profiling' - condition: ${{ contains(inputs.ut, 'xpu_profiling') }} - - name: 'xpu_distributed' - condition: ${{ contains(inputs.ut, 'xpu_distributed') }} + test: [op_regression, op_transformers, op_extended, op_ut, torch_xpu, xpu_profiling, op_dev1, xpu_distributed] env: GH_TOKEN: ${{ github.token }} UT_SKIP_ISSUE: 1624 - UT_NAME: ${{ matrix.test.name }} steps: - name: Cleanup workspace + if: ${{ contains(inputs.ut, matrix.test) }} run: | find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/checkout@v4 - name: Download XPU UT Logs - if: ${{ matrix.test.condition }} + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/download-artifact@v4 with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }} path: ${{ github.workspace }}/ut_log - name: Check UT Results - if: ${{ matrix.test.condition }} + if: ${{ contains(inputs.ut, matrix.test) }} shell: bash run: | repo="${{ github.repository }}" @@ -401,7 +252,7 @@ jobs: cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ bash ut_result_check.sh ${{ matrix.test.name }} - name: Upload Inductor XPU UT Log - if: ${{ matrix.test.condition }} + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/upload-artifact@v4 with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }}-checked diff --git a/.github/workflows/_windows_ut.yml b/.github/workflows/_windows_ut.yml index 9ca7f7eb8d..9cb27a30f1 100644 --- a/.github/workflows/_windows_ut.yml +++ b/.github/workflows/_windows_ut.yml @@ -17,7 +17,7 @@ on: required: true type: string default: '' - description: UT scope. `op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu` Delimiter is comma + description: UT scope. 
`op_regression,op_dev1,op_extended,op_ut,torch_xpu` Delimiter is comma python: required: false type: string diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index d4c12de348..c0f6553d89 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -29,7 +29,7 @@ on: ut: type: string default: '' - description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_profiling,xpu_distributed`. Delimiter is comma + description: UT scope. `op_regression,op_dev1,op_transformers,op_extended,op_ut,xpu_profiling,xpu_distributed`. Delimiter is comma suite: type: string default: '' @@ -130,7 +130,7 @@ jobs: torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} - ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} + ut: ${{ github.event_name == 'schedule' && 'op_regression,op_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} Linux-Nightly-Ondemand-E2E-Tests: if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3a2c819e32..b2f098efaf 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -108,7 +108,7 @@ jobs: runner: linux.idc.xpu test_type: build-cicd pytorch: main - ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed + ut: op_regression,op_dev1,op_transformers,op_extended,op_ut,xpu_distributed disabled_tests: ${{ needs.conditions-filter.outputs.disabled_tests }} linux-e2e: From d99668c06ca52149b2cb27b83727e305b5a35b9a Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 10:42:15 +0800 Subject: [PATCH 092/160] update --- .github/workflows/_linux_ut.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 6f04ee221f..e51886fa48 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -82,11 +82,14 @@ jobs: test: [op_regression, op_transformers, op_extended, op_ut, torch_xpu, xpu_profiling] steps: - name: Cleanup workspace + if: ${{ contains(inputs.ut, matrix.test) }} run: | find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/checkout@v4 - name: Launch Test on ${{ needs.runner.outputs.hostname }} + if: ${{ contains(inputs.ut, matrix.test) }} uses: ./.github/actions/setup-testenv with: test_type: ${{ inputs.test_type }} @@ -95,10 +98,12 @@ jobs: oneapi: ${{ inputs.oneapi }} python: ${{ inputs.python }} - name: Run XPU UT Test + if: ${{ contains(inputs.ut, matrix.test) }} uses: ./.github/actions/linux-ut with: test_type: ${{ matrix.test }} - name: UT Test Results Summary + if: ${{ contains(inputs.ut, matrix.test) }} run: | pip install junitparser python ./.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true @@ -106,13 +111,13 @@ jobs: cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv fi - name: Upload Inductor XPU UT Log - if: ${{ matrix.test.condition }} + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/upload-artifact@v4 with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} 
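          # Note: contains() in the gates above is a plain substring match on the
          # comma-separated `ut` string, e.g.
          #   contains('op_regression_dev1,op_ut', 'op_regression') -> true
          # which is presumably why the device-1 scope is renamed from
          # op_regression_dev1 to the non-overlapping op_dev1 throughout these
          # workflows, so requesting one suite cannot accidentally enable another.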
path: ${{ github.workspace }}/ut_log - name: Upload XPU UT Failure list - if: ${{ matrix.test.condition }} + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/upload-artifact@v4 with: name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} From d0d1ceb0731bed8cc0c171d9ad3299cf07b8e79b Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 10:52:01 +0800 Subject: [PATCH 093/160] update --- .github/actions/get-runner/action.yml | 1 - .github/workflows/_linux_e2e.yml | 2 +- .github/workflows/nightly_ondemand.yml | 6 +++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml index 513c185bd3..b772aadfd7 100644 --- a/.github/actions/get-runner/action.yml +++ b/.github/actions/get-runner/action.yml @@ -35,7 +35,6 @@ runs: cat /etc/os-release uname -a - name: Cleanup host - if: ${{ always() }} shell: bash -xe {0} run: | # clean docker cache diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 4088b30f9e..f7d58b5c17 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -233,7 +233,7 @@ jobs: summary: runs-on: [self-hosted, Linux] - if: ${{ always() }} + if: ${{ ! cancelled() }} needs: test permissions: issues: write diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index c0f6553d89..1f00b63263 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -29,7 +29,7 @@ on: ut: type: string default: '' - description: UT scope. `op_regression,op_dev1,op_transformers,op_extended,op_ut,xpu_profiling,xpu_distributed`. Delimiter is comma + description: UT scope. `op_regression,op_dev1,op_transformers,op_extended,op_ut,xpu_profiling,xpu_distributed,microbench,windows`. 
Delimiter is comma suite: type: string default: '' @@ -151,7 +151,7 @@ jobs: model: ${{ github.event_name == 'schedule' && '' || inputs.model }} Linux-Nightly-Ondemand-OP-Microbench-Tests-Rolling: - if: ${{ github.event_name == 'schedule' }} + if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'microbench') }} name: linux-microbench permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] @@ -164,7 +164,7 @@ jobs: python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} Windows-Nightly-Ondemand-UT-Tests: - if: ${{ github.event_name == 'schedule' }} + if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'windows') }} name: windows uses: ./.github/workflows/_windows_ut.yml with: From 1f265381d1f1f89e355c8db8c73eebfc55c6ea0d Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 10:58:32 +0800 Subject: [PATCH 094/160] update --- .github/actions/get-runner/action.yml | 3 -- .../action.yml | 2 +- .github/actions/linux-ut/action.yml | 12 ++--- .github/actions/setup-testenv/action.yml | 46 +++++++++---------- .github/workflows/_linux_e2e.yml | 20 ++++---- 5 files changed, 38 insertions(+), 45 deletions(-) rename .github/actions/{inductor-xpu-e2e-test => linux-e2e}/action.yml (99%) diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml index b772aadfd7..c55ca37cc6 100644 --- a/.github/actions/get-runner/action.yml +++ b/.github/actions/get-runner/action.yml @@ -1,8 +1,5 @@ name: Get Runner Infos -on: - workflow_call: - outputs: runner_id: value: ${{ steps.runner.outputs.runner_id }} diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/linux-e2e/action.yml similarity index 99% rename from .github/actions/inductor-xpu-e2e-test/action.yml rename to .github/actions/linux-e2e/action.yml index d269ce6d12..559b3b307b 100644 --- a/.github/actions/inductor-xpu-e2e-test/action.yml +++ b/.github/actions/linux-e2e/action.yml @@ -1,4 +1,4 @@ -name: inductor-xpu-e2e-test +name: Linux E2E Test inputs: env_prepare: diff --git a/.github/actions/linux-ut/action.yml b/.github/actions/linux-ut/action.yml index 4ef45eb2fe..af01261071 100644 --- a/.github/actions/linux-ut/action.yml +++ b/.github/actions/linux-ut/action.yml @@ -1,12 +1,10 @@ name: Linux Unit Test -on: - workflow_call: - inputs: - test_type: - required: true - type: string - description: Test scope +inputs: + test_type: + required: true + type: string + description: Test scope permissions: read-all diff --git a/.github/actions/setup-testenv/action.yml b/.github/actions/setup-testenv/action.yml index ae5068924d..de5bdf5753 100644 --- a/.github/actions/setup-testenv/action.yml +++ b/.github/actions/setup-testenv/action.yml @@ -1,28 +1,26 @@ -name: Get Runner Infos +name: Setup Test Environment -on: - workflow_call: - inputs: - test_type: - required: true - type: string - description: Test scope - pytorch: - type: string - default: 'main' - description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' - torch_xpu_ops: - type: string - default: 'main' - description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin - oneapi: - type: string - default: 'installed' - description: Installed oneAPI DLE on host by default, fill offline.sh url if needed - python: - type: string - default: '3.10' - description: Python version +inputs: + test_type: + required: true + type: string + description: Test scope + pytorch: + type: string + default: 'main' + 
description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: + type: string + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + python: + type: string + default: '3.10' + description: Python version permissions: read-all diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index f7d58b5c17..b8733741e5 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -103,7 +103,7 @@ jobs: # CICD launch - name: Nightly Huggingface BF16 & FP16 Training Test if: ${{ contains(inputs.test_type, 'cicd') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/linux-e2e with: env_prepare: true suite: huggingface @@ -112,7 +112,7 @@ jobs: scenario: accuracy,performance - name: Nightly Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'cicd') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/linux-e2e with: env_prepare: true suite: torchbench @@ -121,7 +121,7 @@ jobs: scenario: accuracy,performance - name: Nightly Timm_models BF16 Training Test if: ${{ contains(inputs.test_type, 'cicd') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/linux-e2e with: env_prepare: true suite: timm_models @@ -132,7 +132,7 @@ jobs: # Nihglty launch - name: Nightly Huggingface Full Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/linux-e2e with: env_prepare: true suite: huggingface @@ -141,7 +141,7 @@ jobs: scenario: accuracy,performance - name: Nightly Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/linux-e2e with: env_prepare: true suite: torchbench @@ -150,7 +150,7 @@ jobs: scenario: accuracy,performance - name: Nightly Timm_models FP16 Training Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/linux-e2e with: env_prepare: true suite: timm_models @@ -168,7 +168,7 @@ jobs: # Weekly launch - name: Nightly Huggingface Full Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/linux-e2e with: env_prepare: true suite: huggingface @@ -177,7 +177,7 @@ jobs: scenario: accuracy,performance - name: Nightly Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/linux-e2e with: env_prepare: true suite: torchbench @@ -186,7 +186,7 @@ jobs: scenario: accuracy,performance - name: Nightly Timm_models FP16 Training Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/linux-e2e with: env_prepare: true suite: timm_models @@ -204,7 +204,7 @@ jobs: # On-demand launch - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) if: ${{ contains(inputs.test_type, 'ondemand') && inputs.suite != 'pt2e' }} - uses: ./.github/actions/inductor-xpu-e2e-test + uses: ./.github/actions/linux-e2e with: env_prepare: true suite: ${{ inputs.suite }} From d06b8db4a4d88d0a25e70ef26f6fb3355b1a1736 Mon Sep 17 00:00:00 
2001 From: mengfei25 Date: Fri, 25 Jul 2025 11:09:36 +0800 Subject: [PATCH 095/160] update --- .github/actions/linux-e2e/action.yml | 143 ---------------------- .github/actions/linux-ut/action.yml | 136 -------------------- .github/actions/setup-testenv/action.yml | 134 -------------------- .github/workflows/_linux_e2e.yml | 22 ++-- .github/workflows/_linux_op_benchmark.yml | 2 +- .github/workflows/_linux_ut.yml | 12 +- 6 files changed, 18 insertions(+), 431 deletions(-) delete mode 100644 .github/actions/linux-e2e/action.yml delete mode 100644 .github/actions/linux-ut/action.yml delete mode 100644 .github/actions/setup-testenv/action.yml diff --git a/.github/actions/linux-e2e/action.yml b/.github/actions/linux-e2e/action.yml deleted file mode 100644 index 559b3b307b..0000000000 --- a/.github/actions/linux-e2e/action.yml +++ /dev/null @@ -1,143 +0,0 @@ -name: Linux E2E Test - -inputs: - env_prepare: - required: false - description: If set to any value, will prepare suite test env - suite: - required: true - type: string - default: 'huggingface' - description: Dynamo benchmarks test suite. huggingface,timm_models,torchbench. Delimiter is comma - dt: - required: true - type: string - default: 'float32' - description: Data precision of the test.float32,bfloat16,float16,amp_bf16,amp_fp16. Delimiter is comma - mode: - required: true - type: string - default: 'inference' - description: inference,training. Delimiter is comma - scenario: - required: true - type: string - default: 'accuracy' - description: accuracy,performance. Delimiter is comma - -runs: - using: composite - steps: - - name: Prepare ENV - if: ${{ inputs.env_prepare }} - shell: bash -xe {0} - run: | - if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then - python -c "import torch, torchvision, torchaudio" - cd ./pytorch - TORCHBENCH_COMMIT_ID=$(cat .github/ci_commit_pins/torchbench.txt) - git clone https://github.com/pytorch/benchmark.git xpu-benchmark - cd xpu-benchmark && git checkout $TORCHBENCH_COMMIT_ID - # remove deps which will reinstall torch - pip install --no-deps accelerate - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) - pip install -U transformers==4.44.2 - sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g' requirements.txt - git status && git diff - pip install -r requirements.txt - python install.py --continue_on_fail - # deps for torchrec_dlrm - pip install pyre_extensions - pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu - pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec - fi - if [[ ${{ inputs.suite }} == *"huggingface"* ]]; then - pip install -U transformers==4.44.2 - fi - if [[ ${{ inputs.suite }} == *"timm_models"* ]]; then - # install timm without dependencies - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 - # install timm dependencies without torch and torchvision - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) - fi - pip list |grep -E 'intel|torch' - - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - shell: bash -xe {0} - run: | - cp ./.github/scripts/inductor_xpu_test.sh ./pytorch - cd ./pytorch - # check param - function contains() { - contains_status="echo 'Start $2 
...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" - contains_status="continue" - } - } - xpu_num=$(clinfo --list |awk 'BEGIN{gpu=0;}{if(gpu==1 && $0~/Platform/){gpu=0;}; if(gpu==1){print $0;}; if($0~/Platform.*Graphics/){gpu=1;}}' |wc -l) - cores_per_instance="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk -v i="${xpu_num}" 'BEGIN{sum=1}{sum*=$NF}END{print sum/i}')" - export OMP_NUM_THREADS=${cores_per_instance} - for suite in $(echo ${{ inputs.suite }} |sed 's/,/ /g') - do - if [ "${suite}" == "pt2e" ];then - continue - fi - contains "huggingface,timm_models,torchbench" $suite - $contains_status - for dt in $(echo ${{ inputs.dt }} |sed 's/,/ /g') - do - contains "float32,bfloat16,float16,amp_bf16,amp_fp16" $dt - $contains_status - for mode in $(echo ${{ inputs.mode }} |sed 's/,/ /g') - do - contains "inference,training" $mode - $contains_status - for scenario in $(echo ${{ inputs.scenario }} |sed 's/,/ /g') - do - contains "accuracy,performance" $scenario - $contains_status - if [ "${MODEL_ONLY_NAME}" == "" ];then - for xpu_id in $(seq 0 $[ ${xpu_num} - 1 ]) - do - cpu_list="$(echo "${cores_per_instance} ${xpu_id}" |awk '{printf("%d-%d", $1*$2, $1*$2+$1-1)}')" - numactl --localalloc --physcpubind=${cpu_list} bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id} static ${xpu_num} ${xpu_id} & - done - else - for test_model in $(echo ${MODEL_ONLY_NAME} |sed 's/,/ /g') - do - numactl --localalloc bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 0 static 1 0 ${test_model} - done - fi - wait - # summarize pass rate - LOG_DIR="inductor_log/${suite}/${dt}" - LOG_NAME=inductor_${suite}_${dt}_${mode}_xpu_${scenario}_all.log - rm -f ${LOG_DIR}/${LOG_NAME} - find ${LOG_DIR}/ -name "inductor_${suite}_${dt}_${mode}_xpu_${scenario}_card*.log" |xargs cat >> ${LOG_DIR}/${LOG_NAME} 2>&1 - done - done - done - done - - - name: Summary E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - shell: bash -xe {0} - run: | - cd ./pytorch - rm -f inductor_log/summary_accuracy.csv - for var in $(find inductor_log/ -name "inductor_*_xpu_accuracy.csv") - do - sed -i "s/$/,$(basename $var)/" $var - cat $var >> inductor_log/summary_accuracy.csv - done - cd ${{ github.workspace }} - cp ./.github/scripts/inductor_summary.py ./pytorch - cd ./pytorch - pip install styleFrame scipy pandas - dt=$(echo ${{ inputs.dt }} |sed 's/,/ /g') - mode=$(echo ${{ inputs.mode }} |sed 's/,/ /g') - suite=$(echo ${{ inputs.suite }} |sed 's/,/ /g') - scenario=$(echo ${{ inputs.scenario }} |sed 's/,/ /g') - python inductor_summary.py -p ${dt} -s ${suite} -m ${mode} -sc ${scenario} diff --git a/.github/actions/linux-ut/action.yml b/.github/actions/linux-ut/action.yml deleted file mode 100644 index af01261071..0000000000 --- a/.github/actions/linux-ut/action.yml +++ /dev/null @@ -1,136 +0,0 @@ -name: Linux Unit Test - -inputs: - test_type: - required: true - type: string - description: Test scope - -permissions: read-all - -runs: - using: composite - steps: - - name: op_regression - shell: bash -xe {0} - if: ${{ inputs.test_type == 'op_regression' }} - run: | - cd pytorch/third_party/torch-xpu-ops/test/regressions - pytest --timeout 600 -v --junit-xml=../../ut_log/op_regression.xml - - name: op_transformers - shell: bash -xe {0} - if: ${{ inputs.test_type == 'op_transformers' }} - run: | - export PYTORCH_TEST_WITH_SLOW=1 - cd pytorch - pytest --timeout 600 -v test/test_transformers.py -k xpu \ - 
--junit-xml=$GITHUB_WORKSPACE/ut_log/op_transformers.xml - - name: op_extended - shell: bash -xe {0} - if: ${{ inputs.test_type == 'op_extended' }} - run: | - export PYTORCH_TEST_WITH_SLOW=1 - cd pytorch/third_party/torch-xpu-ops/test/xpu/extended - timeout 3600 python run_test_with_skip.py - cp op_extended.xml $GITHUB_WORKSPACE/ut_log - - name: op_ut - shell: bash -xe {0} - if: ${{ inputs.test_type == 'op_ut' }} - run: | - export PYTORCH_TEST_WITH_SLOW=1 - export PYTORCH_ENABLE_XPU_FALLBACK=1 - cd pytorch/third_party/torch-xpu-ops/test/xpu - timeout 10000 python run_test_with_skip.py \ - 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test.log - cp *.xml $GITHUB_WORKSPACE/ut_log - find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c ' - dir_path=$(dirname "$1"); - case "$dir_path" in - *"op_ut_with_skip_quantization/core"*) - dir_name="op_ut_with_skip_quantization_core";; - *) - dir_name=$(basename "$dir_path");; - esac; - mv "$1" "$dir_path/${dir_name}_$(basename "$1")" - ' _ {} \; - cp op_ut_with_skip_nn/*.xml $GITHUB_WORKSPACE/ut_log - cp op_ut_with_skip_quantization/core/*.xml $GITHUB_WORKSPACE/ut_log - # Cases run with a on-demand white list, since some suites are too - # slow to go through all operators on CPU. So add cases on-demand - # when XPU implementatoin is done. - # test_foreach, test_decomp - # Run with only - timeout 10000 python run_test_with_only.py \ - 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test.log - cp op_ut_with_only.xml $GITHUB_WORKSPACE/ut_log - - name: torch_xpu - shell: bash -xe {0} - if: ${{ inputs.test_type == 'torch_xpu' }} - run: | - export PYTORCH_TEST_WITH_SLOW=1 - export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" - cd pytorch - test_cmd="python test/run_test.py --include " - for test in $(ls test/inductor | grep test); do test_cmd="${test_cmd} inductor/$test"; done - for test in $(ls test/xpu | grep test); do test_cmd="${test_cmd} xpu/$test"; done - if [ -f "test/test_xpu.py" ]; then test_cmd="${test_cmd} test_xpu.py"; fi - eval $test_cmd 2>$GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test.log - - name: xpu_profiling - shell: bash -xe {0} - if: ${{ inputs.test_type == 'xpu_profiling' }} - run: | - mkdir -p ut_log/profile_test/issue_reproduce - cd pytorch/third_party/torch-xpu-ops - # RN50 Test - PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0 - cp profiling.fp32.train.pt $GITHUB_WORKSPACE/ut_log/profile_test - # All Issue Reproduce UT - python -u test/profiling/correlation_id_mixed.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/correlation_id_mixed.log - python -u test/profiling/reproducer.missing.gpu.kernel.time.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/reproducer.missing.gpu.kernel.time.log - python -u test/profiling/time_precision_in_profile.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/time_precision_in_profile.log - python -u test/profiling/profile_partial_runtime_ops.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/profile_partial_runtime_ops.log - python -u test/profiling/triton_xpu_ops_time.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/triton_xpu_ops_time.log - # All xpu ut under test/profiler - cd ../pytorch/test/profiler - python -m pytest --timeout 600 -vs 
test_cpp_thread.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/test_cpp_thread.log - python -m pytest --timeout 600 -vs test_execution_trace.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/test_execution_trace.log - python -m pytest --timeout 600 -vs test_memory_profiler.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/test_memory_profiler.log - python -m pytest --timeout 600 -vs test_profiler_tree.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/test_profiler_tree.log - - - name: op_dev1 - shell: bash -xe {0} - if: ${{ inputs.test_type == 'op_dev1' }} - run: | - mkdir -p ut_log/op_dev1 - cd pytorch/third_party/torch-xpu-ops/test/regressions - pytest --timeout 200 -v test_operation_on_device_1.py \ - --junit-xml=$GITHUB_WORKSPACE/ut_log/op_dev1.xml \ - 2>${{ github.workspace }}/ut_log/op_dev1/op_dev1_test_error.log | \ - tee ${{ github.workspace }}/ut_log/op_dev1/op_dev1_test.log - - - name: xpu_distributed - shell: bash -x -e -o pipefail {0} - if: ${{ inputs.test_type == 'xpu_distributed' }} - run: | - mkdir -p ut_log/xpu_distributed - cd pytorch/third_party/torch-xpu-ops/test/xpu - XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") - if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then - echo -e "[ERROR] XCCL is not enabled" - exit 1 - fi - timeout 1800 python run_distributed.py \ - 2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ - tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log diff --git a/.github/actions/setup-testenv/action.yml b/.github/actions/setup-testenv/action.yml deleted file mode 100644 index de5bdf5753..0000000000 --- a/.github/actions/setup-testenv/action.yml +++ /dev/null @@ -1,134 +0,0 @@ -name: Setup Test Environment - -inputs: - test_type: - required: true - type: string - description: Test scope - pytorch: - type: string - default: 'main' - description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' - torch_xpu_ops: - type: string - default: 'main' - description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin - oneapi: - type: string - default: 'installed' - description: Installed oneAPI DLE on host by default, fill offline.sh url if needed - python: - type: string - default: '3.10' - description: Python version - -permissions: read-all - -runs: - using: composite - steps: - - name: Setup python-${{ inputs.python }} - uses: actions/setup-python@v5 - with: - python-version: ${{ inputs.python }} - - name: Check runner - shell: bash -xe {0} - run: | - hostname && id - cat /etc/os-release - gcc -v && g++ -v - which python && python -V - which pip && pip list - pip install -U pip wheel setuptools - uname -a - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - clinfo --list - cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq -c - rm -rf ~/.triton /tmp/*inductor* - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - with: - path: torch-xpu-ops - - name: Install oneAPI DLE - shell: bash -xe {0} - if: ${{ inputs.oneapi != 'installed' }} - run: | - rm -rf ~/intel ~/.intel - wget -q -O oneapi.sh "${{ inputs.oneapi }}" - bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi - echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} - source ${HOME}/intel/oneapi/setvars.sh - sycl-ls && icpx -v - - name: Download Pytorch wheel - if: ${{ ! 
contains(inputs.test_type, 'wheel') }} - uses: actions/download-artifact@v4 - with: - pattern: Torch-XPU-Wheel-* - - name: Prepare Stock Pytorch - shell: bash -xe {0} - run: | - # install pytorch - if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" -c) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" -c) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu - elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" -c) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - else - pip install --force-reinstall $(find ${{ github.workspace }}/ -name "*torch*.whl") - fi - pip list |grep torch - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then - PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" - else - PYTORCH_REPO="https://github.com/pytorch/pytorch.git" - fi - git clone ${PYTORCH_REPO} pytorch - cd pytorch - git checkout ${TORCH_COMMIT_ID} - pip install -r .ci/docker/requirements-ci.txt - # apply extra PRs for stock pytorch - if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 - else - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - fi - git status && git diff && git show -s - - name: Prepare Torch-xpu-ops - shell: bash -xe {0} - if: ${{ inputs.torch_xpu_ops != 'skipped' }} - run: | - cd pytorch - rm -rf third_party/torch-xpu-ops - if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then - TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" - TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" - else - TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" - if [ "${{ inputs.torch_xpu_ops }}" == "pinned" ];then - TORCH_XPU_OPS_COMMIT="$(cat third_party/xpu.txt)" - else - TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" - fi - fi - if [ "${{ inputs.test_type }}" == "cicd" ];then - cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops - else - git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops - fi - cd third_party/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_COMMIT} - git status && git diff && git show -s - - name: Torch Config - shell: bash -xe {0} - run: | - printenv - python -c "import torch; print(torch.__config__.show())" - python -c "import torch; print(torch.__config__.parallel_info())" - python -c "import torch; print(torch.__config__.torch.xpu.device_count())" - python -c "import torchvision; print(torchvision.__version__)" - python -c "import torchaudio; print(torchaudio.__version__)" - python -c "import triton; print(triton.__version__)" - python pytorch/torch/utils/collect_env.py - pip list |grep -E 'torch|intel' diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index b8733741e5..763dca0ca1 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -92,7 +92,7 @@ jobs: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Launch Test on ${{ needs.runner.outputs.hostname }} - uses: ./.github/actions/setup-testenv + uses: ./.github/actions/linux-testenv with: test_type: ${{ inputs.test_type }} pytorch: ${{ inputs.pytorch }} @@ -103,7 +103,7 @@ jobs: # CICD 
launch - name: Nightly Huggingface BF16 & FP16 Training Test if: ${{ contains(inputs.test_type, 'cicd') }} - uses: ./.github/actions/linux-e2e + uses: ./.github/actions/linux-e2etest with: env_prepare: true suite: huggingface @@ -112,7 +112,7 @@ jobs: scenario: accuracy,performance - name: Nightly Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'cicd') }} - uses: ./.github/actions/linux-e2e + uses: ./.github/actions/linux-e2etest with: env_prepare: true suite: torchbench @@ -121,7 +121,7 @@ jobs: scenario: accuracy,performance - name: Nightly Timm_models BF16 Training Test if: ${{ contains(inputs.test_type, 'cicd') }} - uses: ./.github/actions/linux-e2e + uses: ./.github/actions/linux-e2etest with: env_prepare: true suite: timm_models @@ -132,7 +132,7 @@ jobs: # Nihglty launch - name: Nightly Huggingface Full Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: ./.github/actions/linux-e2e + uses: ./.github/actions/linux-e2etest with: env_prepare: true suite: huggingface @@ -141,7 +141,7 @@ jobs: scenario: accuracy,performance - name: Nightly Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: ./.github/actions/linux-e2e + uses: ./.github/actions/linux-e2etest with: env_prepare: true suite: torchbench @@ -150,7 +150,7 @@ jobs: scenario: accuracy,performance - name: Nightly Timm_models FP16 Training Test if: ${{ contains(inputs.test_type, 'nightly') }} - uses: ./.github/actions/linux-e2e + uses: ./.github/actions/linux-e2etest with: env_prepare: true suite: timm_models @@ -168,7 +168,7 @@ jobs: # Weekly launch - name: Nightly Huggingface Full Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: ./.github/actions/linux-e2e + uses: ./.github/actions/linux-e2etest with: env_prepare: true suite: huggingface @@ -177,7 +177,7 @@ jobs: scenario: accuracy,performance - name: Nightly Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: ./.github/actions/linux-e2e + uses: ./.github/actions/linux-e2etest with: env_prepare: true suite: torchbench @@ -186,7 +186,7 @@ jobs: scenario: accuracy,performance - name: Nightly Timm_models FP16 Training Test if: ${{ contains(inputs.test_type, 'weekly') }} - uses: ./.github/actions/linux-e2e + uses: ./.github/actions/linux-e2etest with: env_prepare: true suite: timm_models @@ -204,7 +204,7 @@ jobs: # On-demand launch - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) if: ${{ contains(inputs.test_type, 'ondemand') && inputs.suite != 'pt2e' }} - uses: ./.github/actions/linux-e2e + uses: ./.github/actions/linux-e2etest with: env_prepare: true suite: ${{ inputs.suite }} diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index bd6c1adc70..6524c5418f 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -73,7 +73,7 @@ jobs: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Launch Test on ${{ needs.runner.outputs.hostname }} - uses: ./.github/actions/setup-testenv + uses: ./.github/actions/linux-testenv with: test_type: ${{ inputs.test_type }} pytorch: ${{ inputs.pytorch }} diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index e51886fa48..029a029498 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -90,7 +90,7 @@ jobs: uses: actions/checkout@v4 - name: Launch Test on ${{ needs.runner.outputs.hostname }} if: ${{ contains(inputs.ut, 
matrix.test) }} - uses: ./.github/actions/setup-testenv + uses: ./.github/actions/linux-testenv with: test_type: ${{ inputs.test_type }} pytorch: ${{ inputs.pytorch }} @@ -99,7 +99,7 @@ jobs: python: ${{ inputs.python }} - name: Run XPU UT Test if: ${{ contains(inputs.ut, matrix.test) }} - uses: ./.github/actions/linux-ut + uses: ./.github/actions/linux-uttest with: test_type: ${{ matrix.test }} - name: UT Test Results Summary @@ -144,7 +144,7 @@ jobs: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Launch Test on ${{ steps.cleanup.outputs.hostname }} - uses: ./.github/actions/setup-testenv + uses: ./.github/actions/linux-testenv with: test_type: ${{ inputs.test_type }} pytorch: ${{ inputs.pytorch }} @@ -152,7 +152,7 @@ jobs: oneapi: ${{ inputs.oneapi }} python: ${{ inputs.python }} - name: Run XPU UT Test - uses: ./.github/actions/linux-ut + uses: ./.github/actions/linux-uttest with: test_type: op_dev1 - name: Upload Inductor XPU UT Log @@ -183,7 +183,7 @@ jobs: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Launch Test on ${{ steps.cleanup.outputs.hostname }} - uses: ./.github/actions/setup-testenv + uses: ./.github/actions/linux-testenv with: test_type: ${{ inputs.test_type }} pytorch: ${{ inputs.pytorch }} @@ -199,7 +199,7 @@ jobs: cat ptrace_scope.bk echo "0" |sudo tee /proc/sys/kernel/yama/ptrace_scope - name: Run Torch XPU Distributed UT - uses: ./.github/actions/linux-ut + uses: ./.github/actions/linux-uttest with: test_type: xpu_distributed - name: Reset Ptrace_scope From 70577e1927a707d9551eaaecc19e307068b29bb5 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 11:10:23 +0800 Subject: [PATCH 096/160] update --- .github/actions/linux-e2etest/action.yml | 143 +++++++++++++++++++++++ .github/actions/linux-testenv/action.yml | 135 +++++++++++++++++++++ .github/actions/linux-uttest/action.yml | 136 +++++++++++++++++++++ 3 files changed, 414 insertions(+) create mode 100644 .github/actions/linux-e2etest/action.yml create mode 100644 .github/actions/linux-testenv/action.yml create mode 100644 .github/actions/linux-uttest/action.yml diff --git a/.github/actions/linux-e2etest/action.yml b/.github/actions/linux-e2etest/action.yml new file mode 100644 index 0000000000..559b3b307b --- /dev/null +++ b/.github/actions/linux-e2etest/action.yml @@ -0,0 +1,143 @@ +name: Linux E2E Test + +inputs: + env_prepare: + required: false + description: If set to any value, will prepare suite test env + suite: + required: true + type: string + default: 'huggingface' + description: Dynamo benchmarks test suite. huggingface,timm_models,torchbench. Delimiter is comma + dt: + required: true + type: string + default: 'float32' + description: Data precision of the test.float32,bfloat16,float16,amp_bf16,amp_fp16. Delimiter is comma + mode: + required: true + type: string + default: 'inference' + description: inference,training. Delimiter is comma + scenario: + required: true + type: string + default: 'accuracy' + description: accuracy,performance. 
Delimiter is comma + +runs: + using: composite + steps: + - name: Prepare ENV + if: ${{ inputs.env_prepare }} + shell: bash -xe {0} + run: | + if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then + python -c "import torch, torchvision, torchaudio" + cd ./pytorch + TORCHBENCH_COMMIT_ID=$(cat .github/ci_commit_pins/torchbench.txt) + git clone https://github.com/pytorch/benchmark.git xpu-benchmark + cd xpu-benchmark && git checkout $TORCHBENCH_COMMIT_ID + # remove deps which will reinstall torch + pip install --no-deps accelerate + pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 + pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) + pip install -U transformers==4.44.2 + sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g' requirements.txt + git status && git diff + pip install -r requirements.txt + python install.py --continue_on_fail + # deps for torchrec_dlrm + pip install pyre_extensions + pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu + pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec + fi + if [[ ${{ inputs.suite }} == *"huggingface"* ]]; then + pip install -U transformers==4.44.2 + fi + if [[ ${{ inputs.suite }} == *"timm_models"* ]]; then + # install timm without dependencies + pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 + # install timm dependencies without torch and torchvision + pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) + fi + pip list |grep -E 'intel|torch' + - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) + shell: bash -xe {0} + run: | + cp ./.github/scripts/inductor_xpu_test.sh ./pytorch + cd ./pytorch + # check param + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" 
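+                # NOTE: callers expand $contains_status right after each contains()
+                # call, so "continue" here skips the unsupported suite/dtype/mode/
+                # scenario in the surrounding loop, while a supported value simply
+                # prints "Start <value> ...".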
+ contains_status="continue" + } + } + xpu_num=$(clinfo --list |awk 'BEGIN{gpu=0;}{if(gpu==1 && $0~/Platform/){gpu=0;}; if(gpu==1){print $0;}; if($0~/Platform.*Graphics/){gpu=1;}}' |wc -l) + cores_per_instance="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk -v i="${xpu_num}" 'BEGIN{sum=1}{sum*=$NF}END{print sum/i}')" + export OMP_NUM_THREADS=${cores_per_instance} + for suite in $(echo ${{ inputs.suite }} |sed 's/,/ /g') + do + if [ "${suite}" == "pt2e" ];then + continue + fi + contains "huggingface,timm_models,torchbench" $suite + $contains_status + for dt in $(echo ${{ inputs.dt }} |sed 's/,/ /g') + do + contains "float32,bfloat16,float16,amp_bf16,amp_fp16" $dt + $contains_status + for mode in $(echo ${{ inputs.mode }} |sed 's/,/ /g') + do + contains "inference,training" $mode + $contains_status + for scenario in $(echo ${{ inputs.scenario }} |sed 's/,/ /g') + do + contains "accuracy,performance" $scenario + $contains_status + if [ "${MODEL_ONLY_NAME}" == "" ];then + for xpu_id in $(seq 0 $[ ${xpu_num} - 1 ]) + do + cpu_list="$(echo "${cores_per_instance} ${xpu_id}" |awk '{printf("%d-%d", $1*$2, $1*$2+$1-1)}')" + numactl --localalloc --physcpubind=${cpu_list} bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id} static ${xpu_num} ${xpu_id} & + done + else + for test_model in $(echo ${MODEL_ONLY_NAME} |sed 's/,/ /g') + do + numactl --localalloc bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 0 static 1 0 ${test_model} + done + fi + wait + # summarize pass rate + LOG_DIR="inductor_log/${suite}/${dt}" + LOG_NAME=inductor_${suite}_${dt}_${mode}_xpu_${scenario}_all.log + rm -f ${LOG_DIR}/${LOG_NAME} + find ${LOG_DIR}/ -name "inductor_${suite}_${dt}_${mode}_xpu_${scenario}_card*.log" |xargs cat >> ${LOG_DIR}/${LOG_NAME} 2>&1 + done + done + done + done + + - name: Summary E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) + shell: bash -xe {0} + run: | + cd ./pytorch + rm -f inductor_log/summary_accuracy.csv + for var in $(find inductor_log/ -name "inductor_*_xpu_accuracy.csv") + do + sed -i "s/$/,$(basename $var)/" $var + cat $var >> inductor_log/summary_accuracy.csv + done + cd ${{ github.workspace }} + cp ./.github/scripts/inductor_summary.py ./pytorch + cd ./pytorch + pip install styleFrame scipy pandas + dt=$(echo ${{ inputs.dt }} |sed 's/,/ /g') + mode=$(echo ${{ inputs.mode }} |sed 's/,/ /g') + suite=$(echo ${{ inputs.suite }} |sed 's/,/ /g') + scenario=$(echo ${{ inputs.scenario }} |sed 's/,/ /g') + python inductor_summary.py -p ${dt} -s ${suite} -m ${mode} -sc ${scenario} diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml new file mode 100644 index 0000000000..20e2bf15bc --- /dev/null +++ b/.github/actions/linux-testenv/action.yml @@ -0,0 +1,135 @@ +name: Setup Test Environment + +inputs: + test_type: + required: true + type: string + description: Test scope + pytorch: + type: string + default: 'main' + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: + type: string + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + python: + type: string + default: '3.10' + description: Python version + +permissions: read-all + +runs: + using: composite + steps: + - name: Setup python-${{ 
inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} + - name: Check runner + shell: bash -xe {0} + run: | + hostname && id + cat /etc/os-release + gcc -v && g++ -v + which python && python -V + which pip && pip list + pip install -U pip wheel setuptools + uname -a + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + clinfo --list + cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq -c + rm -rf ~/.triton /tmp/*inductor* + pip install pandas psutil scipy requests pytest-timeout + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + with: + path: torch-xpu-ops + - name: Install oneAPI DLE + shell: bash -xe {0} + if: ${{ inputs.oneapi != 'installed' }} + run: | + rm -rf ~/intel ~/.intel + wget -q -O oneapi.sh "${{ inputs.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} + source ${HOME}/intel/oneapi/setvars.sh + sycl-ls && icpx -v + - name: Download Pytorch wheel + if: ${{ ! contains(inputs.test_type, 'wheel') }} + uses: actions/download-artifact@v4 + with: + pattern: Torch-XPU-Wheel-* + - name: Prepare Stock Pytorch + shell: bash -xe {0} + run: | + # install pytorch + if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" -c) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" -c) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" -c) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + else + pip install --force-reinstall $(find ${{ github.workspace }}/ -name "*torch*.whl") + fi + pip list |grep torch + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then + PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" + else + PYTORCH_REPO="https://github.com/pytorch/pytorch.git" + fi + git clone ${PYTORCH_REPO} pytorch + cd pytorch + git checkout ${TORCH_COMMIT_ID} + pip install -r .ci/docker/requirements-ci.txt + # apply extra PRs for stock pytorch + if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 + else + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + fi + git status && git diff && git show -s + - name: Prepare Torch-xpu-ops + shell: bash -xe {0} + if: ${{ inputs.torch_xpu_ops != 'skipped' }} + run: | + cd pytorch + rm -rf third_party/torch-xpu-ops + if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then + TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" + TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" + else + TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" + if [ "${{ inputs.torch_xpu_ops }}" == "pinned" ];then + TORCH_XPU_OPS_COMMIT="$(cat third_party/xpu.txt)" + else + TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" + fi + fi + if [ "${{ inputs.test_type }}" == "cicd" ];then + cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops + else + git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops + fi + cd third_party/torch-xpu-ops + git checkout ${TORCH_XPU_OPS_COMMIT} + git status && git 
diff && git show -s + - name: Torch Config + shell: bash -xe {0} + run: | + printenv + python -c "import torch; print(torch.__config__.show())" + python -c "import torch; print(torch.__config__.parallel_info())" + python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" + python -c "import triton; print(triton.__version__)" + python pytorch/torch/utils/collect_env.py + pip list |grep -E 'torch|intel' diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml new file mode 100644 index 0000000000..af01261071 --- /dev/null +++ b/.github/actions/linux-uttest/action.yml @@ -0,0 +1,136 @@ +name: Linux Unit Test + +inputs: + test_type: + required: true + type: string + description: Test scope + +permissions: read-all + +runs: + using: composite + steps: + - name: op_regression + shell: bash -xe {0} + if: ${{ inputs.test_type == 'op_regression' }} + run: | + cd pytorch/third_party/torch-xpu-ops/test/regressions + pytest --timeout 600 -v --junit-xml=../../ut_log/op_regression.xml + - name: op_transformers + shell: bash -xe {0} + if: ${{ inputs.test_type == 'op_transformers' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + cd pytorch + pytest --timeout 600 -v test/test_transformers.py -k xpu \ + --junit-xml=$GITHUB_WORKSPACE/ut_log/op_transformers.xml + - name: op_extended + shell: bash -xe {0} + if: ${{ inputs.test_type == 'op_extended' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + cd pytorch/third_party/torch-xpu-ops/test/xpu/extended + timeout 3600 python run_test_with_skip.py + cp op_extended.xml $GITHUB_WORKSPACE/ut_log + - name: op_ut + shell: bash -xe {0} + if: ${{ inputs.test_type == 'op_ut' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + export PYTORCH_ENABLE_XPU_FALLBACK=1 + cd pytorch/third_party/torch-xpu-ops/test/xpu + timeout 10000 python run_test_with_skip.py \ + 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test_error.log | \ + tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test.log + cp *.xml $GITHUB_WORKSPACE/ut_log + find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c ' + dir_path=$(dirname "$1"); + case "$dir_path" in + *"op_ut_with_skip_quantization/core"*) + dir_name="op_ut_with_skip_quantization_core";; + *) + dir_name=$(basename "$dir_path");; + esac; + mv "$1" "$dir_path/${dir_name}_$(basename "$1")" + ' _ {} \; + cp op_ut_with_skip_nn/*.xml $GITHUB_WORKSPACE/ut_log + cp op_ut_with_skip_quantization/core/*.xml $GITHUB_WORKSPACE/ut_log + # Cases run with a on-demand white list, since some suites are too + # slow to go through all operators on CPU. So add cases on-demand + # when XPU implementatoin is done. 
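+      # A minimal sketch (paths and flags assumed, for local debugging only) of
+      # smoke-checking one allow-listed suite before wiring it into
+      # run_test_with_only.py:
+      #   python -m pytest --timeout 600 -v ../../../../test/test_foreach.py -k xpu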
+ # test_foreach, test_decomp + # Run with only + timeout 10000 python run_test_with_only.py \ + 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test_error.log | \ + tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test.log + cp op_ut_with_only.xml $GITHUB_WORKSPACE/ut_log + - name: torch_xpu + shell: bash -xe {0} + if: ${{ inputs.test_type == 'torch_xpu' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" + cd pytorch + test_cmd="python test/run_test.py --include " + for test in $(ls test/inductor | grep test); do test_cmd="${test_cmd} inductor/$test"; done + for test in $(ls test/xpu | grep test); do test_cmd="${test_cmd} xpu/$test"; done + if [ -f "test/test_xpu.py" ]; then test_cmd="${test_cmd} test_xpu.py"; fi + eval $test_cmd 2>$GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test_error.log | \ + tee $GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test.log + - name: xpu_profiling + shell: bash -xe {0} + if: ${{ inputs.test_type == 'xpu_profiling' }} + run: | + mkdir -p ut_log/profile_test/issue_reproduce + cd pytorch/third_party/torch-xpu-ops + # RN50 Test + PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0 + cp profiling.fp32.train.pt $GITHUB_WORKSPACE/ut_log/profile_test + # All Issue Reproduce UT + python -u test/profiling/correlation_id_mixed.py | \ + tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/correlation_id_mixed.log + python -u test/profiling/reproducer.missing.gpu.kernel.time.py | \ + tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/reproducer.missing.gpu.kernel.time.log + python -u test/profiling/time_precision_in_profile.py | \ + tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/time_precision_in_profile.log + python -u test/profiling/profile_partial_runtime_ops.py | \ + tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/profile_partial_runtime_ops.log + python -u test/profiling/triton_xpu_ops_time.py | \ + tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/triton_xpu_ops_time.log + # All xpu ut under test/profiler + cd ../pytorch/test/profiler + python -m pytest --timeout 600 -vs test_cpp_thread.py | \ + tee $GITHUB_WORKSPACE/ut_log/profile_test/test_cpp_thread.log + python -m pytest --timeout 600 -vs test_execution_trace.py | \ + tee $GITHUB_WORKSPACE/ut_log/profile_test/test_execution_trace.log + python -m pytest --timeout 600 -vs test_memory_profiler.py | \ + tee $GITHUB_WORKSPACE/ut_log/profile_test/test_memory_profiler.log + python -m pytest --timeout 600 -vs test_profiler_tree.py | \ + tee $GITHUB_WORKSPACE/ut_log/profile_test/test_profiler_tree.log + + - name: op_dev1 + shell: bash -xe {0} + if: ${{ inputs.test_type == 'op_dev1' }} + run: | + mkdir -p ut_log/op_dev1 + cd pytorch/third_party/torch-xpu-ops/test/regressions + pytest --timeout 200 -v test_operation_on_device_1.py \ + --junit-xml=$GITHUB_WORKSPACE/ut_log/op_dev1.xml \ + 2>${{ github.workspace }}/ut_log/op_dev1/op_dev1_test_error.log | \ + tee ${{ github.workspace }}/ut_log/op_dev1/op_dev1_test.log + + - name: xpu_distributed + shell: bash -x -e -o pipefail {0} + if: ${{ inputs.test_type == 'xpu_distributed' }} + run: | + mkdir -p ut_log/xpu_distributed + cd pytorch/third_party/torch-xpu-ops/test/xpu + XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") + if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then + echo -e "[ERROR] XCCL is not enabled" + exit 1 + fi + timeout 1800 python run_distributed.py \ + 2>${{ 
github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ + tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log From 2467e9e03a939d7c8d2305f683dad5458598afc3 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 11:12:19 +0800 Subject: [PATCH 097/160] update --- .github/workflows/_linux_build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 53fdc621a0..afbae93cd8 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -59,7 +59,7 @@ jobs: uses: ./.github/actions/get-runner build: - name: ${{ contains(inputs.test_type, 'wheel') && inputs.pytorch || 'build' }} + name: ${{ inputs.pytorch }} needs: runner if: ${{ ! contains(inputs.test_type, 'wheel') }} runs-on: ${{ needs.runner.outputs.runner_id }} From 96ff039d997483bc0e2d93161843dedc1b024af0 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 11:44:30 +0800 Subject: [PATCH 098/160] update --- .github/workflows/_linux_e2e.yml | 13 ++++++++----- .github/workflows/_linux_op_benchmark.yml | 2 +- .github/workflows/_linux_ut.yml | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 763dca0ca1..984dec240b 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -78,7 +78,7 @@ jobs: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} - options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g + options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool @@ -244,15 +244,17 @@ jobs: GH_TOKEN: ${{ github.token }} REFERENCE_ISSUE_ID: 1645 steps: + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} - name: Install gh run: | apt-get update apt-get install gh rsync ca-certificates -y find ./ |grep -v "^\./$" |xargs rm -rf - - name: Setup python-${{ inputs.python }} - uses: actions/setup-python@v5 - with: - python-version: ${{ inputs.python }} + python -m venv /tmp/myvenv + echo "PATH=/tmp/myvenv/bin:$PATH" >> ${GITHUB_ENV} - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Download Target Artifact @@ -279,6 +281,7 @@ jobs: - name: Get summary if: ${{ ! 
cancelled() }} run: | + export HOME=/tmp/ pip install pandas requests if [ "${{ inputs.suite }}" != 'pt2e' ];then bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 6524c5418f..1dbcfed652 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -59,7 +59,7 @@ jobs: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} - options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g + options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} env: AGENT_TOOLSDIRECTORY: /opt/xpu-tool diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 029a029498..39f219df6c 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -69,7 +69,7 @@ jobs: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} - options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g + options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} -e ZE_AFFINITY_MASK env: From da12ea01e975e518741e02fd321ca20efc65c0e3 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 13:35:29 +0800 Subject: [PATCH 099/160] update --- .github/workflows/_linux_ut.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 39f219df6c..dce5d44753 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -238,14 +238,14 @@ jobs: if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/download-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }} + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} path: ${{ github.workspace }}/ut_log - name: Check UT Results if: ${{ contains(inputs.ut, matrix.test) }} shell: bash run: | repo="${{ github.repository }}" - cd ${{ github.workspace }}/ut_log/${{ matrix.test.name }} + cd ${{ github.workspace }}/ut_log/${{ matrix.test }} gh --repo $repo issue view $UT_SKIP_ISSUE --json body -q .body | sed '/^$/d' > Known_issue.log gh api "repos/${{ github.repository }}/issues?labels=skipped" \ --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' > issues.log @@ -255,10 +255,10 @@ jobs: cat issues_temp.log | awk '{print $1}' >> Known_issue.log awk -F'::' '{print $1}' issues_op_ut.log | sort -u | paste -sd ',' >> Known_issue.log cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh ${{ matrix.test.name }} + bash ut_result_check.sh ${{ matrix.test }} - name: Upload Inductor XPU UT Log if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }}-checked + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }}-checked path: ${{ github.workspace 
}}/ut_log From 4f6ecfdb01b015c16d59e6ebe79079ca25bb60f1 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 13:45:17 +0800 Subject: [PATCH 100/160] modify ut --- .github/actions/linux-uttest/action.yml | 58 ++++++++++++------------- .github/scripts/check-ut.py | 16 +++---- .github/scripts/ut_result_check.sh | 18 ++++---- .github/workflows/_linux_ut.yml | 18 ++++---- .github/workflows/_windows_ut.yml | 4 +- .github/workflows/nightly_ondemand.yml | 6 +-- .github/workflows/pull.yml | 4 +- 7 files changed, 62 insertions(+), 62 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index af01261071..8693ab96a0 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -11,60 +11,60 @@ permissions: read-all runs: using: composite steps: - - name: op_regression + - name: ut_regression shell: bash -xe {0} - if: ${{ inputs.test_type == 'op_regression' }} + if: ${{ inputs.test_type == 'ut_regression' }} run: | cd pytorch/third_party/torch-xpu-ops/test/regressions - pytest --timeout 600 -v --junit-xml=../../ut_log/op_regression.xml - - name: op_transformers + pytest --timeout 600 -v --junit-xml=../../ut_log/ut_regression.xml + - name: ut_transformers shell: bash -xe {0} - if: ${{ inputs.test_type == 'op_transformers' }} + if: ${{ inputs.test_type == 'ut_transformers' }} run: | export PYTORCH_TEST_WITH_SLOW=1 cd pytorch pytest --timeout 600 -v test/test_transformers.py -k xpu \ - --junit-xml=$GITHUB_WORKSPACE/ut_log/op_transformers.xml - - name: op_extended + --junit-xml=$GITHUB_WORKSPACE/ut_log/ut_transformers.xml + - name: ut_extended shell: bash -xe {0} - if: ${{ inputs.test_type == 'op_extended' }} + if: ${{ inputs.test_type == 'ut_extended' }} run: | export PYTORCH_TEST_WITH_SLOW=1 cd pytorch/third_party/torch-xpu-ops/test/xpu/extended timeout 3600 python run_test_with_skip.py - cp op_extended.xml $GITHUB_WORKSPACE/ut_log - - name: op_ut + cp ut_extended.xml $GITHUB_WORKSPACE/ut_log + - name: ut_op shell: bash -xe {0} - if: ${{ inputs.test_type == 'op_ut' }} + if: ${{ inputs.test_type == 'ut_op' }} run: | export PYTORCH_TEST_WITH_SLOW=1 export PYTORCH_ENABLE_XPU_FALLBACK=1 cd pytorch/third_party/torch-xpu-ops/test/xpu timeout 10000 python run_test_with_skip.py \ - 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test.log + 2>$GITHUB_WORKSPACE/ut_log/ut_op/ut_op_with_skip_test_error.log | \ + tee $GITHUB_WORKSPACE/ut_log/ut_op/ut_op_with_skip_test.log cp *.xml $GITHUB_WORKSPACE/ut_log - find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c ' + find ut_op_with_skip_nn ut_op_with_skip_quantization/core -type f -exec sh -c ' dir_path=$(dirname "$1"); case "$dir_path" in - *"op_ut_with_skip_quantization/core"*) - dir_name="op_ut_with_skip_quantization_core";; + *"ut_op_with_skip_quantization/core"*) + dir_name="ut_op_with_skip_quantization_core";; *) dir_name=$(basename "$dir_path");; esac; mv "$1" "$dir_path/${dir_name}_$(basename "$1")" ' _ {} \; - cp op_ut_with_skip_nn/*.xml $GITHUB_WORKSPACE/ut_log - cp op_ut_with_skip_quantization/core/*.xml $GITHUB_WORKSPACE/ut_log + cp ut_op_with_skip_nn/*.xml $GITHUB_WORKSPACE/ut_log + cp ut_op_with_skip_quantization/core/*.xml $GITHUB_WORKSPACE/ut_log # Cases run with a on-demand white list, since some suites are too # slow to go through all operators on CPU. So add cases on-demand # when XPU implementatoin is done. 
# test_foreach, test_decomp # Run with only timeout 10000 python run_test_with_only.py \ - 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test.log - cp op_ut_with_only.xml $GITHUB_WORKSPACE/ut_log + 2>$GITHUB_WORKSPACE/ut_log/ut_op/ut_op_with_only_test_error.log | \ + tee $GITHUB_WORKSPACE/ut_log/ut_op/ut_op_with_only_test.log + cp ut_op_with_only.xml $GITHUB_WORKSPACE/ut_log - name: torch_xpu shell: bash -xe {0} if: ${{ inputs.test_type == 'torch_xpu' }} @@ -78,9 +78,9 @@ runs: if [ -f "test/test_xpu.py" ]; then test_cmd="${test_cmd} test_xpu.py"; fi eval $test_cmd 2>$GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test_error.log | \ tee $GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test.log - - name: xpu_profiling + - name: ut_profiling shell: bash -xe {0} - if: ${{ inputs.test_type == 'xpu_profiling' }} + if: ${{ inputs.test_type == 'ut_profiling' }} run: | mkdir -p ut_log/profile_test/issue_reproduce cd pytorch/third_party/torch-xpu-ops @@ -109,16 +109,16 @@ runs: python -m pytest --timeout 600 -vs test_profiler_tree.py | \ tee $GITHUB_WORKSPACE/ut_log/profile_test/test_profiler_tree.log - - name: op_dev1 + - name: xpu_dev1 shell: bash -xe {0} - if: ${{ inputs.test_type == 'op_dev1' }} + if: ${{ inputs.test_type == 'xpu_dev1' }} run: | - mkdir -p ut_log/op_dev1 + mkdir -p ut_log/xpu_dev1 cd pytorch/third_party/torch-xpu-ops/test/regressions pytest --timeout 200 -v test_operation_on_device_1.py \ - --junit-xml=$GITHUB_WORKSPACE/ut_log/op_dev1.xml \ - 2>${{ github.workspace }}/ut_log/op_dev1/op_dev1_test_error.log | \ - tee ${{ github.workspace }}/ut_log/op_dev1/op_dev1_test.log + --junit-xml=$GITHUB_WORKSPACE/ut_log/xpu_dev1.xml \ + 2>${{ github.workspace }}/ut_log/xpu_dev1/xpu_dev1_test_error.log | \ + tee ${{ github.workspace }}/ut_log/xpu_dev1/xpu_dev1_test.log - name: xpu_distributed shell: bash -x -e -o pipefail {0} diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index be84a64793..3364efa61c 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -186,14 +186,14 @@ def parse_log_file(log_file): return summary def determine_category(ut): - if ut == 'op_regression': - return 'op_regression' - elif ut == 'op_dev1': - return 'op_dev1' - elif ut == 'op_extended': - return 'op_extended' - elif 'op_ut' in ut: - return 'op_ut' + if ut == 'ut_regression': + return 'ut_regression' + elif ut == 'xpu_dev1': + return 'xpu_dev1' + elif ut == 'ut_extended': + return 'ut_extended' + elif 'ut_op' in ut: + return 'ut_op' else: return 'unknown' diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 0ad52580f7..bd7ccd490a 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -1,5 +1,5 @@ #!/bin/bash -ut_suite="${1:-op_regression}" # op_regression / op_extended / op_ut / torch_xpu +ut_suite="${1:-ut_regression}" # ut_regression / ut_extended / ut_op / torch_xpu # usage # compare_and_filter_logs [output.log] @@ -44,7 +44,7 @@ compare_and_filter_logs() { fi } -if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_dev1' || "${ut_suite}" == 'op_extended' || "${ut_suite}" == 'op_transformers' ]]; then +if [[ "${ut_suite}" == 'ut_regression' || "${ut_suite}" == 'xpu_dev1' || "${ut_suite}" == 'ut_extended' || "${ut_suite}" == 'ut_transformers' ]]; then grep -E "FAILED" "${ut_suite}"_test.log | awk '{print $1}' | grep -v "FAILED" > ./"${ut_suite}"_failed.log grep -E "have failures" "${ut_suite}"_test.log | awk 
'{print $1}' >> ./"${ut_suite}"_failed.log grep "PASSED" "${ut_suite}"_test.log | awk '{print $1}' > ./"${ut_suite}"_passed.log @@ -66,11 +66,11 @@ if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_dev1' || "${ut_su echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'op_ut' ]]; then - grep -E "FAILED" op_ut_with_skip_test.log | awk '{print $1}' | grep -v "FAILED" > ./"${ut_suite}"_with_skip_test_failed.log - grep -E "have failures" op_ut_with_skip_test.log | awk '{print $1}' >> ./"${ut_suite}"_with_skip_test_failed.log - grep -E "FAILED" op_ut_with_only_test.log | awk '{print $1}' | grep -v "FAILED" > ./"${ut_suite}"_with_only_test_failed.log - grep -E "have failures" op_ut_with_only_test.log | awk '{print $1}' >> ./"${ut_suite}"_with_only_test_failed.log +if [[ "${ut_suite}" == 'ut_op' ]]; then + grep -E "FAILED" ut_op_with_skip_test.log | awk '{print $1}' | grep -v "FAILED" > ./"${ut_suite}"_with_skip_test_failed.log + grep -E "have failures" ut_op_with_skip_test.log | awk '{print $1}' >> ./"${ut_suite}"_with_skip_test_failed.log + grep -E "FAILED" ut_op_with_only_test.log | awk '{print $1}' | grep -v "FAILED" > ./"${ut_suite}"_with_only_test_failed.log + grep -E "have failures" ut_op_with_only_test.log | awk '{print $1}' >> ./"${ut_suite}"_with_only_test_failed.log compare_and_filter_logs "${ut_suite}"_with_skip_test_failed.log Known_issue.log if [[ -f "${ut_suite}_with_skip_test_failed_filtered.log" ]]; then num_failed_with_skip=$(wc -l < "./${ut_suite}_with_skip_test_failed_filtered.log") @@ -92,8 +92,8 @@ if [[ "${ut_suite}" == 'op_ut' ]]; then echo -e "=========================================================================" cat "./${ut_suite}_with_only_test_failed.log" ((num_failed=num_failed_with_skip+num_failed_with_only)) - grep "PASSED" op_ut_with_skip_test.log | awk '{print $1}' > ./"${ut_suite}"_with_skip_test_passed.log - grep "PASSED" op_ut_with_only_test.log | awk '{print $1}' > ./"${ut_suite}"_with_only_test_passed.log + grep "PASSED" ut_op_with_skip_test.log | awk '{print $1}' > ./"${ut_suite}"_with_skip_test_passed.log + grep "PASSED" ut_op_with_only_test.log | awk '{print $1}' > ./"${ut_suite}"_with_only_test_passed.log num_passed_with_skip=$(wc -l < "./${ut_suite}_with_skip_test_passed.log") num_passed_with_only=$(wc -l < "./${ut_suite}_with_only_test_passed.log") ((num_passed=num_passed_with_skip+num_passed_with_only)) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index dce5d44753..f1940bd088 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -30,7 +30,7 @@ on: ut: required: true type: string - description: UT scope. `op_regression,op_dev1,op_transformers,op_extended,op_ut,torch_xpu` Delimiter is comma + description: UT scope. 
`ut_regression,xpu_dev1,ut_transformers,ut_extended,ut_op,torch_xpu` Delimiter is comma disabled_tests: type: string default: '' @@ -63,7 +63,7 @@ jobs: normal: needs: runner runs-on: ${{ needs.runner.outputs.runner_id }} - if: ${{ inputs.ut != 'xpu_distributed' && !contains(inputs.disabled_tests, 'disable_ut') }} + if: ${{ contains(inputs.ut, 'p') && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 300 container: image: mengfeili/intel-pvc-driver:1146-1136 @@ -79,7 +79,7 @@ jobs: strategy: fail-fast: false matrix: - test: [op_regression, op_transformers, op_extended, op_ut, torch_xpu, xpu_profiling] + test: [ut_regression, ut_transformers, ut_extended, ut_op, torch_xpu, ut_profiling] steps: - name: Cleanup workspace if: ${{ contains(inputs.ut, matrix.test) }} @@ -125,7 +125,7 @@ jobs: devices: runs-on: pvc_rolling - if: ${{ contains(inputs.ut, 'op_dev1') && !contains(inputs.disabled_tests, 'disable_ut') }} + if: ${{ contains(inputs.ut, 'xpu_dev1') && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 5 env: GH_TOKEN: ${{ github.token }} @@ -154,12 +154,12 @@ jobs: - name: Run XPU UT Test uses: ./.github/actions/linux-uttest with: - test_type: op_dev1 + test_type: xpu_dev1 - name: Upload Inductor XPU UT Log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-op_dev1 + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_dev1 path: ${{ github.workspace }}/ut_log distributed: @@ -222,7 +222,7 @@ jobs: strategy: fail-fast: false matrix: - test: [op_regression, op_transformers, op_extended, op_ut, torch_xpu, xpu_profiling, op_dev1, xpu_distributed] + test: [ut_regression, ut_transformers, ut_extended, ut_op, torch_xpu, ut_profiling, xpu_dev1, xpu_distributed] env: GH_TOKEN: ${{ github.token }} UT_SKIP_ISSUE: 1624 @@ -251,9 +251,9 @@ jobs: --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' > issues.log awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | \ grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' | sed 's/ *|| */ /g' | sort -u > issues_temp.log - awk '$2 == "op_ut" {print $1}' issues_temp.log > issues_op_ut.log + awk '$2 == "ut_op" {print $1}' issues_temp.log > issues_ut_op.log cat issues_temp.log | awk '{print $1}' >> Known_issue.log - awk -F'::' '{print $1}' issues_op_ut.log | sort -u | paste -sd ',' >> Known_issue.log + awk -F'::' '{print $1}' issues_ut_op.log | sort -u | paste -sd ',' >> Known_issue.log cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ bash ut_result_check.sh ${{ matrix.test }} - name: Upload Inductor XPU UT Log diff --git a/.github/workflows/_windows_ut.yml b/.github/workflows/_windows_ut.yml index 9cb27a30f1..f30e469749 100644 --- a/.github/workflows/_windows_ut.yml +++ b/.github/workflows/_windows_ut.yml @@ -17,7 +17,7 @@ on: required: true type: string default: '' - description: UT scope. `op_regression,op_dev1,op_extended,op_ut,torch_xpu` Delimiter is comma + description: UT scope. 
`ut_regression,xpu_dev1,ut_extended,ut_op,torch_xpu` Delimiter is comma python: required: false type: string @@ -157,7 +157,7 @@ jobs: path: 'C:\actions-runner\_work\torch-xpu-ops\pytorch\dist' - name: Run XPU OP Extended UT - if: contains(inputs.ut, 'op_extended') || github.event_name == 'schedule' + if: contains(inputs.ut, 'ut_extended') || github.event_name == 'schedule' shell: cmd run: | call "C:\ProgramData\miniforge3\Scripts\activate.bat" diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 1f00b63263..f0a452ae33 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -29,7 +29,7 @@ on: ut: type: string default: '' - description: UT scope. `op_regression,op_dev1,op_transformers,op_extended,op_ut,xpu_profiling,xpu_distributed,microbench,windows`. Delimiter is comma + description: UT scope. `ut_regression,ut_transformers,ut_extended,ut_op,ut_profiling,xpu_dev1,xpu_distributed,microbench,windows`. Delimiter is comma suite: type: string default: '' @@ -130,7 +130,7 @@ jobs: torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} - ut: ${{ github.event_name == 'schedule' && 'op_regression,op_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} + ut: ${{ github.event_name == 'schedule' && 'ut_regression,xpu_dev1,ut_transformers,ut_extended,ut_op' || inputs.ut }} Linux-Nightly-Ondemand-E2E-Tests: if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} @@ -168,7 +168,7 @@ jobs: name: windows uses: ./.github/workflows/_windows_ut.yml with: - ut: ${{ github.event_name == 'schedule' && 'op_extended,torch_xpu' || inputs.ut }} + ut: ${{ github.event_name == 'schedule' && 'ut_extended,torch_xpu' || inputs.ut }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} src_changed: false has_label: true diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index b2f098efaf..0a7def8e9e 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -108,7 +108,7 @@ jobs: runner: linux.idc.xpu test_type: build-cicd pytorch: main - ut: op_regression,op_dev1,op_transformers,op_extended,op_ut,xpu_distributed + ut: ut_regression,xpu_dev1,ut_transformers,ut_extended,ut_op,xpu_distributed disabled_tests: ${{ needs.conditions-filter.outputs.disabled_tests }} linux-e2e: @@ -127,7 +127,7 @@ jobs: needs: [conditions-filter] uses: ./.github/workflows/_windows_ut.yml with: - ut: op_extended,torch_xpu + ut: ut_extended,torch_xpu runner: Windows_CI src_changed: ${{ needs.conditions-filter.outputs.src_changed }} has_label: ${{ needs.conditions-filter.outputs.has_label }} From ba97507afa546c37d033fa4f670687c394d1d886 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 13:54:26 +0800 Subject: [PATCH 101/160] modify ut --- .github/actions/linux-uttest/action.yml | 8 ++++---- .github/scripts/ut_result_check.sh | 8 ++++---- .github/workflows/_linux_ut.yml | 8 ++++---- .github/workflows/_windows_ut.yml | 4 ++-- .github/workflows/nightly_ondemand.yml | 6 +++--- .github/workflows/pull.yml | 4 ++-- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 8693ab96a0..43dcc5a3fa 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -65,9 +65,9 @@ runs: 
2>$GITHUB_WORKSPACE/ut_log/ut_op/ut_op_with_only_test_error.log | \ tee $GITHUB_WORKSPACE/ut_log/ut_op/ut_op_with_only_test.log cp ut_op_with_only.xml $GITHUB_WORKSPACE/ut_log - - name: torch_xpu + - name: ut_torch shell: bash -xe {0} - if: ${{ inputs.test_type == 'torch_xpu' }} + if: ${{ inputs.test_type == 'ut_torch' }} run: | export PYTORCH_TEST_WITH_SLOW=1 export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" @@ -76,8 +76,8 @@ runs: for test in $(ls test/inductor | grep test); do test_cmd="${test_cmd} inductor/$test"; done for test in $(ls test/xpu | grep test); do test_cmd="${test_cmd} xpu/$test"; done if [ -f "test/test_xpu.py" ]; then test_cmd="${test_cmd} test_xpu.py"; fi - eval $test_cmd 2>$GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test.log + eval $test_cmd 2>$GITHUB_WORKSPACE/ut_log/ut_torch/torch_xpu_test_error.log | \ + tee $GITHUB_WORKSPACE/ut_log/ut_torch/torch_xpu_test.log - name: ut_profiling shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_profiling' }} diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index bd7ccd490a..7e370f813d 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -1,5 +1,5 @@ #!/bin/bash -ut_suite="${1:-ut_regression}" # ut_regression / ut_extended / ut_op / torch_xpu +ut_suite="${1:-ut_regression}" # ut_regression / ut_extended / ut_op / ut_torch # usage # compare_and_filter_logs [output.log] @@ -104,13 +104,13 @@ if [[ "${ut_suite}" == 'ut_op' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'torch_xpu' ]]; then +if [[ "${ut_suite}" == 'ut_torch' ]]; then echo "Pytorch XPU binary UT checking" cd ../../pytorch || exit for xpu_case in build/bin/*{xpu,sycl}*; do if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then case_name=$(basename "$xpu_case") - cd ../ut_log/torch_xpu || exit + cd ../ut_log/ut_torch || exit grep -E "FAILED|have failures" binary_ut_"${ut_suite}"_"${case_name}"_test.log | awk '{print $2}' > ./binary_ut_"${ut_suite}"_"${case_name}"_failed.log wc -l < "./binary_ut_${ut_suite}_${case_name}_failed.log" | tee -a ./binary_ut_"${ut_suite}"_failed_summary.log grep -E "PASSED|Pass" binary_ut_"${ut_suite}"_"${case_name}"_test.log | awk '{print $2}' > ./binary_ut_"${ut_suite}"_"${case_name}"_passed.log @@ -121,7 +121,7 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" - cd ../ut_log/torch_xpu || exit + cd ../ut_log/ut_torch || exit cat "./binary_ut_${ut_suite}_${case_name}_failed.log" num_failed_binary_ut=$(awk '{sum += $1};END {print sum}' binary_ut_"${ut_suite}"_failed_summary.log) num_passed_binary_ut=$(awk '{sum += $1};END {print sum}' binary_ut_"${ut_suite}"_passed_summary.log) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index f1940bd088..0427b0a47b 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -30,7 +30,7 @@ on: ut: required: true type: string - description: UT scope. `ut_regression,xpu_dev1,ut_transformers,ut_extended,ut_op,torch_xpu` Delimiter is comma + description: UT scope. 
`ut_regression,ut_transformers,ut_extended,ut_op,ut_torch,xpu_dev1` Delimiter is comma disabled_tests: type: string default: '' @@ -63,7 +63,7 @@ jobs: normal: needs: runner runs-on: ${{ needs.runner.outputs.runner_id }} - if: ${{ contains(inputs.ut, 'p') && !contains(inputs.disabled_tests, 'disable_ut') }} + if: ${{ contains(inputs.ut, 'ut_') && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 300 container: image: mengfeili/intel-pvc-driver:1146-1136 @@ -79,7 +79,7 @@ jobs: strategy: fail-fast: false matrix: - test: [ut_regression, ut_transformers, ut_extended, ut_op, torch_xpu, ut_profiling] + test: [ut_regression, ut_transformers, ut_extended, ut_op, ut_torch, ut_profiling] steps: - name: Cleanup workspace if: ${{ contains(inputs.ut, matrix.test) }} @@ -222,7 +222,7 @@ jobs: strategy: fail-fast: false matrix: - test: [ut_regression, ut_transformers, ut_extended, ut_op, torch_xpu, ut_profiling, xpu_dev1, xpu_distributed] + test: [ut_regression, ut_transformers, ut_extended, ut_op, ut_torch, ut_profiling, xpu_dev1, xpu_distributed] env: GH_TOKEN: ${{ github.token }} UT_SKIP_ISSUE: 1624 diff --git a/.github/workflows/_windows_ut.yml b/.github/workflows/_windows_ut.yml index f30e469749..3c211ccfc2 100644 --- a/.github/workflows/_windows_ut.yml +++ b/.github/workflows/_windows_ut.yml @@ -17,7 +17,7 @@ on: required: true type: string default: '' - description: UT scope. `ut_regression,xpu_dev1,ut_extended,ut_op,torch_xpu` Delimiter is comma + description: UT scope. `ut_regression,xpu_dev1,ut_extended,ut_op,ut_torch` Delimiter is comma python: required: false type: string @@ -169,7 +169,7 @@ jobs: python run_test_with_skip_mtl.py - name: Run Test XPU UT - if: contains(inputs.ut, 'torch_xpu') || github.event_name == 'schedule' + if: contains(inputs.ut, 'ut_torch') || github.event_name == 'schedule' shell: cmd run: | call "C:\ProgramData\miniforge3\Scripts\activate.bat" diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index f0a452ae33..43f74faa50 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -29,7 +29,7 @@ on: ut: type: string default: '' - description: UT scope. `ut_regression,ut_transformers,ut_extended,ut_op,ut_profiling,xpu_dev1,xpu_distributed,microbench,windows`. Delimiter is comma + description: UT scope. `ut_regression,ut_transformers,ut_extended,ut_op,ut_profiling,ut_torch,xpu_dev1,xpu_distributed,microbench,windows`. 
Delimiter is comma suite: type: string default: '' @@ -119,7 +119,7 @@ jobs: python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} Linux-Nightly-Ondemand-UT-Tests: - if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'p') }} + if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'ut_') || contains(inputs.ut, 'xpu_') }} name: linux-ut needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_ut.yml @@ -168,7 +168,7 @@ jobs: name: windows uses: ./.github/workflows/_windows_ut.yml with: - ut: ${{ github.event_name == 'schedule' && 'ut_extended,torch_xpu' || inputs.ut }} + ut: ${{ github.event_name == 'schedule' && 'ut_extended,ut_torch' || inputs.ut }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} src_changed: false has_label: true diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 0a7def8e9e..f009d10cb9 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -108,7 +108,7 @@ jobs: runner: linux.idc.xpu test_type: build-cicd pytorch: main - ut: ut_regression,xpu_dev1,ut_transformers,ut_extended,ut_op,xpu_distributed + ut: ut_regression,ut_transformers,ut_extended,ut_op,xpu_dev1,xpu_distributed disabled_tests: ${{ needs.conditions-filter.outputs.disabled_tests }} linux-e2e: @@ -127,7 +127,7 @@ jobs: needs: [conditions-filter] uses: ./.github/workflows/_windows_ut.yml with: - ut: ut_extended,torch_xpu + ut: ut_extended,ut_torch runner: Windows_CI src_changed: ${{ needs.conditions-filter.outputs.src_changed }} has_label: ${{ needs.conditions-filter.outputs.has_label }} From 98964410b82a1f432ad4d20f4e83f01c62816130 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 14:30:11 +0800 Subject: [PATCH 102/160] fix pip warnings --- .github/workflows/_linux_build.yml | 8 +++++--- .github/workflows/_linux_e2e.yml | 5 +++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index afbae93cd8..e9115348bb 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -71,6 +71,8 @@ jobs: PATH: /tmp/xpu-venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin GH_TOKEN: ${{ github.token }} AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + PIP_CACHE_DIR: /tmp/xpu-tool/.pipcache + PIP_ROOT_USER_ACTION: ignore timeout-minutes: 300 steps: - name: Install gh-cli @@ -120,10 +122,10 @@ jobs: source /opt/rh/gcc-toolset-11/enable # oneAPI DLE if [ "${{ inputs.oneapi }}" != "installed" ];then - rm -rf ${HOME}/intel ${HOME}/.intel + rm -rf ${HOME}/intel ${HOME}/.intel /opt/intel wget -q -O oneapi.sh "${{ inputs.oneapi }}" - bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi - export XPU_ONEAPI_PATH="${HOME}/intel/oneapi" + bash oneapi.sh -a -s --eula accept --action install --install-dir /opt/intel/oneapi + export XPU_ONEAPI_PATH="/opt/intel/oneapi" fi source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 984dec240b..6d202ade4b 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -240,9 +240,11 @@ jobs: container: image: ubuntu:latest env: - AGENT_TOOLSDIRECTORY: /tmp/xpu-tool GH_TOKEN: ${{ github.token }} REFERENCE_ISSUE_ID: 1645 + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + PIP_CACHE_DIR: /tmp/xpu-tool/.pipcache + PIP_ROOT_USER_ACTION: 
ignore steps: - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 @@ -281,7 +283,6 @@ jobs: - name: Get summary if: ${{ ! cancelled() }} run: | - export HOME=/tmp/ pip install pandas requests if [ "${{ inputs.suite }}" != 'pt2e' ];then bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} From 50467eea4e1a1dd0993df29e67d89fa2e77af5a9 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 14:33:20 +0800 Subject: [PATCH 103/160] modify ut logs path --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 0427b0a47b..039f5547f3 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -115,7 +115,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} - path: ${{ github.workspace }}/ut_log + path: ${{ github.workspace }}/ut_log/${{ matrix.test }} - name: Upload XPU UT Failure list if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/upload-artifact@v4 From 5c62bc925fb2e937631095e3db71b3cd40de3ad5 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 14:34:39 +0800 Subject: [PATCH 104/160] modify ut logs path --- .github/workflows/_linux_ut.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 039f5547f3..bf5f1e877e 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -245,6 +245,7 @@ jobs: shell: bash run: | repo="${{ github.repository }}" + ls -al ${{ github.workspace }}/ut_log cd ${{ github.workspace }}/ut_log/${{ matrix.test }} gh --repo $repo issue view $UT_SKIP_ISSUE --json body -q .body | sed '/^$/d' > Known_issue.log gh api "repos/${{ github.repository }}/issues?labels=skipped" \ From 8b33c216239ba1d7b5edec1eaccbf3f91812b07d Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 15:20:42 +0800 Subject: [PATCH 105/160] set run name for nightly and on-demand tests --- .github/workflows/nightly_ondemand.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 43f74faa50..e68bade8b7 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -53,6 +53,8 @@ on: permissions: read-all +run-name: ${{ (contains(github.event.schedule, '13') && 'Nightly') || (contains(github.event.schedule, '16') && 'Weekly') || 'On-demand' }} / ${{ (contains(github.event.schedule, '10') && 'Source Code') || (contains(github.event.schedule, '30') && 'CD Wheel') || inputs.pytorch }} + jobs: Conditions-Filter: name: conditions-filter From f08c528527be4855bcb29745381759a9f6a6fa6c Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 15:25:21 +0800 Subject: [PATCH 106/160] modify ut logs path --- .github/workflows/_linux_ut.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index bf5f1e877e..a86c514c88 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -107,8 +107,8 @@ jobs: run: | pip install junitparser python ./.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true - if [ -e "ut_failure_list.csv" ];then - cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv + if [ -e 
ut_failure_list.csv ];then + cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv || true fi - name: Upload Inductor XPU UT Log if: ${{ contains(inputs.ut, matrix.test) }} From 55bd5dc4e3885ceb5aea8a89c27de120e1396095 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 15:36:36 +0800 Subject: [PATCH 107/160] ut summray always --- .github/workflows/_linux_build.yml | 7 +++---- .github/workflows/_linux_e2e.yml | 6 +++--- .github/workflows/_linux_ut.yml | 1 + 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index e9115348bb..ebb6b6fb46 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -68,11 +68,10 @@ jobs: volumes: - ${{ github.workspace }}:${{ github.workspace }} env: - PATH: /tmp/xpu-venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + PATH: /tmp/xpu-tool/myvenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin GH_TOKEN: ${{ github.token }} AGENT_TOOLSDIRECTORY: /tmp/xpu-tool PIP_CACHE_DIR: /tmp/xpu-tool/.pipcache - PIP_ROOT_USER_ACTION: ignore timeout-minutes: 300 steps: - name: Install gh-cli @@ -88,9 +87,9 @@ jobs: gh --version - name: Setup python-${{ inputs.python }} run: | - rm -rf /tmp/xpu-venv + rm -rf /tmp/xpu-tool/myvenv local_python=$(echo ${{ inputs.python }} |awk -F. '{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') - /opt/python/${local_python}/bin/python -m venv /tmp/xpu-venv + /opt/python/${local_python}/bin/python -m venv /tmp/xpu-tool/myvenv which python && python -V which pip && pip list pip install -U pip wheel setuptools diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 6d202ade4b..3f932b60e4 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -244,7 +244,7 @@ jobs: REFERENCE_ISSUE_ID: 1645 AGENT_TOOLSDIRECTORY: /tmp/xpu-tool PIP_CACHE_DIR: /tmp/xpu-tool/.pipcache - PIP_ROOT_USER_ACTION: ignore + PATH: /tmp/xpu-tool/myvenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin steps: - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 @@ -255,8 +255,8 @@ jobs: apt-get update apt-get install gh rsync ca-certificates -y find ./ |grep -v "^\./$" |xargs rm -rf - python -m venv /tmp/myvenv - echo "PATH=/tmp/myvenv/bin:$PATH" >> ${GITHUB_ENV} + rm -rf /tmp/xpu-tool/myvenv + python -m venv /tmp/xpu-tool/myvenv - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Download Target Artifact diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index a86c514c88..6d6527ea17 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -217,6 +217,7 @@ jobs: summary: needs: [normal, devices, distributed] + if: ${{ ! 
cancelled() }} runs-on: ubuntu-latest timeout-minutes: 30 strategy: From dbd3a27c52989b530470c811ad177beca1178da8 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 16:11:18 +0800 Subject: [PATCH 108/160] fix ut logs path --- .github/actions/linux-uttest/action.yml | 63 ++++++++++++++----------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 43dcc5a3fa..48afe04125 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -15,24 +15,33 @@ runs: shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_regression' }} run: | + mkdir -p ut_log/ut_regression cd pytorch/third_party/torch-xpu-ops/test/regressions - pytest --timeout 600 -v --junit-xml=../../ut_log/ut_regression.xml + pytest --timeout 600 -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \ + 2> ${{ github.workspace }}/ut_log/ut_regression/ut_regression_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_regression/ut_regression_test.log - name: ut_transformers shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_transformers' }} run: | export PYTORCH_TEST_WITH_SLOW=1 + mkdir -p ut_log/ut_transformers cd pytorch pytest --timeout 600 -v test/test_transformers.py -k xpu \ - --junit-xml=$GITHUB_WORKSPACE/ut_log/ut_transformers.xml + --junit-xml=${{ github.workspace }}/ut_log/ut_transformers.xml \ + 2> ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test.log - name: ut_extended shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_extended' }} run: | export PYTORCH_TEST_WITH_SLOW=1 + mkdir -p ut_log/ut_extended cd pytorch/third_party/torch-xpu-ops/test/xpu/extended - timeout 3600 python run_test_with_skip.py - cp ut_extended.xml $GITHUB_WORKSPACE/ut_log + timeout 3600 python run_test_with_skip.py \ + 2> ${{ github.workspace }}/ut_log/ut_extended/ut_extended_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_extended/ut_extended_test.log + cp ut_extended.xml ${{ github.workspace }}/ut_log - name: ut_op shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_op' }} @@ -41,9 +50,9 @@ runs: export PYTORCH_ENABLE_XPU_FALLBACK=1 cd pytorch/third_party/torch-xpu-ops/test/xpu timeout 10000 python run_test_with_skip.py \ - 2>$GITHUB_WORKSPACE/ut_log/ut_op/ut_op_with_skip_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/ut_op/ut_op_with_skip_test.log - cp *.xml $GITHUB_WORKSPACE/ut_log + 2> ${{ github.workspace }}/ut_log/ut_op/ut_op_with_skip_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_op/ut_op_with_skip_test.log + cp *.xml ${{ github.workspace }}/ut_log find ut_op_with_skip_nn ut_op_with_skip_quantization/core -type f -exec sh -c ' dir_path=$(dirname "$1"); case "$dir_path" in @@ -54,17 +63,17 @@ runs: esac; mv "$1" "$dir_path/${dir_name}_$(basename "$1")" ' _ {} \; - cp ut_op_with_skip_nn/*.xml $GITHUB_WORKSPACE/ut_log - cp ut_op_with_skip_quantization/core/*.xml $GITHUB_WORKSPACE/ut_log + cp ut_op_with_skip_nn/*.xml ${{ github.workspace }}/ut_log + cp ut_op_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log # Cases run with a on-demand white list, since some suites are too # slow to go through all operators on CPU. So add cases on-demand # when XPU implementatoin is done. 
# test_foreach, test_decomp # Run with only timeout 10000 python run_test_with_only.py \ - 2>$GITHUB_WORKSPACE/ut_log/ut_op/ut_op_with_only_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/ut_op/ut_op_with_only_test.log - cp ut_op_with_only.xml $GITHUB_WORKSPACE/ut_log + 2> ${{ github.workspace }}/ut_log/ut_op/ut_op_with_only_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_op/ut_op_with_only_test.log + cp ut_op_with_only.xml ${{ github.workspace }}/ut_log - name: ut_torch shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_torch' }} @@ -76,8 +85,8 @@ runs: for test in $(ls test/inductor | grep test); do test_cmd="${test_cmd} inductor/$test"; done for test in $(ls test/xpu | grep test); do test_cmd="${test_cmd} xpu/$test"; done if [ -f "test/test_xpu.py" ]; then test_cmd="${test_cmd} test_xpu.py"; fi - eval $test_cmd 2>$GITHUB_WORKSPACE/ut_log/ut_torch/torch_xpu_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/ut_torch/torch_xpu_test.log + eval $test_cmd 2> ${{ github.workspace }}/ut_log/ut_torch/torch_xpu_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_torch/torch_xpu_test.log - name: ut_profiling shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_profiling' }} @@ -86,28 +95,28 @@ runs: cd pytorch/third_party/torch-xpu-ops # RN50 Test PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0 - cp profiling.fp32.train.pt $GITHUB_WORKSPACE/ut_log/profile_test + cp profiling.fp32.train.pt ${{ github.workspace }}/ut_log/profile_test # All Issue Reproduce UT python -u test/profiling/correlation_id_mixed.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/correlation_id_mixed.log + tee ${{ github.workspace }}/ut_log/profile_test/issue_reproduce/correlation_id_mixed.log python -u test/profiling/reproducer.missing.gpu.kernel.time.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/reproducer.missing.gpu.kernel.time.log + tee ${{ github.workspace }}/ut_log/profile_test/issue_reproduce/reproducer.missing.gpu.kernel.time.log python -u test/profiling/time_precision_in_profile.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/time_precision_in_profile.log + tee ${{ github.workspace }}/ut_log/profile_test/issue_reproduce/time_precision_in_profile.log python -u test/profiling/profile_partial_runtime_ops.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/profile_partial_runtime_ops.log + tee ${{ github.workspace }}/ut_log/profile_test/issue_reproduce/profile_partial_runtime_ops.log python -u test/profiling/triton_xpu_ops_time.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/triton_xpu_ops_time.log + tee ${{ github.workspace }}/ut_log/profile_test/issue_reproduce/triton_xpu_ops_time.log # All xpu ut under test/profiler cd ../pytorch/test/profiler python -m pytest --timeout 600 -vs test_cpp_thread.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/test_cpp_thread.log + tee ${{ github.workspace }}/ut_log/profile_test/test_cpp_thread.log python -m pytest --timeout 600 -vs test_execution_trace.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/test_execution_trace.log + tee ${{ github.workspace }}/ut_log/profile_test/test_execution_trace.log python -m pytest --timeout 600 -vs test_memory_profiler.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/test_memory_profiler.log + tee ${{ github.workspace }}/ut_log/profile_test/test_memory_profiler.log python -m pytest --timeout 600 -vs test_profiler_tree.py | \ - tee $GITHUB_WORKSPACE/ut_log/profile_test/test_profiler_tree.log 
+ tee ${{ github.workspace }}/ut_log/profile_test/test_profiler_tree.log - name: xpu_dev1 shell: bash -xe {0} @@ -116,8 +125,8 @@ runs: mkdir -p ut_log/xpu_dev1 cd pytorch/third_party/torch-xpu-ops/test/regressions pytest --timeout 200 -v test_operation_on_device_1.py \ - --junit-xml=$GITHUB_WORKSPACE/ut_log/xpu_dev1.xml \ - 2>${{ github.workspace }}/ut_log/xpu_dev1/xpu_dev1_test_error.log | \ + --junit-xml=${{ github.workspace }}/ut_log/xpu_dev1.xml \ + 2> ${{ github.workspace }}/ut_log/xpu_dev1/xpu_dev1_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_dev1/xpu_dev1_test.log - name: xpu_distributed @@ -132,5 +141,5 @@ runs: exit 1 fi timeout 1800 python run_distributed.py \ - 2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ + 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log From 2e7680d6d12cb3bd9854dc82d57f6784d88f159e Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 16:35:51 +0800 Subject: [PATCH 109/160] fix e2e summary permission --- .github/workflows/_linux_e2e.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 3f932b60e4..ccc71b4bb6 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -312,3 +312,7 @@ jobs: echo "Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}" |tee -a new_body.txt fi gh --repo ${GITHUB_REPOSITORY} issue edit ${REFERENCE_ISSUE_ID} --body-file new_body.txt + - name: Set permissions + if: ${{ always() }} + run: | + chmod 777 /__w -R From 0a78df1851ef5fce946cb448fe6eeafe68161e49 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 17:00:13 +0800 Subject: [PATCH 110/160] fix ut log path --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 6d6527ea17..359ea9199f 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -115,7 +115,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} - path: ${{ github.workspace }}/ut_log/${{ matrix.test }} + path: ${{ github.workspace }}/ut_log - name: Upload XPU UT Failure list if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/upload-artifact@v4 From 074992f241e031942b11ace8640011c97cf649c8 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 17:03:46 +0800 Subject: [PATCH 111/160] update --- .github/workflows/_linux_e2e.yml | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index ccc71b4bb6..ab9a6d2b69 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -232,19 +232,16 @@ jobs: path: ${{ github.workspace }}/upload_files summary: - runs-on: [self-hosted, Linux] + runs-on: ubuntu-latest if: ${{ ! 
cancelled() }} needs: test permissions: issues: write - container: - image: ubuntu:latest - env: - GH_TOKEN: ${{ github.token }} - REFERENCE_ISSUE_ID: 1645 - AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - PIP_CACHE_DIR: /tmp/xpu-tool/.pipcache - PATH: /tmp/xpu-tool/myvenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + env: + GH_TOKEN: ${{ github.token }} + REFERENCE_ISSUE_ID: 1645 + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + PATH: /tmp/xpu-tool/myvenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin steps: - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 From b11510f06bdcef67506610a59225436e93065b76 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 17:04:40 +0800 Subject: [PATCH 112/160] update --- .github/workflows/_linux_e2e.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index ab9a6d2b69..6f91780e3d 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -312,4 +312,4 @@ jobs: - name: Set permissions if: ${{ always() }} run: | - chmod 777 /__w -R + find ./ |grep -v "^\./$" |xargs rm -rf From a18995b6618d868056c551ed4f71b0dd594ce4a9 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 18:28:43 +0800 Subject: [PATCH 113/160] modify e2e summary --- .github/workflows/_linux_e2e.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 6f91780e3d..b9c813c067 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -241,19 +241,16 @@ jobs: GH_TOKEN: ${{ github.token }} REFERENCE_ISSUE_ID: 1645 AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - PATH: /tmp/xpu-tool/myvenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin steps: - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: python-version: ${{ inputs.python }} - - name: Install gh + - name: Install gh-cli run: | - apt-get update - apt-get install gh rsync ca-certificates -y + sudo apt-get update + sudo apt-get install gh rsync ca-certificates -y find ./ |grep -v "^\./$" |xargs rm -rf - rm -rf /tmp/xpu-tool/myvenv - python -m venv /tmp/xpu-tool/myvenv - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Download Target Artifact From 754202d4838ec597d8ebd7b4659e3de06f453ccf Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 18:32:18 +0800 Subject: [PATCH 114/160] modify e2e summary --- .github/actions/linux-uttest/action.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 48afe04125..8a741bf1d1 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -48,6 +48,7 @@ runs: run: | export PYTORCH_TEST_WITH_SLOW=1 export PYTORCH_ENABLE_XPU_FALLBACK=1 + mkdir -p ut_log/ut_op cd pytorch/third_party/torch-xpu-ops/test/xpu timeout 10000 python run_test_with_skip.py \ 2> ${{ github.workspace }}/ut_log/ut_op/ut_op_with_skip_test_error.log | \ @@ -80,6 +81,7 @@ runs: run: | export PYTORCH_TEST_WITH_SLOW=1 export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" + mkdir -p ut_log/ut_torch cd pytorch test_cmd="python test/run_test.py --include " for test in $(ls test/inductor | grep test); do test_cmd="${test_cmd} inductor/$test"; done From 27c5cff7996a5cccccf09e3c841786800d998560 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 25 Jul 2025 20:52:16 +0800 Subject: 
[PATCH 115/160] modify e2e summary

---
 .github/actions/linux-e2etest/action.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/actions/linux-e2etest/action.yml b/.github/actions/linux-e2etest/action.yml
index 559b3b307b..1404414b3e 100644
--- a/.github/actions/linux-e2etest/action.yml
+++ b/.github/actions/linux-e2etest/action.yml
@@ -61,6 +61,7 @@ runs:
         # install timm dependencies without torch and torchvision
         pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch)
       fi
+      pip install -U numpy==1.26.4
       pip list |grep -E 'intel|torch'
   - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
     shell: bash -xe {0}

From 92d7ff1237486d0410762a44c10ac8149627a89a Mon Sep 17 00:00:00 2001
From: mengfei25
Date: Fri, 25 Jul 2025 20:55:49 +0800
Subject: [PATCH 116/160] update

---
 .github/workflows/_linux_e2e.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml
index b9c813c067..bb256881d3 100644
--- a/.github/workflows/_linux_e2e.yml
+++ b/.github/workflows/_linux_e2e.yml
@@ -296,7 +296,7 @@ jobs:
           fi
         fi
     - name: Upload Reference Run ID
-      if: ${{ ! (contains(inputs.test_type, 'ondemand') && contains(inputs.test_type, 'cicd')) && github.repository_owner == 'intel' }}
+      if: ${{ ! (contains(inputs.test_type, 'ondemand') || contains(inputs.test_type, 'cicd')) && github.repository_owner == 'intel' }}
      run: |
        gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee new_body.txt 2>&1
        has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt)"

From 56520ca1ae843fe819b912394118ebe1206c81d9 Mon Sep 17 00:00:00 2001
From: mengfei25
Date: Mon, 4 Aug 2025 09:40:38 +0800
Subject: [PATCH 117/160] update

---
 .github/actions/linux-uttest/action.yml | 10 +++++-----
 .github/workflows/_linux_op_benchmark.yml | 4 ++--
 test/xpu/extended/run_test_with_skip.py | 2 +-
 test/xpu/run_test_with_only.py | 6 +++---
 test/xpu/xpu_test_utils.py | 6 +++---
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml
index 8a741bf1d1..7b6ad49cfe 100644
--- a/.github/actions/linux-uttest/action.yml
+++ b/.github/actions/linux-uttest/action.yml
@@ -17,7 +17,7 @@ runs:
       run: |
        mkdir -p ut_log/ut_regression
        cd pytorch/third_party/torch-xpu-ops/test/regressions
-        pytest --timeout 600 -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \
+        pytest --timeout 600 --timeout_method=thread -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \
          2> ${{ github.workspace }}/ut_log/ut_regression/ut_regression_test_error.log | \
          tee ${{ github.workspace }}/ut_log/ut_regression/ut_regression_test.log
     - name: ut_transformers
@@ -27,7 +27,7 @@ runs:
        export PYTORCH_TEST_WITH_SLOW=1
        mkdir -p ut_log/ut_transformers
        cd pytorch
-        pytest --timeout 600 -v test/test_transformers.py -k xpu \
+        pytest --timeout 600 --timeout_method=thread -v test/test_transformers.py -k xpu \
          --junit-xml=${{ github.workspace }}/ut_log/ut_transformers.xml \
          2> ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test_error.log | \
          tee ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test.log
@@ -38,7 +38,7 @@ runs:
        export PYTORCH_TEST_WITH_SLOW=1
        mkdir -p ut_log/ut_extended
        cd pytorch/third_party/torch-xpu-ops/test/xpu/extended
-        timeout 3600 python run_test_with_skip.py \
+        python run_test_with_skip.py \
          2> ${{ github.workspace }}/ut_log/ut_extended/ut_extended_test_error.log | \
          tee ${{ github.workspace }}/ut_log/ut_extended/ut_extended_test.log
        cp ut_extended.xml ${{ github.workspace }}/ut_log
@@ -50,7 +50,7 @@ runs:
        export PYTORCH_ENABLE_XPU_FALLBACK=1
        mkdir -p ut_log/ut_op
        cd pytorch/third_party/torch-xpu-ops/test/xpu
-        timeout 10000 python run_test_with_skip.py \
+        python run_test_with_skip.py \
          2> ${{ github.workspace }}/ut_log/ut_op/ut_op_with_skip_test_error.log | \
          tee ${{ github.workspace }}/ut_log/ut_op/ut_op_with_skip_test.log
        cp *.xml ${{ github.workspace }}/ut_log
@@ -71,7 +71,7 @@ runs:
        # when XPU implementatoin is done.
        # test_foreach, test_decomp
        # Run with only
-        timeout 10000 python run_test_with_only.py \
+        python run_test_with_only.py \
          2> ${{ github.workspace }}/ut_log/ut_op/ut_op_with_only_test_error.log | \
          tee ${{ github.workspace }}/ut_log/ut_op/ut_op_with_only_test.log
        cp ut_op_with_only.xml ${{ github.workspace }}/ut_log
diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml
index f0166cb6f3..c23d0e0278 100644
--- a/.github/workflows/_linux_op_benchmark.yml
+++ b/.github/workflows/_linux_op_benchmark.yml
@@ -103,8 +103,8 @@ jobs:
           path: ${{ github.workspace }}/op_benchmark
   op_benchmark_test_results_check:
-    needs: op_benchmark_test
-    runs-on: ubuntu-22.04
+    needs: op_benchmark
+    runs-on: ubuntu-latest
     env:
       GH_TOKEN: ${{ github.token }}
       reference_issue: 1689
diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py
index 01a608ae6d..01fc294823 100644
--- a/test/xpu/extended/run_test_with_skip.py
+++ b/test/xpu/extended/run_test_with_skip.py
@@ -17,7 +17,7 @@
 skip_options += '"'
 os.environ["PYTORCH_TEST_WITH_SLOW"] = "1"
-test_command = "pytest --timeout 600 -v --timeout_method=thread --junit-xml=./op_extended.xml test_ops_xpu.py"
+test_command = "pytest --timeout 600 -v --timeout_method=thread --junit-xml=./ut_extended.xml test_ops_xpu.py"
 test_command += skip_options
 res = os.system(test_command)
 sys.exit(res)
diff --git a/test/xpu/run_test_with_only.py b/test/xpu/run_test_with_only.py
index 9d70896b11..642cb699eb 100644
--- a/test/xpu/run_test_with_only.py
+++ b/test/xpu/run_test_with_only.py
@@ -15,7 +15,7 @@ def launch_test(test_case, skip_list=None, exe_list=None):
         skip_options += '"'
         test_command = (
             "pytest --timeout 600 -v "
-            + "--junit-xml=./op_ut_with_only.xml "
+            + "--junit-xml=./ut_op_with_only.xml "
             + test_case
             + skip_options
         )
@@ -28,14 +28,14 @@ def launch_test(test_case, skip_list=None, exe_list=None):
         exe_options += '"'
         test_command = (
             "pytest --timeout 600 -v "
-            + "--junit-xml=./op_ut_with_only.xml "
+            + "--junit-xml=./ut_op_with_only.xml "
             + test_case
             + exe_options
         )
         return os.system(test_command)
     else:
         test_command = (
-            "pytest --timeout 600 -v --junit-xml=./op_ut_with_only.xml " + test_case
+            "pytest --timeout 600 -v --junit-xml=./ut_op_with_only.xml " + test_case
         )
         return os.system(test_command)
diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py
index df524100b3..25e239f750 100644
--- a/test/xpu/xpu_test_utils.py
+++ b/test/xpu/xpu_test_utils.py
@@ -1170,7 +1170,7 @@ def launch_test(test_case, skip_list=None, exe_list=None):
             skip_options += skip_option
         skip_options += '"'
         test_command = (
-            f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml "
+            f"pytest --timeout 600 -v --junit-xml=./ut_op_with_skip_{test_case}.xml "
             + test_case
         )
         test_command += skip_options
@@ -1181,13 +1181,13 @@ def launch_test(test_case, skip_list=None, exe_list=None):
             exe_options += exe_option
         exe_options += '"'
         test_command = (
-            f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml "
+            f"pytest --timeout 600 -v --junit-xml=./ut_op_with_skip_{test_case}.xml "
             + test_case
         )
         test_command += exe_options
     else:
         test_command = (
-            f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml "
+            f"pytest --timeout 600 -v --junit-xml=./ut_op_with_skip_{test_case}.xml "
             + test_case
         )
         return os.system(test_command)

From 9117a0c7be1ee40a5ce1bb4ce764ec629d0b2480 Mon Sep 17 00:00:00 2001
From: mengfei25
Date: Mon, 4 Aug 2025 10:22:47 +0800
Subject: [PATCH 118/160] update

---
 .github/workflows/_linux_e2e.yml | 8 ++++----
 .github/workflows/_linux_op_benchmark.yml | 5 +++++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml
index bb256881d3..f089020f23 100644
--- a/.github/workflows/_linux_e2e.yml
+++ b/.github/workflows/_linux_e2e.yml
@@ -242,15 +242,15 @@ jobs:
       REFERENCE_ISSUE_ID: 1645
       AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
     steps:
-      - name: Setup python-${{ inputs.python }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ inputs.python }}
      - name: Install gh-cli
        run: |
          sudo apt-get update
          sudo apt-get install gh rsync ca-certificates -y
          find ./ |grep -v "^\./$" |xargs rm -rf
+      - name: Setup python-${{ inputs.python }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ inputs.python }}
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
      - name: Download Target Artifact
diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml
index c23d0e0278..4034c5f385 100644
--- a/.github/workflows/_linux_op_benchmark.yml
+++ b/.github/workflows/_linux_op_benchmark.yml
@@ -109,6 +109,11 @@ jobs:
      GH_TOKEN: ${{ github.token }}
      reference_issue: 1689
    steps:
+      - name: Install gh-cli
+        run: |
+          sudo apt-get update
+          sudo apt-get install gh rsync ca-certificates -y
+          find ./ |grep -v "^\./$" |xargs rm -rf
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
      - name: Setup python-${{ inputs.python }}

From 587aa953614f20186fd71004887e4152920ae919 Mon Sep 17 00:00:00 2001
From: mengfei25
Date: Mon, 4 Aug 2025 15:50:10 +0800
Subject: [PATCH 119/160] update

---
 .github/actions/linux-uttest/action.yml | 8 ++++++--
 .github/workflows/_linux_ut.yml | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml
index 7b6ad49cfe..4d4dfd61c2 100644
--- a/.github/actions/linux-uttest/action.yml
+++ b/.github/actions/linux-uttest/action.yml
@@ -41,7 +41,8 @@ runs:
        python run_test_with_skip.py \
          2> ${{ github.workspace }}/ut_log/ut_extended/ut_extended_test_error.log | \
          tee ${{ github.workspace }}/ut_log/ut_extended/ut_extended_test.log
-        cp ut_extended.xml ${{ github.workspace }}/ut_log
+        ls -al
+        cp *.xml ${{ github.workspace }}/ut_log
    - name: ut_op
      shell: bash -xe {0}
      if: ${{ inputs.test_type == 'ut_op' }}
@@ -53,6 +54,7 @@ runs:
        python run_test_with_skip.py \
          2> ${{ github.workspace }}/ut_log/ut_op/ut_op_with_skip_test_error.log | \
          tee ${{ github.workspace }}/ut_log/ut_op/ut_op_with_skip_test.log
+        ls -al
        cp *.xml ${{ github.workspace }}/ut_log
        find ut_op_with_skip_nn ut_op_with_skip_quantization/core -type f -exec sh -c '
          dir_path=$(dirname "$1");
@@ -64,6 +66,7 @@ runs:
          esac;
          mv "$1" "$dir_path/${dir_name}_$(basename "$1")"
        ' _ {} \;
+        ls -al ut_op_with_skip_nn ut_op_with_skip_quantization/core
        cp ut_op_with_skip_nn/*.xml ${{ github.workspace }}/ut_log
        cp ut_op_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log
        # Cases run with a on-demand white list, since some suites are too
@@ -74,7 +77,8 @@ runs:
        python run_test_with_only.py \
          2> ${{ github.workspace }}/ut_log/ut_op/ut_op_with_only_test_error.log | \
          tee ${{ github.workspace }}/ut_log/ut_op/ut_op_with_only_test.log
-        cp ut_op_with_only.xml ${{ github.workspace }}/ut_log
+        ls -al
+        cp *.xml ${{ github.workspace }}/ut_log
    - name: ut_torch
      shell: bash -xe {0}
      if: ${{ inputs.test_type == 'ut_torch' }}
diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml
index af9126825b..03d690de11 100644
--- a/.github/workflows/_linux_ut.yml
+++ b/.github/workflows/_linux_ut.yml
@@ -127,7 +127,7 @@ jobs:
  devices:
    runs-on: pvc_rolling
    if: ${{ contains(inputs.ut, 'xpu_dev1') && !contains(inputs.disabled_tests, 'disable_ut') }}
-    timeout-minutes: 5
+    timeout-minutes: 30
    env:
      GH_TOKEN: ${{ github.token }}
      AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/xpu-tool

From 3b0b94d13fd1fc96983df33231c7bc7a84d3ece7 Mon Sep 17 00:00:00 2001
From: mengfei25
Date: Mon, 4 Aug 2025 15:52:24 +0800
Subject: [PATCH 120/160] update

---
 .github/actions/linux-uttest/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml
index 4d4dfd61c2..17512a722a 100644
--- a/.github/actions/linux-uttest/action.yml
+++ b/.github/actions/linux-uttest/action.yml
@@ -114,7 +114,7 @@ runs:
        python -u test/profiling/triton_xpu_ops_time.py | \
          tee ${{ github.workspace }}/ut_log/profile_test/issue_reproduce/triton_xpu_ops_time.log
        # All xpu ut under test/profiler
-        cd ../pytorch/test/profiler
+        cd ../../test/profiler
        python -m pytest --timeout 600 -vs test_cpp_thread.py | \
          tee ${{ github.workspace }}/ut_log/profile_test/test_cpp_thread.log
        python -m pytest --timeout 600 -vs test_execution_trace.py | \

From e47b3e45b2927fbf48299a223c9766fbf0cd8aaa Mon Sep 17 00:00:00 2001
From: mengfei25
Date: Tue, 5 Aug 2025 13:34:41 +0800
Subject: [PATCH 121/160] update

---
 .github/workflows/_linux_ut.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml
index 03d690de11..29cd114ef6 100644
--- a/.github/workflows/_linux_ut.yml
+++ b/.github/workflows/_linux_ut.yml
@@ -104,7 +104,6 @@ jobs:
          test_type: ${{ matrix.test }}
      - name: UT Test Results Summary
        if: ${{ contains(inputs.ut, matrix.test) }}
-        run: |
          pip install junitparser
          python ./.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true
          if [ -e 'ut_failure_list.csv' ];then
            cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv || true
          fi
      - name: Upload Inductor XPU UT Log
-        if: ${{ contains(inputs.ut, matrix.test) }}
+        if: ${{ ! cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }}
          path: ${{ github.workspace }}/ut_log
+          if-no-files-found: ignore
      - name: Upload XPU UT Failure list
-        if: ${{ !
cancelled() }} uses: actions/upload-artifact@v4 with: name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} path: ${{ github.workspace }}/ut_log/ut_failure_list.csv + if-no-files-found: ignore devices: runs-on: pvc_rolling From 51578bd58ff9c03285604dbb7d76d9960ebad7a0 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 6 Aug 2025 10:04:33 +0800 Subject: [PATCH 122/160] enable pytest to survive crashing tests and potentially complete the remaining tests --- .github/actions/linux-testenv/action.yml | 2 +- .github/actions/linux-uttest/action.yml | 12 ++++++------ test/xpu/extended/run_test_with_skip.py | 2 +- test/xpu/run_test_with_only.py | 6 +++--- test/xpu/xpu_test_utils.py | 6 +++--- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 20e2bf15bc..d9907ed651 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -45,7 +45,7 @@ runs: clinfo --list cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq -c rm -rf ~/.triton /tmp/*inductor* - pip install pandas psutil scipy requests pytest-timeout + pip install pandas psutil scipy requests pytest-timeout pytest-xdist - name: Checkout torch-xpu-ops uses: actions/checkout@v4 with: diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 17512a722a..f903c29fa7 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -17,7 +17,7 @@ runs: run: | mkdir -p ut_log/ut_regression cd pytorch/third_party/torch-xpu-ops/test/regressions - pytest --timeout 600 --timeout_method=thread -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \ + pytest --timeout 600 -n 4 --timeout_method=thread -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \ 2> ${{ github.workspace }}/ut_log/ut_regression/ut_regression_test_error.log | \ tee ${{ github.workspace }}/ut_log/ut_regression/ut_regression_test.log - name: ut_transformers @@ -27,7 +27,7 @@ runs: export PYTORCH_TEST_WITH_SLOW=1 mkdir -p ut_log/ut_transformers cd pytorch - pytest --timeout 600 --timeout_method=thread -v test/test_transformers.py -k xpu \ + pytest --timeout 600 -n 4 --timeout_method=thread -v test/test_transformers.py -k xpu \ --junit-xml=${{ github.workspace }}/ut_log/ut_transformers.xml \ 2> ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test_error.log | \ tee ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test.log @@ -115,13 +115,13 @@ runs: tee ${{ github.workspace }}/ut_log/profile_test/issue_reproduce/triton_xpu_ops_time.log # All xpu ut under test/profiler cd ../../test/profiler - python -m pytest --timeout 600 -vs test_cpp_thread.py | \ + python -m pytest --timeout 600 -n 4 -vs test_cpp_thread.py | \ tee ${{ github.workspace }}/ut_log/profile_test/test_cpp_thread.log - python -m pytest --timeout 600 -vs test_execution_trace.py | \ + python -m pytest --timeout 600 -n 4 -vs test_execution_trace.py | \ tee ${{ github.workspace }}/ut_log/profile_test/test_execution_trace.log - python -m pytest --timeout 600 -vs test_memory_profiler.py | \ + python -m pytest --timeout 600 -n 4 -vs test_memory_profiler.py | \ tee ${{ github.workspace }}/ut_log/profile_test/test_memory_profiler.log - python -m pytest --timeout 600 -vs test_profiler_tree.py | \ + python -m pytest --timeout 600 -n 4 -vs test_profiler_tree.py | \ tee ${{ github.workspace 
}}/ut_log/profile_test/test_profiler_tree.log - name: xpu_dev1 diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 01fc294823..17a8bbeb7a 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -17,7 +17,7 @@ skip_options += '"' os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" -test_command = "pytest --timeout 600 -v --timeout_method=thread --junit-xml=./ut_extended.xml test_ops_xpu.py" +test_command = "pytest --timeout 600 -n 4 -v --timeout_method=thread --junit-xml=./ut_extended.xml test_ops_xpu.py" test_command += skip_options res = os.system(test_command) sys.exit(res) diff --git a/test/xpu/run_test_with_only.py b/test/xpu/run_test_with_only.py index 642cb699eb..f7b8f097fd 100644 --- a/test/xpu/run_test_with_only.py +++ b/test/xpu/run_test_with_only.py @@ -14,7 +14,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - "pytest --timeout 600 -v " + "pytest --timeout 600 -n 4 -v " + "--junit-xml=./ut_op_with_only.xml " + test_case + skip_options @@ -27,7 +27,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - "pytest --timeout 600 -v " + "pytest --timeout 600 -n 4 -v " + "--junit-xml=./ut_op_with_only.xml " + test_case + exe_options @@ -35,7 +35,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): return os.system(test_command) else: test_command = ( - "pytest --timeout 600 -v --junit-xml=./ut_op_with_only.xml " + test_case + "pytest --timeout 600 -n 4 -v --junit-xml=./ut_op_with_only.xml " + test_case ) return os.system(test_command) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 25e239f750..967bc192a6 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1170,7 +1170,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - f"pytest --timeout 600 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + f"pytest --timeout 600 -n 4 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) test_command += skip_options @@ -1181,13 +1181,13 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - f"pytest --timeout 600 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + f"pytest --timeout 600 -n 4 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) test_command += exe_options else: test_command = ( - f"pytest --timeout 600 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + f"pytest --timeout 600 -n 4 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) return os.system(test_command) From 75c99ffbb2fa190c21a8522f71acf5a0e078f914 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 6 Aug 2025 10:19:46 +0800 Subject: [PATCH 123/160] update --- .github/actions/linux-testenv/action.yml | 7 ++++--- .github/workflows/pull.yml | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 20e2bf15bc..2199e0aa64 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -113,13 +113,14 @@ runs: TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi fi - if [ "${{ inputs.test_type }}" == "cicd" ];then + if [ "${{ inputs.torch_xpu_ops }}" == "cicd" ];then cp -r ${{ github.workspace }}/torch-xpu-ops 
third_party/torch-xpu-ops + cd third_party/torch-xpu-ops else git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops + cd third_party/torch-xpu-ops + git checkout ${TORCH_XPU_OPS_COMMIT} fi - cd third_party/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_COMMIT} git status && git diff && git show -s - name: Torch Config shell: bash -xe {0} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index f009d10cb9..d8533f4091 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -108,6 +108,7 @@ jobs: runner: linux.idc.xpu test_type: build-cicd pytorch: main + torch_xpu_ops: cicd ut: ut_regression,ut_transformers,ut_extended,ut_op,xpu_dev1,xpu_distributed disabled_tests: ${{ needs.conditions-filter.outputs.disabled_tests }} @@ -120,6 +121,7 @@ jobs: runner: pvc_rolling test_type: build-cicd pytorch: main + torch_xpu_ops: cicd windows: name: windows From dcc44334f66d25d2b122c564796f6f0bc1338d24 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 6 Aug 2025 10:22:49 +0800 Subject: [PATCH 124/160] fix lint issue --- test/xpu/run_test_with_only.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/xpu/run_test_with_only.py b/test/xpu/run_test_with_only.py index f7b8f097fd..52bbcc1ced 100644 --- a/test/xpu/run_test_with_only.py +++ b/test/xpu/run_test_with_only.py @@ -35,7 +35,8 @@ def launch_test(test_case, skip_list=None, exe_list=None): return os.system(test_command) else: test_command = ( - "pytest --timeout 600 -n 4 -v --junit-xml=./ut_op_with_only.xml " + test_case + "pytest --timeout 600 -n 4 -v --junit-xml=./ut_op_with_only.xml " + + test_case ) return os.system(test_command) From e244cb17878def59180ec81277811c4dcff7f05d Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 6 Aug 2025 14:31:14 +0800 Subject: [PATCH 125/160] Update pull.yml --- .github/workflows/pull.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index d8533f4091..7e385d8c98 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -121,7 +121,6 @@ jobs: runner: pvc_rolling test_type: build-cicd pytorch: main - torch_xpu_ops: cicd windows: name: windows From 47cbdf5bba6fbc99383e5f4fa1f38b4b55b6f8f0 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 7 Aug 2025 10:20:04 +0800 Subject: [PATCH 126/160] modify pt2e --- .github/actions/linux-e2etest/action.yml | 1 - .github/actions/pt2e/action.yml | 13 ------------- 2 files changed, 14 deletions(-) diff --git a/.github/actions/linux-e2etest/action.yml b/.github/actions/linux-e2etest/action.yml index 1404414b3e..559b3b307b 100644 --- a/.github/actions/linux-e2etest/action.yml +++ b/.github/actions/linux-e2etest/action.yml @@ -61,7 +61,6 @@ runs: # install timm dependencies without torch and torchvision pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) fi - pip install -U numpy==1.26.4 pip list |grep -E 'intel|torch' - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) shell: bash -xe {0} diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index 65fde6a03b..6fdf926a2a 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -39,18 +39,6 @@ runs: fi # deps if [[ ${{ inputs.scenario }} == *"performance"* ]]; then - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - rm -rf pt2e-audio - git clone --single-branch -b main https://github.com/pytorch/audio pt2e-audio 
- cd pt2e-audio && git checkout $TORCHAUDIO_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchaudio -y && pip install dist/*.whl - cd ../ - rm -rf pt2e-vision - git clone --single-branch -b main https://github.com/pytorch/vision pt2e-vision - cd pt2e-vision && git checkout $TORCHVISION_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchvision -y && pip install dist/*.whl - cd ../ - fi # torchbench python -c "import torch, torchvision, torchaudio" cd pt2e-performance @@ -68,7 +56,6 @@ runs: pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec fi - pip install numpy==1.26.4 # dataset if [ ! -d ${HOME}/datasets/imagenet ];then rm -rf ${HOME}/datasets/imagenet From de15a3fe580fa034546e97305e26cddc3a4680ca Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 7 Aug 2025 10:30:45 +0800 Subject: [PATCH 127/160] update --- .github/scripts/build.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index b419883740..41c46c99b1 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -57,17 +57,17 @@ if [ "${XPU_ONEAPI_PATH}" == "" ];then intel-cmplr-lib-ur==2025.1.1 | \ intel-cmplr-lic-rt==2025.1.1 | \ intel-sycl-rt==2025.1.1 | \ - oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | \ + impi-rt==2021.15.0 | \ + dpcpp-cpp-rt==2025.1.1 | \ + oneccl-devel==2021.15.2 | \ + oneccl==2021.15.2 | \ + mkl==2025.1.0 | \ onemkl-sycl-blas==2025.1.0 | \ onemkl-sycl-dft==2025.1.0 | \ onemkl-sycl-lapack==2025.1.0 | \ onemkl-sycl-rng==2025.1.0 | \ onemkl-sycl-sparse==2025.1.0 | \ - dpcpp-cpp-rt==2025.1.1 | \ intel-opencl-rt==2025.1.1 | \ - mkl==2025.1.0 | \ intel-openmp==2025.1.1 | \ tbb==2022.1.0 | \ tcmlib==1.3.0 | \ From 8445b8bc65b2e16e3e2349b20d344aa2bb04a542 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 11 Aug 2025 10:28:39 +0800 Subject: [PATCH 128/160] e2e test matrix tests --- .github/actions/linux-e2etest/action.yml | 2 +- .github/workflows/_linux_e2e.yml | 14 +++++++------- .github/workflows/pull.yml | 6 ++++++ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/.github/actions/linux-e2etest/action.yml b/.github/actions/linux-e2etest/action.yml index 559b3b307b..7fc921330e 100644 --- a/.github/actions/linux-e2etest/action.yml +++ b/.github/actions/linux-e2etest/action.yml @@ -35,7 +35,7 @@ runs: if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then python -c "import torch, torchvision, torchaudio" cd ./pytorch - TORCHBENCH_COMMIT_ID=$(cat .github/ci_commit_pins/torchbench.txt) + TORCHBENCH_COMMIT_ID=$(cat .github/ci_commit_pins/torchbench.txt 2> /dev/null || cat .ci/docker/ci_commit_pins/torchbench.txt) git clone https://github.com/pytorch/benchmark.git xpu-benchmark cd xpu-benchmark && git checkout $TORCHBENCH_COMMIT_ID # remove deps which will reinstall torch diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index f089020f23..4456a2a075 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -102,7 +102,7 @@ jobs: # CICD launch - name: Nightly Huggingface BF16 & FP16 Training Test - if: ${{ contains(inputs.test_type, 'cicd') }} + if: ${{ contains(inputs.test_type, 'cicd') && contains(inputs.suite, 
'huggingface') }} uses: ./.github/actions/linux-e2etest with: env_prepare: true @@ -110,21 +110,21 @@ jobs: dt: bfloat16,float16 mode: training scenario: accuracy,performance - - name: Nightly Torchbench BF16 Training Test - if: ${{ contains(inputs.test_type, 'cicd') }} + - name: Nightly Timm_models BF16 Training Test + if: ${{ contains(inputs.test_type, 'cicd') && contains(inputs.suite, 'timm_models') }} uses: ./.github/actions/linux-e2etest with: env_prepare: true - suite: torchbench + suite: timm_models dt: bfloat16 mode: training scenario: accuracy,performance - - name: Nightly Timm_models BF16 Training Test - if: ${{ contains(inputs.test_type, 'cicd') }} + - name: Nightly Torchbench BF16 Training Test + if: ${{ contains(inputs.test_type, 'cicd') && contains(inputs.suite, 'torchbench') }} uses: ./.github/actions/linux-e2etest with: env_prepare: true - suite: timm_models + suite: torchbench dt: bfloat16 mode: training scenario: accuracy,performance diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 7e385d8c98..15923ebc6d 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -116,11 +116,17 @@ jobs: if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_e2e') }} permissions: write-all needs: [conditions-filter, linux-build] + name: ${{ matrix.suite }} + strategy: + fail-fast: false + matrix: + suite: [huggingface, timm_models, torchbench] uses: ./.github/workflows/_linux_e2e.yml with: runner: pvc_rolling test_type: build-cicd pytorch: main + suite: ${{ matrix.suite }} windows: name: windows From 8fe34c522c0cf0107adef1724bd7f81636f63ef7 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 11 Aug 2025 15:01:33 +0800 Subject: [PATCH 129/160] modify e2e summary --- .github/workflows/_linux_e2e.yml | 82 +------------------ .github/workflows/_linux_e2e_summary.yml | 100 +++++++++++++++++++++++ .github/workflows/nightly_ondemand.yml | 12 ++- .github/workflows/pull.yml | 7 +- 4 files changed, 118 insertions(+), 83 deletions(-) create mode 100644 .github/workflows/_linux_e2e_summary.yml diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 4456a2a075..cdbc7cbd0e 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -228,85 +228,5 @@ jobs: if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} + name: Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.suite }} path: ${{ github.workspace }}/upload_files - - summary: - runs-on: ubuntu-latest - if: ${{ ! 
cancelled() }} - needs: test - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - REFERENCE_ISSUE_ID: 1645 - AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - steps: - - name: Install gh-cli - run: | - sudo apt-get update - sudo apt-get install gh rsync ca-certificates -y - find ./ |grep -v "^\./$" |xargs rm -rf - - name: Setup python-${{ inputs.python }} - uses: actions/setup-python@v5 - with: - python-version: ${{ inputs.python }} - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Download Target Artifact - run: | - mkdir target/ - cd target/ - target_dir="Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}" - gh --repo ${GITHUB_REPOSITORY} run download ${GITHUB_RUN_ID} -n "${target_dir}" - - name: Download Baseline Artifact - run: | - mkdir baseline/ - artifact_type="$(echo ${{ inputs.test_type }} |sed 's/ondemand/weekly/;s/cicd/weekly/')" - gh --repo intel/torch-xpu-ops issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee body.txt - REFERENCE_RUN_ID="$(cat body.txt |grep "Inductor-${artifact_type}-LTS2" |sed 's/.*: *//' || echo '')" - if [ "${REFERENCE_RUN_ID}" != "" ];then - gh --repo intel/torch-xpu-ops run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - baseline_dir="$(find . -name 'Inductor-*-XPU-E2E-*' -type d)" - if [ -d "${baseline_dir}" ];then - rsync -avzq --delete ${baseline_dir}/ baseline/ - ls -al baseline/ - rm -rf ${baseline_dir}/ - fi - fi - - name: Get summary - if: ${{ ! cancelled() }} - run: | - pip install pandas requests - if [ "${{ inputs.suite }}" != 'pt2e' ];then - bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} - fi - fi - pt2e_summary_csv="$(find ./target/ -name "summary.csv")" - if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} - failed_num=$(grep -c ',failed' ${pt2e_summary_csv}) - if [ ${failed_num} -ne 0 ];then - echo "[Warning] PT2E has failures!" - fi - fi - - name: Upload Reference Run ID - if: ${{ ! 
(contains(inputs.test_type, 'ondemand') || contains(inputs.test_type, 'cicd')) && github.repository_owner == 'intel' }} - run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee new_body.txt 2>&1 - has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt)" - if [ ${has_or_not} -ne 0 ];then - sed -i "s/Inductor-${{ inputs.test_type }}-LTS2:.*/Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}/" new_body.txt - else - echo "Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}" |tee -a new_body.txt - fi - gh --repo ${GITHUB_REPOSITORY} issue edit ${REFERENCE_ISSUE_ID} --body-file new_body.txt - - name: Set permissions - if: ${{ always() }} - run: | - find ./ |grep -v "^\./$" |xargs rm -rf diff --git a/.github/workflows/_linux_e2e_summary.yml b/.github/workflows/_linux_e2e_summary.yml new file mode 100644 index 0000000000..98438c9c78 --- /dev/null +++ b/.github/workflows/_linux_e2e_summary.yml @@ -0,0 +1,100 @@ +name: Linux E2E Test + +on: + workflow_call: + inputs: + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel + python: + type: string + default: '3.10' + description: Python version + suite: + type: string + default: 'huggingface' + description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma + +permissions: read-all + +defaults: + run: + shell: bash -xe {0} + +jobs: + summary: + runs-on: ubuntu-latest + if: ${{ ! cancelled() }} + permissions: + issues: write + env: + GH_TOKEN: ${{ github.token }} + REFERENCE_ISSUE_ID: 1645 + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + steps: + - name: Install gh-cli + run: | + sudo apt-get update + sudo apt-get install gh rsync ca-certificates -y + find ./ |grep -v "^\./$" |xargs rm -rf + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Download Target Artifact + run: | + mkdir target/ + cd target/ + target_dir="Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-*" + gh --repo ${GITHUB_REPOSITORY} run download ${GITHUB_RUN_ID} -p "${target_dir}" + mv Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-*/* . + - name: Download Baseline Artifact + run: | + mkdir baseline/ + cd baseline/ + artifact_type="$(echo ${{ inputs.test_type }} |sed 's/ondemand/weekly/;s/cicd/weekly/')" + gh --repo intel/torch-xpu-ops issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee body.txt + REFERENCE_RUN_ID="$(cat body.txt |grep "Inductor-${artifact_type}-LTS2" |sed 's/.*: *//' || echo '')" + if [ "${REFERENCE_RUN_ID}" != "" ];then + gh --repo intel/torch-xpu-ops run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" + mv Inductor-*-XPU-E2E-*/* . + fi + - name: Get summary + if: ${{ ! cancelled() }} + run: | + pip install pandas requests + if [ "${{ inputs.suite }}" != 'pt2e' ];then + bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} + exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) + if [ ${exit_label} -ne 0 ];then + grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 + echo "There are ${exit_label} cases that need look into!!! 
Please check them" + exit ${exit_label} + fi + fi + pt2e_summary_csv="$(find ./target/ -name "summary.csv")" + if [ -f "${pt2e_summary_csv}" ];then + cat ${pt2e_summary_csv} + failed_num=$(grep -c ',failed' ${pt2e_summary_csv}) + if [ ${failed_num} -ne 0 ];then + echo "[Warning] PT2E has failures!" + fi + fi + - name: Upload Reference Run ID + if: ${{ ! (contains(inputs.test_type, 'ondemand') || contains(inputs.test_type, 'cicd')) && github.repository_owner == 'intel' }} + run: | + gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee new_body.txt 2>&1 + has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt)" + if [ ${has_or_not} -ne 0 ];then + sed -i "s/Inductor-${{ inputs.test_type }}-LTS2:.*/Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}/" new_body.txt + else + echo "Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}" |tee -a new_body.txt + fi + gh --repo ${GITHUB_REPOSITORY} issue edit ${REFERENCE_ISSUE_ID} --body-file new_body.txt + - name: Set permissions + if: ${{ always() }} + run: | + find ./ |grep -v "^\./$" |xargs rm -rf diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index e68bade8b7..9158fe1361 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -137,7 +137,6 @@ jobs: Linux-Nightly-Ondemand-E2E-Tests: if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} name: linux-e2e - permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_e2e.yml with: @@ -151,6 +150,17 @@ jobs: mode: ${{ github.event_name == 'schedule' && 'inference' || inputs.mode }} scenario: ${{ github.event_name == 'schedule' && 'accuracy' || inputs.scenario }} model: ${{ github.event_name == 'schedule' && '' || inputs.model }} + Linux-Nightly-Ondemand-E2E-Tests-Summary: + if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} + name: linux-e2e-summary + permissions: write-all + needs: [Conditions-Filter, Linux-Nightly-Ondemand-E2E-Tests] + uses: ./.github/workflows/_linux_e2e_summary.yml + with: + runner: pvc_rolling + test_type: ${{ needs.Conditions-Filter.outputs.test_type }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} + suite: ${{ github.event_name == 'schedule' && 'huggingface' || inputs.suite }} Linux-Nightly-Ondemand-OP-Microbench-Tests-Rolling: if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'microbench') }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 15923ebc6d..1720132a53 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -114,7 +114,6 @@ jobs: linux-e2e: if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_e2e') }} - permissions: write-all needs: [conditions-filter, linux-build] name: ${{ matrix.suite }} strategy: @@ -127,6 +126,12 @@ jobs: test_type: build-cicd pytorch: main suite: ${{ matrix.suite }} + linux-e2e-summary: + permissions: write-all + needs: [linux-e2e] + uses: ./.github/workflows/_linux_e2e_summary.yml + with: + test_type: build-cicd windows: name: windows From bfc98daa851712678fc0928332a3fcb0915f70eb Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 11 Aug 2025 15:12:06 +0800 Subject: [PATCH 130/160] update --- .github/workflows/nightly_ondemand.yml | 2 +- .github/workflows/pull.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nightly_ondemand.yml 
b/.github/workflows/nightly_ondemand.yml index 9158fe1361..7cab7199c7 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -152,7 +152,7 @@ jobs: model: ${{ github.event_name == 'schedule' && '' || inputs.model }} Linux-Nightly-Ondemand-E2E-Tests-Summary: if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} - name: linux-e2e-summary + name: linux-e2e permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-E2E-Tests] uses: ./.github/workflows/_linux_e2e_summary.yml diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 1720132a53..a142f5b800 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -127,6 +127,7 @@ jobs: pytorch: main suite: ${{ matrix.suite }} linux-e2e-summary: + name: linux-e2e permissions: write-all needs: [linux-e2e] uses: ./.github/workflows/_linux_e2e_summary.yml From 80641265967681d42b99e4754cf1029cd49e8f73 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 11 Aug 2025 15:25:11 +0800 Subject: [PATCH 131/160] update --- .github/workflows/nightly_ondemand.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 7cab7199c7..cbd9f65c45 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -157,7 +157,6 @@ jobs: needs: [Conditions-Filter, Linux-Nightly-Ondemand-E2E-Tests] uses: ./.github/workflows/_linux_e2e_summary.yml with: - runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} suite: ${{ github.event_name == 'schedule' && 'huggingface' || inputs.suite }} From 1ea6a628828355ec55bd3afb87e10f054f786b49 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 11 Aug 2025 15:46:41 +0800 Subject: [PATCH 132/160] update --- .github/workflows/_linux_e2e.yml | 66 ++++++++++++++++---------- .github/workflows/nightly_ondemand.yml | 9 ++-- 2 files changed, 48 insertions(+), 27 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index cdbc7cbd0e..d585b9f375 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -131,7 +131,7 @@ jobs: # Nihglty launch - name: Nightly Huggingface Full Test - if: ${{ contains(inputs.test_type, 'nightly') }} + if: ${{ contains(inputs.test_type, 'nightly') && contains(inputs.suite, 'huggingface') }} uses: ./.github/actions/linux-e2etest with: env_prepare: true @@ -139,26 +139,26 @@ jobs: dt: float32,bfloat16,float16,amp_bf16,amp_fp16 mode: inference,training scenario: accuracy,performance - - name: Nightly Torchbench BF16 Training Test - if: ${{ contains(inputs.test_type, 'nightly') }} + - name: Nightly Timm_models FP16 Training Test + if: ${{ contains(inputs.test_type, 'nightly') && contains(inputs.suite, 'timm_models') }} uses: ./.github/actions/linux-e2etest with: env_prepare: true - suite: torchbench - dt: bfloat16 + suite: timm_models + dt: float16 mode: training scenario: accuracy,performance - - name: Nightly Timm_models FP16 Training Test - if: ${{ contains(inputs.test_type, 'nightly') }} + - name: Nightly Torchbench BF16 Training Test + if: ${{ contains(inputs.test_type, 'nightly') && contains(inputs.suite, 'torchbench') }} uses: ./.github/actions/linux-e2etest with: env_prepare: true - suite: timm_models - dt: float16 + suite: torchbench + dt: bfloat16 mode: training scenario: accuracy,performance - name: Nightly PT2E Full Test - if: 
${{ contains(inputs.test_type, 'nightly') }} + if: ${{ contains(inputs.test_type, 'nightly') && contains(inputs.suite, 'pt2e') }} uses: ./.github/actions/pt2e with: env_prepare: true @@ -166,8 +166,8 @@ jobs: scenario: accuracy,performance # Weekly launch - - name: Nightly Huggingface Full Test - if: ${{ contains(inputs.test_type, 'weekly') }} + - name: Weekly Huggingface Full Test + if: ${{ contains(inputs.test_type, 'weekly') && contains(inputs.suite, 'huggingface') }} uses: ./.github/actions/linux-e2etest with: env_prepare: true @@ -175,26 +175,26 @@ jobs: dt: float32,bfloat16,float16,amp_bf16,amp_fp16 mode: inference,training scenario: accuracy,performance - - name: Nightly Torchbench BF16 Training Test - if: ${{ contains(inputs.test_type, 'weekly') }} + - name: Weekly Timm_models Full Test + if: ${{ contains(inputs.test_type, 'weekly') && contains(inputs.suite, 'timm_models') }} uses: ./.github/actions/linux-e2etest with: env_prepare: true - suite: torchbench + suite: timm_models dt: float32,bfloat16,float16,amp_bf16,amp_fp16 mode: inference,training scenario: accuracy,performance - - name: Nightly Timm_models FP16 Training Test - if: ${{ contains(inputs.test_type, 'weekly') }} + - name: Weekly Torchbench Full Test + if: ${{ contains(inputs.test_type, 'weekly') && contains(inputs.suite, 'torchbench') }} uses: ./.github/actions/linux-e2etest with: env_prepare: true - suite: timm_models + suite: torchbench dt: float32,bfloat16,float16,amp_bf16,amp_fp16 mode: inference,training scenario: accuracy,performance - - name: Nightly PT2E Full Test - if: ${{ contains(inputs.test_type, 'weekly') }} + - name: Weekly PT2E Full Test + if: ${{ contains(inputs.test_type, 'weekly') && contains(inputs.suite, 'pt2e') }} uses: ./.github/actions/pt2e with: env_prepare: true @@ -202,16 +202,34 @@ jobs: scenario: accuracy,performance # On-demand launch - - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ contains(inputs.test_type, 'ondemand') && inputs.suite != 'pt2e' }} + - name: OnDemand Test (huggingface) + if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'huggingface') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: huggingface + dt: ${{ inputs.dt }} + mode: ${{ inputs.mode }} + scenario: ${{ inputs.scenario }} + - name: OnDemand Test (timm_models) + if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'timm_models') }} uses: ./.github/actions/linux-e2etest with: env_prepare: true - suite: ${{ inputs.suite }} + suite: timm_models + dt: ${{ inputs.dt }} + mode: ${{ inputs.mode }} + scenario: ${{ inputs.scenario }} + - name: OnDemand Test (torchbench) + if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'torchbench') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: torchbench dt: ${{ inputs.dt }} mode: ${{ inputs.mode }} scenario: ${{ inputs.scenario }} - - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) + - name: OnDemand PT2E Test (pt2e) if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'pt2e') }} uses: ./.github/actions/pt2e with: diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index cbd9f65c45..3cc5514a5e 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -31,9 +31,8 @@ on: default: '' description: UT scope. 
`ut_regression,ut_transformers,ut_extended,ut_op,ut_profiling,ut_torch,xpu_dev1,xpu_distributed,microbench,windows`. Delimiter is comma suite: - type: string default: '' - description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma + description: Dynamo benchmarks test suite. `[huggingface,timm_models,torchbench,pt2e]`. Delimiter is comma dt: type: string default: '' @@ -139,13 +138,17 @@ jobs: name: linux-e2e needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_e2e.yml + strategy: + fail-fast: false + matrix: + suite: ${{ fromJSON(inputs.suite) }} with: runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} - suite: ${{ github.event_name == 'schedule' && 'huggingface' || inputs.suite }} + suite: ${{ matrix.suite }} dt: ${{ github.event_name == 'schedule' && 'float32' || inputs.dt }} mode: ${{ github.event_name == 'schedule' && 'inference' || inputs.mode }} scenario: ${{ github.event_name == 'schedule' && 'accuracy' || inputs.scenario }} From 530af25780f4172b2bca0c8b3f455f7648d69b96 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 11 Aug 2025 16:07:59 +0800 Subject: [PATCH 133/160] update --- .github/workflows/_linux_e2e_summary.yml | 7 ++----- .github/workflows/nightly_ondemand.yml | 7 ++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/.github/workflows/_linux_e2e_summary.yml b/.github/workflows/_linux_e2e_summary.yml index 98438c9c78..d65e47a3ae 100644 --- a/.github/workflows/_linux_e2e_summary.yml +++ b/.github/workflows/_linux_e2e_summary.yml @@ -11,10 +11,6 @@ on: type: string default: '3.10' description: Python version - suite: - type: string - default: 'huggingface' - description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma permissions: read-all @@ -66,7 +62,8 @@ jobs: if: ${{ ! cancelled() }} run: | pip install pandas requests - if [ "${{ inputs.suite }}" != 'pt2e' ];then + e2e_summary_csv="$(find ./target/ -name "inductor_*.csv" |head -n 1)" + if [ -f "${e2e_summary_csv}" ];then bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) if [ ${exit_label} -ne 0 ];then diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 3cc5514a5e..efc60f4600 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -31,7 +31,7 @@ on: default: '' description: UT scope. `ut_regression,ut_transformers,ut_extended,ut_op,ut_profiling,ut_torch,xpu_dev1,xpu_distributed,microbench,windows`. Delimiter is comma suite: - default: '' + default: '[]' description: Dynamo benchmarks test suite. `[huggingface,timm_models,torchbench,pt2e]`. 
Delimiter is comma dt: type: string @@ -134,7 +134,7 @@ jobs: ut: ${{ github.event_name == 'schedule' && 'ut_regression,xpu_dev1,ut_transformers,ut_extended,ut_op' || inputs.ut }} Linux-Nightly-Ondemand-E2E-Tests: - if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} + if: ${{ github.event_name == 'schedule' || inputs.suite != '[]' }} name: linux-e2e needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_e2e.yml @@ -154,15 +154,12 @@ jobs: scenario: ${{ github.event_name == 'schedule' && 'accuracy' || inputs.scenario }} model: ${{ github.event_name == 'schedule' && '' || inputs.model }} Linux-Nightly-Ondemand-E2E-Tests-Summary: - if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} name: linux-e2e permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-E2E-Tests] uses: ./.github/workflows/_linux_e2e_summary.yml with: test_type: ${{ needs.Conditions-Filter.outputs.test_type }} - python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} - suite: ${{ github.event_name == 'schedule' && 'huggingface' || inputs.suite }} Linux-Nightly-Ondemand-OP-Microbench-Tests-Rolling: if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'microbench') }} From 091678f8604ad4eb59eff54a2bc3d12c414c95f6 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 11 Aug 2025 16:22:23 +0800 Subject: [PATCH 134/160] update --- .github/workflows/nightly_ondemand.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index efc60f4600..d5adf534a6 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -137,11 +137,11 @@ jobs: if: ${{ github.event_name == 'schedule' || inputs.suite != '[]' }} name: linux-e2e needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] - uses: ./.github/workflows/_linux_e2e.yml strategy: fail-fast: false matrix: suite: ${{ fromJSON(inputs.suite) }} + uses: ./.github/workflows/_linux_e2e.yml with: runner: pvc_rolling test_type: ${{ needs.Conditions-Filter.outputs.test_type }} From eaa4bc4c5d1d27b020f33fcf17a8582246b865ce Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 11 Aug 2025 16:56:39 +0800 Subject: [PATCH 135/160] update --- .github/workflows/nightly_ondemand.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index d5adf534a6..8fb9ffe14d 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -32,7 +32,7 @@ on: description: UT scope. `ut_regression,ut_transformers,ut_extended,ut_op,ut_profiling,ut_torch,xpu_dev1,xpu_distributed,microbench,windows`. Delimiter is comma suite: default: '[]' - description: Dynamo benchmarks test suite. `[huggingface,timm_models,torchbench,pt2e]`. Delimiter is comma + description: Dynamo benchmarks test suite. `["huggingface","timm_models","torchbench","pt2e"]`. 
Delimiter is comma dt: type: string default: '' From f70ef8abc90c509e1a9e6a6696802b4a8c0633cb Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 11 Aug 2025 22:12:38 +0800 Subject: [PATCH 136/160] update deps --- .github/actions/linux-testenv/action.yml | 9 +++++++-- .github/workflows/nightly_ondemand.yml | 8 ++++---- .github/workflows/pull.yml | 6 +++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 2199e0aa64..85a38de9f1 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -45,7 +45,6 @@ runs: clinfo --list cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq -c rm -rf ~/.triton /tmp/*inductor* - pip install pandas psutil scipy requests pytest-timeout - name: Checkout torch-xpu-ops uses: actions/checkout@v4 with: @@ -88,8 +87,14 @@ runs: git clone ${PYTORCH_REPO} pytorch cd pytorch git checkout ${TORCH_COMMIT_ID} - pip install -r .ci/docker/requirements-ci.txt + if [[ "${{ inputs.test_type }}" == *"-e2e" ]];then + pip install pandas psutil scipy + else + pip install pytest-timeout + pip install -r .ci/docker/requirements-ci.txt + fi # apply extra PRs for stock pytorch + pip install requests if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 else diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 8fb9ffe14d..a58710682a 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -126,7 +126,7 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: runner: linux.idc.xpu - test_type: ${{ needs.Conditions-Filter.outputs.test_type }} + test_type: ${{ needs.Conditions-Filter.outputs.test_type }}-ut pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} @@ -144,7 +144,7 @@ jobs: uses: ./.github/workflows/_linux_e2e.yml with: runner: pvc_rolling - test_type: ${{ needs.Conditions-Filter.outputs.test_type }} + test_type: ${{ needs.Conditions-Filter.outputs.test_type }}-e2e pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} @@ -159,7 +159,7 @@ jobs: needs: [Conditions-Filter, Linux-Nightly-Ondemand-E2E-Tests] uses: ./.github/workflows/_linux_e2e_summary.yml with: - test_type: ${{ needs.Conditions-Filter.outputs.test_type }} + test_type: ${{ needs.Conditions-Filter.outputs.test_type }}-e2e Linux-Nightly-Ondemand-OP-Microbench-Tests-Rolling: if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'microbench') }} @@ -169,7 +169,7 @@ jobs: uses: ./.github/workflows/_linux_op_benchmark.yml with: runner: pvc_rolling - test_type: ${{ needs.Conditions-Filter.outputs.test_type }} + test_type: ${{ needs.Conditions-Filter.outputs.test_type }}-mb pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index a142f5b800..6aece67fff 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -106,7 +106,7 @@ jobs: uses: 
./.github/workflows/_linux_ut.yml with: runner: linux.idc.xpu - test_type: build-cicd + test_type: build-cicd-ut pytorch: main torch_xpu_ops: cicd ut: ut_regression,ut_transformers,ut_extended,ut_op,xpu_dev1,xpu_distributed @@ -123,7 +123,7 @@ jobs: uses: ./.github/workflows/_linux_e2e.yml with: runner: pvc_rolling - test_type: build-cicd + test_type: build-cicd-e2e pytorch: main suite: ${{ matrix.suite }} linux-e2e-summary: @@ -132,7 +132,7 @@ jobs: needs: [linux-e2e] uses: ./.github/workflows/_linux_e2e_summary.yml with: - test_type: build-cicd + test_type: build-cicd-e2e windows: name: windows From a12045a5cd15a8e12febe0f367246b05cea747c3 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 11 Aug 2025 22:13:40 +0800 Subject: [PATCH 137/160] update --- .github/workflows/_linux_e2e.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index d585b9f375..a5e4d87ee1 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -101,7 +101,7 @@ jobs: python: ${{ inputs.python }} # CICD launch - - name: Nightly Huggingface BF16 & FP16 Training Test + - name: CICD Huggingface BF16 & FP16 Training Test if: ${{ contains(inputs.test_type, 'cicd') && contains(inputs.suite, 'huggingface') }} uses: ./.github/actions/linux-e2etest with: @@ -110,7 +110,7 @@ jobs: dt: bfloat16,float16 mode: training scenario: accuracy,performance - - name: Nightly Timm_models BF16 Training Test + - name: CICD Timm_models BF16 Training Test if: ${{ contains(inputs.test_type, 'cicd') && contains(inputs.suite, 'timm_models') }} uses: ./.github/actions/linux-e2etest with: @@ -119,7 +119,7 @@ jobs: dt: bfloat16 mode: training scenario: accuracy,performance - - name: Nightly Torchbench BF16 Training Test + - name: CICD Torchbench BF16 Training Test if: ${{ contains(inputs.test_type, 'cicd') && contains(inputs.suite, 'torchbench') }} uses: ./.github/actions/linux-e2etest with: From 70415c292c96551669f5d0e920aafb8b7a7baba0 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 11 Aug 2025 22:32:47 +0800 Subject: [PATCH 138/160] modify cache dir --- .github/workflows/_linux_e2e.yml | 2 ++ .github/workflows/_linux_op_benchmark.yml | 2 ++ .github/workflows/_linux_ut.yml | 2 ++ 3 files changed, 6 insertions(+) diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index a5e4d87ee1..d784f2e49f 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -84,6 +84,8 @@ jobs: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + TORCH_HOME: /tmp/.cache/_torch + HF_HOME: /tmp/.cache/_huggingface MODEL_ONLY_NAME: ${{ inputs.model }} steps: - name: Cleanup workspace diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 4034c5f385..a2e3757042 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -65,6 +65,8 @@ jobs: AGENT_TOOLSDIRECTORY: /opt/xpu-tool GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + TORCH_HOME: /tmp/.cache/_torch + HF_HOME: /tmp/.cache/_huggingface REFERENCE_ISSUE: 1689 steps: - name: Cleanup workspace diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 29cd114ef6..508d144c9c 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -76,6 +76,8 @@ jobs: AGENT_TOOLSDIRECTORY: 
/tmp/xpu-tool GH_TOKEN: ${{ github.token }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + TORCH_HOME: /tmp/.cache/_torch + HF_HOME: /tmp/.cache/_huggingface strategy: fail-fast: false matrix: From 5fcc6c696b2d86208f6a19c0c7a14453005a6f08 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 12 Aug 2025 11:01:26 +0800 Subject: [PATCH 139/160] update --- .github/actions/linux-e2etest/action.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/linux-e2etest/action.yml b/.github/actions/linux-e2etest/action.yml index 7fc921330e..01a7216bab 100644 --- a/.github/actions/linux-e2etest/action.yml +++ b/.github/actions/linux-e2etest/action.yml @@ -63,7 +63,7 @@ runs: fi pip list |grep -E 'intel|torch' - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - shell: bash -xe {0} + shell: bash -x {0} run: | cp ./.github/scripts/inductor_xpu_test.sh ./pytorch cd ./pytorch @@ -103,12 +103,12 @@ runs: for xpu_id in $(seq 0 $[ ${xpu_num} - 1 ]) do cpu_list="$(echo "${cores_per_instance} ${xpu_id}" |awk '{printf("%d-%d", $1*$2, $1*$2+$1-1)}')" - numactl --localalloc --physcpubind=${cpu_list} bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id} static ${xpu_num} ${xpu_id} & + numactl --localalloc --physcpubind=${cpu_list} bash -x inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id} static ${xpu_num} ${xpu_id} & done else for test_model in $(echo ${MODEL_ONLY_NAME} |sed 's/,/ /g') do - numactl --localalloc bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 0 static 1 0 ${test_model} + numactl --localalloc bash -x inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 0 static 1 0 ${test_model} done fi wait From 18f22e007f2fbf0cfef1cca84757b40f085f06cc Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 12 Aug 2025 22:40:13 +0800 Subject: [PATCH 140/160] update --- .github/actions/linux-e2etest/action.yml | 35 +------------------- .github/actions/linux-testenv/action.yml | 41 ++++++++++++++++++++---- .github/actions/linux-uttest/action.yml | 6 ++++ .github/actions/pt2e/action.yml | 19 ----------- 4 files changed, 41 insertions(+), 60 deletions(-) diff --git a/.github/actions/linux-e2etest/action.yml b/.github/actions/linux-e2etest/action.yml index 01a7216bab..874b250638 100644 --- a/.github/actions/linux-e2etest/action.yml +++ b/.github/actions/linux-e2etest/action.yml @@ -28,43 +28,10 @@ inputs: runs: using: composite steps: - - name: Prepare ENV - if: ${{ inputs.env_prepare }} - shell: bash -xe {0} - run: | - if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then - python -c "import torch, torchvision, torchaudio" - cd ./pytorch - TORCHBENCH_COMMIT_ID=$(cat .github/ci_commit_pins/torchbench.txt 2> /dev/null || cat .ci/docker/ci_commit_pins/torchbench.txt) - git clone https://github.com/pytorch/benchmark.git xpu-benchmark - cd xpu-benchmark && git checkout $TORCHBENCH_COMMIT_ID - # remove deps which will reinstall torch - pip install --no-deps accelerate - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) - pip install -U transformers==4.44.2 - sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g' requirements.txt - git status && git diff - pip install -r requirements.txt - python install.py --continue_on_fail - # deps for torchrec_dlrm - pip install 
pyre_extensions - pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu - pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec - fi - if [[ ${{ inputs.suite }} == *"huggingface"* ]]; then - pip install -U transformers==4.44.2 - fi - if [[ ${{ inputs.suite }} == *"timm_models"* ]]; then - # install timm without dependencies - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 - # install timm dependencies without torch and torchvision - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) - fi - pip list |grep -E 'intel|torch' - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) shell: bash -x {0} run: | + pip list |grep -E 'intel|torch' cp ./.github/scripts/inductor_xpu_test.sh ./pytorch cd ./pytorch # check param diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 85a38de9f1..12868db55f 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -21,6 +21,10 @@ inputs: type: string default: '3.10' description: Python version + suite: + type: string + default: 'huggingface' + description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma permissions: read-all @@ -64,6 +68,36 @@ runs: uses: actions/download-artifact@v4 with: pattern: Torch-XPU-Wheel-* + - name: Install E2E Requirements + if: ${{ ! contains(inputs.test_type, 'e2e') }} + shell: bash -xe {0} + run: | + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu + pip install pandas psutil scipy + if [[ "${{ inputs.suite }}" == *"huggingface"* ]];then + pip install transformers==4.44.2 + elif [[ "${{ inputs.suite }}" == *"timm_models"* ]];then + pip install timm==1.0.14 + elif [[ "${{ inputs.suite }}" == *"torchbench"* ]];then + rm -rf ./benchmark + git clone https://github.com/pytorch/benchmark + cd benchmark + git checkout e03a63be43e33596f7f0a43b0f530353785e4a59 + pip install -r requirements.txt + pip install -U transformers==4.44.2 timm==1.0.14 + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install + python install.py --continue_on_fail + elif [[ "${{ inputs.suite }}" == *"pt2e"* ]];then + rm -rf ./benchmark + git clone -b yifeng/pt2e_xpu https://github.com/zxd1997066/benchmark + cd benchmark + pip install -r requirements.txt + pip install -U transformers==4.44.2 timm==1.0.14 + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install + python install.py --continue_on_fail + fi + pip uninstall -y torch torchvision torchaudio pytorch-triton-xpu triton + pip uninstall -y torch torchvision torchaudio pytorch-triton-xpu triton - name: Prepare Stock Pytorch shell: bash -xe {0} run: | @@ -77,7 +111,6 @@ runs: else pip install --force-reinstall $(find ${{ github.workspace }}/ -name "*torch*.whl") fi - pip list |grep torch TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" @@ -87,12 +120,6 @@ runs: git clone ${PYTORCH_REPO} pytorch cd pytorch git checkout ${TORCH_COMMIT_ID} - if [[ "${{ inputs.test_type }}" == *"-e2e" ]];then - pip install pandas psutil scipy - else - pip 
install pytest-timeout - pip install -r .ci/docker/requirements-ci.txt - fi # apply extra PRs for stock pytorch pip install requests if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 17512a722a..ded9d5f737 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -11,10 +11,16 @@ permissions: read-all runs: using: composite steps: + - name: requirements + shell: bash -xe {0} + run: | + pip install -r pytorch/.ci/docker/requirements-ci.txt + pip install -U pytest-timeout - name: ut_regression shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_regression' }} run: | + pip install -r .ci/docker/requirements-ci.txt mkdir -p ut_log/ut_regression cd pytorch/third_party/torch-xpu-ops/test/regressions pytest --timeout 600 --timeout_method=thread -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \ diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index 6fdf926a2a..5fc3a9993c 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -37,25 +37,6 @@ runs: rm -rf pt2e-performance git clone -b yifeng/pt2e_xpu https://github.com/zxd1997066/benchmark pt2e-performance fi - # deps - if [[ ${{ inputs.scenario }} == *"performance"* ]]; then - # torchbench - python -c "import torch, torchvision, torchaudio" - cd pt2e-performance - # remove deps which will reinstall torch - pip install --no-deps accelerate - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) - pip install -U transformers==4.44.2 - sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g' requirements.txt - git status && git diff - pip install -r requirements.txt - python install.py --continue_on_fail - # deps for torchrec_dlrm - pip install pyre_extensions - pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu - pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec - fi # dataset if [ ! 
-d ${HOME}/datasets/imagenet ];then rm -rf ${HOME}/datasets/imagenet From 0add64edd402d98885f4a573a609795677cabf4a Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 12 Aug 2025 22:44:37 +0800 Subject: [PATCH 141/160] update --- .github/actions/linux-testenv/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 12868db55f..7b7a91af06 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -84,7 +84,7 @@ runs: cd benchmark git checkout e03a63be43e33596f7f0a43b0f530353785e4a59 pip install -r requirements.txt - pip install -U transformers==4.44.2 timm==1.0.14 + pip install -U transformers==4.44.2 timm==1.0.14 pyre-extensions curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install python install.py --continue_on_fail elif [[ "${{ inputs.suite }}" == *"pt2e"* ]];then @@ -92,7 +92,7 @@ runs: git clone -b yifeng/pt2e_xpu https://github.com/zxd1997066/benchmark cd benchmark pip install -r requirements.txt - pip install -U transformers==4.44.2 timm==1.0.14 + pip install -U transformers==4.44.2 timm==1.0.14 pyre-extensions curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install python install.py --continue_on_fail fi From 8902540637e9a9580fac5c9dec46a0396273254e Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 13 Aug 2025 09:10:48 +0800 Subject: [PATCH 142/160] update --- .github/actions/linux-e2etest/action.yml | 19 ++++++++++--------- .github/actions/linux-testenv/action.yml | 2 +- .github/workflows/_linux_e2e.yml | 1 + .github/workflows/nightly_ondemand.yml | 1 + .github/workflows/pull.yml | 1 + 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/actions/linux-e2etest/action.yml b/.github/actions/linux-e2etest/action.yml index 874b250638..52ef8a4cc9 100644 --- a/.github/actions/linux-e2etest/action.yml +++ b/.github/actions/linux-e2etest/action.yml @@ -99,12 +99,13 @@ runs: sed -i "s/$/,$(basename $var)/" $var cat $var >> inductor_log/summary_accuracy.csv done - cd ${{ github.workspace }} - cp ./.github/scripts/inductor_summary.py ./pytorch - cd ./pytorch - pip install styleFrame scipy pandas - dt=$(echo ${{ inputs.dt }} |sed 's/,/ /g') - mode=$(echo ${{ inputs.mode }} |sed 's/,/ /g') - suite=$(echo ${{ inputs.suite }} |sed 's/,/ /g') - scenario=$(echo ${{ inputs.scenario }} |sed 's/,/ /g') - python inductor_summary.py -p ${dt} -s ${suite} -m ${mode} -sc ${scenario} + cp ${{ github.workspace }}/.github/scripts/inductor_summary.py ./ + csv_file="$(find inductor_log/ -name "inductor_*_xpu_*.csv" |tail -n 1)" + if [ -f "${csv_file}" ];then + pip install styleFrame scipy pandas + dt=$(echo ${{ inputs.dt }} |sed 's/,/ /g') + mode=$(echo ${{ inputs.mode }} |sed 's/,/ /g') + suite=$(echo ${{ inputs.suite }} |sed 's/,/ /g') + scenario=$(echo ${{ inputs.scenario }} |sed 's/,/ /g') + python inductor_summary.py -p ${dt} -s ${suite} -m ${mode} -sc ${scenario} + fi diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 7b7a91af06..4250ceed9e 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -69,7 +69,7 @@ runs: with: pattern: Torch-XPU-Wheel-* - name: Install E2E Requirements - if: ${{ ! 
contains(inputs.test_type, 'e2e') }} + if: ${{ contains(inputs.test_type, 'e2e') }} shell: bash -xe {0} run: | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index d784f2e49f..61647e05e8 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -101,6 +101,7 @@ jobs: torch_xpu_ops: skipped oneapi: ${{ inputs.oneapi }} python: ${{ inputs.python }} + suite: ${{ inputs.suite }} # CICD launch - name: CICD Huggingface BF16 & FP16 Training Test diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index a58710682a..99731123e5 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -154,6 +154,7 @@ jobs: scenario: ${{ github.event_name == 'schedule' && 'accuracy' || inputs.scenario }} model: ${{ github.event_name == 'schedule' && '' || inputs.model }} Linux-Nightly-Ondemand-E2E-Tests-Summary: + if: ${{ ! cancelled() }} name: linux-e2e permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-E2E-Tests] diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 6aece67fff..2c6999a520 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -127,6 +127,7 @@ jobs: pytorch: main suite: ${{ matrix.suite }} linux-e2e-summary: + if: ${{ ! cancelled() }} name: linux-e2e permissions: write-all needs: [linux-e2e] From 0eda9f76b501f6972b26687c4ff648699bbdc0b9 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 13 Aug 2025 09:11:54 +0800 Subject: [PATCH 143/160] update --- .github/actions/linux-uttest/action.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index ded9d5f737..04b0fbcd86 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -20,7 +20,6 @@ runs: shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_regression' }} run: | - pip install -r .ci/docker/requirements-ci.txt mkdir -p ut_log/ut_regression cd pytorch/third_party/torch-xpu-ops/test/regressions pytest --timeout 600 --timeout_method=thread -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \ From 363145454b3e7250317797fa1d8f79295827a88b Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 13 Aug 2025 11:24:16 +0800 Subject: [PATCH 144/160] update --- .github/ci_expected_accuracy/check_expected.py | 2 +- .../rolling/inductor_timm_models_training.csv | 4 ++-- .../rolling/inductor_torchbench_inference.csv | 6 +++--- .github/scripts/e2e_summary.sh | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/ci_expected_accuracy/check_expected.py b/.github/ci_expected_accuracy/check_expected.py index 48c09606de..3c82666af0 100644 --- a/.github/ci_expected_accuracy/check_expected.py +++ b/.github/ci_expected_accuracy/check_expected.py @@ -6,7 +6,7 @@ # Reference last updated is https://github.com/intel/torch-xpu-ops/pull/1223 parser = argparse.ArgumentParser(description="Accuracy Check", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("--driver", type=str, default="lts", help="rolling or lts") +parser.add_argument("--driver", type=str, default="rolling", help="rolling or lts") parser.add_argument("--category", type=str, default="inductor", help="inductor") parser.add_argument("--suite", type=str, required=True, help="huggingface, timm_models or torchbench") 
parser.add_argument("--mode", type=str, required=True, help="inference or training") diff --git a/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv b/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv index 58dd7064d6..4a60aecac6 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv @@ -2,10 +2,10 @@ name,float32,bfloat16,float16,amp_bf16,amp_fp16 adv_inception_v3,pass,pass,pass,pass,pass beit_base_patch16_224,pass,pass,pass,pass,pass botnet26t_256,pass,pass,pass,pass,pass -cait_m36_384,pass,pass,pass,pass,pass +cait_m36_384,pass,pass,fail_accuracy,pass,pass coat_lite_mini,pass,pass,pass,pass,pass convit_base,pass,pass,pass,pass,pass -convmixer_768_32,pass,pass,pass,pass,pass +convmixer_768_32,pass,fail_accuracy,pass,fail_accuracy,pass # https://github.com/intel/torch-xpu-ops/issues/1274 convnext_base,pass,fail_accuracy,fail_accuracy,pass,pass crossvit_9_240,pass,pass,pass,pass,pass diff --git a/.github/ci_expected_accuracy/rolling/inductor_torchbench_inference.csv b/.github/ci_expected_accuracy/rolling/inductor_torchbench_inference.csv index 78a4677a90..29989ad6ff 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_torchbench_inference.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_torchbench_inference.csv @@ -19,11 +19,11 @@ densenet121,pass,pass,pass,pass,pass # https://github.com/intel/torch-xpu-ops/issues/1278 detectron2_fasterrcnn_r_101_c4,pass,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy detectron2_fasterrcnn_r_101_dc5,pass,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy -detectron2_fasterrcnn_r_101_fpn,pass,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy +detectron2_fasterrcnn_r_101_fpn,eager_1st_run_OOM,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy detectron2_fasterrcnn_r_50_c4,pass,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy detectron2_fasterrcnn_r_50_dc5,pass,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy -detectron2_fasterrcnn_r_50_fpn,pass,eager_fail_to_run,fail_accuracy,fail_accuracy,pass -detectron2_fcos_r_50_fpn,pass,pass,pass,pass,pass +detectron2_fasterrcnn_r_50_fpn,eager_1st_run_OOM,eager_fail_to_run,eager_1st_run_OOM,fail_accuracy,pass +detectron2_fcos_r_50_fpn,pass,pass,pass,fail_accuracy,pass detectron2_maskrcnn,fail_to_run,eager_fail_to_run,fail_to_run,eager_fail_to_run,fail_to_run detectron2_maskrcnn_r_101_c4,fail_accuracy,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy detectron2_maskrcnn_r_101_fpn,fail_accuracy,eager_fail_to_run,eager_1st_run_OOM,eager_1st_run_OOM,fail_accuracy diff --git a/.github/scripts/e2e_summary.sh b/.github/scripts/e2e_summary.sh index c858f6f3f5..d4ad299b59 100644 --- a/.github/scripts/e2e_summary.sh +++ b/.github/scripts/e2e_summary.sh @@ -98,7 +98,7 @@ Empty means the cases NOT run\n\n" suite="$(echo "${csv}" |sed 's/.*inductor_//;s/_.*//;s/timm/timm_models/')" mode="$(echo "${csv}" |sed 's/_xpu_accuracy.*//;s/.*_//')" dtype="$(echo "${csv}" |sed -E 's/.*inductor_[a-z]*_//;s/models_//;s/_infer.*|_train.*//')" - python "${check_file}" --driver "${LTS_OR_ROLLING:-"lts"}" --suite "${suite}" --mode "${mode}" --dtype "${dtype}" --csv_file "${csv}" > "/tmp/tmp-${suite}-${mode}-${dtype}.txt" + python "${check_file}" --suite "${suite}" --mode "${mode}" --dtype "${dtype}" --csv_file "${csv}" > "/tmp/tmp-${suite}-${mode}-${dtype}.txt" test_result="$(sed 's/, /,/g' 
"/tmp/tmp-${suite}-${mode}-${dtype}.txt" |awk '{ if($0 ~/Total/){ total = $3; From 00a3720c3f52e8ec4784b640e43ee69d1313192f Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 13 Aug 2025 11:26:30 +0800 Subject: [PATCH 145/160] Revert "update" This reverts commit 363145454b3e7250317797fa1d8f79295827a88b. --- .github/ci_expected_accuracy/check_expected.py | 2 +- .../rolling/inductor_timm_models_training.csv | 4 ++-- .../rolling/inductor_torchbench_inference.csv | 6 +++--- .github/scripts/e2e_summary.sh | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/ci_expected_accuracy/check_expected.py b/.github/ci_expected_accuracy/check_expected.py index 3c82666af0..48c09606de 100644 --- a/.github/ci_expected_accuracy/check_expected.py +++ b/.github/ci_expected_accuracy/check_expected.py @@ -6,7 +6,7 @@ # Reference last updated is https://github.com/intel/torch-xpu-ops/pull/1223 parser = argparse.ArgumentParser(description="Accuracy Check", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("--driver", type=str, default="rolling", help="rolling or lts") +parser.add_argument("--driver", type=str, default="lts", help="rolling or lts") parser.add_argument("--category", type=str, default="inductor", help="inductor") parser.add_argument("--suite", type=str, required=True, help="huggingface, timm_models or torchbench") parser.add_argument("--mode", type=str, required=True, help="inference or training") diff --git a/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv b/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv index 4a60aecac6..58dd7064d6 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv @@ -2,10 +2,10 @@ name,float32,bfloat16,float16,amp_bf16,amp_fp16 adv_inception_v3,pass,pass,pass,pass,pass beit_base_patch16_224,pass,pass,pass,pass,pass botnet26t_256,pass,pass,pass,pass,pass -cait_m36_384,pass,pass,fail_accuracy,pass,pass +cait_m36_384,pass,pass,pass,pass,pass coat_lite_mini,pass,pass,pass,pass,pass convit_base,pass,pass,pass,pass,pass -convmixer_768_32,pass,fail_accuracy,pass,fail_accuracy,pass +convmixer_768_32,pass,pass,pass,pass,pass # https://github.com/intel/torch-xpu-ops/issues/1274 convnext_base,pass,fail_accuracy,fail_accuracy,pass,pass crossvit_9_240,pass,pass,pass,pass,pass diff --git a/.github/ci_expected_accuracy/rolling/inductor_torchbench_inference.csv b/.github/ci_expected_accuracy/rolling/inductor_torchbench_inference.csv index 29989ad6ff..78a4677a90 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_torchbench_inference.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_torchbench_inference.csv @@ -19,11 +19,11 @@ densenet121,pass,pass,pass,pass,pass # https://github.com/intel/torch-xpu-ops/issues/1278 detectron2_fasterrcnn_r_101_c4,pass,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy detectron2_fasterrcnn_r_101_dc5,pass,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy -detectron2_fasterrcnn_r_101_fpn,eager_1st_run_OOM,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy +detectron2_fasterrcnn_r_101_fpn,pass,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy detectron2_fasterrcnn_r_50_c4,pass,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy detectron2_fasterrcnn_r_50_dc5,pass,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy 
-detectron2_fasterrcnn_r_50_fpn,eager_1st_run_OOM,eager_fail_to_run,eager_1st_run_OOM,fail_accuracy,pass -detectron2_fcos_r_50_fpn,pass,pass,pass,fail_accuracy,pass +detectron2_fasterrcnn_r_50_fpn,pass,eager_fail_to_run,fail_accuracy,fail_accuracy,pass +detectron2_fcos_r_50_fpn,pass,pass,pass,pass,pass detectron2_maskrcnn,fail_to_run,eager_fail_to_run,fail_to_run,eager_fail_to_run,fail_to_run detectron2_maskrcnn_r_101_c4,fail_accuracy,eager_fail_to_run,fail_accuracy,fail_accuracy,fail_accuracy detectron2_maskrcnn_r_101_fpn,fail_accuracy,eager_fail_to_run,eager_1st_run_OOM,eager_1st_run_OOM,fail_accuracy diff --git a/.github/scripts/e2e_summary.sh b/.github/scripts/e2e_summary.sh index d4ad299b59..c858f6f3f5 100644 --- a/.github/scripts/e2e_summary.sh +++ b/.github/scripts/e2e_summary.sh @@ -98,7 +98,7 @@ Empty means the cases NOT run\n\n" suite="$(echo "${csv}" |sed 's/.*inductor_//;s/_.*//;s/timm/timm_models/')" mode="$(echo "${csv}" |sed 's/_xpu_accuracy.*//;s/.*_//')" dtype="$(echo "${csv}" |sed -E 's/.*inductor_[a-z]*_//;s/models_//;s/_infer.*|_train.*//')" - python "${check_file}" --suite "${suite}" --mode "${mode}" --dtype "${dtype}" --csv_file "${csv}" > "/tmp/tmp-${suite}-${mode}-${dtype}.txt" + python "${check_file}" --driver "${LTS_OR_ROLLING:-"lts"}" --suite "${suite}" --mode "${mode}" --dtype "${dtype}" --csv_file "${csv}" > "/tmp/tmp-${suite}-${mode}-${dtype}.txt" test_result="$(sed 's/, /,/g' "/tmp/tmp-${suite}-${mode}-${dtype}.txt" |awk '{ if($0 ~/Total/){ total = $3; From 0aab07a859ba5f607ad8e4ca079db158470949f7 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 13 Aug 2025 15:04:19 +0800 Subject: [PATCH 146/160] update --- .github/actions/linux-e2etest/action.yml | 2 ++ .github/actions/linux-uttest/action.yml | 2 ++ .github/actions/pt2e/action.yml | 2 ++ .github/ci_expected_accuracy/check_expected.py | 2 +- .github/scripts/e2e_summary.sh | 2 +- .github/workflows/_linux_build.yml | 3 ++- .github/workflows/_linux_e2e.yml | 4 ++-- .github/workflows/_linux_op_benchmark.yml | 4 ++-- .github/workflows/_linux_ut.yml | 4 ++-- 9 files changed, 16 insertions(+), 9 deletions(-) diff --git a/.github/actions/linux-e2etest/action.yml b/.github/actions/linux-e2etest/action.yml index 52ef8a4cc9..a8c1865aac 100644 --- a/.github/actions/linux-e2etest/action.yml +++ b/.github/actions/linux-e2etest/action.yml @@ -27,6 +27,8 @@ inputs: runs: using: composite + env: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) shell: bash -x {0} diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 04b0fbcd86..e94525082e 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -10,6 +10,8 @@ permissions: read-all runs: using: composite + env: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: requirements shell: bash -xe {0} diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index 5fc3a9993c..10425ffa8c 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -22,6 +22,8 @@ inputs: runs: using: composite + env: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: Prepare ENV if: ${{ inputs.env_prepare }} diff --git a/.github/ci_expected_accuracy/check_expected.py b/.github/ci_expected_accuracy/check_expected.py index 48c09606de..3c82666af0 100644 --- 
a/.github/ci_expected_accuracy/check_expected.py +++ b/.github/ci_expected_accuracy/check_expected.py @@ -6,7 +6,7 @@ # Reference last updated is https://github.com/intel/torch-xpu-ops/pull/1223 parser = argparse.ArgumentParser(description="Accuracy Check", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("--driver", type=str, default="lts", help="rolling or lts") +parser.add_argument("--driver", type=str, default="rolling", help="rolling or lts") parser.add_argument("--category", type=str, default="inductor", help="inductor") parser.add_argument("--suite", type=str, required=True, help="huggingface, timm_models or torchbench") parser.add_argument("--mode", type=str, required=True, help="inference or training") diff --git a/.github/scripts/e2e_summary.sh b/.github/scripts/e2e_summary.sh index c858f6f3f5..d4ad299b59 100644 --- a/.github/scripts/e2e_summary.sh +++ b/.github/scripts/e2e_summary.sh @@ -98,7 +98,7 @@ Empty means the cases NOT run\n\n" suite="$(echo "${csv}" |sed 's/.*inductor_//;s/_.*//;s/timm/timm_models/')" mode="$(echo "${csv}" |sed 's/_xpu_accuracy.*//;s/.*_//')" dtype="$(echo "${csv}" |sed -E 's/.*inductor_[a-z]*_//;s/models_//;s/_infer.*|_train.*//')" - python "${check_file}" --driver "${LTS_OR_ROLLING:-"lts"}" --suite "${suite}" --mode "${mode}" --dtype "${dtype}" --csv_file "${csv}" > "/tmp/tmp-${suite}-${mode}-${dtype}.txt" + python "${check_file}" --suite "${suite}" --mode "${mode}" --dtype "${dtype}" --csv_file "${csv}" > "/tmp/tmp-${suite}-${mode}-${dtype}.txt" test_result="$(sed 's/, /,/g' "/tmp/tmp-${suite}-${mode}-${dtype}.txt" |awk '{ if($0 ~/Total/){ total = $3; diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index ebb6b6fb46..1b9b1be3aa 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -69,9 +69,10 @@ jobs: - ${{ github.workspace }}:${{ github.workspace }} env: PATH: /tmp/xpu-tool/myvenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - GH_TOKEN: ${{ github.token }} AGENT_TOOLSDIRECTORY: /tmp/xpu-tool PIP_CACHE_DIR: /tmp/xpu-tool/.pipcache + env: + GH_TOKEN: ${{ github.token }} timeout-minutes: 300 steps: - name: Install gh-cli diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 61647e05e8..47b9014c7e 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -82,11 +82,11 @@ jobs: -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - GH_TOKEN: ${{ github.token }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} TORCH_HOME: /tmp/.cache/_torch HF_HOME: /tmp/.cache/_huggingface MODEL_ONLY_NAME: ${{ inputs.model }} + env: + GH_TOKEN: ${{ github.token }} steps: - name: Cleanup workspace run: | diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index a2e3757042..2ab84d571a 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -63,11 +63,11 @@ jobs: -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} env: AGENT_TOOLSDIRECTORY: /opt/xpu-tool - GH_TOKEN: ${{ github.token }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} TORCH_HOME: /tmp/.cache/_torch HF_HOME: /tmp/.cache/_huggingface REFERENCE_ISSUE: 1689 + env: + GH_TOKEN: ${{ github.token }} steps: - name: Cleanup workspace run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml 
index 508d144c9c..8ad953e862 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -74,10 +74,10 @@ jobs: -e ZE_AFFINITY_MASK env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - GH_TOKEN: ${{ github.token }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} TORCH_HOME: /tmp/.cache/_torch HF_HOME: /tmp/.cache/_huggingface + env: + GH_TOKEN: ${{ github.token }} strategy: fail-fast: false matrix: From 29a9fd8fb4301d8e0854967ee12f73fa082e6b09 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 13 Aug 2025 15:06:58 +0800 Subject: [PATCH 147/160] update --- .github/workflows/pull.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 2c6999a520..740fb6bfee 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -113,9 +113,9 @@ jobs: disabled_tests: ${{ needs.conditions-filter.outputs.disabled_tests }} linux-e2e: + name: linux-e2e if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_e2e') }} needs: [conditions-filter, linux-build] - name: ${{ matrix.suite }} strategy: fail-fast: false matrix: From c69854ff2dac3478b57f4bc2fbf133ee0b232a87 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 13 Aug 2025 16:50:28 +0800 Subject: [PATCH 148/160] update --- .github/actions/linux-e2etest/action.yml | 2 -- .github/actions/linux-uttest/action.yml | 2 -- .github/actions/pt2e/action.yml | 2 -- .github/workflows/_linux_e2e.yml | 1 + .github/workflows/_linux_ut.yml | 1 + 5 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/actions/linux-e2etest/action.yml b/.github/actions/linux-e2etest/action.yml index a8c1865aac..52ef8a4cc9 100644 --- a/.github/actions/linux-e2etest/action.yml +++ b/.github/actions/linux-e2etest/action.yml @@ -27,8 +27,6 @@ inputs: runs: using: composite - env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) shell: bash -x {0} diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index e94525082e..04b0fbcd86 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -10,8 +10,6 @@ permissions: read-all runs: using: composite - env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: requirements shell: bash -xe {0} diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index 10425ffa8c..5fc3a9993c 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -22,8 +22,6 @@ inputs: runs: using: composite - env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: Prepare ENV if: ${{ inputs.env_prepare }} diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index 47b9014c7e..9abe81cacb 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -87,6 +87,7 @@ jobs: MODEL_ONLY_NAME: ${{ inputs.model }} env: GH_TOKEN: ${{ github.token }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: Cleanup workspace run: | diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 8ad953e862..33d0b54d8d 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -78,6 +78,7 @@ jobs: HF_HOME: /tmp/.cache/_huggingface env: GH_TOKEN: ${{ github.token }} + HUGGING_FACE_HUB_TOKEN: ${{ 
secrets.HUGGING_FACE_HUB_TOKEN }} strategy: fail-fast: false matrix: From a6b2302547ac1222878bc2ea3e968d507e3d6649 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 13 Aug 2025 17:00:55 +0800 Subject: [PATCH 149/160] merge main --- .github/actions/linux-uttest/action.yml | 30 ++++++++++++++++--------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 04b0fbcd86..e189754464 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -102,32 +102,40 @@ runs: shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_profiling' }} run: | - mkdir -p ut_log/profile_test/issue_reproduce + mkdir -p ut_log/xpu_profiling/issue_reproduce cd pytorch/third_party/torch-xpu-ops # RN50 Test PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0 - cp profiling.fp32.train.pt ${{ github.workspace }}/ut_log/profile_test + cp profiling.fp32.train.pt ${{ github.workspace }}/ut_log/xpu_profiling # All Issue Reproduce UT python -u test/profiling/correlation_id_mixed.py | \ - tee ${{ github.workspace }}/ut_log/profile_test/issue_reproduce/correlation_id_mixed.log + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/correlation_id_mixed.log python -u test/profiling/reproducer.missing.gpu.kernel.time.py | \ - tee ${{ github.workspace }}/ut_log/profile_test/issue_reproduce/reproducer.missing.gpu.kernel.time.log + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/reproducer.missing.gpu.kernel.time.log python -u test/profiling/time_precision_in_profile.py | \ - tee ${{ github.workspace }}/ut_log/profile_test/issue_reproduce/time_precision_in_profile.log + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/time_precision_in_profile.log python -u test/profiling/profile_partial_runtime_ops.py | \ - tee ${{ github.workspace }}/ut_log/profile_test/issue_reproduce/profile_partial_runtime_ops.log + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/profile_partial_runtime_ops.log python -u test/profiling/triton_xpu_ops_time.py | \ - tee ${{ github.workspace }}/ut_log/profile_test/issue_reproduce/triton_xpu_ops_time.log + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/triton_xpu_ops_time.log + + # llama case for calls number test + pip install transformers + python test/profiling/llama.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/llama.log + python .github/scripts/llama_summary.py -i ${{ github.workspace }}/ut_log/xpu_profiling/llama.log -o ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv + bash .github/scripts/check_baseline.sh .github/scripts/llama_baseline.csv ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv + # All xpu ut under test/profiler cd ../../test/profiler python -m pytest --timeout 600 -vs test_cpp_thread.py | \ - tee ${{ github.workspace }}/ut_log/profile_test/test_cpp_thread.log + tee ${{ github.workspace }}/ut_log/xpu_profiling/test_cpp_thread.log python -m pytest --timeout 600 -vs test_execution_trace.py | \ - tee ${{ github.workspace }}/ut_log/profile_test/test_execution_trace.log + tee ${{ github.workspace }}/ut_log/xpu_profiling/test_execution_trace.log python -m pytest --timeout 600 -vs test_memory_profiler.py | \ - tee ${{ github.workspace }}/ut_log/profile_test/test_memory_profiler.log + tee ${{ github.workspace }}/ut_log/xpu_profiling/test_memory_profiler.log python -m pytest --timeout 600 -vs test_profiler_tree.py 
| \ - tee ${{ github.workspace }}/ut_log/profile_test/test_profiler_tree.log + tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log - name: xpu_dev1 shell: bash -xe {0} From 0a17050fc0928a6fa6d76aa1a79b84006fba75af Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Wed, 13 Aug 2025 17:15:31 +0800 Subject: [PATCH 150/160] update --- .github/workflows/nightly_ondemand.yml | 1 + .github/workflows/pull.yml | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 99731123e5..f3d967f1f5 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -178,6 +178,7 @@ jobs: Windows-Nightly-Ondemand-UT-Tests: if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'windows') }} name: windows + needs: [Conditions-Filter] uses: ./.github/workflows/_windows_ut.yml with: ut: ${{ github.event_name == 'schedule' && 'ut_extended,ut_torch' || inputs.ut }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 740fb6bfee..23683fa701 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -51,8 +51,7 @@ jobs: bash third_party/torch-xpu-ops/.github/scripts/lintrunner.sh conditions-filter: - if: ${{ github.event.pull_request.draft == false }} - needs: [preci-lint-check] + if: ${{ github.repository_owner == 'intel' && github.event.pull_request.draft == false }} runs-on: ubuntu-latest timeout-minutes: 10 env: @@ -92,7 +91,7 @@ jobs: linux-build: if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all')}} - needs: [conditions-filter] + needs: [conditions-filter, preci-lint-check] secrets: inherit uses: ./.github/workflows/_linux_build.yml with: @@ -138,7 +137,7 @@ jobs: windows: name: windows if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_win')) }} - needs: [conditions-filter] + needs: [conditions-filter, preci-lint-check] uses: ./.github/workflows/_windows_ut.yml with: ut: ut_extended,ut_torch From a7257b0f5a2b63973adda86d809745580e19b434 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 14 Aug 2025 09:50:29 +0800 Subject: [PATCH 151/160] modify e2e summary --- .github/workflows/_linux_e2e_summary.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_linux_e2e_summary.yml b/.github/workflows/_linux_e2e_summary.yml index d65e47a3ae..746bc1d565 100644 --- a/.github/workflows/_linux_e2e_summary.yml +++ b/.github/workflows/_linux_e2e_summary.yml @@ -46,7 +46,8 @@ jobs: cd target/ target_dir="Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-*" gh --repo ${GITHUB_REPOSITORY} run download ${GITHUB_RUN_ID} -p "${target_dir}" - mv Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-*/* . + find Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-*/ -maxdepth 1 -mindepth 1 -type d |\ + while read line; do mv $line .; done - name: Download Baseline Artifact run: | mkdir baseline/ @@ -56,7 +57,7 @@ jobs: REFERENCE_RUN_ID="$(cat body.txt |grep "Inductor-${artifact_type}-LTS2" |sed 's/.*: *//' || echo '')" if [ "${REFERENCE_RUN_ID}" != "" ];then gh --repo intel/torch-xpu-ops run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - mv Inductor-*-XPU-E2E-*/* . + find Inductor-*-XPU-E2E-*/ -maxdepth 1 -mindepth 1 -type d |while read line; do mv $line .; done fi - name: Get summary if: ${{ ! 
cancelled() }} From 8a54cfaa4788147c20056353970f1729f3379dbc Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 14 Aug 2025 10:47:08 +0800 Subject: [PATCH 152/160] modify on-demand test --- .github/workflows/nightly_ondemand.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index f3d967f1f5..cc7b1ac428 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -91,14 +91,12 @@ jobs: torch_xpu_ops="main" fi else + pytorch="${{ inputs.pytorch }}" + torch_xpu_ops="${{ inputs.torch_xpu_ops }}" if [[ "${{ inputs.pytorch }}" == *"_wheel" ]];then test_type="wheel-ondemand" - pytorch="${{ inputs.pytorch }}" - torch_xpu_ops="pinned" else test_type="build-ondemand" - pytorch="${{ inputs.pytorch }}" - torch_xpu_ops="${{ inputs.torch_xpu_ops }}" fi fi echo "test_type=${test_type}" >> ${GITHUB_OUTPUT} From 23f097f0dc60c1100791c9d576d6900c154e817e Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 14 Aug 2025 10:54:30 +0800 Subject: [PATCH 153/160] modify on-demand test --- .github/actions/linux-testenv/action.yml | 2 +- .github/workflows/nightly_ondemand.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 4250ceed9e..188dacc29b 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -145,7 +145,7 @@ runs: TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi fi - if [ "${{ inputs.torch_xpu_ops }}" == "cicd" ];then + if [ "${{ inputs.torch_xpu_ops }}" == "cicd" ] || [ "${{ inputs.torch_xpu_ops }}" == "triggered" ];then cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops cd third_party/torch-xpu-ops else diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index cc7b1ac428..93826b34a8 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -16,8 +16,8 @@ on: description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' - description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + default: 'triggered' + description: Torch-xpu-ops workflow triggered branch by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin triton: type: string default: 'pinned' From a047accd26af551195937116ebe8296391cdea73 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 14 Aug 2025 16:57:44 +0800 Subject: [PATCH 154/160] rebase --- .github/scripts/build.sh | 67 ++++---- .github/scripts/env.sh | 2 +- .github/workflows/_linux_build.yml | 251 ++++++++++++----------------- 3 files changed, 134 insertions(+), 186 deletions(-) diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index 41c46c99b1..5bce6eacdf 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -1,18 +1,18 @@ #!/bin/bash # Usage: # ./build.sh --WORKSPACE= \ -# --PYTORCH_REPO= --PYTORCH_COMMIT= \ +# --PYTORCH_REPO= --PYTORCH_VERSION= \ # --TORCH_XPU_OPS_REPO= \ -# --TORCH_XPU_OPS_COMMIT= +# --TORCH_XPU_OPS_VERSION= set -xe export GIT_PAGER=cat # Init params WORKSPACE=$(realpath ${WORKSPACE:-"/tmp"}) PYTORCH_REPO=${PYTORCH_REPO:-"https://github.com/pytorch/pytorch.git"} -PYTORCH_COMMIT=${PYTORCH_COMMIT:-"main"} +PYTORCH_VERSION=${PYTORCH_VERSION:-"main"} 
TORCH_XPU_OPS_REPO=${TORCH_XPU_OPS_REPO:-"https://github.com/intel/torch-xpu-ops.git"} -TORCH_XPU_OPS_COMMIT=${TORCH_XPU_OPS_COMMIT:-"main"} +TORCH_XPU_OPS_VERSION=${TORCH_XPU_OPS_VERSION:-"main"} for var; do eval "export $(echo ${var@Q} |sed "s/^'-*//g;s/=/='/")" done @@ -21,20 +21,20 @@ done rm -rf ${WORKSPACE}/pytorch git clone ${PYTORCH_REPO} ${WORKSPACE}/pytorch cd ${WORKSPACE}/pytorch -git checkout ${PYTORCH_COMMIT} +git checkout ${PYTORCH_VERSION} git remote -v && git branch && git show -s git rev-parse HEAD > ${WORKSPACE}/pytorch.commit # Set torch-xpu-ops -if [ "${TORCH_XPU_OPS_COMMIT,,}" == "pinned" ];then +if [ "${TORCH_XPU_OPS_VERSION,,}" == "pinned" ];then TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" - TORCH_XPU_OPS_COMMIT="$(cat ${WORKSPACE}/pytorch/third_party/xpu.txt)" + TORCH_XPU_OPS_VERSION="$(cat ${WORKSPACE}/pytorch/third_party/xpu.txt)" fi -if [ "${TORCH_XPU_OPS_COMMIT,,}" != "cicd" ];then +if [ "${TORCH_XPU_OPS_VERSION,,}" != "cicd" ];then rm -rf ${WORKSPACE}/torch-xpu-ops git clone ${TORCH_XPU_OPS_REPO} ${WORKSPACE}/torch-xpu-ops cd ${WORKSPACE}/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_COMMIT} + git checkout ${TORCH_XPU_OPS_VERSION} fi cd ${WORKSPACE}/torch-xpu-ops git remote -v && git branch && git show -s @@ -48,33 +48,30 @@ python -m pip install requests python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py git submodule sync && git submodule update --init --recursive python -m pip install -r requirements.txt -python -m pip install mkl-static==2025.1.0 mkl-include==2025.1.0 +python -m pip install mkl-static mkl-include export USE_STATIC_MKL=1 -export USE_XCCL=1 -if [ "${XPU_ONEAPI_PATH}" == "" ];then - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ - intel-cmplr-lib-rt==2025.1.1 | \ - intel-cmplr-lib-ur==2025.1.1 | \ - intel-cmplr-lic-rt==2025.1.1 | \ - intel-sycl-rt==2025.1.1 | \ - impi-rt==2021.15.0 | \ - dpcpp-cpp-rt==2025.1.1 | \ - oneccl-devel==2021.15.2 | \ - oneccl==2021.15.2 | \ - mkl==2025.1.0 | \ - onemkl-sycl-blas==2025.1.0 | \ - onemkl-sycl-dft==2025.1.0 | \ - onemkl-sycl-lapack==2025.1.0 | \ - onemkl-sycl-rng==2025.1.0 | \ - onemkl-sycl-sparse==2025.1.0 | \ - intel-opencl-rt==2025.1.1 | \ - intel-openmp==2025.1.1 | \ - tbb==2022.1.0 | \ - tcmlib==1.3.0 | \ - umf==0.10.0 | \ - intel-pti==0.12.3 - " -fi +export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ + intel-cmplr-lib-rt==2025.1.1 | \ + intel-cmplr-lib-ur==2025.1.1 | \ + intel-cmplr-lic-rt==2025.1.1 | \ + intel-sycl-rt==2025.1.1 | \ + oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ + oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ + impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | \ + onemkl-sycl-blas==2025.1.0 | \ + onemkl-sycl-dft==2025.1.0 | \ + onemkl-sycl-lapack==2025.1.0 | \ + onemkl-sycl-rng==2025.1.0 | \ + onemkl-sycl-sparse==2025.1.0 | \ + dpcpp-cpp-rt==2025.1.1 | \ + intel-opencl-rt==2025.1.1 | \ + mkl==2025.1.0 | \ + intel-openmp==2025.1.1 | \ + tbb==2022.1.0 | \ + tcmlib==1.3.0 | \ + umf==0.10.0 | \ + intel-pti==0.12.3 +" # Build sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh index d0f7cfd338..3b17170385 100755 --- a/.github/scripts/env.sh +++ b/.github/scripts/env.sh @@ -1,6 +1,6 @@ #!/bin/bash -XPU_ONEAPI_PATH="${XPU_ONEAPI_PATH:-"/opt/intel/oneapi"}" +XPU_ONEAPI_PATH=${XPU_ONEAPI_PATH:-"/opt/intel/oneapi"} source 
${XPU_ONEAPI_PATH}/compiler/latest/env/vars.sh source ${XPU_ONEAPI_PATH}/pti/latest/env/vars.sh diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 1b9b1be3aa..8fbed99275 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -3,214 +3,165 @@ name: Linux PyTorch XPU Build on: workflow_call: inputs: - runner: - required: true - type: string - default: 'pvc_rolling' - description: Runner label - test_type: - type: string - default: 'build-from-source' - description: Build from source or install nightly wheel pytorch: + required: true type: string default: 'main' - description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' - torch_xpu_ops: - type: string - default: 'main' - description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin - triton: + description: Pytorch branch/commit + keep_torch_xpu_ops: required: false type: string - default: 'pinned' - description: Triton pinned by pytorch by default, or 'commit/branch', or 'repo@commit/repo@branch' - oneapi: + default: 'false' + description: Keep torch-xpu-ops pin. `true` means use pined commit + driver: + required: false type: string - default: 'installed' - description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + default: 'lts' + description: Driver lts/rolling python: + required: false type: string default: '3.10' description: Python version + runner: + required: true + type: string + default: 'linux.idc.xpu' + description: Runner label + triton: + required: false + type: string + default: '' + description: Triton commit. Use pytorch pined commit by default + outputs: + torch_commit_id: + description: The commit id of the torch build + value: ${{ jobs.build.outputs.TORCH_COMMIT_ID }} permissions: read-all -defaults: - run: - shell: bash -xe {0} - jobs: - runner: - runs-on: ${{ inputs.runner }} - outputs: - runner_id: ${{ steps.runner-info.outputs.runner_id }} - user_id: ${{ steps.runner-info.outputs.user_id }} - render_id: ${{ steps.runner-info.outputs.render_id }} - hostname: ${{ steps.runner-info.outputs.hostname }} - steps: - - name: Cleanup workspace - run: | - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Get runner - id: runner-info - uses: ./.github/actions/get-runner - build: - name: ${{ inputs.pytorch }} - needs: runner - if: ${{ ! 
contains(inputs.test_type, 'wheel') }} - runs-on: ${{ needs.runner.outputs.runner_id }} + runs-on: ${{ inputs.runner }} container: image: 'pytorch/manylinux2_28-builder:xpu-main' volumes: - ${{ github.workspace }}:${{ github.workspace }} env: - PATH: /tmp/xpu-tool/myvenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - PIP_CACHE_DIR: /tmp/xpu-tool/.pipcache - env: - GH_TOKEN: ${{ github.token }} + PATH: /opt/xpu-build/bin:/usr/share/Modules/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + commit_issue: 1280 + GH_TOKEN: ${{ github.token }} + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + outputs: + TORCH_COMMIT_ID: ${{ steps.build_version.outputs.TORCH_COMMIT_ID }} timeout-minutes: 300 steps: - - name: Install gh-cli + - name: Setup based env run: | - cat /etc/os-release - hostname && id # Cleanup workspace - find ./ |grep -v "^\./$" |xargs rm -rf - # install gh - dnf install -y 'dnf-command(config-manager)' + rm -rf ${{ github.workspace }}/* + # Install gh + dnf install 'dnf-command(config-manager)' dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo - dnf install -y gh --repo gh-cli - gh --version - - name: Setup python-${{ inputs.python }} - run: | - rm -rf /tmp/xpu-tool/myvenv + dnf autoremove -y git236* && dnf install -y git + dnf install gh --repo gh-cli -y + # Setup python local_python=$(echo ${{ inputs.python }} |awk -F. '{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') - /opt/python/${local_python}/bin/python -m venv /tmp/xpu-tool/myvenv - which python && python -V - which pip && pip list + /opt/python/${local_python}/bin/python -m venv /opt/xpu-build + which python && python -V && pip list pip install -U pip wheel setuptools - name: Checkout torch-xpu-ops uses: actions/checkout@v4 with: path: torch-xpu-ops - - name: Build Pytorch on ${{ needs.runner.outputs.hostname }} + - name: Build Triton XPU run: | - # only build pvc for CI - if [ "${{ inputs.test_type }}" == "build-cicd" ];then - export TORCH_XPU_ARCH_LIST='pvc' + # gcc 13 + dnf install -y gcc-toolset-13-gcc-c++ + source /opt/rh/gcc-toolset-13/enable + dnf install -y zlib-devel + cd ../ && rm -rf pytorch + git clone https://github.com/pytorch/pytorch pytorch + cd pytorch + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install cmake ninja pybind11 + rm -rf pytorch_triton_xpu-*.whl + TRITON_VERSION_NAME="$( + curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ + grep '__version__' |head -n 1 |awk -F "'" '{print $2}' + )" + python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} + cp pytorch_triton_xpu-*.whl ${{ github.workspace }} + fi + - name: Build Pytorch XPU + run: | + set -xe -o pipefail + if [ "${{ inputs.driver }}" == "lts" ]; then + export TORCH_XPU_ARCH_LIST='pvc' fi if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" - PYTORCH_COMMIT="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" + PYTORCH_VERSION="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" else PYTORCH_REPO="https://github.com/pytorch/pytorch.git" - PYTORCH_COMMIT="${{ inputs.pytorch }}" + 
PYTORCH_VERSION="${{ inputs.pytorch }}" fi - if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then - TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" - TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" + if [[ "${{ inputs.keep_torch_xpu_ops }}" == *"https://"* ]];then + TORCH_XPU_OPS_REPO="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/@.*//')" + TORCH_XPU_OPS_VERSION="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/.*@//')" + elif [ "${{ inputs.keep_torch_xpu_ops }}" == "true" ];then + TORCH_XPU_OPS_VERSION="pinned" else - TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" - TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" + TORCH_XPU_OPS_VERSION="cicd" fi - # gcc 11 - source /opt/rh/gcc-toolset-11/enable # oneAPI DLE - if [ "${{ inputs.oneapi }}" != "installed" ];then - rm -rf ${HOME}/intel ${HOME}/.intel /opt/intel - wget -q -O oneapi.sh "${{ inputs.oneapi }}" - bash oneapi.sh -a -s --eula accept --action install --install-dir /opt/intel/oneapi - export XPU_ONEAPI_PATH="/opt/intel/oneapi" - fi source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh + # gcc 11 + source /opt/rh/gcc-toolset-11/enable + export USE_XCCL=1 ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ --WORKSPACE="${{ github.workspace }}" \ --PYTORCH_REPO="${PYTORCH_REPO}" \ - --PYTORCH_COMMIT="${PYTORCH_COMMIT}" \ + --PYTORCH_VERSION="${PYTORCH_VERSION}" \ --TORCH_XPU_OPS_REPO="${TORCH_XPU_OPS_REPO}" \ - --TORCH_XPU_OPS_COMMIT="${TORCH_XPU_OPS_COMMIT}" \ - 2>&1 |tee ${{ github.workspace }}/build_pytorch_${PYTORCH_COMMIT//\//-}.log - if [ $(ls ${{ github.workspace }} |grep -c "torch-.*.whl") -eq 0 ];then - echo "Build pytorch got failed" - exit 1 - fi - - name: Build Triton - run: | - # gcc 13 - dnf install -y gcc-toolset-13-gcc-c++ zlib-devel - source /opt/rh/gcc-toolset-13/enable - cd ./pytorch - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - if [ "${{ inputs.triton }}" != "pinned" ];then - TRITON_COMMIT_ID="${{ inputs.triton }}" - else - TRITON_COMMIT_ID="$(cat .ci/docker/ci_commit_pins/triton-xpu.txt)" - fi - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} \ - 2>&1 |tee ${{ github.workspace }}/build_triton_${TRITON_COMMIT_ID}.log - if [ $(ls |grep -c "pytorch_triton_xpu-.*.whl") -eq 0 ];then - echo "Build triton got failed" - exit 1 - fi - pip install pytorch_triton_xpu-*.whl - cp pytorch_triton_xpu-*.whl ${{ github.workspace }} - - name: Build Torchvision and Torchaudio - run: | - # gcc 13 - dnf install -y gcc-toolset-13-gcc-c++ zlib-devel - source /opt/rh/gcc-toolset-13/enable - cd ./pytorch - TORCHVISION_COMMIT_ID="$(cat .github/ci_commit_pins/vision.txt)" - TORCHAUDIO_COMMIT_ID="$(cat .github/ci_commit_pins/audio.txt)" - git clone --single-branch -b main https://github.com/pytorch/vision.git xpu-vision - cd xpu-vision && git checkout ${TORCHVISION_COMMIT_ID} - python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_vision_${TRITON_COMMIT_ID}.log - if [ $(ls dist/ |grep -c "torchvision-.*.whl") -eq 0 ];then - echo "Build torchvision got failed" - exit 1 - fi - pip install dist/*.whl - cp dist/*.whl ${{ github.workspace }} - git clone --single-branch -b main 
https://github.com/pytorch/audio.git xpu-audio - cd xpu-audio && git checkout ${TORCHAUDIO_COMMIT_ID} - python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_audio_${TRITON_COMMIT_ID}.log - if [ $(ls dist/ |grep -c "torchaudio-.*.whl") -eq 0 ];then - echo "Build torchaudio got failed" - exit 1 - fi - pip install dist/*.whl - cp dist/*.whl ${{ github.workspace }} + --TORCH_XPU_OPS_VERSION="${TORCH_XPU_OPS_VERSION}" \ + 2>&1 |tee ${{ github.workspace }}/pytorch_build_${PYTORCH_VERSION//\//-}.log - name: Torch Config run: | - printenv python -c "import torch; print(torch.__config__.show())" python -c "import torch; print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" - python -c "import triton; print(triton.__version__)" - python -c "import torchvision; print(torchvision.__version__)" - python -c "import torchaudio; print(torchaudio.__version__)" python pytorch/torch/utils/collect_env.py - pip list |grep -E 'torch|intel' - chmod 777 /__w -R + - name: Identify Build version + id: build_version + run: | + echo "TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" - name: Upload Torch XPU Wheel - if: ${{ success() }} + if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/*.whl + path: ${{ github.workspace }}/torch*.whl + - name: Upload Triton Wheel + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Triton-Wheel-${{ github.event.pull_request.number || github.sha }} + path: ${{ github.workspace }}/pytorch_triton_xpu-*.whl - name: Upload Build Log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/build_*.log + path: ${{ github.workspace }}/pytorch_*.log + - name: Cleanup + if: always() + run: | + chmod 777 . 
-R + rm -rf pytorch torch-xpu-ops pytorch_*.log torch*.whl pytorch_triton_xpu-*.whl From 7df6ea3b32dd4a2feb51450c47fffa46b9cc2f7a Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 14 Aug 2025 17:00:49 +0800 Subject: [PATCH 155/160] rebase --- .github/scripts/build.sh | 66 ++++---- .github/scripts/env.sh | 2 +- .github/workflows/_linux_build.yml | 252 +++++++++++++++++------------ 3 files changed, 186 insertions(+), 134 deletions(-) diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index 5bce6eacdf..001e5c9b44 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -1,18 +1,18 @@ #!/bin/bash # Usage: # ./build.sh --WORKSPACE= \ -# --PYTORCH_REPO= --PYTORCH_VERSION= \ +# --PYTORCH_REPO= --PYTORCH_COMMIT= \ # --TORCH_XPU_OPS_REPO= \ -# --TORCH_XPU_OPS_VERSION= +# --TORCH_XPU_OPS_COMMIT= set -xe export GIT_PAGER=cat # Init params WORKSPACE=$(realpath ${WORKSPACE:-"/tmp"}) PYTORCH_REPO=${PYTORCH_REPO:-"https://github.com/pytorch/pytorch.git"} -PYTORCH_VERSION=${PYTORCH_VERSION:-"main"} +PYTORCH_COMMIT=${PYTORCH_COMMIT:-"main"} TORCH_XPU_OPS_REPO=${TORCH_XPU_OPS_REPO:-"https://github.com/intel/torch-xpu-ops.git"} -TORCH_XPU_OPS_VERSION=${TORCH_XPU_OPS_VERSION:-"main"} +TORCH_XPU_OPS_COMMIT=${TORCH_XPU_OPS_COMMIT:-"main"} for var; do eval "export $(echo ${var@Q} |sed "s/^'-*//g;s/=/='/")" done @@ -21,20 +21,20 @@ done rm -rf ${WORKSPACE}/pytorch git clone ${PYTORCH_REPO} ${WORKSPACE}/pytorch cd ${WORKSPACE}/pytorch -git checkout ${PYTORCH_VERSION} +git checkout ${PYTORCH_COMMIT} git remote -v && git branch && git show -s git rev-parse HEAD > ${WORKSPACE}/pytorch.commit # Set torch-xpu-ops -if [ "${TORCH_XPU_OPS_VERSION,,}" == "pinned" ];then +if [ "${TORCH_XPU_OPS_COMMIT,,}" == "pinned" ];then TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" - TORCH_XPU_OPS_VERSION="$(cat ${WORKSPACE}/pytorch/third_party/xpu.txt)" + TORCH_XPU_OPS_COMMIT="$(cat ${WORKSPACE}/pytorch/third_party/xpu.txt)" fi -if [ "${TORCH_XPU_OPS_VERSION,,}" != "cicd" ];then +if [ "${TORCH_XPU_OPS_COMMIT,,}" != "cicd" ];then rm -rf ${WORKSPACE}/torch-xpu-ops git clone ${TORCH_XPU_OPS_REPO} ${WORKSPACE}/torch-xpu-ops cd ${WORKSPACE}/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_VERSION} + git checkout ${TORCH_XPU_OPS_COMMIT} fi cd ${WORKSPACE}/torch-xpu-ops git remote -v && git branch && git show -s @@ -48,30 +48,32 @@ python -m pip install requests python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py git submodule sync && git submodule update --init --recursive python -m pip install -r requirements.txt -python -m pip install mkl-static mkl-include +python -m pip install mkl-static==2025.1.0 mkl-include==2025.1.0 export USE_STATIC_MKL=1 -export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ - intel-cmplr-lib-rt==2025.1.1 | \ - intel-cmplr-lib-ur==2025.1.1 | \ - intel-cmplr-lic-rt==2025.1.1 | \ - intel-sycl-rt==2025.1.1 | \ - oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - onemkl-sycl-blas==2025.1.0 | \ - onemkl-sycl-dft==2025.1.0 | \ - onemkl-sycl-lapack==2025.1.0 | \ - onemkl-sycl-rng==2025.1.0 | \ - onemkl-sycl-sparse==2025.1.0 | \ - dpcpp-cpp-rt==2025.1.1 | \ - intel-opencl-rt==2025.1.1 | \ - mkl==2025.1.0 | \ - intel-openmp==2025.1.1 | \ - tbb==2022.1.0 | \ - tcmlib==1.3.0 | \ - umf==0.10.0 | \ - intel-pti==0.12.3 -" +if [ "${XPU_ONEAPI_PATH}" == "" ];then + export 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ + intel-cmplr-lib-rt==2025.1.1 | \ + intel-cmplr-lib-ur==2025.1.1 | \ + intel-cmplr-lic-rt==2025.1.1 | \ + intel-sycl-rt==2025.1.1 | \ + impi-rt==2021.15.0 | \ + dpcpp-cpp-rt==2025.1.1 | \ + oneccl-devel==2021.15.2 | \ + oneccl==2021.15.2 | \ + mkl==2025.1.0 | \ + onemkl-sycl-blas==2025.1.0 | \ + onemkl-sycl-dft==2025.1.0 | \ + onemkl-sycl-lapack==2025.1.0 | \ + onemkl-sycl-rng==2025.1.0 | \ + onemkl-sycl-sparse==2025.1.0 | \ + intel-opencl-rt==2025.1.1 | \ + intel-openmp==2025.1.1 | \ + tbb==2022.1.0 | \ + tcmlib==1.3.0 | \ + umf==0.10.0 | \ + intel-pti==0.12.3 + " +fi # Build sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh index 3b17170385..d0f7cfd338 100755 --- a/.github/scripts/env.sh +++ b/.github/scripts/env.sh @@ -1,6 +1,6 @@ #!/bin/bash -XPU_ONEAPI_PATH=${XPU_ONEAPI_PATH:-"/opt/intel/oneapi"} +XPU_ONEAPI_PATH="${XPU_ONEAPI_PATH:-"/opt/intel/oneapi"}" source ${XPU_ONEAPI_PATH}/compiler/latest/env/vars.sh source ${XPU_ONEAPI_PATH}/pti/latest/env/vars.sh diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 8fbed99275..ccbac87b3d 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -3,165 +3,215 @@ name: Linux PyTorch XPU Build on: workflow_call: inputs: - pytorch: + runner: required: true + type: string + default: 'pvc_rolling' + description: Runner label + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel + pytorch: type: string default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - driver: + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + triton: required: false type: string - default: 'lts' - description: Driver lts/rolling + default: 'pinned' + description: Triton pinned by pytorch by default, or 'commit/branch', or 'repo@commit/repo@branch' + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed python: - required: false type: string default: '3.10' description: Python version - runner: - required: true - type: string - default: 'linux.idc.xpu' - description: Runner label - triton: - required: false - type: string - default: '' - description: Triton commit. Use pytorch pined commit by default - outputs: - torch_commit_id: - description: The commit id of the torch build - value: ${{ jobs.build.outputs.TORCH_COMMIT_ID }} permissions: read-all +defaults: + run: + shell: bash -xe {0} + jobs: - build: + runner: runs-on: ${{ inputs.runner }} + outputs: + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner + + build: + name: ${{ inputs.pytorch }} + needs: runner + if: ${{ ! 
contains(inputs.test_type, 'wheel') }} + runs-on: ${{ needs.runner.outputs.runner_id }} container: image: 'pytorch/manylinux2_28-builder:xpu-main' volumes: - ${{ github.workspace }}:${{ github.workspace }} env: - PATH: /opt/xpu-build/bin:/usr/share/Modules/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - commit_issue: 1280 - GH_TOKEN: ${{ github.token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - outputs: - TORCH_COMMIT_ID: ${{ steps.build_version.outputs.TORCH_COMMIT_ID }} + PATH: /tmp/xpu-tool/myvenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + PIP_CACHE_DIR: /tmp/xpu-tool/.pipcache + env: + GH_TOKEN: ${{ github.token }} timeout-minutes: 300 steps: - - name: Setup based env + - name: Install gh-cli run: | + cat /etc/os-release + hostname && id # Cleanup workspace - rm -rf ${{ github.workspace }}/* - # Install gh - dnf install 'dnf-command(config-manager)' + find ./ |grep -v "^\./$" |xargs rm -rf + # install gh + dnf install -y 'dnf-command(config-manager)' dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo - dnf autoremove -y git236* && dnf install -y git - dnf install gh --repo gh-cli -y - # Setup python + dnf install -y gh --repo gh-cli + gh --version + - name: Setup python-${{ inputs.python }} + run: | + rm -rf /tmp/xpu-tool/myvenv local_python=$(echo ${{ inputs.python }} |awk -F. '{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') - /opt/python/${local_python}/bin/python -m venv /opt/xpu-build - which python && python -V && pip list + /opt/python/${local_python}/bin/python -m venv /tmp/xpu-tool/myvenv + which python && python -V + which pip && pip list pip install -U pip wheel setuptools - name: Checkout torch-xpu-ops uses: actions/checkout@v4 with: path: torch-xpu-ops - - name: Build Triton XPU + - name: Build Pytorch on ${{ needs.runner.outputs.hostname }} run: | - # gcc 13 - dnf install -y gcc-toolset-13-gcc-c++ - source /opt/rh/gcc-toolset-13/enable - dnf install -y zlib-devel - cd ../ && rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch - if [ -z ${{ inputs.triton }} ]; then - TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" - else - TRITON_COMMIT_ID="${{ inputs.triton }}" - fi - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - cp pytorch_triton_xpu-*.whl ${{ github.workspace }} - fi - - name: Build Pytorch XPU - run: | - set -xe -o pipefail - if [ "${{ inputs.driver }}" == "lts" ]; then - export TORCH_XPU_ARCH_LIST='pvc' + export USE_XCCL=1 + # only build pvc for CI + if [ "${{ inputs.test_type }}" == "build-cicd" ];then + export TORCH_XPU_ARCH_LIST='pvc' fi if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" - PYTORCH_VERSION="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" + PYTORCH_COMMIT="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" else PYTORCH_REPO="https://github.com/pytorch/pytorch.git" - PYTORCH_VERSION="${{ inputs.pytorch }}" + 
PYTORCH_COMMIT="${{ inputs.pytorch }}" fi - if [[ "${{ inputs.keep_torch_xpu_ops }}" == *"https://"* ]];then - TORCH_XPU_OPS_REPO="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/@.*//')" - TORCH_XPU_OPS_VERSION="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/.*@//')" - elif [ "${{ inputs.keep_torch_xpu_ops }}" == "true" ];then - TORCH_XPU_OPS_VERSION="pinned" + if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then + TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" + TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" else - TORCH_XPU_OPS_VERSION="cicd" + TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" + TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi - # oneAPI DLE - source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh # gcc 11 source /opt/rh/gcc-toolset-11/enable - export USE_XCCL=1 + # oneAPI DLE + if [ "${{ inputs.oneapi }}" != "installed" ];then + rm -rf ${HOME}/intel ${HOME}/.intel /opt/intel + wget -q -O oneapi.sh "${{ inputs.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir /opt/intel/oneapi + export XPU_ONEAPI_PATH="/opt/intel/oneapi" + fi + source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ --WORKSPACE="${{ github.workspace }}" \ --PYTORCH_REPO="${PYTORCH_REPO}" \ - --PYTORCH_VERSION="${PYTORCH_VERSION}" \ + --PYTORCH_COMMIT="${PYTORCH_COMMIT}" \ --TORCH_XPU_OPS_REPO="${TORCH_XPU_OPS_REPO}" \ - --TORCH_XPU_OPS_VERSION="${TORCH_XPU_OPS_VERSION}" \ - 2>&1 |tee ${{ github.workspace }}/pytorch_build_${PYTORCH_VERSION//\//-}.log + --TORCH_XPU_OPS_COMMIT="${TORCH_XPU_OPS_COMMIT}" \ + 2>&1 |tee ${{ github.workspace }}/build_pytorch_${PYTORCH_COMMIT//\//-}.log + if [ $(ls ${{ github.workspace }} |grep -c "torch-.*.whl") -eq 0 ];then + echo "Build pytorch got failed" + exit 1 + fi + - name: Build Triton + run: | + # gcc 13 + dnf install -y gcc-toolset-13-gcc-c++ zlib-devel + source /opt/rh/gcc-toolset-13/enable + cd ./pytorch + pip install cmake ninja pybind11 + rm -rf pytorch_triton_xpu-*.whl + if [ "${{ inputs.triton }}" != "pinned" ];then + TRITON_COMMIT_ID="${{ inputs.triton }}" + else + TRITON_COMMIT_ID="$(cat .ci/docker/ci_commit_pins/triton-xpu.txt)" + fi + TRITON_VERSION_NAME="$( + curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ + grep '__version__' |head -n 1 |awk -F "'" '{print $2}' + )" + python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} \ + 2>&1 |tee ${{ github.workspace }}/build_triton_${TRITON_COMMIT_ID}.log + if [ $(ls |grep -c "pytorch_triton_xpu-.*.whl") -eq 0 ];then + echo "Build triton got failed" + exit 1 + fi + pip install pytorch_triton_xpu-*.whl + cp pytorch_triton_xpu-*.whl ${{ github.workspace }} + - name: Build Torchvision and Torchaudio + run: | + # gcc 13 + dnf install -y gcc-toolset-13-gcc-c++ zlib-devel + source /opt/rh/gcc-toolset-13/enable + cd ./pytorch + TORCHVISION_COMMIT_ID="$(cat .github/ci_commit_pins/vision.txt)" + TORCHAUDIO_COMMIT_ID="$(cat .github/ci_commit_pins/audio.txt)" + git clone --single-branch -b main https://github.com/pytorch/vision.git xpu-vision + cd xpu-vision && git checkout ${TORCHVISION_COMMIT_ID} + python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_vision_${TRITON_COMMIT_ID}.log + if [ $(ls dist/ |grep -c "torchvision-.*.whl") -eq 0 ];then + echo "Build 
torchvision got failed" + exit 1 + fi + pip install dist/*.whl + cp dist/*.whl ${{ github.workspace }} + git clone --single-branch -b main https://github.com/pytorch/audio.git xpu-audio + cd xpu-audio && git checkout ${TORCHAUDIO_COMMIT_ID} + python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_audio_${TRITON_COMMIT_ID}.log + if [ $(ls dist/ |grep -c "torchaudio-.*.whl") -eq 0 ];then + echo "Build torchaudio got failed" + exit 1 + fi + pip install dist/*.whl + cp dist/*.whl ${{ github.workspace }} - name: Torch Config run: | + printenv python -c "import torch; print(torch.__config__.show())" python -c "import torch; print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import triton; print(triton.__version__)" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" python pytorch/torch/utils/collect_env.py - - name: Identify Build version - id: build_version - run: | - echo "TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" + pip list |grep -E 'torch|intel' + chmod 777 /__w -R - name: Upload Torch XPU Wheel - if: ${{ ! cancelled() }} + if: ${{ success() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/torch*.whl - - name: Upload Triton Wheel - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Triton-Wheel-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/pytorch_triton_xpu-*.whl + path: ${{ github.workspace }}/*.whl - name: Upload Build Log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/pytorch_*.log - - name: Cleanup - if: always() - run: | - chmod 777 . 
-R - rm -rf pytorch torch-xpu-ops pytorch_*.log torch*.whl pytorch_triton_xpu-*.whl + path: ${{ github.workspace }}/build_*.log From 72c4bb5101564f18b71f9ccaabb6aa4d3e91d99c Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 14 Aug 2025 17:08:02 +0800 Subject: [PATCH 156/160] parallel 1 to skip crash only --- .github/actions/linux-uttest/action.yml | 4 ++-- test/xpu/extended/run_test_with_skip.py | 2 +- test/xpu/run_test_with_only.py | 6 +++--- test/xpu/xpu_test_utils.py | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 6d6b8ef606..90b7ac8523 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -22,7 +22,7 @@ runs: run: | mkdir -p ut_log/ut_regression cd pytorch/third_party/torch-xpu-ops/test/regressions - pytest --timeout 600 -n 4 --timeout_method=thread -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \ + pytest --timeout 600 -n 1 --timeout_method=thread -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \ 2> ${{ github.workspace }}/ut_log/ut_regression/ut_regression_test_error.log | \ tee ${{ github.workspace }}/ut_log/ut_regression/ut_regression_test.log - name: ut_transformers @@ -32,7 +32,7 @@ runs: export PYTORCH_TEST_WITH_SLOW=1 mkdir -p ut_log/ut_transformers cd pytorch - pytest --timeout 600 -n 4 --timeout_method=thread -v test/test_transformers.py -k xpu \ + pytest --timeout 600 -n 1 --timeout_method=thread -v test/test_transformers.py -k xpu \ --junit-xml=${{ github.workspace }}/ut_log/ut_transformers.xml \ 2> ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test_error.log | \ tee ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test.log diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 17a8bbeb7a..e062885a6f 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -17,7 +17,7 @@ skip_options += '"' os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" -test_command = "pytest --timeout 600 -n 4 -v --timeout_method=thread --junit-xml=./ut_extended.xml test_ops_xpu.py" +test_command = "pytest --timeout 600 -n 1 -v --timeout_method=thread --junit-xml=./ut_extended.xml test_ops_xpu.py" test_command += skip_options res = os.system(test_command) sys.exit(res) diff --git a/test/xpu/run_test_with_only.py b/test/xpu/run_test_with_only.py index 52bbcc1ced..ca24d3925a 100644 --- a/test/xpu/run_test_with_only.py +++ b/test/xpu/run_test_with_only.py @@ -14,7 +14,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - "pytest --timeout 600 -n 4 -v " + "pytest --timeout 600 -n 1 -v " + "--junit-xml=./ut_op_with_only.xml " + test_case + skip_options @@ -27,7 +27,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - "pytest --timeout 600 -n 4 -v " + "pytest --timeout 600 -n 1 -v " + "--junit-xml=./ut_op_with_only.xml " + test_case + exe_options @@ -35,7 +35,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): return os.system(test_command) else: test_command = ( - "pytest --timeout 600 -n 4 -v --junit-xml=./ut_op_with_only.xml " + "pytest --timeout 600 -n 1 -v --junit-xml=./ut_op_with_only.xml " + test_case ) return os.system(test_command) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 967bc192a6..22b84a1683 
100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1170,7 +1170,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - f"pytest --timeout 600 -n 4 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + f"pytest --timeout 600 -n 1 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) test_command += skip_options @@ -1181,13 +1181,13 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - f"pytest --timeout 600 -n 4 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + f"pytest --timeout 600 -n 1 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) test_command += exe_options else: test_command = ( - f"pytest --timeout 600 -n 4 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + f"pytest --timeout 600 -n 1 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) return os.system(test_command) From d951abfa73b35c63df7af041de1de2587ef1e54a Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 14 Aug 2025 17:09:02 +0800 Subject: [PATCH 157/160] install pytest-xdist --- .github/actions/linux-uttest/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 90b7ac8523..7d56412050 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -15,7 +15,7 @@ runs: shell: bash -xe {0} run: | pip install -r pytorch/.ci/docker/requirements-ci.txt - pip install -U pytest-timeout + pip install -U pytest-timeout pytest-xdist - name: ut_regression shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_regression' }} From d459b6e97297fca7cc26e0de872027b25954bc2c Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 14 Aug 2025 19:27:11 +0800 Subject: [PATCH 158/160] modify --- .github/actions/linux-uttest/action.yml | 24 ++++++++++++++++++------ test/xpu/extended/run_test_with_skip.py | 10 +++++++++- test/xpu/run_test_with_only.py | 17 ++++++++++++----- test/xpu/xpu_test_utils.py | 14 +++++++++++--- 4 files changed, 50 insertions(+), 15 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 7d56412050..1f0c87f2a9 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -16,13 +16,25 @@ runs: run: | pip install -r pytorch/.ci/docker/requirements-ci.txt pip install -U pytest-timeout pytest-xdist + xpu_num=$(clinfo --list |awk 'BEGIN{gpu=0;}{if(gpu==1 && $0~/Platform/){gpu=0;}; if(gpu==1){print $0;}; if($0~/Platform.*Graphics/){gpu=1;}}' |wc -l) + parallel_options="" + if [ ${xpu_num} -gt 1 ];then + parallel_options+=" --dist worksteal " + for x in $(seq 0 $[ ${xpu_num} - 1 ]) + do + parallel_options+=" --tx popen//env:ZE_AFFINITY_MASK=${x} " + done + else + parallel_options+=" -n 1 " + fi + echo " --timeout 600 --timeout_method=thread ${parallel_options} " > ${{ github.workspace }}/test-options.txt - name: ut_regression shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_regression' }} run: | mkdir -p ut_log/ut_regression cd pytorch/third_party/torch-xpu-ops/test/regressions - pytest --timeout 600 -n 1 --timeout_method=thread -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \ + pytest $(cat ${{ github.workspace }}/test-options.txt) -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \ 2> ${{ github.workspace 
}}/ut_log/ut_regression/ut_regression_test_error.log | \ tee ${{ github.workspace }}/ut_log/ut_regression/ut_regression_test.log - name: ut_transformers @@ -32,7 +44,7 @@ runs: export PYTORCH_TEST_WITH_SLOW=1 mkdir -p ut_log/ut_transformers cd pytorch - pytest --timeout 600 -n 1 --timeout_method=thread -v test/test_transformers.py -k xpu \ + pytest $(cat ${{ github.workspace }}/test-options.txt) -v test/test_transformers.py -k xpu \ --junit-xml=${{ github.workspace }}/ut_log/ut_transformers.xml \ 2> ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test_error.log | \ tee ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test.log @@ -128,13 +140,13 @@ runs: # All xpu ut under test/profiler cd ../../test/profiler - python -m pytest --timeout 600 -n 1 -vs test_cpp_thread.py | \ + python -m pytest $(cat ${{ github.workspace }}/test-options.txt) -vs test_cpp_thread.py | \ tee ${{ github.workspace }}/ut_log/xpu_profiling/test_cpp_thread.log - python -m pytest --timeout 600 -n 1 -vs test_execution_trace.py | \ + python -m pytest $(cat ${{ github.workspace }}/test-options.txt) -vs test_execution_trace.py | \ tee ${{ github.workspace }}/ut_log/xpu_profiling/test_execution_trace.log - python -m pytest --timeout 600 -n 1 -vs test_memory_profiler.py | \ + python -m pytest $(cat ${{ github.workspace }}/test-options.txt) -vs test_memory_profiler.py | \ tee ${{ github.workspace }}/ut_log/xpu_profiling/test_memory_profiler.log - python -m pytest --timeout 600 -n 1 -vs test_profiler_tree.py | \ + python -m pytest $(cat ${{ github.workspace }}/test-options.txt) -vs test_profiler_tree.py | \ tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log - name: xpu_dev1 diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index e062885a6f..eac4046155 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -1,5 +1,6 @@ import os import sys +import torch from skip_list_common import skip_dict from skip_list_win import skip_dict as skip_dict_win @@ -16,8 +17,15 @@ skip_options += skip_option skip_options += '"' +# pytest options +xpu_num = torch.xpu.device_count() +parallel_options = ' --dist worksteal ' + \ + ' '.join([f'--tx popen//env:ZE_AFFINITY_MASK={x}' for x in range(xpu_num)]) \ + if xpu_num > 1 else ' -n 1 ' +test_options = f' --timeout 600 --timeout_method=thread {parallel_options} ' + os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" -test_command = "pytest --timeout 600 -n 1 -v --timeout_method=thread --junit-xml=./ut_extended.xml test_ops_xpu.py" +test_command = f" pytest {test_options} -v --junit-xml=./ut_extended.xml test_ops_xpu.py " test_command += skip_options res = os.system(test_command) sys.exit(res) diff --git a/test/xpu/run_test_with_only.py b/test/xpu/run_test_with_only.py index ca24d3925a..e854b12447 100644 --- a/test/xpu/run_test_with_only.py +++ b/test/xpu/run_test_with_only.py @@ -1,5 +1,6 @@ import os import sys +import torch # Cases in the file is too slow to run all suites on CPU. So add white list. 
@@ -7,6 +8,14 @@ def launch_test(test_case, skip_list=None, exe_list=None): os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1" os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" + + # pytest options + xpu_num = torch.xpu.device_count() + parallel_options = ' --dist worksteal ' + \ + ' '.join([f'--tx popen//env:ZE_AFFINITY_MASK={x}' for x in range(xpu_num)]) \ + if xpu_num > 1 else ' -n 1 ' + test_options = f' --timeout 600 --timeout_method=thread {parallel_options} ' + if skip_list is not None: skip_options = ' -k "not ' + skip_list[0] for skip_case in skip_list[1:]: @@ -14,8 +23,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - "pytest --timeout 600 -n 1 -v " - + "--junit-xml=./ut_op_with_only.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_only.xml " + test_case + skip_options ) @@ -27,15 +35,14 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - "pytest --timeout 600 -n 1 -v " - + "--junit-xml=./ut_op_with_only.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_only.xml " + test_case + exe_options ) return os.system(test_command) else: test_command = ( - "pytest --timeout 600 -n 1 -v --junit-xml=./ut_op_with_only.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_only.xml " + test_case ) return os.system(test_command) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 22b84a1683..54a94a3fb4 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1163,6 +1163,14 @@ def copy_tests( def launch_test(test_case, skip_list=None, exe_list=None): os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1" os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" + + # pytest options + xpu_num = torch.xpu.device_count() + parallel_options = ' --dist worksteal ' + \ + ' '.join([f'--tx popen//env:ZE_AFFINITY_MASK={x}' for x in range(xpu_num)]) \ + if xpu_num > 1 else ' -n 1 ' + test_options = f' --timeout 600 --timeout_method=thread {parallel_options} ' + if skip_list is not None: skip_options = ' -k "not ' + skip_list[0] for skip_case in skip_list[1:]: @@ -1170,7 +1178,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - f"pytest --timeout 600 -n 1 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) test_command += skip_options @@ -1181,13 +1189,13 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - f"pytest --timeout 600 -n 1 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) test_command += exe_options else: test_command = ( - f"pytest --timeout 600 -n 1 -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) return os.system(test_command) From 171cc610ba86130b1f459e62ebb6d0bf4333ec1f Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 14 Aug 2025 19:28:06 +0800 Subject: [PATCH 159/160] modify --- .github/actions/linux-uttest/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 1f0c87f2a9..d522dc4691 100644 --- a/.github/actions/linux-uttest/action.yml +++ 
b/.github/actions/linux-uttest/action.yml @@ -27,7 +27,7 @@ runs: else parallel_options+=" -n 1 " fi - echo " --timeout 600 --timeout_method=thread ${parallel_options} " > ${{ github.workspace }}/test-options.txt + printf " --timeout 600 --timeout_method=thread ${parallel_options} " > ${{ github.workspace }}/test-options.txt - name: ut_regression shell: bash -xe {0} if: ${{ inputs.test_type == 'ut_regression' }} From d50cf682c29e6b4357e0d34f49b807e574feebb6 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 14 Aug 2025 21:17:20 +0800 Subject: [PATCH 160/160] lint python --- test/xpu/extended/run_test_with_skip.py | 17 +++++++++++------ test/xpu/run_test_with_only.py | 15 +++++++++------ test/xpu/xpu_test_utils.py | 11 +++++++---- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index eac4046155..49f3be5876 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -1,7 +1,7 @@ import os import sys -import torch +import torch from skip_list_common import skip_dict from skip_list_win import skip_dict as skip_dict_win @@ -19,13 +19,18 @@ # pytest options xpu_num = torch.xpu.device_count() -parallel_options = ' --dist worksteal ' + \ - ' '.join([f'--tx popen//env:ZE_AFFINITY_MASK={x}' for x in range(xpu_num)]) \ - if xpu_num > 1 else ' -n 1 ' -test_options = f' --timeout 600 --timeout_method=thread {parallel_options} ' +parallel_options = ( + " --dist worksteal " + + " ".join([f"--tx popen//env:ZE_AFFINITY_MASK={x}" for x in range(xpu_num)]) + if xpu_num > 1 + else " -n 1 " +) +test_options = f" --timeout 600 --timeout_method=thread {parallel_options} " os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" -test_command = f" pytest {test_options} -v --junit-xml=./ut_extended.xml test_ops_xpu.py " +test_command = ( + f" pytest {test_options} -v --junit-xml=./ut_extended.xml test_ops_xpu.py " +) test_command += skip_options res = os.system(test_command) sys.exit(res) diff --git a/test/xpu/run_test_with_only.py b/test/xpu/run_test_with_only.py index e854b12447..06ebc87e8d 100644 --- a/test/xpu/run_test_with_only.py +++ b/test/xpu/run_test_with_only.py @@ -1,5 +1,6 @@ import os import sys + import torch # Cases in the file is too slow to run all suites on CPU. So add white list. 
@@ -11,10 +12,13 @@ def launch_test(test_case, skip_list=None, exe_list=None): # pytest options xpu_num = torch.xpu.device_count() - parallel_options = ' --dist worksteal ' + \ - ' '.join([f'--tx popen//env:ZE_AFFINITY_MASK={x}' for x in range(xpu_num)]) \ - if xpu_num > 1 else ' -n 1 ' - test_options = f' --timeout 600 --timeout_method=thread {parallel_options} ' + parallel_options = ( + " --dist worksteal " + + " ".join([f"--tx popen//env:ZE_AFFINITY_MASK={x}" for x in range(xpu_num)]) + if xpu_num > 1 + else " -n 1 " + ) + test_options = f" --timeout 600 --timeout_method=thread {parallel_options} " if skip_list is not None: skip_options = ' -k "not ' + skip_list[0] @@ -42,8 +46,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): return os.system(test_command) else: test_command = ( - f" pytest {test_options} -v --junit-xml=./ut_op_with_only.xml " - + test_case + f" pytest {test_options} -v --junit-xml=./ut_op_with_only.xml " + test_case ) return os.system(test_command) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 54a94a3fb4..26c0152f71 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1166,10 +1166,13 @@ def launch_test(test_case, skip_list=None, exe_list=None): # pytest options xpu_num = torch.xpu.device_count() - parallel_options = ' --dist worksteal ' + \ - ' '.join([f'--tx popen//env:ZE_AFFINITY_MASK={x}' for x in range(xpu_num)]) \ - if xpu_num > 1 else ' -n 1 ' - test_options = f' --timeout 600 --timeout_method=thread {parallel_options} ' + parallel_options = ( + " --dist worksteal " + + " ".join([f"--tx popen//env:ZE_AFFINITY_MASK={x}" for x in range(xpu_num)]) + if xpu_num > 1 + else " -n 1 " + ) + test_options = f" --timeout 600 --timeout_method=thread {parallel_options} " if skip_list is not None: skip_options = ' -k "not ' + skip_list[0]
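
The test-scheduling rework in patches 156 through 160 first drops the fixed "pytest -n 4" worker pool to a single serial worker, then (from patch 158 onward) spawns one pytest-xdist worker per visible XPU, each pinned to its own device through ZE_AFFINITY_MASK. The snippet below is a minimal standalone sketch of that option-building logic, not code taken from the series: it assumes a PyTorch build with XPU support plus pytest-xdist and pytest-timeout installed, and the helper name build_pytest_options is introduced here purely for illustration.

# Sketch of the per-XPU pytest scheduling wired into the launchers by patches 158-160.
import torch  # assumes a PyTorch build with XPU support


def build_pytest_options(timeout: int = 600) -> str:
    """One pytest-xdist worker per visible XPU, or '-n 1' when at most one device is visible."""
    xpu_num = torch.xpu.device_count()
    if xpu_num > 1:
        # 'worksteal' lets idle workers pull remaining tests from busy ones;
        # each popen worker sees exactly one device via ZE_AFFINITY_MASK.
        parallel = " --dist worksteal " + " ".join(
            f"--tx popen//env:ZE_AFFINITY_MASK={x}" for x in range(xpu_num)
        )
    else:
        parallel = " -n 1 "
    return f" --timeout {timeout} --timeout_method=thread {parallel} "


if __name__ == "__main__":
    # On a 2-XPU host this prints roughly:
    #   pytest  --timeout 600 --timeout_method=thread  --dist worksteal
    #     --tx popen//env:ZE_AFFINITY_MASK=0 --tx popen//env:ZE_AFFINITY_MASK=1  -v test_ops_xpu.py
    print(f"pytest {build_pytest_options()} -v test_ops_xpu.py")  # test file name is illustrative

The same options string is what the reworked launchers write to test-options.txt (action.yml) or interpolate into os.system() calls (the test/xpu launch scripts), so a single code path covers both single-device and multi-device runners.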