diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh
index 9dcd170aa..a9c43eef5 100755
--- a/.github/scripts/build.sh
+++ b/.github/scripts/build.sh
@@ -45,7 +45,6 @@ cp -r ${WORKSPACE}/torch-xpu-ops third_party/torch-xpu-ops
 # Pre Build
 cd ${WORKSPACE}/pytorch
 python -m pip install requests
-python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py
 git submodule sync && git submodule update --init --recursive
 python -m pip install -r requirements.txt
 python -m pip install mkl-static mkl-include
diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh
index 831864d6d..db95edb94 100644
--- a/.github/scripts/env.sh
+++ b/.github/scripts/env.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
-source /opt/intel/oneapi/compiler/latest/env/vars.sh
+source /opt/intel/oneapi/compiler/2025.1/env/vars.sh
 source /opt/intel/oneapi/pti/latest/env/vars.sh
-source /opt/intel/oneapi/umf/latest/env/vars.sh
-source /opt/intel/oneapi/ccl/latest/env/vars.sh
-source /opt/intel/oneapi/mpi/latest/env/vars.sh
+source /opt/intel/oneapi/umf/0.10/env/vars.sh
+source /opt/intel/oneapi/ccl/2021.15/env/vars.sh
+source /opt/intel/oneapi/mpi/2021.15/env/vars.sh
 icpx --version
 sycl-ls
diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml
index ce309138b..87e1ce25e 100644
--- a/.github/workflows/_linux_build.yml
+++ b/.github/workflows/_linux_build.yml
@@ -11,7 +11,7 @@ on:
       keep_torch_xpu_ops:
         required: false
         type: string
-        default: 'false'
+        default: 'https://github.com/intel/torch-xpu-ops/tree/daisyden/distributed_2.9'
         description: Keep torch-xpu-ops pin. `true` means use pined commit
       driver:
         required: false
@@ -109,12 +109,12 @@ jobs:
             PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')"
             PYTORCH_VERSION="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')"
           else
-            PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+            PYTORCH_REPO="https://github.com/daisyden/pytorch.git"
             PYTORCH_VERSION="${{ inputs.pytorch }}"
           fi
           if [[ "${{ inputs.keep_torch_xpu_ops }}" == *"https://"* ]];then
-            TORCH_XPU_OPS_REPO="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/@.*//')"
-            TORCH_XPU_OPS_VERSION="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/.*@//')"
+            TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git"
+            TORCH_XPU_OPS_VERSION="daisyden/distributed_2.9"
           elif [ "${{ inputs.keep_torch_xpu_ops }}" == "true" ];then
             TORCH_XPU_OPS_VERSION="pinned"
           else
diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml
index c9c55208d..df1d4fbae 100644
--- a/.github/workflows/_linux_ut.yml
+++ b/.github/workflows/_linux_ut.yml
@@ -16,7 +16,7 @@ on:
       triton:
         required: false
         type: string
-        default: ''
+        default: 'bdd0656b'
         description: Triton commit. Use pytorch pined commit by default
       ut:
         required: true
@@ -394,9 +394,9 @@ jobs:
           path: ${{ github.workspace }}/ut_log
 
   distributed_ut_test:
-    runs-on: pytorch-06
+    runs-on: PVC-7358
     if: ${{ contains(inputs.ut, 'xpu_distributed') && !contains(inputs.disabled_tests, 'disable_distribute') }}
-    timeout-minutes: 60
+    timeout-minutes: 600
     env:
       GH_TOKEN: ${{ github.token }}
       NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
@@ -422,7 +422,7 @@ jobs:
         run: |
           cd ../
           rm -rf ./pytorch || sudo rm -rf ./pytorch
-          git clone https://github.com/pytorch/pytorch pytorch
+          git clone -b distributed_2.9 https://github.com/daisyden/pytorch.git pytorch
           source activate xpu_op_${ZE_AFFINITY_MASK}
           if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
             pip install --force-reinstall ${{ github.workspace }}/torch*.whl
@@ -446,7 +446,10 @@ jobs:
           cd ../pytorch
           rm -rf third_party/torch-xpu-ops
           if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
-            cp -r ${{ github.workspace }} third_party
+            cd third_party
+            git clone https://github.com/intel/torch-xpu-ops.git
+            cd torch-xpu-ops
+            git checkout daisyden/distributed_2.9
           else
             TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
-          tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
+          python run_distributed_local.py \
+            2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_error.log | \
+            tee ${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test.log || true
+          cp *.xml ${{ github.workspace }}/ut_log
       - name: Reset Ptrace_scope
         if: ${{ always() }}
         run: |
           if [ -f ptrace_scope.bk ]; then
             sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope
           fi
+      - name: UT Test Results Summary
+        run: |
+          source activate xpu_op_${ZE_AFFINITY_MASK}
+          pip install junitparser
+          cd ${{ github.workspace }}/ut_log/
+          python check-ut.py ${{ github.workspace }}/ut_log/*.xml \
+            2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_summary.log \
+            >> $GITHUB_STEP_SUMMARY || true
       - name: Upload Inductor XPU UT Log
         if: ${{ ! cancelled() }}
         uses: actions/upload-artifact@v4
@@ -554,15 +565,7 @@ jobs:
           echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
           cd ${{ github.workspace }}/ut_log/xpu_distributed
           gh --repo $repo issue view $ut_skip_issue --json body -q .body | sed '/^$/d' > Known_issue.log
-          gh api "repos/${{ github.repository }}/issues?labels=skipped" \
-            --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' \
-            > issues.log
-          awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' | sed 's/ *|| */ /g' | sort -u > issues_temp.log
-          awk '$2 == "op_ut" {print $1}' issues_temp.log > issues_op_ut.log
-          cat issues_temp.log | awk '{print $1}' >> Known_issue.log
-          awk -F'::' '{print $1}' issues_op_ut.log | sort -u | paste -sd ',' >> Known_issue.log
-          cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./
-          bash ut_result_check.sh 'xpu_distributed'
+          bash ut_result_check.sh 'pytorch_distributed'
       - name: Upload Inductor XPU UT Log
         if: always()
         uses: actions/upload-artifact@v4
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 3f3b1c1b5..103f7c342 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -95,172 +95,17 @@ jobs:
   preci-linux-build:
     name: preci-linux
     if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all')}}
-    needs: [preci-conditions-filter]
     secrets: inherit
     uses: ./.github/workflows/_linux_build.yml
     with:
-      pytorch: main
-      runner: pvc_e2e
+      pytorch: distributed_2.9
+      runner: PVC-7358
 
   preci-linux-ut:
     name: preci-linux
-    needs: [preci-conditions-filter, preci-linux-build]
+    needs: [preci-linux-build]
     uses: ./.github/workflows/_linux_ut.yml
     with:
       disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }}
-      ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed
-      runner: linux.idc.xpu
-
-  preci-linux-e2e:
-    if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }}
-    name: preci-linux / e2e_test
-    needs: [preci-conditions-filter, preci-linux-build]
-    runs-on: pvc_e2e
-    env:
-      GH_TOKEN: ${{ github.token }}
-      reference_issue: 1645
-    timeout-minutes: 300
-    steps:
-      - name: Checkout torch-xpu-ops
-        uses: actions/checkout@v4
-      - name: Prepare Conda ENV
-        run: |
-          which conda && conda clean -ay
-          conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
-          conda create -n e2e_ci python=3.10 cmake ninja -y
-          source activate e2e_ci
-          pip install pandas scipy psutil requests
-      - name: Download Pytorch wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: Torch-XPU-Wheel-${{ github.event.pull_request.number }}
-      - name: Install Pytorch XPU
-        run: |
-          source activate e2e_ci
-          pip install --force-reinstall ${{ github.workspace }}/torch*.whl
-          TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
-          cd ../
-          rm -rf pytorch || sudo rm -rf pytorch
-          git clone https://github.com/pytorch/pytorch pytorch
-          cd pytorch && git checkout ${TORCH_COMMIT_ID}
-          # apply PRs for stock pytorch
-          # https://github.com/pytorch/pytorch/pull/152940 internal use only for subset model list
-          python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940
-          git show -s && git status && git diff
-      - name: Triton Installation
-        run: |
-          source activate e2e_ci
-          cd ../pytorch
-          pip install cmake ninja pybind11
-          rm -rf pytorch_triton_xpu-*.whl
-          python .github/scripts/build_triton_wheel.py --device xpu
-          pip install pytorch_triton_xpu-*.whl
-      - name: Identify pinned versions
-        run: |
-          cd ../pytorch
-          echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-          echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-          echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}"
-          echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}"
-          echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}"
-          echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}"
-          echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}"
-          echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}"
-          . /etc/os-release
-          echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-          echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-          source ../torch-xpu-ops/.github/scripts/env.sh
-          echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-          echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-          echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-      - name: Torch Config
-        run: |
-          echo "$GITHUB_ENV"
-          rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log
-          rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_*
-          rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache
-          cd ..
-          source activate e2e_ci
-          python -c "import triton; print(triton.__version__)"
-          python pytorch/torch/utils/collect_env.py
-      - name: Huggingface BF16 Training Accuracy Test
-        uses: ./.github/actions/inductor-xpu-e2e-test
-        with:
-          suite: huggingface
-          dt: bfloat16
-          mode: training
-          scenario: accuracy,performance
-          env_prepare: true
-          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      - name: Huggingface FP16 Training Accuracy Test
-        uses: ./.github/actions/inductor-xpu-e2e-test
-        with:
-          suite: huggingface
-          dt: float16
-          mode: training
-          scenario: accuracy,performance
-          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      - name: Timm_models BF16 Training Accuracy Test
-        uses: ./.github/actions/inductor-xpu-e2e-test
-        with:
-          suite: timm_models
-          dt: bfloat16
-          mode: training
-          scenario: accuracy,performance
-          env_prepare: true
-          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      - name: Torchbench BF16 Training Accuracy Test
-        uses: ./.github/actions/inductor-xpu-e2e-test
-        with:
-          suite: torchbench
-          dt: bfloat16
-          mode: training
-          scenario: accuracy,performance
-          env_prepare: true
-          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      - name: Download Reference Artifact
-        id: reference_id
-        run: |
-          set -xe
-          source activate e2e_ci
-          conda install gh --channel conda-forge -y
-          REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \
-            --json body -q .body |grep "Inductor-weekly-LTS-XPU-E2E" |sed 's/.*: *//')"
-          gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*"
-          rm -rf reference && mv Inductor-*-XPU-E2E-* reference
-      - name: Summarize archieve files
-        if: ${{ ! cancelled() }}
-        run: |
-          set -x -e -o pipefail
-          rm -rf ${{ github.workspace }}/upload_files || sudo rm -rf ${{ github.workspace }}/upload_files
-          cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
-          # Print summary
-          source activate e2e_ci
-          export IS_PR=1
-          bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \
-            ${{ github.workspace }}/upload_files \
-            ${{ github.workspace }}/reference \
-          >> ${GITHUB_STEP_SUMMARY}
-          exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt)
-          if [ ${exit_label} -ne 0 ];then
-            grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1
-            echo "There are ${exit_label} cases that need look into!!! Please check them"
-            exit ${exit_label}
-          fi
-      - name: Upload Inductor XPU E2E Data
-        if: ${{ ! cancelled() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: Inductor-CI-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
-          path: ${{ github.workspace }}/upload_files
-
-  preci-windows:
-    name: preci-windows
-    if: ${{ !(contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_win')) }}
-    needs: [preci-conditions-filter]
-    uses: ./.github/workflows/_windows_ut.yml
-    with:
-      ut: op_extended,torch_xpu
-      runner: Windows_CI
-      src_changed: ${{ needs.preci-conditions-filter.outputs.src_changed }}
-      has_label: ${{ needs.preci-conditions-filter.outputs.has_label }}
+      ut: xpu_distributed
+      runner: PVC-7358