Skip to content

[TEST ONLY] distributed UT weekly test #1826

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ cp -r ${WORKSPACE}/torch-xpu-ops third_party/torch-xpu-ops
# Pre Build
cd ${WORKSPACE}/pytorch
python -m pip install requests
python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py
git submodule sync && git submodule update --init --recursive
python -m pip install -r requirements.txt
python -m pip install mkl-static mkl-include
Expand Down
8 changes: 4 additions & 4 deletions .github/scripts/env.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/bin/bash

source /opt/intel/oneapi/compiler/latest/env/vars.sh
source /opt/intel/oneapi/compiler/2025.1/env/vars.sh
source /opt/intel/oneapi/pti/latest/env/vars.sh
source /opt/intel/oneapi/umf/latest/env/vars.sh
source /opt/intel/oneapi/ccl/latest/env/vars.sh
source /opt/intel/oneapi/mpi/latest/env/vars.sh
source /opt/intel/oneapi/umf/0.10/env/vars.sh
source /opt/intel/oneapi/ccl/2021.15/env/vars.sh
source /opt/intel/oneapi/mpi/2021.15/env/vars.sh
icpx --version
sycl-ls
8 changes: 4 additions & 4 deletions .github/workflows/_linux_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ on:
keep_torch_xpu_ops:
required: false
type: string
default: 'false'
default: 'https://github.com/intel/torch-xpu-ops/tree/daisyden/distributed_2.9'
description: Keep torch-xpu-ops pin. `true` means use pined commit
driver:
required: false
Expand Down Expand Up @@ -109,12 +109,12 @@ jobs:
PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')"
PYTORCH_VERSION="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')"
else
PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
PYTORCH_REPO="https://github.com/daisyden/pytorch.git"
PYTORCH_VERSION="${{ inputs.pytorch }}"
fi
if [[ "${{ inputs.keep_torch_xpu_ops }}" == *"https://"* ]];then
TORCH_XPU_OPS_REPO="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/@.*//')"
TORCH_XPU_OPS_VERSION="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/.*@//')"
TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git"
TORCH_XPU_OPS_VERSION="daisyden/distributed_2.9"
elif [ "${{ inputs.keep_torch_xpu_ops }}" == "true" ];then
TORCH_XPU_OPS_VERSION="pinned"
else
Expand Down
53 changes: 28 additions & 25 deletions .github/workflows/_linux_ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ on:
triton:
required: false
type: string
default: ''
default: 'bdd0656b'
description: Triton commit. Use pytorch pined commit by default
ut:
required: true
Expand Down Expand Up @@ -394,9 +394,9 @@ jobs:
path: ${{ github.workspace }}/ut_log

distributed_ut_test:
runs-on: pytorch-06
runs-on: PVC-7358
if: ${{ contains(inputs.ut, 'xpu_distributed') && !contains(inputs.disabled_tests, 'disable_distribute') }}
timeout-minutes: 60
timeout-minutes: 600
env:
GH_TOKEN: ${{ github.token }}
NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
Expand All @@ -422,7 +422,7 @@ jobs:
run: |
cd ../
rm -rf ./pytorch || sudo rm -rf ./pytorch
git clone https://github.com/pytorch/pytorch pytorch
git clone -b distributed_2.9 https://github.com/daisyden/pytorch.git pytorch
source activate xpu_op_${ZE_AFFINITY_MASK}
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
pip install --force-reinstall ${{ github.workspace }}/torch*.whl
Expand All @@ -446,7 +446,10 @@ jobs:
cd ../pytorch
rm -rf third_party/torch-xpu-ops
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
cp -r ${{ github.workspace }} third_party
cd third_party
git clone https://github.com/intel/torch-xpu-ops.git
cd torch-xpu-ops
git checkout daisyden/distributed_2.9
else
TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
Expand All @@ -464,13 +467,9 @@ jobs:
fi
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
pip install cmake ninja pybind11
rm -rf pytorch_triton_xpu-*.whl
TRITON_VERSION_NAME="$(
curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\
grep '__version__' |head -n 1 |awk -F "'" '{print $2}'
)"
python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME}
pip install pytorch_triton_xpu-*.whl
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
TRITON_COMMIT_ID="bdd0656b"
pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
fi
- name: Torch Config
run: |
Expand All @@ -495,23 +494,35 @@ jobs:
run: |
set -x -e -o pipefail
source activate xpu_op_${ZE_AFFINITY_MASK}
pip install pytest pytest-timeout
pip install pytest pytest-timeout xmlrunner unittest-xml-reporting zstandard transformers
pip install hypothesis==6.131.27
mkdir -p ut_log/xpu_distributed
cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/ut_result_check.sh ut_log/xpu_distributed/
cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/check-ut.py ut_log/
cd ../pytorch/third_party/torch-xpu-ops/test/xpu
XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())")
if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then
echo -e "[ERROR] XCCL is not enabled"
exit 1
fi
timeout 1800 python run_distributed.py \
2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
python run_distributed_local.py \
2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_error.log | \
tee ${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test.log || true
cp *.xml ${{ github.workspace }}/ut_log
# Restore the kernel's yama ptrace_scope setting from the ptrace_scope.bk backup
# file, when that file exists in the current directory. Runs even if earlier
# steps failed (always()), so the runner is not left with a modified setting.
- name: Reset Ptrace_scope
if: ${{ always() }}
run: |
if [ -f ptrace_scope.bk ]; then
sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope
fi
# Summarize the JUnit XML reports collected under ut_log/ and publish the
# result on the workflow run's summary page.
- name: UT Test Results Summary
  run: |
    source activate xpu_op_${ZE_AFFINITY_MASK}
    pip install junitparser
    cd ${{ github.workspace }}/ut_log/
    # Append check-ut.py's stdout straight to the job summary file; stderr is
    # kept in a separate log. The previous form piped stdout into an empty
    # command (`| >> file`), so nothing ever reached the summary.
    # `|| true` keeps this reporting step best-effort, as before.
    python check-ut.py ${{ github.workspace }}/ut_log/*.xml \
      2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_summary.log \
      >> "${GITHUB_STEP_SUMMARY}" || true
- name: Upload Inductor XPU UT Log
if: ${{ ! cancelled() }}
uses: actions/upload-artifact@v4
Expand Down Expand Up @@ -554,15 +565,7 @@ jobs:
echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
cd ${{ github.workspace }}/ut_log/xpu_distributed
gh --repo $repo issue view $ut_skip_issue --json body -q .body | sed '/^$/d' > Known_issue.log
gh api "repos/${{ github.repository }}/issues?labels=skipped" \
--jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' \
> issues.log
awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' | sed 's/ *|| */ /g' | sort -u > issues_temp.log
awk '$2 == "op_ut" {print $1}' issues_temp.log > issues_op_ut.log
cat issues_temp.log | awk '{print $1}' >> Known_issue.log
awk -F'::' '{print $1}' issues_op_ut.log | sort -u | paste -sd ',' >> Known_issue.log
cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./
bash ut_result_check.sh 'xpu_distributed'
bash ut_result_check.sh 'pytorch_distributed'
- name: Upload Inductor XPU UT Log
if: always()
uses: actions/upload-artifact@v4
Expand Down
165 changes: 5 additions & 160 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,172 +95,17 @@ jobs:
preci-linux-build:
name: preci-linux
if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all')}}
needs: [preci-conditions-filter]
secrets: inherit
uses: ./.github/workflows/_linux_build.yml
with:
pytorch: main
runner: pvc_e2e
pytorch: distributed_2.9
runner: PVC-7358

preci-linux-ut:
name: preci-linux
needs: [preci-conditions-filter, preci-linux-build]
needs: [preci-linux-build]
uses: ./.github/workflows/_linux_ut.yml
with:
disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }}
ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed
runner: linux.idc.xpu

preci-linux-e2e:
if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }}
name: preci-linux / e2e_test
needs: [preci-conditions-filter, preci-linux-build]
runs-on: pvc_e2e
env:
GH_TOKEN: ${{ github.token }}
reference_issue: 1645
timeout-minutes: 300
steps:
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
- name: Prepare Conda ENV
run: |
which conda && conda clean -ay
conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
conda create -n e2e_ci python=3.10 cmake ninja -y
source activate e2e_ci
pip install pandas scipy psutil requests
- name: Download Pytorch wheel
uses: actions/download-artifact@v4
with:
name: Torch-XPU-Wheel-${{ github.event.pull_request.number }}
- name: Install Pytorch XPU
run: |
source activate e2e_ci
pip install --force-reinstall ${{ github.workspace }}/torch*.whl
TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
cd ../
rm -rf pytorch || sudo rm -rf pytorch
git clone https://github.com/pytorch/pytorch pytorch
cd pytorch && git checkout ${TORCH_COMMIT_ID}
# apply PRs for stock pytorch
# https://github.com/pytorch/pytorch/pull/152940 internal use only for subset model list
python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940
git show -s && git status && git diff
- name: Triton Installation
run: |
source activate e2e_ci
cd ../pytorch
pip install cmake ninja pybind11
rm -rf pytorch_triton_xpu-*.whl
python .github/scripts/build_triton_wheel.py --device xpu
pip install pytorch_triton_xpu-*.whl
- name: Identify pinned versions
run: |
cd ../pytorch
echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}"
echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}"
echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}"
echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}"
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}"
echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}"
. /etc/os-release
echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
source ../torch-xpu-ops/.github/scripts/env.sh
echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
- name: Torch Config
run: |
echo "$GITHUB_ENV"
rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log
rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_*
rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache
cd ..
source activate e2e_ci
python -c "import triton; print(triton.__version__)"
python pytorch/torch/utils/collect_env.py
- name: Huggingface BF16 Training Accuracy Test
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: huggingface
dt: bfloat16
mode: training
scenario: accuracy,performance
env_prepare: true
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Huggingface FP16 Training Accuracy Test
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: huggingface
dt: float16
mode: training
scenario: accuracy,performance
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Timm_models BF16 Training Accuracy Test
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: timm_models
dt: bfloat16
mode: training
scenario: accuracy,performance
env_prepare: true
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Torchbench BF16 Training Accuracy Test
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: torchbench
dt: bfloat16
mode: training
scenario: accuracy,performance
env_prepare: true
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Download Reference Artifact
id: reference_id
run: |
set -xe
source activate e2e_ci
conda install gh --channel conda-forge -y
REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \
--json body -q .body |grep "Inductor-weekly-LTS-XPU-E2E" |sed 's/.*: *//')"
gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*"
rm -rf reference && mv Inductor-*-XPU-E2E-* reference
- name: Summarize archieve files
if: ${{ ! cancelled() }}
run: |
set -x -e -o pipefail
rm -rf ${{ github.workspace }}/upload_files || sudo rm -rf ${{ github.workspace }}/upload_files
cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
# Print summary
source activate e2e_ci
export IS_PR=1
bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \
${{ github.workspace }}/upload_files \
${{ github.workspace }}/reference \
>> ${GITHUB_STEP_SUMMARY}
exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt)
if [ ${exit_label} -ne 0 ];then
grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1
echo "There are ${exit_label} cases that need look into!!! Please check them"
exit ${exit_label}
fi
- name: Upload Inductor XPU E2E Data
if: ${{ ! cancelled() }}
uses: actions/upload-artifact@v4
with:
name: Inductor-CI-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
path: ${{ github.workspace }}/upload_files

preci-windows:
name: preci-windows
if: ${{ !(contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_win')) }}
needs: [preci-conditions-filter]
uses: ./.github/workflows/_windows_ut.yml
with:
ut: op_extended,torch_xpu
runner: Windows_CI
src_changed: ${{ needs.preci-conditions-filter.outputs.src_changed }}
has_label: ${{ needs.preci-conditions-filter.outputs.has_label }}
ut: xpu_distributed
runner: PVC-7358
Loading