Skip to content

Commit 3d2e28e

Browse files
committed
add distributed weekly workflow
1 parent f8b1ee9 commit 3d2e28e

File tree

1 file changed

+207
-0
lines changed

1 file changed

+207
-0
lines changed
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
name: Weekly-Distributed-OnDemand Tests

on:
  schedule:
    # GMT+8 0:00 Saturday (cron is evaluated in UTC)
    - cron: '0 16 * * 5'
  workflow_dispatch:
    inputs:
      pytorch:
        required: false
        type: string
        default: 'https://github.com/daisyden/pytorch.git@distributed_2.9'
        description: Pytorch branch/commit
      keep_torch_xpu_ops:
        required: false
        type: string
        default: 'https://github.com/intel/torch-xpu-ops.git@daisyden/distributed_2.9'
        description: Keep torch-xpu-ops pin. `true` means use pinned commit
      triton:
        required: false
        type: string
        default: 'bdd0656b'
        description: Triton commit. Use pytorch pinned commit by default
      python:
        required: false
        type: string
        default: '3.10'
        description: Python version

permissions: read-all

# One run per (workflow, sha, trigger, input set); scheduled runs are never cancelled.
concurrency:
  group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.triton }}-${{ inputs.python }}
  cancel-in-progress: ${{ github.event_name != 'schedule' }}

jobs:
  # Build the torch XPU wheel via the shared reusable build workflow.
  # Scheduled runs build pytorch 'main'; manual runs honor the dispatch inputs.
  Linux-Weekly-Ondemand-Build:
    if: ${{ github.repository_owner == 'intel' }}
    name: linux-weekly-ondemand
    secrets: inherit
    uses: ./.github/workflows/_linux_build.yml
    with:
      pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
      keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
      python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
      runner: PVC-7358

  # Run the distributed UT suite against the wheel produced above.
  Linux-Weekly-Ondemand-Distributed-Tests:
    runs-on: PVC-7358
    name: linux-weekly-ondemand / distributed_weekly
    needs: Linux-Weekly-Ondemand-Build
    # 3600 minutes (60h) exceeds the 360-minute cap of GitHub-hosted runners;
    # only valid because PVC-7358 is self-hosted.
    timeout-minutes: 3600
    permissions:
      issues: write
    env:
      GH_TOKEN: ${{ github.token }}
      pytorch: ${{ needs.Linux-Weekly-Ondemand-Build.outputs.torch_commit_id }}
      keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
      python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
      run_type: ${{ (github.event_name == 'schedule' && (github.event.schedule == '0 16 * * 5' && 'weekly' || 'nightly')) || 'on-demand' }}
    outputs:
      TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }}
      TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }}
      DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }}
      KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }}
      BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }}
      OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }}
      GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }}
      # NOTE(review): the 'pinned' step never writes TORCHBENCH/TORCHAUDIO/TIMM
      # outputs and no step has id 'summary', so the outputs below resolve to
      # empty strings — confirm whether downstream consumers expect them.
      TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }}
      TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }}
      TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }}
      TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }}
      TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }}
      TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }}
      TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }}
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
      - name: Prepare Conda ENV
        run: |
          which conda && conda clean -ay
          conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
          conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
          source activate e2e_ci
          pip install pandas scipy psutil requests
      - name: Download Pytorch wheel
        if: ${{ inputs.pytorch != 'nightly_wheel' }}
        uses: actions/download-artifact@v4
        with:
          name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}
      - name: Prepare Stock Pytorch
        run: |
          pwd
          cd ../
          rm -rf pytorch || sudo rm -rf pytorch
          source activate e2e_ci
          pip install --force-reinstall ${{ github.workspace }}/torch*.whl
          # Check out the exact source commit the downloaded wheel was built from.
          TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
          git clone https://github.com/daisyden/pytorch.git pytorch
          cd pytorch
          git checkout ${TORCH_COMMIT_ID}
          git status && git diff && git show -s
          rm -rf vision || sudo rm -rf vision
          git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
          pip install -r .ci/docker/requirements-ci.txt
      - name: Prepare Torch-xpu-ops
        run: |
          cd ../pytorch
          rm -rf third_party/torch-xpu-ops
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            cd third_party
            git clone https://github.com/intel/torch-xpu-ops.git
            cd torch-xpu-ops
            git checkout daisyden/distributed_2.9
          else
            # For nightly wheels, use the torch-xpu-ops commit pinned by pytorch.
            TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
            git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
            cd third_party/torch-xpu-ops
            git checkout ${TORCH_XPU_OPS_COMMIT}
          fi
      - name: Triton Installation
        run: |
          source activate e2e_ci
          cd ../pytorch
          # Fix: TRITON_REPO was previously unset, producing a malformed pip URL.
          TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
          # Quote the input so an empty value still yields a valid -z test.
          if [ -z "${{ inputs.triton }}" ]; then
            TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)"
          else
            TRITON_COMMIT_ID="${{ inputs.triton }}"
          fi
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            pip install cmake ninja pybind11
            pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
          fi
      - name: Identify pinned versions
        id: pinned
        run: |
          source .github/scripts/env.sh
          cd ../pytorch
          if [ -z "${{ inputs.triton }}" ]; then
            echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          else
            echo "TRITON_COMMIT_ID=${{ inputs.triton }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          fi
          echo "TORCH_BRANCH_ID=${{ inputs.pytorch }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          . /etc/os-release
          echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo ${GITHUB_ENV}
      - name: Show GITHUB_ENV
        run: |
          echo "$GITHUB_ENV"
      - name: Set Ptrace_scope
        if: ${{ always() }}
        run: |
          set -x -e -u -o pipefail
          # Back up ptrace_scope and relax it so debuggers/tools can attach.
          sudo rm -rf ptrace_scope.bk
          sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk
          cat ptrace_scope.bk
          echo "0" |sudo tee /proc/sys/kernel/yama/ptrace_scope
      - name: Run Torch XPU Distributed UT
        run: |
          set -x -e -o pipefail
          source activate e2e_ci
          pip install pytest pytest-timeout xmlrunner unittest-xml-reporting zstandard transformers
          pip install hypothesis==6.131.27
          mkdir -p ut_log/xpu_distributed
          cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/ut_result_check.sh ut_log/xpu_distributed/
          cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/check-ut.py ut_log/
          cd ../pytorch/third_party/torch-xpu-ops/test/xpu
          # Fail fast if the wheel was built without XCCL distributed support.
          XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())")
          if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then
            echo -e "[ERROR] XCCL is not enabled"
            exit 1
          fi
          python run_distributed_local.py \
            2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_error.log | \
            tee ${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test.log || true
          cp *.xml ${{ github.workspace }}/ut_log
      - name: Reset Ptrace_scope
        if: ${{ always() }}
        run: |
          if [ -f ptrace_scope.bk ]; then
            sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope
          fi
      - name: UT Test Results Summary
        id: summary
        run: |
          # Fix: this step previously activated 'xpu_op_${ZE_AFFINITY_MASK}', an
          # env never created by this job; every other step uses 'e2e_ci'.
          source activate e2e_ci
          pip install junitparser
          cd ${{ github.workspace }}/ut_log/
          # Fix: '| >> $GITHUB_STEP_SUMMARY' was a broken pipeline (pipe into a
          # bare redirect). Append stdout to the step summary directly.
          python check-ut.py ${{ github.workspace }}/ut_log/*.xml \
            >> ${GITHUB_STEP_SUMMARY} \
            2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_summary.log || true
          cd ${{ github.workspace }}/ut_log/xpu_distributed
          # Pull the known-issue list from the tracking issue to filter failures.
          gh --repo https://github.com/intel/torch-xpu-ops.git issue view 1624 --json body -q .body | sed '/^$/d' > Known_issue.log
          bash ut_result_check.sh 'pytorch_distributed'
      - name: Upload Inductor XPU UT Log
        if: ${{ ! cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed
          path: ${{ github.workspace }}/ut_log

0 commit comments

Comments
 (0)