Commit 5ac5b25

distributed_weekly test
1 parent 798a079 commit 5ac5b25

4 files changed (+29, −179 lines)

.github/scripts/build.sh

Lines changed: 0 additions & 1 deletion
@@ -45,7 +45,6 @@ cp -r ${WORKSPACE}/torch-xpu-ops third_party/torch-xpu-ops
 # Pre Build
 cd ${WORKSPACE}/pytorch
 python -m pip install requests
-python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py
 git submodule sync && git submodule update --init --recursive
 python -m pip install -r requirements.txt
 python -m pip install mkl-static mkl-include

.github/workflows/_linux_build.yml

Lines changed: 4 additions & 4 deletions
@@ -11,7 +11,7 @@ on:
       keep_torch_xpu_ops:
         required: false
         type: string
-        default: 'false'
+        default: 'https://github.com/intel/torch-xpu-ops/tree/daisyden/distributed_2.8'
         description: Keep torch-xpu-ops pin. `true` means use pined commit
       driver:
         required: false
@@ -80,12 +80,12 @@ jobs:
            PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')"
            PYTORCH_VERSION="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')"
          else
-            PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+            PYTORCH_REPO="https://github.com/daisyden/pytorch.git"
            PYTORCH_VERSION="${{ inputs.pytorch }}"
          fi
          if [[ "${{ inputs.keep_torch_xpu_ops }}" == *"https://"* ]];then
-            TORCH_XPU_OPS_REPO="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/@.*//')"
-            TORCH_XPU_OPS_VERSION="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/.*@//')"
+            TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git"
+            TORCH_XPU_OPS_VERSION="daisyden/distributed_2.8"
          elif [ "${{ inputs.keep_torch_xpu_ops }}" == "true" ];then
            TORCH_XPU_OPS_VERSION="pinned"
          else
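Note on the parsing kept above: when an input is written as `repo@version`, the workflow splits it with two sed expressions, `s/@.*//` (everything before the first `@`) and `s/.*@//` (everything after the last `@`); this commit bypasses that split for torch-xpu-ops and hard-codes the repo and branch instead. A minimal Python sketch of the split convention, for illustration only; the helper name and the fallback argument are assumptions, not part of the workflow:

    # Illustrative sketch, not from this commit: mirrors the workflow's sed-based split.
    def split_repo_spec(spec: str, default_repo: str) -> tuple[str, str]:
        if "@" not in spec:
            # No explicit repo given: use the default repo and treat the input as the version.
            return default_repo, spec
        repo = spec.split("@", 1)[0]      # text before the first '@' (sed 's/@.*//')
        version = spec.rsplit("@", 1)[1]  # text after the last '@' (sed 's/.*@//')
        return repo, version

    print(split_repo_spec(
        "https://github.com/daisyden/pytorch.git@distributed_2.8",
        "https://github.com/pytorch/pytorch.git",
    ))  # -> ('https://github.com/daisyden/pytorch.git', 'distributed_2.8')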

.github/workflows/_linux_ut.yml

Lines changed: 21 additions & 16 deletions
@@ -16,7 +16,7 @@ on:
       triton:
         required: false
         type: string
-        default: ''
+        default: 'bdd0656b'
         description: Triton commit. Use pytorch pined commit by default
       ut:
         required: true
@@ -402,7 +402,7 @@ jobs:
        run: |
          cd ../
          rm -rf ./pytorch || sudo rm -rf ./pytorch
-          git clone https://github.com/pytorch/pytorch pytorch
+          git clone -b distributed_2.8 https://github.com/daisyden/pytorch.git pytorch
          source activate xpu_op_${ZE_AFFINITY_MASK}
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            pip install --force-reinstall ${{ github.workspace }}/torch*.whl
@@ -426,7 +426,10 @@ jobs:
          cd ../pytorch
          rm -rf third_party/torch-xpu-ops
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
-            cp -r ${{ github.workspace }} third_party
+            cd third_party
+            git clone https://github.com/intel/torch-xpu-ops.git
+            cd torch-xpu-ops
+            git checkout daisyden/distributed_2.8
          else
            TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
            git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
@@ -475,23 +478,33 @@ jobs:
        run: |
          set -x -e -o pipefail
          source activate xpu_op_${ZE_AFFINITY_MASK}
-          pip install pytest pytest-timeout
+          pip install pytest pytest-timeout xmlrunner unittest-xml-reporting
          mkdir -p ut_log/xpu_distributed
+          cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/ut_result_check.sh ut_log/xpu_distributed/
          cd ../pytorch/third_party/torch-xpu-ops/test/xpu
          XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())")
          if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then
            echo -e "[ERROR] XCCL is not enabled"
            exit 1
          fi
-          timeout 1800 python run_distributed.py \
-            2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
-            tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
+          python run_distributed_local.py \
+            2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_error.log | \
+            tee ${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test.log
+          cp *.xml ${{ github.workspace }}/ut_log
+          cp
      - name: Reset Ptrace_scope
        if: ${{ always() }}
        run: |
          if [ -f ptrace_scope.bk ]; then
            sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope
          fi
+      - name: UT Test Results Summary
+        run: |
+          source activate xpu_op_${ZE_AFFINITY_MASK}
+          pip install junitparser
+          python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml \
+            2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_summary.log | \
+            >> $GITHUB_STEP_SUMMARY || true
      - name: Upload Inductor XPU UT Log
        if: ${{ ! cancelled() }}
        uses: actions/upload-artifact@v4
@@ -534,15 +547,7 @@ jobs:
          echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          cd ${{ github.workspace }}/ut_log/xpu_distributed
          gh --repo $repo issue view $ut_skip_issue --json body -q .body | sed '/^$/d' > Known_issue.log
-          gh api "repos/${{ github.repository }}/issues?labels=skipped" \
-            --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' \
-            > issues.log
-          awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' | sed 's/ *|| */ /g' | sort -u > issues_temp.log
-          awk '$2 == "op_ut" {print $1}' issues_temp.log > issues_op_ut.log
-          cat issues_temp.log | awk '{print $1}' >> Known_issue.log
-          awk -F'::' '{print $1}' issues_op_ut.log | sort -u | paste -sd ',' >> Known_issue.log
-          cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./
-          bash ut_result_check.sh 'xpu_distributed'
+          bash ut_result_check.sh 'pytorch_distributed'
      - name: Upload Inductor XPU UT Log
        if: always()
        uses: actions/upload-artifact@v4
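The new "UT Test Results Summary" step above installs junitparser and feeds the collected `*.xml` reports to `.github/scripts/check-ut.py`, which is not part of this diff. A rough sketch of what a junitparser-based summarizer can look like, under the assumption that the reports are junit-style XML written by unittest-xml-reporting; this is not the actual check-ut.py:

    # Hypothetical summarizer sketch; the real check-ut.py is not shown in this commit.
    # Usage: python summarize_ut.py ut_log/*.xml
    import sys
    from junitparser import JUnitXml

    tests = failures = errors = skipped = 0
    for path in sys.argv[1:]:
        report = JUnitXml.fromfile(path)   # parses one junit-style report file
        tests += report.tests or 0         # totals come from the suite attributes
        failures += report.failures or 0
        errors += report.errors or 0
        skipped += report.skipped or 0

    # One plain line like this is easy to append to $GITHUB_STEP_SUMMARY.
    print(f"Total: {tests}, Failed: {failures}, Errors: {errors}, Skipped: {skipped}")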

.github/workflows/pull.yml

Lines changed: 4 additions & 158 deletions
@@ -99,168 +99,14 @@ jobs:
     secrets: inherit
     uses: ./.github/workflows/_linux_build.yml
     with:
-      pytorch: main
-      runner: pvc_e2e
+      pytorch: distributed_2.8
+      runner: PVC-7358
 
   preci-linux-ut:
     name: preci-linux
     needs: [preci-conditions-filter, preci-linux-build]
     uses: ./.github/workflows/_linux_ut.yml
     with:
       disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }}
-      ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed
-      runner: linux.idc.xpu
-
-  preci-linux-e2e:
-    if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }}
-    name: preci-linux / e2e_test
-    needs: [preci-conditions-filter, preci-linux-build]
-    runs-on: pvc_e2e
-    env:
-      GH_TOKEN: ${{ github.token }}
-      reference_issue: 1645
-    timeout-minutes: 300
-    steps:
-      - name: Checkout torch-xpu-ops
-        uses: actions/checkout@v4
-      - name: Prepare Conda ENV
-        run: |
-          which conda && conda clean -ay
-          conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
-          conda create -n e2e_ci python=3.10 cmake ninja -y
-          source activate e2e_ci
-          pip install pandas scipy psutil requests
-      - name: Download Pytorch wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: Torch-XPU-Wheel-${{ github.event.pull_request.number }}
-      - name: Install Pytorch XPU
-        run: |
-          source activate e2e_ci
-          pip install --force-reinstall ${{ github.workspace }}/torch*.whl
-          TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
-          cd ../
-          rm -rf pytorch || sudo rm -rf pytorch
-          git clone https://github.com/pytorch/pytorch pytorch
-          cd pytorch && git checkout ${TORCH_COMMIT_ID}
-          # apply PRs for stock pytorch
-          # https://github.com/pytorch/pytorch/pull/152940 internal use only for subset model list
-          python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940
-          git show -s && git status && git diff
-      - name: Triton Installation
-        run: |
-          source activate e2e_ci
-          cd ../pytorch
-          pip install cmake ninja pybind11
-          rm -rf pytorch_triton_xpu-*.whl
-          python .github/scripts/build_triton_wheel.py --device xpu
-          pip install pytorch_triton_xpu-*.whl
-      - name: Identify pinned versions
-        run: |
-          cd ../pytorch
-          echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-          echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-          echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}"
-          echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}"
-          echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}"
-          echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}"
-          echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}"
-          echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}"
-          . /etc/os-release
-          echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-          echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-          source ../torch-xpu-ops/.github/scripts/env.sh
-          echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-          echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-          echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
-      - name: Torch Config
-        run: |
-          echo "$GITHUB_ENV"
-          rm -rf ../pytorch/inductor_log
-          rm -rf /tmp/torchinductor_*
-          rm -rf ~/.triton/cache
-          cd ..
-          source activate e2e_ci
-          python -c "import triton; print(triton.__version__)"
-          python pytorch/torch/utils/collect_env.py
-      - name: Huggingface BF16 Training Accuracy Test
-        uses: ./.github/actions/inductor-xpu-e2e-test
-        with:
-          suite: huggingface
-          dt: bfloat16
-          mode: training
-          scenario: accuracy,performance
-          env_prepare: true
-          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      - name: Huggingface FP16 Training Accuracy Test
-        uses: ./.github/actions/inductor-xpu-e2e-test
-        with:
-          suite: huggingface
-          dt: float16
-          mode: training
-          scenario: accuracy,performance
-          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      - name: Timm_models BF16 Training Accuracy Test
-        uses: ./.github/actions/inductor-xpu-e2e-test
-        with:
-          suite: timm_models
-          dt: bfloat16
-          mode: training
-          scenario: accuracy,performance
-          env_prepare: true
-          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      - name: Torchbench BF16 Training Accuracy Test
-        uses: ./.github/actions/inductor-xpu-e2e-test
-        with:
-          suite: torchbench
-          dt: bfloat16
-          mode: training
-          scenario: accuracy,performance
-          env_prepare: true
-          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-      - name: Download Reference Artifact
-        id: reference_id
-        run: |
-          set -xe
-          source activate e2e_ci
-          conda install gh --channel conda-forge -y
-          REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \
-            --json body -q .body |grep "Inductor-weekly-LTS-XPU-E2E" |sed 's/.*: *//')"
-          gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*"
-          rm -rf reference && mv Inductor-*-XPU-E2E-* reference
-      - name: Summarize archieve files
-        if: ${{ ! cancelled() }}
-        run: |
-          set -x -e -o pipefail
-          rm -rf ${{ github.workspace }}/upload_files
-          cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
-          # Print summary
-          source activate e2e_ci
-          export IS_PR=1
-          bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \
-            ${{ github.workspace }}/upload_files \
-            ${{ github.workspace }}/reference \
-            >> ${GITHUB_STEP_SUMMARY}
-          exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt)
-          if [ ${exit_label} -ne 0 ];then
-            grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1
-            echo "There are ${exit_label} cases that need look into!!! Please check them"
-            exit ${exit_label}
-          fi
-      - name: Upload Inductor XPU E2E Data
-        if: ${{ ! cancelled() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: Inductor-CI-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
-          path: ${{ github.workspace }}/upload_files
-
-  preci-windows:
-    name: preci-windows
-    if: ${{ !(contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_win')) }}
-    needs: [preci-conditions-filter]
-    uses: ./.github/workflows/_windows_ut.yml
-    with:
-      ut: op_extended,torch_xpu
-      runner: Windows_CI
-      src_changed: ${{ needs.preci-conditions-filter.outputs.src_changed }}
-      has_label: ${{ needs.preci-conditions-filter.outputs.has_label }}
+      ut: xpu_distributed
+      runner: PVC-7358
