@@ -99,168 +99,14 @@ jobs:
99
99
secrets : inherit
100
100
uses : ./.github/workflows/_linux_build.yml
101
101
with :
102
- pytorch : main
103
- runner : pvc_e2e
102
+ pytorch : distributed_2.8
103
+ runner : PVC-7358
104
104
105
105
preci-linux-ut :
106
106
name : preci-linux
107
107
needs : [preci-conditions-filter, preci-linux-build]
108
108
uses : ./.github/workflows/_linux_ut.yml
109
109
with :
110
110
disabled_tests : ${{ needs.preci-conditions-filter.outputs.disabled_tests }}
111
- ut : op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed
112
- runner : linux.idc.xpu
113
-
114
- preci-linux-e2e :
115
- if : ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }}
116
- name : preci-linux / e2e_test
117
- needs : [preci-conditions-filter, preci-linux-build]
118
- runs-on : pvc_e2e
119
- env :
120
- GH_TOKEN : ${{ github.token }}
121
- reference_issue : 1645
122
- timeout-minutes : 300
123
- steps :
124
- - name : Checkout torch-xpu-ops
125
- uses : actions/checkout@v4
126
- - name : Prepare Conda ENV
127
- run : |
128
- which conda && conda clean -ay
129
- conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
130
- conda create -n e2e_ci python=3.10 cmake ninja -y
131
- source activate e2e_ci
132
- pip install pandas scipy psutil requests
133
- - name : Download Pytorch wheel
134
- uses : actions/download-artifact@v4
135
- with :
136
- name : Torch-XPU-Wheel-${{ github.event.pull_request.number }}
137
- - name : Install Pytorch XPU
138
- run : |
139
- source activate e2e_ci
140
- pip install --force-reinstall ${{ github.workspace }}/torch*.whl
141
- TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
142
- cd ../
143
- rm -rf pytorch || sudo rm -rf pytorch
144
- git clone https://github.com/pytorch/pytorch pytorch
145
- cd pytorch && git checkout ${TORCH_COMMIT_ID}
146
- # apply PRs for stock pytorch
147
- # https://github.com/pytorch/pytorch/pull/152940 internal use only for subset model list
148
- python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940
149
- git show -s && git status && git diff
150
- - name : Triton Installation
151
- run : |
152
- source activate e2e_ci
153
- cd ../pytorch
154
- pip install cmake ninja pybind11
155
- rm -rf pytorch_triton_xpu-*.whl
156
- python .github/scripts/build_triton_wheel.py --device xpu
157
- pip install pytorch_triton_xpu-*.whl
158
- - name : Identify pinned versions
159
- run : |
160
- cd ../pytorch
161
- echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
162
- echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
163
- echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}"
164
- echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}"
165
- echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}"
166
- echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}"
167
- echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}"
168
- echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}"
169
- . /etc/os-release
170
- echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
171
- echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
172
- source ../torch-xpu-ops/.github/scripts/env.sh
173
- echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
174
- echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
175
- echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
176
- - name : Torch Config
177
- run : |
178
- echo "$GITHUB_ENV"
179
- rm -rf ../pytorch/inductor_log
180
- rm -rf /tmp/torchinductor_*
181
- rm -rf ~/.triton/cache
182
- cd ..
183
- source activate e2e_ci
184
- python -c "import triton; print(triton.__version__)"
185
- python pytorch/torch/utils/collect_env.py
186
- - name : Huggingface BF16 Training Accuracy Test
187
- uses : ./.github/actions/inductor-xpu-e2e-test
188
- with :
189
- suite : huggingface
190
- dt : bfloat16
191
- mode : training
192
- scenario : accuracy,performance
193
- env_prepare : true
194
- hf_token : ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
195
- - name : Huggingface FP16 Training Accuracy Test
196
- uses : ./.github/actions/inductor-xpu-e2e-test
197
- with :
198
- suite : huggingface
199
- dt : float16
200
- mode : training
201
- scenario : accuracy,performance
202
- hf_token : ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
203
- - name : Timm_models BF16 Training Accuracy Test
204
- uses : ./.github/actions/inductor-xpu-e2e-test
205
- with :
206
- suite : timm_models
207
- dt : bfloat16
208
- mode : training
209
- scenario : accuracy,performance
210
- env_prepare : true
211
- hf_token : ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
212
- - name : Torchbench BF16 Training Accuracy Test
213
- uses : ./.github/actions/inductor-xpu-e2e-test
214
- with :
215
- suite : torchbench
216
- dt : bfloat16
217
- mode : training
218
- scenario : accuracy,performance
219
- env_prepare : true
220
- hf_token : ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
221
- - name : Download Reference Artifact
222
- id : reference_id
223
- run : |
224
- set -xe
225
- source activate e2e_ci
226
- conda install gh --channel conda-forge -y
227
- REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \
228
- --json body -q .body |grep "Inductor-weekly-LTS-XPU-E2E" |sed 's/.*: *//')"
229
- gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*"
230
- rm -rf reference && mv Inductor-*-XPU-E2E-* reference
231
- - name : Summarize archieve files
232
- if : ${{ ! cancelled() }}
233
- run : |
234
- set -x -e -o pipefail
235
- rm -rf ${{ github.workspace }}/upload_files
236
- cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
237
- # Print summary
238
- source activate e2e_ci
239
- export IS_PR=1
240
- bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \
241
- ${{ github.workspace }}/upload_files \
242
- ${{ github.workspace }}/reference \
243
- >> ${GITHUB_STEP_SUMMARY}
244
- exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt)
245
- if [ ${exit_label} -ne 0 ];then
246
- grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1
247
- echo "There are ${exit_label} cases that need look into!!! Please check them"
248
- exit ${exit_label}
249
- fi
250
- - name : Upload Inductor XPU E2E Data
251
- if : ${{ ! cancelled() }}
252
- uses : actions/upload-artifact@v4
253
- with :
254
- name : Inductor-CI-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
255
- path : ${{ github.workspace }}/upload_files
256
-
257
- preci-windows :
258
- name : preci-windows
259
- if : ${{ !(contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_win')) }}
260
- needs : [preci-conditions-filter]
261
- uses : ./.github/workflows/_windows_ut.yml
262
- with :
263
- ut : op_extended,torch_xpu
264
- runner : Windows_CI
265
- src_changed : ${{ needs.preci-conditions-filter.outputs.src_changed }}
266
- has_label : ${{ needs.preci-conditions-filter.outputs.has_label }}
111
+ ut : xpu_distributed
112
+ runner : PVC-7358
0 commit comments