Skip to content

Commit 1b2c539

Browse files
authored
[Nightly] Enable bisect search (#1849)
disable_all
1 parent 30a820f commit 1b2c539

File tree

5 files changed

+355
-6
lines changed

5 files changed

+355
-6
lines changed

.github/scripts/bisect_search.sh

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#!/bin/bash
2+
set -xe
3+
export GIT_PAGER=cat
4+
5+
# Init params
6+
WORKSPACE=$(realpath ${WORKSPACE:-"/tmp"})
7+
PYTORCH_VERSION=${PYTORCH_VERSION:-"main"}
8+
TORCH_XPU_OPS_VERSION=${TORCH_XPU_OPS_VERSION:-"main"}
9+
for var; do
10+
eval "export $(echo ${var@Q} |sed "s/^'-*//g;s/=/='/")"
11+
done
12+
13+
if [ "${PYTORCH_VERSION}" == "search" ];then
14+
PYTORCH_VERSION="$(git rev-parse HEAD)"
15+
fi
16+
if [ "${TORCH_XPU_OPS_VERSION}" == "search" ];then
17+
TORCH_XPU_OPS_VERSION="$(git rev-parse HEAD)"
18+
fi
19+
20+
# Clean WORKSPACE
21+
mkdir -p ${WORKSPACE}
22+
rm -rf "${WORKSPACE:?}/"* || sudo rm -rf "${WORKSPACE:?}/"*
23+
24+
# Build pytorch
25+
pip uninstall -y torch
26+
source $(dirname $(realpath $0))/env.sh 2> /dev/null
27+
build_status="$($(dirname $(realpath $0))/build.sh \
28+
--WORKSPACE="${WORKSPACE}" \
29+
--PYTORCH_VERSION="${PYTORCH_VERSION}" \
30+
--TORCH_XPU_OPS_VERSION="${TORCH_XPU_OPS_VERSION}" \
31+
> ${GITHUB_WORKSPACE}/gs-logs/build-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log 2>&1 && echo $? || echo $?)"
32+
if [ ${build_status} -ne 0 ];then
33+
tail -n 100 ${GITHUB_WORKSPACE}/gs-logs/build-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log
34+
echo "Build got failed"
35+
exit 1
36+
fi
37+
pip list |grep torch
38+
39+
# Test
40+
test_result=1
41+
if [ "${SEARCH_CHECK}" == "accuracy" ];then
42+
cd ${WORKSPACE}/pytorch
43+
rm -rf torch
44+
test_status="$(eval "${SEARCH_CASE} --output=${WORKSPACE}/tmp.csv" \
45+
> ${GITHUB_WORKSPACE}/gs-logs/test-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log 2>&1 && echo $? || echo $?)"
46+
if [ ${test_status} -eq 0 ];then
47+
acc_result=$(tail -n 1 ${WORKSPACE}/tmp.csv |awk -F, '{print $4}')
48+
if [[ "${acc_result}" == "pass"* ]];then
49+
test_result=0
50+
fi
51+
fi
52+
elif [ "${SEARCH_CHECK}" == "performance" ];then
53+
cd ${WORKSPACE}/pytorch
54+
rm -rf torch
55+
test_status="$(eval "${SEARCH_CASE} --output=${WORKSPACE}/tmp.csv" \
56+
> ${GITHUB_WORKSPACE}/gs-logs/test-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log 2>&1 && echo $? || echo $?)"
57+
if [ ${test_status} -eq 0 ];then
58+
perf_result=$(tail -n 1 ${WORKSPACE}/tmp.csv |awk -F, '{print $5}')
59+
test_result=$(echo "${perf_result},${SEARCH_GOOD_VALUE:-"0.00001"},${SEARCH_CRITERIA}" |awk -F, '{
60+
if ($1/$2 > (1 - $3)){
61+
print "0";
62+
}else{
63+
print "1";
64+
}
65+
}')
66+
fi
67+
elif [ "${SEARCH_CHECK}" == "ut_regressions" ];then
68+
cd ${WORKSPACE}/pytorch/third_party/torch-xpu-ops/test/regressions
69+
test_status="$(eval "${SEARCH_CASE}" \
70+
> ${GITHUB_WORKSPACE}/gs-logs/test-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log 2>&1 && echo $? || echo $?)"
71+
if [ ${test_status} -eq 0 ];then
72+
test_result=0
73+
fi
74+
elif [ "${SEARCH_CHECK}" == "ut_extended" ];then
75+
cd ${WORKSPACE}/pytorch/third_party/torch-xpu-ops/test/xpu/extended
76+
test_status="$(eval "${SEARCH_CASE}" \
77+
> ${GITHUB_WORKSPACE}/gs-logs/test-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log 2>&1 && echo $? || echo $?)"
78+
if [ ${test_status} -eq 0 ];then
79+
test_result=0
80+
fi
81+
elif [ "${SEARCH_CHECK}" == "ut_xpu" ];then
82+
cd ${WORKSPACE}/pytorch/third_party/torch-xpu-ops/test/xpu
83+
test_status="$(eval "${SEARCH_CASE}" \
84+
> ${GITHUB_WORKSPACE}/gs-logs/test-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log 2>&1 && echo $? || echo $?)"
85+
if [ ${test_status} -eq 0 ];then
86+
test_result=0
87+
fi
88+
else
89+
test_status="$(eval "${SEARCH_CASE}" \
90+
> ${GITHUB_WORKSPACE}/gs-logs/test-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log 2>&1 && echo $? || echo $?)"
91+
if [ ${test_status} -eq 0 ];then
92+
test_result=0
93+
fi
94+
fi
95+
96+
# Test result
97+
cat ${GITHUB_WORKSPACE}/gs-logs/test-${PYTORCH_VERSION}-${TORCH_XPU_OPS_VERSION}.log
98+
echo "${test_result},${acc_result},${perf_result},${PYTORCH_VERSION},${TORCH_XPU_OPS_VERSION}" |\
99+
tee -a ${GITHUB_WORKSPACE}/gs-logs/summary.csv |tee -a ${WORKSPACE}/result.csv
100+
exit ${test_result}

.github/scripts/build.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ git submodule sync && git submodule update --init --recursive
5050
python -m pip install -r requirements.txt
5151
python -m pip install mkl-static mkl-include
5252
export USE_STATIC_MKL=1
53-
export USE_XCCL=1
5453
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \
5554
intel-cmplr-lib-rt==2025.1.1 | \
5655
intel-cmplr-lib-ur==2025.1.1 | \

.github/scripts/env.sh

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
#!/bin/bash
22

3-
source /opt/intel/oneapi/compiler/latest/env/vars.sh
4-
source /opt/intel/oneapi/pti/latest/env/vars.sh
5-
source /opt/intel/oneapi/umf/latest/env/vars.sh
6-
source /opt/intel/oneapi/ccl/latest/env/vars.sh
7-
source /opt/intel/oneapi/mpi/latest/env/vars.sh
3+
XPU_ONEAPI_PATH=${XPU_ONEAPI_PATH:-"/opt/intel/oneapi"}
4+
5+
source ${XPU_ONEAPI_PATH}/compiler/latest/env/vars.sh
6+
source ${XPU_ONEAPI_PATH}/pti/latest/env/vars.sh
7+
source ${XPU_ONEAPI_PATH}/umf/latest/env/vars.sh
8+
source ${XPU_ONEAPI_PATH}/ccl/latest/env/vars.sh
9+
source ${XPU_ONEAPI_PATH}/mpi/latest/env/vars.sh
810
icpx --version
911
sycl-ls

.github/workflows/_linux_build.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ jobs:
124124
source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh
125125
# gcc 11
126126
source /opt/rh/gcc-toolset-11/enable
127+
export USE_XCCL=1
127128
${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \
128129
--WORKSPACE="${{ github.workspace }}" \
129130
--PYTORCH_REPO="${PYTORCH_REPO}" \

.github/workflows/bisect_search.yml

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
name: Bisect Search
2+
3+
on:
4+
workflow_dispatch:
5+
inputs:
6+
runner:
7+
required: true
8+
type: string
9+
default: 'pvc_rolling'
10+
description: Test node
11+
search_commits:
12+
required: true
13+
type: string
14+
default: ''
15+
description: Target commits, such as 'pytorch=old/new,xpu-ops=old/new'
16+
search_check:
17+
type: string
18+
default: ''
19+
description: Test case type, 'performance, accuracy, <ut_regressions/ut_extended/ut_xpu> or others'
20+
search_case:
21+
required: true
22+
type: string
23+
default: ''
24+
description: Test case, such as 'python xxx.py or pytest -k xxx'
25+
search_criteria:
26+
type: string
27+
default: '0.1'
28+
description: Criteria for performance check, default is 10%
29+
oneapi:
30+
type: string
31+
default: '2025.1.3'
32+
description: oneAPI DLE to install ('2025.1.3', '2025.2.0' or an offline.sh url), or 'installed' to skip installation and use the preinstalled oneAPI
33+
python:
34+
type: string
35+
default: '3.10'
36+
description: Python version
37+
38+
permissions: read-all
39+
40+
jobs:
41+
get_runner:
42+
runs-on: ${{ inputs.runner }}
43+
outputs:
44+
test_host: ${{ steps.runner-info.outputs.test_host }}
45+
test_user: ${{ steps.runner-info.outputs.test_user }}
46+
test_group: ${{ steps.runner-info.outputs.test_group }}
47+
steps:
48+
- name: Get runner info
49+
id: runner-info
50+
run: |
51+
# get test runner
52+
echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT}
53+
echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT}
54+
echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT}
55+
# show host info
56+
cat /etc/os-release
57+
uname -a
58+
source /opt/intel/oneapi/setvars.sh
59+
sycl-ls
60+
dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev'
61+
- name: Cleanup workspace
62+
if: ${{ always() }}
63+
run: |
64+
# clean docker cache
65+
docker stop $(docker ps -aq) || true
66+
docker system prune -af || true
67+
# clean files
68+
ls -al
69+
# remove every top-level entry of the workspace; -exec hands paths to rm
# directly, so names containing spaces or newlines are not word-split
sudo find . -mindepth 1 -maxdepth 1 -exec rm -rf -- {} +
70+
71+
bisect-search:
72+
needs: get_runner
73+
runs-on: ${{ needs.get_runner.outputs.test_host }}
74+
container:
75+
image: mengfeili/intel-pvc-driver:1146-1136
76+
volumes:
77+
- ${{ github.workspace }}:${{ github.workspace }}
78+
options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g
79+
-u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }}
80+
env:
81+
AGENT_TOOLSDIRECTORY: /tmp/_tools
82+
HF_HOME: /tmp/.cache/huggingface
83+
TORCH_HOME: /tmp/.cache/torch
84+
GH_TOKEN: ${{ github.token }}
85+
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
86+
SEARCH_COMMITS: ${{ inputs.search_commits }}
87+
SEARCH_CHECK: ${{ inputs.search_check }}
88+
SEARCH_CASE: ${{ inputs.search_case }}
89+
SEARCH_CRITERIA: ${{ inputs.search_criteria }}
90+
TORCH_XPU_ARCH_LIST: pvc
91+
USE_XCCL: 0
92+
USE_KINETO: 0
93+
defaults:
94+
run:
95+
shell: bash -xe {0}
96+
steps:
97+
- name: Check runner
98+
run: |
99+
ls -al
100+
# remove every top-level entry of the workspace; -exec hands paths to rm
# directly, so names containing spaces or newlines are not word-split
sudo find . -mindepth 1 -maxdepth 1 -exec rm -rf -- {} +
101+
sudo rm -rf /tmp/_tools
102+
- name: Setup python-${{ inputs.python }}
103+
uses: actions/setup-python@v5
104+
with:
105+
python-version: ${{ inputs.python }}
106+
- name: Check runner
107+
run: |
108+
hostname && whoami && id
109+
clinfo --list
110+
gcc -v && g++ -v
111+
which python && which pip
112+
python -V
113+
pip install -U pip wheel setuptools
114+
pip list
115+
uname -a
116+
dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev'
117+
pip install cmake ninja pandas psutil scipy requests pybind11
118+
mkdir gs-logs gs-search
119+
echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv
120+
- name: Install oneAPI DLE
121+
if: ${{ inputs.oneapi != 'installed' }}
122+
run: |
123+
rm -rf ~/intel ~/.intel /tmp/intel
124+
if [ "${{ inputs.oneapi }}" == "2025.1.3" ];then
125+
ONEAPI_URL="https://registrationcenter-download.intel.com/akdlm/IRC_NAS/3435dc45-055e-4f7a-86b1-779931772404/intel-deep-learning-essentials-2025.1.3.7_offline.sh"
126+
elif [ "${{ inputs.oneapi }}" == "2025.2.0" ];then
127+
ONEAPI_URL="https://registrationcenter-download.intel.com/akdlm/IRC_NAS/49d38360-b403-4b06-9104-86fa8d886e6d/intel-deep-learning-essentials-2025.2.0.558_offline.sh"
128+
else
129+
ONEAPI_URL="${{ inputs.oneapi }}"
130+
fi
131+
wget -q -O oneapi.sh "${ONEAPI_URL}"
132+
bash oneapi.sh -a -s --eula accept --action install --install-dir /tmp/intel/oneapi
133+
echo "XPU_ONEAPI_PATH=/tmp/intel/oneapi" >> ${GITHUB_ENV}
134+
- name: Checkout torch-xpu-ops
135+
uses: actions/checkout@v4
136+
with:
137+
path: gs-scripts
138+
- name: Prepare source code
139+
run: |
140+
git clone https://github.com/pytorch/pytorch gs-pytorch
141+
cd gs-pytorch
142+
LATEST_PT_COMMIT="$(git rev-parse HEAD)"
143+
cd ..
144+
git clone https://github.com/intel/torch-xpu-ops gs-torch-xpu-ops
145+
cd gs-torch-xpu-ops
146+
LATEST_XPU_COMMIT="$(git rev-parse HEAD)"
147+
cd ..
148+
echo "LATEST_PT_COMMIT=${LATEST_PT_COMMIT}" >> ${GITHUB_ENV}
149+
echo "LATEST_XPU_COMMIT=${LATEST_XPU_COMMIT}" >> ${GITHUB_ENV}
150+
- name: Prepare test env
151+
run: |
152+
pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu
153+
# read the test case from the SEARCH_CASE job env var instead of pasting the
# workflow input into the script, so quotes in the case cannot break the shell
if [[ "${SEARCH_CASE}" == *"benchmarks/dynamo/huggingface.py"* ]];then
154+
pip install transformers==4.44.2
155+
elif [[ "${SEARCH_CASE}" == *"benchmarks/dynamo/timm_models.py"* ]];then
156+
pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14
157+
pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch)
158+
elif [[ "${SEARCH_CASE}" == *"benchmarks/dynamo/torchbench.py"* ]];then
159+
# extract the value following "--only" from the test case (SEARCH_CASE job env var)
model_name="$(echo "${SEARCH_CASE}" |sed 's+.*\--only *++;s/ .*//')"
160+
git clone https://github.com/pytorch/benchmark gs-benchmark
161+
cd gs-benchmark
162+
echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV}
163+
python install.py ${model_name}
164+
else
165+
pip install -r gs-pytorch/.ci/docker/requirements-ci.txt
166+
fi
167+
pip uninstall -y torch && pip uninstall -y torch
168+
- name: Bisect search pytorch
169+
if: ${{ contains(inputs.search_commits, 'pytorch') }}
170+
run: |
171+
# take the "old/new" pair after "pytorch=" from the SEARCH_COMMITS job env var
pytorch_commits="$(echo "${SEARCH_COMMITS}" |sed 's+.*pytorch=++;s+,.*++')"
172+
old_commit="$(echo ${pytorch_commits} |awk -F '/' '{print $1}')"
173+
new_commit="$(echo ${pytorch_commits} |awk -F '/' '{print $2}')"
174+
old_status="$(${{ github.workspace }}/gs-scripts/.github/scripts/bisect_search.sh \
175+
--WORKSPACE="${{ github.workspace }}/gs-search" \
176+
--PYTORCH_VERSION="${old_commit}" \
177+
--TORCH_XPU_OPS_VERSION="${LATEST_XPU_COMMIT}" \
178+
> ${{ github.workspace }}/gs-logs/search-${old_commit}-${LATEST_XPU_COMMIT}.log 2>&1 && echo $? || echo $?)"
179+
old_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
180+
export SEARCH_GOOD_VALUE="$(echo ${old_result} |awk -F, '{print $3}')"
181+
new_status="$(${{ github.workspace }}/gs-scripts/.github/scripts/bisect_search.sh \
182+
--WORKSPACE="${{ github.workspace }}/gs-search" \
183+
--PYTORCH_VERSION="${new_commit}" \
184+
--TORCH_XPU_OPS_VERSION="${LATEST_XPU_COMMIT}" \
185+
> ${{ github.workspace }}/gs-logs/search-${new_commit}-${LATEST_XPU_COMMIT}.log 2>&1 && echo $? || echo $?)"
186+
new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
187+
if [ "${old_status}" != "${new_status}" ];then
188+
cd gs-pytorch
189+
git reset --hard
190+
rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/
191+
git bisect start ${new_commit} ${old_commit}
192+
git bisect run ./gs-scripts/.github/scripts/bisect_search.sh \
193+
--WORKSPACE="${{ github.workspace }}/gs-search" \
194+
--PYTORCH_VERSION="search" \
195+
--TORCH_XPU_OPS_VERSION="${LATEST_XPU_COMMIT}" \
196+
2>&1 |tee ${{ github.workspace }}/gs-logs/bisect-pytorch.log
197+
git bisect log |tee ${{ github.workspace }}/gs-logs/result-pytorch.log
198+
else
199+
echo "Checked and no regression !"
200+
fi
201+
- name: Bisect search torch-xpu-ops
202+
if: ${{ contains(inputs.search_commits, 'xpu-ops') }}
203+
run: |
204+
# take the "old/new" pair after "xpu-ops=" from the SEARCH_COMMITS job env var
xpu_ops_commits="$(echo "${SEARCH_COMMITS}" |sed 's+.*xpu-ops=++;s+,.*++')"
205+
old_commit="$(echo ${xpu_ops_commits} |awk -F '/' '{print $1}')"
206+
new_commit="$(echo ${xpu_ops_commits} |awk -F '/' '{print $2}')"
207+
old_status="$(${{ github.workspace }}/gs-scripts/.github/scripts/bisect_search.sh \
208+
--WORKSPACE="${{ github.workspace }}/gs-search" \
209+
--PYTORCH_VERSION="${LATEST_PT_COMMIT}" \
210+
--TORCH_XPU_OPS_VERSION="${old_commit}" \
211+
> ${{ github.workspace }}/gs-logs/search-${LATEST_PT_COMMIT}-${old_commit}.log 2>&1 && echo $? || echo $?)"
212+
old_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
213+
export SEARCH_GOOD_VALUE="$(echo ${old_result} |awk -F, '{print $3}')"
214+
new_status="$(${{ github.workspace }}/gs-scripts/.github/scripts/bisect_search.sh \
215+
--WORKSPACE="${{ github.workspace }}/gs-search" \
216+
--PYTORCH_VERSION="${LATEST_PT_COMMIT}" \
217+
--TORCH_XPU_OPS_VERSION="${new_commit}" \
218+
> ${{ github.workspace }}/gs-logs/search-${LATEST_PT_COMMIT}-${new_commit}.log 2>&1 && echo $? || echo $?)"
219+
new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)"
220+
if [ "${old_status}" != "${new_status}" ];then
221+
# old_commit/new_commit are torch-xpu-ops revisions, so the bisect (and the
# "search" mode's git rev-parse HEAD) must run inside the torch-xpu-ops clone
cd gs-torch-xpu-ops
222+
git reset --hard
223+
rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/
224+
git bisect start ${new_commit} ${old_commit}
225+
git bisect run ./gs-scripts/.github/scripts/bisect_search.sh \
226+
--WORKSPACE="${{ github.workspace }}/gs-search" \
227+
--PYTORCH_VERSION="${LATEST_PT_COMMIT}" \
228+
--TORCH_XPU_OPS_VERSION="search" \
229+
2>&1 |tee ${{ github.workspace }}/gs-logs/bisect-torch-xpu-ops.log
230+
git bisect log |tee ${{ github.workspace }}/gs-logs/result-torch-xpu-ops.log
231+
else
232+
echo "Checked and no regression !"
233+
fi
234+
- name: Summary
235+
run: |
236+
cat gs-logs/summary.csv |tee -a ${GITHUB_STEP_SUMMARY}
237+
for result_log in $(find gs-logs -name "result-*.log")
238+
do
239+
echo -e "\n\n\n${result_log}" |tee -a "${GITHUB_STEP_SUMMARY}"
240+
cat "${result_log}" |tee -a "${GITHUB_STEP_SUMMARY}"
241+
done
242+
- name: Upload Logs
243+
if: ${{ ! cancelled() }}
244+
uses: actions/upload-artifact@v4
245+
with:
246+
name: bisect-search
247+
path: ${{ github.workspace }}/gs-logs

0 commit comments

Comments
 (0)