diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml new file mode 100644 index 0000000000..c55ca37cc6 --- /dev/null +++ b/.github/actions/get-runner/action.yml @@ -0,0 +1,47 @@ +name: Get Runner Infos + +outputs: + runner_id: + value: ${{ steps.runner.outputs.runner_id }} + user_id: + value: ${{ steps.runner.outputs.user_id }} + render_id: + value: ${{ steps.runner.outputs.render_id }} + hostname: + value: ${{ steps.runner.outputs.hostname }} + +permissions: read-all + +runs: + using: composite + steps: + - name: Get runner + shell: bash -xe {0} + id: runner + run: | + # get test runner + echo "runner_id=$(echo ${RUNNER_NAME} |sed 's/\-[0-9]$//')" |tee -a ${GITHUB_OUTPUT} + echo "user_id=$(id -u)" |tee -a ${GITHUB_OUTPUT} + echo "render_id=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} + echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT} + # show host info + lscpu + lshw -C display + free -h + df -h + cat /etc/os-release + uname -a + - name: Cleanup host + shell: bash -xe {0} + run: | + # clean docker cache + docker system prune -af || true + # clean workspace + ls -al + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + cd ${RUNNER_WORKSPACE}/.. + if [ "${PWD}" != "/" ];then + ls -al + sudo chmod 777 -R torch-xpu-ops _temp _actions _tool || true + sudo rm -rf _temp + fi diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml deleted file mode 100644 index 1631f399f2..0000000000 --- a/.github/actions/inductor-xpu-e2e-test/action.yml +++ /dev/null @@ -1,185 +0,0 @@ -name: inductor-xpu-e2e-test - -inputs: - suite: - required: true - type: string - default: 'huggingface' - description: Dynamo benchmarks test suite. huggingface,timm_models,torchbench. 
Delimiter is comma - env_prepare: - required: false - description: If set to any value, will prepare suite test env - dt: - required: true - type: string - default: 'float32' - description: Data precision of the test.float32,bfloat16,float16,amp_bf16,amp_fp16. Delimiter is comma - mode: - required: true - type: string - default: 'inference' - description: inference,training. Delimiter is comma - scenario: - required: true - type: string - default: 'accuracy' - description: accuracy,performance. Delimiter is comma - cards: - required: false - type: string - default: 'all' - description: which cards can be used in the test - hf_token: - required: false - description: HUGGING_FACE_HUB_TOKEN for torchbench test - pytorch: - required: false - type: string - default: 'main' - description: Pytorch branch/commit - driver: - required: false - type: string - default: 'lts' - description: Driver lts/rolling - -runs: - using: composite - steps: - - name: Prepare ENV - if: ${{ inputs.env_prepare }} - shell: bash - run: | - source activate e2e_ci - if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd ../ && rm -rf audio && git clone --single-branch -b main https://github.com/pytorch/audio.git - cd audio && git checkout $TORCHAUDIO_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchaudio -y && pip install dist/*.whl - cd ../ && rm -rf vision && git clone --single-branch -b main https://github.com/pytorch/vision.git - cd vision && git checkout $TORCHVISION_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchvision -y && pip install dist/*.whl - fi - cd ../ && python -c "import torch, torchvision, torchaudio" - rm -rf benchmark && git clone https://github.com/pytorch/benchmark.git - cd benchmark && git checkout $TORCHBENCH_COMMIT_ID - # remove deps which will reinstall torch - pip install --no-deps accelerate - pip install --no-deps 
git+https://github.com/huggingface/pytorch-image-models@$TIMM_COMMIT_ID - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/$TIMM_COMMIT_ID/requirements.txt | grep -vE torch) - pip install -U transformers==4.44.2 - sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g' requirements.txt - git status && git diff - pip install -r requirements.txt - python install.py --continue_on_fail - # deps for torchrec_dlrm - pip install pyre_extensions - pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu - pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec - fi - if [[ ${{ inputs.suite }} == *"huggingface"* ]]; then - pip install -U transformers==4.44.2 - fi - if [[ ${{ inputs.suite }} == *"timm_models"* ]]; then - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd ../ && rm -rf vision && git clone --single-branch -b main https://github.com/pytorch/vision.git - cd vision && git checkout $TORCHVISION_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchvision -y && pip install dist/*.whl - fi - # install timm without dependencies - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@$TIMM_COMMIT_ID - # install timm dependencies without torch and torchvision - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/$TIMM_COMMIT_ID/requirements.txt | grep -vE torch) - fi - pip install numpy==1.26.4 - - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - env: - HUGGING_FACE_HUB_TOKEN: ${{ inputs.hf_token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - shell: bash - run: | - source activate e2e_ci - cp .github/scripts/inductor_xpu_test.sh ../pytorch - cd ../pytorch - - # check param - function contains() { - contains_status="echo 'Start $2 
...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" - contains_status="continue" - } - } - set -xe - xpu_num=$(clinfo --list |awk 'BEGIN{gpu=0;}{if(gpu==1 && $0~/Platform/){gpu=0;}; if(gpu==1){print $0;}; if($0~/Platform.*Graphics/){gpu=1;}}' |wc -l) - cores_per_instance="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk -v i="${xpu_num}" 'BEGIN{sum=1}{sum*=$NF}END{print sum/i}')" - export OMP_NUM_THREADS=${cores_per_instance} - for suite in $(echo ${{ inputs.suite }} |sed 's/,/ /g') - do - if [ "${suite}" == "pt2e" ];then - continue - fi - contains "huggingface,timm_models,torchbench" $suite - $contains_status - for dt in $(echo ${{ inputs.dt }} |sed 's/,/ /g') - do - contains "float32,bfloat16,float16,amp_bf16,amp_fp16" $dt - $contains_status - for mode in $(echo ${{ inputs.mode }} |sed 's/,/ /g') - do - contains "inference,training" $mode - $contains_status - for scenario in $(echo ${{ inputs.scenario }} |sed 's/,/ /g') - do - contains "accuracy,performance" $scenario - $contains_status - if [ "${MODEL_ONLY_NAME}" == "" ];then - for xpu_id in $(seq 0 $[ ${xpu_num} - 1 ]) - do - cpu_list="$(echo "${cores_per_instance} ${xpu_id}" |awk '{printf("%d-%d", $1*$2, $1*$2+$1-1)}')" - numactl --localalloc --physcpubind=${cpu_list} bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id} static ${xpu_num} ${xpu_id} & - done - else - for test_model in $(echo ${MODEL_ONLY_NAME} |sed 's/,/ /g') - do - numactl --localalloc bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 0 static 1 0 ${test_model} - done - fi - wait - # summarize pass rate - LOG_DIR="inductor_log/${suite}/${dt}" - LOG_NAME=inductor_${suite}_${dt}_${mode}_xpu_${scenario}_all.log - rm -f ${LOG_DIR}/${LOG_NAME} - find ${LOG_DIR}/ -name "inductor_${suite}_${dt}_${mode}_xpu_${scenario}_card*.log" |xargs cat >> ${LOG_DIR}/${LOG_NAME} 2>&1 - done - done - done - done - - - name: Summary E2E Test (${{ inputs.suite }} ${{ 
inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - env: - HUGGING_FACE_HUB_TOKEN: ${{ inputs.hf_token }} - shell: bash - run: | - cd ../pytorch - rm -f inductor_log/summary_accuracy.csv - for var in $(find inductor_log/ -name "inductor_*_xpu_accuracy.csv") - do - sed -i "s/$/,$(basename $var)/" $var - cat $var >> inductor_log/summary_accuracy.csv - done - - source activate e2e_ci - cd ${{ github.workspace }} - cp .github/scripts/inductor_summary.py ../pytorch - cd ../pytorch - pip install styleFrame scipy pandas - set -xe - dt=$(echo ${{ inputs.dt }} |sed 's/,/ /g') - mode=$(echo ${{ inputs.mode }} |sed 's/,/ /g') - suite=$(echo ${{ inputs.suite }} |sed 's/,/ /g') - scenario=$(echo ${{ inputs.scenario }} |sed 's/,/ /g') - python inductor_summary.py -p ${dt} -s ${suite} -m ${mode} -sc ${scenario} diff --git a/.github/actions/linux-e2etest/action.yml b/.github/actions/linux-e2etest/action.yml new file mode 100644 index 0000000000..52ef8a4cc9 --- /dev/null +++ b/.github/actions/linux-e2etest/action.yml @@ -0,0 +1,111 @@ +name: Linux E2E Test + +inputs: + env_prepare: + required: false + description: If set to any value, will prepare suite test env + suite: + required: true + type: string + default: 'huggingface' + description: Dynamo benchmarks test suite. huggingface,timm_models,torchbench. Delimiter is comma + dt: + required: true + type: string + default: 'float32' + description: Data precision of the test.float32,bfloat16,float16,amp_bf16,amp_fp16. Delimiter is comma + mode: + required: true + type: string + default: 'inference' + description: inference,training. Delimiter is comma + scenario: + required: true + type: string + default: 'accuracy' + description: accuracy,performance. 
Delimiter is comma + +runs: + using: composite + steps: + - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) + shell: bash -x {0} + run: | + pip list |grep -E 'intel|torch' + cp ./.github/scripts/inductor_xpu_test.sh ./pytorch + cd ./pytorch + # check param + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not supported type! Skipped!" + contains_status="continue" + } + } + xpu_num=$(clinfo --list |awk 'BEGIN{gpu=0;}{if(gpu==1 && $0~/Platform/){gpu=0;}; if(gpu==1){print $0;}; if($0~/Platform.*Graphics/){gpu=1;}}' |wc -l) + cores_per_instance="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk -v i="${xpu_num}" 'BEGIN{sum=1}{sum*=$NF}END{print sum/i}')" + export OMP_NUM_THREADS=${cores_per_instance} + for suite in $(echo ${{ inputs.suite }} |sed 's/,/ /g') + do + if [ "${suite}" == "pt2e" ];then + continue + fi + contains "huggingface,timm_models,torchbench" $suite + $contains_status + for dt in $(echo ${{ inputs.dt }} |sed 's/,/ /g') + do + contains "float32,bfloat16,float16,amp_bf16,amp_fp16" $dt + $contains_status + for mode in $(echo ${{ inputs.mode }} |sed 's/,/ /g') + do + contains "inference,training" $mode + $contains_status + for scenario in $(echo ${{ inputs.scenario }} |sed 's/,/ /g') + do + contains "accuracy,performance" $scenario + $contains_status + if [ "${MODEL_ONLY_NAME}" == "" ];then + for xpu_id in $(seq 0 $[ ${xpu_num} - 1 ]) + do + cpu_list="$(echo "${cores_per_instance} ${xpu_id}" |awk '{printf("%d-%d", $1*$2, $1*$2+$1-1)}')" + numactl --localalloc --physcpubind=${cpu_list} bash -x inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id} static ${xpu_num} ${xpu_id} & + done + else + for test_model in $(echo ${MODEL_ONLY_NAME} |sed 's/,/ /g') + do + numactl --localalloc bash -x inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 0 static 1 0 ${test_model} + done + fi + wait + # summarize pass 
rate + LOG_DIR="inductor_log/${suite}/${dt}" + LOG_NAME=inductor_${suite}_${dt}_${mode}_xpu_${scenario}_all.log + rm -f ${LOG_DIR}/${LOG_NAME} + find ${LOG_DIR}/ -name "inductor_${suite}_${dt}_${mode}_xpu_${scenario}_card*.log" |xargs cat >> ${LOG_DIR}/${LOG_NAME} 2>&1 + done + done + done + done + + - name: Summary E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) + shell: bash -xe {0} + run: | + cd ./pytorch + rm -f inductor_log/summary_accuracy.csv + for var in $(find inductor_log/ -name "inductor_*_xpu_accuracy.csv") + do + sed -i "s/$/,$(basename $var)/" $var + cat $var >> inductor_log/summary_accuracy.csv + done + cp ${{ github.workspace }}/.github/scripts/inductor_summary.py ./ + csv_file="$(find inductor_log/ -name "inductor_*_xpu_*.csv" |tail -n 1)" + if [ -f "${csv_file}" ];then + pip install styleFrame scipy pandas + dt=$(echo ${{ inputs.dt }} |sed 's/,/ /g') + mode=$(echo ${{ inputs.mode }} |sed 's/,/ /g') + suite=$(echo ${{ inputs.suite }} |sed 's/,/ /g') + scenario=$(echo ${{ inputs.scenario }} |sed 's/,/ /g') + python inductor_summary.py -p ${dt} -s ${suite} -m ${mode} -sc ${scenario} + fi diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml new file mode 100644 index 0000000000..188dacc29b --- /dev/null +++ b/.github/actions/linux-testenv/action.yml @@ -0,0 +1,168 @@ +name: Setup Test Environment + +inputs: + test_type: + required: true + type: string + description: Test scope + pytorch: + type: string + default: 'main' + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: + type: string + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + python: + type: string + default: '3.10' + description: 
Python version + suite: + type: string + default: 'huggingface' + description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma + +permissions: read-all + +runs: + using: composite + steps: + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} + - name: Check runner + shell: bash -xe {0} + run: | + hostname && id + cat /etc/os-release + gcc -v && g++ -v + which python && python -V + which pip && pip list + pip install -U pip wheel setuptools + uname -a + dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' + clinfo --list + cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq -c + rm -rf ~/.triton /tmp/*inductor* + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + with: + path: torch-xpu-ops + - name: Install oneAPI DLE + shell: bash -xe {0} + if: ${{ inputs.oneapi != 'installed' }} + run: | + rm -rf ~/intel ~/.intel + wget -q -O oneapi.sh "${{ inputs.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} + source ${HOME}/intel/oneapi/setvars.sh + sycl-ls && icpx -v + - name: Download Pytorch wheel + if: ${{ ! 
contains(inputs.test_type, 'wheel') }} + uses: actions/download-artifact@v4 + with: + pattern: Torch-XPU-Wheel-* + - name: Install E2E Requirements + if: ${{ contains(inputs.test_type, 'e2e') }} + shell: bash -xe {0} + run: | + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu + pip install pandas psutil scipy + if [[ "${{ inputs.suite }}" == *"huggingface"* ]];then + pip install transformers==4.44.2 + elif [[ "${{ inputs.suite }}" == *"timm_models"* ]];then + pip install timm==1.0.14 + elif [[ "${{ inputs.suite }}" == *"torchbench"* ]];then + rm -rf ./benchmark + git clone https://github.com/pytorch/benchmark + cd benchmark + git checkout e03a63be43e33596f7f0a43b0f530353785e4a59 + pip install -r requirements.txt + pip install -U transformers==4.44.2 timm==1.0.14 pyre-extensions + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install + python install.py --continue_on_fail + elif [[ "${{ inputs.suite }}" == *"pt2e"* ]];then + rm -rf ./benchmark + git clone -b yifeng/pt2e_xpu https://github.com/zxd1997066/benchmark + cd benchmark + pip install -r requirements.txt + pip install -U transformers==4.44.2 timm==1.0.14 pyre-extensions + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install + python install.py --continue_on_fail + fi + pip uninstall -y torch torchvision torchaudio pytorch-triton-xpu triton + pip uninstall -y torch torchvision torchaudio pytorch-triton-xpu triton + - name: Prepare Stock Pytorch + shell: bash -xe {0} + run: | + # install pytorch + if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" -c) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" -c) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url 
https://download.pytorch.org/whl/test/xpu + elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" -c) -ne 0 ];then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + else + pip install --force-reinstall $(find ${{ github.workspace }}/ -name "*torch*.whl") + fi + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then + PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" + else + PYTORCH_REPO="https://github.com/pytorch/pytorch.git" + fi + git clone ${PYTORCH_REPO} pytorch + cd pytorch + git checkout ${TORCH_COMMIT_ID} + # apply extra PRs for stock pytorch + pip install requests + if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 + else + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + fi + git status && git diff && git show -s + - name: Prepare Torch-xpu-ops + shell: bash -xe {0} + if: ${{ inputs.torch_xpu_ops != 'skipped' }} + run: | + cd pytorch + rm -rf third_party/torch-xpu-ops + if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then + TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" + TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" + else + TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" + if [ "${{ inputs.torch_xpu_ops }}" == "pinned" ];then + TORCH_XPU_OPS_COMMIT="$(cat third_party/xpu.txt)" + else + TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" + fi + fi + if [ "${{ inputs.torch_xpu_ops }}" == "cicd" ] || [ "${{ inputs.torch_xpu_ops }}" == "triggered" ];then + cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops + cd third_party/torch-xpu-ops + else + git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops + cd third_party/torch-xpu-ops + git checkout ${TORCH_XPU_OPS_COMMIT} + fi + git status 
&& git diff && git show -s + - name: Torch Config + shell: bash -xe {0} + run: | + printenv + python -c "import torch; print(torch.__config__.show())" + python -c "import torch; print(torch.__config__.parallel_info())" + python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" + python -c "import triton; print(triton.__version__)" + python pytorch/torch/utils/collect_env.py + pip list |grep -E 'torch|intel' diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml new file mode 100644 index 0000000000..d522dc4691 --- /dev/null +++ b/.github/actions/linux-uttest/action.yml @@ -0,0 +1,176 @@ +name: Linux Unit Test + +inputs: + test_type: + required: true + type: string + description: Test scope + +permissions: read-all + +runs: + using: composite + steps: + - name: requirements + shell: bash -xe {0} + run: | + pip install -r pytorch/.ci/docker/requirements-ci.txt + pip install -U pytest-timeout pytest-xdist + xpu_num=$(clinfo --list |awk 'BEGIN{gpu=0;}{if(gpu==1 && $0~/Platform/){gpu=0;}; if(gpu==1){print $0;}; if($0~/Platform.*Graphics/){gpu=1;}}' |wc -l) + parallel_options="" + if [ ${xpu_num} -gt 1 ];then + parallel_options+=" --dist worksteal " + for x in $(seq 0 $[ ${xpu_num} - 1 ]) + do + parallel_options+=" --tx popen//env:ZE_AFFINITY_MASK=${x} " + done + else + parallel_options+=" -n 1 " + fi + printf " --timeout 600 --timeout_method=thread ${parallel_options} " > ${{ github.workspace }}/test-options.txt + - name: ut_regression + shell: bash -xe {0} + if: ${{ inputs.test_type == 'ut_regression' }} + run: | + mkdir -p ut_log/ut_regression + cd pytorch/third_party/torch-xpu-ops/test/regressions + pytest $(cat ${{ github.workspace }}/test-options.txt) -v --junit-xml=${{ github.workspace }}/ut_log/ut_regression.xml \ + 2> ${{ github.workspace 
}}/ut_log/ut_regression/ut_regression_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_regression/ut_regression_test.log + - name: ut_transformers + shell: bash -xe {0} + if: ${{ inputs.test_type == 'ut_transformers' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + mkdir -p ut_log/ut_transformers + cd pytorch + pytest $(cat ${{ github.workspace }}/test-options.txt) -v test/test_transformers.py -k xpu \ + --junit-xml=${{ github.workspace }}/ut_log/ut_transformers.xml \ + 2> ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_transformers/ut_transformers_test.log + - name: ut_extended + shell: bash -xe {0} + if: ${{ inputs.test_type == 'ut_extended' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + mkdir -p ut_log/ut_extended + cd pytorch/third_party/torch-xpu-ops/test/xpu/extended + python run_test_with_skip.py \ + 2> ${{ github.workspace }}/ut_log/ut_extended/ut_extended_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_extended/ut_extended_test.log + ls -al + cp *.xml ${{ github.workspace }}/ut_log + - name: ut_op + shell: bash -xe {0} + if: ${{ inputs.test_type == 'ut_op' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + export PYTORCH_ENABLE_XPU_FALLBACK=1 + mkdir -p ut_log/ut_op + cd pytorch/third_party/torch-xpu-ops/test/xpu + python run_test_with_skip.py \ + 2> ${{ github.workspace }}/ut_log/ut_op/ut_op_with_skip_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_op/ut_op_with_skip_test.log + ls -al + cp *.xml ${{ github.workspace }}/ut_log + find ut_op_with_skip_nn ut_op_with_skip_quantization/core -type f -exec sh -c ' + dir_path=$(dirname "$1"); + case "$dir_path" in + *"ut_op_with_skip_quantization/core"*) + dir_name="ut_op_with_skip_quantization_core";; + *) + dir_name=$(basename "$dir_path");; + esac; + mv "$1" "$dir_path/${dir_name}_$(basename "$1")" + ' _ {} \; + ls -al ut_op_with_skip_nn ut_op_with_skip_quantization/core + cp ut_op_with_skip_nn/*.xml ${{ 
github.workspace }}/ut_log + cp ut_op_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log + # Cases run with an on-demand white list, since some suites are too + # slow to go through all operators on CPU. So add cases on-demand + # when XPU implementation is done. + # test_foreach, test_decomp + # Run with only + python run_test_with_only.py \ + 2> ${{ github.workspace }}/ut_log/ut_op/ut_op_with_only_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_op/ut_op_with_only_test.log + ls -al + cp *.xml ${{ github.workspace }}/ut_log + - name: ut_torch + shell: bash -xe {0} + if: ${{ inputs.test_type == 'ut_torch' }} + run: | + export PYTORCH_TEST_WITH_SLOW=1 + export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" + mkdir -p ut_log/ut_torch + cd pytorch + test_cmd="python test/run_test.py --include " + for test in $(ls test/inductor | grep test); do test_cmd="${test_cmd} inductor/$test"; done + for test in $(ls test/xpu | grep test); do test_cmd="${test_cmd} xpu/$test"; done + if [ -f "test/test_xpu.py" ]; then test_cmd="${test_cmd} test_xpu.py"; fi + eval $test_cmd 2> ${{ github.workspace }}/ut_log/ut_torch/torch_xpu_test_error.log | \ + tee ${{ github.workspace }}/ut_log/ut_torch/torch_xpu_test.log + - name: ut_profiling + shell: bash -xe {0} + if: ${{ inputs.test_type == 'ut_profiling' }} + run: | + mkdir -p ut_log/xpu_profiling/issue_reproduce + cd pytorch/third_party/torch-xpu-ops + # RN50 Test + PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0 + cp profiling.fp32.train.pt ${{ github.workspace }}/ut_log/xpu_profiling + # All Issue Reproduce UT + python -u test/profiling/correlation_id_mixed.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/correlation_id_mixed.log + python -u test/profiling/reproducer.missing.gpu.kernel.time.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/reproducer.missing.gpu.kernel.time.log + python -u 
test/profiling/time_precision_in_profile.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/time_precision_in_profile.log + python -u test/profiling/profile_partial_runtime_ops.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/profile_partial_runtime_ops.log + python -u test/profiling/triton_xpu_ops_time.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/issue_reproduce/triton_xpu_ops_time.log + + # llama case for calls number test + pip install transformers + python test/profiling/llama.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/llama.log + python .github/scripts/llama_summary.py -i ${{ github.workspace }}/ut_log/xpu_profiling/llama.log -o ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv + bash .github/scripts/check_baseline.sh .github/scripts/llama_baseline.csv ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv + + # All xpu ut under test/profiler + cd ../../test/profiler + python -m pytest $(cat ${{ github.workspace }}/test-options.txt) -vs test_cpp_thread.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/test_cpp_thread.log + python -m pytest $(cat ${{ github.workspace }}/test-options.txt) -vs test_execution_trace.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/test_execution_trace.log + python -m pytest $(cat ${{ github.workspace }}/test-options.txt) -vs test_memory_profiler.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/test_memory_profiler.log + python -m pytest $(cat ${{ github.workspace }}/test-options.txt) -vs test_profiler_tree.py | \ + tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log + + - name: xpu_dev1 + shell: bash -xe {0} + if: ${{ inputs.test_type == 'xpu_dev1' }} + run: | + mkdir -p ut_log/xpu_dev1 + cd pytorch/third_party/torch-xpu-ops/test/regressions + pytest --timeout 200 -v test_operation_on_device_1.py \ + --junit-xml=${{ github.workspace }}/ut_log/xpu_dev1.xml \ + 2> ${{ github.workspace 
}}/ut_log/xpu_dev1/xpu_dev1_test_error.log | \ + tee ${{ github.workspace }}/ut_log/xpu_dev1/xpu_dev1_test.log + + - name: xpu_distributed + shell: bash -x -e -o pipefail {0} + if: ${{ inputs.test_type == 'xpu_distributed' }} + run: | + mkdir -p ut_log/xpu_distributed + cd pytorch/third_party/torch-xpu-ops/test/xpu + XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") + if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then + echo -e "[ERROR] XCCL is not enabled" + exit 1 + fi + timeout 1800 python run_distributed.py \ + 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ + tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index ac4067e7ce..5fc3a9993c 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -14,28 +14,19 @@ inputs: type: string default: 'accuracy' description: accuracy,performance. 
Delimiter is comma - hf_token: - required: false - description: HUGGING_FACE_HUB_TOKEN for torchbench test pytorch: required: false type: string default: 'main' description: Pytorch branch/commit - driver: - required: false - type: string - default: 'lts' - description: Driver lts/rolling runs: using: composite steps: - name: Prepare ENV if: ${{ inputs.env_prepare }} - shell: bash + shell: bash -xe {0} run: | - source activate e2e_ci # accuracy code if [[ "${{ inputs.scenario }}" == *"accuracy"* ]];then rm -rf pt2e-accuracy @@ -46,38 +37,6 @@ runs: rm -rf pt2e-performance git clone -b yifeng/pt2e_xpu https://github.com/zxd1997066/benchmark pt2e-performance fi - # deps - if [[ ${{ inputs.scenario }} == *"performance"* ]]; then - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - rm -rf pt2e-audio - git clone --single-branch -b main https://github.com/pytorch/audio pt2e-audio - cd pt2e-audio && git checkout $TORCHAUDIO_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchaudio -y && pip install dist/*.whl - cd ../ - rm -rf pt2e-vision - git clone --single-branch -b main https://github.com/pytorch/vision pt2e-vision - cd pt2e-vision && git checkout $TORCHVISION_COMMIT_ID - python setup.py bdist_wheel && pip uninstall torchvision -y && pip install dist/*.whl - cd ../ - fi - # torchbench - python -c "import torch, torchvision, torchaudio" - cd pt2e-performance - # remove deps which will reinstall torch - pip install --no-deps accelerate - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@$TIMM_COMMIT_ID - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/$TIMM_COMMIT_ID/requirements.txt | grep -vE torch) - pip install -U transformers==4.44.2 - sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g' requirements.txt - git status && git diff - pip install -r requirements.txt - python install.py --continue_on_fail - # deps for torchrec_dlrm - pip install 
pyre_extensions - pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu - pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec - fi - pip install numpy==1.26.4 # dataset if [ ! -d ${HOME}/datasets/imagenet ];then rm -rf ${HOME}/datasets/imagenet @@ -89,14 +48,8 @@ runs: bash valprep.sh fi - name: PT2E Test (${{ inputs.dt }} ${{ inputs.scenario }}) - env: - HUGGING_FACE_HUB_TOKEN: ${{ inputs.hf_token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - shell: bash + shell: bash -xe {0} run: | - source activate e2e_ci - set -xe pt2e_logs_dir="${{ github.workspace }}/../pytorch/inductor_log/pt2e" rm -rf "${pt2e_logs_dir}" && mkdir -p "${pt2e_logs_dir}" echo "Mode,Model,Dtype,Result" |tee ${pt2e_logs_dir}/summary.csv @@ -107,14 +60,14 @@ runs: do if [[ "${{ inputs.dt }}" == *"float32"* ]];then ${cmd_line} --model_list ${model_name} --is_fp32 2>&1 |tee "${pt2e_logs_dir}/accuracy-float32-${model_name}.log" || true - grep -i 'Acc.1.*Acc.5' "${pt2e_logs_dir}/accuracy-float32-${model_name}.log" |tail -n 1 |awk -v m="${model_name}" ' + (grep -i 'Acc.1.*Acc.5' "${pt2e_logs_dir}/accuracy-float32-${model_name}.log" || echo "failed a failed") 2>&1 |tail -n 1 |awk -v m="${model_name}" ' BEGIN{acc1 = "failed"; acc5 = "failed";} {acc1 = $(NF - 2); acc5 = $NF;} END{printf("Accuracy,%s,float32,%s,%s\n", m, acc1, acc5) }' |tee -a ${pt2e_logs_dir}/summary.csv fi if [[ "${{ inputs.dt }}" == *"int8"* ]];then ${cmd_line} --model_list ${model_name} 2>&1 |tee "${pt2e_logs_dir}/accuracy-int8-${model_name}.log" || true - grep -i 'Acc.1.*Acc.5' "${pt2e_logs_dir}/accuracy-int8-${model_name}.log" |tail -n 1 |awk -v m="${model_name}" ' + (grep -i 'Acc.1.*Acc.5' "${pt2e_logs_dir}/accuracy-int8-${model_name}.log" || echo "failed a failed") 2>&1 |tail -n 1 |awk -v m="${model_name}" ' BEGIN{acc1 = "failed"; acc5 = "failed";} {acc1 = $(NF - 2); acc5 = $NF;} 
END{printf("Accuracy,%s,int8,%s,%s\n", m, acc1, acc5) }' |tee -a ${pt2e_logs_dir}/summary.csv diff --git a/.github/ci_expected_accuracy/check_expected.py b/.github/ci_expected_accuracy/check_expected.py index 48c09606de..3c82666af0 100644 --- a/.github/ci_expected_accuracy/check_expected.py +++ b/.github/ci_expected_accuracy/check_expected.py @@ -6,7 +6,7 @@ # Reference last updated is https://github.com/intel/torch-xpu-ops/pull/1223 parser = argparse.ArgumentParser(description="Accuracy Check", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("--driver", type=str, default="lts", help="rolling or lts") +parser.add_argument("--driver", type=str, default="rolling", help="rolling or lts") parser.add_argument("--category", type=str, default="inductor", help="inductor") parser.add_argument("--suite", type=str, required=True, help="huggingface, timm_models or torchbench") parser.add_argument("--mode", type=str, required=True, help="inference or training") diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index 5bce6eacdf..001e5c9b44 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -1,18 +1,18 @@ #!/bin/bash # Usage: # ./build.sh --WORKSPACE= \ -# --PYTORCH_REPO= --PYTORCH_VERSION= \ +# --PYTORCH_REPO= --PYTORCH_COMMIT= \ # --TORCH_XPU_OPS_REPO= \ -# --TORCH_XPU_OPS_VERSION= +# --TORCH_XPU_OPS_COMMIT= set -xe export GIT_PAGER=cat # Init params WORKSPACE=$(realpath ${WORKSPACE:-"/tmp"}) PYTORCH_REPO=${PYTORCH_REPO:-"https://github.com/pytorch/pytorch.git"} -PYTORCH_VERSION=${PYTORCH_VERSION:-"main"} +PYTORCH_COMMIT=${PYTORCH_COMMIT:-"main"} TORCH_XPU_OPS_REPO=${TORCH_XPU_OPS_REPO:-"https://github.com/intel/torch-xpu-ops.git"} -TORCH_XPU_OPS_VERSION=${TORCH_XPU_OPS_VERSION:-"main"} +TORCH_XPU_OPS_COMMIT=${TORCH_XPU_OPS_COMMIT:-"main"} for var; do eval "export $(echo ${var@Q} |sed "s/^'-*//g;s/=/='/")" done @@ -21,20 +21,20 @@ done rm -rf ${WORKSPACE}/pytorch git clone ${PYTORCH_REPO} ${WORKSPACE}/pytorch cd 
${WORKSPACE}/pytorch -git checkout ${PYTORCH_VERSION} +git checkout ${PYTORCH_COMMIT} git remote -v && git branch && git show -s git rev-parse HEAD > ${WORKSPACE}/pytorch.commit # Set torch-xpu-ops -if [ "${TORCH_XPU_OPS_VERSION,,}" == "pinned" ];then +if [ "${TORCH_XPU_OPS_COMMIT,,}" == "pinned" ];then TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" - TORCH_XPU_OPS_VERSION="$(cat ${WORKSPACE}/pytorch/third_party/xpu.txt)" + TORCH_XPU_OPS_COMMIT="$(cat ${WORKSPACE}/pytorch/third_party/xpu.txt)" fi -if [ "${TORCH_XPU_OPS_VERSION,,}" != "cicd" ];then +if [ "${TORCH_XPU_OPS_COMMIT,,}" != "cicd" ];then rm -rf ${WORKSPACE}/torch-xpu-ops git clone ${TORCH_XPU_OPS_REPO} ${WORKSPACE}/torch-xpu-ops cd ${WORKSPACE}/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_VERSION} + git checkout ${TORCH_XPU_OPS_COMMIT} fi cd ${WORKSPACE}/torch-xpu-ops git remote -v && git branch && git show -s @@ -48,30 +48,32 @@ python -m pip install requests python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py git submodule sync && git submodule update --init --recursive python -m pip install -r requirements.txt -python -m pip install mkl-static mkl-include +python -m pip install mkl-static==2025.1.0 mkl-include==2025.1.0 export USE_STATIC_MKL=1 -export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ - intel-cmplr-lib-rt==2025.1.1 | \ - intel-cmplr-lib-ur==2025.1.1 | \ - intel-cmplr-lic-rt==2025.1.1 | \ - intel-sycl-rt==2025.1.1 | \ - oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | \ - onemkl-sycl-blas==2025.1.0 | \ - onemkl-sycl-dft==2025.1.0 | \ - onemkl-sycl-lapack==2025.1.0 | \ - onemkl-sycl-rng==2025.1.0 | \ - onemkl-sycl-sparse==2025.1.0 | \ - dpcpp-cpp-rt==2025.1.1 | \ - intel-opencl-rt==2025.1.1 | \ - mkl==2025.1.0 | \ - intel-openmp==2025.1.1 | \ - tbb==2022.1.0 | \ 
- tcmlib==1.3.0 | \ - umf==0.10.0 | \ - intel-pti==0.12.3 -" +if [ "${XPU_ONEAPI_PATH}" == "" ];then + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ + intel-cmplr-lib-rt==2025.1.1 | \ + intel-cmplr-lib-ur==2025.1.1 | \ + intel-cmplr-lic-rt==2025.1.1 | \ + intel-sycl-rt==2025.1.1 | \ + impi-rt==2021.15.0 | \ + dpcpp-cpp-rt==2025.1.1 | \ + oneccl-devel==2021.15.2 | \ + oneccl==2021.15.2 | \ + mkl==2025.1.0 | \ + onemkl-sycl-blas==2025.1.0 | \ + onemkl-sycl-dft==2025.1.0 | \ + onemkl-sycl-lapack==2025.1.0 | \ + onemkl-sycl-rng==2025.1.0 | \ + onemkl-sycl-sparse==2025.1.0 | \ + intel-opencl-rt==2025.1.1 | \ + intel-openmp==2025.1.1 | \ + tbb==2022.1.0 | \ + tcmlib==1.3.0 | \ + umf==0.10.0 | \ + intel-pti==0.12.3 + " +fi # Build sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index c9afb73eb8..ff78bffe12 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -186,14 +186,14 @@ def parse_log_file(log_file): return summary def determine_category(ut): - if ut == 'op_regression': - return 'op_regression' - elif ut == 'op_regression_dev1': - return 'op_regression_dev1' - elif ut == 'op_extended': - return 'op_extended' - elif 'op_ut' in ut: - return 'op_ut' + if ut == 'ut_regression': + return 'ut_regression' + elif ut == 'xpu_dev1': + return 'xpu_dev1' + elif ut == 'ut_extended': + return 'ut_extended' + elif 'ut_op' in ut: + return 'ut_op' else: return 'unknown' diff --git a/.github/scripts/e2e_summary.sh b/.github/scripts/e2e_summary.sh index c858f6f3f5..d4ad299b59 100644 --- a/.github/scripts/e2e_summary.sh +++ b/.github/scripts/e2e_summary.sh @@ -98,7 +98,7 @@ Empty means the cases NOT run\n\n" suite="$(echo "${csv}" |sed 's/.*inductor_//;s/_.*//;s/timm/timm_models/')" mode="$(echo "${csv}" |sed 's/_xpu_accuracy.*//;s/.*_//')" dtype="$(echo "${csv}" |sed -E 's/.*inductor_[a-z]*_//;s/models_//;s/_infer.*|_train.*//')" - python 
"${check_file}" --driver "${LTS_OR_ROLLING:-"lts"}" --suite "${suite}" --mode "${mode}" --dtype "${dtype}" --csv_file "${csv}" > "/tmp/tmp-${suite}-${mode}-${dtype}.txt" + python "${check_file}" --suite "${suite}" --mode "${mode}" --dtype "${dtype}" --csv_file "${csv}" > "/tmp/tmp-${suite}-${mode}-${dtype}.txt" test_result="$(sed 's/, /,/g' "/tmp/tmp-${suite}-${mode}-${dtype}.txt" |awk '{ if($0 ~/Total/){ total = $3; diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh old mode 100644 new mode 100755 index 3b17170385..d0f7cfd338 --- a/.github/scripts/env.sh +++ b/.github/scripts/env.sh @@ -1,6 +1,6 @@ #!/bin/bash -XPU_ONEAPI_PATH=${XPU_ONEAPI_PATH:-"/opt/intel/oneapi"} +XPU_ONEAPI_PATH="${XPU_ONEAPI_PATH:-"/opt/intel/oneapi"}" source ${XPU_ONEAPI_PATH}/compiler/latest/env/vars.sh source ${XPU_ONEAPI_PATH}/pti/latest/env/vars.sh diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 52baa15dd0..dd399471d9 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -1,5 +1,5 @@ #!/bin/bash -ut_suite="${1:-op_regression}" # op_regression / op_extended / op_ut / torch_xpu +ut_suite="${1:-ut_regression}" # ut_regression / ut_extended / ut_op / ut_torch # usage # compare_and_filter_logs [output.log] @@ -86,15 +86,12 @@ check_passed_known_issues() { local file_passed_UT="$1" local file_known_issue="$2" local output_file="${3:-${file_passed_UT%.*}_passed_known_issues.log}" - if [[ $# -lt 2 ]]; then echo "[ERROR] Need 2 files to compare" return 1 fi - echo "Checking for known issues that are now passing in $file_passed_UT" grep -Fxf "$file_passed_UT" "$file_known_issue" > "$output_file" - echo -e "\n\033[1;32m[New passed cases Summary]\033[0m" if [[ -s "$output_file" ]]; then cat "$output_file" @@ -104,7 +101,7 @@ check_passed_known_issues() { fi } -if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' || "${ut_suite}" == 'op_extended' || "${ut_suite}" == 'op_transformers' 
]]; then +if [[ "${ut_suite}" == 'ut_regression' || "${ut_suite}" == 'xpu_dev1' || "${ut_suite}" == 'ut_extended' || "${ut_suite}" == 'ut_transformers' ]]; then grep -E "FAILED" "${ut_suite}"_test.log | awk '{print $1}' | grep -v "FAILED" > ./"${ut_suite}"_failed.log grep -E "have failures" "${ut_suite}"_test.log | awk '{print $1}' >> ./"${ut_suite}"_failed.log grep -E "Timeout" "${ut_suite}"_test.log | grep "test" >> ./"${ut_suite}"_failed.log @@ -134,7 +131,7 @@ if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'op_ut' ]]; then +if [[ "${ut_suite}" == 'ut_op' ]]; then grep -E "FAILED" op_ut_with_skip_test.log | awk '{print $1}' | grep -v "FAILED" > ./"${ut_suite}"_with_skip_test_failed.log grep -E "have failures" op_ut_with_skip_test.log | awk '{print $1}' >> ./"${ut_suite}"_with_skip_test_failed.log grep -E "Timeout" op_ut_with_skip_test.log | grep "test" >> ./"${ut_suite}"_with_skip_test_failed.log @@ -178,8 +175,8 @@ if [[ "${ut_suite}" == 'op_ut' ]]; then num_failed_with_only=$(wc -l < "./${ut_suite}_with_only_test_failed.log") fi ((num_failed=num_failed_with_skip+num_failed_with_only)) - grep "PASSED" op_ut_with_skip_test.log | awk '{print $1}' > ./"${ut_suite}"_with_skip_test_passed.log - grep "PASSED" op_ut_with_only_test.log | awk '{print $1}' > ./"${ut_suite}"_with_only_test_passed.log + grep "PASSED" ut_op_with_skip_test.log | awk '{print $1}' > ./"${ut_suite}"_with_skip_test_passed.log + grep "PASSED" ut_op_with_only_test.log | awk '{print $1}' > ./"${ut_suite}"_with_only_test_passed.log num_passed_with_skip=$(wc -l < "./${ut_suite}_with_skip_test_passed.log") num_passed_with_only=$(wc -l < "./${ut_suite}_with_only_test_passed.log") ((num_passed=num_passed_with_skip+num_passed_with_only)) @@ -190,13 +187,13 @@ if [[ "${ut_suite}" == 'op_ut' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'torch_xpu' ]]; then +if [[ 
"${ut_suite}" == 'ut_torch' ]]; then echo "Pytorch XPU binary UT checking" cd ../../pytorch || exit for xpu_case in build/bin/*{xpu,sycl}*; do if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then case_name=$(basename "$xpu_case") - cd ../ut_log/torch_xpu || exit + cd ../ut_log/ut_torch || exit grep -E "FAILED|have failures" binary_ut_"${ut_suite}"_"${case_name}"_test.log | awk '{print $2}' > ./binary_ut_"${ut_suite}"_"${case_name}"_failed.log wc -l < "./binary_ut_${ut_suite}_${case_name}_failed.log" | tee -a ./binary_ut_"${ut_suite}"_failed_summary.log grep -E "PASSED|Pass" binary_ut_"${ut_suite}"_"${case_name}"_test.log | awk '{print $2}' > ./binary_ut_"${ut_suite}"_"${case_name}"_passed.log @@ -207,7 +204,7 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" - cd ../ut_log/torch_xpu || exit + cd ../ut_log/ut_torch || exit cat "./binary_ut_${ut_suite}_${case_name}_failed.log" num_failed_binary_ut=$(awk '{sum += $1};END {print sum}' binary_ut_"${ut_suite}"_failed_summary.log) num_passed_binary_ut=$(awk '{sum += $1};END {print sum}' binary_ut_"${ut_suite}"_passed_summary.log) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 8fbed99275..ccbac87b3d 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -3,165 +3,215 @@ name: Linux PyTorch XPU Build on: workflow_call: inputs: - pytorch: + runner: required: true + type: string + default: 'pvc_rolling' + description: Runner label + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel + pytorch: type: string default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false + description: Pytorch main by default, or 
'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - driver: + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + triton: required: false type: string - default: 'lts' - description: Driver lts/rolling + default: 'pinned' + description: Triton pinned by pytorch by default, or 'commit/branch', or 'repo@commit/repo@branch' + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed python: - required: false type: string default: '3.10' description: Python version - runner: - required: true - type: string - default: 'linux.idc.xpu' - description: Runner label - triton: - required: false - type: string - default: '' - description: Triton commit. Use pytorch pined commit by default - outputs: - torch_commit_id: - description: The commit id of the torch build - value: ${{ jobs.build.outputs.TORCH_COMMIT_ID }} permissions: read-all +defaults: + run: + shell: bash -xe {0} + jobs: - build: + runner: runs-on: ${{ inputs.runner }} + outputs: + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner + + build: + name: ${{ inputs.pytorch }} + needs: runner + if: ${{ ! 
contains(inputs.test_type, 'wheel') }} + runs-on: ${{ needs.runner.outputs.runner_id }} container: image: 'pytorch/manylinux2_28-builder:xpu-main' volumes: - ${{ github.workspace }}:${{ github.workspace }} env: - PATH: /opt/xpu-build/bin:/usr/share/Modules/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - commit_issue: 1280 - GH_TOKEN: ${{ github.token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - outputs: - TORCH_COMMIT_ID: ${{ steps.build_version.outputs.TORCH_COMMIT_ID }} + PATH: /tmp/xpu-tool/myvenv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + PIP_CACHE_DIR: /tmp/xpu-tool/.pipcache + env: + GH_TOKEN: ${{ github.token }} timeout-minutes: 300 steps: - - name: Setup based env + - name: Install gh-cli run: | + cat /etc/os-release + hostname && id # Cleanup workspace - rm -rf ${{ github.workspace }}/* - # Install gh - dnf install 'dnf-command(config-manager)' + find ./ |grep -v "^\./$" |xargs rm -rf + # install gh + dnf install -y 'dnf-command(config-manager)' dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo - dnf autoremove -y git236* && dnf install -y git - dnf install gh --repo gh-cli -y - # Setup python + dnf install -y gh --repo gh-cli + gh --version + - name: Setup python-${{ inputs.python }} + run: | + rm -rf /tmp/xpu-tool/myvenv local_python=$(echo ${{ inputs.python }} |awk -F. 
'{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') - /opt/python/${local_python}/bin/python -m venv /opt/xpu-build - which python && python -V && pip list + /opt/python/${local_python}/bin/python -m venv /tmp/xpu-tool/myvenv + which python && python -V + which pip && pip list pip install -U pip wheel setuptools - name: Checkout torch-xpu-ops uses: actions/checkout@v4 with: path: torch-xpu-ops - - name: Build Triton XPU + - name: Build Pytorch on ${{ needs.runner.outputs.hostname }} run: | - # gcc 13 - dnf install -y gcc-toolset-13-gcc-c++ - source /opt/rh/gcc-toolset-13/enable - dnf install -y zlib-devel - cd ../ && rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch - if [ -z ${{ inputs.triton }} ]; then - TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" - else - TRITON_COMMIT_ID="${{ inputs.triton }}" - fi - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - cp pytorch_triton_xpu-*.whl ${{ github.workspace }} - fi - - name: Build Pytorch XPU - run: | - set -xe -o pipefail - if [ "${{ inputs.driver }}" == "lts" ]; then - export TORCH_XPU_ARCH_LIST='pvc' + export USE_XCCL=1 + # only build pvc for CI + if [ "${{ inputs.test_type }}" == "build-cicd" ];then + export TORCH_XPU_ARCH_LIST='pvc' fi if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" - PYTORCH_VERSION="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" + PYTORCH_COMMIT="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" else PYTORCH_REPO="https://github.com/pytorch/pytorch.git" - 
PYTORCH_VERSION="${{ inputs.pytorch }}" + PYTORCH_COMMIT="${{ inputs.pytorch }}" fi - if [[ "${{ inputs.keep_torch_xpu_ops }}" == *"https://"* ]];then - TORCH_XPU_OPS_REPO="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/@.*//')" - TORCH_XPU_OPS_VERSION="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/.*@//')" - elif [ "${{ inputs.keep_torch_xpu_ops }}" == "true" ];then - TORCH_XPU_OPS_VERSION="pinned" + if [[ "${{ inputs.torch_xpu_ops }}" == *"https://"* ]];then + TORCH_XPU_OPS_REPO="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/@.*//')" + TORCH_XPU_OPS_COMMIT="$(echo ${{ inputs.torch_xpu_ops }} |sed 's/.*@//')" else - TORCH_XPU_OPS_VERSION="cicd" + TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" + TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi - # oneAPI DLE - source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh # gcc 11 source /opt/rh/gcc-toolset-11/enable - export USE_XCCL=1 + # oneAPI DLE + if [ "${{ inputs.oneapi }}" != "installed" ];then + rm -rf ${HOME}/intel ${HOME}/.intel /opt/intel + wget -q -O oneapi.sh "${{ inputs.oneapi }}" + bash oneapi.sh -a -s --eula accept --action install --install-dir /opt/intel/oneapi + export XPU_ONEAPI_PATH="/opt/intel/oneapi" + fi + source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ --WORKSPACE="${{ github.workspace }}" \ --PYTORCH_REPO="${PYTORCH_REPO}" \ - --PYTORCH_VERSION="${PYTORCH_VERSION}" \ + --PYTORCH_COMMIT="${PYTORCH_COMMIT}" \ --TORCH_XPU_OPS_REPO="${TORCH_XPU_OPS_REPO}" \ - --TORCH_XPU_OPS_VERSION="${TORCH_XPU_OPS_VERSION}" \ - 2>&1 |tee ${{ github.workspace }}/pytorch_build_${PYTORCH_VERSION//\//-}.log + --TORCH_XPU_OPS_COMMIT="${TORCH_XPU_OPS_COMMIT}" \ + 2>&1 |tee ${{ github.workspace }}/build_pytorch_${PYTORCH_COMMIT//\//-}.log + if [ $(ls ${{ github.workspace }} |grep -c "torch-.*.whl") -eq 0 ];then + echo "Build pytorch got failed" + exit 1 + fi + - name: Build Triton + run: | + # gcc 13 + 
dnf install -y gcc-toolset-13-gcc-c++ zlib-devel + source /opt/rh/gcc-toolset-13/enable + cd ./pytorch + pip install cmake ninja pybind11 + rm -rf pytorch_triton_xpu-*.whl + if [ "${{ inputs.triton }}" != "pinned" ];then + TRITON_COMMIT_ID="${{ inputs.triton }}" + else + TRITON_COMMIT_ID="$(cat .ci/docker/ci_commit_pins/triton-xpu.txt)" + fi + TRITON_VERSION_NAME="$( + curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ + grep '__version__' |head -n 1 |awk -F "'" '{print $2}' + )" + python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} \ + 2>&1 |tee ${{ github.workspace }}/build_triton_${TRITON_COMMIT_ID}.log + if [ $(ls |grep -c "pytorch_triton_xpu-.*.whl") -eq 0 ];then + echo "Build triton got failed" + exit 1 + fi + pip install pytorch_triton_xpu-*.whl + cp pytorch_triton_xpu-*.whl ${{ github.workspace }} + - name: Build Torchvision and Torchaudio + run: | + # gcc 13 + dnf install -y gcc-toolset-13-gcc-c++ zlib-devel + source /opt/rh/gcc-toolset-13/enable + cd ./pytorch + TORCHVISION_COMMIT_ID="$(cat .github/ci_commit_pins/vision.txt)" + TORCHAUDIO_COMMIT_ID="$(cat .github/ci_commit_pins/audio.txt)" + git clone --single-branch -b main https://github.com/pytorch/vision.git xpu-vision + cd xpu-vision && git checkout ${TORCHVISION_COMMIT_ID} + python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_vision_${TRITON_COMMIT_ID}.log + if [ $(ls dist/ |grep -c "torchvision-.*.whl") -eq 0 ];then + echo "Build torchvision got failed" + exit 1 + fi + pip install dist/*.whl + cp dist/*.whl ${{ github.workspace }} + git clone --single-branch -b main https://github.com/pytorch/audio.git xpu-audio + cd xpu-audio && git checkout ${TORCHAUDIO_COMMIT_ID} + python setup.py bdist_wheel 2>&1 |tee ${{ github.workspace }}/build_audio_${TRITON_COMMIT_ID}.log + if [ $(ls dist/ |grep -c "torchaudio-.*.whl") -eq 0 ];then 
+ echo "Build torchaudio got failed" + exit 1 + fi + pip install dist/*.whl + cp dist/*.whl ${{ github.workspace }} - name: Torch Config run: | + printenv python -c "import torch; print(torch.__config__.show())" python -c "import torch; print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import triton; print(triton.__version__)" + python -c "import torchvision; print(torchvision.__version__)" + python -c "import torchaudio; print(torchaudio.__version__)" python pytorch/torch/utils/collect_env.py - - name: Identify Build version - id: build_version - run: | - echo "TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" + pip list |grep -E 'torch|intel' + chmod 777 /__w -R - name: Upload Torch XPU Wheel - if: ${{ ! cancelled() }} + if: ${{ success() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/torch*.whl - - name: Upload Triton Wheel - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Triton-Wheel-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/pytorch_triton_xpu-*.whl + path: ${{ github.workspace }}/*.whl - name: Upload Build Log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/pytorch_*.log - - name: Cleanup - if: always() - run: | - chmod 777 . 
-R - rm -rf pytorch torch-xpu-ops pytorch_*.log torch*.whl pytorch_triton_xpu-*.whl + path: ${{ github.workspace }}/build_*.log diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml new file mode 100644 index 0000000000..9abe81cacb --- /dev/null +++ b/.github/workflows/_linux_e2e.yml @@ -0,0 +1,254 @@ +name: Linux E2E Test + +on: + workflow_call: + inputs: + runner: + required: true + type: string + default: 'pvc_rolling' + description: Runner label + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel + pytorch: + type: string + default: 'main' + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + python: + type: string + default: '3.10' + description: Python version + suite: + type: string + default: 'huggingface' + description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma + dt: + type: string + default: 'float32' + description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma + mode: + type: string + default: 'inference' + description: Test mode. `inference,training`. Delimiter is comma + scenario: + type: string + default: 'accuracy' + description: Test scenario. `accuracy,performance`. Delimiter is comma + model: + required: false + type: string + default: '' + description: Model. 
Will only run this one mode if set + +permissions: read-all + +defaults: + run: + shell: bash -xe {0} + +jobs: + runner: + runs-on: ${{ inputs.runner }} + outputs: + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner + + test: + runs-on: ${{ needs.runner.outputs.runner_id }} + needs: runner + timeout-minutes: 3600 + container: + image: mengfeili/intel-pvc-driver:1146-1136 + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g + -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} + env: + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + TORCH_HOME: /tmp/.cache/_torch + HF_HOME: /tmp/.cache/_huggingface + MODEL_ONLY_NAME: ${{ inputs.model }} + env: + GH_TOKEN: ${{ github.token }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + steps: + - name: Cleanup workspace + run: | + find ./ |grep -v "^\./$" |xargs rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Launch Test on ${{ needs.runner.outputs.hostname }} + uses: ./.github/actions/linux-testenv + with: + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: skipped + oneapi: ${{ inputs.oneapi }} + python: ${{ inputs.python }} + suite: ${{ inputs.suite }} + + # CICD launch + - name: CICD Huggingface BF16 & FP16 Training Test + if: ${{ contains(inputs.test_type, 'cicd') && contains(inputs.suite, 'huggingface') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: huggingface 
+ dt: bfloat16,float16 + mode: training + scenario: accuracy,performance + - name: CICD Timm_models BF16 Training Test + if: ${{ contains(inputs.test_type, 'cicd') && contains(inputs.suite, 'timm_models') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: timm_models + dt: bfloat16 + mode: training + scenario: accuracy,performance + - name: CICD Torchbench BF16 Training Test + if: ${{ contains(inputs.test_type, 'cicd') && contains(inputs.suite, 'torchbench') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: torchbench + dt: bfloat16 + mode: training + scenario: accuracy,performance + + # Nihglty launch + - name: Nightly Huggingface Full Test + if: ${{ contains(inputs.test_type, 'nightly') && contains(inputs.suite, 'huggingface') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: huggingface + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + - name: Nightly Timm_models FP16 Training Test + if: ${{ contains(inputs.test_type, 'nightly') && contains(inputs.suite, 'timm_models') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: timm_models + dt: float16 + mode: training + scenario: accuracy,performance + - name: Nightly Torchbench BF16 Training Test + if: ${{ contains(inputs.test_type, 'nightly') && contains(inputs.suite, 'torchbench') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: torchbench + dt: bfloat16 + mode: training + scenario: accuracy,performance + - name: Nightly PT2E Full Test + if: ${{ contains(inputs.test_type, 'nightly') && contains(inputs.suite, 'pt2e') }} + uses: ./.github/actions/pt2e + with: + env_prepare: true + dt: float32,int8 + scenario: accuracy,performance + + # Weekly launch + - name: Weekly Huggingface Full Test + if: ${{ contains(inputs.test_type, 'weekly') && contains(inputs.suite, 'huggingface') }} + uses: 
./.github/actions/linux-e2etest + with: + env_prepare: true + suite: huggingface + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + - name: Weekly Timm_models Full Test + if: ${{ contains(inputs.test_type, 'weekly') && contains(inputs.suite, 'timm_models') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: timm_models + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + - name: Weekly Torchbench Full Test + if: ${{ contains(inputs.test_type, 'weekly') && contains(inputs.suite, 'torchbench') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: torchbench + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + - name: Weekly PT2E Full Test + if: ${{ contains(inputs.test_type, 'weekly') && contains(inputs.suite, 'pt2e') }} + uses: ./.github/actions/pt2e + with: + env_prepare: true + dt: float32,int8 + scenario: accuracy,performance + + # On-demand launch + - name: OnDemand Test (huggingface) + if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'huggingface') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: huggingface + dt: ${{ inputs.dt }} + mode: ${{ inputs.mode }} + scenario: ${{ inputs.scenario }} + - name: OnDemand Test (timm_models) + if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'timm_models') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: timm_models + dt: ${{ inputs.dt }} + mode: ${{ inputs.mode }} + scenario: ${{ inputs.scenario }} + - name: OnDemand Test (torchbench) + if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'torchbench') }} + uses: ./.github/actions/linux-e2etest + with: + env_prepare: true + suite: torchbench + dt: ${{ inputs.dt }} + mode: ${{ inputs.mode }} + scenario: ${{ 
inputs.scenario }} + - name: OnDemand PT2E Test (pt2e) + if: ${{ contains(inputs.test_type, 'ondemand') && contains(inputs.suite, 'pt2e') }} + uses: ./.github/actions/pt2e + with: + env_prepare: true + dt: ${{ inputs.dt }} + scenario: ${{ inputs.scenario }} + + - name: Get archieve files + if: ${{ ! cancelled() }} + run: | + rm -rf ${{ github.workspace }}/upload_files + cp -r ${{ github.workspace }}/pytorch/inductor_log ${{ github.workspace }}/upload_files + - name: Upload Inductor XPU E2E Data + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.suite }} + path: ${{ github.workspace }}/upload_files diff --git a/.github/workflows/_linux_e2e_summary.yml b/.github/workflows/_linux_e2e_summary.yml new file mode 100644 index 0000000000..746bc1d565 --- /dev/null +++ b/.github/workflows/_linux_e2e_summary.yml @@ -0,0 +1,98 @@ +name: Linux E2E Test + +on: + workflow_call: + inputs: + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel + python: + type: string + default: '3.10' + description: Python version + +permissions: read-all + +defaults: + run: + shell: bash -xe {0} + +jobs: + summary: + runs-on: ubuntu-latest + if: ${{ ! 
cancelled() }} + permissions: + issues: write + env: + GH_TOKEN: ${{ github.token }} + REFERENCE_ISSUE_ID: 1645 + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + steps: + - name: Install gh-cli + run: | + sudo apt-get update + sudo apt-get install gh rsync ca-certificates -y + find ./ |grep -v "^\./$" |xargs rm -rf + - name: Setup python-${{ inputs.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python }} + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Download Target Artifact + run: | + mkdir target/ + cd target/ + target_dir="Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-*" + gh --repo ${GITHUB_REPOSITORY} run download ${GITHUB_RUN_ID} -p "${target_dir}" + find Inductor-${{ inputs.test_type }}-LTS2-XPU-E2E-Data-*/ -maxdepth 1 -mindepth 1 -type d |\ + while read line; do mv $line .; done + - name: Download Baseline Artifact + run: | + mkdir baseline/ + cd baseline/ + artifact_type="$(echo ${{ inputs.test_type }} |sed 's/ondemand/weekly/;s/cicd/weekly/')" + gh --repo intel/torch-xpu-ops issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee body.txt + REFERENCE_RUN_ID="$(cat body.txt |grep "Inductor-${artifact_type}-LTS2" |sed 's/.*: *//' || echo '')" + if [ "${REFERENCE_RUN_ID}" != "" ];then + gh --repo intel/torch-xpu-ops run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" + find Inductor-*-XPU-E2E-*/ -maxdepth 1 -mindepth 1 -type d |while read line; do mv $line .; done + fi + - name: Get summary + if: ${{ ! 
cancelled() }} + run: | + pip install pandas requests + e2e_summary_csv="$(find ./target/ -name "inductor_*.csv" |head -n 1)" + if [ -f "${e2e_summary_csv}" ];then + bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} + exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) + if [ ${exit_label} -ne 0 ];then + grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 + echo "There are ${exit_label} cases that need look into!!! Please check them" + exit ${exit_label} + fi + fi + pt2e_summary_csv="$(find ./target/ -name "summary.csv")" + if [ -f "${pt2e_summary_csv}" ];then + cat ${pt2e_summary_csv} + failed_num=$(grep -c ',failed' ${pt2e_summary_csv}) + if [ ${failed_num} -ne 0 ];then + echo "[Warning] PT2E has failures!" + fi + fi + - name: Upload Reference Run ID + if: ${{ ! (contains(inputs.test_type, 'ondemand') || contains(inputs.test_type, 'cicd')) && github.repository_owner == 'intel' }} + run: | + gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee new_body.txt 2>&1 + has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt)" + if [ ${has_or_not} -ne 0 ];then + sed -i "s/Inductor-${{ inputs.test_type }}-LTS2:.*/Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}/" new_body.txt + else + echo "Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}" |tee -a new_body.txt + fi + gh --repo ${GITHUB_REPOSITORY} issue edit ${REFERENCE_ISSUE_ID} --body-file new_body.txt + - name: Set permissions + if: ${{ always() }} + run: | + find ./ |grep -v "^\./$" |xargs rm -rf diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 9760e6d960..2ab84d571a 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -3,112 +3,88 @@ name: Linux OP Benchmark Test on: workflow_call: inputs: + runner: + 
required: true + type: string + default: 'pvc_rolling' + description: Runner label + test_type: + type: string + default: 'build-from-source' + description: Build from source or install nightly wheel pytorch: - required: false type: string default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false - type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - triton: - required: false + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + oneapi: type: string - default: '' - description: Triton commit. Use pytorch pined commit by default + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed python: - required: false type: string default: '3.10' description: Python version - runner: - required: true - type: string - default: 'linux.idc.xpu' - description: Runner label - driver: - required: false - type: string - default: 'rolling' - description: Driver lts/rolling -permissions: - issues: write +permissions: read-all + +defaults: + run: + shell: bash -xe {0} jobs: - op_benchmark_test: - runs-on: ${{ inputs.runner }} + runner: + runs-on: ${{ inputs.runner }} + outputs: + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner + + op_benchmark: + needs: runner + runs-on: ${{ needs.runner.outputs.runner_id }} + permissions: + issues: write timeout-minutes: 900 + container: + image: mengfeili/intel-pvc-driver:1146-1136 + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem 
--device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g + -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} + env: + AGENT_TOOLSDIRECTORY: /opt/xpu-tool + TORCH_HOME: /tmp/.cache/_torch + HF_HOME: /tmp/.cache/_huggingface + REFERENCE_ISSUE: 1689 env: GH_TOKEN: ${{ github.token }} - reference_issue: 1689 - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} steps: + - name: Cleanup workspace + run: | + find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Prepare Stock Pytorch - run: | - pwd - which conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../ && rm -rf pytorch - pip install requests - git clone https://github.com/pytorch/pytorch pytorch - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) - # apply PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" 
- else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - fi - fi - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 + - name: Launch Test on ${{ needs.runner.outputs.hostname }} + uses: ./.github/actions/linux-testenv with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }} - - name: Install Pytorch XPU - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cd ../pytorch - export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - pip install -r requirements.txt - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. - else - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - fi - pip install -r .ci/docker/requirements-ci.txt - - name: Torch Config - run: | - source activate xpu_op_${ZE_AFFINITY_MASK} - python -c "import torch; print(torch.__config__.show())" - python -c "import torch; print(torch.__config__.parallel_info())" - python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: skipped + oneapi: ${{ inputs.oneapi }} + python: ${{ inputs.python }} - cd .. 
- python pytorch/torch/utils/collect_env.py - rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache - name: Run Torch XPU Op Benchmark - if: ${{ inputs.driver == 'rolling' }} run: | - source activate xpu_op_${ZE_AFFINITY_MASK} mkdir -p ${{ github.workspace }}/op_benchmark cd test/microbench filename=$(find -- *.py) @@ -129,12 +105,17 @@ jobs: path: ${{ github.workspace }}/op_benchmark op_benchmark_test_results_check: - needs: op_benchmark_test - runs-on: ubuntu-22.04 + needs: op_benchmark + runs-on: ubuntu-latest env: GH_TOKEN: ${{ github.token }} reference_issue: 1689 steps: + - name: Install gh-cli + run: | + sudo apt-get update + sudo apt-get install gh rsync ca-certificates -y + find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - name: Setup python-${{ inputs.python }} @@ -174,7 +155,6 @@ jobs: python ${{ github.workspace }}/.github/scripts/op_perf_comparison.py --xpu_file ${{ github.workspace }}/op_benchmark/backward_op_summary.csv --baseline_file ${{ github.workspace }}/baseline/baseline_backward_op_summary.csv - name: Update OP Baseline run: | - pip install tabulate pandas mkdir ${{ github.workspace }}/new_baseline cp ${{ github.workspace }}/baseline/baseline*.csv ${{ github.workspace }}/new_baseline # Update forward op @@ -190,6 +170,6 @@ jobs: path: ${{ github.workspace }}/op_benchmark - name: Upload Reference Run ID run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ + gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE} --json body -q .body | \ sed "s/Inductor-XPU-OP-Benchmark-Data:.*/Inductor-XPU-OP-Benchmark-Data: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt + gh --repo ${GITHUB_REPOSITORY} issue edit ${REFERENCE_ISSUE} --body-file new_body.txt diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 07c83ea143..33d0b54d8d 100644 --- 
a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -3,493 +3,198 @@ name: Linux UT Test on: workflow_call: inputs: - pytorch: - required: false + runner: + required: true type: string - default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false + description: Runner label + test_type: + required: true type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - triton: - required: false + description: Test scope + pytorch: type: string - default: '' - description: Triton commit. Use pytorch pined commit by default - ut: - required: true + default: 'main' + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: type: string - default: '' - description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,torch_xpu` Delimiter is comma - disabled_tests: - required: false + default: 'main' + description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin + oneapi: type: string - default: '' - description: List disabled tests, such as disable_ut or disable_distributed + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed python: - required: false type: string default: '3.10' description: Python version - runner: + ut: required: true type: string - default: 'linux.idc.xpu' - description: Runner label - driver: - required: false + description: UT scope. 
`ut_regression,ut_transformers,ut_extended,ut_op,ut_torch,xpu_dev1` Delimiter is comma + disabled_tests: type: string - default: 'lts' - description: Driver lts/rolling + default: '' + description: List disabled tests, such as disable_ut or disable_distributed permissions: read-all +defaults: + run: + shell: bash -xe {0} + jobs: - ut_test: - runs-on: ${{ matrix.test.runner || inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' && !contains(inputs.disabled_tests, 'disable_ut') }} + runner: + runs-on: ${{ inputs.runner }} + outputs: + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + steps: + - name: Cleanup workspace + run: | + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner + + normal: + needs: runner + runs-on: ${{ needs.runner.outputs.runner_id }} + if: ${{ contains(inputs.ut, 'ut_') && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 300 + container: + image: mengfeili/intel-pvc-driver:1146-1136 + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g + -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} + -e ZE_AFFINITY_MASK + env: + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + TORCH_HOME: /tmp/.cache/_torch + HF_HOME: /tmp/.cache/_huggingface env: GH_TOKEN: ${{ github.token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - ut_skip_issue: 1624 + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} strategy: fail-fast: false matrix: - test: - - name: 'op_regression' - condition: ${{ 
contains(inputs.ut, 'op_regression') }} - directory: 'test/regressions' - command: 'pytest --timeout 600 --timeout_method=thread -v --junit-xml=../../ut_log/op_regression.xml' - log_prefix: 'op_regression' - additional_steps: | - clinfo --list - pip install pytest pytest-timeout - - name: 'op_regression_dev1' - condition: ${{ contains(inputs.ut, 'op_regression_dev1') }} - directory: 'test/regressions' - command: 'pytest --timeout 600 --timeout_method=thread -v test_operation_on_device_1.py --junit-xml=$GITHUB_WORKSPACE/ut_log/op_regression_dev1.xml' - log_prefix: 'op_regression_dev1' - additional_steps: | - clinfo --list - unset ZE_AFFINITY_MASK - pip install pytest pytest-timeout - runner: 'pvc_e2e' - - name: 'op_transformers' - condition: ${{ contains(inputs.ut, 'op_transformers') }} - directory: '../pytorch' - command: 'pytest --timeout 600 --timeout_method=thread -v test/test_transformers.py -k xpu --junit-xml=$GITHUB_WORKSPACE/ut_log/op_transformers.xml' - log_prefix: 'op_transformers' - additional_steps: | - pip install pytest pytest-timeout - export PYTORCH_TEST_WITH_SLOW=1 - - name: 'op_extended' - condition: ${{ contains(inputs.ut, 'op_extended') }} - directory: '../pytorch/third_party/torch-xpu-ops/test/xpu/extended/' - command: 'python run_test_with_skip.py' - log_prefix: 'op_extended' - additional_steps: | - pip install pytest pytest-timeout - export PYTORCH_TEST_WITH_SLOW=1 - xml_post_processing: | - cp op_extended.xml $GITHUB_WORKSPACE/ut_log - - name: 'op_ut' - condition: ${{ contains(inputs.ut, 'op_ut') }} - directory: '../pytorch/third_party/torch-xpu-ops/test/xpu' - log_prefix: 'op_ut' - command_script: | - export PYTORCH_ENABLE_XPU_FALLBACK=1 - export PYTORCH_TEST_WITH_SLOW=1 - python run_test_with_skip.py \ - 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_skip_test.log - cp *.xml $GITHUB_WORKSPACE/ut_log - find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec 
sh -c ' - dir_path=$(dirname "$1"); - case "$dir_path" in - *"op_ut_with_skip_quantization/core"*) - dir_name="op_ut_with_skip_quantization_core";; - *) - dir_name=$(basename "$dir_path");; - esac; - mv "$1" "$dir_path/${dir_name}_$(basename "$1")" - ' _ {} \; - cp op_ut_with_skip_nn/*.xml $GITHUB_WORKSPACE/ut_log - cp op_ut_with_skip_quantization/core/*.xml $GITHUB_WORKSPACE/ut_log - # Cases run with a on-demand white list, since some suites are too - # slow to go through all operators on CPU. So add cases on-demand - # when XPU implementatoin is done. - # test_foreach, test_decomp - # Run with only - timeout 10000 python run_test_with_only.py \ - 2>$GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/op_ut/op_ut_with_only_test.log - cp op_ut_with_only.xml $GITHUB_WORKSPACE/ut_log - additional_steps: | - pip install pytest pytest-timeout - - name: 'torch_xpu' - condition: ${{ contains(inputs.ut, 'torch_xpu') }} - directory: '../pytorch' - command_script: | - export PYTORCH_TEST_WITH_SLOW=1 - export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" - test_cmd="python test/run_test.py --include " - for test in $(ls test/inductor | grep test); do test_cmd="${test_cmd} inductor/$test"; done - for test in $(ls test/xpu | grep test); do test_cmd="${test_cmd} xpu/$test"; done - if [ -f "test/test_xpu.py" ]; then test_cmd="${test_cmd} test_xpu.py"; fi - eval $test_cmd 2>$GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test_error.log | \ - tee $GITHUB_WORKSPACE/ut_log/torch_xpu/torch_xpu_test.log - log_prefix: 'torch_xpu' - additional_steps: | - pip install pytest pytest-timeout - - name: 'xpu_profiling' - condition: ${{ inputs.driver == 'rolling' && contains(inputs.ut, 'xpu_profiling') }} - directory: '$GITHUB_WORKSPACE' - command_script: | - mkdir -p $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce - # RN50 Test - PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0 - cp profiling.fp32.train.pt 
$GITHUB_WORKSPACE/ut_log/xpu_profiling - - # All Issue Reproduce UT - python -u test/profiling/correlation_id_mixed.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/correlation_id_mixed.log - python -u test/profiling/reproducer.missing.gpu.kernel.time.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/reproducer.missing.gpu.kernel.time.log - python -u test/profiling/time_precision_in_profile.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/time_precision_in_profile.log - python -u test/profiling/profile_partial_runtime_ops.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/profile_partial_runtime_ops.log - python -u test/profiling/triton_xpu_ops_time.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/triton_xpu_ops_time.log - - # llama case for calls number test - python test/profiling/llama.py | \ - tee ${{ github.workspace }}/ut_log/xpu_profiling/llama.log - python .github/scripts/llama_summary.py -i ${{ github.workspace }}/ut_log/xpu_profiling/llama.log -o ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv - bash .github/scripts/check_baseline.sh .github/scripts/llama_baseline.csv ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv - - # All xpu ut under test/profiler - cd ../pytorch/test/profiler - python -m pytest --timeout 600 -vs test_cpp_thread.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_cpp_thread.log - python -m pytest --timeout 600 -vs test_execution_trace.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_execution_trace.log - python -m pytest --timeout 600 -vs test_memory_profiler.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_memory_profiler.log - python -m pytest --timeout 600 -vs test_profiler_tree.py | \ - tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_profiler_tree.log - additional_steps: | - pip install pytest pytest-timeout transformers - outputs: - ut_name: ${{ steps.set-output.outputs.UT_NAME || '' }} + 
test: [ut_regression, ut_transformers, ut_extended, ut_op, ut_torch, ut_profiling] steps: + - name: Cleanup workspace + if: ${{ contains(inputs.ut, matrix.test) }} + run: | + find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/checkout@v4 - - name: Create unique workspace - shell: bash -xe {0} - run: | - # Create unique conda env for each UT test - random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs) - echo "CONDA_ENV_NAME=xpu_op_${ZE_AFFINITY_MASK}_${{ matrix.test.name }}_${random}" >> $GITHUB_ENV - - name: Create Conda Env - shell: bash -xe {0} - run: | - pwd - which conda - conda remove --all -y -n $CONDA_ENV_NAME || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - conda create -n $CONDA_ENV_NAME python=${{ inputs.python }} cmake ninja -y - source activate $CONDA_ENV_NAME - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 + - name: Launch Test on ${{ needs.runner.outputs.hostname }} + if: ${{ contains(inputs.ut, matrix.test) }} + uses: ./.github/actions/linux-testenv with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - - name: Prepare Stock Pytorch - shell: bash -xe {0} - run: | - cd ../ - rm -rf ./pytorch || sudo rm -rf ./pytorch - git clone https://github.com/pytorch/pytorch pytorch - source activate $CONDA_ENV_NAME - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ./pytorch - git checkout ${TORCH_COMMIT_ID} - rm -rf vision || sudo rm -rf vision - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
- else - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ./pytorch - git checkout ${TORCH_COMMIT_ID} - fi - pip install requests - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git show -s && git status && git diff - pip install -r .ci/docker/requirements-ci.txt - - name: Prepare Torch-xpu-ops - shell: bash -xe {0} - run: | - cd ../pytorch - rm -rf third_party/torch-xpu-ops - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cp -r ${{ github.workspace }} third_party - else - TORCH_XPU_OPS_COMMIT=$(> "${GITHUB_ENV}" + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: ${{ inputs.torch_xpu_ops }} + oneapi: ${{ inputs.oneapi }} + python: ${{ inputs.python }} - name: Run XPU UT Test - shell: bash -xe {0} - if: ${{ matrix.test.condition }} - run: | - set -e - mkdir -p ${{ github.workspace }}/ut_log - mkdir -p ${{ github.workspace }}/ut_log/${{ matrix.test.name }} - source activate $CONDA_ENV_NAME - echo "Running ${{ matrix.test.name }}" - echo "Directory: ${{ matrix.test.directory }}" - ${{ matrix.test.additional_steps }} - - cd ${{ matrix.test.directory }} - - if [[ "${{ matrix.test.name }}" == "op_ut" ]] || [[ "${{ matrix.test.name }}" == "xpu_profiling" ]] || [[ "${{ matrix.test.name }}" == "torch_xpu" ]]; then - bash << "SCRIPT" - set -e - ${{ matrix.test.command_script }} - SCRIPT - else - ${{ matrix.test.command }} \ - 2>${{ github.workspace }}/ut_log/${{ matrix.test.name }}/${{ matrix.test.log_prefix }}_test_error.log | \ - tee ${{ github.workspace }}/ut_log/${{ matrix.test.name }}/${{ matrix.test.log_prefix }}_test.log - ${{ matrix.test.xml_post_processing || '' }} - fi + if: ${{ contains(inputs.ut, matrix.test) }} + uses: ./.github/actions/linux-uttest + with: + test_type: ${{ matrix.test }} - name: UT Test Results Summary - shell: bash -xe {0} - if: ${{ 
matrix.test.condition }} + if: ${{ contains(inputs.ut, matrix.test) }} run: | - source activate $CONDA_ENV_NAME pip install junitparser - python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true - if [ -e "ut_failure_list.csv" ];then - cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv - fi - - name: Clean up - if: ${{ always() }} - run: | - if [ -n "$CONDA_ENV_NAME" ]; then - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME + python ./.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true + if [ -e ut_failure_list.csv ];then + cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv || true fi - name: Upload Inductor XPU UT Log - if: ${{ matrix.test.condition }} + if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ env.UT_NAME }} + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} path: ${{ github.workspace }}/ut_log + if-no-files-found: ignore - name: Upload XPU UT Failure list - if: ${{ matrix.test.condition }} + if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ env.UT_NAME }} + name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} path: ${{ github.workspace }}/ut_log/ut_failure_list.csv - - name: Set UT outputs - id: set-output - if: ${{ matrix.test.condition }} - run: | - echo "UT_NAME=${{ matrix.test.name }}" >> $GITHUB_OUTPUT - - ut_test_results_check: - needs: ut_test - runs-on: ubuntu-22.04 + if-no-files-found: ignore + + devices: + runs-on: pvc_rolling + if: ${{ contains(inputs.ut, 'xpu_dev1') && !contains(inputs.disabled_tests, 'disable_ut') }} timeout-minutes: 30 env: GH_TOKEN: ${{ github.token }} - ut_skip_issue: 1624 - strategy: - fail-fast: false - matrix: - test: - - name: 'op_regression' - condition: ${{ contains(inputs.ut, 'op_regression') }} - - name: 'op_regression_dev1' - condition: ${{ contains(inputs.ut, 'op_regression_dev1') }} - - name: 'op_transformers' - condition: ${{ contains(inputs.ut, 'op_transformers') }} - - name: 'op_extended' - condition: ${{ contains(inputs.ut, 'op_extended') }} - - name: 'op_ut' - condition: ${{ contains(inputs.ut, 'op_ut') }} - - name: 'torch_xpu' - condition: ${{ contains(inputs.ut, 'torch_xpu') }} - - name: 'xpu_profiling' - condition: ${{ inputs.driver == 'rolling' && contains(inputs.ut, 'xpu_profiling') }} + AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/xpu-tool steps: - - name: Get matrix UT value + - name: Cleanup workspace + id: cleanup run: | - echo "UT_NAME=${{ needs.ut_test.outputs.ut_name }}" >> "${GITHUB_ENV}" + cat /etc/os-release + echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT} + ls -al + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + sudo find /tmp/ |grep -v "^/tmp/$" |xargs sudo rm -rf + sudo rm -rf ~/.triton ~/.torch + xpu-smi discovery - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Download XPU UT Logs - if: ${{ matrix.test.condition }} - 
uses: actions/download-artifact@v4 + - name: Launch Test on ${{ steps.cleanup.outputs.hostname }} + uses: ./.github/actions/linux-testenv with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }} - path: ${{ github.workspace }}/ut_log - - name: Check UT Results - if: ${{ matrix.test.condition }} - shell: bash - run: | - repo="${{ github.repository }}" - function contains() { - contains_status="echo 'Start $2 ...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" - contains_status="continue" - } - } - set -xe - cd ${{ github.workspace }}/ut_log/${{ matrix.test.name }} - gh --repo $repo issue view $ut_skip_issue --json body -q .body | sed '/^$/d' > Known_issue.log - gh api "repos/${{ github.repository }}/issues?labels=skipped" \ - --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' \ - > issues.log - awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' | sed 's/ *|| */ /g' | sort -u > issues_temp.log - awk '$2 == "op_ut" {print $1}' issues_temp.log > issues_op_ut.log - cat issues_temp.log | awk '{print $1}' >> Known_issue.log - awk -F'::' '{print $1}' issues_op_ut.log | sort -u | paste -sd ',' >> Known_issue.log - cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh ${{ matrix.test.name }} + test_type: ${{ inputs.test_type }} + pytorch: ${{ inputs.pytorch }} + torch_xpu_ops: ${{ inputs.torch_xpu_ops }} + oneapi: ${{ inputs.oneapi }} + python: ${{ inputs.python }} + - name: Run XPU UT Test + uses: ./.github/actions/linux-uttest + with: + test_type: xpu_dev1 - name: Upload Inductor XPU UT Log - if: ${{ matrix.test.condition }} + if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test.name }}-checked + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_dev1 path: ${{ github.workspace }}/ut_log - distributed_ut_test: + distributed: runs-on: pytorch-06 if: ${{ contains(inputs.ut, 'xpu_distributed') && !contains(inputs.disabled_tests, 'disable_distribute') }} timeout-minutes: 60 env: GH_TOKEN: ${{ github.token }} - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} - ut_skip_issue: 1624 + AGENT_TOOLSDIRECTORY: ${{ github.workspace }}/xpu-tool steps: + - name: Cleanup workspace + id: cleanup + run: | + cat /etc/os-release + echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT} + ls -al + sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + sudo find /tmp/ |grep -v "^/tmp/$" |xargs sudo rm -rf + sudo rm -rf ~/.triton ~/.torch + xpu-smi topology -m - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Create Conda Env - run: | - pwd - which conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 + - name: Launch Test on ${{ steps.cleanup.outputs.hostname }} + uses: ./.github/actions/linux-testenv with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - - name: Prepare Stock Pytorch - run: | - cd ../ - rm -rf ./pytorch || sudo rm -rf ./pytorch - git clone https://github.com/pytorch/pytorch pytorch - source activate xpu_op_${ZE_AFFINITY_MASK} - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - pip 
install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ./pytorch - git checkout ${TORCH_COMMIT_ID} - rm -rf vision || sudo rm -rf vision - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. - else - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ./pytorch - git checkout ${TORCH_COMMIT_ID} - fi - pip install requests - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git show -s && git status && git diff - pip install -r .ci/docker/requirements-ci.txt - - name: Prepare Torch-xpu-ops - run: | - cd ../pytorch - rm -rf third_party/torch-xpu-ops - if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cp -r ${{ github.workspace }} third_party - else - TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ - tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log + uses: ./.github/actions/linux-uttest + with: + test_type: xpu_distributed - name: Reset Ptrace_scope if: ${{ always() }} run: | @@ -526,53 +220,52 @@ jobs: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed path: ${{ github.workspace }}/ut_log - distributed_ut_test_results_check: - needs: distributed_ut_test - runs-on: ubuntu-22.04 + summary: + needs: [normal, devices, distributed] + if: ${{ ! 
cancelled() }} + runs-on: ubuntu-latest timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + test: [ut_regression, ut_transformers, ut_extended, ut_op, ut_torch, ut_profiling, xpu_dev1, xpu_distributed] env: GH_TOKEN: ${{ github.token }} - ut_skip_issue: 1624 + UT_SKIP_ISSUE: 1624 steps: - - name: Set the UT name + - name: Cleanup workspace + if: ${{ contains(inputs.ut, matrix.test) }} run: | - echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + find ./ |grep -v "^\./$" |xargs rm -rf - name: Checkout torch-xpu-ops + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/checkout@v4 - name: Download XPU UT Logs + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/download-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }} path: ${{ github.workspace }}/ut_log - name: Check UT Results + if: ${{ contains(inputs.ut, matrix.test) }} shell: bash run: | repo="${{ github.repository }}" - function contains() { - contains_status="echo 'Start $2 ...'" - { - [[ $1 =~ (^|,)$2($|,) ]] - } || { - echo "[Warning] $2 is not suppotted type! Skipped!" 
- contains_status="continue" - } - } - set -xe - echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - cd ${{ github.workspace }}/ut_log/xpu_distributed - gh --repo $repo issue view $ut_skip_issue --json body -q .body | sed '/^$/d' > Known_issue.log + ls -al ${{ github.workspace }}/ut_log + cd ${{ github.workspace }}/ut_log/${{ matrix.test }} + gh --repo $repo issue view $UT_SKIP_ISSUE --json body -q .body | sed '/^$/d' > Known_issue.log gh api "repos/${{ github.repository }}/issues?labels=skipped" \ - --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' \ - > issues.log - awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' | sed 's/ *|| */ /g' | sort -u > issues_temp.log - awk '$2 == "op_ut" {print $1}' issues_temp.log > issues_op_ut.log + --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' > issues.log + awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | \ + grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' 
| sed 's/ *|| */ /g' | sort -u > issues_temp.log + awk '$2 == "ut_op" {print $1}' issues_temp.log > issues_ut_op.log cat issues_temp.log | awk '{print $1}' >> Known_issue.log - awk -F'::' '{print $1}' issues_op_ut.log | sort -u | paste -sd ',' >> Known_issue.log + awk -F'::' '{print $1}' issues_ut_op.log | sort -u | paste -sd ',' >> Known_issue.log cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh 'xpu_distributed' + bash ut_result_check.sh ${{ matrix.test }} - name: Upload Inductor XPU UT Log - if: always() + if: ${{ contains(inputs.ut, matrix.test) }} uses: actions/upload-artifact@v4 with: - name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed-checked + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ matrix.test }}-checked path: ${{ github.workspace }}/ut_log diff --git a/.github/workflows/_windows_ut.yml b/.github/workflows/_windows_ut.yml index ee628792f0..3c211ccfc2 100644 --- a/.github/workflows/_windows_ut.yml +++ b/.github/workflows/_windows_ut.yml @@ -8,7 +8,7 @@ on: type: string default: 'main' description: Pytorch branch/commit - keep_torch_xpu_ops: + torch_xpu_ops: required: false type: string default: 'false' @@ -17,7 +17,7 @@ on: required: true type: string default: '' - description: UT scope. `op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu` Delimiter is comma + description: UT scope. `ut_regression,xpu_dev1,ut_extended,ut_op,ut_torch` Delimiter is comma python: required: false type: string @@ -89,7 +89,7 @@ jobs: git status git show -s git submodule sync && git submodule update --init --recursive - if ${{ inputs.keep_torch_xpu_ops }} == 'true' ( + if ${{ inputs.torch_xpu_ops }} == 'pinned' ( echo "Don't replace torch-xpu-ops!" ) else ( echo "Replace torch-xpu-ops!" 
@@ -157,7 +157,7 @@ jobs: path: 'C:\actions-runner\_work\torch-xpu-ops\pytorch\dist' - name: Run XPU OP Extended UT - if: contains(inputs.ut, 'op_extended') || github.event_name == 'schedule' + if: contains(inputs.ut, 'ut_extended') || github.event_name == 'schedule' shell: cmd run: | call "C:\ProgramData\miniforge3\Scripts\activate.bat" @@ -169,7 +169,7 @@ jobs: python run_test_with_skip_mtl.py - name: Run Test XPU UT - if: contains(inputs.ut, 'torch_xpu') || github.event_name == 'schedule' + if: contains(inputs.ut, 'ut_torch') || github.event_name == 'schedule' shell: cmd run: | call "C:\ProgramData\miniforge3\Scripts\activate.bat" diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 25c3af0245..93826b34a8 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -3,442 +3,184 @@ name: Nightly-OnDemand Tests on: schedule: # GMT+8 21:00 every workday - - cron: '0 13 * * 0-4' - # GMT+8 0:00 Saturday - - cron: '0 16 * * 5' + - cron: '10 13 * * 0-4' # build from source + - cron: '30 13 * * 0-4' # nightly wheel + # GMT+8 00:00 Saturday + - cron: '10 16 * * 5' # build from source + - cron: '30 16 * * 5' # nightly wheel workflow_dispatch: inputs: pytorch: - required: false type: string default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false + description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' + torch_xpu_ops: type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - ut: - required: false - type: string - default: 'torch_xpu' - description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,torch_xpu,xpu_profiling`. 
Delimiter is comma + default: 'triggered' + description: Torch-xpu-ops workflow triggered branch by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin triton: - required: false + type: string + default: 'pinned' + description: Triton pinned by pytorch by default, or 'commit/branch', or 'repo@commit/repo@branch' + oneapi: + type: string + default: 'installed' + description: Installed oneAPI DLE on host by default, fill offline.sh url if needed + ut: type: string default: '' - description: Triton commit. Use pytorch pined commit by default + description: UT scope. `ut_regression,ut_transformers,ut_extended,ut_op,ut_profiling,ut_torch,xpu_dev1,xpu_distributed,microbench,windows`. Delimiter is comma suite: - required: true - type: string - default: 'huggingface' - description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma + default: '[]' + description: Dynamo benchmarks test suite. `["huggingface","timm_models","torchbench","pt2e"]`. Delimiter is comma dt: - required: true type: string - default: 'float32' + default: '' description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma mode: - required: true type: string - default: 'inference' + default: '' description: Test mode. `inference,training`. Delimiter is comma scenario: - required: true type: string - default: 'accuracy' + default: '' description: Test scenario. `accuracy,performance`. Delimiter is comma model: - required: false type: string default: '' description: Model. 
Will only run this one mode if set - python: - required: false - type: string - default: '3.10' - description: Python version permissions: read-all -concurrency: - group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.ut }}-${{ inputs.triton }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: ${{ github.event_name != 'schedule' }} +run-name: ${{ (contains(github.event.schedule, '13') && 'Nightly') || (contains(github.event.schedule, '16') && 'Weekly') || 'On-demand' }} / ${{ (contains(github.event.schedule, '10') && 'Source Code') || (contains(github.event.schedule, '30') && 'CD Wheel') || inputs.pytorch }} jobs: - Linux-Nightly-Ondemand-Build: + Conditions-Filter: + name: conditions-filter if: ${{ github.repository_owner == 'intel' }} - name: linux-nightly-ondemand + runs-on: ubuntu-latest + timeout-minutes: 3 + outputs: + test_type: ${{ steps.inputs-check.outputs.test_type }} + pytorch: ${{ steps.inputs-check.outputs.pytorch }} + torch_xpu_ops: ${{ steps.inputs-check.outputs.torch_xpu_ops }} + steps: + - name: Inputs check + id: inputs-check + run: | + if [ "${{ github.event_name }}" == "schedule" ];then + if [ "${{ github.event.schedule }}" == "10 13 * * 0-4" ];then + test_type="build-nightly" + pytorch="main" + torch_xpu_ops="main" + elif [ "${{ github.event.schedule }}" == "30 13 * * 0-4" ];then + test_type="wheel-nightly" + pytorch="nightly_wheel" + torch_xpu_ops="pinned" + elif [ "${{ github.event.schedule }}" == "10 16 * * 5" ];then + test_type="build-weekly" + pytorch="main" + torch_xpu_ops="main" + elif [ "${{ github.event.schedule }}" == "30 16 * * 5" ];then + test_type="wheel-weekly" + pytorch="nightly_wheel" + torch_xpu_ops="pinned" + else + test_type="unknown" + pytorch="main" + torch_xpu_ops="main" + fi + else + pytorch="${{ inputs.pytorch }}" + torch_xpu_ops="${{ 
inputs.torch_xpu_ops }}" + if [[ "${{ inputs.pytorch }}" == *"_wheel" ]];then + test_type="wheel-ondemand" + else + test_type="build-ondemand" + fi + fi + echo "test_type=${test_type}" >> ${GITHUB_OUTPUT} + echo "pytorch=${pytorch}" >> ${GITHUB_OUTPUT} + echo "torch_xpu_ops=${torch_xpu_ops}" >> ${GITHUB_OUTPUT} + + Linux-Nightly-Ondemand-Build: + needs: [Conditions-Filter] + name: linux-build secrets: inherit uses: ./.github/workflows/_linux_build.yml with: - pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }} - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - runner: pvc_e2e + runner: pvc_rolling + test_type: ${{ needs.Conditions-Filter.outputs.test_type }} + pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} + triton: ${{ github.event_name == 'schedule' && 'pinned' || inputs.triton }} + oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} Linux-Nightly-Ondemand-UT-Tests: - if: ${{ github.event_name == 'schedule' || inputs.ut != '' }} - name: linux-nightly-ondemand - needs: Linux-Nightly-Ondemand-Build + if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'ut_') || contains(inputs.ut, 'xpu_') }} + name: linux-ut + needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] uses: ./.github/workflows/_linux_ut.yml with: - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build.outputs.torch_commit_id }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - triton: ${{ github.event_name == 'schedule' 
&& '' || inputs.triton }} runner: linux.idc.xpu + test_type: ${{ needs.Conditions-Filter.outputs.test_type }}-ut + pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.Conditions-Filter.outputs.torch_xpu_ops }} + oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} + ut: ${{ github.event_name == 'schedule' && 'ut_regression,xpu_dev1,ut_transformers,ut_extended,ut_op' || inputs.ut }} Linux-Nightly-Ondemand-E2E-Tests: - runs-on: pvc_e2e - name: linux-nightly-ondemand / e2e_test - needs: Linux-Nightly-Ondemand-Build - timeout-minutes: 3600 - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1645 - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build.outputs.torch_commit_id }} - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - run_type: ${{ (github.event_name == 'schedule' && (github.event.schedule == '0 16 * * 5' && 'weekly' || 'nightly')) || 'on-demand' }} - outputs: - TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }} - TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }} - DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} - KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} - BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} - OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} - GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} - TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} - TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} - TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} - TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} - TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} - TRITON_COMMIT_ID: ${{ 
steps.pinned.outputs.TRITON_COMMIT_ID }} - TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }} - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=${{ env.python }} cmake ninja -y - source activate e2e_ci - pip install pandas scipy psutil requests - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - - name: Prepare Stock Pytorch - run: | - pwd - cd ../ - rm -rf pytorch || sudo rm -rf pytorch - source activate e2e_ci - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch - git checkout ${TORCH_COMMIT_ID} - # apply extra PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git diff && git show -s - - name: Identify pinned versions - id: pinned - run: | - source .github/scripts/env.sh - cd ../pytorch - if [ -z ${{ inputs.triton }} ]; then - echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - else - echo "TRITON_COMMIT_ID=${{ inputs.triton }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - fi - echo "TORCH_BRANCH_ID=${{ inputs.pytorch }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo 
"TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - . /etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo ${GITHUB_ENV} - - name: Triton Installation - run: | - source activate e2e_ci - cd ../pytorch - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - pip install pytorch_triton_xpu-*.whl - - name: Show GITHUB_ENV - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache - - # Nihglty launch - - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: 
./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Torchbench BF16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Timm_models FP16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: float16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly PT2E Full Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/pt2e - with: - dt: float32,int8 - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - # Weekly launch - - name: Weekly Huggingface Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly Torchbench Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly Timm_models Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - 
scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly PT2E Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: float32,int8 - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - # On-demand launch - - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && inputs.suite != 'pt2e' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: ${{ inputs.suite }} - env_prepare: true - dt: ${{ inputs.dt }} - mode: ${{ inputs.mode }} - scenario: ${{ inputs.scenario }} - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && contains(inputs.suite, 'pt2e') }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: ${{ inputs.dt }} - scenario: ${{ inputs.scenario }} - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + if: ${{ github.event_name == 'schedule' || inputs.suite != '[]' }} + name: linux-e2e + needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] + strategy: + fail-fast: false + matrix: + suite: ${{ fromJSON(inputs.suite) }} + uses: ./.github/workflows/_linux_e2e.yml + with: + runner: pvc_rolling + test_type: ${{ needs.Conditions-Filter.outputs.test_type }}-e2e + pytorch: ${{ needs.Conditions-Filter.outputs.pytorch }} + oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} + suite: ${{ matrix.suite }} + dt: ${{ github.event_name == 'schedule' && 'float32' || inputs.dt }} + mode: ${{ github.event_name == 'schedule' && 'inference' || inputs.mode }} + scenario: ${{ github.event_name == 'schedule' && 'accuracy' || inputs.scenario }} + model: ${{ github.event_name == 'schedule' && '' || 
inputs.model }} + Linux-Nightly-Ondemand-E2E-Tests-Summary: + if: ${{ ! cancelled() }} + name: linux-e2e + permissions: write-all + needs: [Conditions-Filter, Linux-Nightly-Ondemand-E2E-Tests] + uses: ./.github/workflows/_linux_e2e_summary.yml + with: + test_type: ${{ needs.Conditions-Filter.outputs.test_type }}-e2e - - name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - if [ "${{ env.run_type }}" == "on-demand" ];then - artifact_type="weekly" - else - artifact_type="${{ env.run_type }}" - fi - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-${artifact_type}-LTS-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - id: summary - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ - find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days - tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . 
# backup logs - # Print summary - if [ "${{ inputs.suite }}" != 'pt2e' ];then - source activate e2e_ci - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} - fi - fi - pt2e_summary_csv="$(find ${{ github.workspace }}/upload_files/ -name "summary.csv")" - if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} - failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) - if [ ${failed_num} -ne 0 ];then - echo "[Warning] PT2E has failures!" - fi - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-${{ env.run_type }}-LTS-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files - - name: Upload Reference Run ID - if: ${{ env.run_type != 'on-demand' }} - run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ - sed "s/Inductor-${{ env.run_type }}-LTS-XPU-E2E:.*/Inductor-${{ env.run_type }}-LTS-XPU-E2E: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt + Linux-Nightly-Ondemand-OP-Microbench-Tests-Rolling: + if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'microbench') }} + name: linux-microbench + permissions: write-all + needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] + uses: ./.github/workflows/_linux_op_benchmark.yml + with: + runner: pvc_rolling + test_type: ${{ needs.Conditions-Filter.outputs.test_type }}-mb + pytorch: ${{ 
needs.Conditions-Filter.outputs.pytorch }} + oneapi: ${{ github.event_name == 'schedule' && 'installed' || inputs.oneapi }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} Windows-Nightly-Ondemand-UT-Tests: - if: ${{ github.event_name == 'schedule' || inputs.ut != '' }} - name: Windows-nightly-ondemand + if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'windows') }} + name: windows + needs: [Conditions-Filter] uses: ./.github/workflows/_windows_ut.yml with: - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - ut: ${{ github.event_name == 'schedule' && 'op_extended,torch_xpu' || inputs.ut }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} + ut: ${{ github.event_name == 'schedule' && 'ut_extended,ut_torch' || inputs.ut }} + python: ${{ github.event_name == 'schedule' && '3.10' || '3.10' }} src_changed: false has_label: true runner: Windows_CI - - Tests-Failure-And-Report: - if: ${{ ! 
cancelled() }} - runs-on: [ self-hosted, Linux ] - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - needs: Linux-Nightly-Ondemand-E2E-Tests - steps: - - name: Report github issue for XPU OPS nightly - if: github.repository_owner == 'intel' - run: | - set -xe - # Test env - build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - repo="${{ github.repository }}" - TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCH_BRANCH_ID }}" - TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCH_COMMIT_ID }}" - DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.DRIVER_VERSION }}" - KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.KERNEL_VERSION }}" - BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.BUNDLE_VERSION }}" - OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.OS_PRETTY_NAME }}" - GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.GCC_VERSION }}" - TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHBENCH_COMMIT_ID }}" - TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHVISION_COMMIT_ID }}" - TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHAUDIO_COMMIT_ID }}" - TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRANSFORMERS_VERSION }}" - TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TIMM_COMMIT_ID }}" - TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRITON_COMMIT_ID }}" - TIMEOUT_MODELS="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TIMEOUT_MODELS }}" - # Test status - if [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests.result }}" == "success" ];then - test_status=Success - elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests.result }}" == "failure" ];then - 
test_status=Failure - cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}" - else - test_status=None - exit 0 - fi - # Test Type - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_type="On-demand" - test_issue_id=426 - cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}" - elif [ "${{ github.event.schedule }}" == "0 16 * * 5" ];then - test_type="Weekly" - test_issue_id=432 - else - test_type="Nightly" - test_issue_id=432 - fi - # Test report - echo -e "**${test_status}** $test_type Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt - printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${GITHUB_WORKFLOW_SHA:0:7} on ${GITHUB_REF_NAME} | " >> ${{ github.workspace }}/report.txt - printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt - echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - printf "[${TRANSFORMERS_VERSION:0:7}](https://github.com/huggingface/transformers/commit/${TRANSFORMERS_VERSION:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TIMM_COMMIT_ID:0:7}](https://github.com/huggingface/pytorch-image-models/commit/${TIMM_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Device | OS | GCC 
| Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION | $KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" - if [ "${{ inputs.model }}" != "" ];then - test_scope+="; model=${{ inputs.model }}" - fi - echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt - fi - echo "$TIMEOUT_MODELS" |awk '{printf("%s\\n", $0)}' >> ${{ github.workspace }}/report.txt - echo "$cc_comment" >> ${{ github.workspace }}/report.txt - # Report - report_txt=$(cat ${{ github.workspace }}/report.txt) - gh --repo $repo issue comment $test_issue_id --body "$report_txt" diff --git a/.github/workflows/nightly_ondemand_rolling.yml b/.github/workflows/nightly_ondemand_rolling.yml deleted file mode 100644 index 03101ebf3a..0000000000 --- a/.github/workflows/nightly_ondemand_rolling.yml +++ /dev/null @@ -1,460 +0,0 @@ -name: Nightly-OnDemand Tests Rolling - -on: - schedule: - # GMT+8 21:30 every workday - - cron: '30 13 * * 0-4' - # GMT+8 0:30 Saturday - - cron: '30 16 * * 5' - workflow_dispatch: - inputs: - pytorch: - required: false - type: string - default: 'main' - description: Pytorch branch/commit - keep_torch_xpu_ops: - required: false - type: string - default: 'false' - description: Keep torch-xpu-ops pin. `true` means use pined commit - ut: - required: false - type: string - default: 'torch_xpu' - description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,torch_xpu,xpu_profiling`. Delimiter is comma - triton: - required: false - type: string - default: '' - description: Triton commit. 
Use pytorch pined commit by default - suite: - required: true - type: string - default: 'huggingface' - description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma - dt: - required: true - type: string - default: 'float32' - description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma - mode: - required: true - type: string - default: 'inference' - description: Test mode. `inference,training`. Delimiter is comma - scenario: - required: true - type: string - default: 'accuracy' - description: Test scenario. `accuracy,performance`. Delimiter is comma - model: - required: false - type: string - default: '' - description: Model. Will only run this one mode if set - python: - required: false - type: string - default: '3.10' - description: Python version - -permissions: read-all - -concurrency: - group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.ut }}-${{ inputs.triton }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: ${{ github.event_name != 'schedule' }} - -jobs: - Linux-Nightly-Ondemand-Build-Rolling: - if: ${{ github.repository_owner == 'intel' }} - name: linux-nightly-ondemand-rolling - secrets: inherit - uses: ./.github/workflows/_linux_build.yml - with: - pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }} - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - driver: rolling - runner: pvc_rolling - - Linux-Nightly-Ondemand-UT-Tests-Rolling: - if: ${{ github.event_name == 'schedule' || inputs.ut != '' }} - name: linux-nightly-ondemand-rolling - needs: Linux-Nightly-Ondemand-Build-Rolling - uses: ./.github/workflows/_linux_ut.yml - with: - 
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-Rolling.outputs.torch_commit_id }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }} - driver: rolling - runner: pvc_rolling - - Linux-Nightly-Ondemand-OP-Microbench-Tests-Rolling: - name: linux-nightly-ondemand-rolling / Op_microbench - permissions: - issues: write - needs: Linux-Nightly-Ondemand-Build-Rolling - uses: ./.github/workflows/_linux_op_benchmark.yml - with: - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-Rolling.outputs.torch_commit_id }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }} - driver: rolling - runner: pvc_rolling - - Linux-Nightly-Ondemand-E2E-Tests-Rolling: - runs-on: pvc_rolling - name: linux-nightly-ondemand-rolling / e2e_test - needs: Linux-Nightly-Ondemand-Build-Rolling - timeout-minutes: 3600 - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1645 - pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-Rolling.outputs.torch_commit_id }} - keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - NEOReadDebugKeys: 1 - DisableScratchPages: 1 - run_type: ${{ (github.event_name == 'schedule' && (github.event.schedule == '30 16 * * 5' && 'weekly' || 'nightly')) || 'on-demand' }} - outputs: - TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }} - TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }} - 
DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} - KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} - BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} - OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} - GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} - TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} - TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} - TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} - TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} - TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} - TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }} - TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }} - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=${{ env.python }} cmake ninja -y - source activate e2e_ci - pip install pandas scipy psutil requests - - name: Download Pytorch wheel - if: ${{ inputs.pytorch != 'nightly_wheel' }} - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} - - name: Prepare Stock Pytorch - run: | - pwd - cd ../ - rm -rf pytorch || sudo rm -rf pytorch - source activate e2e_ci - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch - git checkout ${TORCH_COMMIT_ID} - # apply extra PRs for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git diff && git show -s - - name: Identify pinned versions - id: pinned - run: | - source .github/scripts/env.sh - cd ../pytorch - if [ -z ${{ inputs.triton }} ]; then - echo 
"TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - else - echo "TRITON_COMMIT_ID=${{ inputs.triton }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - fi - echo "TORCH_BRANCH_ID=${{ inputs.pytorch }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - . 
/etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo ${GITHUB_ENV} - - name: Triton Installation - run: | - source activate e2e_ci - cd ../pytorch - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - pip install pytorch_triton_xpu-*.whl - - name: Show GITHUB_ENV - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache - - # Nihglty launch - - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16 - mode: inference,training - scenario: accuracy - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Nightly Torchbench BF16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Nightly Timm_models FP16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: float16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - 
driver: rolling - - name: Nightly PT2E Full Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/pt2e - with: - dt: float32,int8 - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - # Weekly launch - - name: Weekly Huggingface Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Weekly Torchbench Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Weekly Timm_models Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Weekly PT2E Accuracy Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: float32,int8 - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - # On-demand launch - - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && inputs.suite != 'pt2e' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: ${{ inputs.suite }} - env_prepare: true - dt: ${{ inputs.dt }} - mode: ${{ inputs.mode }} - scenario: ${{ inputs.scenario }} - hf_token: ${{ 
secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && contains(inputs.suite, 'pt2e') }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: ${{ inputs.dt }} - scenario: ${{ inputs.scenario }} - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - driver: rolling - - name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - if [ "${{ env.run_type }}" == "on-demand" ];then - artifact_type="weekly" - else - artifact_type="${{ env.run_type }}" - fi - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-${artifact_type}-Rolling-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - id: summary - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ - find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days - tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . 
# backup logs - # Print summary - if [ "${{ inputs.suite }}" != 'pt2e' ];then - source activate e2e_ci - export LTS_OR_ROLLING='rolling' - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} - fi - fi - pt2e_summary_csv="$(find ${{ github.workspace }}/upload_files/ -name "summary.csv")" - if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} - failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) - if [ ${failed_num} -ne 0 ];then - echo "[Warning] PT2E has failures!" - fi - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-${{ env.run_type }}-Rolling-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files - - name: Upload Reference Run ID - if: ${{ env.run_type != 'on-demand' }} - run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ - sed "s/Inductor-${{ env.run_type }}-Rolling-XPU-E2E:.*/Inductor-${{ env.run_type }}-Rolling-XPU-E2E: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt - - Tests-Failure-And-Report: - if: ${{ ! 
cancelled() }} - runs-on: [ self-hosted, Linux ] - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - needs: Linux-Nightly-Ondemand-E2E-Tests-Rolling - steps: - - name: Report github issue for XPU OPS nightly - if: github.repository_owner == 'intel' - run: | - set -xe - # Test env - build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - repo="${{ github.repository }}" - TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCH_BRANCH_ID }}" - TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCH_COMMIT_ID }}" - KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.KERNEL_VERSION }}" - DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.DRIVER_VERSION }}" - BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.BUNDLE_VERSION }}" - OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.OS_PRETTY_NAME }}" - GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.GCC_VERSION }}" - TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCHBENCH_COMMIT_ID }}" - TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCHVISION_COMMIT_ID }}" - TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCHAUDIO_COMMIT_ID }}" - TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TRANSFORMERS_VERSION }}" - TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TIMM_COMMIT_ID }}" - TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TRITON_COMMIT_ID }}" - TIMEOUT_MODELS="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TIMEOUT_MODELS }}" - # Test status - if [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.result }}" == 
"success" ];then - test_status=Success - elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.result }}" == "failure" ];then - test_status=Failure - cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}" - else - test_status=None - exit 0 - fi - # Test Type - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_type="On-demand" - test_issue_id=426 - cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}" - elif [ "${{ github.event.schedule }}" == "30 16 * * 5" ];then - test_type="Weekly" - test_issue_id=432 - else - test_type="Nightly" - test_issue_id=432 - fi - # Test report - echo -e "**${test_status}** $test_type Rolling Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt - printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${GITHUB_WORKFLOW_SHA:0:7} on ${GITHUB_REF_NAME} | " >> ${{ github.workspace }}/report.txt - printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt - echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - printf "[${TRANSFORMERS_VERSION:0:7}](https://github.com/huggingface/transformers/commit/${TRANSFORMERS_VERSION:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TIMM_COMMIT_ID:0:7}](https://github.com/huggingface/pytorch-image-models/commit/${TIMM_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - echo -e 
"[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | rolling-$DRIVER_VERSION |$KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" - if [ "${{ inputs.model }}" != "" ];then - test_scope+="; model=${{ inputs.model }}" - fi - echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt - fi - echo "$TIMEOUT_MODELS" |awk '{printf("%s\\n", $0)}' >> ${{ github.workspace }}/report.txt - echo "$cc_comment" >> ${{ github.workspace }}/report.txt - # Report - report_txt=$(cat ${{ github.workspace }}/report.txt) - gh --repo $repo issue comment $test_issue_id --body "$report_txt" diff --git a/.github/workflows/nightly_ondemand_whl.yml b/.github/workflows/nightly_ondemand_whl.yml deleted file mode 100644 index 23f5456f28..0000000000 --- a/.github/workflows/nightly_ondemand_whl.yml +++ /dev/null @@ -1,396 +0,0 @@ -name: Torch Nightly WHL Tests - -on: - schedule: - # GMT+8 21:00 every workday - - cron: '0 14 * * 0-4' - # GMT+8 0:00 Saturday - - cron: '0 17 * * 5' - workflow_dispatch: - inputs: - pytorch: - required: false - type: string - default: 'nightly' - description: Pytorch branch/commit - ut: - required: false - type: string - default: 'torch_xpu' - description: UT scope. `op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,torch_xpu,xpu_profiling`. Delimiter is comma - suite: - required: true - type: string - default: 'huggingface' - description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. 
Delimiter is comma - dt: - required: true - type: string - default: 'float32' - description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma - mode: - required: true - type: string - default: 'inference' - description: Test mode. `inference,training`. Delimiter is comma - scenario: - required: true - type: string - default: 'accuracy' - description: Test scenario. `accuracy,performance`. Delimiter is comma - model: - required: false - type: string - default: '' - description: Model. Will only run this one mode if set - python: - required: false - type: string - default: '3.10' - description: Python version - -permissions: read-all - -concurrency: - group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.ut }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: ${{ github.event_name != 'schedule' }} - -jobs: - Linux-Nightly-Ondemand-UT-WHL-Tests: - if: ${{ (github.event_name == 'schedule' || inputs.ut != '') && github.repository_owner == 'intel' }} - uses: ./.github/workflows/_linux_ut.yml - with: - ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_transformers,op_extended,op_ut' || inputs.ut }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - pytorch: nightly_wheel - runner: linux.idc.xpu - - Linux-Nightly-Ondemand-E2E-WHL-Tests: - runs-on: pvc_e2e - if: ${{ github.repository_owner == 'intel' }} - timeout-minutes: 3600 - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1645 - pytorch: ${{ github.event_name == 'schedule' && 'nightly' || inputs.pytorch }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - run_type: ${{ (github.event_name == 'schedule' && (github.event.schedule == '0 17 * * 5' && 'weekly' || 'nightly')) || 'on-demand' }} - outputs: - 
TORCH_BRANCH_ID: ${{ steps.installed.outputs.TORCH_BRANCH_ID }} - TORCH_COMMIT_ID: ${{ steps.installed.outputs.TORCH_COMMIT_ID }} - TORCH_XPU_OPS_COMMIT: ${{ steps.installed.outputs.TORCH_XPU_OPS_COMMIT }} - TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} - TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} - TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} - TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} - TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} - TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }} - DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} - KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} - BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} - OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} - GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} - TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }} - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=${{ env.python }} cmake ninja -y - source activate e2e_ci - pip install pandas scipy psutil requests - - name: Prepare Stock Pytorch - id: installed - run: | - pwd - cd ../ - source activate e2e_ci - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - echo "TORCH_BRANCH_ID=$(python -c 'import torch; print(torch.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=${TORCH_COMMIT_ID}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - rm -rf pytorch || sudo rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout ${TORCH_COMMIT_ID} - # apply PRs 
for stock pytorch - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git diff && git show -s - - name: Identify pinned versions - id: pinned - run: | - source activate e2e_ci - source .github/scripts/env.sh - echo "TORCHVISION_COMMIT_ID=$(python -c 'import torchvision; print(torchvision.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(python -c 'import torchaudio; print(torchaudio.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRITON_COMMIT_ID=$(python -c 'import triton; print(triton.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - cd ../pytorch - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(pip list |grep cmplr |head -n 1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - . 
/etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo ${GITHUB_ENV} - - name: Show GITHUB_ENV - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache - - # Nihglty launch - - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Torchbench BF16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy - pytorch: nightly_wheel - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Timm_models FP16 Training Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: float16 - mode: training - scenario: accuracy - pytorch: nightly_wheel - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly PT2E Accuracy Test - if: ${{ env.run_type == 'nightly' }} - uses: ./.github/actions/pt2e - with: - dt: float32,int8 - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - env_prepare: true - - # Weekly launch - - name: Weekly Huggingface Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - env_prepare: true - dt: 
float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly Torchbench Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly Timm_models Full Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - env_prepare: true - dt: float32,bfloat16,float16,amp_bf16,amp_fp16 - mode: inference,training - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Weekly PT2E Accuracy Test - if: ${{ env.run_type == 'weekly' }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: float32,int8 - scenario: accuracy,performance - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - # On-demand launch - - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && inputs.suite != 'pt2e' }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: ${{ inputs.suite }} - env_prepare: true - dt: ${{ inputs.dt }} - mode: ${{ inputs.mode }} - scenario: ${{ inputs.scenario }} - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: OnDemand PT2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ github.event_name != 'schedule' && contains(inputs.suite, 'pt2e') }} - uses: ./.github/actions/pt2e - with: - env_prepare: true - dt: ${{ inputs.dt }} - scenario: ${{ inputs.scenario }} - pytorch: nightly_wheel - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - 
- name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - if [ "${{ env.run_type }}" == "on-demand" ];then - artifact_type="weekly" - else - artifact_type="${{ env.run_type }}" - fi - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-${artifact_type}-Pre-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - id: summary - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ - find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days - tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . # backup logs - # Print summary - if [ "${{ inputs.suite }}" != 'pt2e' ];then - source activate e2e_ci - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! 
Please check them" - exit ${exit_label} - fi - fi - pt2e_summary_csv="$(find ${{ github.workspace }}/upload_files/ -name "summary.csv")" - if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} - failed_num=$(grep ',failed' ${pt2e_summary_csv} |wc -l) - if [ ${failed_num} -ne 0 ];then - echo "[Warning] PT2E has failures!" - fi - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-${{ env.run_type }}-Pre-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files - - name: Upload Reference Run ID - if: ${{ env.run_type != 'on-demand' }} - run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ - sed "s/Inductor-${{ env.run_type }}-Pre-XPU-E2E:.*/Inductor-${{ env.run_type }}-Pre-XPU-E2E: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt - - Tests-Failure-And-Report: - if: ${{ ! 
cancelled() }} - runs-on: [ self-hosted, Linux ] - permissions: - issues: write - env: - GH_TOKEN: ${{ github.token }} - python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} - needs: Linux-Nightly-Ondemand-E2E-WHL-Tests - steps: - - name: Report github issue for XPU OPS nightly - if: github.repository_owner == 'intel' - run: | - set -xe - # Test env - build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - repo="${{ github.repository }}" - TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_BRANCH_ID }}" - TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_COMMIT_ID }}" - TORCH_XPU_OPS_COMMIT="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_XPU_OPS_COMMIT }}" - DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.DRIVER_VERSION }}" - KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.KERNEL_VERSION }}" - BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.BUNDLE_VERSION }}" - OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.OS_PRETTY_NAME }}" - GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.GCC_VERSION }}" - TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHBENCH_COMMIT_ID }}" - TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHVISION_COMMIT_ID }}" - TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHAUDIO_COMMIT_ID }}" - TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TRANSFORMERS_VERSION }}" - TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TIMM_COMMIT_ID }}" - TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TRITON_COMMIT_ID }}" - TIMEOUT_MODELS="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TIMEOUT_MODELS }}" - # Test status - if [ "${{ 
needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.result }}" == "success" ];then - test_status=Success - elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.result }}" == "failure" ];then - test_status=Failure - cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}" - else - test_status=None - exit 0 - fi - # Test Type - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_type="On-demand" - test_issue_id=426 - cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}" - elif [ "${{ github.event.schedule }}" == "0 17 * * 5" ];then - test_type="Weekly" - test_issue_id=432 - else - test_type="Nightly" - test_issue_id=432 - fi - # Test report - echo -e "**${test_status}** $test_type WHL Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt - printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${TORCH_XPU_OPS_COMMIT:0:7} on pinned | " >> ${{ github.workspace }}/report.txt - printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt - echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - printf "[${TRANSFORMERS_VERSION:0:7}](https://github.com/huggingface/transformers/commit/${TRANSFORMERS_VERSION:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TIMM_COMMIT_ID:0:7}](https://github.com/huggingface/pytorch-image-models/commit/${TIMM_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt - 
echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION |$KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt - if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then - test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" - if [ "${{ inputs.model }}" != "" ];then - test_scope+="; model=${{ inputs.model }}" - fi - echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt - fi - echo "$TIMEOUT_MODELS" |awk '{printf("%s\\n", $0)}' >> ${{ github.workspace }}/report.txt - echo "$cc_comment" >> ${{ github.workspace }}/report.txt - # Report - report_txt=$(cat ${{ github.workspace }}/report.txt) - gh --repo $repo issue comment $test_issue_id --body "$report_txt" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3f3b1c1b58..23683fa701 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -21,9 +21,8 @@ concurrency: jobs: preci-lint-check: - name: preci-lint-check if: ${{ github.repository_owner == 'intel' }} - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 30 steps: - name: Checkout torch-xpu-ops @@ -51,11 +50,9 @@ jobs: export CLANG=1 bash third_party/torch-xpu-ops/.github/scripts/lintrunner.sh - preci-conditions-filter: - name: preci-conditions-filter - if: ${{ github.event.pull_request.draft == false }} - needs: [preci-lint-check] - runs-on: ubuntu-22.04 + conditions-filter: + if: ${{ github.repository_owner == 'intel' && github.event.pull_request.draft == false }} + runs-on: ubuntu-latest timeout-minutes: 10 env: GH_TOKEN: ${{ github.token }} @@ -92,175 +89,58 @@ jobs: 
disabled_tests="$(awk '/disable_/{printf("%s ", $0)}' pr-info.txt)" echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}" - preci-linux-build: - name: preci-linux - if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all')}} - needs: [preci-conditions-filter] + linux-build: + if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all')}} + needs: [conditions-filter, preci-lint-check] secrets: inherit uses: ./.github/workflows/_linux_build.yml with: + runner: pvc_rolling + test_type: build-cicd pytorch: main - runner: pvc_e2e + torch_xpu_ops: cicd - preci-linux-ut: - name: preci-linux - needs: [preci-conditions-filter, preci-linux-build] + linux-ut: + needs: [conditions-filter, linux-build] uses: ./.github/workflows/_linux_ut.yml with: - disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }} - ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + test_type: build-cicd-ut + pytorch: main + torch_xpu_ops: cicd + ut: ut_regression,ut_transformers,ut_extended,ut_op,xpu_dev1,xpu_distributed + disabled_tests: ${{ needs.conditions-filter.outputs.disabled_tests }} - preci-linux-e2e: - if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }} - name: preci-linux / e2e_test - needs: [preci-conditions-filter, preci-linux-build] - runs-on: pvc_e2e - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1645 - timeout-minutes: 300 - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=3.10 cmake ninja -y - source activate e2e_ci - pip install pandas scipy psutil requests - - name: Download Pytorch wheel - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number }} - - 
name: Install Pytorch XPU - run: | - source activate e2e_ci - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ../ - rm -rf pytorch || sudo rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout ${TORCH_COMMIT_ID} - # apply PRs for stock pytorch - # https://github.com/pytorch/pytorch/pull/152940 internal use only for subset model list - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 - git show -s && git status && git diff - - name: Triton Installation - run: | - source activate e2e_ci - cd ../pytorch - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - python .github/scripts/build_triton_wheel.py --device xpu - pip install pytorch_triton_xpu-*.whl - - name: Identify pinned versions - run: | - cd ../pytorch - echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}" - . 
/etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - source ../torch-xpu-ops/.github/scripts/env.sh - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - - name: Torch Config - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log || sudo rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* || sudo rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache || sudo rm -rf ~/.triton/cache - cd .. - source activate e2e_ci - python -c "import triton; print(triton.__version__)" - python pytorch/torch/utils/collect_env.py - - name: Huggingface BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: bfloat16 - mode: training - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Huggingface FP16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: float16 - mode: training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Timm_models BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: bfloat16 - mode: training - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Torchbench BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ 
secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-weekly-LTS-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files || sudo rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - # Print summary - source activate e2e_ci - export IS_PR=1 - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! 
cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-CI-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files + linux-e2e: + name: linux-e2e + if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_e2e') }} + needs: [conditions-filter, linux-build] + strategy: + fail-fast: false + matrix: + suite: [huggingface, timm_models, torchbench] + uses: ./.github/workflows/_linux_e2e.yml + with: + runner: pvc_rolling + test_type: build-cicd-e2e + pytorch: main + suite: ${{ matrix.suite }} + linux-e2e-summary: + if: ${{ ! cancelled() }} + name: linux-e2e + permissions: write-all + needs: [linux-e2e] + uses: ./.github/workflows/_linux_e2e_summary.yml + with: + test_type: build-cicd-e2e - preci-windows: - name: preci-windows - if: ${{ !(contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_win')) }} - needs: [preci-conditions-filter] + windows: + name: windows + if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_win')) }} + needs: [conditions-filter, preci-lint-check] uses: ./.github/workflows/_windows_ut.yml with: - ut: op_extended,torch_xpu + ut: ut_extended,ut_torch runner: Windows_CI - src_changed: ${{ needs.preci-conditions-filter.outputs.src_changed }} - has_label: ${{ needs.preci-conditions-filter.outputs.has_label }} + src_changed: ${{ needs.conditions-filter.outputs.src_changed }} + has_label: ${{ needs.conditions-filter.outputs.has_label }} diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 01a608ae6d..49f3be5876 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -1,6 +1,7 @@ import os import sys +import torch from skip_list_common import skip_dict from skip_list_win import 
skip_dict as skip_dict_win @@ -16,8 +17,20 @@ skip_options += skip_option skip_options += '"' +# pytest options +xpu_num = torch.xpu.device_count() +parallel_options = ( + " --dist worksteal " + + " ".join([f"--tx popen//env:ZE_AFFINITY_MASK={x}" for x in range(xpu_num)]) + if xpu_num > 1 + else " -n 1 " +) +test_options = f" --timeout 600 --timeout_method=thread {parallel_options} " + os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" -test_command = "pytest --timeout 600 -v --timeout_method=thread --junit-xml=./op_extended.xml test_ops_xpu.py" +test_command = ( + f" pytest {test_options} -v --junit-xml=./ut_extended.xml test_ops_xpu.py " +) test_command += skip_options res = os.system(test_command) sys.exit(res) diff --git a/test/xpu/run_test_with_only.py b/test/xpu/run_test_with_only.py index 9d70896b11..06ebc87e8d 100644 --- a/test/xpu/run_test_with_only.py +++ b/test/xpu/run_test_with_only.py @@ -1,12 +1,25 @@ import os import sys +import torch + # Cases in the file is too slow to run all suites on CPU. So add white list. 
def launch_test(test_case, skip_list=None, exe_list=None): os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1" os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" + + # pytest options + xpu_num = torch.xpu.device_count() + parallel_options = ( + " --dist worksteal " + + " ".join([f"--tx popen//env:ZE_AFFINITY_MASK={x}" for x in range(xpu_num)]) + if xpu_num > 1 + else " -n 1 " + ) + test_options = f" --timeout 600 --timeout_method=thread {parallel_options} " + if skip_list is not None: skip_options = ' -k "not ' + skip_list[0] for skip_case in skip_list[1:]: @@ -14,8 +27,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - "pytest --timeout 600 -v " - + "--junit-xml=./op_ut_with_only.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_only.xml " + test_case + skip_options ) @@ -27,15 +39,14 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - "pytest --timeout 600 -v " - + "--junit-xml=./op_ut_with_only.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_only.xml " + test_case + exe_options ) return os.system(test_command) else: test_command = ( - "pytest --timeout 600 -v --junit-xml=./op_ut_with_only.xml " + test_case + f" pytest {test_options} -v --junit-xml=./ut_op_with_only.xml " + test_case ) return os.system(test_command) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index df524100b3..26c0152f71 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -1163,6 +1163,17 @@ def copy_tests( def launch_test(test_case, skip_list=None, exe_list=None): os.environ["PYTORCH_ENABLE_XPU_FALLBACK"] = "1" os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" + + # pytest options + xpu_num = torch.xpu.device_count() + parallel_options = ( + " --dist worksteal " + + " ".join([f"--tx popen//env:ZE_AFFINITY_MASK={x}" for x in range(xpu_num)]) + if xpu_num > 1 + else " -n 1 " + ) + test_options 
= f" --timeout 600 --timeout_method=thread {parallel_options} " + if skip_list is not None: skip_options = ' -k "not ' + skip_list[0] for skip_case in skip_list[1:]: @@ -1170,7 +1181,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_options += skip_option skip_options += '"' test_command = ( - f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) test_command += skip_options @@ -1181,13 +1192,13 @@ def launch_test(test_case, skip_list=None, exe_list=None): exe_options += exe_option exe_options += '"' test_command = ( - f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) test_command += exe_options else: test_command = ( - f"pytest --timeout 600 -v --junit-xml=./op_ut_with_skip_{test_case}.xml " + f" pytest {test_options} -v --junit-xml=./ut_op_with_skip_{test_case}.xml " + test_case ) return os.system(test_command)