name: Weekly-Distributed-OnDemand Tests

on:
  schedule:
    # GMT+8 0:00 Saturday
    - cron: '0 16 * * 5'
  workflow_dispatch:
    inputs:
      pytorch:
        required: false
        type: string
        default: 'https://github.com/daisyden/pytorch.git@distributed_2.8'
        description: PyTorch branch/commit
      keep_torch_xpu_ops:
        required: false
        type: string
        default: 'https://github.com/intel/torch-xpu-ops.git@daisyden/distributed_2.8'
        description: Keep torch-xpu-ops pin. `true` means use the pinned commit
      triton:
        required: false
        type: string
        default: 'bdd0656b'
        description: Triton commit. Uses the PyTorch pinned commit by default
      python:
        required: false
        type: string
        default: '3.10'
        description: Python version

permissions: read-all

concurrency:
  group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.triton }}-${{ inputs.python }}
  cancel-in-progress: ${{ github.event_name != 'schedule' }}

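# NOTE: on `schedule` runs the `inputs.*` values are empty, so the
# `github.event_name == 'schedule' && <default> || inputs.<x>` expressions below
# fall back to the hard-coded values ('main', 'false', '3.10').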
jobs:
  Linux-Weekly-Ondemand-Build:
    if: ${{ github.repository_owner == 'intel' }}
    name: linux-weekly-ondemand
    secrets: inherit
    uses: ./.github/workflows/_linux_build.yml
    with:
      pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
      keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
      python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
      runner: PVC-7358

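  # The distributed test job runs on the same PVC-7358 self-hosted runner label as the
  # build job and downloads the Torch-XPU-Wheel-* artifact that _linux_build.yml is
  # expected to upload for this run.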
  Linux-Weekly-Ondemand-Distributed-Tests:
    runs-on: PVC-7358
    name: linux-weekly-ondemand / distributed_weekly
    needs: Linux-Weekly-Ondemand-Build
    timeout-minutes: 3600
    permissions:
      issues: write
    env:
      GH_TOKEN: ${{ github.token }}
      pytorch: ${{ needs.Linux-Weekly-Ondemand-Build.outputs.torch_commit_id }}
      keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
      python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
      run_type: ${{ (github.event_name == 'schedule' && (github.event.schedule == '0 16 * * 5' && 'weekly' || 'nightly')) || 'on-demand' }}
    outputs:
      TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }}
      TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }}
      DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }}
      KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }}
      BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }}
      OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }}
      GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }}
      TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }}
      TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }}
      TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }}
      TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }}
      TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }}
      TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }}
      TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }}
    steps:
    - name: Checkout torch-xpu-ops
      uses: actions/checkout@v4
    - name: Prepare Conda ENV
      run: |
        which conda && conda clean -ay
        conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
        conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
        source activate e2e_ci
        pip install pandas scipy psutil requests
    - name: Download Pytorch wheel
      if: ${{ inputs.pytorch != 'nightly_wheel' }}
      uses: actions/download-artifact@v4
      with:
        name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}
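    # The installed wheel pins the exact PyTorch commit (torch.version.git_version);
    # the source tree below is checked out at that same commit so that test scripts
    # and CI pin files match the binary that was built.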
    - name: Prepare Stock Pytorch
      run: |
        pwd
        cd ../
        rm -rf pytorch || sudo rm -rf pytorch
        source activate e2e_ci
        pip install --force-reinstall ${{ github.workspace }}/torch*.whl
        TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
        git clone https://github.com/daisyden/pytorch.git pytorch
        cd pytorch
        git checkout ${TORCH_COMMIT_ID}
        git status && git diff && git show -s
        rm -rf vision || sudo rm -rf vision
        git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
        pip install -r .ci/docker/requirements-ci.txt
    - name: Prepare Torch-xpu-ops
      run: |
        cd ../pytorch
        rm -rf third_party/torch-xpu-ops
        if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
          cd third_party
          git clone https://github.com/intel/torch-xpu-ops.git
          cd torch-xpu-ops
          git checkout daisyden/distributed_2.8
        else
          TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
          git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
          cd third_party/torch-xpu-ops
          git checkout ${TORCH_XPU_OPS_COMMIT}
        fi
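    # Triton for XPU is built from source: if no `triton` input is given, the commit
    # pinned by PyTorch in .ci/docker/ci_commit_pins/triton-xpu.txt is used instead.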
    - name: Triton Installation
      run: |
        source activate e2e_ci
        cd ../pytorch
        # TRITON_REPO is not defined anywhere else in this workflow; assume the Intel
        # XPU Triton backend repository here so the pip install below has a source.
        TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
        if [ -z "${{ inputs.triton }}" ]; then
          TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)"
        else
          TRITON_COMMIT_ID="${{ inputs.triton }}"
        fi
        if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
          pip install cmake ninja pybind11
          pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
        fi
    - name: Identify pinned versions
      id: pinned
      run: |
        source .github/scripts/env.sh
        cd ../pytorch
        if [ -z "${{ inputs.triton }}" ]; then
          echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
        else
          echo "TRITON_COMMIT_ID=${{ inputs.triton }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
        fi
        echo "TORCH_BRANCH_ID=${{ inputs.pytorch }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
        echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
        echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
        echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
        echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
        echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
        echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
        . /etc/os-release
        echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
        echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
        echo ${GITHUB_ENV}
    - name: Show GITHUB_ENV
      run: |
        echo "$GITHUB_ENV"
        cat "$GITHUB_ENV"
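    # Relax the Yama ptrace restriction for the duration of the run, presumably so the
    # distributed tests can attach to and inspect peer processes; the original value is
    # saved to ptrace_scope.bk and restored in "Reset Ptrace_scope" below.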
    - name: Set Ptrace_scope
      if: ${{ always() }}
      run: |
        set -x -e -u -o pipefail
        sudo rm -rf ptrace_scope.bk
        sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk
        cat ptrace_scope.bk
        echo "0" |sudo tee /proc/sys/kernel/yama/ptrace_scope
    - name: Run Torch XPU Distributed UT
      run: |
        set -x -e -o pipefail
        source activate e2e_ci
        pip install pytest pytest-timeout xmlrunner unittest-xml-reporting zstandard transformers
        pip install hypothesis==6.131.27
        mkdir -p ut_log/xpu_distributed
        cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/ut_result_check.sh ut_log/xpu_distributed/
        cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/check-ut.py ut_log/
        cd ../pytorch/third_party/torch-xpu-ops/test/xpu
        XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())")
        if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then
          echo -e "[ERROR] XCCL is not enabled"
          exit 1
        fi
        python run_distributed_local.py \
        2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_error.log | \
        tee ${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test.log || true
        cp *.xml ${{ github.workspace }}/ut_log
    - name: Reset Ptrace_scope
      if: ${{ always() }}
      run: |
        if [ -f ptrace_scope.bk ]; then
          sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope
        fi
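    # Aggregate the junit XML results into the step summary; the body of issue #1624 is
    # saved as Known_issue.log, which ut_result_check.sh presumably uses to filter known
    # failures. The exact filtering logic lives in the helper scripts, not this workflow.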
    - name: UT Test Results Summary
      run: |
        source activate e2e_ci
        pip install junitparser
        cd ${{ github.workspace }}/ut_log/
        python check-ut.py ${{ github.workspace }}/ut_log/*.xml \
        2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_summary.log \
        >> $GITHUB_STEP_SUMMARY || true
        cd ${{ github.workspace }}/ut_log/xpu_distributed
        gh --repo https://github.com/intel/torch-xpu-ops.git issue view 1624 --json body -q .body | sed '/^$/d' > Known_issue.log
        bash ut_result_check.sh 'pytorch_distributed'
    - name: Upload Inductor XPU UT Log
      if: ${{ ! cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed
        path: ${{ github.workspace }}/ut_log