Enable Unit Testing and CI for XPU #2709

Draft · wants to merge 19 commits into main
9 changes: 9 additions & 0 deletions .github/scripts/ci_test_xpu.sh
@@ -0,0 +1,9 @@
#!/bin/bash

python3 -m pip install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/nightly/xpu --force-reinstall --no-cache-dir
python3 setup.py install

pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'

cd test/quantization
pytest -v -s *.py
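The script assumes both pip installs and the torchao build succeed and that the runner actually exposes an XPU device. A small sanity check run before pytest can fail fast when that assumption breaks; a minimal sketch (illustrative, not part of this PR — the file name is hypothetical):

# xpu_sanity_check.py -- hypothetical pre-test check, not part of this PR.
# Assumes a PyTorch XPU nightly build is installed (see the pip install above).
import sys

import torch

# torch.xpu only exists on builds compiled with XPU support
if not hasattr(torch, "xpu") or not torch.xpu.is_available():
    sys.exit("no usable XPU device: check the nightly wheel and the runner")

# report what the runner exposes so CI logs are easier to debug
print(f"XPU devices: {torch.xpu.device_count()}")
print(f"device 0: {torch.xpu.get_device_name(0)}")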
156 changes: 156 additions & 0 deletions .github/workflows/pr-test-xpu.yml
@@ -0,0 +1,156 @@
# TODO: this looks sort of similar to _linux-test, but there are like a dozen
# places where you would have to insert an if statement. Probably it's better to
# just use a different workflow altogether

name: xpu-test

on:
  push:
    branches:
      - main
      - 'gh/**'
  pull_request:
    branches:
      - main
      - 'gh/**'

concurrency:
  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  test:
    # Don't run on forked repos or empty test matrix
    # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    timeout-minutes: 60
    runs-on: ao-pvc
    env:
      DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-jammy-xpu-2025.1-py3-b388c12018df5d6ce2f94b7fb337fa3729978ab3
      TEST_COMMAND: .github/scripts/ci_test_xpu.sh
      PYTORCH_RETRY_TEST_CASES: 1
      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout Torchao
        uses: actions/checkout@v4

      - name: Clean all stopped docker containers
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If another runner is already pruning on this node, skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner health check GPU count
        if: always()
        shell: bash
        run: |
          ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
          msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Pull public copy of the docker image
        id: print-ghcr-mirror
        shell: bash
        run: |
          echo "docker pull ${DOCKER_IMAGE}"
          docker pull "${DOCKER_IMAGE}"

      - name: Test
        id: test
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        timeout-minutes: 60
        run: |
          set -x

          # The detached container is cleaned up by the stop/teardown steps below.
          # GPU_FLAG is expanded unquoted on purpose: it may be empty or hold several flags.
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e BRANCH \
            -e SHA1 \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --user jenkins \
            --privileged \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # `docker run --detach` prints the container id; save it for later steps
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # the jenkins user has no write permission on the mounted workspace, so the
          # test script copies what it needs into the jenkins home inside the container
          docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"

      - name: Change permissions
        if: ${{ always() && steps.test.conclusion }}
        run: |
          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Stop container before exit
        if: always()
        run: |
          # Workaround for multiple runners on the same IDC node
          docker stop "${{ env.CONTAINER_NAME }}"

      - name: Store Core dumps on GitHub
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown XPU
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If another runner is already pruning on this node, skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi
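To debug a red run without an ao-pvc runner, the container's test step can be approximated locally. A rough Python equivalent of what ci_test_xpu.sh ultimately runs (assuming torchao and an XPU-enabled PyTorch are already installed; the script name is hypothetical):

# reproduce_ci_tests.py -- rough local equivalent of the CI test step; not part
# of this PR. Mirrors the `cd test/quantization && pytest -v -s *.py` step.
import subprocess
import sys

# run the quantization suite with the same verbosity flags as CI
result = subprocess.run(
    [sys.executable, "-m", "pytest", "-v", "-s", "test/quantization"],
    check=False,
)
sys.exit(result.returncode)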
52 changes: 25 additions & 27 deletions test/dtypes/test_affine_quantized.py
@@ -48,15 +48,18 @@
     is_ROCM,
     is_sm_at_least_89,
     is_sm_at_least_90,
+    auto_detect_device,
 )

 is_cusparselt_available = (
     hasattr(torch.backends, "cusparselt") and torch.backends.cusparselt.is_available()
 )

+_DEVICE = auto_detect_device()
+

 def get_quantization_functions(
-    do_sparse: bool, do_int4: bool, device: str = "cuda", int4_zp_int: bool = False
+    do_sparse: bool, do_int4: bool, device: str = _DEVICE, int4_zp_int: bool = False
 ):
     base_functions = [
         int8_weight_only(),
@@ -114,9 +117,9 @@ class TestAffineQuantized(TestCase):
         ["xpu"] if torch.xpu.is_available() else []
     )

-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+
     def test_tensor_core_layout_transpose(self):
-        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
+        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
         t = linear.weight
         shape = t.shape
         apply_int4_weight_only_quant = int4_weight_only(group_size=32)
@@ -182,7 +185,7 @@ def _apply(module, config_or_subclass_inserter):
         ql = _apply(linear, apply_quant)
         ql.to(device)

-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+
     def test_register_new_dispatch(self):
         from torchao.dtypes import AffineQuantizedTensor
         from torchao.dtypes.affine_quantized_tensor_ops import (
@@ -219,10 +222,10 @@ def apply_uint6_weight_only_quant(linear):
             )
             return linear

-        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
+        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
         apply_uint6_weight_only_quant(linear)

-        example_input = torch.randn(1, 128, dtype=torch.bfloat16, device="cuda")
+        example_input = torch.randn(1, 128, dtype=torch.bfloat16, device=_DEVICE)
         with self.assertRaisesRegex(
             AssertionError, "dispatching to my impl for uint6 weight only quant"
         ):
@@ -245,13 +248,13 @@ def test_print_quantized_module(self):
         ql = apply_quant(linear)
         assert "AffineQuantizedTensor" in str(ql)

-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+
     @common_utils.parametrize(
-        "apply_quant", get_quantization_functions(False, True, "cuda", False)
+        "apply_quant", get_quantization_functions(False, True, _DEVICE, False)
     )
     def test_test_copy__apply(self, apply_quant):
-        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
-        linear2 = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
+        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
+        linear2 = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)

         if isinstance(apply_quant, AOBaseConfig):
             quantize_(linear, apply_quant)
@@ -262,20 +265,20 @@ def test_test_copy__apply(self, apply_quant):
             ql = apply_quant(linear)
             ql2 = apply_quant(linear2)

-        example_input = torch.randn(1, 128, dtype=torch.bfloat16, device="cuda")
+        example_input = torch.randn(1, 128, dtype=torch.bfloat16, device=_DEVICE)
         output = ql(example_input)
         ql2.weight.copy_(ql.weight)
         ql2.bias = ql.bias
         output2 = ql2(example_input)
         self.assertEqual(output, output2)

-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+
     @common_utils.parametrize(
-        "apply_quant", get_quantization_functions(False, True, "cuda", False)
+        "apply_quant", get_quantization_functions(False, True, _DEVICE, False)
     )
     def test_copy__mismatch_metadata(self, apply_quant):
-        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
-        linear2 = torch.nn.Linear(128, 512, dtype=torch.bfloat16, device="cuda")
+        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
+        linear2 = torch.nn.Linear(128, 512, dtype=torch.bfloat16, device=_DEVICE)

         if isinstance(apply_quant, AOBaseConfig):
             quantize_(linear, apply_quant)
@@ -349,9 +352,8 @@ def test_alias(self, device, dtype):
         quantize_(dummy, Int8DynamicActivationInt8WeightConfig())
         _ = dummy.weight[...]

-    @common_utils.parametrize("device", ["cuda"])
+    @common_utils.parametrize("device", [_DEVICE])
     @common_utils.parametrize("dtype", [torch.bfloat16])
-    @skip_if_no_cuda()
     @skip_if_rocm("ROCm enablement in progress")
     def test_slice_int4wo(self, device, dtype):
         # in_feature not divisible by 1024
@@ -363,9 +365,7 @@ def test_slice_int4wo(self, device, dtype):
         _ = dummy.weight.narrow(0, 0, 64)
         _ = dummy.weight.narrow(1, 0, 128)

-    @common_utils.parametrize("device", ["cuda"])
     @common_utils.parametrize("dtype", [torch.float16, torch.bfloat16])
-    @skip_if_no_cuda()
     @skip_if_no_gemlite()
     def test_slice_gemlite(self, device, dtype):
         # in_feature not divisible by 1024
@@ -446,7 +446,7 @@ def dequant(input_layer, in_features, orig_shape):
         )
         self.assertEqual((W_slice_ref - W_slice).abs().mean().item(), 0)

-    @common_utils.parametrize("device", ["cuda"])
+    @common_utils.parametrize("device", [_DEVICE])
     @common_utils.parametrize("dtype", [torch.bfloat16])
     def test_matmul(self, device, dtype):
         x = torch.randn(53, 2048)
@@ -463,14 +463,13 @@ def test_matmul(self, device, dtype):
         # make sure it runs
         torch.matmul(x, w.t())

-    @common_utils.parametrize("device", ["cuda"])
+    @common_utils.parametrize("device", [_DEVICE])
     @common_utils.parametrize("dtype", [torch.bfloat16])
-    @skip_if_no_cuda()
     @skip_if_rocm("ROCm enablement in progress")
     def test_slice_and_copy_int4wo(self, device, dtype):
-        l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
+        l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16)
         l.weight = torch.nn.Parameter(
-            torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
+            torch.zeros(1024, 1024, dtype=torch.bfloat16, device=_DEVICE)
         )
         quantize_(l, Int4WeightOnlyConfig())
         param = l.weight
@@ -487,7 +486,7 @@ def test_slice_and_copy_int4wo(self, device, dtype):
         assert param.data.dequantize()[0][0] == 0

         # dummy_l has random input (shouldn't be 0)
-        dummy_l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
+        dummy_l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16)
         quantize_(dummy_l, Int4WeightOnlyConfig())
         quantized = dummy_l.weight
         quantized = quantized.narrow(0, 0, 512)
@@ -497,9 +496,8 @@ def test_slice_and_copy_int4wo(self, device, dtype):
         # making sure param.data is updated
         assert param.data.dequantize()[0][0] != 0

-    @common_utils.parametrize("device", ["cuda"])
+    @common_utils.parametrize("device", [_DEVICE])
     @common_utils.parametrize("dtype", [torch.bfloat16])
-    @skip_if_no_cuda()
     @skip_if_rocm("ROCm enablement in progress")
     def test_mm_int4wo(self, device, dtype):
         weight = torch.randn(512, 1024).to(device).to(dtype)
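The test changes lean on an auto_detect_device helper imported from torchao's utilities; its implementation is not part of this diff. A plausible sketch of its behavior, for readers following along (illustrative only — the real helper in torchao.utils may differ):

# Hypothetical sketch of the auto_detect_device() helper imported above.
import torch

def auto_detect_device() -> str:
    """Return the best available accelerator for the test suite."""
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"

_DEVICE = auto_detect_device()  # e.g. "xpu" on the ao-pvc runners

With a helper like this in place, every test that previously hard-coded device="cuda" runs on whichever accelerator the CI host provides, which is what lets the same suite cover both the CUDA and XPU runners.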