Enable Unit Testing and CI for XPU #2709

Draft · wants to merge 19 commits into main
9 changes: 9 additions & 0 deletions .github/scripts/ci_test_xpu.sh
@@ -0,0 +1,9 @@
#!/bin/bash

python3 -m pip install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/nightly/xpu --force-reinstall --no-cache-dir
python3 setup.py install

pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'

cd test/quantization
pytest -v -s *.py
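The script assumes both pip installs and the torchao build succeed and that the runner actually exposes an XPU device. A small sanity check run before pytest can fail fast when that assumption breaks; a minimal sketch (illustrative, not part of this PR — the file name is hypothetical):

# xpu_sanity_check.py -- hypothetical pre-test check, not part of this PR.
# Assumes a PyTorch XPU nightly build is installed (see the pip install above).
import sys

import torch

# torch.xpu only exists on builds compiled with XPU support
if not hasattr(torch, "xpu") or not torch.xpu.is_available():
    sys.exit("no usable XPU device: check the nightly wheel and the runner")

# report what the runner exposes so CI logs are easier to debug
print(f"XPU devices: {torch.xpu.device_count()}")
print(f"device 0: {torch.xpu.get_device_name(0)}")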
156 changes: 156 additions & 0 deletions .github/workflows/pr-test-xpu.yml
@@ -0,0 +1,156 @@
# TODO: this looks sort of similar to _linux-test, but there are like a dozen
# places where you would have to insert an if statement. Probably it's better to
# just use a different workflow altogether

name: xpu-test

on:
  push:
    branches:
      - main
      - 'gh/**'
  pull_request:
    branches:
      - main
      - 'gh/**'

concurrency:
  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  test:
    # Don't run on forked repos or empty test matrix
    # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    timeout-minutes: 60
    runs-on: ao-pvc
    env:
      DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-jammy-xpu-2025.1-py3-b388c12018df5d6ce2f94b7fb337fa3729978ab3
      TEST_COMMAND: .github/scripts/ci_test_xpu.sh
      PYTORCH_RETRY_TEST_CASES: 1
      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout Torchao
        uses: actions/checkout@v4

      - name: Clean all stopped docker containers
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If another runner is already pruning on this node, skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner health check GPU count
        if: always()
        shell: bash
        run: |
          ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
          msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Pull public copy of the docker image
        id: print-ghcr-mirror
        shell: bash
        run: |
          echo "docker pull ${DOCKER_IMAGE}"
          docker pull "${DOCKER_IMAGE}"

      - name: Test
        id: test
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        timeout-minutes: 60
        run: |
          set -x

          # The detached container is cleaned up by the stop/teardown steps below.
          # GPU_FLAG is expanded unquoted on purpose: it may be empty or hold several flags.
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e BRANCH \
            -e SHA1 \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --user jenkins \
            --privileged \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # `docker run --detach` prints the container id; save it for later steps
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # the jenkins user has no write permission on the mounted workspace, so the
          # test script copies what it needs into the jenkins home inside the container
          docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"

      - name: Change permissions
        if: ${{ always() && steps.test.conclusion }}
        run: |
          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Stop container before exit
        if: always()
        run: |
          # Workaround for multiple runners on the same IDC node
          docker stop "${{ env.CONTAINER_NAME }}"

      - name: Store Core dumps on GitHub
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown XPU
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If another runner is already pruning on this node, skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi
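To debug a red run without an ao-pvc runner, the container's test step can be approximated locally. A rough Python equivalent of what ci_test_xpu.sh ultimately runs (assuming torchao and an XPU-enabled PyTorch are already installed; the script name is hypothetical):

# reproduce_ci_tests.py -- rough local equivalent of the CI test step; not part
# of this PR. Mirrors the `cd test/quantization && pytest -v -s *.py` step.
import subprocess
import sys

# run the quantization suite with the same verbosity flags as CI
result = subprocess.run(
    [sys.executable, "-m", "pytest", "-v", "-s", "test/quantization"],
    check=False,
)
sys.exit(result.returncode)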
52 changes: 25 additions & 27 deletions test/dtypes/test_affine_quantized.py
@@ -48,15 +48,18 @@
     is_ROCM,
     is_sm_at_least_89,
     is_sm_at_least_90,
+    auto_detect_device,
 )

 is_cusparselt_available = (
     hasattr(torch.backends, "cusparselt") and torch.backends.cusparselt.is_available()
 )

+_DEVICE = auto_detect_device()
+

 def get_quantization_functions(
-    do_sparse: bool, do_int4: bool, device: str = "cuda", int4_zp_int: bool = False
+    do_sparse: bool, do_int4: bool, device: str = _DEVICE, int4_zp_int: bool = False
 ):
     base_functions = [
         int8_weight_only(),
@@ -114,9 +117,9 @@ class TestAffineQuantized(TestCase):
         ["xpu"] if torch.xpu.is_available() else []
     )

-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+
     def test_tensor_core_layout_transpose(self):
-        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
+        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
         t = linear.weight
         shape = t.shape
         apply_int4_weight_only_quant = int4_weight_only(group_size=32)
@@ -182,7 +185,7 @@ def _apply(module, config_or_subclass_inserter):
         ql = _apply(linear, apply_quant)
         ql.to(device)

-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+
     def test_register_new_dispatch(self):
         from torchao.dtypes import AffineQuantizedTensor
         from torchao.dtypes.affine_quantized_tensor_ops import (
@@ -219,10 +222,10 @@ def apply_uint6_weight_only_quant(linear):
             )
             return linear

-        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
+        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
         apply_uint6_weight_only_quant(linear)

-        example_input = torch.randn(1, 128, dtype=torch.bfloat16, device="cuda")
+        example_input = torch.randn(1, 128, dtype=torch.bfloat16, device=_DEVICE)
         with self.assertRaisesRegex(
             AssertionError, "dispatching to my impl for uint6 weight only quant"
         ):
@@ -245,13 +248,13 @@ def test_print_quantized_module(self):
         ql = apply_quant(linear)
         assert "AffineQuantizedTensor" in str(ql)

-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+
     @common_utils.parametrize(
-        "apply_quant", get_quantization_functions(False, True, "cuda", False)
+        "apply_quant", get_quantization_functions(False, True, _DEVICE, False)
     )
     def test_test_copy__apply(self, apply_quant):
-        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
-        linear2 = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
+        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
+        linear2 = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)

         if isinstance(apply_quant, AOBaseConfig):
             quantize_(linear, apply_quant)
@@ -262,20 +265,20 @@ def test_test_copy__apply(self, apply_quant):
             ql = apply_quant(linear)
             ql2 = apply_quant(linear2)

-        example_input = torch.randn(1, 128, dtype=torch.bfloat16, device="cuda")
+        example_input = torch.randn(1, 128, dtype=torch.bfloat16, device=_DEVICE)
         output = ql(example_input)
         ql2.weight.copy_(ql.weight)
         ql2.bias = ql.bias
         output2 = ql2(example_input)
         self.assertEqual(output, output2)

-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+
     @common_utils.parametrize(
-        "apply_quant", get_quantization_functions(False, True, "cuda", False)
+        "apply_quant", get_quantization_functions(False, True, _DEVICE, False)
     )
     def test_copy__mismatch_metadata(self, apply_quant):
-        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
-        linear2 = torch.nn.Linear(128, 512, dtype=torch.bfloat16, device="cuda")
+        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
+        linear2 = torch.nn.Linear(128, 512, dtype=torch.bfloat16, device=_DEVICE)

         if isinstance(apply_quant, AOBaseConfig):
             quantize_(linear, apply_quant)
@@ -349,9 +352,8 @@ def test_alias(self, device, dtype):
         quantize_(dummy, Int8DynamicActivationInt8WeightConfig())
         _ = dummy.weight[...]

-    @common_utils.parametrize("device", ["cuda"])
+    @common_utils.parametrize("device", [_DEVICE])
     @common_utils.parametrize("dtype", [torch.bfloat16])
-    @skip_if_no_cuda()
     @skip_if_rocm("ROCm enablement in progress")
     def test_slice_int4wo(self, device, dtype):
         # in_feature not divisible by 1024
@@ -363,9 +365,7 @@ def test_slice_int4wo(self, device, dtype):
         _ = dummy.weight.narrow(0, 0, 64)
         _ = dummy.weight.narrow(1, 0, 128)

-    @common_utils.parametrize("device", ["cuda"])
     @common_utils.parametrize("dtype", [torch.float16, torch.bfloat16])
-    @skip_if_no_cuda()
     @skip_if_no_gemlite()
     def test_slice_gemlite(self, device, dtype):
         # in_feature not divisible by 1024
@@ -446,7 +446,7 @@ def dequant(input_layer, in_features, orig_shape):
         )
         self.assertEqual((W_slice_ref - W_slice).abs().mean().item(), 0)

-    @common_utils.parametrize("device", ["cuda"])
+    @common_utils.parametrize("device", [_DEVICE])
     @common_utils.parametrize("dtype", [torch.bfloat16])
     def test_matmul(self, device, dtype):
         x = torch.randn(53, 2048)
@@ -463,14 +463,13 @@ def test_matmul(self, device, dtype):
         # make sure it runs
         torch.matmul(x, w.t())

-    @common_utils.parametrize("device", ["cuda"])
+    @common_utils.parametrize("device", [_DEVICE])
     @common_utils.parametrize("dtype", [torch.bfloat16])
-    @skip_if_no_cuda()
     @skip_if_rocm("ROCm enablement in progress")
     def test_slice_and_copy_int4wo(self, device, dtype):
-        l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
+        l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16)
         l.weight = torch.nn.Parameter(
-            torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
+            torch.zeros(1024, 1024, dtype=torch.bfloat16, device=_DEVICE)
         )
         quantize_(l, Int4WeightOnlyConfig())
         param = l.weight
@@ -487,7 +486,7 @@ def test_slice_and_copy_int4wo(self, device, dtype):
         assert param.data.dequantize()[0][0] == 0

         # dummy_l has random input (shouldn't be 0)
-        dummy_l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
+        dummy_l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16)
         quantize_(dummy_l, Int4WeightOnlyConfig())
         quantized = dummy_l.weight
         quantized = quantized.narrow(0, 0, 512)
@@ -497,9 +496,8 @@ def test_slice_and_copy_int4wo(self, device, dtype):
         # making sure param.data is updated
         assert param.data.dequantize()[0][0] != 0

-    @common_utils.parametrize("device", ["cuda"])
+    @common_utils.parametrize("device", [_DEVICE])
     @common_utils.parametrize("dtype", [torch.bfloat16])
-    @skip_if_no_cuda()
     @skip_if_rocm("ROCm enablement in progress")
     def test_mm_int4wo(self, device, dtype):
         weight = torch.randn(512, 1024).to(device).to(dtype)
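The test changes lean on an auto_detect_device helper imported from torchao's utilities; its implementation is not part of this diff. A plausible sketch of its behavior, for readers following along (illustrative only — the real helper in torchao.utils may differ):

# Hypothetical sketch of the auto_detect_device() helper imported above.
import torch

def auto_detect_device() -> str:
    """Return the best available accelerator for the test suite."""
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"

_DEVICE = auto_detect_device()  # e.g. "xpu" on the ao-pvc runners

With a helper like this in place, every test that previously hard-coded device="cuda" runs on whichever accelerator the CI host provides, which is what lets the same suite cover both the CUDA and XPU runners.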