Tests: work around aarch64 CPU regressions in torch 2.6.0; add Windows torch==2.7.0+cu118 test config

matthewdouglas · matthewdouglas · commit 9c49d098c423 · 2025-05-23T22:05:00.000-04:00
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -129,6 +129,10 @@ jobs:
         with:
           python-version: 3.9
 
+      - name: Setup MSVC
+        if: startsWith(matrix.os, 'windows')
+        uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl for torch.compile
+
       - name: Install dependencies
         run: |
           pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
@@ -188,6 +192,15 @@ jobs:
             torch_version: "2.7.0"
             pypi_index: "https://download.pytorch.org/whl/cu128"
 
+          # Add torch 2.7+cu118 for Windows.
+          - os: windows-2025
+            arch: x86_64
+            gpu: T4
+            runner: CUDA-Windows-x64
+            cuda_version: "11.8.0"
+            torch_version: "2.7.0"
+            pypi_index: "https://download.pytorch.org/whl/cu118"
+
           # L40S runners
           - os: ubuntu-22.04
             gpu: L40S
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
@@ -1,6 +1,7 @@
 import copy
 import os
 import pickle
+import platform
 from tempfile import TemporaryDirectory
 
 import pytest
@@ -299,6 +300,16 @@ def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_st
     if fullgraph and torch.__version__ < (2, 8):
         pytest.skip("fullgraph mode requires torch 2.8 or higher")
 
+    # torch 2.6.x has a regression on Linux aarch64 CPU when fullgraph=False; mark it as an expected failure.
+    if (
+        not fullgraph
+        and device == "cpu"
+        and platform.machine() == "aarch64"
+        and platform.system() == "Linux"
+        and ((2, 7) > torch.__version__ >= (2, 6))
+    ):
+        pytest.xfail("Regression in torch==2.6.0 on Linux aarch64 CPU")
+
     dim = 256
     batch_size = 16
 
diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
@@ -2,6 +2,7 @@
 import copy
 import os
 import pickle
+import platform
 from tempfile import TemporaryDirectory
 
 import pytest
@@ -238,7 +239,6 @@ def test_linear8bitlt_torch_compile(device, threshold, bias, fullgraph, mode):
 
     torch.compiler.reset()
 
-    torch._dynamo.config.patch()
     # Create a small network with Linear8bitLt layers
     net = torch.nn.Sequential(
         *[bnb.nn.Linear8bitLt(dim, dim, bias=bias, has_fp16_weights=False, threshold=threshold) for _ in range(4)]
@@ -267,7 +267,15 @@ def test_linear8bitlt_torch_compile(device, threshold, bias, fullgraph, mode):
         torch.testing.assert_close(compiled_output, ref_output)
 
         # Test with gradients. Currently only works with threshold=0.
-        if threshold == 0:
+        # The gradient check is skipped on Linux aarch64 CPU with torch 2.6.x due to a regression there.
+        is_broken_platform = (
+            device == "cpu"
+            and platform.machine() == "aarch64"
+            and platform.system() == "Linux"
+            and ((2, 7) > torch.__version__ >= (2, 6))
+        )
+
+        if threshold == 0 and not is_broken_platform:
             x.requires_grad_(True)
             y1 = net(x).sum()
             y1.backward()