
Commit 49b0f5d

update
1 parent e59f957 commit 49b0f5d


3 files changed: +17 additions, -17 deletions


tests/quantization/bnb/test_4bit.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -877,7 +877,7 @@ class Bnb4BitCompileTests(QuantCompileTests, unittest.TestCase):
     @property
     def quantization_config(self):
         return PipelineQuantizationConfig(
-            quant_backend="bitsandbytes_8bit",
+            quant_backend="bitsandbytes_4bit",
             quant_kwargs={
                 "load_in_4bit": True,
                 "bnb_4bit_quant_type": "nf4",
```

tests/quantization/test_torch_compile_utils.py

Lines changed: 12 additions & 12 deletions
```diff
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import gc
+import inspect

 import torch

@@ -54,19 +55,16 @@ def _test_torch_compile(self, torch_dtype=torch.bfloat16):
         # `fullgraph=True` ensures no graph breaks
         pipe.transformer.compile(fullgraph=True)

-        with torch._dynamo.config.patch(error_on_recompile=True):
-            for _ in range(2):
-                # small resolutions to ensure speedy execution.
-                pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)
+        # small resolutions to ensure speedy execution.
+        pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)

     def _test_torch_compile_with_cpu_offload(self, torch_dtype=torch.bfloat16):
         pipe = self._init_pipeline(self.quantization_config, torch_dtype)
         pipe.enable_model_cpu_offload()
         pipe.transformer.compile()

-        for _ in range(2):
-            # small resolutions to ensure speedy execution.
-            pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)
+        # small resolutions to ensure speedy execution.
+        pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)

     def _test_torch_compile_with_group_offload_leaf(self, torch_dtype=torch.bfloat16, *, use_stream: bool = False):
         torch._dynamo.config.cache_size_limit = 1000
```
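The removed block was a recompilation guard: `torch._dynamo.config.patch(error_on_recompile=True)` makes Dynamo raise if a later call triggers recompilation, and the two-iteration loop asserted that the second run hit the compiled graph unchanged. A standalone sketch of that pattern (the compiled function here is a stand-in, not from this repo):

```python
import torch

# Toy compiled function standing in for the pipeline's transformer.
@torch.compile(fullgraph=True)
def double(x: torch.Tensor) -> torch.Tensor:
    return x * 2

# error_on_recompile=True turns any recompilation into a hard failure,
# so calling twice with identical inputs asserts compile-cache stability.
with torch._dynamo.config.patch(error_on_recompile=True):
    for _ in range(2):
        double(torch.ones(4))  # same shape/dtype both times -> no recompile
```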
```diff
@@ -85,15 +83,17 @@ def _test_torch_compile_with_group_offload_leaf(self, torch_dtype=torch.bfloat16
             if torch.device(component.device).type == "cpu":
                 component.to("cuda")

-        for _ in range(2):
-            # small resolutions to ensure speedy execution.
-            pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)
+        # small resolutions to ensure speedy execution.
+        pipe("a dog", num_inference_steps=2, max_sequence_length=16, height=256, width=256)

     def test_torch_compile(self):
         self._test_torch_compile()

     def test_torch_compile_with_cpu_offload(self):
         self._test_torch_compile_with_cpu_offload()

-    def test_torch_compile_with_group_offload_leaf(self):
-        self._test_torch_compile_with_group_offload_leaf()
+    def test_torch_compile_with_group_offload_leaf(self, use_stream=False):
+        for cls in inspect.getmro(self.__class__):
+            if "test_torch_compile_with_group_offload_leaf" in cls.__dict__ and cls is not QuantCompileTests:
+                return
+        self._test_torch_compile_with_group_offload_leaf(use_stream=use_stream)
```
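The new base test walks the MRO and returns early when a subclass defines its own `test_torch_compile_with_group_offload_leaf` (for example, the parameterized one in the torchao tests below), so the shared body does not run both from the base class and from the override. A self-contained sketch of the same guard (class and method names here are illustrative):

```python
import inspect

class BaseSuite:
    def test_feature(self):
        # Return early when any subclass redefines test_feature, so the
        # shared body does not run twice (once here, once in the override).
        for cls in inspect.getmro(self.__class__):
            if "test_feature" in cls.__dict__ and cls is not BaseSuite:
                return "base body skipped: subclass override detected"
        return "base body ran"

class CustomSuite(BaseSuite):
    def test_feature(self):
        # Subclass override; invoking the base body from here shows the
        # guard seeing the override on self.__class__ and bailing out.
        return BaseSuite.test_feature(self)

print(BaseSuite().test_feature())    # base body ran
print(CustomSuite().test_feature())  # base body skipped: subclass override detected
```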

tests/quantization/torchao/test_torchao.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -645,8 +645,9 @@ def quantization_config(self):
     )
     def test_torch_compile_with_cpu_offload(self):
         # RuntimeError: _apply(): Couldn't swap Linear.weight
-        super()._test_torch_compile_with_cpu_offload()
+        super().test_torch_compile_with_cpu_offload()

+    @parameterized.expand([False, True])
     @unittest.skip(
         """
         For `use_stream=False`:
@@ -656,8 +657,7 @@ def test_torch_compile_with_cpu_offload(self):
         Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO.
         """
     )
-    @parameterized.expand([False, True])
-    def test_torch_compile_with_group_offload_leaf(self):
+    def test_torch_compile_with_group_offload_leaf(self, use_stream):
         # For use_stream=False:
         # If we run group offloading without compilation, we will see:
         # RuntimeError: Attempted to set the storage of a tensor on device "cpu" to a storage on different device "cuda:0". This is no longer allowed; the devices must match.
@@ -670,7 +670,7 @@ def test_torch_compile_with_group_offload_leaf(self):

         # For use_stream=True:
         # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=<OpOverload(op='aten.is_pinned', overload='default')>, types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), arg_types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), kwarg_types={}
-        super()._test_torch_compile_with_group_offload_leaf()
+        super()._test_torch_compile_with_group_offload_leaf(use_stream=use_stream)


 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
```
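Moving `@parameterized.expand` above `@unittest.skip` matters because decorators apply bottom-up: the decorator written closest to the function is applied first, and the one written at the top wraps the combined result, so `expand` now stamps out its parameterized variants from the already-skip-marked function. A minimal, dependency-free sketch of that ordering rule (decorator names are illustrative):

```python
# Decorators apply bottom-up: the decorator written closest to the function
# runs first, and the one written at the top wraps the combined result.
def tag(label):
    def deco(fn):
        def wrapper(*args, **kwargs):
            return f"{label}({fn(*args, **kwargs)})"
        return wrapper
    return deco

@tag("outer")  # applied second: wraps the already-"inner"-tagged function
@tag("inner")  # applied first: wraps f directly
def f():
    return "f"

assert f() == "outer(inner(f))"
```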
