
Commit 11cfd6c

modularize.
1 parent edf66b7 commit 11cfd6c

File tree: 3 files changed, 49 additions and 23 deletions

  tests/quantization/bnb/test_4bit.py
  tests/quantization/bnb/test_mixed_int8.py
  tests/quantization/test_torch_compile_utils.py

tests/quantization/bnb/test_4bit.py

Lines changed: 17 additions & 11 deletions
@@ -51,7 +51,7 @@
     torch_device,
 )
 
-from ..utils import QuantCompileMiscTests
+from ..test_torch_compile_utils import QuantCompileMiscTests
 
 
 def get_some_linear_layer(model):
@@ -861,18 +861,24 @@ def test_fp4_double_safe(self):
         self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=True)
 
 
+@require_torch_version_greater("2.7.1")
 class Bnb4BitCompileTests(QuantCompileMiscTests):
-    @require_torch_version_greater("2.7.1")
+    quantization_config = PipelineQuantizationConfig(
+        quant_backend="bitsandbytes_4bit",
+        quant_kwargs={
+            "load_in_4bit": True,
+            "bnb_4bit_quant_type": "nf4",
+            "bnb_4bit_compute_dtype": torch.bfloat16,
+        },
+        components_to_quantize=["transformer", "text_encoder_2"],
+    )
+
     def test_torch_compile(self):
         torch._dynamo.config.capture_dynamic_output_shape_ops = True
+        super()._test_torch_compile(quantization_config=self.quantization_config)
 
-        quantization_config = PipelineQuantizationConfig(
-            quant_backend="bitsandbytes_4bit",
-            quant_kwargs={
-                "load_in_4bit": True,
-                "bnb_4bit_quant_type": "nf4",
-                "bnb_4bit_compute_dtype": torch.bfloat16,
-            },
-            components_to_quantize=["transformer", "text_encoder_2"],
+    def test_torch_compile_with_cpu_offload(self):
+        torch._dynamo.config.capture_dynamic_output_shape_ops = True
+        super()._test_torch_compile_with_cpu_offload(
+            quantization_config=self.quantization_config, torch_dtype=torch.float16
         )
-        super().test_torch_compile(quantization_config=quantization_config)
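Assembled from the added lines above, the 4-bit compile suite now reads roughly as follows (a reconstruction for readability, not a verbatim copy of the file; surrounding imports omitted):

@require_torch_version_greater("2.7.1")
class Bnb4BitCompileTests(QuantCompileMiscTests):
    quantization_config = PipelineQuantizationConfig(
        quant_backend="bitsandbytes_4bit",
        quant_kwargs={
            "load_in_4bit": True,
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_compute_dtype": torch.bfloat16,
        },
        components_to_quantize=["transformer", "text_encoder_2"],
    )

    def test_torch_compile(self):
        torch._dynamo.config.capture_dynamic_output_shape_ops = True
        super()._test_torch_compile(quantization_config=self.quantization_config)

    def test_torch_compile_with_cpu_offload(self):
        torch._dynamo.config.capture_dynamic_output_shape_ops = True
        super()._test_torch_compile_with_cpu_offload(
            quantization_config=self.quantization_config, torch_dtype=torch.float16
        )

The version gate and the quantization config now live on the class, so both tests share them and each method body shrinks to a one-line delegation into the base class.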

tests/quantization/bnb/test_mixed_int8.py

Lines changed: 13 additions & 9 deletions
@@ -49,7 +49,7 @@
     torch_device,
 )
 
-from ..utils import QuantCompileMiscTests
+from ..test_torch_compile_utils import QuantCompileMiscTests
 
 
 def get_some_linear_layer(model):
@@ -779,16 +779,20 @@ def test_serialization_sharded(self):
         self.assertTrue(torch.equal(out_0, out_1))
 
 
+@require_torch_version_greater_equal("2.6.0")
 class Bnb8BitCompileTests(QuantCompileMiscTests):
-    @require_torch_version_greater_equal("2.6.0")
+    quantization_config = PipelineQuantizationConfig(
+        quant_backend="bitsandbytes_8bit",
+        quant_kwargs={"load_in_8bit": True},
+        components_to_quantize=["transformer", "text_encoder_2"],
+    )
+
    def test_torch_compile(self):
         torch._dynamo.config.capture_dynamic_output_shape_ops = True
+        super()._test_torch_compile(quantization_config=self.quantization_config, torch_dtype=torch.float16)
 
-        quantization_config = PipelineQuantizationConfig(
-            quant_backend="bitsandbytes_8bit",
-            quant_kwargs={
-                "load_in_8bit": True,
-            },
-            components_to_quantize=["transformer", "text_encoder_2"],
+    def test_torch_compile_with_cpu_offload(self):
+        torch._dynamo.config.capture_dynamic_output_shape_ops = True
+        super()._test_torch_compile_with_cpu_offload(
+            quantization_config=self.quantization_config, torch_dtype=torch.float16
         )
-        super().test_torch_compile(quantization_config=quantization_config, torch_dtype=torch.float16)
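Note that in both files the version gate moved from the test method to the class. With unittest, a skip decorator applied to a class skips every test it defines, which matters now that each class carries two compile tests. A minimal sketch of that behavior (names here are illustrative, not from this diff):

import unittest

# Stand-in for require_torch_version_greater_equal; the real decorator
# checks the installed torch version rather than taking a boolean.
def require_feature(available):
    return unittest.skipUnless(available, "required feature unavailable")

@require_feature(False)
class GatedTests(unittest.TestCase):
    def test_a(self):  # skipped
        pass

    def test_b(self):  # also skipped; a method-level gate would cover only one test
        pass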

tests/quantization/test_torch_compile_utils.py

Lines changed: 19 additions & 3 deletions
@@ -24,6 +24,8 @@
 @require_torch_gpu
 @slow
 class QuantCompileMiscTests(unittest.TestCase):
+    quantization_config = None
+
     def setUp(self):
         super().setUp()
         gc.collect()
@@ -36,14 +38,28 @@ def tearDown(self):
         backend_empty_cache(torch_device)
         torch.compiler.reset()
 
-    def test_torch_compile(self, quantization_config, torch_dtype=torch.bfloat16):
+    def _init_pipeline(self, quantization_config, torch_dtype):
         pipe = DiffusionPipeline.from_pretrained(
             "stabilityai/stable-diffusion-3-medium-diffusers",
             quantization_config=quantization_config,
             torch_dtype=torch_dtype,
-        ).to("cuda")
+        )
+        return pipe
+
+    def _test_torch_compile(self, quantization_config, torch_dtype=torch.bfloat16):
+        pipe = self._init_pipeline(quantization_config, torch_dtype).to("cuda")
+        # important to ensure fullgraph=True
         pipe.transformer.compile(fullgraph=True)
 
         for _ in range(2):
             # small resolutions to ensure speedy execution.
-            pipe("a dog", num_inference_steps=4, max_sequence_length=16, height=256, width=256)
+            pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
+
+    def _test_torch_compile_with_cpu_offload(self, quantization_config, torch_dtype=torch.bfloat16):
+        pipe = self._init_pipeline(quantization_config, torch_dtype)
+        pipe.enable_model_cpu_offload()
+        pipe.transformer.compile()
+
+        for _ in range(2):
+            # small resolutions to ensure speedy execution.
+            pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
