
Commit 0e4f152

add group offloading+compile

1 parent 11cfd6c

File tree

3 files changed: +28 -3 lines changed

- tests/quantization/bnb/test_4bit.py
- tests/quantization/bnb/test_mixed_int8.py
- tests/quantization/test_torch_compile_utils.py

tests/quantization/bnb/test_4bit.py

Lines changed: 5 additions & 3 deletions

@@ -879,6 +879,8 @@ def test_torch_compile(self):
 
     def test_torch_compile_with_cpu_offload(self):
         torch._dynamo.config.capture_dynamic_output_shape_ops = True
-        super()._test_torch_compile_with_cpu_offload(
-            quantization_config=self.quantization_config, torch_dtype=torch.float16
-        )
+        super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
+
+    def test_torch_compile_with_group_offload(self):
+        torch._dynamo.config.capture_dynamic_output_shape_ops = True
+        super()._test_torch_compile_with_group_offload(quantization_config=self.quantization_config)
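Note that the 4-bit cpu-offload call also drops the explicit torch_dtype=torch.float16 argument, so the helper's default dtype applies (the new group-offload helper below defaults to torch.bfloat16). self.quantization_config itself is defined in the suite's setup rather than in this commit; a hypothetical 4-bit bitsandbytes config of the kind this suite forwards might look like this sketch:

    import torch
    from diffusers import BitsAndBytesConfig

    # Hypothetical stand-in for self.quantization_config in the 4-bit suite;
    # the real values live in the test class setup, not in this commit.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )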

tests/quantization/bnb/test_mixed_int8.py

Lines changed: 6 additions & 0 deletions

@@ -796,3 +796,9 @@ def test_torch_compile_with_cpu_offload(self):
         super()._test_torch_compile_with_cpu_offload(
             quantization_config=self.quantization_config, torch_dtype=torch.float16
         )
+
+    def test_torch_compile_with_group_offload(self):
+        torch._dynamo.config.capture_dynamic_output_shape_ops = True
+        super()._test_torch_compile_with_group_offload(
+            quantization_config=self.quantization_config, torch_dtype=torch.float16
+        )
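Unlike the 4-bit suite, the int8 tests keep the explicit torch_dtype=torch.float16 override of the helper's default. For reference, a hypothetical 8-bit config of the kind this suite forwards (the real one lives in the test setup):

    from diffusers import BitsAndBytesConfig

    # Hypothetical stand-in for self.quantization_config in the mixed-int8 suite.
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)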

tests/quantization/test_torch_compile_utils.py

Lines changed: 17 additions & 0 deletions

@@ -63,3 +63,20 @@ def _test_torch_compile_with_cpu_offload(self, quantization_config, torch_dtype=
         for _ in range(2):
             # small resolutions to ensure speedy execution.
             pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
+
+    def _test_torch_compile_with_group_offload(self, quantization_config, torch_dtype=torch.bfloat16):
+        pipe = self._init_pipeline(quantization_config, torch_dtype)
+        group_offload_kwargs = {
+            "onload_device": "cuda",
+            "offload_device": "cpu",
+            "offload_type": "block_level",
+            "num_blocks_per_group": 1,
+            "use_stream": True,
+            "non_blocking": True,
+        }
+        pipe.transformer.enable_group_offload(**group_offload_kwargs)
+        pipe.transformer.compile()
+
+        for _ in range(2):
+            # small resolutions to ensure speedy execution.
+            pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
