From e41f093b2b493e7bf6a2686182d85ac2437a64df Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Thu, 15 May 2025 10:51:01 +0530
Subject: [PATCH 1/2] add tests for combining layerwise upcasting and group offloading.

---
 tests/models/test_modeling_common.py | 39 ++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 58edeb55c4b1..8e6fda5bfe8a 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1580,6 +1580,45 @@ def run_forward(model):
         self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading3, atol=1e-5))
         self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading4, atol=1e-5))
 
+    @parameterized.expand([(False, torch.float16, torch.float32), (True, torch.float16, torch.float32)])
+    @require_torch_accelerator
+    @torch.no_grad()
+    def test_group_offloading_with_layerwise_casting(self, record_stream, storage_dtype, compute_dtype):
+        torch.manual_seed(0)
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+
+        if not getattr(model, "_supports_group_offloading", True):
+            return
+
+        model.to(torch_device)
+        model.eval()
+        _ = model(**inputs_dict)[0]
+
+        torch.manual_seed(0)
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
+        model = self.model_class(**init_dict)
+        model.eval()
+        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1)
+        model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
+        _ = model(**inputs_dict)[0]
+
+        torch.manual_seed(0)
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
+        model = self.model_class(**init_dict)
+        model.eval()
+        model.enable_group_offload(
+            torch_device,
+            offload_type="block_level",
+            num_blocks_per_group=1,
+            use_stream=True,
+            non_blocking=True,
+            record_stream=record_stream,
+        )
+        _ = model(**inputs_dict)[0]
+
     def test_auto_model(self, expected_max_diff=5e-5):
         if self.forward_requires_fresh_args:
             model = self.model_class(**self.init_dict)

From aacf6259ae3abb7b5e3a3194792fc00c7a2c27bf Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Thu, 15 May 2025 16:38:42 +0530
Subject: [PATCH 2/2] feedback

---
 tests/models/test_modeling_common.py | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 8e6fda5bfe8a..0b17d7977a41 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1580,10 +1580,10 @@ def run_forward(model):
         self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading3, atol=1e-5))
         self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading4, atol=1e-5))
 
-    @parameterized.expand([(False, torch.float16, torch.float32), (True, torch.float16, torch.float32)])
+    @parameterized.expand([(False, "block_level"), (True, "leaf_level")])
     @require_torch_accelerator
     @torch.no_grad()
-    def test_group_offloading_with_layerwise_casting(self, record_stream, storage_dtype, compute_dtype):
+    def test_group_offloading_with_layerwise_casting(self, record_stream, offload_type):
         torch.manual_seed(0)
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**init_dict)
@@ -1597,26 +1597,15 @@ def test_group_offloading_with_layerwise_casting(self, record_stream, storage_dt
 
         torch.manual_seed(0)
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        storage_dtype, compute_dtype = torch.float16, torch.float32
         inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
         model = self.model_class(**init_dict)
         model.eval()
-        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1)
-        model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
-        _ = model(**inputs_dict)[0]
-
-        torch.manual_seed(0)
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
-        model = self.model_class(**init_dict)
-        model.eval()
+        additional_kwargs = {} if offload_type == "leaf_level" else {"num_blocks_per_group": 1}
         model.enable_group_offload(
-            torch_device,
-            offload_type="block_level",
-            num_blocks_per_group=1,
-            use_stream=True,
-            non_blocking=True,
-            record_stream=record_stream,
+            torch_device, offload_type=offload_type, use_stream=True, record_stream=record_stream, **additional_kwargs
         )
+        model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
         _ = model(**inputs_dict)[0]
 
     def test_auto_model(self, expected_max_diff=5e-5):
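For reference, a minimal end-user sketch of the feature combination these tests exercise, outside the test harness. It assumes a diffusers model that supports both group offloading and layerwise casting; the AutoencoderKL checkpoint, the CUDA device, and the dummy input below are illustrative choices, not taken from the patch.

import torch
from diffusers import AutoencoderKL

# Illustrative model; any diffusers model supporting group offloading works similarly.
model = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")
model.eval()

# Keep weight groups offloaded and onload one block group at a time onto the accelerator,
# overlapping transfers with compute via a stream (mirrors the streamed branch of the test).
model.enable_group_offload(
    torch.device("cuda"),
    offload_type="block_level",
    num_blocks_per_group=1,
    use_stream=True,
    record_stream=False,
)

# Store parameters in fp16 and upcast them to fp32 at compute time (mirrors the test's dtypes).
model.enable_layerwise_casting(storage_dtype=torch.float16, compute_dtype=torch.float32)

with torch.no_grad():
    # Illustrative VAE-shaped input, placed on the onload device.
    dummy = torch.randn(1, 3, 64, 64, device="cuda")
    _ = model(dummy).sample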