Skip to content

Commit 68f1a7d

Browse files
[Refactor] Rename offload_model to set_onload_device (#643)
* [Refactor] Rename offload_model to set_onload_device - Add set_onload_device as the canonical function (replaces offload_model) - Deprecate offload_model using @deprecated decorator pointing to set_onload_device - Remove offload_device param from set_onload_device (was already ignored with warning) - Update all internal usages and tests Part of vllm-project/llm-compressor#2483 * [Docs] Update README references from offload_model to set_onload_device * Run make commands to apply formatting --------- Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
1 parent 516354e commit 68f1a7d

File tree

12 files changed

+43
-34
lines changed

12 files changed

+43
-34
lines changed

src/compressed_tensors/offload/README.md

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ Offloads tensors to CPU RAM. Onloading is a standard `.to(device)` call from CPU
128128

129129
#### `DeviceCache``cache/device.py`
130130

131-
Offloads tensors to a CUDA device. Onloading is typically a no-op (the tensor is already on device), but handles the case where `onload_device` is changed after initialization (e.g., during `offload_model` reconfiguration).
131+
Offloads tensors to a CUDA device. Onloading is typically a no-op (the tensor is already on device), but handles the case where `onload_device` is changed after initialization (e.g., during `set_onload_device` reconfiguration).
132132

133133
- **offload**: moves tensor to the device (`self.offload_device = self.onload_device` at init).
134134
- **onload**: `send_tensors(offloaded, device=self.onload_device)`.
@@ -214,7 +214,7 @@ The primary function for attaching offloading to a single `torch.nn.Module`. It:
214214
offload_module(layer, onload_device="cuda:0", offload_device="cpu")
215215
```
216216

217-
**When to use:** when you want fine-grained control over which specific modules are offloaded. For model-wide dispatch, prefer `dispatch_model` or `offload_model`.
217+
**When to use:** when you want fine-grained control over which specific modules are offloaded. For model-wide dispatch, prefer `dispatch_model` or `set_onload_device`.
218218

219219
> **Note:** Raises `ValueError` if the module is already offloaded. Call `remove_module_offload` first.
220220
@@ -273,17 +273,19 @@ model = dispatch_model(model, device_memory={torch.device("cuda:0"): 16e9})
273273

274274
---
275275

276-
#### `offload_model(model, onload_device, offload_device=None)`
276+
#### `set_onload_device(model, onload_device)`
277277

278278
A lighter-weight dispatch that moves all modules in a model to the same `onload_device`, without changing where weights are stored. For modules not yet offloaded, it offloads them to their current device.
279279

280280
```python
281281
# Move all execution to cuda:0, keeping offloads unchanged
282-
model = offload_model(model, onload_device="cuda:0")
282+
model = set_onload_device(model, onload_device="cuda:0")
283283
```
284284

285285
**When to use:** when you have already loaded a model with weights in the right place (e.g., via `load_offloaded_model`) and just need to set the execution device. Less powerful than `dispatch_model` but simpler.
286286

287+
> **Note:** `offload_model` is a deprecated alias for this function.
288+
287289
---
288290

289291
#### `dispatch_with_map(model, device_map, offload_dir=None)`
@@ -684,7 +686,7 @@ compressed_tensors.offload
684686
├── load.py load_offloaded_model()
685687
│ └── calls from_accelerate after loading
686688
687-
├── dispatch.py dispatch_model(), offload_model(), dispatch_with_map()
689+
├── dispatch.py dispatch_model(), set_onload_device(), dispatch_with_map()
688690
│ └── calls offload_module() for each module
689691
690692
├── module.py offload_module(), remove_module_offload()

src/compressed_tensors/offload/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
get_device_map,
1515
offload_model,
1616
remove_dispatch,
17+
set_onload_device,
1718
)
1819
from compressed_tensors.offload.dist_utils import (
1920
as_broadcastable,
@@ -29,7 +30,8 @@
2930

3031
__all__ = [
3132
# dispatch models
32-
"offload_model",
33+
"set_onload_device",
34+
"offload_model", # deprecated, use set_onload_device
3335
"dispatch_model",
3436
"remove_dispatch",
3537
"dispatch_with_map",

src/compressed_tensors/offload/cache/dist_cpu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def offload(self, tensor: torch.Tensor | None) -> torch.Tensor | None:
2828
if dist.get_rank() == 0:
2929
# create shared memory cpu tensor
3030
tensor = super().offload(tensor).share_memory_()
31-
(handle, filename, nbytes) = tensor.untyped_storage()._share_filename_cpu_()
31+
handle, filename, nbytes = tensor.untyped_storage()._share_filename_cpu_()
3232
broadcast_obj = [handle, filename, nbytes]
3333
else:
3434
broadcast_obj = [None, None, None]

src/compressed_tensors/offload/dispatch.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@
1717
)
1818
from compressed_tensors.utils import getattr_chain
1919
from compressed_tensors.utils.binary_search import SearchFailureError, max_binary_search
20+
from compressed_tensors.utils.helpers import deprecated
2021
from loguru import logger
2122
from transformers import PreTrainedModel
2223

2324

2425
__all__ = [
26+
"set_onload_device",
2527
"offload_model",
2628
"dispatch_with_map",
2729
"get_device_map",
@@ -35,28 +37,19 @@
3537
DeviceMap = dict[str, tuple[torch.device | None, torch.device | str | None]]
3638

3739

38-
def offload_model(
40+
def set_onload_device(
3941
model: ModelType,
4042
onload_device: torch.device | str,
41-
offload_device: Any = None,
4243
) -> ModelType:
4344
"""
4445
Modify the dispatch of a model to onload to the provided `onload_device`. Existing
45-
offloaded tensors will not be modified. If a module is not offloaded, it will be
46-
offloaded to the provided `offload_device`.
46+
offloaded tensors will not be modified. If a module is not already offloaded, it
47+
will be offloaded to its current device.
4748
4849
:param model: model to dispatch
4950
:param onload_device: device to move weights to during forward pass
50-
:param offload_device: device to offload weights to, if not already offloaded
5151
:return: dispatched model
5252
"""
53-
if offload_device is not None:
54-
logger.warning(
55-
"`offload_model` now keeps the same offload device that model was loaded "
56-
"on. Please specify offload by loading the model on its offload device(s)"
57-
)
58-
59-
# offload modules in place
6053
for module in model.modules():
6154
if isinstance(module._parameters, OffloadCache):
6255
module._parameters.onload_device = onload_device
@@ -68,6 +61,19 @@ def offload_model(
6861
return model
6962

7063

64+
@deprecated("set_onload_device")
65+
def offload_model(
66+
model: ModelType,
67+
onload_device: torch.device | str,
68+
offload_device: Any = None,
69+
) -> ModelType:
70+
"""
71+
.. deprecated::
72+
Use :func:`set_onload_device` instead.
73+
"""
74+
return set_onload_device(model, onload_device)
75+
76+
7177
def dispatch_with_map(
7278
model: torch.nn.Module,
7379
device_map: DeviceMap,

src/compressed_tensors/transform/utils/hadamard.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def _matmul_hadU(X: torch.Tensor) -> torch.Tensor:
137137
output[:, :, 0, :] = input[:, :, 0, :] + input[:, :, 1, :]
138138
output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :]
139139
output = output.view(input.shape[0], input.shape[1], -1)
140-
(input, output) = (output, input)
140+
input, output = (output, input)
141141
assert input.shape[1] == K
142142
del output
143143

src/compressed_tensors/utils/offload.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@
2121
disable_offloading,
2222
get_execution_device,
2323
get_offloaded_device,
24-
offload_model,
2524
register_offload_module,
2625
remove_dispatch,
26+
set_onload_device,
2727
update_offload_parameter,
2828
)
2929
from compressed_tensors.utils.helpers import deprecated
@@ -134,7 +134,7 @@ def delete_offload_module(base: torch.nn.Module, name: str):
134134
delattr(base, name)
135135

136136

137-
@deprecated("compressed_tensors.offload::offload_model")
137+
@deprecated("compressed_tensors.offload::set_onload_device")
138138
def offloaded_dispatch(
139139
module: torch.nn.Module,
140140
execution_device: torch.device,
@@ -152,7 +152,7 @@ def offloaded_dispatch(
152152
raise ValueError(
153153
"Passing offload_device to offloaded_dispatch is no longer supported"
154154
)
155-
offload_model(module, execution_device)
155+
set_onload_device(module, execution_device)
156156

157157

158158
@deprecated("compressed_tensors.offload::align_module_device")

src/compressed_tensors/utils/semi_structured_conversions.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
import torch
1212

13-
1413
__all__ = [
1514
"sparse_semi_structured_from_dense_cutlass",
1615
"sparse_semi_structured_to_dense_cutlass",

tests/test_offload/test_dispatch.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from compressed_tensors.offload.dispatch import (
1010
dispatch_model,
1111
get_device_memory,
12-
offload_model,
12+
set_onload_device,
1313
)
1414
from compressed_tensors.offload.utils import module_size
1515
from tests.testing_utils import requires_gpu
@@ -190,7 +190,7 @@ def test_offload_and_dispatch_model(model_id):
190190

191191
# offload entire model
192192
model.to("cpu")
193-
model = offload_model(model, "cuda:0")
193+
model = set_onload_device(model, "cuda:0")
194194
offloaded_logits = model(**sample).logits
195195
for module in model.modules():
196196
assert_module_offloaded(module, "cuda:0", torch.device("cpu"))

tests/test_quantization/lifecycle/test_initialize.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import pytest
77
import torch
8-
from compressed_tensors.offload import offload_model
8+
from compressed_tensors.offload import set_onload_device
99
from compressed_tensors.quantization import (
1010
FP8_E4M3_DATA,
1111
ActivationOrdering,
@@ -108,7 +108,7 @@ def test_initialize_module_for_quantization(
108108
def test_initialize_module_for_quantization_offloaded(
109109
create_quantization_scheme, weights, input_activations, layer
110110
):
111-
offload_model(layer, "cuda:0")
111+
set_onload_device(layer, "cuda:0")
112112

113113
test_initialize_module_for_quantization(
114114
create_quantization_scheme,

tests/test_transform/factory/test_correctness.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import pytest
55
import torch
6-
from compressed_tensors.offload import offload_model
6+
from compressed_tensors.offload import set_onload_device
77
from compressed_tensors.transform import (
88
TransformArgs,
99
TransformConfig,
@@ -87,7 +87,7 @@ def test_correctness_model(type, randomize, input_batch_size, model_apply, offlo
8787
# load model
8888
model = model_apply[0]
8989
if offload:
90-
offload_model(model, torch.device("cuda"))
90+
set_onload_device(model, torch.device("cuda"))
9191

9292
# get output
9393
input = torch.rand((input_batch_size, 5, model.fcs[0].in_features))

0 commit comments

Comments (0)