     apply_transform_weight,
     get_transform_size,
 )
-from compressed_tensors.utils import get_execution_device, get_offloaded_device
+from compressed_tensors.utils import get_execution_device
 from compressed_tensors.utils.helpers import ParameterizedDefaultDict
 from torch import Tensor, device, dtype
 from torch.nn import Module, Parameter
@@ -42,7 +42,6 @@ def __init__(self, name: str, scheme: TransformScheme, seed: Optional[int] = Non
         super().__init__(name, scheme, seed)
         self.weights = ParameterizedDefaultDict(self._create_weight)
         self.perms = ParameterizedDefaultDict(self._create_permutation)
-        self._shared_tensors_device = None

     def create_transform(self, module: Module, args: TransformArgs):
         """
@@ -54,11 +53,9 @@ def create_transform(self, module: Module, args: TransformArgs):
         """
         assert hasattr(module, "weight")
         size = get_transform_size(module, args.location, self.scheme.head_dim)
-        dtype = self.scheme.precision
-        device = get_offloaded_device(module)
         exec_device = get_execution_device(module)

-        factory_kwargs = {"device": device, "construct_device": exec_device}
+        factory_kwargs = {"construct_device": exec_device}
         weight = self.weights.get(size, factory_kwargs=factory_kwargs)
         # TODO: permutations should be keyed by fused modules, not weight
         perm = self.perms[weight] if self.scheme.randomize else None
@@ -67,25 +64,12 @@ def create_transform(self, module: Module, args: TransformArgs):
     def _create_weight(
         self,
         size: int,
-        device: device,
         construct_device: device,
     ) -> Parameter:
-        # check that shared tensors device is consistent
-        if self._shared_tensors_device is None:
-            self._shared_tensors_device = device
-
-        if device != self._shared_tensors_device:
-            raise NotImplementedError(
-                "Creating multi-gpu transform weights are not supported as of now due "
-                "to the limitations of shared tensors across GPUs."
-                # in the future, tensors can be shared within GPUs,
-                # and can be all-reduced during updates and compression
-            )
-
-        # construct on execution device, cache shared tensor on offload device
+        # construct on execution device, cache shared tensor on cpu
         precision = self.scheme.precision
         data = deterministic_hadamard_matrix(size, precision, construct_device)
-        data = data.to(device=device)
+        data = data.to(device="cpu")
         return Parameter(data, requires_grad=self.scheme.requires_grad)

     def _create_permutation(self, weight: Parameter) -> Parameter:
@@ -120,9 +104,10 @@ def forward(self, value: Tensor) -> Tensor:
         if self.args.inverse:
             weight = weight.T

+        # onloading is done by accelerate if parent module is offloaded
         return (
             apply_transform_weight(
-                weight.to(self._precision),
+                weight.to(dtype=self._precision, device=value.device),
                 value.to(self._precision),
                 self.args.location,
                 self.module_type,
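For readers skimming the diff, here is a minimal, hypothetical sketch of the pattern this change adopts (it is not the compressed-tensors implementation; `SharedCPUTransform` and the identity matrix stand in for the Hadamard weight built by `deterministic_hadamard_matrix`): the shared tensor is constructed once, cached on cpu, and moved to the activation's dtype and device only at forward time, mirroring the `weight.to(dtype=self._precision, device=value.device)` call above.

```python
import torch
from torch import Tensor
from torch.nn import Module, Parameter


class SharedCPUTransform(Module):
    """Hypothetical stand-in: cache the shared weight on cpu, onload it lazily."""

    def __init__(self, size: int):
        super().__init__()
        # stand-in for the Hadamard construction; built once and cached on cpu
        data = torch.eye(size, dtype=torch.float32)
        self.weight = Parameter(data.to(device="cpu"), requires_grad=False)

    def forward(self, value: Tensor) -> Tensor:
        # move the cached cpu weight to wherever the activation currently lives
        weight = self.weight.to(dtype=value.dtype, device=value.device)
        return value @ weight


x = torch.randn(2, 64)
y = SharedCPUTransform(64)(x)  # on a cuda input, the weight is moved per call
```

Keeping the cached parameter on cpu lets it be shared across modules regardless of which GPU each module executes on, which appears to be why the multi-GPU guard removed from `_create_weight` is no longer needed.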