
Commit 9cd0824

Make all tensors on same device for svdquant with cpu-offloading (NVIDIA#550)
## What does this PR do?

**Type of change:** Bug Fix

**Overview:** While running SVDQuant with CPU offloading enabled via the diffusers PTQ example (sd3.5-medium model), "not all tensors on same device" errors were observed at the following steps:

1. AWQ scale computation - `get_scale()` using `x_max` and `w_max`
2. Loss update for each alpha - `update_loss()`
3. `_apply_weight_pre_quant_scale()` - while multiplying by the pre-quant scale
4. `apply_pre_quant_scale_and_smooth()` - while multiplying by the pre-quant scale

The same errors should also occur with the Flux model when SVDQuant and CPU offloading are enabled. This change updates the places above to ensure the tensors involved are on the same device, using `.to(device)`. A minimal sketch of the failure mode is shown below.

## Testing

- Ran SVDQuant with CPU offloading enabled on sd3.5-medium, on an RTX 5090 under Windows 11 22621. With this change, the final ONNX model (transformer) was produced without any error.

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes/No
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No

---------

Signed-off-by: vipandya <[email protected]>
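As context for the fix, here is a minimal standalone sketch of the failure mode and the `.to(device)` pattern this commit applies. It is not part of the commit; the tensor names, sizes, and the 0.5 exponent are illustrative, and it assumes a CUDA-capable machine:

```python
import torch

# With CPU offloading, weight statistics can be left on the CPU while
# activation statistics are computed on the GPU.
x_max = torch.rand(64, device="cuda")  # activation amax, on the GPU
w_max = torch.rand(64)                 # weight amax, still on the CPU

try:
    # Mixed-device elementwise op: raises
    # "Expected all tensors to be on the same device ..."
    scales = x_max.pow(0.5) / w_max.pow(0.5)
except RuntimeError as e:
    print(e)

# The fix pattern used throughout this commit: align devices first.
scales = x_max.pow(0.5) / w_max.to(x_max.device).pow(0.5)
```

The same pattern covers the pre-quant-scale multiplications and the loss accumulation in the diff below.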
1 parent e3e399a commit 9cd0824

1 file changed: +11 -4 lines changed


modelopt/torch/quantization/model_calib.py

Lines changed: 11 additions & 4 deletions
```diff
@@ -251,7 +251,9 @@ def disable_pre_quant_scale_and_resmooth(linear: nn.Module, delete_pre_quant_sca
 def _apply_weight_pre_quant_scale(linear, pre_quant_scale):
     if _ENABLE_FOLDING_PQS_TO_WEIGHTS:
         linear.weight.data.copy_(
-            (linear.weight * pre_quant_scale.squeeze()[None, :]).to(linear.weight.dtype)
+            (linear.weight * pre_quant_scale.to(linear.weight.device).squeeze()[None, :]).to(
+                linear.weight.dtype
+            )
         )
     else:
         linear.weight_quantizer._enable_pre_quant_scale = True
@@ -300,7 +302,9 @@ def apply_pre_quant_scale_and_smooth(
         _amax_for_smoothing = linear.input_quantizer._amax_for_smoothing.to(
             device=device, dtype=dtype
         )
-        linear.input_quantizer.amax = (_amax_for_smoothing * pre_quant_scale).amax().to(dtype)
+        linear.input_quantizer.amax = (
+            (_amax_for_smoothing * pre_quant_scale.to(device)).amax().to(dtype)
+        )
 
     if is_quantized_column_parallel_linear(linear) or is_quantized_row_parallel_linear(linear):
         linear.input_quantizer.sync_amax_across_distributed_group(
@@ -507,7 +511,10 @@ def get_act_scale(x):
 
     def get_scale(x_max, w_max, alpha, tensor_parallel_group=None):
         scales = (
-            (x_max.pow(alpha) / (w_max.pow(1 - alpha) + torch.finfo(torch.float32).tiny))
+            (
+                x_max.pow(alpha)
+                / (w_max.to(x_max.device).pow(1 - alpha) + torch.finfo(torch.float32).tiny)
+            )
             .clamp(min=1e-4, max=1e4)
             .view(-1)
         )
@@ -521,7 +528,7 @@ def update_loss(self, out, out_actual, alpha):
         out_actual = out_actual[0] if isinstance(out_actual, tuple) else out_actual
         out = out[0] if isinstance(out, tuple) else out
         loss = (out - out_actual).float().pow(2).mean()
-        self.awq_lite.loss[alpha] += loss
+        self.awq_lite.loss[alpha] += loss.to(self.awq_lite.loss[alpha].device)
 
     def update_best_params(self):
         if not self.awq_lite.is_enabled:
```
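For reference, here is a simplified standalone paraphrase of the patched `get_scale` helper. This is an illustrative sketch only: the real function also takes a `tensor_parallel_group` argument for syncing scales across ranks, and the name `awq_scale` is hypothetical. It shows where the device alignment lands in the AWQ-lite scale formula s = x_max^alpha / w_max^(1 - alpha):

```python
import torch

def awq_scale(x_max: torch.Tensor, w_max: torch.Tensor, alpha: float) -> torch.Tensor:
    # Per-channel AWQ scales: s = x_max^alpha / w_max^(1 - alpha).
    # `tiny` guards against division by zero; the clamp keeps scales in a
    # numerically sane range. `w_max.to(x_max.device)` is the CPU-offloading
    # fix: weight statistics may still sit on the CPU while activation
    # statistics were computed on the GPU.
    return (
        x_max.pow(alpha)
        / (w_max.to(x_max.device).pow(1 - alpha) + torch.finfo(torch.float32).tiny)
    ).clamp(min=1e-4, max=1e4).view(-1)
```

During calibration, AWQ-lite sweeps `alpha` over a grid of candidates, accumulates the MSE between quantized and original outputs per alpha (`update_loss`, where the fix moves the scalar loss onto the accumulator's device), and keeps the alpha with the lowest loss.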
