Commit 1524251

fix(svdquant): run SVD on GPU to maintain utilization (NVIDIA#633)
## What does this PR do?

**Type of change:** Bug fix

**Overview:** Fix SVD quantization running on CPU instead of GPU, which caused jobs to be killed by the internal job scheduler due to low GPU utilization during long-running SVD operations.

## Changes

- Keep the tensor on GPU during SVD computation by explicitly specifying the device
- Add `full_matrices=False` for faster computation (only the first `lowrank` singular vectors are needed)

## Motivation

The original implementation ran SVD on CPU, causing:

1. Jobs killed by the internal scheduler due to low GPU utilization during the SVD phase
2. Potential performance degradation from CPU-GPU data transfer overhead

## Usage

No API changes. Existing svdquant usage remains the same.

## Testing

- Tested locally with Wan2.2 SVD quantization
- Verified the job is no longer killed due to low GPU utilization

## Before your PR is "*Ready for review*"

- **Is this change backward compatible?**: Yes
- **Did you write any new necessary tests?**: No
- **Did you add or update any necessary documentation?**: No
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: No

Signed-off-by: Taekyung Heo <[email protected]>
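The `full_matrices=False` change pays off because a full SVD of an m×n weight materializes square (m, m) and (n, n) factors even though only the first `lowrank` singular vectors are ever used. A minimal sketch of the shape difference, using NumPy as a stand-in for `torch.linalg.svd` (which exposes the same `full_matrices` flag); none of this is the actual modelopt code:

```python
import numpy as np

m, n = 128, 32
weight = np.random.default_rng(0).standard_normal((m, n))

# Full SVD: square factors, mostly wasted work for a low-rank use case.
u_full, s_full, vt_full = np.linalg.svd(weight, full_matrices=True)

# Reduced ("thin") SVD: only min(m, n) columns/rows are computed.
u_thin, s_thin, vt_thin = np.linalg.svd(weight, full_matrices=False)

print(u_full.shape, vt_full.shape)  # (128, 128) (32, 32)
print(u_thin.shape, vt_thin.shape)  # (128, 32) (32, 32)

# The singular values (and leading singular vectors) agree between the two forms.
assert np.allclose(s_full, s_thin)
```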
1 parent 11728b7 commit 1524251

File tree

1 file changed (+11 −4 lines)


modelopt/torch/quantization/model_calib.py

Lines changed: 11 additions & 4 deletions
```diff
@@ -1029,7 +1029,11 @@ def svdquant(

     def postprocess(module, name):
         print_rank_0(f"SVD {name}")
-        u, s, vt = torch.linalg.svd(module.weight.data.double())
+        weight = module.weight.data
+        original_device = weight.device
+        original_dtype = weight.dtype
+        weight_f64 = weight.to(dtype=torch.float64, device=original_device)
+        u, s, vt = torch.linalg.svd(weight_f64, full_matrices=False)
         if u.shape[1] < lowrank or vt.shape[0] < lowrank:
             warnings.warn(
                 "The low-rank dimensions do not match the layer dimensions. "
@@ -1039,9 +1043,12 @@ def postprocess(module, name):
             return
         us = u[:, :lowrank] * s[:lowrank]
         vt = vt[:lowrank]
-        dtype = module.weight.dtype
-        module.weight_quantizer.svdquant_lora_a = vt.to(dtype=dtype)
-        module.weight_quantizer.svdquant_lora_b = us.to(dtype=dtype)
+        module.weight_quantizer.svdquant_lora_a = vt.to(
+            dtype=original_dtype, device=original_device
+        )
+        module.weight_quantizer.svdquant_lora_b = us.to(
+            dtype=original_dtype, device=original_device
+        )
         module.weight.data.sub_(
             module.weight_quantizer.svdquant_lora_b @ module.weight_quantizer.svdquant_lora_a
         )
```
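Putting the pieces together, the patched flow (upcast to float64, reduced SVD, keep the first `lowrank` singular vectors, cast back, subtract the low-rank part in place) can be sketched in NumPy. This is an illustrative stand-in for the torch code: variable names mirror the patch, but NumPy cannot model the GPU-device preservation, and nothing here is the actual modelopt implementation:

```python
import numpy as np

rng = np.random.default_rng(0)
weight = rng.standard_normal((64, 48)).astype(np.float32)
lowrank = 8
original_dtype = weight.dtype

# Compute the SVD in float64 for accuracy, mirroring weight.to(torch.float64).
u, s, vt = np.linalg.svd(weight.astype(np.float64), full_matrices=False)

# Keep only the first `lowrank` singular vectors and cast back to the
# original dtype, like .to(dtype=original_dtype) in the patch.
lora_b = (u[:, :lowrank] * s[:lowrank]).astype(original_dtype)  # (64, 8)
lora_a = vt[:lowrank].astype(original_dtype)                    # (8, 48)

# Subtract the low-rank component in place; this residual is what gets quantized.
weight -= lora_b @ lora_a
```

By the Eckart–Young theorem, `lora_b @ lora_a` is the best rank-8 approximation of the original weight, so the residual's Frobenius norm equals the root-sum-square of the discarded singular values.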

0 commit comments
