Skip to content

Commit 1e3df5f

Browse files
committed
minor
Signed-off-by: realAsma <[email protected]>
1 parent 8415eb1 commit 1e3df5f

File tree

1 file changed

+4
-3
lines changed

1 file changed

+4
-3
lines changed

modelopt/torch/quantization/tensor_quant.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -379,8 +379,8 @@ def forward(
379379

380380
def legacy_quant_func():
381381
# The LegacyFakeTensorQuantFunction support cpu and amax with any shape that can be broadcasted to inputs.
382-
outputs, scale = _tensor_quant(inputs, amax, num_bits, unsigned, narrow_range)
383-
return outputs / scale.to(inputs.dtype)
382+
outputs = _tensor_quant(inputs, amax, num_bits, unsigned, narrow_range)
383+
return outputs
384384

385385
if not inputs.is_cuda:
386386
outputs = legacy_quant_func()
@@ -614,9 +614,10 @@ def _tensor_quant(inputs, amax, num_bits=8, unsigned=False, narrow_range=True):
614614
scale[zero_amax_mask] = (
615615
1.0 # Return 1 makes more sense for values quantized to 0 with amax=0
616616
)
617+
outputs = outputs / scale
617618

618619
outputs = outputs.to(input_dtype)
619-
return outputs, scale
620+
return outputs
620621

621622

622623
fake_tensor_quant = FakeTensorQuantFunction.apply

0 commit comments

Comments (0)