refactor

meenchen · meenchen · commit 6d6b4bab914d · 2025-10-08T17:37:41.000Z
Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com>
diff --git a/modelopt/torch/quantization/plugins/vllm.py b/modelopt/torch/quantization/plugins/vllm.py
@@ -61,19 +61,15 @@ def apply(
         x = layer.input_quantizer(x)
         if layer.weight_quantizer.is_enabled:
             original_weight = layer.weight
-            _data = None
-            # for parameter, we keep the original data, otherwise we modify the weight
             quantized_tensor = layer.weight_quantizer(layer.weight)
+            # parameterize the quantized weight
             if isinstance(original_weight, torch.nn.Parameter):
-                _data = original_weight.data
-                layer.weight.data = quantized_tensor
-            else:
-                layer.weight = quantized_tensor
+                quantized_tensor = torch.nn.Parameter(
+                    quantized_tensor, requires_grad=original_weight.requires_grad
+                )
+            layer.weight = quantized_tensor
             output = self.quant_method.apply(layer, x, bias)
-            if _data is not None:
-                layer.weight.data = _data
-            else:
-                layer.weight = original_weight
+            layer.weight = original_weight
         else:
             output = self.quant_method.apply(layer, x, bias)
         output = layer.output_quantizer(output)