vllm-project · brian-dellabetta · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025
diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py
@@ -79,8 +79,15 @@ def on_start(self, state: State, event: Event, **kwargs):
         for _, module in tqdm.tqdm(named_modules):
             update_weight_global_scale(module)
 
-        for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"):
+        # NOTE: update_fused_layer_weight_global_scales operates on Attention
+        # and MLP layers, not on quantizable Linear layers, so it must run on
+        # all modules rather than only on the targeted modules.
+        # The call is idempotent (it sets all global_scales to the min
+        # value), so it is ok for it to run multiple times for all modules.
+        for module in state.model.modules():
             update_fused_layer_weight_global_scales(module)
+
+        for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"):
             update_weight_zp_scale(module)
 
     def on_event(self, state: State, event: Event, **kwargs):