src/llmcompressor/modifiers/quantization/quantization/base.py (9 additions & 2 deletions)
@@ -76,11 +76,18 @@ def on_start(self, state: State, event: Event, **kwargs):
         # TODO: this step can be combined with update_weight_zp_scale
         # once update_fused_layer_weight_global_scales is removed
         # and not required by vLLM
-        for _, module in tqdm.tqdm(named_modules):
+        for _, module in tqdm.tqdm(named_modules, desc="Updating global scales"):
             update_weight_global_scale(module)

-        for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"):
+        # NOTE: update_fused_layer_weight_global_scales operates on Attention
+        # and MLP layers, not quantizable Linear layers. Rather than running
+        # on targeted modules, we need to run on all modules.
+        # Because this call is idempotent, setting all global_scales to the
+        # min value, it is ok to run potentially multiple times for all modules
+        for module in tqdm.tqdm(state.model.modules(), desc="Fusing global scales"):
+            update_fused_layer_weight_global_scales(module)
+
+        for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"):
             update_weight_zp_scale(module)

     def on_event(self, state: State, event: Event, **kwargs):
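The safety argument in the added NOTE comment rests on update_fused_layer_weight_global_scales being idempotent. Below is a minimal sketch of that property, using a hypothetical stand-in helper (fuse_group_global_scales, not the real compressed-tensors implementation): fusing replaces every global scale in a group (e.g. the q/k/v projections of an attention layer) with the group minimum, and applying a min-reduction to its own output changes nothing.

import torch

def fuse_group_global_scales(scales: list[torch.Tensor]) -> None:
    # Set every global scale in a fused group to the group's minimum,
    # mirroring the "setting all global_scales to the min value" behavior
    # described in the NOTE comment above.
    group_min = torch.stack(scales).min()
    for scale in scales:
        scale.fill_(group_min.item())

scales = [torch.tensor(4.0), torch.tensor(2.0), torch.tensor(3.0)]

fuse_group_global_scales(scales)   # first pass: all scales become 2.0
after_first = [s.item() for s in scales]

fuse_group_global_scales(scales)   # second pass: min(2, 2, 2) is still 2.0
after_second = [s.item() for s in scales]

assert after_first == after_second == [2.0, 2.0, 2.0]

Because every extra application recomputes the same minimum, looping over all of state.model.modules() (rather than only the targeted Linear modules) can visit and re-fuse the same Attention or MLP group repeatedly without changing the result.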