From 8b203e2a96e5ca4b9f009f2a84e6c67dbd1c7cbc Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Fri, 26 Sep 2025 15:22:10 +0000
Subject: [PATCH 1/2] run fused layer update on all modules

Signed-off-by: Brian Dellabetta
---
 .../modifiers/quantization/quantization/base.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py
index aa6208da4..eb0357c54 100644
--- a/src/llmcompressor/modifiers/quantization/quantization/base.py
+++ b/src/llmcompressor/modifiers/quantization/quantization/base.py
@@ -79,8 +79,15 @@ def on_start(self, state: State, event: Event, **kwargs):
         for _, module in tqdm.tqdm(named_modules):
             update_weight_global_scale(module)
 
-        for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"):
+        # NOTE: update_fused_layer_weight_global_scales operates on Attention
+        # and MLP layers, not quantizable Linear layers. Rather than running
+        # on targeted modules, we need to run on all modules.
+        # Because this call is idempotent, setting all global_scales to the
+        # min value, it is ok to run potentially multiple times for all modules
+        for module in state.model.modules():
             update_fused_layer_weight_global_scales(module)
+
+        for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"):
             update_weight_zp_scale(module)
 
     def on_event(self, state: State, event: Event, **kwargs):

From a40d3bfdd735a0a79c9ff957ed62925faaafa8b0 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Fri, 26 Sep 2025 15:37:24 +0000
Subject: [PATCH 2/2] tqdm labels

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/quantization/quantization/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py
index eb0357c54..424cabcf6 100644
--- a/src/llmcompressor/modifiers/quantization/quantization/base.py
+++ b/src/llmcompressor/modifiers/quantization/quantization/base.py
@@ -76,7 +76,7 @@ def on_start(self, state: State, event: Event, **kwargs):
         # TODO: this step can be combined with update_weight_zp_scale
         # once update_fused_layer_weight_global_scales is removed
         # and not required by vLLM
-        for _, module in tqdm.tqdm(named_modules):
+        for _, module in tqdm.tqdm(named_modules, desc="Updating global scales"):
             update_weight_global_scale(module)
 
         # NOTE: update_fused_layer_weight_global_scales operates on Attention
@@ -84,7 +84,7 @@ def on_start(self, state: State, event: Event, **kwargs):
         # on targeted modules, we need to run on all modules.
         # Because this call is idempotent, setting all global_scales to the
         # min value, it is ok to run potentially multiple times for all modules
-        for module in state.model.modules():
+        for module in tqdm.tqdm(state.model.modules(), desc="Fusing global scales"):
             update_fused_layer_weight_global_scales(module)
 
         for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"):
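
Reviewer note: the idempotency claim in the NOTE comment comes down to fusing
being a min-reduction over the global scales of sibling projections, and a
min-reduction applied twice gives the same result as applying it once, so
iterating over every module in state.model (and touching a fused layer's
submodules more than once) is safe. The sketch below is a toy illustration
under assumed names only: ToyAttention, the q_proj/k_proj/v_proj layout, the
weight_global_scale buffer, and fuse_global_scales are hypothetical and are
not the compressed-tensors implementation of
update_fused_layer_weight_global_scales.

    import torch
    import torch.nn as nn


    class ToyAttention(nn.Module):
        """Minimal stand-in for an attention block with q/k/v projections."""

        def __init__(self) -> None:
            super().__init__()
            self.q_proj = nn.Linear(8, 8)
            self.k_proj = nn.Linear(8, 8)
            self.v_proj = nn.Linear(8, 8)
            # per-projection global scales, e.g. produced by weight calibration
            for i, proj in enumerate((self.q_proj, self.k_proj, self.v_proj)):
                proj.register_buffer("weight_global_scale", torch.tensor(float(i + 1)))


    def fuse_global_scales(module: nn.Module) -> None:
        """Set every sibling projection's global scale to the group minimum."""
        if not isinstance(module, ToyAttention):
            return  # no-op on other modules, mirroring the run-on-all-modules loop
        projs = (module.q_proj, module.k_proj, module.v_proj)
        fused = torch.min(torch.stack([p.weight_global_scale for p in projs]))
        for p in projs:
            p.weight_global_scale.copy_(fused)


    model = nn.Sequential(ToyAttention())

    # first pass fuses the scales down to the group minimum ...
    for m in model.modules():
        fuse_global_scales(m)
    first = [p.weight_global_scale.clone() for p in model[0].children()]

    # ... and a second pass leaves them unchanged: min is idempotent
    for m in model.modules():
        fuse_global_scales(m)
    second = [p.weight_global_scale.clone() for p in model[0].children()]

    assert all(torch.equal(a, b) for a, b in zip(first, second))
    print([s.item() for s in second])  # -> [1.0, 1.0, 1.0]

The real helper handles the actual attention/MLP module types; the point here
is only that the min-reduction makes repeated application over all modules a
safe no-op after the first pass.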