diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py
index aa6208da4..424cabcf6 100644
--- a/src/llmcompressor/modifiers/quantization/quantization/base.py
+++ b/src/llmcompressor/modifiers/quantization/quantization/base.py
@@ -76,11 +76,18 @@ def on_start(self, state: State, event: Event, **kwargs):
         # TODO: this step can be combined with update_weight_zp_scale
         # once update_fused_layer_weight_global_scales is removed
         # and not required by vLLM
-        for _, module in tqdm.tqdm(named_modules):
+        for _, module in tqdm.tqdm(named_modules, desc="Updating global scales"):
             update_weight_global_scale(module)
 
-        for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"):
+        # NOTE: update_fused_layer_weight_global_scales operates on Attention
+        # and MLP layers, not quantizable Linear layers. Rather than running
+        # on targeted modules, we need to run on all modules.
+        # Because this call is idempotent, setting all global_scales to the
+        # min value, it is ok to run potentially multiple times for all modules
+        for module in tqdm.tqdm(state.model.modules(), desc="Fusing global scales"):
             update_fused_layer_weight_global_scales(module)
+
+        for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"):
             update_weight_zp_scale(module)
 
     def on_event(self, state: State, event: Event, **kwargs):
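
To make the idempotency argument in the NOTE concrete, below is a minimal, self-contained sketch. The helper `fuse_group_to_min` is hypothetical and not the llm-compressor/compressed-tensors implementation; it only illustrates the property the comment relies on: a fusion step that sets every global scale in a group to the group minimum produces the same result no matter how many times it runs, which is why it is safe to iterate over all of `state.model.modules()` and potentially hit fused layers more than once.

```python
# Toy illustration only -- hypothetical helper, not the library's API.
# A "fuse" step that replaces each scale in a fused group with the group
# minimum is idempotent: a second application is a no-op.

def fuse_group_to_min(global_scales: list[float]) -> list[float]:
    """Set every scale in the group to the group's minimum value."""
    group_min = min(global_scales)
    return [group_min for _ in global_scales]

scales = [0.5, 0.25, 1.0]
once = fuse_group_to_min(scales)
twice = fuse_group_to_min(once)
assert once == twice == [0.25, 0.25, 0.25]  # repeated runs change nothing
```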