
Commit 1ea7e45

[PT] Strip pruned model (#3716)
### Changes

Add a strip function for pruned models. Simplify strip to avoid building a graph for the `IN_PLACE` and `DQ` formats.

### Related tickets

176026

### Tests

https://github.com/openvinotoolkit/nncf/actions/runs/19017695102
1 parent f09af53 commit 1ea7e45
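
Below is a minimal usage sketch of what this change enables; `model` and `example_input` are placeholder names, not part of the commit:

```python
import nncf
from nncf.parameters import StripFormat

# DQ and IN_PLACE strips no longer build an NNCF graph, so example_input
# can be omitted (model is a placeholder for a compressed nn.Module):
stripped = nncf.strip(model, do_copy=False, strip_format=StripFormat.DQ)

# NATIVE strip still traces the model, so it still requires example_input:
stripped = nncf.strip(model, strip_format=StripFormat.NATIVE, example_input=example_input)
```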

File tree

12 files changed: +196 −67 lines

docs/usage/training_time_compression/quantization_aware_training_lora/Usage.md

Lines changed: 1 addition & 1 deletion
@@ -98,5 +98,5 @@ To convert a PyTorch model to an INT4 OpenVINO model, transform the `FQ_LORA` or
 
 ```python
 # Convert to OpenVINO format after training is complete
-compressed_model = nncf.strip(model, strip_format=StripFormat.DQ, example_input=example_input)
+compressed_model = nncf.strip(model, strip_format=StripFormat.DQ)
 ```

examples/llm_compression/torch/distillation_qat_with_lora/main.py

Lines changed: 1 addition & 2 deletions
@@ -228,9 +228,8 @@ def export_to_openvino(pretrained: str, ckpt_file: Path, ir_dir: Path) -> OVMode
     :return: A wrapper of OpenVINO model ready for evaluation.
     """
     model_to_eval = AutoModelForCausalLM.from_pretrained(pretrained, torch_dtype=torch.float32, device_map="cpu")
-    example_input = model_to_eval.dummy_inputs
     model_to_eval = load_checkpoint(model_to_eval, ckpt_file)
-    model_to_eval = nncf.strip(model_to_eval, do_copy=False, strip_format=StripFormat.DQ, example_input=example_input)
+    model_to_eval = nncf.strip(model_to_eval, do_copy=False, strip_format=StripFormat.DQ)
     export_from_model(model_to_eval, ir_dir, device="cpu")
     return OVModelForCausalLM.from_pretrained(
         model_id=ir_dir,

examples/llm_compression/torch/downstream_qat_with_nls/main.py

Lines changed: 1 addition & 2 deletions
@@ -446,14 +446,13 @@ def export_to_openvino(
     :return: A wrapper of OpenVINO model ready for evaluation.
     """
     model_to_eval = AutoModelForCausalLM.from_pretrained(pretrained, torch_dtype=torch.float32, device_map="cpu")
-    example_input = model_to_eval.dummy_inputs
     model_to_eval = load_checkpoint(model_to_eval, ckpt_file)
     if specific_rank_config is not None:
         configure_lora_adapters(
             get_layer_id_vs_lora_quantizers_map(model_to_eval),
             specific_rank_config=specific_rank_config,
         )
-    model_to_eval = nncf.strip(model_to_eval, do_copy=False, strip_format=StripFormat.DQ, example_input=example_input)
+    model_to_eval = nncf.strip(model_to_eval, do_copy=False, strip_format=StripFormat.DQ)
     export_from_model(model_to_eval, ir_dir, device="cpu")
     return OVModelForCausalLM.from_pretrained(
         model_id=ir_dir,

src/nncf/parameters.py

Lines changed: 1 addition & 2 deletions
@@ -150,8 +150,7 @@ class StripFormat(StrEnum):
     :param DQ: Replaces FakeQuantize operations with a dequantization subgraph and stores compressed weights
         in low-bit precision using fake quantize parameters. This is the default format for deploying models
         with compressed weights.
-    :param IN_PLACE: Directly applies fake quantizers to the weights, replacing the original weights with their
-        fake quantized versions.
+    :param IN_PLACE: Directly applies NNCF operations to the weights, replacing the original weights.
     """
 
     NATIVE = "native"

src/nncf/torch/function_hook/hook_storage.py

Lines changed: 2 additions & 0 deletions
@@ -229,6 +229,8 @@ def delete_hook(self, hook_name: str) -> None:
             raise ValueError(msg)
 
         del storage_dict[hook_key][hook_id]
+        if not storage_dict[hook_key]:
+            del storage_dict[hook_key]
 
 
 def decode_hook_name(hook_name: str) -> tuple[str, str, int]:
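
The two added lines keep the storage from accumulating empty inner dicts: once the last hook registered under a key is deleted, the key itself is dropped. A plain-dict sketch of the same logic (keys are illustrative, not the real storage class):

```python
# Plain-dict illustration of the cleanup added to delete_hook:
storage_dict = {"conv:weight__0": {"0": "hook"}}

del storage_dict["conv:weight__0"]["0"]  # delete the hook itself
if not storage_dict["conv:weight__0"]:   # inner dict is now empty...
    del storage_dict["conv:weight__0"]   # ...so drop the key too

assert "conv:weight__0" not in storage_dict  # no empty entry left behind
```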
src/nncf/torch/function_hook/pruning/strip.py

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+# Copyright (c) 2025 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TypeVar
+
+import torch
+from torch import nn
+
+import nncf
+from nncf.torch.function_hook.hook_storage import decode_hook_name
+from nncf.torch.function_hook.pruning.magnitude.modules import UnstructuredPruningMask
+from nncf.torch.function_hook.pruning.rb.modules import RBPruningMask
+from nncf.torch.function_hook.wrapper import get_hook_storage
+from nncf.torch.model_graph_manager import get_module_by_name
+from nncf.torch.model_graph_manager import split_const_name
+
+TModel = TypeVar("TModel", bound=nn.Module)
+
+
+@torch.no_grad()
+def apply_pruning_in_place(model: TModel) -> TModel:
+    """
+    Applies pruning masks in-place to the weights:
+    (weights + pruning mask) -> (pruned weights)
+
+    :param model: Compressed model
+    :return: The modified NNCF network.
+    """
+    hook_storage = get_hook_storage(model)
+    hooks_to_delete = []
+    for hook_name, hook_module in hook_storage.named_hooks():
+        if not isinstance(hook_module, (RBPruningMask, UnstructuredPruningMask)):
+            continue
+
+        hook_module.eval()
+        hook_type, op_name, port_id = decode_hook_name(hook_name)
+        if hook_type != "post_hooks" or port_id != 0:
+            msg = f"Unexpected place of SparsityBinaryMask: {hook_type=}, {op_name=}, {port_id=}"
+            raise nncf.InternalError(msg)
+
+        module_name, weight_attr_name = split_const_name(op_name)
+        module = get_module_by_name(module_name, model)
+        weight_param = getattr(module, weight_attr_name)
+
+        weight_param.requires_grad = False
+        weight_param.data = hook_module(weight_param)
+
+        hooks_to_delete.append(hook_name)
+
+    for hook_name in hooks_to_delete:
+        hook_storage.delete_hook(hook_name)
+    return model
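
A hedged usage sketch of the new helper; `pruned_model` is a placeholder for a model returned by `nncf.prune`. This is the step that `StripFormat.IN_PLACE` now runs before folding quantizers in place:

```python
from nncf.torch.function_hook.pruning.strip import apply_pruning_in_place

# Folds each RBPruningMask / UnstructuredPruningMask into its weight tensor and
# deletes the mask hooks; normally reached via nncf.strip(..., StripFormat.IN_PLACE).
stripped = apply_pruning_in_place(pruned_model)  # pruned_model: placeholder name
```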

src/nncf/torch/function_hook/strip.py

Lines changed: 38 additions & 48 deletions
@@ -20,6 +20,7 @@
 from nncf.parameters import StripFormat
 from nncf.torch.function_hook.hook_storage import decode_hook_name
 from nncf.torch.function_hook.nncf_graph.nncf_graph_builder import build_nncf_graph
+from nncf.torch.function_hook.pruning.strip import apply_pruning_in_place
 from nncf.torch.function_hook.wrapper import get_hook_storage
 from nncf.torch.model_graph_manager import get_const_data
 from nncf.torch.model_graph_manager import get_const_node
@@ -36,7 +37,7 @@
 TModel = TypeVar("TModel", bound=nn.Module)
 
 
-def strip_quantized_model(model: TModel, example_input: Any, strip_format: StripFormat = StripFormat.NATIVE) -> TModel:
+def strip_model(model: TModel, example_input: Any = None, strip_format: StripFormat = StripFormat.NATIVE) -> TModel:
     """
     Removes auxiliary layers and operations added during the quantization process,
     resulting in a clean quantized model ready for deployment. The functionality of the model object is still preserved
@@ -47,14 +48,17 @@ def strip_quantized_model(model: TModel, example_input: Any, strip_format: Strip
     :param strip_format: Describes the format in which model is saved after strip.
     :return: The modified NNCF network.
     """
-    graph = build_nncf_graph(model, example_input)
-
     if strip_format == StripFormat.NATIVE:
+        if example_input is None:
+            msg = "The example_input parameter is required to strip the model."
+            raise nncf.InternalError(msg)
+        graph = build_nncf_graph(model, example_input)
         model = replace_quantizer_to_torch_native_module(model, graph)
     elif strip_format == StripFormat.DQ:
-        model = replace_quantizer_to_compressed_weight_with_decompressor(model, graph)
+        model = replace_quantizer_to_compressed_weight_with_decompressor(model)
     elif strip_format == StripFormat.IN_PLACE:
-        model = apply_compression_in_place(model, graph)
+        model = apply_pruning_in_place(model)
+        model = apply_compression_in_place(model)
     else:
         msg = f"Unsupported strip format: {strip_format}"
         raise nncf.ParameterNotSupportedError(msg)
@@ -105,57 +109,48 @@ def replace_quantizer_to_torch_native_module(model: TModel, graph: NNCFGraph) ->
     return model
 
 
-def replace_quantizer_to_compressed_weight_with_decompressor(model: TModel, graph: NNCFGraph) -> TModel:
+def replace_quantizer_to_compressed_weight_with_decompressor(model: TModel) -> TModel:
     """
     Performs transformation from fake quantize format (FQ) to dequantization one (DQ):
     (weights + FQ) -> (compressed_weights + DQ)
 
     :param model: Compressed model
-    :param graph: The model graph.
     :return: The modified NNCF network.
     """
     hook_storage = get_hook_storage(model)
 
-    for name, module in hook_storage.named_hooks():
-        if not isinstance(module, (SymmetricQuantizer, AsymmetricQuantizer)):
+    for hook_name, hook_module in hook_storage.named_hooks():
+        if not isinstance(hook_module, (SymmetricQuantizer, AsymmetricQuantizer)):
             continue
         msg = ""
-        if module._qspec.half_range or module._qspec.narrow_range:
+        if hook_module._qspec.half_range or hook_module._qspec.narrow_range:
            msg += "Unexpected parameters of quantizers on strip: half_range and narrow_range should be False.\n"
-        if module.num_bits not in [4, 8]:
-            msg += f"Unsupported number of bits {module.num_bits} for the quantizer {module}.\n"
+        if hook_module.num_bits not in [4, 8]:
+            msg += f"Unsupported number of bits {hook_module.num_bits} for the quantizer {hook_module}.\n"
         if msg:
             raise nncf.ValidationError(msg)
 
-        _, op_name, _ = decode_hook_name(name)
-        weight_node = graph.get_node_by_name(op_name)
-
-        if weight_node is None:
-            msg = "FQ is not assigned to weight. Strip to DQ format is not supported for FQ on activation."
-            raise nncf.UnsupportedModelError(msg)
-
-        if not isinstance(weight_node.layer_attributes, ConstantLayerAttributes):
-            msg = f"Unexpected layer attributes type {type(weight_node.layer_attributes)}"
-            raise nncf.InternalError(msg)
-
-        weight = get_const_data(weight_node, model)
+        _, op_name, _ = decode_hook_name(hook_name)
 
-        convert_fn = asym_fq_to_decompressor if isinstance(module, AsymmetricQuantizer) else sym_fq_to_decompressor
-        decompressor, q_weight = convert_fn(module, weight)  # type: ignore[operator]
-        packed_tensor = decompressor.pack_weight(q_weight)
-
-        module_name, weight_attr_name = split_const_name(weight_node.layer_attributes.name)
+        module_name, weight_attr_name = split_const_name(op_name)
         module = get_module_by_name(module_name, model)
         weight_param = getattr(module, weight_attr_name)
 
+        with torch.no_grad():
+            if isinstance(hook_module, AsymmetricQuantizer):
+                decompressor, q_weight = asym_fq_to_decompressor(hook_module, weight_param)
+            else:
+                decompressor, q_weight = sym_fq_to_decompressor(hook_module, weight_param)  # type: ignore[assignment]
+            packed_tensor = decompressor.pack_weight(q_weight)
+
         weight_param.requires_grad = False
         weight_param.data = packed_tensor
 
-        hook_storage.set_submodule(name, decompressor)
+        hook_storage.set_submodule(hook_name, decompressor)
     return model
 
 
-def apply_compression_in_place(model: TModel, graph: NNCFGraph) -> TModel:
+def apply_compression_in_place(model: TModel) -> TModel:
     """
     Applies fake quantizers in-place to the weights:
     (weights + FQ) -> (fake quantized weights)
@@ -167,31 +162,26 @@ def apply_compression_in_place(model: TModel, graph: NNCFGraph) -> TModel:
     hook_storage = get_hook_storage(model)
 
     hooks_to_delete = []
-    for name, hook in hook_storage.named_hooks():
-        if not isinstance(hook, (SymmetricQuantizer, AsymmetricQuantizer, BaseWeightsDecompressor)):
+    for hook_name, hook_module in hook_storage.named_hooks():
+        if not isinstance(hook_module, (SymmetricQuantizer, AsymmetricQuantizer, BaseWeightsDecompressor)):
            continue
-        _, op_name, _ = decode_hook_name(name)
-        weight_node = graph.get_node_by_name(op_name)
+        hook_module.eval()
 
-        if weight_node is None:
-            msg = "FQ is not assigned to weight. In-place strip is not supported for FQ on activation."
-            raise nncf.UnsupportedModelError(msg)
-
-        if not isinstance(weight_node.layer_attributes, ConstantLayerAttributes):
-            msg = f"Unexpected layer attributes type {type(weight_node.layer_attributes)}"
-            raise nncf.InternalError(msg)
-
-        weight = get_const_data(weight_node, model)
-        fq_weight = hook(weight) if isinstance(hook, BaseWeightsDecompressor) else hook.quantize(weight)
-
-        module_name, weight_attr_name = split_const_name(weight_node.layer_attributes.name)
+        _, op_name, _ = decode_hook_name(hook_name)
+        module_name, weight_attr_name = split_const_name(op_name)
         module = get_module_by_name(module_name, model)
         weight_param = getattr(module, weight_attr_name)
 
+        with torch.no_grad():
+            if isinstance(hook_module, (SymmetricQuantizer, AsymmetricQuantizer)):
+                fq_weight = hook_module.quantize(weight_param)
+            else:
+                fq_weight = hook_module(weight_param)
+
         weight_param.requires_grad = False
         weight_param.data = fq_weight
 
-        hooks_to_delete.append(name)
+        hooks_to_delete.append(hook_name)
 
     for hook_name in hooks_to_delete:
         hook_storage.delete_hook(hook_name)
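
Since the graph build moved inside the `NATIVE` branch, the `example_input` check now lives in `strip_model`. A small sketch of the failure mode; `quantized_model` is a placeholder:

```python
import nncf
from nncf.parameters import StripFormat

try:
    nncf.strip(quantized_model, strip_format=StripFormat.NATIVE)  # no example_input
except nncf.InternalError as err:
    print(err)  # "The example_input parameter is required to strip the model."
```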

src/nncf/torch/strip.py

Lines changed: 4 additions & 8 deletions
@@ -11,11 +11,10 @@
 
 
 from copy import deepcopy
-from typing import Any, Optional, TypeVar
+from typing import Any, TypeVar
 
 from torch import nn
 
-import nncf
 from nncf.common.check_features import is_torch_tracing_by_patching
 from nncf.parameters import StripFormat
 
@@ -26,7 +25,7 @@ def strip(
     model: TModel,
     do_copy: bool = True,
     strip_format: StripFormat = StripFormat.NATIVE,
-    example_input: Optional[Any] = None,
+    example_input: Any = None,
 ) -> TModel:
     """
     Removes auxiliary layers and operations added during the compression process, resulting in a clean
@@ -41,10 +40,7 @@
     if is_torch_tracing_by_patching():
         return model.nncf.strip(do_copy, strip_format)
 
-    from nncf.torch.function_hook.strip import strip_quantized_model
+    from nncf.torch.function_hook.strip import strip_model
 
-    if example_input is None:
-        msg = "Required example_input for strip model."
-        raise nncf.InternalError(msg)
     model = deepcopy(model) if do_copy else model
-    return strip_quantized_model(model, example_input, strip_format)
+    return strip_model(model, example_input, strip_format)
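
One detail preserved by this refactor: `do_copy=True` (the default) deep-copies the model before stripping, so the original keeps its hooks, while `do_copy=False` strips in place. A sketch with `model` as a placeholder:

```python
# do_copy=True (default): returns a stripped deepcopy; `model` keeps its hooks.
stripped_copy = nncf.strip(model, strip_format=StripFormat.DQ)

# do_copy=False: `model` itself is stripped and returned.
stripped = nncf.strip(model, do_copy=False, strip_format=StripFormat.DQ)
```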

tests/torch2/function_hook/pruning/helpers.py

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ def get_example_inputs():
     def __init__(self) -> None:
         super().__init__()
         self.conv = nn.Conv2d(3, 3, 3)
+        self.conv.weight.data = torch.arange(1, 82, dtype=torch.float32).view(3, 3, 3, 3)
 
     def forward(self, x: torch.Tensor):
         x = self.conv(x)
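
The deterministic weights make the pruning result in the new test below reproducible: with magnitudes 1..81 and `ratio=0.5`, local magnitude pruning zeroes roughly the smaller half of the 81 values. A back-of-envelope check (the median threshold is an assumption; NNCF's exact tie-breaking may differ):

```python
import torch

weights = torch.arange(1, 82, dtype=torch.float32)  # 81 values, as in ConvModel
threshold = weights.abs().quantile(0.5)              # median magnitude: 41.0
print(int((weights.abs() > threshold).sum()))        # 40 values survive, matching the test
```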
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+# Copyright (c) 2025 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+
+import nncf
+from nncf import PruneMode
+from nncf.torch.function_hook.pruning.magnitude.modules import UnstructuredPruningMask
+from nncf.torch.function_hook.wrapper import get_hook_storage
+from tests.torch2.function_hook.pruning.helpers import ConvModel
+
+
+def test_strip():
+    model = ConvModel()
+    example_inputs = ConvModel.get_example_inputs()
+    pruned_model = nncf.prune(
+        model, mode=PruneMode.UNSTRUCTURED_MAGNITUDE_LOCAL, ratio=0.5, examples_inputs=example_inputs
+    )
+    pruned_model.eval()
+
+    hook_storage = get_hook_storage(pruned_model)
+    pruning_module = hook_storage.post_hooks["conv:weight__0"]["0"]
+
+    assert isinstance(pruning_module, UnstructuredPruningMask)
+
+    with torch.no_grad():
+        pruned_weight = pruning_module(pruned_model.conv.weight)
+
+    striped_model = nncf.strip(pruned_model, strip_format=nncf.StripFormat.IN_PLACE, do_copy=False)
+    hook_storage = get_hook_storage(striped_model)
+
+    assert not list(hook_storage.named_hooks())
+    assert torch.equal(striped_model.conv.weight, pruned_weight)
+    assert torch.count_nonzero(striped_model.conv.weight) == 40
