openvinotoolkit · anzr299 · Sep 22, 2025 · Sep 22, 2025 · Sep 22, 2025 · Sep 22, 2025
@@ -0,0 +1,10 @@
+# Copyright (c) 2025 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -27,6 +27,7 @@
 from nncf.common.logging import nncf_logger
 from nncf.common.utils.api_marker import api
 from nncf.experimental.quantization.algorithms.post_training.algorithm import ExperimentalPostTrainingQuantization
+from nncf.experimental.quantization.algorithms.weight_compression.algorithm import WeightsCompressionPT2E
 from nncf.experimental.torch.fx.constant_folding import constant_fold
 from nncf.experimental.torch.fx.quantization.quantizer.openvino_adapter import OpenVINOQuantizerAdapter
 from nncf.experimental.torch.fx.quantization.quantizer.openvino_quantizer import OpenVINOQuantizer
@@ -38,7 +39,6 @@
 from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters
 from nncf.quantization.range_estimator import RangeEstimatorParameters
 
-
 @api(canonical_alias="nncf.experimental.torch.fx.quantize_pt2e")
 def quantize_pt2e(
     model: torch.fx.GraphModule,
@@ -157,3 +157,62 @@ def _quant_node_constraint(n: torch.fx.Node) -> bool:
     related to quantization
     """
     return n.op == "call_function" and n.target in QUANTIZE_NODE_TARGETS
+
+@api(canonical_alias="nncf.experimental.torch.fx.compress_pt2e")
+def compress_pt2e(
+    model: torch.fx.GraphModule,
+    quantizer: Quantizer,
+    dataset: Optional[nncf.Dataset] = None,
+    awq: bool = False,
+    scale_estimation: bool = False,
+    gptq: bool = False,
+    lora_correction: bool = False,
+    subset_size: int = 128,
+    sensitivity_metric: nncf.SensitivityMetric = nncf.SensitivityMetric.WEIGHT_QUANTIZATION_ERROR,
+    advanced_parameters: Optional[nncf.AdvancedCompressionParameters] = None,
+) -> torch.fx.GraphModule:
+    """
+    Applies weight compression to the provided torch.fx.GraphModule using the provided quantizer.
+
+    :param model: A torch.fx.GraphModule instance to be compressed.
+    :param quantizer: Torch ao quantizer to annotate nodes in the graph with quantization setups
+        to convey the desired way of quantization. Currently it must be an OpenVINOQuantizer instance
+        or an object that exposes `get_nncf_weight_compression_setup`.
+    :param dataset: A representative dataset for the calibration process.
+    :param awq: Determines whether to use or not the modified AWQ algorithm.
+    :param scale_estimation: Determines whether to use or not scale estimation for 4-bit layers.
+    :param gptq: Determines whether to use or not GPTQ algorithm.
+    :param lora_correction: Determines whether to use or not LoRA Correction algorithm.
+    :param subset_size: Number of data samples to calculate activation statistics used for assigning different
+        quantization precision.
+    :param sensitivity_metric: The sensitivity metric for assigning quantization precision to layers. In order to
+        preserve the accuracy of the model, the more sensitive layers receive a higher precision.
+    :param advanced_parameters: Advanced parameters for algorithms in the compression pipeline.
+    :return: The torch.fx.GraphModule returned by the weight compression algorithm.
+    :raises nncf.InternalError: If the quantizer is neither an OpenVINOQuantizer nor an object that
+        exposes `get_nncf_weight_compression_setup`.
+    """
+    is_openvino_quantizer = isinstance(quantizer, OpenVINOQuantizer) or hasattr(
+        quantizer, "get_nncf_weight_compression_setup"
+    )
+    if not is_openvino_quantizer:
+        # TODO: support third-party quantizers here.
+        msg = "Only OpenVINO Quantizer is supported currently."
+        raise nncf.InternalError(msg)
+
+    quantizer = OpenVINOQuantizerAdapter(quantizer)
+    # The OpenVINO quantizer has its own decompression subgraph; requesting the DQ
+    # compression format is the minimally invasive way to obtain it.
+    compression_format = nncf.CompressionFormat.DQ
+
+    compression_algorithm = WeightsCompressionPT2E(
+        quantizer=quantizer,
+        awq=awq,
+        subset_size=subset_size,
+        scale_estimation=scale_estimation,
+        gptq=gptq,
+        lora_correction=lora_correction,
+        sensitivity_metric=sensitivity_metric,
+        compression_format=compression_format,
+        advanced_parameters=advanced_parameters,
+    )
+
+    # Let the quantizer run its pre-annotation transformations before the graph is built.
+    transformed_model = quantizer.transform_prior_quantization(model)
+    nncf_graph = NNCFGraphFactory.create(transformed_model)
+    compressed_model = compression_algorithm.apply(transformed_model, nncf_graph, dataset=dataset)
+    # Re-wrap the result in a new GraphModule built from the compressed graph.
+    return torch.fx.GraphModule(compressed_model, graph=compressed_model.graph)
@@ -14,7 +14,7 @@
 from nncf.common.graph.graph import NNCFGraph
 from nncf.common.quantization.quantizer_setup import SingleConfigQuantizerSetup
 from nncf.experimental.quantization.quantizer import Quantizer
-from nncf.experimental.torch.fx.quantization.quantizer.openvino_quantizer import OpenVINOQuantizer
+from executorch.backends.openvino.quantizer.quantizer import OpenVINOQuantizer
 
 
 class OpenVINOQuantizerAdapter(Quantizer):
@@ -24,9 +24,16 @@ class OpenVINOQuantizerAdapter(Quantizer):
 
     def __init__(self, quantizer: OpenVINOQuantizer):
         self._quantizer = quantizer
+        # NOTE(review): this attribute is read eagerly, so wrapping a quantizer that does not
+        # define `weight_compression_configuration` raises AttributeError here, even if only the
+        # quantization setup is requested later -- confirm every supported quantizer defines it.
+        self._weight_compression_configuration = self._quantizer.weight_compression_configuration
 
     def transform_prior_quantization(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        # Delegates to the wrapped quantizer's `transform_for_annotation` and returns its result.
         return self._quantizer.transform_for_annotation(model)
 
     def get_quantization_setup(self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph) -> SingleConfigQuantizerSetup:
+        # Delegates to the wrapped quantizer's `get_nncf_quantization_setup` and returns its result.
         return self._quantizer.get_nncf_quantization_setup(model, nncf_graph)
+
+    def get_weight_compression_setup(
+        self,
+        model: torch.fx.GraphModule,
+        nncf_graph: NNCFGraph,
+    ) -> SingleConfigQuantizerSetup:
+        """
+        Returns the weight compression setup built by the wrapped quantizer.
+
+        :param model: The torch.fx.GraphModule forwarded to the wrapped quantizer.
+        :param nncf_graph: NNCFGraph forwarded to the wrapped quantizer.
+        :return: Result of the wrapped quantizer's `get_nncf_weight_compression_setup`.
+        """
+        wrapped_quantizer = self._quantizer
+        return wrapped_quantizer.get_nncf_weight_compression_setup(model, nncf_graph)
+
+    def get_nodes_to_compress(self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph):
+        """
+        Returns the nodes selected for compression by the wrapped quantizer.
+
+        :param model: The torch.fx.GraphModule forwarded to the wrapped quantizer.
+        :param nncf_graph: NNCFGraph forwarded to the wrapped quantizer.
+        :return: Result of the wrapped quantizer's `get_nodes_to_compress`.
+        """
+        wrapped_quantizer = self._quantizer
+        return wrapped_quantizer.get_nodes_to_compress(model, nncf_graph)
@@ -769,10 +769,51 @@ def is_weight_compression_supported(
 
         return is_supported_dtype and not no_bit_reduction
 
+    def collect_weight_compression_statistics(
+        self,
+        model: TModel,
+        graph: NNCFGraph,
+        dataset: Optional[Dataset],
+        weight_params: list[WeightCompressionParameters],
+        statistic_points: Optional[StatisticPointsContainer] = None,
+    ) -> tuple[Optional[dict[str, WCTensorStatistic]], Optional[StatisticPointsContainer]]:
+        """
+        Collects statistics for weight compression if data-aware compression or
+        data-aware mixed precision is enabled and a dataset is provided.
+
+        :param model: Backend-specific input model.
+        :param graph: NNCFGraph instance.
+        :param dataset: Dataset for statistics collection. When it is not provided, nothing is collected.
+        :param weight_params: Weight parameters for which to collect statistics. Only the parameters whose
+            node metatype is one of the backend matmul metatypes are taken into account.
+        :param statistic_points: Optional pre-collected statistic points. When given, they are used as is
+            instead of running a new statistics collection.
+        :return: A tuple of the collected statistics (None if collection is not applicable) and the
+            statistic points (the ones passed in, or the newly collected ones).
+        """
+        data_aware = self._data_aware_mixed_precision or self._data_aware_compression
+        # Statistics are needed only when a data-aware option is enabled AND a dataset is available.
+        if not (data_aware and dataset):
+            return None, statistic_points
+
+        matmul_nodes_to_compress = [
+            wp.node_with_weight
+            for wp in weight_params
+            if wp.node_with_weight.metatype in self._backend_entity.matmul_metatypes
+        ]
+        matmul_input_to_output_nodes_map = self.get_matmul_input_to_output_nodes_map(matmul_nodes_to_compress, graph)
+
+        if statistic_points is None:
+            statistic_points = self.get_statistic_points(model, graph, matmul_input_to_output_nodes_map.keys())
+            statistic_points = self._collect_statistics(dataset, graph, model, statistic_points)
+
+        statistics = self._get_statistics_for_weights_compression(matmul_input_to_output_nodes_map, statistic_points)
+        return statistics, statistic_points
+
     def get_weight_compression_parameters(
         self,
         model: TModel,
         graph: NNCFGraph,
+        nodes_to_compress: list[NNCFNode],
         statistic_points: Optional[StatisticPointsContainer] = None,
         dataset: Optional[Dataset] = None,
     ) -> tuple[list[WeightCompressionParameters], Optional[dict[str, WCTensorStatistic]]]:
@@ -791,8 +832,6 @@ def get_weight_compression_parameters(
             Compression algorithm configuration, and a mapping of target node names to the
             collected statistics.
         """
-        nodes_to_compress = self.get_nodes_to_compress(graph)
-
         all_weight_params: list[WeightCompressionParameters] = []
         skipped_weight_params: list[WeightCompressionParameters] = []
 
@@ -870,23 +909,8 @@ def get_weight_compression_parameters(
             group_size_values = {w_params.weight_name: self._group_size for w_params in ratio_defining_params}
 
         # Collect statistics for the weights compression
-        statistics = None
-        if (self._data_aware_mixed_precision or self._data_aware_compression) and dataset:
-            weight_params = ratio_defining_params if self._backup_mode == BackupMode.NONE else all_weight_params
-            matmul_nodes_to_compress = [
-                wp.node_with_weight
-                for wp in weight_params
-                if wp.node_with_weight.metatype in self._backend_entity.matmul_metatypes
-            ]
-            matmul_input_to_output_nodes_map = self.get_matmul_input_to_output_nodes_map(
-                matmul_nodes_to_compress, graph
-            )
-            if statistic_points is None:
-                statistic_points = self.get_statistic_points(model, graph, matmul_input_to_output_nodes_map.keys())
-                statistic_points = self._collect_statistics(dataset, graph, model, statistic_points)
-            statistics = self._get_statistics_for_weights_compression(
-                matmul_input_to_output_nodes_map, statistic_points
-            )
+        weight_params = ratio_defining_params if self._backup_mode == BackupMode.NONE else all_weight_params
+        statistics, statistic_points = self.collect_weight_compression_statistics(model, graph, dataset, weight_params, statistic_points)
 
         # Set weight compression configuration
         self._set_weight_compression_config(ratio_defining_params, model, graph, statistic_points, group_size_values)
@@ -901,18 +925,14 @@ def get_weight_compression_parameters(
 
         return all_weight_params, statistics
 
-    def apply(
-        self,
-        model: TModel,
-        graph: NNCFGraph,
-        statistic_points: Optional[StatisticPointsContainer] = None,
-        dataset: Optional[Dataset] = None,
+    def apply_wc_algos(
+            self,
+            model: TModel,
+            graph: NNCFGraph,
+            all_weight_params: list[WeightCompressionParameters],
+            statistics: dict[str, Any],
+            dataset: Optional[Dataset] = None,
     ) -> TModel:
-        self.set_backend_entity(model)
-
-        # Get processed weight compression parameters ready for compression
-        all_weight_params, statistics = self.get_weight_compression_parameters(model, graph, statistic_points, dataset)
-
         if self._awq:
             model = self.awq_algo.apply(model, graph, all_weight_params, statistics, self._backend_entity)
             # After applying AWQ we need to update statistics since AWQ alters the activations
@@ -967,7 +987,7 @@ def apply(
         self._backend_entity.dump_parameters(
             model,
             parameters={
-                "mode": self._mode.value,
+                "mode": self._mode.value if not isinstance(self._mode, str) else self._mode,
                 "group_size": self._group_size,
                 "ratio": self._ratio,
                 "all_layers": self._all_layers,
@@ -983,6 +1003,25 @@ def apply(
             },
             algo_name="weight_compression",
         )
+
+        return transformed_model
+
+
+    def apply(
+        self,
+        model: TModel,
+        graph: NNCFGraph,
+        statistic_points: Optional[StatisticPointsContainer] = None,
+        dataset: Optional[Dataset] = None,
+    ) -> TModel:
+        """
+        Runs the weight compression pipeline: sets the backend entity, selects the nodes to compress,
+        prepares the weight compression parameters and applies the compression algorithms.
+
+        :param model: Backend-specific input model.
+        :param graph: NNCFGraph instance.
+        :param statistic_points: Optional statistic points forwarded to the parameters preparation step.
+        :param dataset: Optional dataset forwarded to the parameters preparation and compression steps.
+        :return: The model returned by `apply_wc_algos`.
+        """
+        self.set_backend_entity(model)
+        target_nodes = self.get_nodes_to_compress(graph)
+        # Weight compression parameters (and statistics, if any) ready for the compression step.
+        prepared_params, collected_statistics = self.get_weight_compression_parameters(
+            model, graph, target_nodes, statistic_points, dataset
+        )
+        return self.apply_wc_algos(model, graph, prepared_params, collected_statistics, dataset)
 
     def _get_activation_node_and_port(self, node: NNCFNode, nncf_graph: NNCFGraph) -> tuple[NNCFNode, int]: