Draft

Changes from 36 commits · 55 commits total
190f9d5
init
anzr299 Sep 22, 2025
c52fcca
fixes
anzr299 Sep 22, 2025
4e56cb5
add message for unsupported external quantizers
anzr299 Sep 22, 2025
9651ceb
add algorithm
anzr299 Sep 22, 2025
14daeb5
import openvino quantizer from nncf instead of executorch
anzr299 Sep 22, 2025
3746815
Add observers and openvino quantizer to nncf
anzr299 Sep 22, 2025
0815dc5
fix
anzr299 Sep 22, 2025
1b8d940
minor fix
anzr299 Sep 22, 2025
7d35374
fix
anzr299 Sep 22, 2025
427ebc2
fix some more bugs; observers were importing from torchao, causing mis…
anzr299 Sep 22, 2025
24dbfb6
add compress pt2e to init
anzr299 Sep 22, 2025
4bb8c1a
fix quantizer init file. Remove extra code.
anzr299 Sep 22, 2025
8902842
small fix for the big problem:)
anzr299 Sep 23, 2025
3842538
fix quantizer preset definition
anzr299 Sep 23, 2025
2e70c2e
fix openvino quantizer for ptq. call _algo instead of legacy _min_max…
anzr299 Sep 23, 2025
b1c9aad
fix quantizer defaults
anzr299 Sep 23, 2025
33fe01c
microfix
anzr299 Sep 23, 2025
d8e1006
precommit fix
anzr299 Sep 23, 2025
88a8472
revert openvino quantizer to old
anzr299 Sep 23, 2025
7a8e51a
create ovquantizer in executorch dir
anzr299 Sep 23, 2025
fed5052
update executorch quantizer location.
anzr299 Sep 23, 2025
2866473
check if openvino quantizer has weight compression in openvino adapter
anzr299 Sep 23, 2025
7171d56
review comments
anzr299 Sep 24, 2025
3e3b067
revert ignored scope changes; make sensitivity metric None to check i…
anzr299 Sep 24, 2025
5b7b210
precommit fix
anzr299 Sep 24, 2025
71a479f
pre commit format
anzr299 Sep 24, 2025
b24a59c
rename executorch quantizer to test_quantizer
anzr299 Sep 24, 2025
d12225a
fix last precommit
anzr299 Sep 24, 2025
9870ee2
remove unused mypy ignore
anzr299 Sep 24, 2025
8015629
get the mode as struct
anzr299 Sep 24, 2025
0804218
fix algorithm
anzr299 Sep 24, 2025
1f1fda3
remove quantizer and observers from nncf. Instead import from executorch
anzr299 Sep 24, 2025
623ce46
rework wc algorithm so that get_weight_compression_params becomes mor…
anzr299 Oct 1, 2025
d14a6eb
fix bugs; use sensitivity metric instead of mixed precision algo
anzr299 Oct 1, 2025
e91b455
update algorithm with new reworking
anzr299 Oct 6, 2025
448bf84
changes
anzr299 Oct 6, 2025
8e23572
review changes
anzr299 Oct 6, 2025
36ddf53
change WeightsCompressionPT2E to ExperimentalWeightsCompression
anzr299 Oct 7, 2025
07b730b
change ExperimentalWeightsCompression to WeightsCompression
anzr299 Oct 7, 2025
d5dd422
add comments
anzr299 Oct 7, 2025
076a76b
add typehints
anzr299 Oct 7, 2025
2ce9eec
add docstrings
anzr299 Oct 7, 2025
1bebf3e
add typehint for quantize pt2e
anzr299 Oct 7, 2025
ea81cfd
Merge branch 'openvinotoolkit:develop' into an/fx/compress_pt2e
anzr299 Oct 7, 2025
e82920f
return original develop branch changes
anzr299 Oct 7, 2025
82cc10b
update typehints and docs
anzr299 Oct 7, 2025
beae508
format
anzr299 Oct 7, 2025
8bd95df
update type hinting of openvino adapter
anzr299 Oct 7, 2025
aac9d3f
add test
anzr299 Oct 10, 2025
4278cfd
update reference graphs; use more samples for calibration dataset. Th…
anzr299 Oct 10, 2025
6fd5216
remove groupsize values as return statement from get_weight_compressi…
anzr299 Oct 10, 2025
118b611
update algorithm
anzr299 Oct 13, 2025
e9f3cd4
change WeightCompression to OriginalWeightCompression in experimental…
anzr299 Oct 13, 2025
a969e58
update docstrings as discussed offline
anzr299 Oct 13, 2025
71d0597
revert torchaoadapter code
anzr299 Oct 13, 2025
@@ -0,0 +1,10 @@
# Copyright (c) 2025 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
src/nncf/experimental/quantization/algorithms/weight_compression/algorithm.py
@@ -0,0 +1,98 @@
# Copyright (c) 2025 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import torch

import nncf
from nncf import SensitivityMetric
from nncf.common.graph.graph import NNCFGraph
from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer
from nncf.common.utils.backend import BackendType
from nncf.quantization.algorithms.algorithm import Algorithm
from nncf.quantization.algorithms.weight_compression.algorithm import WeightCompression
Collaborator suggested change:
from nncf.quantization.algorithms.weight_compression.algorithm import WeightCompression
from nncf.quantization.algorithms.weight_compression.algorithm import WeightCompression as OriginalWeightCompression

Author: Ah yes, good catch! I will change it.

Author: Done


class WeightsCompressionPT2E(Algorithm):
Collaborator: This algorithm is not designed only for PT2E; it is an experimental WC algorithm which could be implemented in any backend.

Suggested change:
class WeightsCompressionPT2E(Algorithm):
class WeightCompression(Algorithm):

Author (anzr299, Sep 24, 2025): Should I rename it to ExperimentalWeightCompression instead, since it could be confused with the original?

Collaborator (daniil-lyakhov, Sep 24, 2025): It is inside the experimental directory, that should be descriptive enough. I suggest the WeightCompression name.

Author: Done

def __init__(
self,
quantizer,
subset_size: int = 128,
awq: bool = False,
scale_estimation: bool = False,
gptq: bool = False,
lora_correction: bool = False,
sensitivity_metric: nncf.SensitivityMetric = SensitivityMetric.WEIGHT_QUANTIZATION_ERROR,
compression_format: nncf.CompressionFormat = nncf.CompressionFormat.DQ,
advanced_parameters: Optional[nncf.AdvancedCompressionParameters] = None,
) -> None:
self._quantizer = quantizer
Comment on lines 57 to 71
Collaborator: typehints and docstring are missing

wc_config = self._quantizer.get_weight_compression_config()

self._mode = wc_config.get("mode", None)
self._awq = awq
self._gptq = gptq
self._scale_estimation = scale_estimation
self._subset_size = subset_size
self._advanced_parameters = advanced_parameters
self._lora_correction = lora_correction
self._ratio = wc_config.get("ratio", 1)
self._group_size = wc_config.get("group_size", 128)
self._all_layers = wc_config.get("all_layers", False)
self._backup_mode = wc_config.get("backup_mode", nncf.BackupMode.INT8_ASYM)
self._sensitivity_metric = sensitivity_metric
self._compression_format = compression_format
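# Delegate the heavy lifting to the original WeightCompression algorithm; mode, ratio,
# group size, and backup mode were already resolved from the quantizer's config above.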
self._algo = WeightCompression(
mode=self._mode,
ratio=self._ratio,
group_size=self._group_size,
ignored_scope=nncf.IgnoredScope(), # This is already defined in the quantizer object
all_layers=self._all_layers,
sensitivity_metric=self._sensitivity_metric,
awq=self._awq,
subset_size=self._subset_size,
scale_estimation=self._scale_estimation,
gptq=self._gptq,
lora_correction=self._lora_correction,
backup_mode=self._backup_mode,
compression_format=self._compression_format,
advanced_parameters=self._advanced_parameters,
)

def available_backends(self) -> list[BackendType]:
return self._algo.available_backends()

def apply(
self,
model: torch.fx.GraphModule,
graph: NNCFGraph,
statistic_points=None,
dataset=None,
):
self._algo.set_backend_entity(model)

all_weight_params, ratio_defining_params, group_size_values, skipped_weight_params = self._quantizer.get_weight_compression_parameters(
model, graph
)

return self._algo.apply_with_parameters(
model,
graph,
dataset,
statistic_points,
all_weight_params,
ratio_defining_params,
group_size_values,
skipped_weight_params,
)

def get_statistic_points(self, model, graph: NNCFGraph) -> StatisticPointsContainer:
return self._algo.get_statistic_points(model, graph)
1 change: 1 addition & 0 deletions src/nncf/experimental/torch/fx/__init__.py
Original file line number Diff line number Diff line change
@@ -9,5 +9,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from nncf.experimental.torch.fx.quantization.quantize_pt2e import compress_pt2e as compress_pt2e
from nncf.experimental.torch.fx.quantization.quantize_pt2e import quantize_pt2e as quantize_pt2e
from nncf.experimental.torch.fx.quantization.quantizer.openvino_quantizer import OpenVINOQuantizer as OpenVINOQuantizer
61 changes: 61 additions & 0 deletions src/nncf/experimental/torch/fx/quantization/quantize_pt2e.py
Original file line number Diff line number Diff line change
@@ -27,6 +27,7 @@
from nncf.common.logging import nncf_logger
from nncf.common.utils.api_marker import api
from nncf.experimental.quantization.algorithms.post_training.algorithm import ExperimentalPostTrainingQuantization
from nncf.experimental.quantization.algorithms.weight_compression.algorithm import WeightsCompressionPT2E
from nncf.experimental.torch.fx.constant_folding import constant_fold
from nncf.experimental.torch.fx.quantization.quantizer.openvino_adapter import OpenVINOQuantizerAdapter
from nncf.experimental.torch.fx.quantization.quantizer.openvino_quantizer import OpenVINOQuantizer
@@ -157,3 +158,63 @@ def _quant_node_constraint(n: torch.fx.Node) -> bool:
related to quantization
"""
return n.op == "call_function" and n.target in QUANTIZE_NODE_TARGETS


@api(canonical_alias="nncf.experimental.torch.fx.compress_pt2e")
def compress_pt2e(
model: torch.fx.GraphModule,
quantizer: Quantizer,
dataset: Optional[nncf.Dataset] = None,
awq: bool = False,
scale_estimation: bool = False,
gptq: bool = False,
lora_correction: bool = False,
subset_size: int = 128,  # Number of calibration samples to use
sensitivity_metric: Optional[nncf.SensitivityMetric] = None,
advanced_parameters: Optional[nncf.AdvancedCompressionParameters] = None,
) -> torch.fx.GraphModule:
"""
Applies Weight Compression to the torch.fx.GraphModule provided model
using provided torch.ao quantizer.

:param model: A torch.fx.GraphModule instance to be quantized.
:param quantizer: Torch ao quantizer to annotate nodes in the graph with quantization setups
to convey the desired way of quantization.
:param dataset: A representative dataset for the
calibration process.
:param awq: Determines whether to use or not the modified AWQ algorithm.
:param scale_estimation: Determines whether to use or not scale estimation for 4-bit layers.
:param gptq: Determines whether to use or not GPTQ algorithm.
:param lora_correction: Determines whether to use or not LoRA Correction algorithm.
:param subset_size: Number of data samples to calculate activation statistics used for assigning different
quantization precision.
:param sensitivity_metric: The sensitivity metric for assigning quantization precision to layers. In order to
preserve the accuracy of the model, the more sensitive layers receive a higher precision.
:param advanced_parameters: Advanced parameters for algorithms in the compression pipeline.
"""
if isinstance(quantizer, OpenVINOQuantizer) or hasattr(quantizer, "get_nncf_weight_compression_setup"):
quantizer = OpenVINOQuantizerAdapter(quantizer)
compression_format = nncf.CompressionFormat.DQ
else:
# TODO: Support third-party quantizers here.
msg = "Only OpenVINO Quantizer is supported currently."
raise nncf.InternalError(msg)

quantization_algorithm = WeightsCompressionPT2E(
quantizer=quantizer,
awq=awq,
subset_size=subset_size,
scale_estimation=scale_estimation,
gptq=gptq,
lora_correction=lora_correction,
sensitivity_metric=sensitivity_metric,
compression_format=compression_format,
advanced_parameters=advanced_parameters,
)

# Here the model is annotated
transformed_model = quantizer.transform_prior_quantization(model)
nncf_graph = NNCFGraphFactory.create(transformed_model)
quantized_model = quantization_algorithm.apply(transformed_model, nncf_graph, dataset=dataset)
quantized_model = torch.fx.GraphModule(quantized_model, graph=quantized_model.graph)
return quantized_model
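Note for reviewers: below is a minimal end-to-end usage sketch of the new compress_pt2e API, not part of the diff. TinyLinearModel and the random calibration tensors are placeholders, and whether the default OpenVINOQuantizer() constructor selects a weight-compression mode is not shown in this PR, so treat the quantizer construction as an assumption.

import torch

import nncf
from nncf.experimental.torch.fx import OpenVINOQuantizer
from nncf.experimental.torch.fx import compress_pt2e


class TinyLinearModel(torch.nn.Module):
    # A toy model with one compressible weight (illustrative only).
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(64, 64)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x)


model = TinyLinearModel().eval()
example_input = torch.randn(1, 64)

# Capture the model as a torch.fx.GraphModule via the PT2 export path.
fx_model = torch.export.export(model, (example_input,)).module()

# The quantizer carries the weight-compression configuration (mode, ratio,
# group size, ...) which compress_pt2e reads through OpenVINOQuantizerAdapter.
quantizer = OpenVINOQuantizer()

# A handful of random samples is enough for a smoke run; real calibration
# data should be representative of the deployment inputs.
calibration_data = [torch.randn(1, 64) for _ in range(8)]

compressed_model = compress_pt2e(
    fx_model,
    quantizer=quantizer,
    dataset=nncf.Dataset(calibration_data),
)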
src/nncf/experimental/torch/fx/quantization/quantizer/openvino_adapter.py
Expand Up @@ -9,6 +9,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any

import torch.fx

from nncf.common.graph.graph import NNCFGraph
@@ -30,3 +32,11 @@ def transform_prior_quantization(self, model: torch.fx.GraphModule) -> torch.fx.

def get_quantization_setup(self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph) -> SingleConfigQuantizerSetup:
return self._quantizer.get_nncf_quantization_setup(model, nncf_graph)

def get_weight_compression_parameters(
self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
) -> tuple[Any, ...]:
return self._quantizer.get_nncf_weight_compression_parameters(model, nncf_graph)

def get_weight_compression_config(self) -> dict[str, Any]:
return self._quantizer.weight_compression_configuration
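Side note for the TODO in compress_pt2e about third-party quantizers: the adapter surface the experimental algorithm actually consumes can be sketched as a Protocol. This is an illustration inferred from OpenVINOQuantizerAdapter; the WeightCompressionQuantizerAdapter name is hypothetical and no such interface is defined in this PR.

from typing import Any, Protocol

import torch.fx

from nncf.common.graph.graph import NNCFGraph


class WeightCompressionQuantizerAdapter(Protocol):
    # Hypothetical protocol mirroring the calls made by the experimental algorithm.

    def transform_prior_quantization(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
        # Annotate/prepare the model before compression.
        ...

    def get_weight_compression_parameters(
        self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
    ) -> tuple[Any, ...]:
        # Returns (all_weight_params, ratio_defining_params, group_size_values, skipped_weight_params).
        ...

    def get_weight_compression_config(self) -> dict[str, Any]:
        # Keys read by the algorithm: "mode", "ratio", "group_size", "all_layers", "backup_mode".
        ...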
126 changes: 71 additions & 55 deletions src/nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -102,7 +102,7 @@ def get_weight_compression_configuration(
)

return {
"mode": mode,
"mode": mode if isinstance(mode, nncf.CompressWeightsMode) else nncf.CompressWeightsMode(mode),
"ratio": ratio or 1,
"group_size": group_size,
"all_layers": all_layers or False,
@@ -527,11 +527,8 @@ def _set_weight_compression_config(
primary_precision_weight_params = self._mixed_precision_algo.apply(
model, graph, statistics_points, weight_params=ratio_defining_params
)
else:
primary_precision_weight_params = ratio_defining_params

for weight_param in primary_precision_weight_params:
weight_param.compression_config = self._get_primary_config(group_size_values[weight_param.weight_name])
for weight_param in primary_precision_weight_params:
weight_param.compression_config = self._get_primary_config(group_size_values[weight_param.weight_name])

# Check if group size is valid for each weight in ratio_defining_params
failed_nodes = []
@@ -769,12 +766,32 @@ def is_weight_compression_supported(

return is_supported_dtype and not no_bit_reduction

def _collect_statistics_and_statistic_points(
self, model, graph, statistic_points, dataset, ratio_defining_params, all_weight_params
):
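# Collects activation statistics for data-aware compression (or reuses the
# provided statistic points) and returns (statistics, statistic_points).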
if not dataset or not (self._data_aware_mixed_precision or self._data_aware_compression):
return None, statistic_points
weight_params = ratio_defining_params if self._backup_mode == BackupMode.NONE else all_weight_params
matmul_nodes_to_compress = [
wp.node_with_weight
for wp in weight_params
if wp.node_with_weight.metatype in self._backend_entity.matmul_metatypes
]
matmul_input_to_output_nodes_map = self.get_matmul_input_to_output_nodes_map(matmul_nodes_to_compress, graph)
if statistic_points is None:
statistic_points = self.get_statistic_points(model, graph, matmul_input_to_output_nodes_map.keys())
statistics_aggregator = StatisticsAggregatorFactory.create(model, dataset)
statistics_aggregator.register_statistic_points(statistic_points)
statistics_aggregator.collect_statistics(model, graph)
statistic_points = statistics_aggregator.statistic_points
return self._get_statistics_for_weights_compression(
matmul_input_to_output_nodes_map, statistic_points
), statistic_points

def get_weight_compression_parameters(
self,
model: TModel,
graph: NNCFGraph,
statistic_points: Optional[StatisticPointsContainer] = None,
dataset: Optional[Dataset] = None,
) -> tuple[list[WeightCompressionParameters], Optional[dict[str, WCTensorStatistic]]]:
"""
Generates a list of weight compression parameters based on the Weight Compression algorithm
@@ -869,37 +886,18 @@ def get_weight_compression_parameters(
else:
group_size_values = {w_params.weight_name: self._group_size for w_params in ratio_defining_params}

# Collect statistics for the weights compression
statistics = None
if (self._data_aware_mixed_precision or self._data_aware_compression) and dataset:
weight_params = ratio_defining_params if self._backup_mode == BackupMode.NONE else all_weight_params
matmul_nodes_to_compress = [
wp.node_with_weight
for wp in weight_params
if wp.node_with_weight.metatype in self._backend_entity.matmul_metatypes
]
matmul_input_to_output_nodes_map = self.get_matmul_input_to_output_nodes_map(
matmul_nodes_to_compress, graph
)
if statistic_points is None:
statistic_points = self.get_statistic_points(model, graph, matmul_input_to_output_nodes_map.keys())
statistic_points = self._collect_statistics(dataset, graph, model, statistic_points)
statistics = self._get_statistics_for_weights_compression(
matmul_input_to_output_nodes_map, statistic_points
)

# Set weight compression configuration
self._set_weight_compression_config(ratio_defining_params, model, graph, statistic_points, group_size_values)

# If mixed precision is not applied, set the primary config for all ratio-defining params.
if self._ratio == 1 or len(ratio_defining_params) == 0:
for weight_param in ratio_defining_params:
weight_param.compression_config = self._get_primary_config(group_size_values[weight_param.weight_name])

# Print statistics
nncf_logger.info(
self._get_bitwidth_distribution_str(all_weight_params, ratio_defining_params, skipped_weight_params)
)

# Filter all_weight_params by excluding nodes that should remain in their original floating-point precision
all_weight_params = list(filter(lambda w_params: w_params.compression_config is not None, all_weight_params))
return all_weight_params, ratio_defining_params, group_size_values, skipped_weight_params

return all_weight_params, statistics

def apply(
self,
@@ -911,7 +909,45 @@ def apply(
self.set_backend_entity(model)

# Get processed weight compression parameters ready for compression
all_weight_params, statistics = self.get_weight_compression_parameters(model, graph, statistic_points, dataset)
all_weight_params, ratio_defining_params, group_size_values, skipped_weight_params = self.get_weight_compression_parameters(
model, graph
)
return self.apply_with_parameters(
model,
graph,
dataset,
statistic_points,
all_weight_params,
ratio_defining_params,
group_size_values,
skipped_weight_params,
)

def apply_with_parameters(
self,
model,
graph,
dataset,
statistic_points,
all_weight_params,
ratio_defining_params,
group_size_values,
skipped_weight_params,
):
# Collect statistics for the weights compression
Comment on lines +957 to +958
Collaborator: Description
statistics, statistic_points = self._collect_statistics_and_statistic_points(
model, graph, statistic_points, dataset, ratio_defining_params, all_weight_params
)
# Set weight compression configuration
self._set_weight_compression_config(ratio_defining_params, model, graph, statistic_points, group_size_values)

# Filter all_weight_params by excluding nodes that should remain in their original floating-point precision
all_weight_params = list(filter(lambda w_params: w_params.compression_config is not None, all_weight_params))

# Print statistics
nncf_logger.info(
self._get_bitwidth_distribution_str(all_weight_params, ratio_defining_params, skipped_weight_params)
)

if self._awq:
model = self.awq_algo.apply(model, graph, all_weight_params, statistics, self._backend_entity)
@@ -1048,26 +1084,6 @@ def get_compression_nodes_info(
matmul_input_to_output_nodes_map = self.get_matmul_input_to_output_nodes_map(matmul_nodes_to_compress, graph)
return nodes_to_compress, matmul_input_to_output_nodes_map

def _collect_statistics(
self,
dataset: Dataset,
graph: NNCFGraph,
model: TModel,
statistic_points: StatisticPointsContainer,
):
"""
Creates statistics aggregator, registers all statistics specified for algorithm, and then collect them.

:param dataset: Dataset to collect values.
:param graph: Model graph.
:param model: Model for statistics collection.
:param statistic_points: Statistics points.
"""
statistics_aggregator = StatisticsAggregatorFactory.create(model, dataset)
statistics_aggregator.register_statistic_points(statistic_points)
statistics_aggregator.collect_statistics(model, graph)
return statistics_aggregator.statistic_points

def get_statistic_points(
self,
model: TModel,
Expand Down Expand Up @@ -1147,4 +1163,4 @@ def _get_statistics_for_weights_compression(
# Each activation node may have multiple MatMul nodes which it is an input to
for node in matmul_nodes:
statistics[node.node_name] = copy.deepcopy(stats)
return statistics