Draft

Changes from 36 commits · 55 commits total
190f9d5
init
anzr299 Sep 22, 2025
c52fcca
fixes
anzr299 Sep 22, 2025
4e56cb5
add message for unsupported external quantizers
anzr299 Sep 22, 2025
9651ceb
add algorithm
anzr299 Sep 22, 2025
14daeb5
import openvino quantizer from nncf instead of executorch
anzr299 Sep 22, 2025
3746815
Add observers and openvino quantizer to nncf
anzr299 Sep 22, 2025
0815dc5
fix
anzr299 Sep 22, 2025
1b8d940
minor fix
anzr299 Sep 22, 2025
7d35374
fix
anzr299 Sep 22, 2025
427ebc2
fix some more bugs; observers were importing from torchao, causing mis…
anzr299 Sep 22, 2025
24dbfb6
add compress pt2e to init
anzr299 Sep 22, 2025
4bb8c1a
fix quantizer init file. Remove extra code.
anzr299 Sep 22, 2025
8902842
small fix for the big problem:)
anzr299 Sep 23, 2025
3842538
fix quantizer preset definition
anzr299 Sep 23, 2025
2e70c2e
fix openvino quantizer for ptq. call _algo instead of legacy _min_max…
anzr299 Sep 23, 2025
b1c9aad
fix quantizer defaults
anzr299 Sep 23, 2025
33fe01c
microfix
anzr299 Sep 23, 2025
d8e1006
precommit fix
anzr299 Sep 23, 2025
88a8472
revert openvino quantizer to old
anzr299 Sep 23, 2025
7a8e51a
create ovquantizer in executorch dir
anzr299 Sep 23, 2025
fed5052
update executorch quantizer location.
anzr299 Sep 23, 2025
2866473
check if openvino quantizer has weight compression in openvino adapter
anzr299 Sep 23, 2025
7171d56
review comments
anzr299 Sep 24, 2025
3e3b067
revert ignored scope changes; make sensitivity metric None to check i…
anzr299 Sep 24, 2025
5b7b210
precommit fix
anzr299 Sep 24, 2025
71a479f
pre commit format
anzr299 Sep 24, 2025
b24a59c
rename executorch quantizer to test_quantizer
anzr299 Sep 24, 2025
d12225a
fix last precommit
anzr299 Sep 24, 2025
9870ee2
remove unused mypy ignore
anzr299 Sep 24, 2025
8015629
get the mode as struct
anzr299 Sep 24, 2025
0804218
fix algorithm
anzr299 Sep 24, 2025
1f1fda3
remove quantizer and observers from nncf. Instead import from executorch
anzr299 Sep 24, 2025
623ce46
rework wc algorithm so that get_weight_compression_params becomes mor…
anzr299 Oct 1, 2025
d14a6eb
fix bugs; use sensitivity metric instead of mixed precision algo
anzr299 Oct 1, 2025
e91b455
update algorithm with new reworking
anzr299 Oct 6, 2025
448bf84
changes
anzr299 Oct 6, 2025
8e23572
review changes
anzr299 Oct 6, 2025
36ddf53
change WeightsCompressionPT2E to ExperimentalWeightsCompression
anzr299 Oct 7, 2025
07b730b
change ExperimentalWeightsCompression to WeightsCompression
anzr299 Oct 7, 2025
d5dd422
add comments
anzr299 Oct 7, 2025
076a76b
add typehints
anzr299 Oct 7, 2025
2ce9eec
add docstrings
anzr299 Oct 7, 2025
1bebf3e
add typehint for quantize pt2e
anzr299 Oct 7, 2025
ea81cfd
Merge branch 'openvinotoolkit:develop' into an/fx/compress_pt2e
anzr299 Oct 7, 2025
e82920f
return original develop branch changes
anzr299 Oct 7, 2025
82cc10b
update typehints and docs
anzr299 Oct 7, 2025
beae508
format
anzr299 Oct 7, 2025
8bd95df
update type hinting of openvino adapter
anzr299 Oct 7, 2025
aac9d3f
add test
anzr299 Oct 10, 2025
4278cfd
update reference graphs; use more samples for calibration dataset. Th…
anzr299 Oct 10, 2025
6fd5216
remove groupsize values as return statement from get_weight_compressi…
anzr299 Oct 10, 2025
118b611
update algorithm
anzr299 Oct 13, 2025
e9f3cd4
change WeightCompression to OriginalWeightCompression in experimental…
anzr299 Oct 13, 2025
a969e58
update docstrings as discussed offline
anzr299 Oct 13, 2025
71d0597
revert torchaoadapter code
anzr299 Oct 13, 2025
@@ -0,0 +1,10 @@
# Copyright (c) 2025 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
src/nncf/experimental/quantization/algorithms/weight_compression/algorithm.py
@@ -0,0 +1,98 @@
# Copyright (c) 2025 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import torch

import nncf
from nncf import SensitivityMetric
from nncf.common.graph.graph import NNCFGraph
from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer
from nncf.common.utils.backend import BackendType
from nncf.quantization.algorithms.algorithm import Algorithm
from nncf.quantization.algorithms.weight_compression.algorithm import WeightCompression
Collaborator suggested change:
from nncf.quantization.algorithms.weight_compression.algorithm import WeightCompression
from nncf.quantization.algorithms.weight_compression.algorithm import WeightCompression as OriginalWeightCompression

Author: Ah yes, good catch! I will change it.

Author: Done


class WeightsCompressionPT2E(Algorithm):
Collaborator: This algorithm is not designed only for PT2E; it is an experimental WC algorithm which could be implemented in any backend.

Suggested change:
class WeightsCompressionPT2E(Algorithm):
class WeightCompression(Algorithm):

Author (anzr299, Sep 24, 2025): Should I rename it to ExperimentalWeightCompression instead, since it could be confused with the original?

Collaborator (daniil-lyakhov, Sep 24, 2025): It is inside the experimental directory, that should be descriptive enough. I suggest the WeightCompression name.

Author: Done

def __init__(
self,
quantizer,
subset_size: int = 128,
awq: bool = False,
scale_estimation: bool = False,
gptq: bool = False,
lora_correction: bool = False,
sensitivity_metric: nncf.SensitivityMetric = SensitivityMetric.WEIGHT_QUANTIZATION_ERROR,
compression_format: nncf.CompressionFormat = nncf.CompressionFormat.DQ,
advanced_parameters: Optional[nncf.AdvancedCompressionParameters] = None,
) -> None:
self._quantizer = quantizer
Comment on lines 57 to 71
Collaborator: typehints and docstring are missing

wc_config = self._quantizer.get_weight_compression_config()

self._mode = wc_config.get("mode", None)
self._awq = awq
self._gptq = gptq
self._scale_estimation = scale_estimation
self._subset_size = subset_size
self._advanced_parameters = advanced_parameters
self._lora_correction = lora_correction
self._ratio = wc_config.get("ratio", 1)
self._group_size = wc_config.get("group_size", 128)
self._all_layers = wc_config.get("all_layers", False)
self._backup_mode = wc_config.get("backup_mode", nncf.BackupMode.INT8_ASYM)
self._sensitivity_metric = sensitivity_metric
self._compression_format = compression_format
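# Delegate the heavy lifting to the original WeightCompression algorithm; mode, ratio,
# group size, and backup mode were already resolved from the quantizer's config above.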
self._algo = WeightCompression(
mode=self._mode,
ratio=self._ratio,
group_size=self._group_size,
ignored_scope=nncf.IgnoredScope(), # This is already defined in the quantizer object
all_layers=self._all_layers,
sensitivity_metric=self._sensitivity_metric,
awq=self._awq,
subset_size=self._subset_size,
scale_estimation=self._scale_estimation,
gptq=self._gptq,
lora_correction=self._lora_correction,
backup_mode=self._backup_mode,
compression_format=self._compression_format,
advanced_parameters=self._advanced_parameters,
)

def available_backends(self) -> list[BackendType]:
return self._algo.available_backends()

def apply(
self,
model: torch.fx.GraphModule,
graph: NNCFGraph,
statistic_points=None,
dataset=None,
):
self._algo.set_backend_entity(model)

all_weight_params, ratio_defining_params, group_size_values, skipped_weight_params = self._quantizer.get_weight_compression_parameters(
model, graph
)

return self._algo.apply_with_parameters(
model,
graph,
dataset,
statistic_points,
all_weight_params,
ratio_defining_params,
group_size_values,
skipped_weight_params,
)

def get_statistic_points(self, model, graph: NNCFGraph) -> StatisticPointsContainer:
return self._algo.get_statistic_points(model, graph)
1 change: 1 addition & 0 deletions src/nncf/experimental/torch/fx/__init__.py
Original file line number Diff line number Diff line change
@@ -9,5 +9,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from nncf.experimental.torch.fx.quantization.quantize_pt2e import compress_pt2e as compress_pt2e
from nncf.experimental.torch.fx.quantization.quantize_pt2e import quantize_pt2e as quantize_pt2e
from nncf.experimental.torch.fx.quantization.quantizer.openvino_quantizer import OpenVINOQuantizer as OpenVINOQuantizer
61 changes: 61 additions & 0 deletions src/nncf/experimental/torch/fx/quantization/quantize_pt2e.py
Original file line number Diff line number Diff line change
@@ -27,6 +27,7 @@
from nncf.common.logging import nncf_logger
from nncf.common.utils.api_marker import api
from nncf.experimental.quantization.algorithms.post_training.algorithm import ExperimentalPostTrainingQuantization
from nncf.experimental.quantization.algorithms.weight_compression.algorithm import WeightsCompressionPT2E
from nncf.experimental.torch.fx.constant_folding import constant_fold
from nncf.experimental.torch.fx.quantization.quantizer.openvino_adapter import OpenVINOQuantizerAdapter
from nncf.experimental.torch.fx.quantization.quantizer.openvino_quantizer import OpenVINOQuantizer
@@ -157,3 +158,63 @@ def _quant_node_constraint(n: torch.fx.Node) -> bool:
related to quantization
"""
return n.op == "call_function" and n.target in QUANTIZE_NODE_TARGETS


@api(canonical_alias="nncf.experimental.torch.fx.compress_pt2e")
def compress_pt2e(
model: torch.fx.GraphModule,
quantizer: Quantizer,
dataset: Optional[nncf.Dataset] = None,
awq: bool = False,
scale_estimation: bool = False,
gptq: bool = False,
lora_correction: bool = False,
subset_size: int = 128,  # Number of calibration samples to use
sensitivity_metric: Optional[nncf.SensitivityMetric] = None,
advanced_parameters: Optional[nncf.AdvancedCompressionParameters] = None,
) -> torch.fx.GraphModule:
"""
Applies Weight Compression to the torch.fx.GraphModule provided model
using provided torch.ao quantizer.

:param model: A torch.fx.GraphModule instance to be quantized.
:param quantizer: Torch ao quantizer to annotate nodes in the graph with quantization setups
to convey the desired way of quantization.
:param dataset: A representative dataset for the
calibration process.
:param awq: Determines whether to use or not the modified AWQ algorithm.
:param scale_estimation: Determines whether to use or not scale estimation for 4-bit layers.
:param gptq: Determines whether to use or not GPTQ algorithm.
:param lora_correction: Determines whether to use or not LoRA Correction algorithm.
:param subset_size: Number of data samples to calculate activation statistics used for assigning different
quantization precision.
:param sensitivity_metric: The sensitivity metric for assigning quantization precision to layers. In order to
preserve the accuracy of the model, the more sensitive layers receive a higher precision.
:param advanced_parameters: Advanced parameters for algorithms in the compression pipeline.
"""
if isinstance(quantizer, OpenVINOQuantizer) or hasattr(quantizer, "get_nncf_weight_compression_setup"):
quantizer = OpenVINOQuantizerAdapter(quantizer)
compression_format = nncf.CompressionFormat.DQ
else:
# TODO: Support third-party quantizers here.
msg = "Only OpenVINO Quantizer is supported currently."
raise nncf.InternalError(msg)

quantization_algorithm = WeightsCompressionPT2E(
quantizer=quantizer,
awq=awq,
subset_size=subset_size,
scale_estimation=scale_estimation,
gptq=gptq,
lora_correction=lora_correction,
sensitivity_metric=sensitivity_metric,
compression_format=compression_format,
advanced_parameters=advanced_parameters,
)

# Here the model is annotated
transformed_model = quantizer.transform_prior_quantization(model)
nncf_graph = NNCFGraphFactory.create(transformed_model)
quantized_model = quantization_algorithm.apply(transformed_model, nncf_graph, dataset=dataset)
quantized_model = torch.fx.GraphModule(quantized_model, graph=quantized_model.graph)
return quantized_model
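Note for reviewers: below is a minimal end-to-end usage sketch of the new compress_pt2e API, not part of the diff. TinyLinearModel and the random calibration tensors are placeholders, and whether the default OpenVINOQuantizer() constructor selects a weight-compression mode is not shown in this PR, so treat the quantizer construction as an assumption.

import torch

import nncf
from nncf.experimental.torch.fx import OpenVINOQuantizer
from nncf.experimental.torch.fx import compress_pt2e


class TinyLinearModel(torch.nn.Module):
    # A toy model with one compressible weight (illustrative only).
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(64, 64)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x)


model = TinyLinearModel().eval()
example_input = torch.randn(1, 64)

# Capture the model as a torch.fx.GraphModule via the PT2 export path.
fx_model = torch.export.export(model, (example_input,)).module()

# The quantizer carries the weight-compression configuration (mode, ratio,
# group size, ...) which compress_pt2e reads through OpenVINOQuantizerAdapter.
quantizer = OpenVINOQuantizer()

# A handful of random samples is enough for a smoke run; real calibration
# data should be representative of the deployment inputs.
calibration_data = [torch.randn(1, 64) for _ in range(8)]

compressed_model = compress_pt2e(
    fx_model,
    quantizer=quantizer,
    dataset=nncf.Dataset(calibration_data),
)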
src/nncf/experimental/torch/fx/quantization/quantizer/openvino_adapter.py
Expand Up @@ -9,6 +9,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any

import torch.fx

from nncf.common.graph.graph import NNCFGraph
@@ -30,3 +32,11 @@ def transform_prior_quantization(self, model: torch.fx.GraphModule) -> torch.fx.

def get_quantization_setup(self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph) -> SingleConfigQuantizerSetup:
return self._quantizer.get_nncf_quantization_setup(model, nncf_graph)

def get_weight_compression_parameters(
self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
) -> tuple[Any, ...]:
return self._quantizer.get_nncf_weight_compression_parameters(model, nncf_graph)

def get_weight_compression_config(self) -> dict[str, Any]:
return self._quantizer.weight_compression_configuration
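Side note for the TODO in compress_pt2e about third-party quantizers: the adapter surface the experimental algorithm actually consumes can be sketched as a Protocol. This is an illustration inferred from OpenVINOQuantizerAdapter; the WeightCompressionQuantizerAdapter name is hypothetical and no such interface is defined in this PR.

from typing import Any, Protocol

import torch.fx

from nncf.common.graph.graph import NNCFGraph


class WeightCompressionQuantizerAdapter(Protocol):
    # Hypothetical protocol mirroring the calls made by the experimental algorithm.

    def transform_prior_quantization(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
        # Annotate/prepare the model before compression.
        ...

    def get_weight_compression_parameters(
        self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
    ) -> tuple[Any, ...]:
        # Returns (all_weight_params, ratio_defining_params, group_size_values, skipped_weight_params).
        ...

    def get_weight_compression_config(self) -> dict[str, Any]:
        # Keys read by the algorithm: "mode", "ratio", "group_size", "all_layers", "backup_mode".
        ...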
126 changes: 71 additions & 55 deletions src/nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -102,7 +102,7 @@ def get_weight_compression_configuration(
)

return {
"mode": mode,
"mode": mode if isinstance(mode, nncf.CompressWeightsMode) else nncf.CompressWeightsMode(mode),
"ratio": ratio or 1,
"group_size": group_size,
"all_layers": all_layers or False,
@@ -527,11 +527,8 @@ def _set_weight_compression_config(
primary_precision_weight_params = self._mixed_precision_algo.apply(
model, graph, statistics_points, weight_params=ratio_defining_params
)
else:
primary_precision_weight_params = ratio_defining_params

for weight_param in primary_precision_weight_params:
weight_param.compression_config = self._get_primary_config(group_size_values[weight_param.weight_name])
for weight_param in primary_precision_weight_params:
weight_param.compression_config = self._get_primary_config(group_size_values[weight_param.weight_name])

# Check if group size is valid for each weight in ratio_defining_params
failed_nodes = []
@@ -769,12 +766,32 @@ def is_weight_compression_supported(

return is_supported_dtype and not no_bit_reduction

def _collect_statistics_and_statistic_points(
self, model, graph, statistic_points, dataset, ratio_defining_params, all_weight_params
):
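# Collects activation statistics for data-aware compression (or reuses the
# provided statistic points) and returns (statistics, statistic_points).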
if not dataset or not (self._data_aware_mixed_precision or self._data_aware_compression):
return None, statistic_points
weight_params = ratio_defining_params if self._backup_mode == BackupMode.NONE else all_weight_params
matmul_nodes_to_compress = [
wp.node_with_weight
for wp in weight_params
if wp.node_with_weight.metatype in self._backend_entity.matmul_metatypes
]
matmul_input_to_output_nodes_map = self.get_matmul_input_to_output_nodes_map(matmul_nodes_to_compress, graph)
if statistic_points is None:
statistic_points = self.get_statistic_points(model, graph, matmul_input_to_output_nodes_map.keys())
statistics_aggregator = StatisticsAggregatorFactory.create(model, dataset)
statistics_aggregator.register_statistic_points(statistic_points)
statistics_aggregator.collect_statistics(model, graph)
statistic_points = statistics_aggregator.statistic_points
return self._get_statistics_for_weights_compression(
matmul_input_to_output_nodes_map, statistic_points
), statistic_points

def get_weight_compression_parameters(
self,
model: TModel,
graph: NNCFGraph,
statistic_points: Optional[StatisticPointsContainer] = None,
dataset: Optional[Dataset] = None,
) -> tuple[list[WeightCompressionParameters], Optional[dict[str, WCTensorStatistic]]]:
"""
Generates a list of weight compression parameters based on the Weight Compression algorithm
@@ -869,37 +886,18 @@ def get_weight_compression_parameters(
else:
group_size_values = {w_params.weight_name: self._group_size for w_params in ratio_defining_params}

# Collect statistics for the weights compression
statistics = None
if (self._data_aware_mixed_precision or self._data_aware_compression) and dataset:
weight_params = ratio_defining_params if self._backup_mode == BackupMode.NONE else all_weight_params
matmul_nodes_to_compress = [
wp.node_with_weight
for wp in weight_params
if wp.node_with_weight.metatype in self._backend_entity.matmul_metatypes
]
matmul_input_to_output_nodes_map = self.get_matmul_input_to_output_nodes_map(
matmul_nodes_to_compress, graph
)
if statistic_points is None:
statistic_points = self.get_statistic_points(model, graph, matmul_input_to_output_nodes_map.keys())
statistic_points = self._collect_statistics(dataset, graph, model, statistic_points)
statistics = self._get_statistics_for_weights_compression(
matmul_input_to_output_nodes_map, statistic_points
)

# Set weight compression configuration
self._set_weight_compression_config(ratio_defining_params, model, graph, statistic_points, group_size_values)

# If mixed precision is not applied, set the primary config for all ratio-defining params.
if self._ratio == 1 or len(ratio_defining_params) == 0:
for weight_param in ratio_defining_params:
weight_param.compression_config = self._get_primary_config(group_size_values[weight_param.weight_name])

# Print statistics
nncf_logger.info(
self._get_bitwidth_distribution_str(all_weight_params, ratio_defining_params, skipped_weight_params)
)

# Filter all_weight_params by excluding nodes that should remain in their original floating-point precision
all_weight_params = list(filter(lambda w_params: w_params.compression_config is not None, all_weight_params))
return all_weight_params, ratio_defining_params, group_size_values, skipped_weight_params

return all_weight_params, statistics

def apply(
self,
@@ -911,7 +909,45 @@ def apply(
self.set_backend_entity(model)

# Get processed weight compression parameters ready for compression
all_weight_params, statistics = self.get_weight_compression_parameters(model, graph, statistic_points, dataset)
all_weight_params, ratio_defining_params, group_size_values, skipped_weight_params = self.get_weight_compression_parameters(
model, graph
)
return self.apply_with_parameters(
model,
graph,
dataset,
statistic_points,
all_weight_params,
ratio_defining_params,
group_size_values,
skipped_weight_params,
)

def apply_with_parameters(
self,
model,
graph,
dataset,
statistic_points,
all_weight_params,
ratio_defining_params,
group_size_values,
skipped_weight_params,
):
# Collect statistics for the weights compression
Comment on lines +957 to +958
Collaborator: Description
statistics, statistic_points = self._collect_statistics_and_statistic_points(
model, graph, statistic_points, dataset, ratio_defining_params, all_weight_params
)
# Set weight compression configuration
self._set_weight_compression_config(ratio_defining_params, model, graph, statistic_points, group_size_values)

# Filter all_weight_params by excluding nodes that should remain in their original floating-point precision
all_weight_params = list(filter(lambda w_params: w_params.compression_config is not None, all_weight_params))

# Print statistics
nncf_logger.info(
self._get_bitwidth_distribution_str(all_weight_params, ratio_defining_params, skipped_weight_params)
)

if self._awq:
model = self.awq_algo.apply(model, graph, all_weight_params, statistics, self._backend_entity)
@@ -1048,26 +1084,6 @@ def get_compression_nodes_info(
matmul_input_to_output_nodes_map = self.get_matmul_input_to_output_nodes_map(matmul_nodes_to_compress, graph)
return nodes_to_compress, matmul_input_to_output_nodes_map

def _collect_statistics(
self,
dataset: Dataset,
graph: NNCFGraph,
model: TModel,
statistic_points: StatisticPointsContainer,
):
"""
Creates statistics aggregator, registers all statistics specified for algorithm, and then collect them.

:param dataset: Dataset to collect values.
:param graph: Model graph.
:param model: Model for statistics collection.
:param statistic_points: Statistics points.
"""
statistics_aggregator = StatisticsAggregatorFactory.create(model, dataset)
statistics_aggregator.register_statistic_points(statistic_points)
statistics_aggregator.collect_statistics(model, graph)
return statistics_aggregator.statistic_points

def get_statistic_points(
self,
model: TModel,
Expand Down Expand Up @@ -1147,4 +1163,4 @@ def _get_statistics_for_weights_compression(
# Each activation node may have multiple MatMul nodes which it is an input to
for node in matmul_nodes:
statistics[node.node_name] = copy.deepcopy(stats)
return statistics