Commit 6b0fc1c
[OpenVINOQuantizer] Mark quantizer and quantize_pt2e as API (#3277)
### Changes

Mark quantizer and quantize_pt2e as API.

### Reason for changes

To introduce `OpenVINOQuantizer` and `quantize_pt2e` in the api docs: https://openvinotoolkit.github.io/nncf/index.html

### Related tickets

daniil-lyakhov/executorch#2
1 parent a403e29 commit 6b0fc1c

4 files changed: +102 -34 lines changed

docs/api/source/conf.py

Lines changed: 5 additions & 0 deletions
@@ -146,9 +146,14 @@ def collect_api_entities() -> APIInfo:
     "nncf.tensor.functions.torch_io",
     "nncf.tensor.functions.numpy_io",
     "nncf.tensor.functions.openvino_numeric",
+    "nncf.torch.dynamic_graph.patch_pytorch",
 ]

 with mock(mock_modules):
+    import torch
+
+    # Set torch version to allow nncf.torch import
+    torch.__version__ = "0.0.0"
     api_info = collect_api_entities()

 module_fqns = set()
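For context, a minimal sketch of the failure mode this stub guards against. The assumption here is that importing nncf.torch parses torch.__version__, which under Sphinx-style module mocking is a mock object rather than a version string:

```python
from unittest import mock

# Under autodoc-style mocking, torch is replaced by a mock object, so
# torch.__version__ is itself a mock and any version parsing fails.
torch = mock.MagicMock()
print(type(torch.__version__))  # <class 'unittest.mock.MagicMock'>

# Pinning a dummy-but-parseable version string restores the import path.
torch.__version__ = "0.0.0"
print(torch.__version__)  # 0.0.0
```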

nncf/experimental/torch/fx/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -8,3 +8,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from nncf.experimental.torch.fx.quantization.quantize_pt2e import quantize_pt2e as quantize_pt2e
+from nncf.experimental.torch.fx.quantization.quantizer.openvino_quantizer import OpenVINOQuantizer as OpenVINOQuantizer
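With these re-exports in place, both entry points resolve from the package root:

```python
# Both symbols are now importable directly from the experimental FX package.
from nncf.experimental.torch.fx import OpenVINOQuantizer, quantize_pt2e
```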

nncf/experimental/torch/fx/quantization/quantize_pt2e.py

Lines changed: 8 additions & 2 deletions
@@ -23,9 +23,10 @@
 from torch.fx.passes.infra.pass_manager import PassManager

 import nncf
+from nncf import Dataset
 from nncf.common.factory import NNCFGraphFactory
 from nncf.common.logging import nncf_logger
-from nncf.data import Dataset
+from nncf.common.utils.api_marker import api
 from nncf.experimental.quantization.algorithms.post_training.algorithm import ExperimentalPostTrainingQuantization
 from nncf.experimental.torch.fx.constant_folding import constant_fold
 from nncf.experimental.torch.fx.quantization.quantizer.openvino_adapter import OpenVINOQuantizerAdapter
@@ -35,9 +36,10 @@
 from nncf.experimental.torch.fx.transformations import compress_post_quantize_transformation
 from nncf.quantization.advanced_parameters import AdvancedBiasCorrectionParameters
 from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters
-from nncf.quantization.advanced_parameters import RangeEstimatorParameters
+from nncf.quantization.range_estimator import RangeEstimatorParameters


+@api(canonical_alias="nncf.experimental.torch.fx.quantize_pt2e")
 def quantize_pt2e(
     model: torch.fx.GraphModule,
     quantizer: Quantizer,
@@ -57,8 +59,11 @@ def quantize_pt2e(
     Applies post-training quantization to the torch.fx.GraphModule provided model
     using provided torch.ao quantizer.

+    :param model: A torch.fx.GraphModule instance to be quantized.
     :param quantizer: Torch ao quantizer to annotate nodes in the graph with quantization setups
         to convey the desired way of quantization.
+    :param calibration_dataset: A representative dataset for the
+        calibration process.
     :param subset_size: Size of a subset to calculate activations
         statistics used for quantization.
     :param fast_bias_correction: Setting this option to `False` enables a different
@@ -77,6 +82,7 @@
     :param fold_quantize: Boolean flag for whether fold the quantize op or not. The value is True by default.
     :param do_copy: The copy of the given model is being quantized if do_copy == True,
         otherwise the model is quantized inplace. Default value is False.
+    :return: The quantized torch.fx.GraphModule instance.
     """
     nncf_logger.warning("This is an experimental feature and may change in the future without notice.")
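Combined with the re-exports above, a minimal end-to-end sketch of the documented signature. The model, example input, and calibration samples are placeholders, and the graph-capture step is an assumption that may vary across torch releases:

```python
import torch

from nncf import Dataset
from nncf.experimental.torch.fx import OpenVINOQuantizer, quantize_pt2e

# Placeholder model and representative calibration inputs.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())
example_input = torch.randn(1, 8)
calibration_samples = [torch.randn(1, 8) for _ in range(10)]

# Capture the model into a torch.fx.GraphModule via the PT2 export path.
captured = torch.export.export(model, (example_input,)).module()

# Annotate with the OpenVINO-optimized quantizer and calibrate.
quantized = quantize_pt2e(
    captured,
    quantizer=OpenVINOQuantizer(),
    calibration_dataset=Dataset(calibration_samples),
)
```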

nncf/experimental/torch/fx/quantization/quantizer/openvino_quantizer.py

Lines changed: 86 additions & 32 deletions
@@ -23,33 +23,59 @@
 from torch.ao.quantization.quantizer.quantizer import SharedQuantizationSpec as TorchAOSharedQuantizationSpec

 import nncf
+from nncf import IgnoredScope
+from nncf import ModelType
+from nncf import OverflowFix
+from nncf import QuantizationMode
+from nncf import QuantizationPreset
+from nncf import TargetDevice
 from nncf.common.graph.graph import NNCFGraph
 from nncf.common.logging import nncf_logger
-from nncf.common.quantization.quantizer_propagation.solver import QuantizerPropagationRule
+from nncf.common.quantization.quantizer_propagation.structs import QuantizerPropagationRule
 from nncf.common.quantization.quantizer_setup import QuantizationPointBase
 from nncf.common.quantization.quantizer_setup import SingleConfigQuantizerSetup
-from nncf.common.quantization.structs import QuantizationPreset
 from nncf.common.quantization.structs import QuantizationScheme
+from nncf.common.utils.api_marker import api
 from nncf.experimental.torch.fx.nncf_graph_builder import GraphConverter
 from nncf.experimental.torch.fx.node_utils import get_graph_node_by_name
 from nncf.experimental.torch.fx.transformations import fold_constant_except_qdq
-from nncf.parameters import ModelType
-from nncf.parameters import QuantizationMode
-from nncf.parameters import TargetDevice
 from nncf.quantization.advanced_parameters import FP8QuantizationParameters
-from nncf.quantization.advanced_parameters import OverflowFix
 from nncf.quantization.advanced_parameters import QuantizationParameters
 from nncf.quantization.algorithms.min_max.algorithm import MinMaxQuantization
-from nncf.scopes import IgnoredScope
 from nncf.torch.model_graph_manager import get_weight_tensor_port_ids

 QUANT_ANNOTATION_KEY = "quantization_annotation"


+@api(canonical_alias="nncf.experimental.torch.fx.OpenVINOQuantizer")
 class OpenVINOQuantizer(TorchAOQuantizer):
     """
     Implementation of the Torch AO quantizer which annotates models with quantization annotations
     optimally for the inference via OpenVINO.
+
+    :param mode: Defines optimization mode for the algorithm. None by default.
+    :param preset: A preset controls the quantization mode (symmetric and asymmetric).
+        It can take the following values:
+        - `performance`: Symmetric quantization of weights and activations.
+        - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
+        Default value is None. In this case, the `mixed` preset is used for the `transformer`
+        model type, otherwise `performance`.
+    :param target_device: A target device, the specificity of which will be taken
+        into account while compressing in order to obtain the best performance
+        for this type of device, defaults to TargetDevice.ANY.
+    :param model_type: Model type is needed to specify additional patterns
+        in the model. Only `transformer` is supported now.
+    :param ignored_scope: An ignored scope that defines the list of model control
+        flow graph nodes to be ignored during quantization.
+    :param overflow_fix: This option controls whether to apply the overflow issue
+        fix for the 8-bit quantization.
+    :param quantize_outputs: Whether to insert additional quantizers right before
+        each of the model outputs.
+    :param activations_quantization_params: Quantization parameters for model
+        activations.
+    :param weights_quantization_params: Quantization parameters for model weights.
+    :param quantizer_propagation_rule: The strategy to be used while propagating and merging quantizers.
+        MERGE_ALL_IN_ONE by default.
     """

     def __init__(
@@ -66,31 +92,6 @@ def __init__(
         weights_quantization_params: Optional[Union[QuantizationParameters, FP8QuantizationParameters]] = None,
         quantizer_propagation_rule: QuantizerPropagationRule = QuantizerPropagationRule.MERGE_ALL_IN_ONE,
     ):
-        """
-        :param mode: Defines optimization mode for the algorithm. None by default.
-        :param preset: A preset controls the quantization mode (symmetric and asymmetric).
-            It can take the following values:
-            - `performance`: Symmetric quantization of weights and activations.
-            - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
-            Default value is None. In this case, `mixed` preset is used for `transformer`
-            model type otherwise `performance`.
-        :param target_device: A target device the specificity of which will be taken
-            into account while compressing in order to obtain the best performance
-            for this type of device, defaults to TargetDevice.ANY.
-        :param model_type: Model type is needed to specify additional patterns
-            in the model. Supported only `transformer` now.
-        :param ignored_scope: An ignored scope that defined the list of model control
-            flow graph nodes to be ignored during quantization.
-        :param overflow_fix: This option controls whether to apply the overflow issue
-            fix for the 8-bit quantization.
-        :param quantize_outputs: Whether to insert additional quantizers right before
-            each of the model outputs.
-        :param activations_quantization_params: Quantization parameters for model
-            activations.
-        :param weights_quantization_params: Quantization parameters for model weights.
-        :param quantizer_propagation_rule: The strategy to be used while propagating and merging quantizers.
-            MERGE_ALL_IN_ONE by default.
-        """
         self._min_max_algo = MinMaxQuantization(
             mode=mode,
             preset=preset,
@@ -104,13 +105,48 @@
             quantizer_propagation_rule=quantizer_propagation_rule,
         )

+    def set_ignored_scope(
+        self,
+        names: Optional[List[str]] = None,
+        patterns: Optional[List[str]] = None,
+        types: Optional[List[str]] = None,
+        subgraphs: Optional[List[Tuple[List[str], List[str]]]] = None,
+        validate: bool = True,
+    ) -> None:
+        """
+        Provides an option to specify portions of the model to be excluded from compression.
+        The ignored scope defines model sub-graphs that should be excluded from the quantization process.
+
+        :param names: List of ignored node names.
+        :param patterns: List of regular expressions that define patterns for names of ignored nodes.
+        :param types: List of ignored operation types.
+        :param subgraphs: List of ignored subgraphs.
+        :param validate: If set to True, then a RuntimeError will be raised if any ignored scope does not match
+            in the model graph.
+        """
+        self._min_max_algo.set_ignored_scope(
+            nncf.IgnoredScope(
+                names=names or [],
+                patterns=patterns or [],
+                types=types or [],
+                subgraphs=subgraphs or [],
+                validate=validate,
+            )
+        )
+
     def get_nncf_quantization_setup(
         self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
     ) -> SingleConfigQuantizerSetup:
         self._min_max_algo._set_backend_entity(model)
         return self._min_max_algo.find_quantization_setup(model, nncf_graph)

     def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """
+        Adds quantization annotations to the nodes in the model graph in-place.
+
+        :param model: A torch.fx.GraphModule to annotate.
+        :return: The torch.fx.GraphModule with updated annotations.
+        """
         nncf_graph = GraphConverter.create_nncf_graph(model)
         quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)

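The new set_ignored_scope helper forwards to nncf.IgnoredScope; a short usage sketch (node names, patterns, and types below are hypothetical placeholders):

```python
from nncf.experimental.torch.fx import OpenVINOQuantizer

quantizer = OpenVINOQuantizer()

# Exclude nodes by exact name, regex pattern, or operation type; all
# values here are hypothetical placeholders for a real model graph.
quantizer.set_ignored_scope(
    names=["linear_3"],
    patterns=[".*head.*"],
    types=["softmax"],
    validate=False,  # don't raise if a rule matches nothing in the graph
)
```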
@@ -305,8 +341,26 @@ def _get_torch_ao_qspec_from_qp(qp: QuantizationPointBase) -> TorchAOQuantizatio
         )

     def validate(self, model: torch.fx.GraphModule) -> None:
+        """
+        Validates the annotated model before the insertion of FakeQuantizers / observers.
+
+        :param model: Annotated torch.fx.GraphModule to validate after the annotation.
+        """
         pass

     def transform_for_annotation(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """
+        Allows user-defined transforms to run before annotating the graph.
+        This lets the quantizer quantize parts of the model that are otherwise not quantizable.
+        For example, the quantizer can
+        a) decompose a compound operator like scaled dot product attention
+           into bmm and softmax if the quantizer knows how to quantize bmm/softmax but not sdpa,
+        or b) transform scalars to tensors to allow quantizing scalars.
+
+        Note: this is an optional method.
+
+        :param model: Given torch.fx.GraphModule to transform before the annotation.
+        :return: The transformed torch.fx.GraphModule ready for the annotation.
+        """
         fold_constant_except_qdq(model)
         return model
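Because OpenVINOQuantizer subclasses the torch.ao Quantizer (annotate(), validate(), and transform_for_annotation() above are the hooks that flow invokes), it also plugs into the stock PT2E workflow. A hedged sketch with illustrative constructor arguments and a placeholder model; the exact graph-capture API varies across torch releases:

```python
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

from nncf import QuantizationPreset, TargetDevice
from nncf.experimental.torch.fx import OpenVINOQuantizer

# Placeholder model and input; any PT2-exportable module would do.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())
example_input = torch.randn(1, 8)

# Graph capture; torch.export.export(...).module() is one option,
# depending on the torch release in use.
captured = torch.export.export(model, (example_input,)).module()

# Illustrative arguments only; defaults are documented in the class docstring.
quantizer = OpenVINOQuantizer(
    preset=QuantizationPreset.MIXED,  # symmetric weights, asymmetric activations
    target_device=TargetDevice.CPU,
)

prepared = prepare_pt2e(captured, quantizer)
prepared(example_input)   # calibration pass(es) over representative data
quantized = convert_pt2e(prepared)
```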
