 from torch.ao.quantization.quantizer.quantizer import SharedQuantizationSpec as TorchAOSharedQuantizationSpec

 import nncf
+from nncf import IgnoredScope
+from nncf import ModelType
+from nncf import OverflowFix
+from nncf import QuantizationMode
+from nncf import QuantizationPreset
+from nncf import TargetDevice
 from nncf.common.graph.graph import NNCFGraph
 from nncf.common.logging import nncf_logger
-from nncf.common.quantization.quantizer_propagation.solver import QuantizerPropagationRule
+from nncf.common.quantization.quantizer_propagation.structs import QuantizerPropagationRule
 from nncf.common.quantization.quantizer_setup import QuantizationPointBase
 from nncf.common.quantization.quantizer_setup import SingleConfigQuantizerSetup
-from nncf.common.quantization.structs import QuantizationPreset
 from nncf.common.quantization.structs import QuantizationScheme
+from nncf.common.utils.api_marker import api
 from nncf.experimental.torch.fx.nncf_graph_builder import GraphConverter
 from nncf.experimental.torch.fx.node_utils import get_graph_node_by_name
 from nncf.experimental.torch.fx.transformations import fold_constant_except_qdq
-from nncf.parameters import ModelType
-from nncf.parameters import QuantizationMode
-from nncf.parameters import TargetDevice
 from nncf.quantization.advanced_parameters import FP8QuantizationParameters
-from nncf.quantization.advanced_parameters import OverflowFix
 from nncf.quantization.advanced_parameters import QuantizationParameters
 from nncf.quantization.algorithms.min_max.algorithm import MinMaxQuantization
-from nncf.scopes import IgnoredScope
 from nncf.torch.model_graph_manager import get_weight_tensor_port_ids

 QUANT_ANNOTATION_KEY = "quantization_annotation"


+@api(canonical_alias="nncf.experimental.torch.fx.OpenVINOQuantizer")
 class OpenVINOQuantizer(TorchAOQuantizer):
     """
     Implementation of the Torch AO quantizer which annotates models with quantization annotations
     optimally for the inference via OpenVINO.
+
+    :param mode: Defines optimization mode for the algorithm. None by default.
+    :param preset: A preset that controls the quantization mode (symmetric and asymmetric).
+        It can take the following values:
+        - `performance`: Symmetric quantization of weights and activations.
+        - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
+        Default value is None. In this case, the `mixed` preset is used for the `transformer`
+        model type, otherwise `performance`.
+    :param target_device: A target device, the specificity of which will be taken
+        into account while compressing in order to obtain the best performance
+        for this type of device. Defaults to TargetDevice.ANY.
+    :param model_type: Model type is needed to specify additional patterns
+        in the model. Only `transformer` is supported now.
+    :param ignored_scope: An ignored scope that defines the list of model control
+        flow graph nodes to be ignored during quantization.
+    :param overflow_fix: This option controls whether to apply the overflow issue
+        fix for the 8-bit quantization.
+    :param quantize_outputs: Whether to insert additional quantizers right before
+        each of the model outputs.
+    :param activations_quantization_params: Quantization parameters for model
+        activations.
+    :param weights_quantization_params: Quantization parameters for model weights.
+    :param quantizer_propagation_rule: The strategy to be used while propagating and merging quantizers.
+        MERGE_ALL_IN_ONE by default.
     """

     def __init__(
@@ -66,31 +92,6 @@ def __init__(
         weights_quantization_params: Optional[Union[QuantizationParameters, FP8QuantizationParameters]] = None,
         quantizer_propagation_rule: QuantizerPropagationRule = QuantizerPropagationRule.MERGE_ALL_IN_ONE,
     ):
-        """
-        :param mode: Defines optimization mode for the algorithm. None by default.
-        :param preset: A preset controls the quantization mode (symmetric and asymmetric).
-            It can take the following values:
-            - `performance`: Symmetric quantization of weights and activations.
-            - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
-            Default value is None. In this case, `mixed` preset is used for `transformer`
-            model type otherwise `performance`.
-        :param target_device: A target device the specificity of which will be taken
-            into account while compressing in order to obtain the best performance
-            for this type of device, defaults to TargetDevice.ANY.
-        :param model_type: Model type is needed to specify additional patterns
-            in the model. Supported only `transformer` now.
-        :param ignored_scope: An ignored scope that defined the list of model control
-            flow graph nodes to be ignored during quantization.
-        :param overflow_fix: This option controls whether to apply the overflow issue
-            fix for the 8-bit quantization.
-        :param quantize_outputs: Whether to insert additional quantizers right before
-            each of the model outputs.
-        :param activations_quantization_params: Quantization parameters for model
-            activations.
-        :param weights_quantization_params: Quantization parameters for model weights.
-        :param quantizer_propagation_rule: The strategy to be used while propagating and merging quantizers.
-            MERGE_ALL_IN_ONE by default.
-        """
         self._min_max_algo = MinMaxQuantization(
             mode=mode,
             preset=preset,
@@ -104,13 +105,48 @@ def __init__(
             quantizer_propagation_rule=quantizer_propagation_rule,
         )

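For illustration, a minimal construction sketch (the class is importable via the canonical alias registered by the decorator above; the preset, model type, and device values are just one reasonable combination, not defaults):

```python
from nncf import ModelType, QuantizationPreset, TargetDevice
from nncf.experimental.torch.fx import OpenVINOQuantizer

# Symmetric weight quantization with asymmetric activations, plus the
# transformer-specific quantization patterns; every argument is optional.
quantizer = OpenVINOQuantizer(
    preset=QuantizationPreset.MIXED,
    model_type=ModelType.TRANSFORMER,
    target_device=TargetDevice.ANY,
)
```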
+    def set_ignored_scope(
+        self,
+        names: Optional[List[str]] = None,
+        patterns: Optional[List[str]] = None,
+        types: Optional[List[str]] = None,
+        subgraphs: Optional[List[Tuple[List[str], List[str]]]] = None,
+        validate: bool = True,
+    ) -> None:
+        """
+        Provides an option to specify portions of the model to be excluded from compression.
+        The ignored scope defines model sub-graphs that should be excluded from the quantization process.
+
+        :param names: List of ignored node names.
+        :param patterns: List of regular expressions that define patterns for names of ignored nodes.
+        :param types: List of ignored operation types.
+        :param subgraphs: List of ignored subgraphs.
+        :param validate: If set to True, a RuntimeError will be raised if any ignored scope entry
+            does not match anything in the model graph.
+        """
+        self._min_max_algo.set_ignored_scope(
+            nncf.IgnoredScope(
+                names=names or [],
+                patterns=patterns or [],
+                types=types or [],
+                subgraphs=subgraphs or [],
+                validate=validate,
+            )
+        )
+
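A usage sketch for the method above; the node name and regular expression are hypothetical and should be replaced with names from your own model graph:

```python
# Exclude one named node, every node inside attention blocks, and all softmax
# ops from quantization; validate=True raises if an entry matches nothing.
quantizer.set_ignored_scope(
    names=["linear_out"],        # hypothetical node name
    patterns=[".*attention.*"],
    types=["softmax"],
    validate=True,
)
```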
     def get_nncf_quantization_setup(
         self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
     ) -> SingleConfigQuantizerSetup:
         self._min_max_algo._set_backend_entity(model)
         return self._min_max_algo.find_quantization_setup(model, nncf_graph)

     def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """
+        Adds quantization annotations to the nodes in the model graph in-place.
+
+        :param model: A torch.fx.GraphModule to annotate.
+        :return: The torch.fx.GraphModule with updated annotations.
+        """
         nncf_graph = GraphConverter.create_nncf_graph(model)
         quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)

@@ -305,8 +341,26 @@ def _get_torch_ao_qspec_from_qp(qp: QuantizationPointBase) -> TorchAOQuantizatio
         )

     def validate(self, model: torch.fx.GraphModule) -> None:
+        """
+        Validates the annotated model before the insertion of FakeQuantizers / observers.
+
+        :param model: Annotated torch.fx.GraphModule to validate after the annotation.
+        """
         pass

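Taken together, `annotate`, `validate`, and the `transform_for_annotation` hook shown next are the methods that TorchAO's PT2E machinery drives. A hedged end-to-end sketch, reusing the `quantizer` built earlier (this assumes a PyTorch version that provides `torch.export.export_for_training`; the toy model and shapes are illustrative):

```python
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 8)

    def forward(self, x):
        return torch.nn.functional.relu(self.linear(x))

example_input = torch.randn(1, 16)
exported = torch.export.export_for_training(TinyModel(), (example_input,)).module()

prepared = prepare_pt2e(exported, quantizer)  # annotates via OpenVINOQuantizer
prepared(example_input)                       # one calibration pass
quantized = convert_pt2e(prepared)            # materialize quantize/dequantize ops
```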
     def transform_for_annotation(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """
+        Allows user-defined transforms to run before the graph is annotated.
+        This allows the quantizer to quantize parts of the model that are otherwise not quantizable.
+        For example, the quantizer can
+        a) decompose a compound operator like scaled dot product attention
+        into bmm and softmax if it knows how to quantize bmm/softmax but not sdpa,
+        or b) transform scalars to tensors to allow quantizing scalars.
+
+        Note: this is an optional method.
+
+        :param model: Given torch.fx.GraphModule to transform before the annotation.
+        :return: The transformed torch.fx.GraphModule ready for the annotation.
+        """
         fold_constant_except_qdq(model)
         return model
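And a minimal sketch of the hook order that `prepare_pt2e` follows for any TorchAO quantizer, reusing `exported` and `quantizer` from the sketch above; calling the hooks by hand like this is purely illustrative:

```python
# The three Quantizer hooks in call order: for OpenVINOQuantizer the first
# folds constants (except Q/DQ subgraphs), the last is currently a no-op.
model = quantizer.transform_for_annotation(exported)
model = quantizer.annotate(model)
quantizer.validate(model)
```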