from torch.ao.quantization.quantizer.quantizer import SharedQuantizationSpec as TorchAOSharedQuantizationSpec

import nncf
+from nncf import IgnoredScope
+from nncf import ModelType
+from nncf import OverflowFix
+from nncf import QuantizationMode
+from nncf import QuantizationPreset
+from nncf import TargetDevice
from nncf.common.graph.graph import NNCFGraph
from nncf.common.logging import nncf_logger
-from nncf.common.quantization.quantizer_propagation.solver import QuantizerPropagationRule
+from nncf.common.quantization.quantizer_propagation.structs import QuantizerPropagationRule
from nncf.common.quantization.quantizer_setup import QuantizationPointBase
from nncf.common.quantization.quantizer_setup import SingleConfigQuantizerSetup
-from nncf.common.quantization.structs import QuantizationPreset
from nncf.common.quantization.structs import QuantizationScheme
+from nncf.common.utils.api_marker import api
from nncf.experimental.torch.fx.nncf_graph_builder import GraphConverter
from nncf.experimental.torch.fx.node_utils import get_graph_node_by_name
from nncf.experimental.torch.fx.transformations import fold_constant_except_qdq
-from nncf.parameters import ModelType
-from nncf.parameters import QuantizationMode
-from nncf.parameters import TargetDevice
from nncf.quantization.advanced_parameters import FP8QuantizationParameters
-from nncf.quantization.advanced_parameters import OverflowFix
from nncf.quantization.advanced_parameters import QuantizationParameters
from nncf.quantization.algorithms.min_max.algorithm import MinMaxQuantization
-from nncf.scopes import IgnoredScope
from nncf.torch.model_graph_manager import get_weight_tensor_port_ids


QUANT_ANNOTATION_KEY = "quantization_annotation"


+@api(canonical_alias="nncf.experimental.torch.fx.OpenVINOQuantizer")
class OpenVINOQuantizer(TorchAOQuantizer):
    """
    Implementation of the Torch AO quantizer which annotates models with quantization annotations
    optimally for the inference via OpenVINO.
+
+    :param mode: Defines optimization mode for the algorithm. None by default.
+    :param preset: A preset controls the quantization mode (symmetric and asymmetric).
+        It can take the following values:
+        - `performance`: Symmetric quantization of weights and activations.
+        - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
+        Default value is None; in this case, the `mixed` preset is used for the `transformer`
+        model type, otherwise `performance`.
+    :param target_device: A target device, the specificity of which will be taken
+        into account while compressing in order to obtain the best performance
+        for this type of device. Defaults to TargetDevice.ANY.
+    :param model_type: Model type is needed to specify additional patterns
+        in the model. Only `transformer` is supported now.
+    :param ignored_scope: An ignored scope that defines the list of model control
+        flow graph nodes to be ignored during quantization.
+    :param overflow_fix: This option controls whether to apply the overflow issue
+        fix for the 8-bit quantization.
+    :param quantize_outputs: Whether to insert additional quantizers right before
+        each of the model outputs.
+    :param activations_quantization_params: Quantization parameters for model
+        activations.
+    :param weights_quantization_params: Quantization parameters for model weights.
+    :param quantizer_propagation_rule: The strategy to be used while propagating and merging quantizers.
+        MERGE_ALL_IN_ONE by default.
    """

    def __init__(
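For orientation, the sketch below shows roughly how this quantizer is meant to plug into the Torch AO PT2E pipeline. It is not part of the diff: the `MyModel` module and input shape are placeholders, and the `nncf.experimental.torch.fx` import path is inferred from the `canonical_alias` above.

```python
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

from nncf.experimental.torch.fx import OpenVINOQuantizer  # path assumed from canonical_alias

model = MyModel().eval()                         # MyModel is a placeholder nn.Module
example_inputs = (torch.randn(1, 3, 224, 224),)  # placeholder input

# Defaults: TargetDevice.ANY and the MERGE_ALL_IN_ONE propagation rule.
quantizer = OpenVINOQuantizer()

exported = torch.export.export_for_training(model, example_inputs).module()
prepared = prepare_pt2e(exported, quantizer)  # invokes quantizer.annotate() internally
prepared(*example_inputs)                     # calibration forward pass
quantized = convert_pt2e(prepared)            # folds observers into quantize/dequantize nodes
```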
@@ -66,31 +92,6 @@ def __init__(
        weights_quantization_params: Optional[Union[QuantizationParameters, FP8QuantizationParameters]] = None,
        quantizer_propagation_rule: QuantizerPropagationRule = QuantizerPropagationRule.MERGE_ALL_IN_ONE,
    ):
-        """
-        :param mode: Defines optimization mode for the algorithm. None by default.
-        :param preset: A preset controls the quantization mode (symmetric and asymmetric).
-            It can take the following values:
-            - `performance`: Symmetric quantization of weights and activations.
-            - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
-            Default value is None. In this case, `mixed` preset is used for `transformer`
-            model type otherwise `performance`.
-        :param target_device: A target device the specificity of which will be taken
-            into account while compressing in order to obtain the best performance
-            for this type of device, defaults to TargetDevice.ANY.
-        :param model_type: Model type is needed to specify additional patterns
-            in the model. Supported only `transformer` now.
-        :param ignored_scope: An ignored scope that defined the list of model control
-            flow graph nodes to be ignored during quantization.
-        :param overflow_fix: This option controls whether to apply the overflow issue
-            fix for the 8-bit quantization.
-        :param quantize_outputs: Whether to insert additional quantizers right before
-            each of the model outputs.
-        :param activations_quantization_params: Quantization parameters for model
-            activations.
-        :param weights_quantization_params: Quantization parameters for model weights.
-        :param quantizer_propagation_rule: The strategy to be used while propagating and merging quantizers.
-            MERGE_ALL_IN_ONE by default.
-        """
        self._min_max_algo = MinMaxQuantization(
            mode=mode,
            preset=preset,
@@ -104,13 +105,48 @@ def __init__(
            quantizer_propagation_rule=quantizer_propagation_rule,
        )

+    def set_ignored_scope(
+        self,
+        names: Optional[List[str]] = None,
+        patterns: Optional[List[str]] = None,
+        types: Optional[List[str]] = None,
+        subgraphs: Optional[List[Tuple[List[str], List[str]]]] = None,
+        validate: bool = True,
+    ) -> None:
+        """
+        Provides an option to specify portions of the model to be excluded from compression.
+        The ignored scope defines model sub-graphs that should be excluded from the quantization process.
+
+        :param names: List of ignored node names.
+        :param patterns: List of regular expressions that define patterns for names of ignored nodes.
+        :param types: List of ignored operation types.
+        :param subgraphs: List of ignored subgraphs.
+        :param validate: If set to True, a RuntimeError will be raised if any ignored scope does not match
+            in the model graph.
+        """
+        self._min_max_algo.set_ignored_scope(
+            nncf.IgnoredScope(
+                names=names or [],
+                patterns=patterns or [],
+                types=types or [],
+                subgraphs=subgraphs or [],
+                validate=validate,
+            )
+        )
+
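A brief usage sketch for the new `set_ignored_scope` helper; the names, patterns, and types below are purely illustrative.

```python
quantizer = OpenVINOQuantizer()

# Exclude nodes by exact name, regex over node names, or operation type.
# With validate=True, a rule that matches nothing raises a RuntimeError.
quantizer.set_ignored_scope(
    names=["conv_1"],        # illustrative node name
    patterns=[".*head.*"],   # illustrative regular expression
    types=["mul"],           # illustrative operation type
    validate=True,
)
```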
    def get_nncf_quantization_setup(
        self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
    ) -> SingleConfigQuantizerSetup:
        self._min_max_algo._set_backend_entity(model)
        return self._min_max_algo.find_quantization_setup(model, nncf_graph)

    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """
+        Adds quantization annotations to the nodes in the model graph in-place.
+
+        :param model: A torch.fx.GraphModule to annotate.
+        :return: The torch.fx.GraphModule with updated annotations.
+        """
        nncf_graph = GraphConverter.create_nncf_graph(model)
        quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
@@ -305,8 +341,26 @@ def _get_torch_ao_qspec_from_qp(qp: QuantizationPointBase) -> TorchAOQuantizatio
        )

    def validate(self, model: torch.fx.GraphModule) -> None:
+        """
+        Validates the annotated model before the insertion of FakeQuantizers / observers.
+
+        :param model: Annotated torch.fx.GraphModule to validate after the annotation.
+        """
        pass

    def transform_for_annotation(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """
+        Allows for user-defined transforms to run before annotating the graph.
+        This allows the quantizer to quantize parts of the model that are otherwise not quantizable.
+        For example, the quantizer can
+        a) decompose a compound operator like scaled dot product attention
+        into bmm and softmax, if the quantizer knows how to quantize bmm/softmax but not sdpa,
+        or b) transform scalars to tensors to allow quantizing scalars.
+
+        Note: this is an optional method.
+
+        :param model: Given torch.fx.GraphModule to transform before the annotation.
+        :return: The transformed torch.fx.GraphModule ready for the annotation.
+        """
        fold_constant_except_qdq(model)
        return model
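Because `transform_for_annotation` is an overridable hook, a subclass can run its own FX passes before annotation while keeping the constant folding performed here. A hypothetical sketch; `decompose_sdpa` stands in for a user-defined pass and is not part of this PR:

```python
import torch


class MyOpenVINOQuantizer(OpenVINOQuantizer):
    def transform_for_annotation(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
        model = decompose_sdpa(model)  # hypothetical user-defined FX pass
        # Keep the base behavior: constant folding except q/dq subgraphs.
        return super().transform_for_annotation(model)
```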