 from torch.ao.quantization.quantizer.quantizer import SharedQuantizationSpec as TorchAOSharedQuantizationSpec

 import nncf
+from nncf import IgnoredScope
+from nncf import ModelType
+from nncf import OverflowFix
+from nncf import QuantizationMode
+from nncf import QuantizationPreset
+from nncf import TargetDevice
 from nncf.common.graph.graph import NNCFGraph
 from nncf.common.logging import nncf_logger
-from nncf.common.quantization.quantizer_propagation.solver import QuantizerPropagationRule
+from nncf.common.quantization.quantizer_propagation.structs import QuantizerPropagationRule
 from nncf.common.quantization.quantizer_setup import QuantizationPointBase
 from nncf.common.quantization.quantizer_setup import SingleConfigQuantizerSetup
-from nncf.common.quantization.structs import QuantizationPreset
 from nncf.common.quantization.structs import QuantizationScheme
+from nncf.common.utils.api_marker import api
 from nncf.experimental.torch.fx.nncf_graph_builder import GraphConverter
 from nncf.experimental.torch.fx.node_utils import get_graph_node_by_name
 from nncf.experimental.torch.fx.transformations import fold_constant_except_qdq
-from nncf.parameters import ModelType
-from nncf.parameters import QuantizationMode
-from nncf.parameters import TargetDevice
 from nncf.quantization.advanced_parameters import FP8QuantizationParameters
-from nncf.quantization.advanced_parameters import OverflowFix
 from nncf.quantization.advanced_parameters import QuantizationParameters
 from nncf.quantization.algorithms.min_max.algorithm import MinMaxQuantization
-from nncf.scopes import IgnoredScope
 from nncf.torch.model_graph_manager import get_weight_tensor_port_ids

 QUANT_ANNOTATION_KEY = "quantization_annotation"


+@api(canonical_alias="nncf.experimental.torch.fx.OpenVINOQuantizer")
 class OpenVINOQuantizer(TorchAOQuantizer):
     """
     Implementation of the Torch AO quantizer which annotates models with quantization annotations
     optimally for the inference via OpenVINO.
+
+    :param mode: Defines optimization mode for the algorithm. None by default.
+    :param preset: A preset that controls the quantization mode (symmetric and asymmetric).
+        It can take the following values:
+        - `performance`: Symmetric quantization of weights and activations.
+        - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
+        Default value is None. In this case, the `mixed` preset is used for the `transformer`
+        model type, otherwise `performance`.
+    :param target_device: A target device, the specificity of which will be taken
+        into account while compressing in order to obtain the best performance
+        for this type of device. Defaults to TargetDevice.ANY.
+    :param model_type: Model type is needed to specify additional patterns
+        in the model. Only `transformer` is supported now.
+    :param ignored_scope: An ignored scope that defines the list of model control
+        flow graph nodes to be ignored during quantization.
+    :param overflow_fix: This option controls whether to apply the overflow issue
+        fix for the 8-bit quantization.
+    :param quantize_outputs: Whether to insert additional quantizers right before
+        each of the model outputs.
+    :param activations_quantization_params: Quantization parameters for model
+        activations.
+    :param weights_quantization_params: Quantization parameters for model weights.
+    :param quantizer_propagation_rule: The strategy to be used while propagating and merging quantizers.
+        MERGE_ALL_IN_ONE by default.
     """

     def __init__(
@@ -66,31 +92,6 @@ def __init__(
         weights_quantization_params: Optional[Union[QuantizationParameters, FP8QuantizationParameters]] = None,
         quantizer_propagation_rule: QuantizerPropagationRule = QuantizerPropagationRule.MERGE_ALL_IN_ONE,
     ):
-        """
-        :param mode: Defines optimization mode for the algorithm. None by default.
-        :param preset: A preset controls the quantization mode (symmetric and asymmetric).
-            It can take the following values:
-            - `performance`: Symmetric quantization of weights and activations.
-            - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
-            Default value is None. In this case, `mixed` preset is used for `transformer`
-            model type otherwise `performance`.
-        :param target_device: A target device the specificity of which will be taken
-            into account while compressing in order to obtain the best performance
-            for this type of device, defaults to TargetDevice.ANY.
-        :param model_type: Model type is needed to specify additional patterns
-            in the model. Supported only `transformer` now.
-        :param ignored_scope: An ignored scope that defined the list of model control
-            flow graph nodes to be ignored during quantization.
-        :param overflow_fix: This option controls whether to apply the overflow issue
-            fix for the 8-bit quantization.
-        :param quantize_outputs: Whether to insert additional quantizers right before
-            each of the model outputs.
-        :param activations_quantization_params: Quantization parameters for model
-            activations.
-        :param weights_quantization_params: Quantization parameters for model weights.
-        :param quantizer_propagation_rule: The strategy to be used while propagating and merging quantizers.
-            MERGE_ALL_IN_ONE by default.
-        """
         self._min_max_algo = MinMaxQuantization(
             mode=mode,
             preset=preset,
@@ -104,13 +105,48 @@ def __init__(
             quantizer_propagation_rule=quantizer_propagation_rule,
         )

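For illustration, a minimal construction sketch (the class is importable via the canonical alias registered by the decorator above; the preset, model type, and device values are just one reasonable combination, not defaults):

```python
from nncf import ModelType, QuantizationPreset, TargetDevice
from nncf.experimental.torch.fx import OpenVINOQuantizer

# Symmetric weight quantization with asymmetric activations, plus the
# transformer-specific quantization patterns; every argument is optional.
quantizer = OpenVINOQuantizer(
    preset=QuantizationPreset.MIXED,
    model_type=ModelType.TRANSFORMER,
    target_device=TargetDevice.ANY,
)
```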
+    def set_ignored_scope(
+        self,
+        names: Optional[List[str]] = None,
+        patterns: Optional[List[str]] = None,
+        types: Optional[List[str]] = None,
+        subgraphs: Optional[List[Tuple[List[str], List[str]]]] = None,
+        validate: bool = True,
+    ) -> None:
+        """
+        Provides an option to specify portions of the model to be excluded from compression.
+        The ignored scope defines model sub-graphs that should be excluded from the quantization process.
+
+        :param names: List of ignored node names.
+        :param patterns: List of regular expressions that define patterns for names of ignored nodes.
+        :param types: List of ignored operation types.
+        :param subgraphs: List of ignored subgraphs.
+        :param validate: If set to True, a RuntimeError will be raised if any ignored scope entry
+            does not match anything in the model graph.
+        """
+        self._min_max_algo.set_ignored_scope(
+            nncf.IgnoredScope(
+                names=names or [],
+                patterns=patterns or [],
+                types=types or [],
+                subgraphs=subgraphs or [],
+                validate=validate,
+            )
+        )
+
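A usage sketch for the method above; the node name and regular expression are hypothetical and should be replaced with names from your own model graph:

```python
# Exclude one named node, every node inside attention blocks, and all softmax
# ops from quantization; validate=True raises if an entry matches nothing.
quantizer.set_ignored_scope(
    names=["linear_out"],        # hypothetical node name
    patterns=[".*attention.*"],
    types=["softmax"],
    validate=True,
)
```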
     def get_nncf_quantization_setup(
         self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
     ) -> SingleConfigQuantizerSetup:
         self._min_max_algo._set_backend_entity(model)
         return self._min_max_algo.find_quantization_setup(model, nncf_graph)

     def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """
+        Adds quantization annotations to the nodes in the model graph in-place.
+
+        :param model: A torch.fx.GraphModule to annotate.
+        :return: The torch.fx.GraphModule with updated annotations.
+        """
         nncf_graph = GraphConverter.create_nncf_graph(model)
         quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)

@@ -305,8 +341,26 @@ def _get_torch_ao_qspec_from_qp(qp: QuantizationPointBase) -> TorchAOQuantizatio
         )

     def validate(self, model: torch.fx.GraphModule) -> None:
+        """
+        Validates the annotated model before the insertion of FakeQuantizers / observers.
+
+        :param model: Annotated torch.fx.GraphModule to validate after the annotation.
+        """
         pass

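Taken together, `annotate`, `validate`, and the `transform_for_annotation` hook shown next are the methods that TorchAO's PT2E machinery drives. A hedged end-to-end sketch, reusing the `quantizer` built earlier (this assumes a PyTorch version that provides `torch.export.export_for_training`; the toy model and shapes are illustrative):

```python
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 8)

    def forward(self, x):
        return torch.nn.functional.relu(self.linear(x))

example_input = torch.randn(1, 16)
exported = torch.export.export_for_training(TinyModel(), (example_input,)).module()

prepared = prepare_pt2e(exported, quantizer)  # annotates via OpenVINOQuantizer
prepared(example_input)                       # one calibration pass
quantized = convert_pt2e(prepared)            # materialize quantize/dequantize ops
```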
     def transform_for_annotation(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """
+        Allows user-defined transforms to run before the graph is annotated.
+        This allows the quantizer to quantize parts of the model that are otherwise not quantizable.
+        For example, the quantizer can
+        a) decompose a compound operator like scaled dot product attention
+        into bmm and softmax if it knows how to quantize bmm/softmax but not sdpa,
+        or b) transform scalars to tensors to allow quantizing scalars.
+
+        Note: this is an optional method.
+
+        :param model: Given torch.fx.GraphModule to transform before the annotation.
+        :return: The transformed torch.fx.GraphModule ready for the annotation.
+        """
         fold_constant_except_qdq(model)
         return model
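And a minimal sketch of the hook order that `prepare_pt2e` follows for any TorchAO quantizer, reusing `exported` and `quantizer` from the sketch above; calling the hooks by hand like this is purely illustrative:

```python
# The three Quantizer hooks in call order: for OpenVINOQuantizer the first
# folds constants (except Q/DQ subgraphs), the last is currently a no-op.
model = quantizer.transform_for_annotation(exported)
model = quantizer.annotate(model)
quantizer.validate(model)
```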