
Commit 384dda5

Deprecate onnx/ort model export and quantization (#795)
* deprecate onnx/ort model export and quantization
* fix
1 parent 92fe39f commit 384dda5

3 files changed: +37 −15 lines changed


optimum/intel/neural_compressor/configuration.py

Lines changed: 6 additions & 0 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 from typing import Dict, Optional, Union
 
 from neural_compressor.config import DistillationConfig, WeightPruningConfig, _BaseQuantizationConfig
@@ -28,6 +29,8 @@
     "post_training_weight_only": "weight_only",
 }
 
+logger = logging.getLogger(__name__)
+
 
 class INCConfig(BaseConfig):
     CONFIG_NAME = "inc_config.json"
@@ -49,6 +52,9 @@ def __init__(
         self.distillation = self._create_distillation_config(distillation) or {}
         self.save_onnx_model = save_onnx_model
 
+        if self.save_onnx_model:
+            logger.warning("ONNX model saving is deprecated and will be removed soon.")
+
     @staticmethod
     def _create_quantization_config(config: Union[Dict, _BaseQuantizationConfig]):
         # TODO : add activations_dtype and weights_dtype
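The practical effect on the configuration side is a warning at construction time. A minimal sketch, not part of the commit, assuming an environment with this change installed and the other INCConfig arguments left at their defaults:

import logging

from optimum.intel import INCConfig

logging.basicConfig(level=logging.WARNING)

# Logs "ONNX model saving is deprecated and will be removed soon."
inc_config = INCConfig(save_onnx_model=True)

# Leaving the flag at its default keeps construction silent.
inc_config = INCConfig()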

optimum/intel/neural_compressor/quantization.py

Lines changed: 23 additions & 10 deletions
@@ -200,9 +200,15 @@ def quantize(
         use_xpu = device == torch.device("xpu") or device == "xpu"
         calibration_dataloader = None
 
+        if save_onnx_model:
+            logger.warning("ONNX model export is deprecated and will be removed soon.")
+
+        if isinstance(self._original_model, ORTModel):
+            logger.warning("ONNX model quantization is deprecated and will be removed soon.")
+
         if save_onnx_model and isinstance(self._original_model, ORTModel):
+            logger.warning("The model provided is already an ONNX model. Setting `save_onnx_model` to False.")
             save_onnx_model = False
-            logger.warning("Model provided is an ONNX model, `save_onnx_model` is set to False")
 
         default_name = WEIGHTS_NAME if not isinstance(self._original_model, ORTModel) else ONNX_WEIGHTS_NAME
         self._set_task()
@@ -223,13 +229,16 @@ def quantize(
                 f"but only version {IPEX_MINIMUM_VERSION} or higher is supported."
             )
 
-        if save_onnx_model:
-            if (
-                not isinstance(quantization_config, PostTrainingQuantConfig)
-                or INCQuantizationMode(quantization_config.approach) == INCQuantizationMode.DYNAMIC
-            ):
-                logger.warning("ONNX export for dynamic and weight only quantized model is not supported.")
-                save_onnx_model = False
+        if save_onnx_model and (
+            not isinstance(quantization_config, PostTrainingQuantConfig)
+            or INCQuantizationMode(quantization_config.approach) == INCQuantizationMode.DYNAMIC
+        ):
+            logger.warning(
+                "ONNX export for dynamic and weight only quantized model is not supported. "
+                "Only static quantization model can be exported to ONNX format. "
+                "Setting `save_onnx_model` to False."
+            )
+            save_onnx_model = False
 
         # ITREX Weight Only Quantization
         if not isinstance(quantization_config, PostTrainingQuantConfig):
@@ -296,9 +305,13 @@ def quantize(
                 remove_unused_columns=remove_unused_columns,
                 data_collator=data_collator,
             )
+
             op_type_dict = getattr(quantization_config, "op_type_dict", None)
-            if op_type_dict is None or "Embedding" not in op_type_dict:
-                logger.warning("ONNX export is no supported for model with quantized embeddings")
+            if save_onnx_model and (op_type_dict is None or "Embedding" not in op_type_dict):
+                logger.warning(
+                    "ONNX export is no supported for model with quantized embeddings. "
+                    "Setting `save_onnx_model` to False."
+                )
                 save_onnx_model = False
 
         if not isinstance(quantization_config, PostTrainingQuantConfig):
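A usage sketch of the quantizer-side behavior (the model checkpoint and arguments are illustrative, not taken from the commit): with a dynamic post-training configuration, `save_onnx_model=True` now triggers the deprecation warning and is reset to False, so only the quantized PyTorch model is written to the save directory.

from neural_compressor.config import PostTrainingQuantConfig
from transformers import AutoModelForSequenceClassification

from optimum.intel import INCQuantizer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
quantizer = INCQuantizer.from_pretrained(model)

quantizer.quantize(
    quantization_config=PostTrainingQuantConfig(approach="dynamic"),
    save_directory="quantized_model",
    save_onnx_model=True,  # deprecated: a warning is logged and the flag is forced back to False
)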

optimum/intel/neural_compressor/trainer.py

Lines changed: 8 additions & 5 deletions
@@ -175,6 +175,9 @@ def __init__(
         # TODO : To deprecate once support transformers > 4.30.0
         self.deepspeed = None
 
+        if save_onnx_model:
+            logger.warning("ONNX model saving is deprecated and will be removed soon.")
+
         # Attach dtype and architecture to the config
         if quantization_config is not None:
             self.dtype = "int8"
@@ -678,15 +681,12 @@ def _inner_training_loop(
     def save_model(
         self,
         output_dir: Optional[str] = None,
-        _internal_call: bool = False,
-        save_onnx_model: Optional[bool] = None,
+        save_onnx_model: bool = False,
     ):
         """
         Will save the model, so you can reload it using `from_pretrained()`.
         Will only save from the main process.
         """
-        save_onnx_model = save_onnx_model if save_onnx_model is not None else self.save_onnx_model
-
         if output_dir is None:
             output_dir = self.args.output_dir
 
@@ -734,7 +734,10 @@ def _save(
 
         # Disable ONNX export for quantized model as deprecated in neural-compressor>=2.2.0
         if save_onnx_model and self.dtype == "int8":
-            logger.warning("ONNX export for quantized model is no longer supported by neural-compressor>=2.2.0. ")
+            logger.warning(
+                "ONNX export for quantized model is no longer supported by neural-compressor>=2.2.0. "
+                "Setting `save_onnx_model` to False."
+            )
             save_onnx_model = False
 
         # Export the compressed model to the ONNX format
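On the trainer side, `save_model` no longer accepts `_internal_call` and no longer falls back to the `save_onnx_model` value captured at construction time, so ONNX export has to be requested explicitly per call. A hedged sketch, assuming `trainer` is an already constructed INCTrainer with a quantization config:

# Saves the PyTorch model only; ONNX export is now opt-in per call.
trainer.save_model("int8_model")

# Requests ONNX export explicitly; for an int8-quantized model this logs
# "ONNX export for quantized model is no longer supported by neural-compressor>=2.2.0. ..."
# and the export is skipped.
trainer.save_model("int8_model", save_onnx_model=True)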
