
Commit f485773

nikita-savelyevv authored and mvafin committed
[OV] Add quantization for SegmentAnything model (#1384)
* Constrain datasets package version
* Add support for datasets v4.0
* Update setup.py
* Trigger tests
* WIP
* WIP
* Add calibration dataset collection
* Simplify dataset collection logic
* Add tests
* Collect only for vision encoder
* Add documentation
* Removed helper validation script
* Fix test
* Remove irrelevant commands
* Fix docs
* Fix docs
* Fix docs
* Tweak model names
* Update version
* Trigger tests
1 parent cbc536d commit f485773

File tree

8 files changed: +289 −19 lines changed

docs/source/openvino/optimization.mdx

Lines changed: 53 additions & 0 deletions
@@ -769,6 +769,59 @@ Click on a ✅ to copy the command/code for the corresponding optimization case.
 </button>
 </td>
 </tr>
+<tr>
+<td style="text-align: center; vertical-align: middle;">feature-extraction<br>(OVSamModel)</td>
+<td style="text-align: center; vertical-align: middle;">–</td>
+<td style="text-align: center; vertical-align: middle;">
+<button onclick="
+navigator.clipboard.writeText('OVSamModel.from_pretrained(\'facebook/sam-vit-base\', quantization_config=OVPipelineQuantizationConfig(quantization_configs=dict(vision_encoder=OVWeightQuantizationConfig(bits=8)))).save_pretrained(\'save_dir\')');
+let m=document.getElementById('copyMsg');
+m.style.display='block';
+clearTimeout(window._copyTimeout);
+window._copyTimeout=setTimeout(()=>m.style.display='none', 2000);
+">
+✅
+</button>
+</td>
+<td style="text-align: center; vertical-align: middle;">–</td>
+<td style="text-align: center; vertical-align: middle;">
+<button onclick="
+navigator.clipboard.writeText('OVSamModel.from_pretrained(\'facebook/sam-vit-base\', quantization_config=OVPipelineQuantizationConfig(quantization_configs=dict(vision_encoder=OVWeightQuantizationConfig(bits=4, dataset=\'coco\')))).save_pretrained(\'save_dir\')');
+let m=document.getElementById('copyMsg');
+m.style.display='block';
+clearTimeout(window._copyTimeout);
+window._copyTimeout=setTimeout(()=>m.style.display='none', 2000);
+">
+✅
+</button>
+</td>
+<td style="text-align: center; vertical-align: middle;">–</td>
+<td style="text-align: center; vertical-align: middle;">-</td>
+<td style="text-align: center; vertical-align: middle;">
+<button onclick="
+navigator.clipboard.writeText('optimum-cli export openvino -m facebook/sam-vit-base --quant-mode int8 --dataset coco ./save_dir');
+let m=document.getElementById('copyMsg');
+m.style.display='block';
+clearTimeout(window._copyTimeout);
+window._copyTimeout=setTimeout(()=>m.style.display='none', 2000);
+">
+✅
+</button>
+</td>
+<td style="text-align: center; vertical-align: middle;">
+<button onclick="
+navigator.clipboard.writeText('OVSamModel.from_pretrained(\'facebook/sam-vit-base\', quantization_config=OVPipelineQuantizationConfig(quantization_configs=dict(vision_encoder=OVQuantizationConfig(bits=8, dataset=\'coco\')))).save_pretrained(\'save_dir\')');
+let m=document.getElementById('copyMsg');
+m.style.display='block';
+clearTimeout(window._copyTimeout);
+window._copyTimeout=setTimeout(()=>m.style.display='none', 2000);
+">
+✅
+</button>
+</td>
+<td style="text-align: center; vertical-align: middle;">–</td>
+<td style="text-align: center; vertical-align: middle;">–</td>
+</tr>
 <tr>
 <td style="text-align: center; vertical-align: middle;">text-to-audio<br>(OVModelForTextToSpeechSeq2Seq)</td>
 <td style="text-align: center; vertical-align: middle;">✅</td>
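For readability, here are the Python one-liners embedded in the copy buttons above, unescaped into a plain script. A minimal sketch, assuming `OVSamModel` and the quantization config classes are importable from the `optimum.intel` top level as elsewhere in these docs:

```python
from optimum.intel import (
    OVPipelineQuantizationConfig,
    OVSamModel,
    OVWeightQuantizationConfig,
)

# 8-bit weight-only quantization of the SAM vision encoder (first button above).
# Only the vision_encoder submodel gets a config; the prompt encoder / mask
# decoder is left in full precision.
model = OVSamModel.from_pretrained(
    "facebook/sam-vit-base",
    quantization_config=OVPipelineQuantizationConfig(
        quantization_configs=dict(vision_encoder=OVWeightQuantizationConfig(bits=8))
    ),
)
model.save_pretrained("save_dir")

# The 4-bit variant (second button) additionally needs a calibration dataset:
# OVWeightQuantizationConfig(bits=4, dataset="coco")
```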

optimum/intel/openvino/configuration.py

Lines changed: 37 additions & 0 deletions
@@ -31,6 +31,7 @@
 from .utils import (
     PREDEFINED_CAUSAL_LANGUAGE_DATASETS,
     PREDEFINED_LANGUAGE_DATASETS,
+    PREDEFINED_SAM_DATASETS,
     PREDEFINED_SD_DATASETS,
     PREDEFINED_SPEECH_TO_TEXT_DATASETS,
     PREDEFINED_VISUAL_LM_DATASETS,
@@ -361,6 +362,36 @@ class OVQuantizationMethod(str, Enum):
 
 # Default configs for int8 full quantization
 _DEFAULT_INT8_FQ_CONFIGS = {
+    "facebook/sam-vit-base": {
+        "quantization_configs": {
+            "vision_encoder": {
+                "dtype": "int8",
+                "dataset": "coco",
+                "num_samples": 128,
+                "weight_only": False,
+            },
+        }
+    },
+    "facebook/sam-vit-large": {
+        "quantization_configs": {
+            "vision_encoder": {
+                "dtype": "int8",
+                "dataset": "coco",
+                "num_samples": 128,
+                "weight_only": False,
+            },
+        }
+    },
+    "facebook/sam-vit-huge": {
+        "quantization_configs": {
+            "vision_encoder": {
+                "dtype": "int8",
+                "dataset": "coco",
+                "num_samples": 128,
+                "weight_only": False,
+            },
+        }
+    },
     "google-t5/t5-small": {
         "dtype": "int8",
         "dataset": "wikitext2",
@@ -723,16 +754,19 @@ def post_init(self):
             visual_lm_datasets = set(PREDEFINED_VISUAL_LM_DATASETS.keys())
             stable_diffusion_datasets = set(PREDEFINED_SD_DATASETS.keys())
             language_datasets = set(PREDEFINED_LANGUAGE_DATASETS.keys())
+            sam_datasets = set(PREDEFINED_SAM_DATASETS.keys())
             if (
                 self.dataset
                 not in PREDEFINED_CAUSAL_LANGUAGE_DATASETS
                 | language_datasets
                 | visual_lm_datasets
                 | stable_diffusion_datasets
+                | sam_datasets
             ):
                 raise ValueError(
                     "You have entered a string value for dataset. You can only choose between "
                     f"{language_datasets} for text feature extraction models, "
+                    f"{sam_datasets} for SegmentAnything models, "
                     f"{PREDEFINED_CAUSAL_LANGUAGE_DATASETS} for LLMs, {visual_lm_datasets} for visual LLMs or "
                     f"{stable_diffusion_datasets} for diffusion models, but we found {self.dataset}."
                 )
@@ -992,19 +1026,22 @@ def post_init(self):
             visual_lm_datasets = set(PREDEFINED_VISUAL_LM_DATASETS.keys())
             stable_diffusion_datasets = set(PREDEFINED_SD_DATASETS.keys())
             language_datasets = set(PREDEFINED_LANGUAGE_DATASETS.keys())
+            sam_datasets = set(PREDEFINED_SAM_DATASETS.keys())
             if (
                 self.dataset
                 not in PREDEFINED_CAUSAL_LANGUAGE_DATASETS
                 | language_datasets
                 | speech_to_text_datasets
                 | stable_diffusion_datasets
                 | visual_lm_datasets
+                | sam_datasets
             ):
                 raise ValueError(
                     "You can only choose between the following datasets:"
                     f"{language_datasets} for text feature extraction models, "
                     f"{PREDEFINED_CAUSAL_LANGUAGE_DATASETS} for LLMs, "
                     f"{speech_to_text_datasets} for speech-to-text models, "
+                    f"{sam_datasets} for SegmentAnything models, "
                     f"{visual_lm_datasets} for visual LLMs or "
                     f"{stable_diffusion_datasets} for diffusion models, but we found {self.dataset}."
                 )
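The `_DEFAULT_INT8_FQ_CONFIGS` entries above mean that requesting plain int8 full quantization for a SAM checkpoint quantizes only the vision encoder, calibrated on 128 samples from the predefined `coco` dataset. A sketch of the explicit counterpart, assuming the dict keys map onto `OVQuantizationConfig` arguments and that `weight_only=False` is what selects full quantization over weight-only compression:

```python
from optimum.intel import OVPipelineQuantizationConfig, OVQuantizationConfig

# Hypothetical explicit equivalent of the "facebook/sam-vit-base" default above.
# weight_only=False in the dict form presumably selects full (weight + activation)
# quantization, i.e. OVQuantizationConfig rather than OVWeightQuantizationConfig.
config = OVPipelineQuantizationConfig(
    quantization_configs=dict(
        vision_encoder=OVQuantizationConfig(
            dtype="int8",     # full int8 quantization
            dataset="coco",   # one of PREDEFINED_SAM_DATASETS
            num_samples=128,  # calibration subset size
        )
    )
)
```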

optimum/intel/openvino/modeling_sam.py

Lines changed: 57 additions & 16 deletions
@@ -14,6 +14,8 @@
 from transformers.models.sam.modeling_sam import SamImageSegmentationOutput, SamPositionalEmbedding
 
 from ...exporters.openvino.utils import save_config
+from .. import OVConfig
+from .configuration import OVQuantizationConfigBase
 from .modeling_base import OVBaseModel, OVModelPart
 from .utils import (
     ONNX_PROMPT_ENCODER_MASK_DECODER_MODEL_NAME,
@@ -83,19 +85,23 @@ def __init__(
         dynamic_shapes: bool = True,
         ov_config: Optional[Dict[str, str]] = None,
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Union[OVQuantizationConfigBase, Dict] = None,
         **kwargs,
     ):
         self.config = config
         self._model_save_dir = model_save_dir
         self._device = device.upper()
         self.ov_config = {} if ov_config is None else {**ov_config}
         self.preprocessors = kwargs.get("preprocessors", [])
-        self.vision_encoder_model = vision_encoder_model
-        self.prompt_encoder_mask_decoder_model = prompt_encoder_mask_decoder_model
         self._compile_only = kwargs.get("compile_only", False)
+
+        self._openvino_config = None
+        if quantization_config:
+            self._openvino_config = OVConfig(quantization_config=quantization_config)
+
         enable_compilation = kwargs.get("compile", True)
-        self.vision_encoder = OVSamVisionEncoder(self.vision_encoder_model, self)
-        self.prompt_encoder_mask_decoder = OVSamPromptEncoder(self.prompt_encoder_mask_decoder_model, self)
+        self.vision_encoder = OVSamVisionEncoder(vision_encoder_model, self)
+        self.prompt_encoder_mask_decoder = OVSamPromptEncoder(prompt_encoder_mask_decoder_model, self)
 
         if dynamic_shapes and not self.is_dynamic and not self._compile_only:
             self.reshape()
@@ -117,9 +123,8 @@ def clear_requests(self):
             raise ValueError(
                 "`clear_requests()` is not supported with `compile_only` mode, please initialize model without this option"
             )
-
-        for _, component in self.components.items():
-            component.clear_requests()
+        self.vision_encoder.clear_requests()
+        self.prompt_encoder_mask_decoder.clear_requests()
 
     def compile(self):
         self.vision_encoder._compile()
@@ -143,15 +148,16 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
         """
         src_models = self.ov_submodels
         dst_file_names = {
-            "vision_encoder_model": OV_VISION_ENCODER_MODEL_NAME,
-            "prompt_encoder_mask_decoder_model": OV_PROMPT_ENCODER_MASK_DECODER_MODEL_NAME,
+            "vision_encoder": OV_VISION_ENCODER_MODEL_NAME,
+            "prompt_encoder_mask_decoder": OV_PROMPT_ENCODER_MASK_DECODER_MODEL_NAME,
         }
 
         for name in self._ov_submodel_names:
             model = src_models[name]
             dst_file_name = dst_file_names[name]
             dst_path = os.path.join(save_directory, dst_file_name)
             ov.save_model(model, dst_path, compress_to_fp16=False)
+        self._save_openvino_config(save_directory)
 
     @classmethod
     def _from_pretrained(
@@ -166,6 +172,8 @@ def _from_pretrained(
         vision_encoder_file_name: Optional[str] = None,
         prompt_encoder_mask_decoder_file_name: Optional[str] = None,
         local_files_only: bool = False,
+        load_in_8bit: bool = False,
+        quantization_config: Union[OVQuantizationConfigBase, Dict] = None,
         **kwargs,
     ):
         """
@@ -198,6 +206,10 @@ def _from_pretrained(
                 openvino_prompt_encoder_mask_decoder.xml, allowing to load the decoder model with a different name.
             local_files_only(`bool`, *optional*, defaults to `False`):
                 Whether or not to only look at local files (i.e., do not try to download the model).
+            load_in_8bit(`bool`, *optional*, defaults to `False`):
+                Whether or not to apply 8-bit weight quantization.
+            quantization_config(`Union[OVQuantizationConfigBase, Dict]`, *optional*, defaults to `None`):
+                Quantization configuration to apply to the model.
         """
         if use_auth_token is not None:
             warnings.warn(
@@ -272,21 +284,50 @@ def _from_pretrained(
             model_save_dir,
         )
 
+        quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
         model = cls(
             vision_encoder_model=vision_encoder_model,
             prompt_encoder_mask_decoder_model=prompt_encoder_model,
             config=config,
             model_save_dir=model_save_dir,
+            quantization_config=quantization_config,
             **kwargs,
         )
 
+        if quantization_config is not None:
+            from optimum.intel import OVQuantizer
+
+            quantizer = OVQuantizer(model)
+            quantization_config_copy = quantization_config.clone()
+            quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
+            quantization_config_copy.processor = quantization_config.processor or model_id
+            quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
+
         return model
 
     @property
     def _ov_submodel_names(self):
-        model_names = ["vision_encoder_model", "prompt_encoder_mask_decoder_model"]
+        model_names = ["vision_encoder", "prompt_encoder_mask_decoder"]
         return model_names
 
+    @property
+    def ov_submodels(self) -> Dict[str, ov.Model]:
+        return {component_name: getattr(self, component_name).model for component_name in self._ov_submodel_names}
+
+    @property
+    def vision_encoder_model(self) -> ov.Model:
+        logger.warning(
+            "Access to the `vision_encoder_model` attribute is deprecated and will be removed in optimum-intel v1.26, please use `vision_encoder.model` instead"
+        )
+        return self.vision_encoder.model
+
+    @property
+    def prompt_encoder_mask_decoder_model(self) -> ov.Model:
+        logger.warning(
+            "Access to the `prompt_encoder_mask_decoder_model` attribute is deprecated and will be removed in optimum-intel v1.26, please use `prompt_encoder_mask_decoder.model` instead"
        )
+        return self.prompt_encoder_mask_decoder.model
+
     def reshape(self, batch_size: int = -1, point_batch_size: int = -1, num_points_per_image: int = -1):
         """
         Propagates the given input shapes on the model's layers, fixing the inputs shapes of the model.
@@ -304,19 +345,19 @@ def reshape(self, batch_size: int = -1, point_batch_size: int = -1, num_points_per_image: int = -1):
                 "`reshape()` is not supported with `compile_only` mode, please initialize model without this option"
             )
         vision_encoder_shapes = {}
-        for inputs in self.vision_encoder_model.inputs:
+        for inputs in self.vision_encoder.model.inputs:
             vision_encoder_shapes[inputs] = inputs.get_partial_shape()
             vision_encoder_shapes[inputs][0] = batch_size
-        self.vision_encoder_model.reshape(vision_encoder_shapes)
+        self.vision_encoder.model.reshape(vision_encoder_shapes)
         self.vision_encoder.request = None
         mask_decoder_shapes = {}
-        for inputs in self.prompt_encoder_mask_decoder_model.inputs:
+        for inputs in self.prompt_encoder_mask_decoder.model.inputs:
             mask_decoder_shapes[inputs] = inputs.get_partial_shape()
             mask_decoder_shapes[inputs][0] = batch_size
             if inputs.get_any_name() in ["input_points", "input_labels"]:
                 mask_decoder_shapes[inputs][1] = point_batch_size
                 mask_decoder_shapes[inputs][2] = num_points_per_image
-        self.prompt_encoder_mask_decoder_model.reshape(mask_decoder_shapes)
+        self.prompt_encoder_mask_decoder.model.reshape(mask_decoder_shapes)
         self.prompt_encoder_mask_decoder.request = None
         return self
 
@@ -398,6 +439,6 @@ def get_image_features(self, pixel_values, *args, **kwargs):
 
     @property
     def is_dynamic(self):
-        return model_has_dynamic_inputs(self.vision_encoder_model) or model_has_dynamic_inputs(
-            self.prompt_encoder_mask_decoder_model
+        return model_has_dynamic_inputs(self.vision_encoder.model) or model_has_dynamic_inputs(
+            self.prompt_encoder_mask_decoder.model
         )
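Taken together, the `_from_pretrained` changes mean a quantization config passed at load time is applied on the fly through `OVQuantizer`, and the OpenVINO submodels are now reached through the `vision_encoder` / `prompt_encoder_mask_decoder` wrappers. A short usage sketch based on the snippets in the docs table above:

```python
from optimum.intel import OVPipelineQuantizationConfig, OVQuantizationConfig, OVSamModel

# Static int8 quantization of the vision encoder, calibrated on the predefined
# "coco" dataset; _from_pretrained builds the model, then runs OVQuantizer on it.
model = OVSamModel.from_pretrained(
    "facebook/sam-vit-base",
    quantization_config=OVPipelineQuantizationConfig(
        quantization_configs=dict(
            vision_encoder=OVQuantizationConfig(bits=8, dataset="coco", num_samples=128)
        )
    ),
)
model.save_pretrained("save_dir")  # also writes the OVConfig via _save_openvino_config

# Submodel access after this commit:
vision_model = model.vision_encoder.model  # preferred
legacy = model.vision_encoder_model        # still works, logs a deprecation warning
```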
