Skip to content

Commit 92f51a1

Browse files
IlyasMoutawwakil and echarlaix
authored and committed
Add Transformers 4.53 support (#1377)
* test * fix inc * Update setup.py * fixes * fix maira2 * nikita model * Update tests/openvino/utils_tests.py * Apply suggestions from code review * fix pegasus stateful test * update whisper quant values * more whispe fixes * more whisper quantization fixes * fix sana using eager attention * add smollm3 * add _is_stateful * fix pipeline tests * big patch fixing gemma3 and patching eager mask for OpenVINO * fix sana withput using eager attention, only using eager mask * skip notebooks * always apply eager finfo fix * revert tests removal until refactor * build openvino specific qwen3_moe patcher instead of using onnx patcher * update branch * skip smollm3 stateless beam search * remove smollm3 :/ * remove scaled_dot_product_attention patch * make sure we unpatch attention mask * fix * main optimum branch * Update setup.py Co-authored-by: Ella Charlaix <[email protected]> * fix quant layers number --------- Co-authored-by: Ella Charlaix <[email protected]>
1 parent 1ac67bd commit 92f51a1

File tree

13 files changed

+534
-443
lines changed

13 files changed

+534
-443
lines changed

.github/workflows/test_openvino.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ jobs:
4141
run: |
4242
pip install --upgrade pip
4343
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
44-
pip install .[openvino,openvino-tokenizers,diffusers,tests]
44+
pip install .[openvino,diffusers,tests]
4545
4646
- if: ${{ matrix.transformers-version != 'latest' }}
4747
name: Install specific dependencies and versions required for older transformers

optimum/exporters/openvino/convert.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,9 @@ def export_pytorch(
381381
logger.info(f"Using framework PyTorch: {torch.__version__}")
382382
output = Path(output)
383383

384+
# TODO: temporary solution but statefulness should be added to the export config earlier
385+
config.stateful = stateful
386+
384387
if stateful:
385388
# Trigger bettertransformer together with stateful model because OpenVINO HW-dependent transformations expect
386389
# both of them are applied to demonstrate the best performance.

optimum/exporters/openvino/model_configs.py

Lines changed: 39 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,6 @@
9999
GptBigCodeModelPatcher,
100100
GptJModelPatcher,
101101
GptNeoModelPatcher,
102-
GptNeoxJapaneseModelPatcher,
103102
GptNeoxModelPatcher,
104103
GraniteMoEModelPatcher,
105104
IBertModelPatcher,
@@ -112,7 +111,6 @@
112111
JaisModelPatcher,
113112
Llama4ImageEmbeddingsModelPatcher,
114113
Llama4TextModelPatcher,
115-
LlamaModelPatcher,
116114
LlavaImageEmbeddingModelPatcher,
117115
LlavaNextVideoImageEmbeddingModelPatcher,
118116
LlavaQwen2ImageEmbeddingsModelPatcher,
@@ -127,6 +125,7 @@
127125
MistralModelPatcher,
128126
MixtralModelPatcher,
129127
MPTModelPatcher,
128+
OVDecoderModelPatcher,
130129
OVSpeechT5ModelPatcher,
131130
PegasusModelPatcher,
132131
PegasusStatefulSeq2SeqDecoderPatcher,
@@ -142,11 +141,10 @@
142141
Qwen2MoEPatcher,
143142
Qwen2VLLanguageModelPatcher,
144143
Qwen2VLVisionEmbMergerPatcher,
144+
Qwen3MoeModelPatcher,
145145
QwenModelPatcher,
146-
RotaryEmbPatcher,
147146
SanaTextEncoderModelPatcher,
148147
StatefulSeq2SeqDecoderPatcher,
149-
UpdateCausalMaskModelPatcher,
150148
XverseModelPatcher,
151149
)
152150

@@ -289,7 +287,7 @@ class Qwen2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
289287
def patch_model_for_export(
290288
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
291289
) -> "ModelPatcher":
292-
return UpdateCausalMaskModelPatcher(self, model, model_kwargs=model_kwargs)
290+
return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)
293291

294292

295293
@register_in_tasks_manager("qwen2_moe", *["text-generation", "text-generation-with-past"], library_name="transformers")
@@ -307,7 +305,6 @@ def patch_model_for_export(
307305

308306

309307
@register_in_tasks_manager("qwen3", *["text-generation", "text-generation-with-past"], library_name="transformers")
310-
@register_in_tasks_manager("qwen3_moe", *["text-generation", "text-generation-with-past"], library_name="transformers")
311308
class Qwen3OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
312309
MIN_TRANSFORMERS_VERSION = "4.51.0"
313310

@@ -318,7 +315,15 @@ class Qwen3OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
318315
def patch_model_for_export(
319316
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
320317
) -> "ModelPatcher":
321-
return UpdateCausalMaskModelPatcher(self, model, model_kwargs=model_kwargs)
318+
return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)
319+
320+
321+
@register_in_tasks_manager("qwen3_moe", *["text-generation", "text-generation-with-past"], library_name="transformers")
322+
class Qwen3MoEOpenVINOConfig(Qwen3OpenVINOConfig):
323+
def patch_model_for_export(
324+
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
325+
) -> "ModelPatcher":
326+
return Qwen3MoeModelPatcher(self, model, model_kwargs=model_kwargs)
322327

323328

324329
@register_in_tasks_manager("minicpm", *["text-generation", "text-generation-with-past"], library_name="transformers")
@@ -400,7 +405,7 @@ class StableLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
400405
def patch_model_for_export(
401406
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
402407
) -> "ModelPatcher":
403-
return UpdateCausalMaskModelPatcher(self, model, model_kwargs=model_kwargs)
408+
return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)
404409

405410

406411
class ChatGLM2DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator):
@@ -576,7 +581,7 @@ class GemmaOpenVINOConfig(GemmaOnnxConfig):
576581
def patch_model_for_export(
577582
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
578583
) -> "ModelPatcher":
579-
return LlamaModelPatcher(self, model, model_kwargs=model_kwargs)
584+
return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)
580585

581586

582587
@register_in_tasks_manager(
@@ -594,7 +599,7 @@ class LlamaOpenVINOConfig(LlamaOnnxConfig):
594599
def patch_model_for_export(
595600
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
596601
) -> "ModelPatcher":
597-
return LlamaModelPatcher(self, model, model_kwargs=model_kwargs)
602+
return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)
598603

599604

600605
@register_in_tasks_manager(
@@ -671,7 +676,6 @@ class QwenOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
671676
)
672677
DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, QwenDummyPastKeyValuesGenerator)
673678
DUMMY_PKV_GENERATOR_CLASS = QwenDummyPastKeyValuesGenerator
674-
no_position_ids = False
675679

676680
def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
677681
dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs)
@@ -734,7 +738,7 @@ def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], dire
734738
decoder_sequence_name = "past_sequence_length"
735739
name = "past_key_values"
736740
else:
737-
decoder_sequence_name = "past_sequence_length + 1"
741+
decoder_sequence_name = "past_sequence_length + sequence_length"
738742
name = "present"
739743

740744
for i in range(self._normalized_config.num_layers):
@@ -760,13 +764,7 @@ class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
760764
def patch_model_for_export(
761765
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
762766
) -> "ModelPatcher":
763-
return UpdateCausalMaskModelPatcher(self, model, model_kwargs=model_kwargs)
764-
765-
766-
def patch_model_for_export(
767-
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
768-
) -> "ModelPatcher":
769-
return RotaryEmbPatcher(self, model, model_kwargs=model_kwargs)
767+
return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)
770768

771769

772770
@register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers")
@@ -867,7 +865,7 @@ class PhiOpenVINOConfig(PhiOnnxConfig):
867865
def patch_model_for_export(
868866
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
869867
) -> "ModelPatcher":
870-
return UpdateCausalMaskModelPatcher(self, model, model_kwargs=model_kwargs)
868+
return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)
871869

872870

873871
class OVFalconDummyPastKeyValuesGenerator(FalconDummyPastKeyValuesGenerator):
@@ -952,20 +950,6 @@ class BioGPTOpenVINOConfig(
952950
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
953951

954952

955-
@register_in_tasks_manager(
956-
"gpt_neox_japanese", *["text-generation", "text-generation-with-past"], library_name="transformers"
957-
)
958-
class GPTNeoxJapaneseOpenVINOConfig(TextDecoderOnnxConfig):
959-
# GPTNeoxJapanese does not require position_ids input.
960-
DEFAULT_ONNX_OPSET = 13
961-
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
962-
963-
def patch_model_for_export(
964-
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
965-
) -> "ModelPatcher":
966-
return GptNeoxJapaneseModelPatcher(self, model, model_kwargs=model_kwargs)
967-
968-
969953
@register_in_tasks_manager(
970954
"gpt_neo",
971955
*[
@@ -1295,6 +1279,20 @@ def patch_model_for_export(
12951279
return GptNeoxModelPatcher(self, model, model_kwargs=model_kwargs)
12961280

12971281

1282+
@register_in_tasks_manager(
1283+
"gpt_neox_japanese", *["text-generation", "text-generation-with-past"], library_name="transformers"
1284+
)
1285+
class GPTNeoxJapaneseOpenVINOConfig(TextDecoderOnnxConfig):
1286+
# GPTNeoxJapanese does not require position_ids input.
1287+
DEFAULT_ONNX_OPSET = 13
1288+
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
1289+
1290+
def patch_model_for_export(
1291+
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
1292+
) -> "ModelPatcher":
1293+
return GptNeoxModelPatcher(self, model, model_kwargs=model_kwargs)
1294+
1295+
12981296
@register_in_tasks_manager(
12991297
"gemma2",
13001298
*[
@@ -1544,6 +1542,7 @@ def patch_model_for_export(
15441542
return IBertModelPatcher(self, model, model_kwargs=model_kwargs)
15451543

15461544

1545+
# TODO: this is a very confusing class TBH, why not simply decompose the VLM into components, like diffusion models ?
15471546
class LMInputEmbedsConfigHelper(TextDecoderWithPositionIdsOnnxConfig):
15481547
def __init__(self, export_config, patcher_cls=None, dummy_input_generator=None, inputs_update=None):
15491548
self.orig_export_config = export_config
@@ -1586,15 +1585,20 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
15861585
def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
15871586
dummy_inputs = self.orig_export_config.generate_dummy_inputs(framework, **kwargs)
15881587
input_ids = dummy_inputs.pop("input_ids")
1588+
pask_key_values = dummy_inputs.get("past_key_values")
15891589
inputs_embed_shape = (input_ids.shape[0], input_ids.shape[1], self._normalized_config.hidden_size)
15901590
inputs_embeds = self.orig_export_config.DUMMY_INPUT_GENERATOR_CLASSES[0].random_float_tensor(
15911591
inputs_embed_shape
15921592
)
15931593
dummy_inputs["inputs_embeds"] = inputs_embeds
15941594
if "token_type_ids" in self.inputs:
1595+
if is_transformers_version(">=", "4.53"):
1596+
token_type_ids_shape = (input_ids.shape[0], input_ids.shape[1] + pask_key_values[0][0].shape[-2])
1597+
else:
1598+
token_type_ids_shape = (input_ids.shape[0], input_ids.shape[1])
15951599
dummy_inputs["token_type_ids"] = self.orig_export_config.DUMMY_INPUT_GENERATOR_CLASSES[
15961600
0
1597-
].random_int_tensor(input_ids.shape, min_value=0, max_value=2)
1601+
].random_int_tensor(token_type_ids_shape, min_value=0, max_value=2)
15981602
return dummy_inputs
15991603

16001604

0 commit comments

Comments (0)