3 changes: 0 additions & 3 deletions optimum/exporters/openvino/convert.py
@@ -344,9 +344,6 @@ def export_pytorch(
     logger.info(f"Using framework PyTorch: {torch.__version__}")
     output = Path(output)
 
-    # TODO: temporary solution but statefulness should be added to the export config earlier
-    config.stateful = stateful
-
    with torch.no_grad():
        if hasattr(model, "config"):
            model.config.torchscript = False
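Note on the convert.py change: the deleted TODO said statefulness should be set on the export config earlier, so the `config.stateful = stateful` assignment no longer happens inside `export_pytorch` and the flag is expected to already be on the config by the time export runs. A minimal sketch of that idea, with a hypothetical helper name and constructor call (not code from this PR):

    # Sketch: decide statefulness when the export config is built,
    # instead of mutating the config inside export_pytorch().
    def build_export_config(config_cls, model_config, stateful: bool):
        export_config = config_cls(model_config)  # hypothetical constructor call
        export_config.stateful = stateful         # set once, up front
        return export_config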
23 changes: 1 addition & 22 deletions optimum/exporters/openvino/model_patcher.py
@@ -38,7 +38,7 @@
 
 
 if is_transformers_version(">=", "4.53"):
-    from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, eager_mask, sdpa_mask
+    from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, eager_mask
     from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock
 
 
@@ -223,11 +223,6 @@ def __enter__(self):
             # Although I'm not sure this is the right way to handle this, we are basically pretending that -65,504 is -inf
             ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask_without_vmap)
 
-            # for decoder models, we use eager mask without vmap for sdpa as well
-            # to avoid a nan output issue in OpenVINO that only happens in case of:
-            # non-stateful models on cpu and stateful models on npu
-            ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap)
-
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
 
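Aside on the "-65,504 is -inf" comment in the hunk above: that value is the most negative finite float16 number, so the eager mask uses it as the minus-infinity fill to avoid overflow and NaN issues on hardware that handles -inf poorly (a later hunk mentions Intel NPU). A small illustration of the idea with a hypothetical helper, not code from this PR:

    import torch

    # torch.finfo exposes the finite range of a dtype; float16's minimum is -65504.
    assert torch.finfo(torch.float16).min == -65504.0

    def additive_mask(keep: torch.Tensor, dtype=torch.float16) -> torch.Tensor:
        # Hypothetical helper: 0 where attention is allowed, the dtype minimum
        # (standing in for -inf) where it is masked out.
        fill = torch.finfo(dtype).min
        return torch.where(keep, torch.tensor(0.0, dtype=dtype), torch.tensor(fill, dtype=dtype))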
@@ -236,7 +231,6 @@ def __exit__(self, exc_type, exc_value, traceback):
             del self._model._update_causal_mask_original
 
         if is_transformers_version(">=", "4.53.0"):
-            ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", sdpa_mask)
             ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask)
 
 
@@ -4420,16 +4414,10 @@ def __enter__(self):
             # to avoid overflow issues on some hardware (e.g. Intel NPU)
             ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask_without_vmap)
 
-            # for decoder models, we use eager mask without vmap for sdpa as well
-            # to avoid a nan output issue in OpenVINO that only happens in case of:
-            # non-stateful models on cpu and stateful models on npu
-            ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap)
-
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
 
         if is_transformers_version(">=", "4.53.0"):
-            ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", sdpa_mask)
             ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask)
 
 
@@ -4448,11 +4436,6 @@ def __enter__(self):
         self._model.config._orig_attn_implementation = self._model.config._attn_implementation
         self._model.config._attn_implementation = "sdpa"
 
-        if is_transformers_version(">=", "4.53"):
-            # starting from 4.53, we get unmatching outputs if we use the boolean mask
-            # TODO: This is an openvino issue (inconsistency between boolean and float masks)
-            ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap)
-
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
 
@@ -4464,10 +4447,6 @@ def __exit__(self, exc_type, exc_value, traceback):
         self._model.config._attn_implementation = self._model.config._orig_attn_implementation
         del self._model.config._orig_attn_implementation
 
-        if is_transformers_version(">=", "4.53"):
-            # remove the eager_mask_without_vmap from the ALL_MASK_ATTENTION_FUNCTIONS
-            ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", sdpa_mask)
-
 
 class MiniCPMModelPatcher(OVDecoderModelPatcher):
     def __init__(
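For context on the model_patcher.py hunks: `ALL_MASK_ATTENTION_FUNCTIONS` is the mask-function registry from `transformers.masking_utils`; the patchers override entries on `__enter__` and restore the stock functions on `__exit__`. After this PR only the "eager" entry is overridden with `eager_mask_without_vmap`, while the "sdpa" override and its restoration to `sdpa_mask` are dropped. A rough sketch of the register/restore pattern, assuming transformers >= 4.53 and simplified relative to the real patcher classes:

    from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, eager_mask

    # eager_mask_without_vmap is the vmap-free variant defined in
    # optimum/exporters/openvino/model_patcher.py (registered in the hunks above).

    class EagerMaskOverride:
        """Swap the "eager" mask implementation for the duration of an export."""

        def __enter__(self):
            # use the vmap-free eager mask while tracing/exporting the model
            ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask_without_vmap)
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            # put the stock transformers implementation back
            ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask)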