|
18 | 18 | import inspect |
19 | 19 | import sys |
20 | 20 | import types |
| 21 | +import warnings |
21 | 22 | from typing import TYPE_CHECKING, Any, Callable |
22 | 23 |
|
23 | 24 | import torch |
24 | 25 | import transformers |
25 | | -from torch.onnx.symbolic_opset14 import ( |
26 | | - _attention_scale, |
27 | | - _causal_attention_mask, |
28 | | - _onnx_symbolic, |
29 | | - _type_utils, |
30 | | - jit_utils, |
31 | | - symbolic_helper, |
32 | | -) |
33 | 26 | from transformers.modeling_outputs import BaseModelOutput |
34 | 27 | from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithSpeechPrenet |
35 | 28 |
|
36 | | -from optimum.utils import is_diffusers_version, is_transformers_version, logging |
| 29 | +from optimum.utils import is_diffusers_version, is_torch_version, is_transformers_version, logging |
37 | 30 |
|
38 | 31 |
|
| 32 | +if is_torch_version("<", "2.9"): |
| 33 | + from torch.onnx.symbolic_opset14 import _onnx_symbolic, jit_utils, symbolic_helper |
| 34 | +else: |
| 35 | + from torch.onnx._internal.torchscript_exporter.symbolic_opset14 import _onnx_symbolic, jit_utils, symbolic_helper |
| 36 | + |
39 | 37 | if is_transformers_version(">=", "4.44") and is_transformers_version("<", "4.50"): |
40 | 38 | from optimum.exporters.onnx._traceable_cache import TraceableCache |
41 | 39 | if is_transformers_version(">=", "4.54"): |
42 | 40 | from optimum.exporters.onnx._traceable_decorator import traceable_check_model_inputs |
43 | | - |
44 | 41 | if is_transformers_version(">=", "4.43") and is_transformers_version("<", "4.48"): |
45 | 42 | from transformers.models.clip.modeling_clip import CLIPAttention, CLIPSdpaAttention |
46 | 43 | if is_transformers_version(">=", "4.48"): |
@@ -79,76 +76,92 @@ def __ior_(g: jit_utils.GraphContext, self: torch._C.Value, other: torch._C.Valu |
79 | 76 | return g.op("Or", self, other) |
80 | 77 |
|
81 | 78 |
|
82 | | -@_onnx_symbolic("aten::scaled_dot_product_attention") |
83 | | -@symbolic_helper.parse_args("v", "v", "v", "v", "f", "b", "v", "b") |
84 | | -def scaled_dot_product_attention( |
85 | | - g: jit_utils.GraphContext, |
86 | | - query: torch._C.Value, |
87 | | - key: torch._C.Value, |
88 | | - value: torch._C.Value, |
89 | | - attn_mask: torch._C.Value | None = None, |
90 | | - dropout_p: float = 0.0, |
91 | | - is_causal: bool = False, |
92 | | - scale: torch._C.Value | None = None, |
93 | | - enable_gqa: bool = False, |
94 | | -): |
95 | | - assert (not is_causal) or (is_causal and symbolic_helper._is_none(attn_mask)), ( |
96 | | - "is_causal and attn_mask cannot be set at the same time" |
| 79 | +if is_torch_version("<", "2.9"): |
| 80 | +    # this was fixed in torch 2.9: https://github.com/pytorch/pytorch/pull/159973 |
| 81 | + from torch.onnx.errors import OnnxExporterWarning |
| 82 | + from torch.onnx.symbolic_opset14 import ( |
| 83 | + _attention_scale, |
| 84 | + _causal_attention_mask, |
| 85 | + _onnx_symbolic, |
| 86 | + _type_utils, |
| 87 | + jit_utils, |
| 88 | + symbolic_helper, |
97 | 89 | ) |
98 | | - assert not enable_gqa, "conversion of scaled_dot_product_attention not implemented if enable_gqa is True" |
99 | | - |
100 | | - if symbolic_helper._is_none(scale): |
101 | | - scale = _attention_scale(g, query) |
102 | | - |
103 | | - if is_causal: |
104 | | - attn_mask = _causal_attention_mask(g, query, key) |
105 | | - |
106 | | - # Swap the last two axes of key |
107 | | - # NOTE: onnx-script has different logic here, because the attribute perms in |
108 | | - # transpose needs list of ints |
109 | | - key_shape_builtin = symbolic_helper._get_tensor_rank(key) |
110 | | - key_transposed_axes = list(range(key_shape_builtin)) |
111 | | - key_transposed_axes[-1], key_transposed_axes[-2] = (key_transposed_axes[-2], key_transposed_axes[-1]) |
112 | | - key_transposed = g.op("Transpose", key, perm_i=key_transposed_axes) |
113 | | - |
114 | | - # https://github.com/pytorch/pytorch/blob/12da0c70378b5be9135c6fda62a9863bce4a4818/aten/src/ATen/native/transformers/attention.cpp#L653 |
115 | | - # Scale q, k before matmul for stability see https://tinyurl.com/sudb9s96 for math |
116 | | - query_scaled = g.op("Mul", query, g.op("Sqrt", scale)) |
117 | | - key_transposed_scaled = g.op("Mul", key_transposed, g.op("Sqrt", scale)) |
118 | | - mul_qk = g.op("MatMul", query_scaled, key_transposed_scaled) |
119 | | - |
120 | | - if symbolic_helper._is_none(attn_mask): |
121 | | - mul_qk_add = mul_qk |
122 | | - attn_weight = g.op("Softmax", mul_qk_add, axis_i=-1) |
123 | | - elif _type_utils.JitScalarType.from_value(attn_mask) == _type_utils.JitScalarType.BOOL: |
124 | | - # Turn the Boolean mask to float: attn_mask.masked_fill(not attn_mask, -float('inf')) |
125 | | - const_zero = g.op("Constant", value_t=torch.tensor([0.0])) |
126 | | - const_neg_inf = g.op("Constant", value_t=torch.tensor([-float("inf")])) |
127 | | - attn_mask = g.op("Where", attn_mask, const_zero, const_neg_inf) |
128 | | - mul_qk_add = g.op("Add", mul_qk, attn_mask) |
129 | | - attn_weight = g.op("Softmax", mul_qk_add, axis_i=-1) |
130 | | - # when using scaled dot product attention with a boolean mask, we replace NaN values in attn_weight with 0.0 |
131 | | - attn_weight = g.op( |
132 | | - "Where", g.op("IsNaN", attn_weight), g.op("Constant", value_t=torch.tensor([0.0])), attn_weight |
133 | | - ) |
134 | | - elif _type_utils.JitScalarType.from_value(attn_mask) in ( |
135 | | - _type_utils.JitScalarType.FLOAT, |
136 | | - _type_utils.JitScalarType.HALF, |
137 | | - _type_utils.JitScalarType.BFLOAT16, |
138 | | - ): |
139 | | - mul_qk_add = g.op("Add", mul_qk, attn_mask) |
140 | | - attn_weight = g.op("Softmax", mul_qk_add, axis_i=-1) |
141 | | - else: |
142 | | - raise ValueError(f"Unsupported type for attn_mask: {_type_utils.JitScalarType.from_value(attn_mask)}") |
143 | 90 |
|
144 | | - if dropout_p != 0: |
145 | | - attn_weight = g.op( |
146 | | - "Dropout", |
147 | | - attn_weight, |
148 | | - g.op("Constant", value_t=torch.tensor(dropout_p, dtype=torch.float)), |
| 91 | + warnings.filterwarnings("ignore", category=OnnxExporterWarning) |
| 92 | + |
| 93 | + @_onnx_symbolic("aten::scaled_dot_product_attention") |
| 94 | + @symbolic_helper.parse_args("v", "v", "v", "v", "f", "b", "v", "b") |
| 95 | + def scaled_dot_product_attention( |
| 96 | + g: jit_utils.GraphContext, |
| 97 | + query: torch._C.Value, |
| 98 | + key: torch._C.Value, |
| 99 | + value: torch._C.Value, |
| 100 | + attn_mask: torch._C.Value | None = None, |
| 101 | + dropout_p: float = 0.0, |
| 102 | + is_causal: bool = False, |
| 103 | + scale: torch._C.Value | None = None, |
| 104 | + enable_gqa: bool = False, |
| 105 | + ): |
| 106 | + assert (not is_causal) or (is_causal and symbolic_helper._is_none(attn_mask)), ( |
| 107 | + "is_causal and attn_mask cannot be set at the same time" |
149 | 108 | ) |
| 109 | + assert not enable_gqa, "conversion of scaled_dot_product_attention not implemented if enable_gqa is True" |
| 110 | + |
| 111 | + if symbolic_helper._is_none(scale): |
| 112 | + scale = _attention_scale(g, query) |
| 113 | + |
| 114 | + if is_causal: |
| 115 | + attn_mask = _causal_attention_mask(g, query, key) |
| 116 | + |
| 117 | + # Swap the last two axes of key |
| 118 | + # NOTE: onnx-script has different logic here, because the attribute perms in |
| 119 | + # transpose needs list of ints |
| 120 | + key_shape_builtin = symbolic_helper._get_tensor_rank(key) |
| 121 | + key_transposed_axes = list(range(key_shape_builtin)) |
| 122 | + key_transposed_axes[-1], key_transposed_axes[-2] = (key_transposed_axes[-2], key_transposed_axes[-1]) |
| 123 | + key_transposed = g.op("Transpose", key, perm_i=key_transposed_axes) |
| 124 | + |
| 125 | + # https://github.com/pytorch/pytorch/blob/12da0c70378b5be9135c6fda62a9863bce4a4818/aten/src/ATen/native/transformers/attention.cpp#L653 |
| 126 | + # Scale q, k before matmul for stability see https://tinyurl.com/sudb9s96 for math |
| 127 | + query_scaled = g.op("Mul", query, g.op("Sqrt", scale)) |
| 128 | + key_transposed_scaled = g.op("Mul", key_transposed, g.op("Sqrt", scale)) |
| 129 | + mul_qk = g.op("MatMul", query_scaled, key_transposed_scaled) |
| 130 | + |
| 131 | + if symbolic_helper._is_none(attn_mask): |
| 132 | + mul_qk_add = mul_qk |
| 133 | + attn_weight = g.op("Softmax", mul_qk_add, axis_i=-1) |
| 134 | + elif _type_utils.JitScalarType.from_value(attn_mask) == _type_utils.JitScalarType.BOOL: |
| 135 | + # Turn the Boolean mask to float: attn_mask.masked_fill(not attn_mask, -float('inf')) |
| 136 | + const_zero = g.op("Constant", value_t=torch.tensor([0.0])) |
| 137 | + const_neg_inf = g.op("Constant", value_t=torch.tensor([-float("inf")])) |
| 138 | + attn_mask = g.op("Where", attn_mask, const_zero, const_neg_inf) |
| 139 | + mul_qk_add = g.op("Add", mul_qk, attn_mask) |
| 140 | + attn_weight = g.op("Softmax", mul_qk_add, axis_i=-1) |
| 141 | + # when using scaled dot product attention with a boolean mask, we replace NaN values in attn_weight with 0.0 |
| 142 | + attn_weight = g.op( |
| 143 | + "Where", g.op("IsNaN", attn_weight), g.op("Constant", value_t=torch.tensor([0.0])), attn_weight |
| 144 | + ) |
| 145 | + elif _type_utils.JitScalarType.from_value(attn_mask) in ( |
| 146 | + _type_utils.JitScalarType.FLOAT, |
| 147 | + _type_utils.JitScalarType.HALF, |
| 148 | + _type_utils.JitScalarType.BFLOAT16, |
| 149 | + ): |
| 150 | + mul_qk_add = g.op("Add", mul_qk, attn_mask) |
| 151 | + attn_weight = g.op("Softmax", mul_qk_add, axis_i=-1) |
| 152 | + else: |
| 153 | + raise ValueError(f"Unsupported type for attn_mask: {_type_utils.JitScalarType.from_value(attn_mask)}") |
| 154 | + |
| 155 | + if dropout_p != 0: |
| 156 | + attn_weight = g.op( |
| 157 | + "Dropout", |
| 158 | + attn_weight, |
| 159 | + g.op("Constant", value_t=torch.tensor(dropout_p, dtype=torch.float)), |
| 160 | + ) |
| 161 | + |
| 162 | + return g.op("MatMul", attn_weight, value) |
150 | 163 |
|
151 | | - return g.op("MatMul", attn_weight, value) |
| 164 | + warnings.filterwarnings("default", category=OnnxExporterWarning) |
152 | 165 |
|
153 | 166 |
|
154 | 167 | def patch_everywhere(attribute_name: str, patch: Any, module_name_prefix: str | None = None): |