Commit 574b66a

Merge branch 'main' into conv_and_hardswish_fusion
2 parents 2f2204d + c219dce

18 files changed: +946 -95 lines

18 files changed

+946
-95
lines changed

onnxscript/function_libs/torch_lib/ops/core.py

Lines changed: 35 additions & 23 deletions
@@ -36,6 +36,7 @@
     graph,
     ir,
 )
+from onnxscript._internal import version_utils
 from onnxscript.function_libs.torch_lib.ops import common as common_ops
 from onnxscript.function_libs.torch_lib.registration import torch_op
 from onnxscript.function_libs.torch_lib.tensor_typing import (
@@ -1647,29 +1648,40 @@ def aten_choose_qparams_optimized(
     raise NotImplementedError()
 
 
-@torch_op("aten::chunk")
-def aten_chunk(self: TTensor, chunks: int, dim: int = 0) -> Sequence[TTensor]:
-    """chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[]"""
-    # This will create a Sequence of tensors
-    neg_1 = op.Constant(value_ints=[-1])
-    # Get size of specified dim
-    self_shape = op.Shape(self)
-    dim_size = op.Gather(self_shape, dim, axis=0)
-    # Compute size/chunk to get the number of data in one chunk
-    num_per_chunk = op.Div(dim_size, chunks)
-    num_per_chunk = op.Cast(op.Mod(dim_size, chunks) > 0, to=INT64.dtype) + num_per_chunk  # type: ignore[operator]
-
-    # Compute real chunk number
-    num_chunk = op.Div(dim_size, num_per_chunk)
-    # Get something like [n, n, n, n, ...], total num_chunk
-    list_split = op.Expand(num_per_chunk, op.Reshape(num_chunk, neg_1))
-
-    remainder = op.Mod(dim_size, num_per_chunk)
-    if remainder > 0:  # type: ignore[operator]
-        # Append the remainder to the [n, n, n, n, ..., r]
-        list_split = op.Concat(list_split, op.Reshape(remainder, neg_1), axis=0)
-
-    return op.SplitToSequence(self, list_split, axis=dim)
+if version_utils.torch_older_than("2.7.0"):
+    # PyTorch <2.7 does not support determining the number of outputs for the Split op
+    # https://github.com/pytorch/pytorch/commit/9a1eac6704671c72a2e85c9138db57eb3a80bfb6
+    @torch_op("aten::chunk")
+    def aten_chunk(self: TTensor, chunks: int, dim: int = 0) -> Sequence[TTensor]:
+        """chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[]"""
+        # This will create a Sequence of tensors
+        neg_1 = op.Constant(value_ints=[-1])
+        # Get size of specified dim
+        self_shape = op.Shape(self)
+        dim_size = op.Gather(self_shape, dim, axis=0)
+        # Compute size/chunk to get the number of data in one chunk
+        num_per_chunk = op.Div(dim_size, chunks)
+        num_per_chunk = op.Cast(op.Mod(dim_size, chunks) > 0, to=INT64.dtype) + num_per_chunk  # type: ignore[operator]
+
+        # Compute real chunk number
+        num_chunk = op.Div(dim_size, num_per_chunk)
+        # Get something like [n, n, n, n, ...], total num_chunk
+        list_split = op.Expand(num_per_chunk, op.Reshape(num_chunk, neg_1))
+
+        remainder = op.Mod(dim_size, num_per_chunk)
+        if remainder > 0:  # type: ignore[operator]
+            # Append the remainder to the [n, n, n, n, ..., r]
+            list_split = op.Concat(list_split, op.Reshape(remainder, neg_1), axis=0)
+
+        return op.SplitToSequence(self, list_split, axis=dim)
+else:
+
+    @torch_op("aten::chunk", trace_only=True)
+    def aten_chunk(self: TTensor, chunks: int, dim: int = 0) -> Sequence[TTensor]:
+        """chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[]"""
+        if chunks == 1:
+            return op.Identity(self)
+        return op.Split(self, axis=dim, num_outputs=chunks)
 
 
 @torch_op("aten::clamp", trace_only=True)
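
Note: the pre-2.7 fallback's split-size computation is just ceiling division plus a trailing remainder. A minimal plain-Python sketch of that arithmetic (the helper name `chunk_split_sizes` is hypothetical; the real code emits the ONNX ops shown above):

```python
def chunk_split_sizes(dim_size: int, chunks: int) -> list[int]:
    # ceil(dim_size / chunks): the number of elements in each full chunk
    num_per_chunk = dim_size // chunks + (1 if dim_size % chunks > 0 else 0)
    num_chunk = dim_size // num_per_chunk  # number of full chunks
    sizes = [num_per_chunk] * num_chunk    # [n, n, n, ...]
    remainder = dim_size % num_per_chunk
    if remainder > 0:
        sizes.append(remainder)            # [n, n, ..., r]
    return sizes

# Matches torch.chunk: a dim of size 10 split into 3 chunks gives 4, 4, 2.
assert chunk_split_sizes(10, 3) == [4, 4, 2]
assert chunk_split_sizes(9, 3) == [3, 3, 3]
```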

onnxscript/function_libs/torch_lib/ops/nn.py

Lines changed: 12 additions & 3 deletions
@@ -2037,7 +2037,8 @@ def _aten_scaled_dot_product_attention_no_mask_onnx(
         op.MatMul(query_scaled, key_transposed_scaled),
         axis=-1,
     )
-    attn_weight, _ = op.Dropout(attn_weight, dropout_p)
+    if dropout_p != 0:
+        attn_weight, _ = op.Dropout(attn_weight, dropout_p)
     return op.MatMul(attn_weight, value)
@@ -2076,7 +2077,14 @@ def _aten_scaled_dot_product_attention_bool_mask_onnx(
         op.Add(op.MatMul(query_scaled, key_transposed_scaled), attn_mask),
         axis=-1,
     )
-    attn_weight, _ = op.Dropout(attn_weight, dropout_p)
+    # When using scaled dot product attention with a boolean mask, the softmax operation might return NaN values
+    # due to the presence of -inf in an entire row (padding tokens), resulting in 0/0 (NaN) in the softmax output.
+    # This is because there's no safe/masked softmax implementation in ONNX, so we need to handle NaN values
+    # explicitly to match the behavior of PyTorch with boolean masks.
+    # Reference: https://github.com/pytorch/pytorch/issues/103749
+    attn_weight = op.Where(op.IsNaN(attn_weight), zero, attn_weight)
+    if dropout_p != 0:
+        attn_weight, _ = op.Dropout(attn_weight, dropout_p)
     return op.MatMul(attn_weight, value)
@@ -2111,7 +2119,8 @@ def _aten_scaled_dot_product_attention_float_mask_onnx(
         op.Add(op.MatMul(query_scaled, key_transposed_scaled), attn_mask),
         axis=-1,
     )
-    attn_weight, _ = op.Dropout(attn_weight, dropout_p)
+    if dropout_p != 0:
+        attn_weight, _ = op.Dropout(attn_weight, dropout_p)
     return op.MatMul(attn_weight, value)
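
Note: the NaN handling added to the bool-mask variant guards against rows where every position is masked. A NumPy sketch of the failure mode and of the rewrite the new `op.Where(op.IsNaN(...))` line performs (illustrative only):

```python
import numpy as np

# A fully masked row (all -inf, e.g. padding tokens) breaks softmax:
row = np.array([-np.inf, -np.inf, -np.inf])
weights = np.exp(row)              # exp(-inf) == 0 elementwise
softmax = weights / weights.sum()  # 0 / 0 -> NaN (NumPy emits a RuntimeWarning)
print(softmax)                     # [nan nan nan]

# Equivalent of op.Where(op.IsNaN(attn_weight), zero, attn_weight),
# which matches PyTorch's behavior with boolean masks:
safe = np.where(np.isnan(softmax), 0.0, softmax)
print(safe)                        # [0. 0. 0.]
```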

onnxscript/rewriter/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,7 @@
     broadcast_to_matmul,
     cast_constant_of_shape,
     collapse_slices,
+    fuse_pad_into_conv,
     fuse_relus_clips,
     no_op,
     pattern,
@@ -49,6 +50,7 @@
     *fuse_relus_clips.fuse_relus_clips_rules().rules,
     *basic_rules.basic_optimization_rules().rules,
     *redundant_scatter_nd.rules.rules,
+    *fuse_pad_into_conv.fuse_pad_into_conv_rule_set().rules,
 )
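
Note: because the pad-into-conv rules are added to the default rule set, callers pick them up without opting in. A hedged usage sketch (assuming the package's public `rewrite()` entry point; `model.onnx` is a placeholder path):

```python
import onnx
from onnxscript import rewriter

# The default rules now include fuse_pad_into_conv_rule_set(),
# so a plain rewrite() call applies the fusion where it matches.
model = onnx.load("model.onnx")
rewritten = rewriter.rewrite(model)
onnx.save(rewritten, "model_fused.onnx")
```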

onnxscript/rewriter/_pattern_ir.py

Lines changed: 20 additions & 6 deletions
@@ -76,20 +76,33 @@ def __str__(self) -> str:
 class AttrPattern(Pattern[ir.Attr]):
     """Base class for an attribute pattern. Matches any attribute value by default."""
 
-    def __init__(self, name: str | None):
+    def __init__(self, name: str | None, *, can_match_none: bool = False):
         self._name = name
+        self._can_match_none = can_match_none
 
     @property
     def name(self) -> str | None:
         return self._name
 
+    @property
+    def can_match_none(self) -> bool:
+        """Indicates whether this pattern can match a None attribute."""
+        return self._can_match_none
+
     def matches(self, attr: ir.Attr) -> bool:
         return True
 
     def __str__(self) -> str:
         return self._name if self._name is not None else "anonymous:" + str(id(self))
 
 
+class AttrVar(AttrPattern):
+    """Represents a pattern variable used to match against attribute values."""
+
+    def __init__(self, name: str | None, *, can_match_none: bool = False):
+        super().__init__(name, can_match_none=can_match_none)
+
+
 # TODO: Support tensors. Align with usage elsewhere.
 SupportedAttrTypes = Union[
     int,
@@ -129,11 +142,11 @@ def _to_attr_pattern(value: AttrPattern | ValuePattern | SupportedAttrTypes) ->
         # annotations to distinguish between ValuePattern and AttrPattern, but forces users to
         # use these type annotations.
         # TODO: check for misuse at rule-creation time. (Currently will be caught by matcher at match-time.)
-        if value.can_match_none or value.check_method is not None:
+        if value.check_method is not None:
             raise ValueError(
-                "Pattern variables used in attributes must not have can_match_none or check_method set."
+                "Pattern variables used in attributes must not have check_method set."
             )
-        return AttrPattern(value.name)
+        return AttrVar(value.name, can_match_none=value.can_match_none)
     if isinstance(value, (int, float, str)):
         return AttrConstantPattern(value)
     if isinstance(value, Sequence):
@@ -493,8 +506,9 @@ def matches(self, node: ir.Node, match: _basics.MatchResult) -> _basics.MatchResult:
         for name, attr_pattern in self.attributes.items():
             attr_value = node.attributes.get(name)
             if attr_value is None:
-                return match.fail(f"Attribute {name} not found in node.", node)
-            if not attr_pattern.matches(attr_value):
+                if not attr_pattern.can_match_none:
+                    return match.fail(f"Attribute {name} not found in node.", node)
+            elif not attr_pattern.matches(attr_value):
                 return match.fail(
                     f"Attribute {name} mismatch: expected {attr_pattern}, got {attr_value}.",
                     node,
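
Note: a short sketch of what `can_match_none` changes for attribute matching, based only on the classes in this diff (the attribute name "axis" is an arbitrary example):

```python
from onnxscript.rewriter._pattern_ir import AttrVar

# Default: a missing attribute fails the node match
# ("Attribute axis not found in node.").
strict_axis = AttrVar("axis")
assert not strict_axis.can_match_none

# With can_match_none=True, the matcher skips that failure when the node
# omits the attribute, instead of rejecting the candidate node outright.
optional_axis = AttrVar("axis", can_match_none=True)
assert optional_axis.can_match_none
```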
