
Commit 01e88aa

Merge branch 'main' into justinchu/should-fold

2 parents: 589ff9d + 88b03d8

File tree

6 files changed: +94, -50 lines

onnxscript/function_libs/torch_lib/ops/core.py

Lines changed: 16 additions & 12 deletions
@@ -3688,23 +3688,27 @@ def python_math_floor(self: TFloat) -> TInt:
 
 
 @torch_op("aten::floor_divide", trace_only=True)
-def aten_floor_divide(self: TFloat, other: TFloat) -> TFloat:
+def aten_floor_divide(self: TTensor, other: TTensor) -> TTensor:
     """floor_divide(Tensor self, Tensor other) -> Tensor"""
 
-    return op.Floor(op.Div(self, other))
+    if self.dtype.is_floating_point():
+        return op.Floor(op.Div(self, other))
 
+    assert self.dtype.is_integer()
 
-@torch_op("aten::floor_divide", trace_only=True)
-def aten_floor_divide_int(self: TInt, other: TInt) -> TInt:
-    """floor_divide(Tensor self, Tensor other) -> Tensor"""
+    if not self.dtype.is_signed():
+        return op.Div(self, other)
 
-    # TODO(justinchuby): This can be simplified if we can constrain the
-    # inputs to be positive integers. Consider how we can embed constraints in the model.
-    dtype = self.dtype
-    self = op.Cast(self, to=FLOAT.dtype)
-    other = op.Cast(other, to=FLOAT.dtype)
-    result = op.Floor(op.Div(self, other))
-    return op.Cast(result, to=dtype)
+    # Convert truncation to flooring
+    # Reference: https://github.com/pytorch/pytorch/blob/ffc645c870f0abd368606ba1e2b3b58cacb03046/torch/_refs/__init__.py#L1401C1-L1409C70
+    # offset = (torch.signbit(a) != torch.signbit(b)).logical_and(torch.fmod(a, b) != 0)
+    # return prims.div(a, b) - _maybe_convert_to_dtype(offset, a.dtype)
+    offset = op.And(
+        op.Not(op.Equal(op.Sign(self), op.Sign(other))),
+        op.Cast(op.Mod(self, other), to=BOOL.dtype),
+    )
+    offset = op.Cast(offset, to=self.dtype)
+    return op.Sub(op.Div(self, other), offset)
 
 
 @torch_op("_operator::floordiv", trace_only=True)

onnxscript/rewriter/ort_fusions/sdpa.py

Lines changed: 3 additions & 0 deletions
@@ -88,6 +88,9 @@ def pattern(
         )
 
         attn_weight = op.Softmax(attn_score, axis=-1)
+        is_nan = op.IsNaN(attn_weight)
+        adj_attn_weight = op.Where(is_nan, 0.0, attn_weight)
+        attn_weight = pattern.OrValue([adj_attn_weight, attn_weight])
         attn_output = op.MatMul(attn_weight, value)
         return attn_output
 
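The three added lines let the SDPA pattern also match graphs that zero out NaN attention weights: when a mask sets an entire row of scores to -inf, Softmax computes 0/0 and yields NaN for that row, and Where(IsNaN, 0, w) replaces it with zeros so the row contributes nothing to the final MatMul; pattern.OrValue then accepts either the adjusted or the original weights. A small NumPy illustration of that NaN behavior (illustrative only, not onnxscript code):

import numpy as np

# Row 0 is a normal attention row; row 1 is fully masked with -inf.
scores = np.array([[1.0, 2.0, 3.0],
                   [-np.inf, -np.inf, -np.inf]])

with np.errstate(invalid="ignore"):
    weights = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)

print(np.isnan(weights[1]).all())   # True: softmax of an all -inf row is NaN
adjusted = np.where(np.isnan(weights), 0.0, weights)
print(adjusted[1])                  # [0. 0. 0.] -> contributes nothing downstream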

onnxscript/rewriter/ort_fusions/sdpa_test.py

Lines changed: 68 additions & 17 deletions
@@ -44,7 +44,10 @@ def _unmasked_pre_div_sdpa_script(query, key, value):
     scaled_key = op.Div(key_transposed, divisor)
     attn_score = op.MatMul(scaled_query, scaled_key)
     attn_weight = op.Softmax(attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -56,7 +59,10 @@ def _unmasked_pre_mul_sdpa_script(query, key, value):
     scaled_key = op.Mul(key_transposed, multiplier)
     attn_score = op.MatMul(scaled_query, scaled_key)
     attn_weight = op.Softmax(attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -67,7 +73,10 @@ def _unmasked_post_div_sdpa_script(query, key, value):
     attn_score = op.MatMul(query, key_transposed)
     scaled_attn_score = op.Div(attn_score, divisor)
     attn_weight = op.Softmax(scaled_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -78,7 +87,10 @@ def _unmasked_post_mul_sdpa_script(query, key, value):
     attn_score = op.MatMul(query, key_transposed)
     scaled_attn_score = op.Mul(attn_score, multiplier)
     attn_weight = op.Softmax(scaled_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -90,7 +102,10 @@ def _custom_scale_pre_div_sdpa_script(query, key, value):
     scaled_key = op.Div(key_transposed, divisor)
     attn_score = op.MatMul(scaled_query, scaled_key)
     attn_weight = op.Softmax(attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -102,7 +117,10 @@ def _custom_scale_pre_mul_sdpa_script(query, key, value):
     scaled_key = op.Mul(key_transposed, multiplier)
     attn_score = op.MatMul(scaled_query, scaled_key)
     attn_weight = op.Softmax(attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -115,7 +133,10 @@ def _custom_multi_scale_pre_mul_sdpa_script(query, key, value):
     scaled_key = op.Mul(key_transposed, multiplier_k)
     attn_score = op.MatMul(scaled_query, scaled_key)
     attn_weight = op.Softmax(attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -126,7 +147,10 @@ def _custom_scale_post_div_sdpa_script(query, key, value):
     attn_score = op.MatMul(query, key_transposed)
     scaled_attn_score = op.Div(attn_score, divisor)
     attn_weight = op.Softmax(scaled_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -137,7 +161,10 @@ def _custom_scale_post_mul_sdpa_script(query, key, value):
     attn_score = op.MatMul(query, key_transposed)
     scaled_attn_score = op.Mul(attn_score, multiplier)
     attn_weight = op.Softmax(scaled_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -150,7 +177,10 @@ def _masked_pre_div_sdpa_script(query, key, value, mask):
     attn_score = op.MatMul(scaled_query, scaled_key)
     masked_attn_score = op.Add(attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -163,7 +193,10 @@ def _masked_pre_mul_sdpa_script(query, key, value, mask):
     attn_score = op.MatMul(scaled_query, scaled_key)
     masked_attn_score = op.Add(attn_score, mask)
    attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -175,7 +208,10 @@ def _masked_post_div_sdpa_script(query, key, value, mask):
     scaled_attn_score = op.Div(attn_score, divisor)
     masked_attn_score = op.Add(scaled_attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -187,7 +223,10 @@ def _masked_post_mul_sdpa_script(query, key, value, mask):
     scaled_attn_score = op.Mul(attn_score, multiplier)
     masked_attn_score = op.Add(scaled_attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -200,7 +239,10 @@ def _masked_custom_scale_pre_div_sdpa_script(query, key, value, mask):
     attn_score = op.MatMul(scaled_query, scaled_key)
     masked_attn_score = op.Add(attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -213,7 +255,10 @@ def _masked_custom_scale_pre_mul_sdpa_script(query, key, value, mask):
     attn_score = op.MatMul(scaled_query, scaled_key)
     masked_attn_score = op.Add(attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -225,7 +270,10 @@ def _masked_custom_scale_post_div_sdpa_script(query, key, value, mask):
     scaled_attn_score = op.Div(attn_score, divisor)
     masked_attn_score = op.Add(scaled_attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -237,7 +285,10 @@ def _masked_custom_scale_post_mul_sdpa_script(query, key, value, mask):
     scaled_attn_score = op.Mul(attn_score, multiplier)
     masked_attn_score = op.Add(scaled_attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
onnxscript/rewriter/rules/common/_fuse_conv_affine_test.py

Lines changed: 6 additions & 10 deletions
@@ -18,9 +18,7 @@ def clone_model(self, model: ir.Model) -> ir.Model:
 
     def test_conv_affine_fusion(self):
         tape = ir.tape.Tape()
-        x = ir.Input(
-            "x", shape=ir.Shape([1, 3, 32, 32]), type=ir.TensorType(ir.DataType.FLOAT)
-        )
+        x = ir.val("x", dtype=ir.DataType.FLOAT, shape=ir.Shape([1, 3, 32, 32]))
         w = tape.initializer(ir.tensor(np.ones((3, 3, 3, 3), dtype=np.float32), name="w"))
         b = tape.initializer(ir.tensor(np.ones((3,), dtype=np.float32), name="b"))
         scale = tape.initializer(ir.tensor(np.array([2.0], dtype=np.float32), name="scale"))
@@ -31,10 +29,10 @@ def test_conv_affine_fusion(self):
         z = tape.op(
             "Add",
             [mul_out, offset],
-            output=ir.Input(
+            output=ir.val(
                 "z",
+                dtype=ir.DataType.FLOAT,
                 shape=ir.Shape([1, 3, 32, 32]),
-                type=ir.TensorType(ir.DataType.FLOAT),
             ),
         )
 
@@ -65,9 +63,7 @@ def test_conv_affine_fusion(self):
 
     def test_affine_conv_fusion_without_pad(self):
         tape = ir.tape.Tape()
-        x = ir.Input(
-            "x", shape=ir.Shape([1, 3, 32, 32]), type=ir.TensorType(ir.DataType.FLOAT)
-        )
+        x = ir.val("x", dtype=ir.DataType.FLOAT, shape=ir.Shape([1, 3, 32, 32]))
         w = tape.initializer(ir.tensor(np.ones((3, 3, 3, 3), dtype=np.float32), name="w"))
         b = tape.initializer(ir.tensor(np.ones((3,), dtype=np.float32), name="b"))
         scale = tape.initializer(ir.tensor(np.array([2.0], dtype=np.float32), name="scale"))
@@ -77,10 +73,10 @@ def test_affine_conv_fusion_without_pad(self):
         z = tape.op(
             "Add",
             [mul_out, offset],
-            output=ir.Input(
+            output=ir.val(
                 "z",
+                dtype=ir.DataType.FLOAT,
                 shape=ir.Shape([1, 3, 32, 32]),
-                type=ir.TensorType(ir.DataType.FLOAT),
             ),
         )
         conv_out = tape.op("Conv", [z, w, b], attributes={"pads": [0, 0, 0, 0]})
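These tests now build typed values with ir.val instead of ir.Input plus an explicit ir.TensorType. A minimal sketch of the new call, using only the arguments that appear in this diff (the property accesses in the print are assumptions, not something this commit exercises):

from onnxscript import ir

# ir.val bundles name, element dtype, and shape into one Value construction.
x = ir.val("x", dtype=ir.DataType.FLOAT, shape=ir.Shape([1, 3, 32, 32]))
print(x.name, x.dtype, x.shape)  # assumed Value properties: name, dtype, shape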

tests/function_libs/torch_lib/extra_opinfo.py

Lines changed: 1 addition & 10 deletions
@@ -2270,18 +2270,9 @@ def __init__(self):
     opinfo_core.BinaryUfuncInfo(
         "ops.aten.floor_divide",
         aten_name="floor_divide",
-        dtypes=common_dtype.floating_types_and_half(),
+        dtypes=common_dtype.all_types_and_half(),
         rhs_make_tensor_kwargs=dict(exclude_zero=True),
     ),
-    opinfo_core.BinaryUfuncInfo(
-        "ops.aten.floor_divide.int",
-        aten_name="floor_divide",
-        op=torch.ops.aten.floor_divide,
-        dtypes=common_dtype.integral_types(),
-        # Create only positive inputs
-        lhs_make_tensor_kwargs=dict(low=0),
-        rhs_make_tensor_kwargs=dict(exclude_zero=True, low=0),
-    ),
     opinfo_core.OpInfo(
         "ops.aten.hamming_window",
         aten_name="hamming_window",

tests/function_libs/torch_lib/ops_test_data.py

Lines changed: 0 additions & 1 deletion
@@ -794,7 +794,6 @@ def _where_input_wrangler(
     TorchLibOpInfo("flatten", core_ops.aten_flatten),
     TorchLibOpInfo("floor", core_ops.aten_floor),
     TorchLibOpInfo("ops.aten.floor_divide", core_ops.aten_floor_divide),
-    TorchLibOpInfo("ops.aten.floor_divide.int", core_ops.aten_floor_divide_int),
     TorchLibOpInfo("fmod", core_ops.aten_fmod),
     TorchLibOpInfo("frac", core_ops.aten_frac),
     TorchLibOpInfo("full", core_ops.aten_full),
