
Commit 7d3bf12

[Gluon] Broadcast auto with concrete layouts (#7491)
This also makes `arange` return auto layout by default, so you can, for example, write:

```python
xidx = gl.arange(0, 32)[:, None]
yidx = gl.arange(0, 16)[None, :]
off = xidx * x_stride + yidx * y_stride
off = gl.convert_layout(off, concrete_layout)
```

I also fixed `filecheck_test` to disable the overflow sanitizer, since it can cause lit tests to fail when CHECK statements match against the overflow-sanitizer ops.
Parent commit: 041ec1b
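The broadcast change in the title is easiest to see in a small kernel. A minimal sketch, mirroring the test added below (the import paths and the `BlockedLayout` parameters are illustrative):

```python
import triton.experimental.gluon.language as ttgl
from triton.experimental import gluon


@gluon.jit
def auto_broadcast_kernel():
    # x stays in AutoLayout; y already carries a concrete blocked layout.
    x = ttgl.full([16, 1], 1, ttgl.int32, layout=ttgl.AutoLayout())
    y = ttgl.full([1, 16], 2, ttgl.int32,
                  layout=ttgl.BlockedLayout([1, 1], [1, 32], [4, 1], [1, 0]))
    # x is converted to y's layout, then both sides are broadcast to 16x16.
    _ = x + y
```

Previously, mixing an `AutoLayout` operand with a concretely laid-out one raised a layout-mismatch error, so the auto side had to be converted by hand.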

4 files changed: +46, -8 lines

python/test/gluon/test_frontend.py (25 additions, 0 deletions)

```diff
@@ -1233,3 +1233,28 @@ def test_auto_layout():
     z = x + y
     # CHECK: (tensor<16x8xi32, #gluon.auto_encoding>) -> tensor<16xi32, #gluon.auto_encoding
     ttgl.sum(z, axis=1)
+
+    # CHECK: tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #gluon.auto_encoding>
+    ttgl.arange(0, 32)
+
+
+@filecheck_test
+@gluon.jit
+def test_auto_layout_broadcast():
+    # CHECK: [[BLOCKED:#.*]] = #ttg.blocked
+    # CHECK: [[X:%.*]] = arith.constant dense<1> : tensor<16x1xi32, #gluon.auto_encoding>
+    # CHECK: [[Y:%.*]] = arith.constant dense<2> : tensor<1x16xi32, [[BLOCKED]]>
+    x = ttgl.full([16, 1], 1, ttgl.int32, layout=ttgl.AutoLayout())
+    y = ttgl.full([1, 16], 2, ttgl.int32, layout=ttgl.BlockedLayout([1, 1], [1, 32], [4, 1], [1, 0]))
+
+    # CHECK: [[XCVT:%.*]] = ttg.convert_layout [[X]] : tensor<16x1xi32, #gluon.auto_encoding> -> tensor<16x1xi32, [[BLOCKED]]>
+    # CHECK: [[XBCAST:%.*]] = tt.broadcast [[XCVT]]
+    # CHECK: [[YBCAST:%.*]] = tt.broadcast [[Y]]
+    # CHECK: arith.addi [[XBCAST]], [[YBCAST]] : tensor<16x16xi32, [[BLOCKED]]>
+    _ = x + y
+
+    # CHECK: [[XCVT2:%.*]] = ttg.convert_layout [[X]] : tensor<16x1xi32, #gluon.auto_encoding> -> tensor<16x1xi32, [[BLOCKED]]>
+    # CHECK: [[YBCAST2:%.*]] = tt.broadcast [[Y]]
+    # CHECK: [[XBCAST2:%.*]] = tt.broadcast [[XCVT2]]
+    # CHECK: arith.muli [[YBCAST2]], [[XBCAST2]] : tensor<16x16xi32, [[BLOCKED]]>
+    _ = y * x
```
python/triton/_filecheck.py (7 additions, 4 deletions)

```diff
@@ -42,8 +42,9 @@ def run_filecheck(name, module_str, check_template):
         temp.write(check_template)
 
     try:
-        subprocess.check_output([filecheck_path, temp_expected, "--input-file", temp_module],
-                                stderr=subprocess.STDOUT)
+        subprocess.check_output(
+            [filecheck_path, temp_expected, "--input-file", temp_module, "--dump-input-context=50"],
+            stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as error:
        decoded = error.output.decode('unicode_escape')
        raise ValueError(decoded)
@@ -60,8 +61,10 @@ def run_parser(kernel_fn):
    ir.load_dialects(context)
    stub_backend.load_dialects(context)
 
-    extra_options = src.parse_options()
-    options = stub_backend.parse_options(dict(**extra_options))
+    options = dict(sanitize_overflow=False)
+    options.update(src.parse_options())
+
+    options = stub_backend.parse_options(options)
    codegen_fns = stub_backend.get_codegen_implementation(options)
    module_map = stub_backend.get_module_map()
    module = src.make_ir(options, codegen_fns, module_map, context)
```
python/triton/experimental/gluon/language/_core.py (3 additions, 2 deletions)

```diff
@@ -43,6 +43,7 @@
 )
 
 _IMPORT_FROM_TRITON: List[str] = [
+    "broadcast",
     "expand_dims",
     "inline_asm_elementwise",
     "join",
@@ -341,14 +342,14 @@ def _keep_alive(self, _semantic: GluonSemantic = None) -> None:
 
 
 @builtin
-def arange(start, end, layout, _semantic=None):
+def arange(start, end, layout=None, _semantic=None):
     """
     Generate a sequence tensor with values in [start, end) using a specified layout.
 
     Args:
         start (int): Inclusive start of the sequence.
         end (int): Exclusive end of the sequence.
-        layout (DistributedLayout): The layout of the output tensor.
+        layout (DistributedLayout): The layout of the output tensor. Defaults to AutoLayout.
 
     Returns:
         tensor: A 1D tensor containing sequential values.
```
python/triton/experimental/gluon/language/_semantic.py (11 additions, 2 deletions)

```diff
@@ -112,7 +112,14 @@ def broadcast_impl_value(self, lhs: TensorTy, rhs: TensorTy) -> TensorTy:
         lhs_shape = lhs_ty.get_block_shapes()
         rhs_shape = rhs_ty.get_block_shapes()
         ret_shape = self._broadcast_shapes(lhs_shape, rhs_shape)
-        if lhs_ty.layout != rhs_ty.layout:
+
+        is_lhs_auto = isinstance(lhs_ty.layout, AutoLayout)
+        is_rhs_auto = isinstance(rhs_ty.layout, AutoLayout)
+        if is_lhs_auto and not is_rhs_auto:
+            lhs = self.convert_layout(lhs, rhs_ty.layout)
+        elif is_rhs_auto and not is_lhs_auto:
+            rhs = self.convert_layout(rhs, lhs_ty.layout)
+        elif lhs_ty.layout != rhs_ty.layout:
             raise ValueError(f"Layout mismatch in broadcast: {lhs_ty.layout} vs {rhs_ty.layout}")
 
         lhs = self.broadcast_impl_shape(lhs, ret_shape)
```
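Restated, the rule this hunk adds is: if exactly one operand is in `AutoLayout`, it adopts the other operand's layout; otherwise the two layouts must already match (both auto, or both the same concrete layout). A standalone sketch of that decision, using a plain-Python stand-in rather than the actual Gluon layout classes:

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class AutoLayout:
    """Stand-in for Gluon's AutoLayout marker, for illustration only."""


def resolve_broadcast_layout(lhs_layout, rhs_layout):
    """Return the layout both operands should use before broadcasting."""
    lhs_auto = isinstance(lhs_layout, AutoLayout)
    rhs_auto = isinstance(rhs_layout, AutoLayout)
    if lhs_auto and not rhs_auto:
        return rhs_layout  # the auto side adopts the concrete layout
    if rhs_auto and not lhs_auto:
        return lhs_layout
    if lhs_layout != rhs_layout:
        raise ValueError(f"Layout mismatch in broadcast: {lhs_layout} vs {rhs_layout}")
    return lhs_layout  # both auto, or both the same concrete layout
```

The `assert_trivial=False` default added to `convert_layout` in the last hunk below is what the new calls in this hunk rely on: `broadcast_impl_value` can pass just the value and the target layout.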
```diff
@@ -121,6 +128,8 @@ def broadcast_impl_value(self, lhs: TensorTy, rhs: TensorTy) -> TensorTy:
 
     def arange(self, start, end, layout):
         shape = [end - start]
+        if layout is None:
+            layout = AutoLayout()
         ret_ty = ttgl.distributed_type(ttgl.int32, shape, layout)
         return super().arange(start, end, ret_ty=ret_ty)
 
@@ -138,7 +147,7 @@ def full(self, shape, value, dtype, layout):
         scalar = self.make_scalar(value, dtype)
         return self.splat(scalar, shape, layout)
 
-    def convert_layout(self, value, layout, assert_trivial):
+    def convert_layout(self, value, layout, assert_trivial=False):
         ty = value.type
         _check(isinstance(ty, ttgl.distributed_type),
                lambda: f"expected convert_layout input to be a distributed_type but got: {ty!r}")
```
