
Commit d9fcc10

[Gluon] Fix warp_specialize with constexprs, add a few APIs (#7097)
* where, maximum, minimum
* add gluon_ir builder for fence_async_shared. Not sure what API to use
* fix `ttgl.warp_specialize` passing constexpr arguments
1 parent d57cbee commit d9fcc10
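For orientation, a minimal user-level sketch of the three elementwise APIs named in the first bullet, mirroring the signatures exercised by the new test_elementwise_core test further down (the import paths and kernel name here are illustrative, not part of the diff):

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl

@gluon.jit
def elementwise_demo():
    layout: ttgl.constexpr = ttgl.BlockedLayout([1], [32], [4], [0])
    x = ttgl.arange(0, 16, layout)
    y = ttgl.arange(16, 32, layout)
    a = ttgl.where(x > 8, x, y)   # elementwise select (lowers to arith.select)
    b = ttgl.maximum(x, y)        # elementwise max (arith.maxsi for i32 tensors)
    c = ttgl.minimum(x, y)        # elementwise min (arith.minsi for i32 tensors)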

5 files changed (+39 / -16 lines)
python/src/gluon_ir.cc

Lines changed: 5 additions & 0 deletions
@@ -299,6 +299,11 @@ void init_gluon_ir(py::module &&m) {
             self.create<ttng::AsyncTMAScatterOp>(descPtr, xOffsets, yOffset,
                                                  src);
           })
+      .def("create_fence_async_shared",
+           [](GluonOpBuilder &self, bool bCluster) -> OpState {
+             return self.create<ttng::FenceAsyncSharedOp>(bCluster);
+           })
       .def("create_broadcast",
            [](TritonOpBuilder &self, Value &arg, Type retTy) -> Value {
              return self.create<tt::BroadcastOp>(retTy, arg);
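The diff only adds the builder entry point; per the commit message, the user-facing API for the shared-memory fence is still undecided. As a hedged sketch, a Python-side lowering helper could drive the new binding like this (fence_async_shared is a hypothetical wrapper name, and builder is assumed to be the GluonOpBuilder handle available during code generation):

def fence_async_shared(builder, cluster: bool = False):
    # Emits a ttng.fence_async_shared op; `cluster` maps to the bCluster
    # flag of the new create_fence_async_shared binding.
    return builder.create_fence_async_shared(cluster)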

python/src/ir.cc

Lines changed: 1 addition & 6 deletions
@@ -1425,12 +1425,7 @@ void init_triton_ir(py::module &&m) {
           })
       .def("create_expand_dims",
            [](TritonOpBuilder &self, Value &arg, int axis) -> Value {
-             auto argType = dyn_cast<RankedTensorType>(arg.getType());
-             auto argEltType = argType.getElementType();
-             std::vector<int64_t> retShape = argType.getShape();
-             retShape.insert(retShape.begin() + axis, 1);
-             return self.create<ExpandDimsOp>(
-                 RankedTensorType::get(retShape, argEltType), arg, axis);
+             return self.create<ExpandDimsOp>(arg, axis);
           })
       .def("create_cat",
            [](TritonOpBuilder &self, Value &lhs, Value &rhs) -> Value {
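Here the binding stops rebuilding the result RankedTensorType by hand and lets ExpandDimsOp derive it (presumably via the op's own return-type inference, which the one-argument create<ExpandDimsOp>(arg, axis) form relies on). The Python-side call is unchanged; a minimal sketch, assuming x_handle is an i32 tensor value of shape [16] obtained from the usual builder context:

# Inserting a unit dimension at axis 0: tensor<16xi32> -> tensor<1x16xi32>.
expanded = builder.create_expand_dims(x_handle, 0)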

python/test/gluon/test_frontend.py

Lines changed: 29 additions & 8 deletions
@@ -283,17 +283,17 @@ def test_shared_memory_cast(fresh_knobs):
 @gluon.jit
-def warp_specialize_default(a, b):
+def warp_specialize_default(a, b, e: ttgl.constexpr):
     return b, a


 @gluon.jit
-def warp_specialize_worker0(a, b):
+def warp_specialize_worker0(a, b, e: ttgl.constexpr):
     pass


 @gluon.jit
-def warp_specialize_worker1(a, b):
+def warp_specialize_worker1(a, b, e: ttgl.constexpr):
     pass

@@ -322,15 +322,15 @@ def test_warp_specialize():
     # CHECK-NEXT: [[C:%.*]] = tt.make_range {end = 4 : i32, start = 0 : i32}
     # CHECK-NEXT: [[OUTS:%.*]]:3 = ttg.warp_specialize([[A]], [[B]], [[C]]) {{.*}}requestedRegisters = array<i32: 24, 48>
     # CHECK-NEXT: default {
-    # CHECK-NEXT: [[RESULTS:%.*]]:3 = tt.call @{{.*}}warp_specialize_default{{.*}}([[A]], [[B]], [[C]])
+    # CHECK-NEXT: [[RESULTS:%.*]]:3 = tt.call @{{.*}}warp_specialize_default{{.*}}cconstexpr_42{{.*}}([[A]], [[B]], [[C]])
     # CHECK-NEXT: warp_yield [[RESULTS]]#0, [[RESULTS]]#1, [[RESULTS]]#2
     # CHECK-NEXT: }
     # CHECK-NEXT: partition0(%arg0: tensor<1xi32, [[BLOCKED]]>, %arg1: tensor<2xi32, [[BLOCKED]]>, %arg2: tensor<4xi32, [[BLOCKED]]>) num_warps(4) {
-    # CHECK-NEXT: call @{{.*}}warp_specialize_worker0{{.*}}(%arg0, %arg1, %arg2)
+    # CHECK-NEXT: call @{{.*}}warp_specialize_worker0{{.*}}cconstexpr_42{{.*}}(%arg0, %arg1, %arg2)
     # CHECK-NEXT: warp_return
     # CHECK-NEXT: }
     # CHECK-NEXT: partition1(%arg0: tensor<1xi32, [[BLOCKED]]>, %arg1: tensor<2xi32, [[BLOCKED]]>, %arg2: tensor<4xi32, [[BLOCKED]]>) num_warps(4) {
-    # CHECK-NEXT: call @{{.*}}warp_specialize_worker1{{.*}}(%arg0, %arg1, %arg2)
+    # CHECK-NEXT: call @{{.*}}warp_specialize_worker1{{.*}}cconstexpr_42{{.*}}(%arg0, %arg1, %arg2)
     # CHECK-NEXT: warp_return
     # CHECK-NEXT: }
     # CHECK-NEXT: call @{{.*}}anchor{{.*}}([[OUTS]]#0)
@@ -340,8 +340,9 @@ def test_warp_specialize():
     b = ttgl.arange(0, 2, layout=layout)
     c = ttgl.arange(0, 4, layout=layout)
     pair = Pair(a, b)
-    a, b = ttgl.warp_specialize((pair, c), warp_specialize_default, [warp_specialize_worker0, warp_specialize_worker1],
-                                [4, 4], [24, 48])
+    e: ttgl.constexpr = 42
+    a, b = ttgl.warp_specialize((pair, c, e), warp_specialize_default,
+                                [warp_specialize_worker0, warp_specialize_worker1], [4, 4], [24, 48])
     anchor(a)
     anchor(b)

@@ -781,3 +782,23 @@ def test_reduce(fresh_knobs):
 } loc(#loc)
 } loc(#loc)
 """)
+
+
+@filecheck_test
+@gluon.jit
+def test_elementwise_core():
+    # CHECK: [[BLOCKED:#.*]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+    # CHECK: @test_elementwise_core
+    layout: ttgl.constexpr = ttgl.BlockedLayout([1], [32], [4], [0])
+    x = ttgl.arange(0, 16, layout)
+    y = ttgl.arange(16, 32, layout)
+
+    # CHECK: arith.select {{.*}} : tensor<16xi1, [[BLOCKED]]>, tensor<16xi32, [[BLOCKED]]>
+    a = ttgl.where(x > 8, x, y)
+    # CHECK: arith.maxsi {{.*}} : tensor<16xi32, [[BLOCKED]]>
+    b = ttgl.maximum(x, y)
+    # CHECK: arith.minsi {{.*}} : tensor<16xi32, [[BLOCKED]]>
+    c = ttgl.minimum(x, y)
+    ttgl.static_assert(a.type == x.type)
+    ttgl.static_assert(b.type == x.type)
+    ttgl.static_assert(c.type == x.type)

python/triton/experimental/gluon/language/_core.py

Lines changed: 3 additions & 1 deletion
@@ -49,6 +49,9 @@
     "static_assert",  # NOQA: F822
     "store",  # NOQA: F822
     "to_tensor",  # NOQA: F822
+    "where",  # NOQA: F822
+    "maximum",  # NOQA: F822
+    "minimum",  # NOQA: F822
 ]

 __all__ = [
@@ -303,6 +306,5 @@ def warp_specialize(args, default_partition, worker_partitions, worker_num_warps
                     _semantic=None, _generator=None):
     worker_num_warps = [_unwrap_if_constexpr(w) for w in worker_num_warps]
     worker_num_regs = [_unwrap_if_constexpr(r) for r in worker_num_regs]
-    args = [_unwrap_if_constexpr(arg) for arg in args]
     return _semantic.warp_specialize(args, default_partition, worker_partitions, worker_num_warps,  #
                                      worker_num_regs, _generator)
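Dropping the per-argument unwrap is the core of the `ttgl.warp_specialize` fix: constexpr arguments now reach the semantic layer intact, so each partition is specialized on them at compile time (hence the cconstexpr_42 suffix in the mangled call names checked above) instead of being unwrapped to plain Python values beforehand. The caller-side pattern, mirroring the updated test:

e: ttgl.constexpr = 42   # stays a compile-time constant in every partition
a, b = ttgl.warp_specialize((pair, c, e), warp_specialize_default,
                            [warp_specialize_worker0, warp_specialize_worker1],
                            [4, 4], [24, 48])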

python/triton/language/semantic.py

Lines changed: 1 addition & 1 deletion
@@ -1683,7 +1683,7 @@ def associative_scan(self, inputs: Sequence[TensorTy], axis: int, region_builder
         scan_op = self.builder.create_scan([t.handle for t in inputs], axis, reverse)
         region_builder_fn(scan_op)
-        scan_op.verify()
+        assert scan_op.verify()

         return tuple(self.wrap_tensor(scan_op.get_result(i), inputs[i].type.scalar, shape) for i in range(len(inputs)))
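A short note on the one-line change, assuming OpState.verify() returns a success flag and emits diagnostics rather than raising: the flag used to be discarded, so an invalid scan region produced by region_builder_fn would only fail later during lowering. Asserting the result makes the failure immediate at op-construction time:

# Fail fast: abort here if the freshly built tt.scan region does not verify.
assert scan_op.verify()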
