Skip to content

Commit fb68276

Browse files
Merge commit '6af491923135061b107375f1716c7224b1807708'
2 parents 4706aaa + 6af4919 commit fb68276

File tree

18 files changed

+199
-85
lines changed

18 files changed

+199
-85
lines changed

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,8 @@ scf::ForOp replaceForOpWithNewSignature(
141141
SmallVectorImpl<std::tuple<Value, Value>> &replacements);
142142
scf::ForOp replaceForOpWithNewSignature(OpBuilder &rewriter, scf::ForOp loop,
143143
ValueRange newIterOperands);
144-
Block::BlockArgListType addIterArgsToLoop(OpBuilder &rewriter, scf::ForOp &loop,
145-
ValueRange newIterOperands);
144+
[[nodiscard]] scf::ForOp addIterArgsToLoop(OpBuilder &rewriter, scf::ForOp loop,
145+
ValueRange newIterOperands);
146146

147147
// Replace WhileOp with a new WhileOp with extra operands. The YieldOp is not
148148
// updated and needs to be updated separately for the loop to be correct.

lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,8 @@ ttng::TMEMAllocOp hoistTMEMAlloc(TMEMTokenAllocOp alloc, scf::ForOp &forOp) {
337337
// By hoisting the allocation out of the loop, we need to turn the underlying
338338
// memory variable into a loop-carried dependency.
339339
auto tokType = builder.getType<AsyncTokenType>();
340-
Value newTok = addIterArgsToLoop(builder, forOp, newAlloc.getToken()).front();
340+
forOp = addIterArgsToLoop(builder, forOp, newAlloc.getToken());
341+
Value newTok = forOp.getRegionIterArgs().back();
341342
appendToForOpYield(forOp, joinLastMemoryUses(builder, alloc.getToken()));
342343

343344
if (src != nullptr) {

lib/Dialect/TritonGPU/Transforms/OptimizeAccumulatorInit.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ class OptimizeAccumulatorInitPass
249249
}
250250

251251
Value loopArgFlagValue = loopArgIsZero ? vFalse : vTrue;
252-
(void)addIterArgsToLoop(rewriter, forOp, {loopArgFlagValue});
252+
forOp = addIterArgsToLoop(rewriter, forOp, {loopArgFlagValue});
253253
loopArgFlagValue =
254254
forOp.getRegionIterArg(forOp.getNumRegionIterArgs() - 1);
255255

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -556,7 +556,7 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule) {
556556
}
557557

558558
// Patch the loop to add the new loop carried dependencies.
559-
(void)addIterArgsToLoop(builder, forOp, newOperands);
559+
forOp = addIterArgsToLoop(builder, forOp, newOperands);
560560

561561
// Update yield op with temporary yield values
562562
auto forYield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
@@ -750,7 +750,7 @@ scf::ForOp lowerTMADescriptors(scf::ForOp forOp, CoarseSchedule &schedule) {
750750
newOperands.push_back(zero);
751751
}
752752

753-
(void)addIterArgsToLoop(builder, forOp, newOperands);
753+
forOp = addIterArgsToLoop(builder, forOp, newOperands);
754754

755755
auto tmaCounters = ArrayRef<BlockArgument>(forOp.getBody()->getArguments())
756756
.slice(tmaCounterArgsStartIdx);

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ Value triton::sinkValueRedefinition(RewriterBase &rewriter, Value in, Value out,
106106
// `in` is live into the loop body. `out` becomes the live-out if the
107107
// loop executes at least once.
108108
if (auto forOp = dyn_cast<scf::ForOp>(op)) {
109-
(void)addIterArgsToLoop(rewriter, forOp, in);
109+
forOp = addIterArgsToLoop(rewriter, forOp, in);
110110
appendToForOpYield(forOp, out);
111111
out = forOp.getResults().back();
112112
continue;

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -682,17 +682,15 @@ scf::ForOp replaceForOpWithNewSignature(OpBuilder &rewriter, scf::ForOp loop,
682682
return newForOp;
683683
}
684684

685-
Block::BlockArgListType addIterArgsToLoop(OpBuilder &rewriter, scf::ForOp &loop,
686-
ValueRange newIterOperands) {
687-
unsigned curArgIdx = loop.getNumRegionIterArgs();
685+
scf::ForOp addIterArgsToLoop(OpBuilder &rewriter, scf::ForOp loop,
686+
ValueRange newIterOperands) {
688687
scf::ForOp newLoop =
689688
replaceForOpWithNewSignature(rewriter, loop, newIterOperands);
690689
// Save the caller from insertion point invalidation.
691690
if (rewriter.getInsertionPoint() == loop->getIterator())
692691
rewriter.setInsertionPoint(newLoop);
693692
loop.erase();
694-
loop = newLoop;
695-
return loop.getRegionIterArgs().slice(curArgIdx);
693+
return newLoop;
696694
}
697695

698696
scf::WhileOp replaceWhileOpWithNewSignature(

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,8 @@ addIndexAndPhase(PartitionBuilder &b, scf::ForOp &loop, unsigned numStages,
118118
b.setInsertionPoint(loop);
119119

120120
// Index and phase both start at 0.
121-
unsigned curArgIdx = loop.getNumRegionIterArgs();
122-
auto newArgs = addIterArgsToLoop(b, loop, {b.intCst(0), b.intCst(0)});
121+
loop = addIterArgsToLoop(b, loop, {b.intCst(0), b.intCst(0)});
122+
auto newArgs = loop.getRegionIterArgs().take_back(2);
123123
BlockArgument index = newArgs[0];
124124
BlockArgument phase = newArgs[1];
125125

@@ -488,7 +488,8 @@ static LogicalResult pipelineMMA(scf::ForOp &loop, PipelinedMMA &mma,
488488
createTMemAlloc(b, oldAllocOp, /*multiBuffered=*/true, numMmaStages);
489489

490490
// Use placeholder values for the indices in the loop.
491-
auto indexPhase = addIterArgsToLoop(b, loop, {b.intCst(0), b.intCst(0)});
491+
loop = addIterArgsToLoop(b, loop, {b.intCst(0), b.intCst(0)});
492+
auto indexPhase = loop.getRegionIterArgs().take_back(2);
492493
BlockArgument index = indexPhase[0];
493494
BlockArgument phase = indexPhase[1];
494495

python/test/gluon/test_frontend.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -307,25 +307,25 @@ def anchor(x):
307307
@filecheck_test
308308
@gluon.jit
309309
def test_warp_specialize():
310-
# CHECK-LABEL: tt.func public @test_warp_specialize
310+
# CHECK-LABEL: test_warp_specialize
311311
# CHECK-NEXT: [[A:%.*]] = tt.make_range {end = 1 : i32, start = 0 : i32}
312312
# CHECK-NEXT: [[B:%.*]] = tt.make_range {end = 2 : i32, start = 0 : i32}
313313
# CHECK-NEXT: [[C:%.*]] = tt.make_range {end = 4 : i32, start = 0 : i32}
314314
# CHECK-NEXT: [[OUTS:%.*]]:3 = ttg.warp_specialize([[A]], [[B]], [[C]]) {{.*}}requestedRegisters = array<i32: 24, 48>
315315
# CHECK-NEXT: default {
316-
# CHECK-NEXT: [[RESULTS:%.*]]:3 = tt.call @"warp_specialize_default{{.*}}"([[A]], [[B]], [[C]])
316+
# CHECK-NEXT: [[RESULTS:%.*]]:3 = tt.call @{{.*}}warp_specialize_default{{.*}}([[A]], [[B]], [[C]])
317317
# CHECK-NEXT: warp_yield [[RESULTS]]#0, [[RESULTS]]#1, [[RESULTS]]#2
318318
# CHECK-NEXT: }
319319
# CHECK-NEXT: partition0(%arg0: tensor<1xi32>, %arg1: tensor<2xi32>, %arg2: tensor<4xi32>) num_warps(4) {
320-
# CHECK-NEXT: call @"warp_specialize_worker0{{.*}}"(%arg0, %arg1, %arg2)
320+
# CHECK-NEXT: call @{{.*}}warp_specialize_worker0{{.*}}(%arg0, %arg1, %arg2)
321321
# CHECK-NEXT: warp_return
322322
# CHECK-NEXT: }
323323
# CHECK-NEXT: partition1(%arg0: tensor<1xi32>, %arg1: tensor<2xi32>, %arg2: tensor<4xi32>) num_warps(4) {
324-
# CHECK-NEXT: call @"warp_specialize_worker1{{.*}}"(%arg0, %arg1, %arg2)
324+
# CHECK-NEXT: call @{{.*}}warp_specialize_worker1{{.*}}(%arg0, %arg1, %arg2)
325325
# CHECK-NEXT: warp_return
326326
# CHECK-NEXT: }
327-
# CHECK-NEXT: call @anchor{{.*}}([[OUTS]]#0)
328-
# CHECK-NEXT: call @"anchor{{.*}}"([[OUTS]]#1, [[OUTS]]#2)
327+
# CHECK-NEXT: call @{{.*}}anchor{{.*}}([[OUTS]]#0)
328+
# CHECK-NEXT: call @{{.*}}anchor{{.*}}([[OUTS]]#1, [[OUTS]]#2)
329329
pair = Pair(tl.arange(0, 1), tl.arange(0, 2))
330330
a, b = ttgl.warp_specialize((pair, tl.arange(0, 4)), warp_specialize_default,
331331
[warp_specialize_worker0, warp_specialize_worker1], [4, 4], [24, 48])
@@ -541,6 +541,29 @@ def kernel():
541541
assert "order must be a permutation of 0..(rank-1), but was [1]" in str(e.value.__cause__)
542542

543543

544+
@gluon.jit
545+
def tmem_subslice_kernel():
546+
layout: ttgl.constexpr = ttgl.nvidia.blackwell.TensorMemoryLayout(block=[128, 128], unpacked=True)
547+
tmem = ttgl.nvidia.blackwell.allocate_tensor_memory(ttgl.int32, [2, 256, 256], layout)
548+
tmem.subslice(0)
549+
550+
551+
def test_tmem_subslice_constexpr():
552+
expecttest.assert_expected_inline(
553+
run_parser(tmem_subslice_kernel).str_nodebug(), """\
554+
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
555+
module {
556+
tt.func public @tmem_subslice_kernel() attributes {noinline = false} {
557+
%result = ttng.tmem_alloc : () -> !ttg.memdesc<2x256x256xi32, #tmem, #ttng.tensor_memory, mutable>
558+
%c0_i32 = arith.constant 0 : i32
559+
%c0_i32_0 = arith.constant 0 : i32
560+
%0 = ttg.memdesc_subview %result[%c0_i32, %c0_i32_0, %c0_i32_0] : !ttg.memdesc<2x256x256xi32, #tmem, #ttng.tensor_memory, mutable> -> !ttg.memdesc<256x256xi32, #tmem, #ttng.tensor_memory, mutable, 2x256x256>
561+
tt.return
562+
}
563+
}
564+
""")
565+
566+
544567
@gluon.jit
545568
def smem_and_layout_user(smem, a: ttgl.constexpr):
546569
pass
@@ -561,10 +584,10 @@ def kernel():
561584
module {
562585
tt.func public @kernel() attributes {noinline = false} {
563586
%0 = ttg.local_alloc : () -> !ttg.memdesc<32x32xi32, #shared, #smem, mutable>
564-
tt.call @"smem_and_layout_user__MDi32S32_32SLSSS_1_1_1_constexpr[1]_constexpr[0]____SSSLAS[32, 32]ASMD__(1,)cconstexpr_SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1, order=(constexpr_1_ ,constexpr_0_), ctas_per_cga=None, cta_split_num=None, cta_order=None)_"(%0) : (!ttg.memdesc<32x32xi32, #shared, #smem, mutable>) -> ()
587+
tt.call @"test_frontend.smem_and_layout_user__MDi32S32_32SLSSS_1_1_1_constexpr[1]_constexpr[0]____SSSLAS[32, 32]ASMD__(1,)cconstexpr_SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1, order=(constexpr_1_ ,constexpr_0_), ctas_per_cga=None, cta_split_num=None, cta_order=None)_"(%0) : (!ttg.memdesc<32x32xi32, #shared, #smem, mutable>) -> ()
565588
tt.return
566589
}
567-
tt.func private @"smem_and_layout_user__MDi32S32_32SLSSS_1_1_1_constexpr[1]_constexpr[0]____SSSLAS[32, 32]ASMD__(1,)cconstexpr_SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1, order=(constexpr_1_ ,constexpr_0_), ctas_per_cga=None, cta_split_num=None, cta_order=None)_"(%arg0: !ttg.memdesc<32x32xi32, #shared, #smem, mutable>) attributes {noinline = false} {
590+
tt.func private @"test_frontend.smem_and_layout_user__MDi32S32_32SLSSS_1_1_1_constexpr[1]_constexpr[0]____SSSLAS[32, 32]ASMD__(1,)cconstexpr_SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1, order=(constexpr_1_ ,constexpr_0_), ctas_per_cga=None, cta_split_num=None, cta_order=None)_"(%arg0: !ttg.memdesc<32x32xi32, #shared, #smem, mutable>) attributes {noinline = false} {
568591
tt.return
569592
}
570593
}

python/test/unit/language/test_frontend.py

Lines changed: 67 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def test_assign_attribute():
4242
scalar = 11
4343
pair = Pair(tl.arange(0, 4), scalar)
4444
# CHECK: %c42_i32 = arith.constant 42 : i32
45-
# CHECK-NEXT: call @"anchor{{.*}}"([[RANGE]], %c42_i32)
45+
# CHECK-NEXT: call @{{.*}}anchor{{.*}}([[RANGE]], %c42_i32)
4646
pair.second = 42
4747
anchor(pair)
4848

@@ -58,7 +58,7 @@ def test_augassign_attribute():
5858
# CHECK: %c42_i32 = arith.constant 42 : i32
5959
# CHECK: [[VALUE:%.*]] = arith.addi %c11_i32, %c42_i32
6060
pair.second += 42
61-
# CHECK-NEXT: call @"anchor{{.*}}"([[RANGE]], [[VALUE]])
61+
# CHECK-NEXT: call @{{.*}}anchor{{.*}}([[RANGE]], [[VALUE]])
6262
anchor(pair)
6363

6464

@@ -69,12 +69,12 @@ def test_jit_method():
6969
# CHECK: %c11_i32 = arith.constant 11 : i32
7070
# CHECK: [[RANGE:%.*]] = tt.make_range {end = 4 : i32, start = 0 : i32}
7171
scalar = 11
72-
# CHECK: [[V:%.*]]:2 = tt.call @"unpack{{.*}}"([[RANGE]], %c11_i32)
72+
# CHECK: [[V:%.*]]:2 = tt.call @{{.*}}unpack{{.*}}([[RANGE]], %c11_i32)
7373
pair = Pair(tl.arange(0, 4), scalar)
7474
a, b = pair.unpack()
75-
# CHECK: call @anchor{{.*}}([[V]]#0)
75+
# CHECK: call @{{.*}}anchor{{.*}}([[V]]#0)
7676
anchor(a)
77-
# CHECK: call @anchor{{.*}}([[V]]#1)
77+
# CHECK: call @{{.*}}anchor{{.*}}([[V]]#1)
7878
anchor(b)
7979

8080

@@ -95,10 +95,10 @@ def test_aggregate_initializers():
9595
# CHECK-LABEL: test_aggregate_initializers
9696
value = TypeWithBuiltinInitializer()
9797
# CHECK: [[RANGE:%.*]] = tt.make_range {end = 4 : i32, start = 0 : i32}
98-
# CHECK: call @"anchor{{.*}}"([[RANGE]])
98+
# CHECK: call @{{.*}}anchor{{.*}}([[RANGE]])
9999
anchor(value)
100100
# CHECK: [[RANGE:%.*]] = tt.make_range {end = 8 : i32, start = 4 : i32}
101-
# CHECK: call @"anchor{{.*}}"([[RANGE]])
101+
# CHECK: call @{{.*}}anchor{{.*}}([[RANGE]])
102102
value.modify(tl.arange(4, 8))
103103
anchor(value)
104104

@@ -118,11 +118,11 @@ def list_of_functions_constexpr(arg, fns: tl.constexpr):
118118
@triton.jit
119119
def test_list_of_functions():
120120
# CHECK-LABEL: test_list_of_functions
121-
# CHECK: call @"list_of_functions_constexpr{{.*}}cJITFunction(test_frontend:anchor){{.*}}cJITFunction(test_frontend:forward)"
121+
# CHECK: call @{{.*}}list_of_functions_constexpr{{.*}}cJITFunction(test_frontend:anchor){{.*}}cJITFunction(test_frontend:forward)
122122

123-
# CHECK-LABEL: tt.func private @"list_of_functions_constexpr
124-
# CHECK-NEXT: call @anchor
125-
# CHECK-NEXT: call @forward
123+
# CHECK: tt.func private @{{.*}}list_of_functions_constexpr
124+
# CHECK-NEXT: call @{{.*}}anchor
125+
# CHECK-NEXT: call @{{.*}}forward
126126
list_of_functions_constexpr(tl.arange(0, 4), [anchor, forward])
127127

128128

@@ -138,6 +138,61 @@ def test_call_in_loop():
138138
# CHECK-LABEL: test_call_in_loop
139139
acc = 0
140140
# CHECK: scf.for
141-
# CHECK: call @accumulate
141+
# CHECK: call @{{.*}}accumulate
142142
for i in range(10):
143143
acc = accumulate(acc, i)
144+
145+
146+
@tl.core._aggregate
147+
class FunctionParent:
148+
149+
@triton.jit
150+
def function_with_name():
151+
pass
152+
153+
154+
@triton.jit
155+
def function_with_name():
156+
pass
157+
158+
159+
@filecheck_test
160+
@triton.jit
161+
def test_function_name_mangling():
162+
# CHECK-LABEL: test_function_name_mangling
163+
# CHECK: call @test_frontend.function_with_name
164+
# CHECK: call @test_frontend.FunctionParent.function_with_name
165+
function_with_name()
166+
FunctionParent.function_with_name()
167+
168+
169+
@tl.core._aggregate
170+
class AggregateWithConstexpr:
171+
a: tl.tensor
172+
b: tl.constexpr
173+
174+
def __init__(self, a, b):
175+
self.a = a
176+
self.b = b
177+
178+
@staticmethod
179+
def create(a):
180+
return AggregateWithConstexpr(a, tl.constexpr(42))
181+
182+
183+
@triton.jit
184+
def add_rhs_constexpr(agg):
185+
_ = agg.a + agg.b
186+
187+
188+
@filecheck_test
189+
@triton.jit
190+
def test_aggregate_with_constexpr():
191+
# CHECK-LABEL: test_aggregate_with_constexpr
192+
# CHECK: tt.call @"test_frontend.add_rhs_constexpr__test_frontend.AggregateWithConstexpr<i32S4S, constexpr[42]>
193+
agg = AggregateWithConstexpr.create(tl.arange(0, 4))
194+
add_rhs_constexpr(agg)
195+
196+
# CHECK: tt.func private @"test_frontend.add_rhs_constexpr__test_frontend.AggregateWithConstexpr<i32S4S, constexpr[42]>
197+
# CHECK: %cst = arith.constant dense<42> : tensor<4xi32>
198+
# CHECK: arith.addi %arg0, %cst : tensor<4xi32>
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import triton
2+
3+
4+
@triton.jit
5+
def function_with_name():
6+
pass

0 commit comments

Comments
 (0)