  // Cached py::handle references to the Python-side Gluon layout classes,
  // looked up once from the `layouts` module (see the assignments below).
  // They are stored as raw handles obtained via .release(), i.e. without
  // ref-count management here — NOTE(review): presumably leaked on purpose
  // so they stay valid for the process lifetime; confirm.
  py::handle BlockedLayout;
  py::handle SliceLayout;
  py::handle DistributedLinearLayout;
  // New: NVIDIA MMA distributed layout, mirrors ttg::NvidiaMmaEncodingAttr.
  py::handle NVMMADistributedLayout;
  py::handle NVMMASharedLayout;
  py::handle SwizzledSharedLayout;
    // Resolve each Python layout class from the `layouts` module and cache it.
    // .release() drops ownership from the temporary py::object so the cached
    // handle keeps the class object alive — TODO(review): confirm the
    // intentional-leak rationale.
    SliceLayout = py::object(layouts.attr("SliceLayout")).release();
    DistributedLinearLayout =
        py::object(layouts.attr("DistributedLinearLayout")).release();
    // New: fetch NVMMADistributedLayout alongside the existing classes.
    NVMMADistributedLayout =
        py::object(layouts.attr("NVMMADistributedLayout")).release();
    NVMMASharedLayout = py::object(layouts.attr("NVMMASharedLayout")).release();
    SwizzledSharedLayout =
        py::object(layouts.attr("SwizzledSharedLayout")).release();
        // (tail of the DistributedLinearLayout branch: bases per hardware
        // dimension, then the output dim sizes)
        ll.getBases().lookup(kReg), ll.getBases().lookup(kLane),
        ll.getBases().lookup(kWarp), ll.getBases().lookup(kBlock),
        toStdVector(ArrayRef(llvm::to_vector(ll.getOutDimSizes()))));
  } else if (auto mma = dyn_cast<ttg::NvidiaMmaEncodingAttr>(layout)) {
    // New: translate an MLIR NvidiaMmaEncodingAttr into the Python-side
    // NVMMADistributedLayout class, field by field.
    auto ctaLayout = mma.getCTALayout();
    return layouts.NVMMADistributedLayout(
        // version is passed as {major, minor}
        std::vector<unsigned>{mma.getVersionMajor(), mma.getVersionMinor()},
        toStdVector(mma.getWarpsPerCTA()),
        toStdVector(ctaLayout.getCTAsPerCGA()),
        toStdVector(ctaLayout.getCTASplitNum()),
        toStdVector(ctaLayout.getCTAOrder()), toStdVector(mma.getInstrShape()));
  } else if (auto nvmma = dyn_cast<ttg::NVMMASharedEncodingAttr>(layout)) {
    auto ctaLayout = nvmma.getCTALayout();
    return layouts.NVMMASharedLayout(
@@ -224,6 +235,20 @@ void init_gluon_ir(py::module &&m) {
224235 /* requiresSurjective=*/ true );
225236 return ttg::LinearEncodingAttr::get (ctx, ll);
226237 })
238+ .def (" get_mma_layout" ,
239+ [](GluonOpBuilder &self, std::vector<unsigned > &version,
240+ std::vector<unsigned > &warpsPerCta,
241+ std::vector<unsigned > &ctasPerCga,
242+ std::vector<unsigned > &ctaSplitNum,
243+ std::vector<unsigned > &ctaOrder,
244+ std::vector<unsigned > &instrShape) -> Attribute {
245+ auto ctx = self.getContext ();
246+ auto ctaLayout = self.getChecked <ttg::CTALayoutAttr>(
247+ ctx, ctasPerCga, ctaSplitNum, ctaOrder);
248+ return self.getChecked <ttg::NvidiaMmaEncodingAttr>(
249+ ctx, version[0 ], version[1 ], warpsPerCta, ctaLayout,
250+ instrShape);
251+ })
227252 .def (" get_nvmma_shared_layout" ,
228253 [](GluonOpBuilder &self, unsigned swizzleByteWidth,
229254 unsigned elementBitwidth, bool transposed, bool fp4Padded,
@@ -359,6 +384,14 @@ void init_gluon_ir(py::module &&m) {
359384 auto op = self.create <triton::SplitOp>(TypeRange{resTy, resTy}, a);
360385 return py::make_tuple (op->getResult (0 ), op->getResult (1 ));
361386 })
387+ .def (" create_warpgroup_mma" ,
388+ [](GluonOpBuilder &self, Value a, Value b, Value acc, Value useAcc,
389+ triton::InputPrecision precision = triton::InputPrecision::IEEE,
390+ int maxNumImpreciseAcc = 0 , bool isAsync = false ) -> Value {
391+ return self.create <ttng::WarpGroupDotOp>(
392+ a, b, acc, useAcc, precision, maxNumImpreciseAcc, isAsync);
393+ })
394+
362395 .def (" create_tmem_alloc" ,
363396 [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
364397 return self.create <ttng::TMEMAllocOp>(resultTy, value);
0 commit comments