Commit 4a8277b

[PIPELINER] Support pipelining scalar loads (#7498)
Until now, scalar loads were not pipelined, even though they can still cause latency problems. Extend the pipeliner to support scalar loads; to keep things simple, we convert them into tensor<1> loads during loop lowering. This also introduces a new `unsplat` op to make the conversion from tensor back to scalar straightforward.
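
For illustration, a minimal sketch of the rewrite performed during loop lowering (the SSA names and the #blocked layout are illustrative, not taken from the commit): the scalar pointer is splatted to a single-element tensor, the load is done on that tensor, and the new `unsplat` op recovers the scalar for the existing users.

// before: a pipelined scalar load inside the loop
%x = tt.load %ptr : !tt.ptr<i32>

// after: tensor<1> load plus unsplat (layout shown inline for brevity)
#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
%ptr_t = tt.splat %ptr : !tt.ptr<i32> -> tensor<1x!tt.ptr<i32>, #blocked>
%x_t = tt.load %ptr_t : tensor<1x!tt.ptr<i32>, #blocked>
%x = tt.unsplat %x_t : tensor<1xi32, #blocked>

In this form the load can go through the pipeliner's usual shared-memory buffering and async-copy path, while downstream users keep operating on the original scalar.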
1 parent be6ef7a commit 4a8277b

9 files changed (+170 -67 lines)
include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 10 additions & 0 deletions
@@ -432,6 +432,16 @@ def TT_SplatOp : TT_Op<"splat", [Pure,
   let hasFolder = 1;
 }
 
+def TT_UnsplatOp : TT_Op<"unsplat", [Pure,
+                         DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+  let summary = "convert a tensor with a single element to a scalar";
+  let arguments = (ins TT_Tensor:$src);
+  let results = (outs TT_Type:$result);
+
+  let assemblyFormat = "$src attr-dict `:` type($src)";
+  let hasVerifier = 1;
+}
+
 def TT_ExpandDimsOp : TT_Op<"expand_dims", [Pure,
                             DeclareOpInterfaceMethods<InferTypeOpInterface>,
                             SameOperandsAndResultElementType]> {
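
As a usage sketch (value names illustrative), the op accepts any tensor with exactly one element and yields its scalar value; the result type is inferred from the source element type:

%s = tt.unsplat %t : tensor<1xf32>      // %s : f32
%r = tt.unsplat %u : tensor<1x1xi32>    // %r : i32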

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 13 additions & 0 deletions
@@ -60,6 +60,18 @@ struct SplatOpConversion : public ConvertOpToLLVMPattern<triton::SplatOp> {
     return success();
   }
 };
+
+struct UnsplatOpConversion : public ConvertOpToLLVMPattern<triton::UnsplatOp> {
+  using ConvertOpToLLVMPattern<triton::UnsplatOp>::ConvertOpToLLVMPattern;
+  LogicalResult matchAndRewrite(triton::UnsplatOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const {
+    auto loc = op->getLoc();
+    auto scrVals = unpackLLElements(loc, adaptor.getSrc(), rewriter);
+    rewriter.replaceOp(op, scrVals[0]);
+    return success();
+  }
+};
+
 // This pattern helps to convert arith::ConstantOp(with SplatElementsAttr),
 // the logic is the same as triton::SplatOp, so the underlying implementation
 // is reused.
@@ -550,6 +562,7 @@ void mlir::triton::populateViewOpToLLVMPatterns(
   patterns.add<ReshapeOpConversion>(typeConverter, benefit);
   patterns.add<ExpandDimsOpConversion>(typeConverter, benefit);
   patterns.add<SplatOpConversion>(typeConverter, benefit);
+  patterns.add<UnsplatOpConversion>(typeConverter, benefit);
   patterns.add<ArithConstantSplatOpConversion>(typeConverter, benefit);
   patterns.add<ArithConstantArrayOpConversion>(typeConverter, benefit);
   patterns.add<CatOpConversion>(typeConverter, benefit);

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 18 additions & 0 deletions
@@ -606,6 +606,24 @@ OpFoldResult SplatOp::fold(FoldAdaptor adaptor) {
   return ret;
 }
 
+//-- UnsplatOp --
+LogicalResult UnsplatOp::verify() {
+  auto srcShape = getSrc().getType().getShape();
+  if (product(srcShape) != 1) {
+    return emitError("source tensor must have exactly one element");
+  }
+  return success();
+}
+
+LogicalResult UnsplatOp::inferReturnTypes(
+    MLIRContext *context, std::optional<Location> location, ValueRange operands,
+    DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions,
+    SmallVectorImpl<Type> &inferredReturnTypes) {
+  auto dstTy = cast<RankedTensorType>(operands[0].getType()).getElementType();
+  inferredReturnTypes.push_back(dstTy);
+  return success();
+}
+
 //-- ExpandDimsOp --
 LogicalResult ExpandDimsOp::inferReturnTypes(
     MLIRContext *context, std::optional<Location> loc, ValueRange operands,

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp

Lines changed: 74 additions & 12 deletions
@@ -334,6 +334,49 @@ struct LoadGroupInfo {
   bool hasTMALoad = false;
 };
 
+// Convert a scalar load to a load of a tensor of shape <1>.
+void convertScalarToTensorLoad(Operation *op, CoarseSchedule &schedule,
+                               scf::ForOp forOp) {
+  auto scalarLoad = cast<tt::LoadOp>(op);
+  Type scalarTy = scalarLoad.getType();
+  OpBuilderForStage builder(op->getLoc(), op, schedule);
+  builder.setInsertionPoint(op);
+  MLIRContext *ctx = op->getContext();
+  auto nWarps = lookupNumWarps(op);
+  ModuleOp mod = forOp->getParentOfType<ModuleOp>();
+  auto threadsPerWarp = TritonGPUDialect::getThreadsPerWarp(mod);
+  auto numCTAs = TritonGPUDialect::getNumCTAs(mod);
+  auto blockedEnc =
+      getDefaultBlockedEncoding(ctx, {1}, nWarps, threadsPerWarp, numCTAs);
+  auto newPtrTy =
+      RankedTensorType::get({1}, scalarLoad.getPtr().getType(), blockedEnc);
+  auto newPtr =
+      builder.create<tt::SplatOp>(op->getLoc(), newPtrTy, scalarLoad.getPtr());
+  scalarLoad.getPtrMutable().assign(newPtr);
+  if (scalarLoad.getMask()) {
+    auto newMaskTy =
+        RankedTensorType::get({1}, scalarLoad.getMask().getType(), blockedEnc);
+    auto newMask = builder.create<tt::SplatOp>(op->getLoc(), newMaskTy,
+                                               scalarLoad.getMask());
+    scalarLoad.getMaskMutable().assign(newMask);
+  }
+  if (scalarLoad.getOther()) {
+    auto newOtherTy =
+        RankedTensorType::get({1}, scalarLoad.getOther().getType(), blockedEnc);
+    auto newOther = builder.create<tt::SplatOp>(op->getLoc(), newOtherTy,
+                                                scalarLoad.getOther());
+    scalarLoad.getOtherMutable().assign(newOther);
+  }
+  auto newDstTy = RankedTensorType::get({1}, scalarLoad.getType(), blockedEnc);
+  scalarLoad.getResult().setType(newDstTy);
+  builder.setInsertionPointAfter(op);
+  Operation *firstUse = getFirstUseOfPipelinedOp({op}, forOp, schedule);
+  builder.setStageCluster(schedule[firstUse]);
+  Operation *unsplat = builder.create<tt::UnsplatOp>(op->getLoc(), scalarTy,
+                                                     scalarLoad.getResult());
+  scalarLoad.getResult().replaceAllUsesExcept(unsplat->getResult(0), unsplat);
+}
+
 void createTMABarrierAndWait(
     scf::ForOp forOp, llvm::MapVector<Operation *, AsyncLoad> &asyncLoads,
     const llvm::MapVector<int, LoadGroupInfo> &loadGroups,
@@ -446,25 +489,39 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule,
                       triton::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
   llvm::MapVector<Operation *, AsyncLoad> asyncLoads;
   llvm::MapVector<int, LoadGroupInfo> loadGroups;
+  llvm::SmallVector<Operation *> scalarLoads;
   // Only visit the top level ops, we do not support pipelining conditional
   // loads for now
   for (auto &op : forOp.getBody()->without_terminator()) {
     if (isa<tt::LoadOp, tt::DescriptorLoadOp, tt::DescriptorGatherOp>(op)) {
       int stageDiff = getDefUseStageDiff(&op, forOp, schedule);
-      if (stageDiff == 0 || !isa<RankedTensorType>(op.getResultTypes()[0])) {
-        // Don't care about non-pipelined loads. Don't use async loads for
-        // scalar values.
+      if (stageDiff == 0) {
+        // Don't care about non-pipelined loads. Scalar loads will be converted
+        // to tensor loads if they are pipelined.
         continue;
       }
-      SharedEncodingTrait sharedEncoding = getSharedEncoding(&op);
-      // Do not create async loads for small loads (cp.async requires at least 4
-      // bytes)
-      bool canUseAsyncCp =
-          isa<tt::LoadOp>(op) &&
-          canBeConvertedToAsyncLoad(cast<tt::LoadOp>(op), axisInfoAnalysis);
-      int copyVecBytes = getCopyVecBytes(
-          cast<RankedTensorType>(op.getResultTypes()[0]), sharedEncoding);
-      canUseAsyncCp &= copyVecBytes >= 4;
+      SharedEncodingTrait sharedEncoding;
+      bool canUseAsyncCp = false;
+      if (!isa<RankedTensorType>(op.getResultTypes()[0])) {
+        canUseAsyncCp = op.getResultTypes()[0].getIntOrFloatBitWidth() >= 32;
+        sharedEncoding = ttg::SwizzledSharedEncodingAttr::get(
+            forOp.getContext(), 1, 1, 1, {0},
+            ttg::CTALayoutAttr::get(forOp.getContext(), {1}, {1}, {0}));
+        if (canUseAsyncCp) {
+          scalarLoads.push_back(&op);
+        }
+      } else {
+        sharedEncoding = getSharedEncoding(&op);
+        // Do not create async loads for small loads (cp.async requires at least
+        // 4 bytes)
+        canUseAsyncCp =
+            isa<tt::LoadOp>(op) &&
+            canBeConvertedToAsyncLoad(cast<tt::LoadOp>(op), axisInfoAnalysis);
+        int copyVecBytes = getCopyVecBytes(
+            cast<RankedTensorType>(op.getResultTypes()[0]), sharedEncoding);
+
+        canUseAsyncCp &= copyVecBytes >= 4;
+      }
       if (canUseAsyncCp || isTMALoad(&op)) {
         if (loadRequiresAdditionalBuffer(&op)) {
           // Allocate additional buffer required by the wgmma pipelining.
@@ -486,6 +543,11 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule,
     }
   }
 
+  // Convert scalar loads to be able to use async copy.
+  for (auto op : scalarLoads) {
+    convertScalarToTensorLoad(op, schedule, forOp);
+  }
+
   if (asyncLoads.empty())
     return forOp;

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 9 additions & 4 deletions
@@ -330,10 +330,15 @@ bool mlir::triton::canBeConvertedToAsyncLoad(
     vec = std::min<unsigned>(vec, axisInfoAnalysis.getMaskAlignment(mask));
 
   auto tensorTy = dyn_cast<RankedTensorType>(ptr.getType());
-  if (!tensorTy)
-    return false;
-  auto ty = cast<tt::PointerType>(tensorTy.getElementType()).getPointeeType();
-  unsigned width = vec * ty.getIntOrFloatBitWidth();
+  unsigned width = 0;
+  if (tensorTy) {
+    auto ty = cast<tt::PointerType>(tensorTy.getElementType()).getPointeeType();
+    width = vec * ty.getIntOrFloatBitWidth();
+  } else {
+    width = cast<tt::PointerType>(ptr.getType())
+                .getPointeeType()
+                .getIntOrFloatBitWidth();
+  }
 
   // We do not pipeline all loads for the following reasons:
   // 1. On nvidia GPUs, cp.async's cp-size can only be 4, 8, or 16.

test/Triton/invalid.mlir

Lines changed: 8 additions & 0 deletions
@@ -538,3 +538,11 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32}
   tt.return %result : tensor<128x128xf32, #blocked>
 }
 }
+
+// -----
+
+tt.func @unsplat_invalid(%arg0: tensor<128xf32>) {
+  // expected-error @below {{source tensor must have exactly one element}}
+  %0 = tt.unsplat %arg0 : tensor<128xf32>
+  tt.return
+}

test/Triton/ops.mlir

Lines changed: 7 additions & 0 deletions
@@ -278,3 +278,10 @@ tt.func @tma_scatter(%arg0: !tt.tensordesc<tensor<1x128xbf16>>, %arg1: tensor<32
   tt.descriptor_scatter %arg0[%arg1, %arg2], %arg3 : !tt.tensordesc<tensor<1x128xbf16>>, tensor<32xi32>, i32, tensor<32x128xbf16>
   tt.return
 }
+
+// CHECK-LABEL: @unsplat
+tt.func @unsplat(%arg0: tensor<1x1xf32>) -> f32 {
+  // CHECK-NEXT: tt.unsplat %{{.+}} : tensor<1x1xf32>
+  %0 = tt.unsplat %arg0 : tensor<1x1xf32>
+  tt.return %0 : f32
+}

test/TritonGPU/loop-pipeline.mlir

Lines changed: 10 additions & 51 deletions
@@ -393,10 +393,12 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
 // CHECK: ttg.async_copy_global_to_local
 // CHECK: ttg.async_commit_group
 // CHECK: scf.for
-// CHECK: ttg.async_wait {{.*}} {num = 2 : i32}
+// CHECK: ttg.async_wait {{.*}} {num = 1 : i32}
 // CHECK: %[[NEXT_BUFFER_1:.*]] = tt.addptr %{{.*}}, {{.*}}
 // CHECK: ttg.async_copy_global_to_local %[[NEXT_BUFFER_1]]
-// CHECK: %[[IND_BUFFER_0:.*]] = tt.load %{{.*}}, {{.*}}
+// CHECK: ttg.async_wait {{.*}} {num = 1 : i32}
+// CHECK: %[[IND_BUFFER_0_T:.*]] = ttg.local_load
+// CHECK: %[[IND_BUFFER_0:.*]] = tt.unsplat %[[IND_BUFFER_0_T]] : tensor<1xi64
 // CHECK: %[[IND_BUFFER_1:.*]] = arith.muli {{.*}}, %[[IND_BUFFER_0]]
 // CHECK: %[[IND_BUFFER_2:.*]] = tt.splat %[[IND_BUFFER_1]]
 // CHECK: %[[NEXT_BUFFER_0:.*]] = tt.addptr {{.*}}, %[[IND_BUFFER_2]]
@@ -406,9 +408,9 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
 // AMD: %[[LOCAL_ALLOC_0:.*]] = ttg.local_alloc
 // AMD: %[[LOCAL_ALLOC_1:.*]] = ttg.local_alloc
 // AMD: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}}
+// AMD: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]] {amd.pipeliner_part = "prologue"}
 // AMD: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]]
 // AMD: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] {amd.pipeliner_part = "prologue"}
-// AMD: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]]
 // AMD: %[[MULI_6:.*]] = arith.muli %{{.*}}, %[[LOAD_5]]
 // AMD: %[[SPLAT_7:.*]] = tt.splat %[[MULI_6]]
 // AMD: %[[ADDPTR_8:.*]] = tt.addptr %{{.*}}, %[[SPLAT_7]]
@@ -418,29 +420,14 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
 // AMD: ttg.local_store %[[LOAD_4]], %[[MEMDESC_SUBVIEW_11]]
 // AMD: %[[MEMDESC_SUBVIEW_12:.*]] = ttg.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}]
 // AMD: ttg.local_store %[[LOAD_10]], %[[MEMDESC_SUBVIEW_12]]
-// AMD: %[[CMPI_13:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}}
-// AMD: %[[ADDPTR_14:.*]] = tt.addptr %{{.*}}, %{{.*}}
-// AMD: %[[ADDPTR_15:.*]] = tt.addptr %{{.*}}, %{{.*}}
-// AMD: %[[SPLAT_16:.*]] = tt.splat %[[CMPI_13]]
-// AMD: %[[LOAD_17:.*]] = tt.load %[[ADDPTR_14]], %[[SPLAT_16]] {amd.pipeliner_part = "prologue"}
-// AMD: %[[LOAD_18:.*]] = tt.load %[[ADDPTR_15]], %[[CMPI_13]]
-// AMD: %[[MULI_19:.*]] = arith.muli %{{.*}}, %[[LOAD_18]]
-// AMD: %[[SPLAT_20:.*]] = tt.splat %[[MULI_19]]
-// AMD: %[[ADDPTR_21:.*]] = tt.addptr %{{.*}}, %[[SPLAT_20]]
-// AMD: %[[SPLAT_22:.*]] = tt.splat %[[CMPI_13]]
-// AMD: %[[LOAD_23:.*]] = tt.load %[[ADDPTR_21]], %[[SPLAT_22]] {amd.pipeliner_part = "prologue"}
-// AMD: %[[MEMDESC_SUBVIEW_24:.*]] = ttg.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}]
-// AMD: ttg.local_store %[[LOAD_17]], %[[MEMDESC_SUBVIEW_24]]
-// AMD: %[[MEMDESC_SUBVIEW_25:.*]] = ttg.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}]
-// AMD: ttg.local_store %[[LOAD_23]], %[[MEMDESC_SUBVIEW_25]]
 // AMD: %[[SUBI_26:.*]] = arith.subi %{{.*}}, %{{.*}}
-// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %[[SUBI_26]] step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_14]], %[[ARG9:.*]] = %[[ADDPTR_15]], %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_11]], %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_24]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_12]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_25]])
+// AMD: %{{.*}}:7 = scf.for %[[ARG6:.*]] = %{{.*}} to %[[SUBI_26]] step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_11]], %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_12]])
 // AMD: %[[ADDPTR_38:.*]] = tt.addptr %[[ARG8]], %{{.*}}
 // AMD: %[[ADDPTR_39:.*]] = tt.addptr %[[ARG9]], %{{.*}}
 // AMD: %[[LOAD_40:.*]] = tt.load %[[ADDPTR_38]]
 // AMD: %[[LOCAL_LOAD_41:.*]] = ttg.local_load %[[ARG11]]
 // AMD: %[[LOAD_42:.*]] = tt.load %[[ADDPTR_39]]
-// AMD: %[[MULI_43:.*]] = arith.muli %{{.*}}, %[[LOAD_42]]
+// AMD: %[[MULI_43:.*]] = arith.muli %{{.*}}, %[[ARG12]]
 // AMD: %[[SPLAT_44:.*]] = tt.splat %[[MULI_43]]
 // AMD: %[[ADDPTR_45:.*]] = tt.addptr %{{.*}}, %[[SPLAT_44]]
 // AMD: %[[LOAD_46:.*]] = tt.load %[[ADDPTR_45]]
@@ -453,7 +440,7 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
 // AMD: ttg.local_store %[[LOAD_40]], %[[MEMDESC_SUBVIEW_52]]
 // AMD: %[[MEMDESC_SUBVIEW_53:.*]] = ttg.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_51]], %{{.*}}, %{{.*}}]
 // AMD: ttg.local_store %[[LOAD_46]], %[[MEMDESC_SUBVIEW_53]]
-// AMD: scf.yield %[[DOT_48]], %[[ADDPTR_38]], %[[ADDPTR_39]], %[[SELECT_51]], %[[ARG12]], %[[MEMDESC_SUBVIEW_52]], %[[ARG14]], %[[MEMDESC_SUBVIEW_53]]
+// AMD: scf.yield %[[DOT_48]], %[[ADDPTR_38]], %[[ADDPTR_39]], %[[SELECT_51]], %[[MEMDESC_SUBVIEW_52]], %[[LOAD_42]], %[[MEMDESC_SUBVIEW_53]]
 // AMD: } {tt.num_stages = 3
 // AMD: %[[CMPI_28:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
 // AMD: %[[CMPI_29:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
@@ -466,8 +453,8 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
 // AMD: scf.yield %{{.*}}#0
 // AMD: }
 // AMD: %[[SELECT_33:.*]] = arith.select %[[CMPI_28]], %[[IF_32]], %{{.*}}#0
-// AMD: %[[LOCAL_LOAD_34:.*]] = ttg.local_load %{{.*}}#5
-// AMD: %[[LOCAL_LOAD_35:.*]] = ttg.local_load %{{.*}}#7
+// AMD: %[[LOCAL_LOAD_34:.*]] = ttg.local_load %{{.*}}
+// AMD: %[[LOCAL_LOAD_35:.*]] = ttg.local_load %{{.*}}
 // AMD: %[[IF_36:.*]] = scf.if %[[CMPI_29]]
 // AMD: %[[DOT_38:.*]] = tt.dot %[[LOCAL_LOAD_34]], %[[LOCAL_LOAD_35]], %[[SELECT_33]]
 // AMD: scf.yield %[[DOT_38]]
@@ -477,34 +464,6 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
 // AMD: %[[SELECT_37:.*]] = arith.select %[[CMPI_29]], %[[IF_36]], %[[SELECT_33]]
 // AMD: ttg.local_dealloc %[[LOCAL_ALLOC_0]]
 // AMD: ttg.local_dealloc %[[LOCAL_ALLOC_1]]
-
-// AMD_PREFETCH-LABEL: tt.func @indirect_bmm_scalar
-// AMD_PREFETCH: ttg.local_alloc
-// AMD_PREFETCH: ttg.local_alloc
-// AMD_PREFETCH: tt.load
-// AMD_PREFETCH: tt.load
-// AMD_PREFETCH: tt.load
-// AMD_PREFETCH: ttg.local_store
-// AMD_PREFETCH: ttg.local_store
-// AMD_PREFETCH: tt.load
-// AMD_PREFETCH: ttg.local_load
-// AMD_PREFETCH: tt.load
-// AMD_PREFETCH: tt.load
-// AMD_PREFETCH: ttg.local_load
-// AMD_PREFETCH: scf.for
-// AMD_PREFETCH: ttg.local_store
-// AMD_PREFETCH: ttg.local_store
-// AMD_PREFETCH: tt.dot
-// AMD_PREFETCH: tt.load
-// AMD_PREFETCH: ttg.local_load
-// AMD_PREFETCH: tt.load
-// AMD_PREFETCH: tt.load
-// AMD_PREFETCH: ttg.local_load
-// AMD_PREFETCH: scf.yield
-// AMD_PREFETCH: tt.dot
-// AMD_PREFETCH: tt.dot
-// AMD_PREFETCH: tt.return
-
 tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32},
                              %76: index,
                              %49: tensor<16x16x!tt.ptr<f16>, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32},

test/TritonGPU/pipeline-lower-loop.mlir

Lines changed: 21 additions & 0 deletions
@@ -1634,3 +1634,24 @@ tt.func @load_cant_use_async_cp(%lb : index, %ub : index, %step : index,
   tt.return
 }
 }
+
+// -----
+
+module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32} {
+// CHECK-LABEL: @scalar_load
+tt.func @scalar_load(%lb : index, %ub : index, %step : index,
+                     %a_ptr_init : !tt.ptr<i32>) -> () {
+  scf.for %iv = %lb to %ub step %step : index {
+    // CHECK: %[[PTR:.+]] = tt.splat %{{.*}} {loop.cluster = 0 : i32, loop.stage = 0 : i32} : !tt.ptr<i32>
+    // CHECK: %[[CP:.+]] = ttg.async_copy_global_to_local %[[PTR]], %{{.+}} {loop.cluster = 0 : i32, loop.stage = 0 : i32}
+    // CHECK: %[[T0:.+]] = ttg.async_commit_group %[[CP]] {loop.cluster = 0 : i32, loop.stage = 0 : i32}
+    // CHECK: %[[T1:.+]] = ttg.async_wait %[[T0]] {loop.cluster = 1 : i32, loop.stage = 3 : i32, num = 0 : i32}
+    // CHECK: %[[L:.+]] = ttg.local_load %{{.+}} token %[[T1]] {loop.cluster = 1 : i32, loop.stage = 3 : i32}
+    // CHECK: %[[R:.+]] = tt.unsplat %[[L]] {loop.cluster = 1 : i32, loop.stage = 3 : i32}
+    // CHECK: "use"(%[[R]]) {loop.cluster = 1 : i32, loop.stage = 3 : i32} : (i32) -> ()
+    %a = tt.load %a_ptr_init {loop.cluster = 1 : i32, loop.stage = 0 : i32} : !tt.ptr<i32>
+    "use"(%a) {loop.cluster = 2 : i32, loop.stage = 3 : i32} : (i32) -> ()
+  } {tt.scheduled_max_stage = 3 : i32}
+  tt.return
+}
+}
