intel
diff --git a/python/test/unit/language/test_matmul.py b/python/test/unit/language/test_matmul.py
Lines changed: 15 additions & 10 deletions
diff --git a/test/Conversion/tritongpu_to_llvm_hopper.mlir b/test/Conversion/tritongpu_to_llvm_hopper.mlir
Lines changed: 8 additions & 0 deletions
diff --git a/test/TritonGPU/amd/amd-block-pingpong.mlir b/test/TritonGPU/amd/amd-block-pingpong.mlir
Lines changed: 92 additions & 0 deletions
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp
Lines changed: 137 additions & 1 deletion
@@ -902,9 +902,6 @@ def test_mxfp8_mxfp4_matmul(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, B_TR
         if (A_DATA_TYPE == 'float4' and not WITH_A_SCALE) or (B_DATA_TYPE == 'float4' and not WITH_B_SCALE):
             pytest.skip("Float4 without scale is tested in test_block_scale_fp4")
 
-    if B_DATA_TYPE != 'float4' and B_TRANS:
-        pytest.xfail(f'No need to transpose B for {B_DATA_TYPE}')
-
     if is_xpu():
         pytest.skip("FIXME: failed to legalize operation 'tt.dot_scaled' on XPU")
 
@@ -913,13 +910,21 @@ def test_mxfp8_mxfp4_matmul(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, B_TR
 
     torch.manual_seed(42)
 
-    def create_operand(dtype: str, size0: int, size1: int, k_dim: int, transpose: bool = False):
+    def create_operand(dtype: str, size0: int, size1: int, k_dim: int, transpose: bool = True):
         if dtype == "float8e5":
-            v = torch.randint(20, 40, (size0, size1), dtype=torch.uint8).view(torch.float8_e5m2).to(device)
-            v_ref = f8_to_f16(v.view(torch.float8_e5m2), dtype).to(torch.float32)
+            if transpose:
+                v = torch.randint(20, 40, (size0, size1), dtype=torch.uint8).view(torch.float8_e5m2).to(device)
+                v_ref = f8_to_f16(v.view(torch.float8_e5m2), dtype).to(torch.float32)
+            else:
+                v = torch.randint(20, 40, (size1, size0), dtype=torch.uint8).view(torch.float8_e5m2).to(device).T
+                v_ref = f8_to_f16(v.view(torch.float8_e5m2).T, dtype).to(torch.float32).T
         elif dtype == "float8e4nv":
-            v = torch.randint(20, 40, (size0, size1), dtype=torch.uint8).view(torch.float8_e4m3fn).to(device)
-            v_ref = f8_to_f16(v.view(torch.float8_e4m3fn), dtype).to(torch.float32)
+            if transpose:
+                v = torch.randint(20, 40, (size0, size1), dtype=torch.uint8).view(torch.float8_e4m3fn).to(device)
+                v_ref = f8_to_f16(v.view(torch.float8_e4m3fn), dtype).to(torch.float32)
+            else:
+                v = torch.randint(20, 40, (size1, size0), dtype=torch.uint8).view(torch.float8_e4m3fn).to(device).T
+                v_ref = f8_to_f16(v.view(torch.float8_e4m3fn).T, dtype).to(torch.float32).T
         else:
             # float4
             if transpose:
@@ -937,8 +942,8 @@ def create_operand(dtype: str, size0: int, size1: int, k_dim: int, transpose: bo
     a, a_ref = create_operand(A_DATA_TYPE, M, K, 1)
     b, b_ref = create_operand(B_DATA_TYPE, K, N, 0, B_TRANS)
 
-    a_scale_mxfp4 = MXScaleTensor(size=(M, (K + 32 - 1) // 32), device=device).random(high=64.0)
-    b_scale_mxfp4 = MXScaleTensor(size=(N, (K + 32 - 1) // 32), device=device).random(high=64.0)
+    a_scale_mxfp4 = MXScaleTensor(size=(M, (K + 32 - 1) // 32), device=device).random(high=32.0)
+    b_scale_mxfp4 = MXScaleTensor(size=(N, (K + 32 - 1) // 32), device=device).random(high=32.0)
     a_scale = a_scale_mxfp4.data
     b_scale = b_scale_mxfp4.data
 
 
@@ -88,6 +88,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
       !ttg.memdesc<128x64xf16, #shared, #smem> * !ttg.memdesc<64x64xf16, #shared1, #smem> -> tensor<128x64xf32, #mma>
     tt.return
   }
+
+  // CHECK-LABEL: @wgmma_on_subtile
+  // CHECK: nvgpu.wgmma %{{.*}}, %{{.*}}
+  tt.func @wgmma_on_subtile(%a: tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, %b:  !ttg.memdesc<16x256xf16, #shared1, #smem, mutable, 3x64x256>){
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma>
+    %m = ttng.warp_group_dot %a, %b, %cst {inputPrecision = 0 : i32, isAsync = true} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<16x256xf16, #shared1, #smem, mutable, 3x64x256> -> tensor<128x256xf32, #mma>
+    tt.return
+  }
 }
 
 // -----
 
@@ -423,6 +423,98 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
   }
 }
 
+// -----
+
+//CHECK-LABEL: pingpong_small_prologue_load
+//CHECK: ttg.local_load
+//CHECK: rocdl.s.setprio 1
+//CHECK: tt.load
+//CHECK: rocdl.sched.barrier
+//CHECK: ttg.local_load
+//CHECK: rocdl.s.setprio 0
+//CHECK: tt.load
+//CHECK: rocdl.sched.barrier
+//CHECK: rocdl.s.setprio 1
+//CHECK: tt.dot
+//CHECK: rocdl.s.setprio 0
+
+#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 16], isTransposed = true}>
+#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0]}>
+#shared1 = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @pingpong_small_prologue_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma>
+    %c1_i32 = arith.constant 1 : i32
+    %cst_0 = arith.constant dense<64> : tensor<64x128xi32, #blocked>
+    %cst_1 = arith.constant dense<64> : tensor<128x64xi32, #blocked1>
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+    %c0_i32 = arith.constant 0 : i32
+    %c64_i32 = arith.constant 64 : i32
+    %0 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<128x1x!tt.ptr<f16>, #blocked1>
+    %1 = tt.get_program_id x : i32
+    %2 = tt.splat %1 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>>
+    %3 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>>
+    %4 = arith.addi %2, %3 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>>
+    %5 = tt.expand_dims %4 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1>
+    %6 = tt.splat %arg6 : i32 -> tensor<128x1xi32, #blocked1>
+    %7 = arith.muli %5, %6 : tensor<128x1xi32, #blocked1>
+    %8 = tt.addptr %0, %7 : tensor<128x1x!tt.ptr<f16>, #blocked1>, tensor<128x1xi32, #blocked1>
+    %9 = tt.broadcast %8 : tensor<128x1x!tt.ptr<f16>, #blocked1> -> tensor<128x64x!tt.ptr<f16>, #blocked1>
+    %10 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
+    %11 = tt.expand_dims %10 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1>
+    %12 = tt.broadcast %11 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1>
+    %13 = tt.addptr %9, %12 : tensor<128x64x!tt.ptr<f16>, #blocked1>, tensor<128x64xi32, #blocked1>
+    %14 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>, #blocked>
+    %15 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %16 = tt.expand_dims %15 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked>
+    %17 = tt.addptr %14, %16 : tensor<64x1x!tt.ptr<f16>, #blocked>, tensor<64x1xi32, #blocked>
+    %18 = tt.broadcast %17 : tensor<64x1x!tt.ptr<f16>, #blocked> -> tensor<64x128x!tt.ptr<f16>, #blocked>
+    %19 = tt.splat %arg7 : i32 -> tensor<64x128xi32, #blocked>
+    %20 = tt.addptr %18, %19 : tensor<64x128x!tt.ptr<f16>, #blocked>, tensor<64x128xi32, #blocked>
+    %21 = ttg.local_alloc  : () -> !ttg.memdesc<1x128x64xf16, #shared, #ttg.shared_memory, mutable>
+    %22 = ttg.local_alloc  : () -> !ttg.memdesc<1x64x128xf16, #shared1, #ttg.shared_memory, mutable>
+    %23 = ttg.memdesc_subview %21[%c0_i32, %c0_i32, %c0_i32] : !ttg.memdesc<1x128x64xf16, #shared, #ttg.shared_memory, mutable> -> !ttg.memdesc<128x64xf16, #shared, #ttg.shared_memory, mutable>
+    %24 = ttg.memdesc_subview %22[%c0_i32, %c0_i32, %c0_i32] : !ttg.memdesc<1x64x128xf16, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<64x128xf16, #shared1, #ttg.shared_memory, mutable>
+    %25:6 = scf.for %arg10 = %c0_i32 to %c64_i32 step %c1_i32 iter_args(%arg11 = %cst, %arg12 = %13, %arg13 = %20, %arg14 = %c0_i32, %arg15 = %23, %arg16 = %24) -> (tensor<128x128xf32, #mma>, tensor<128x64x!tt.ptr<f16>, #blocked1>, tensor<64x128x!tt.ptr<f16>, #blocked>, i32, !ttg.memdesc<128x64xf16, #shared, #ttg.shared_memory, mutable>, !ttg.memdesc<64x128xf16, #shared1, #ttg.shared_memory, mutable>)  : i32 {
+      %26 = arith.cmpi eq, %arg10, %c0_i32: i32
+      %27 = scf.if %26 -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> {
+        %28 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<128x1x!tt.ptr<f16>, #blocked1>
+        %29 = tt.broadcast %28 : tensor<128x1x!tt.ptr<f16>, #blocked1> -> tensor<128x64x!tt.ptr<f16>, #blocked1>
+        %30 = tt.load %29 : tensor<128x64x!tt.ptr<f16>, #blocked1>
+        %31 = ttg.local_alloc  : () -> !ttg.memdesc<1x128x64xf16, #shared, #ttg.shared_memory, mutable>
+        %32 = ttg.memdesc_subview %31[%c0_i32, %c0_i32, %c0_i32] : !ttg.memdesc<1x128x64xf16, #shared, #ttg.shared_memory, mutable> -> !ttg.memdesc<128x64xf16, #shared, #ttg.shared_memory, mutable>
+        ttg.local_store %30, %32 : tensor<128x64xf16, #blocked1> -> !ttg.memdesc<128x64xf16, #shared, #ttg.shared_memory, mutable>
+        %33 = ttg.local_load %32 : !ttg.memdesc<128x64xf16, #shared, #ttg.shared_memory, mutable> -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+        scf.yield %33 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+      } else {
+        scf.yield %cst_2 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+      }
+      %34 = tt.addptr %arg12, %cst_1 : tensor<128x64x!tt.ptr<f16>, #blocked1>, tensor<128x64xi32, #blocked1>
+      %35 = tt.load %34 : tensor<128x64x!tt.ptr<f16>, #blocked1>
+      %36 = tt.addptr %arg13, %cst_0 : tensor<64x128x!tt.ptr<f16>, #blocked>, tensor<64x128xi32, #blocked>
+      %37 = tt.load %36 : tensor<64x128x!tt.ptr<f16>, #blocked>
+      %38 = ttg.local_load %arg15 : !ttg.memdesc<128x64xf16, #shared, #ttg.shared_memory, mutable> -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+      %39 = arith.addf %38, %27: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+      %40 = ttg.local_load %arg16 : !ttg.memdesc<64x128xf16, #shared1, #ttg.shared_memory, mutable> -> tensor<64x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+      %41 = tt.dot %39, %40, %arg11 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<64x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x128xf32, #mma>
+      %42 = arith.addi %arg14, %c1_i32 : i32
+      %43 = arith.cmpi slt, %42, %c1_i32 : i32
+      %44 = arith.select %43, %42, %c0_i32 : i32
+      %45 = ttg.memdesc_subview %21[%44, %c0_i32, %c0_i32] : !ttg.memdesc<1x128x64xf16, #shared, #ttg.shared_memory, mutable> -> !ttg.memdesc<128x64xf16, #shared, #ttg.shared_memory, mutable>
+      ttg.local_store %35, %45 : tensor<128x64xf16, #blocked1> -> !ttg.memdesc<128x64xf16, #shared, #ttg.shared_memory, mutable>
+      %46 = ttg.memdesc_subview %22[%44, %c0_i32, %c0_i32] : !ttg.memdesc<1x64x128xf16, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<64x128xf16, #shared1, #ttg.shared_memory, mutable>
+      ttg.local_store %37, %46 : tensor<64x128xf16, #blocked> -> !ttg.memdesc<64x128xf16, #shared1, #ttg.shared_memory, mutable>
+      scf.yield %41, %34, %36, %44, %45, %46 : tensor<128x128xf32, #mma>, tensor<128x64x!tt.ptr<f16>, #blocked1>, tensor<64x128x!tt.ptr<f16>, #blocked>, i32, !ttg.memdesc<128x64xf16, #shared, #ttg.shared_memory, mutable>, !ttg.memdesc<64x128xf16, #shared1, #ttg.shared_memory, mutable>
+    }
+    ttg.local_dealloc %21 : !ttg.memdesc<1x128x64xf16, #shared, #ttg.shared_memory, mutable>
+    ttg.local_dealloc %22 : !ttg.memdesc<1x64x128xf16, #shared1, #ttg.shared_memory, mutable>
+    tt.return
+  }
+}
+
+
 // -----
 // CHECK-LABEL: pingpong_medium_dependency
 
 
@@ -79,6 +79,12 @@ class Pingponger {
   void appendSlicedLoadAB(int slice);
   void appendClusterBarrier(OpBuilder &builder, Location loc);
   void appendOpWithPrio(OpBuilder &builder, Operation *Op, Location loc);
+  void determineDotMemoryOps(tt::DotOp dotOp,
+                             DenseSet<tt::LoadOp> &dotGlobalLoads,
+                             DenseSet<ttg::LocalLoadOp> &dotLocalLoads,
+                             DenseSet<ttg::LocalStoreOp> &dotLocalStores);
+  template <typename T>
+  void findClosestPredOps(Value v, DenseSet<T> &matchingOps);
 };
 
 void Pingponger::updateOpInsertion(Operation *op) { lastInsertedOp = op; }
@@ -150,6 +156,89 @@ void Pingponger::appendOpWithPrio(OpBuilder &builder, Operation *op,
   appendOp(builder.create<ROCDL::SetPrioOp>(loc, lowPriority));
 }
 
+// Find all of the "closest" operations that are of a given type T
+// in the same basic block. Here "closest" means along any path P,
+// the first operation of type T that is encountered when traversing
+// P from the given value v. This also includes "later" operations
+// for block arguments. Note that we find the closest T on every path P.
+template <typename T>
+void Pingponger::findClosestPredOps(Value v, DenseSet<T> &matchingOps) {
+  // Create a cache so we can traverse across block arguments.
+  DenseSet<Operation *> visitedOps;
+  std::function<void(Value)> impl;
+  impl = [&matchingOps, &visitedOps, &impl](Value v) {
+    // If we encounter a block argument we only look at the terminators of the
+    // current block
+    if (auto blockArg = dyn_cast<BlockArgument>(v)) {
+      auto operandNumber = blockArg.getArgNumber();
+      auto block = blockArg.getOwner();
+      if (auto yield = dyn_cast<scf::YieldOp>(block->getTerminator())) {
+        auto parentOp = block->getParentOp();
+        // Skip the induction variables to find the yield position
+        if (auto forOp = dyn_cast<scf::ForOp>(parentOp)) {
+          if (operandNumber < forOp.getNumInductionVars())
+            return;
+          operandNumber -= forOp.getNumInductionVars();
+        }
+        impl(yield->getOperand(operandNumber));
+      }
+    } else {
+      auto definingOp = v.getDefiningOp();
+      if (!definingOp)
+        return;
+      else if (visitedOps.contains(definingOp))
+        return;
+      visitedOps.insert(definingOp);
+      if (auto matchOp = dyn_cast<T>(definingOp))
+        matchingOps.insert(matchOp);
+      else
+        for (auto predValue : definingOp->getOperands())
+          impl(predValue);
+    }
+  };
+  impl(v);
+}
+
+// Populate the dotGlobalLoads, dotLocalLoads, and dotLocalStores set with
+// any loads that are generated by the current dot product. This occurs in
+// steps to:
+// 1. Determine which loads are generated by the dot product via getA()
+//    and getB().
+// 2. Determine which local stores are used to populate the inputs to
+//    the local loads.
+// 3. Determine which global loads are used to populate the inputs to
+//    the local stores.
+// Note: This function currently depends on num_stages=2, which is a
+// precondition for the pingpong scheduling.
+void Pingponger::determineDotMemoryOps(
+    tt::DotOp dotOp, DenseSet<tt::LoadOp> &dotGlobalLoads,
+    DenseSet<ttg::LocalLoadOp> &dotLocalLoads,
+    DenseSet<ttg::LocalStoreOp> &dotLocalStores) {
+  // Find the local loads used to compute the dot inputs. These
+  // must come before the dot op.
+  findClosestPredOps<ttg::LocalLoadOp>(dotOp.getA(), dotLocalLoads);
+  findClosestPredOps<ttg::LocalLoadOp>(dotOp.getB(), dotLocalLoads);
+
+  // Determine the local stores from the local loads.
+  // With pipelining we expect this to be a single local
+  // store within the loop based on a block argument after routing through
+  // a ttg.MemDescSubviewOp.
+  DenseSet<ttg::MemDescSubviewOp> subviews;
+  for (auto &&localLoad : dotLocalLoads)
+    findClosestPredOps<ttg::MemDescSubviewOp>(localLoad.getSrc(), subviews);
+
+  for (auto &&subview : subviews)
+    for (auto &&user : subview->getUsers())
+      if (auto localStore = dyn_cast<ttg::LocalStoreOp>(user))
+        dotLocalStores.insert(localStore);
+
+  // Determine the global loads from the local stores.
+  // We expect this to just be a global load
+  // within the loop.
+  for (auto &&localStore : dotLocalStores)
+    findClosestPredOps<tt::LoadOp>(localStore.getSrc(), dotGlobalLoads);
+}
+
 // Transform a loop into one Dot - Memory (ping - pong) clusters
 // Each cluster, especially the Dot cluster is guarded with setprio(1->0) so
 // each warp can complete the execution of the cluster without being
@@ -473,6 +562,46 @@ void Pingponger::getDotPingponged() {
     LDBG(message.str());
     return;
   }
+
+  // The existing code assumes that the targeted loads are safe to move,
+  // which does not hold when the loop is not a plain GEMM. As a result, we
+  // filter the load operations down to only those that are associated
+  // with the GEMM.
+  DenseSet<tt::LoadOp> dotGlobalLoads;
+  DenseSet<ttg::LocalLoadOp> dotLocalLoads;
+  DenseSet<ttg::LocalStoreOp> dotLocalStores;
+  determineDotMemoryOps(dotOps[0], dotGlobalLoads, dotLocalLoads,
+                        dotLocalStores);
+
+  auto origGlobalLoadCount = gLoadOps.size();
+  auto origLocalLoadCount = lLoadOps.size();
+  // Prune Memory operations that may be moved to only those involved in dot
+  // computation.
+  auto gLoadIt = llvm::remove_if(gLoadOps, [&dotGlobalLoads](tt::LoadOp op) {
+    return !dotGlobalLoads.contains(op);
+  });
+  gLoadOps.erase(gLoadIt, gLoadOps.end());
+  auto lLoadIt =
+      llvm::remove_if(lLoadOps, [&dotLocalLoads](ttg::LocalLoadOp op) {
+        return !dotLocalLoads.contains(op);
+      });
+  lLoadOps.erase(lLoadIt, lLoadOps.end());
+  auto lStoreIt =
+      llvm::remove_if(lStoreOps, [&dotLocalStores](ttg::LocalStoreOp op) {
+        return !dotLocalStores.contains(op);
+      });
+  lStoreOps.erase(lStoreIt, lStoreOps.end());
+  // All PingPong schedulers assume there are 2 movable global loads and 2
+  // movable local loads.
+  if (gLoadOps.size() != 2 || lLoadOps.size() != 2) {
+    std::stringstream message;
+    message << "Unable to match ping pong slicing pattern. Details: "
+            << gLoadOps.size() << " global loads in dot computation, "
+            << lLoadOps.size() << " local loads in dot computation";
+    LDBG(message.str());
+    return;
+  }
+
   // Pingpong scheduling tries to form two different types of the instruction
   // clusters, i.e., Dot clusters and Memory clusters. While each SIMD has
   // two concurrent warps, both warps can execute a different type of
@@ -532,14 +661,21 @@ void Pingponger::getDotPingponged() {
     // numWarps=4 doesn't need asymmetric sync, return.
     return;
   } else if (numWarps == 8) { // Pingpong between warps from the same block
-    if (gLoadOps.size() != 2 || lLoadOps.size() != 2) {
+    if (origGlobalLoadCount != 2 || origLocalLoadCount != 2) {
       std::stringstream message;
       message << "Unable to match ping pong slicing pattern. Details: "
               << gLoadOps.size() << " global loads, " << lLoadOps.size()
               << " local loads";
       LDBG(message.str());
       return;
     }
+    if (lStoreOps.size() != 2) {
+      std::stringstream message;
+      message << "Unable to match ping pong slicing pattern. Details: "
+              << lStoreOps.size() << " local stores in dot computation ";
+      LDBG(message.str());
+      return;
+    }
     // Transform a loop where the tile size requires dots to be sliced
     if (tileSize == mediumTile) {
       if (transformTwoPPClusters(builder, dotOps[0]->getLoc()).failed()) {
Original file line number	Diff line number	Diff line change
`@@ -88,6 +88,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {`
`88`	`88`	`!ttg.memdesc<128x64xf16, #shared, #smem> * !ttg.memdesc<64x64xf16, #shared1, #smem> -> tensor<128x64xf32, #mma>`
`89`	`89`	`tt.return`
`90`	`90`	`}`
	`91`	`+`
	`92`	`+ // CHECK-LABEL: @wgmma_on_subtile`
	`93`	`+ // CHECK: nvgpu.wgmma %{{.}}, %{{.}}`
	`94`	`+ tt.func @wgmma_on_subtile(%a: tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, %b: !ttg.memdesc<16x256xf16, #shared1, #smem, mutable, 3x64x256>){`
	`95`	`+ %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma>`
	`96`	`+ %m = ttng.warp_group_dot %a, %b, %cst {inputPrecision = 0 : i32, isAsync = true} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<16x256xf16, #shared1, #smem, mutable, 3x64x256> -> tensor<128x256xf32, #mma>`
	`97`	`+ tt.return`
	`98`	`+ }`
`91`	`99`	`}`
`92`	`100`
`93`	`101`	`// -----`