intel
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp
Lines changed: 7 additions & 0 deletions b/lib/Dialect/TritonGPU/IR/Dialect.cpp
Lines changed: 7 additions & 0 deletions
diff --git a/python/src/passes.h
Lines changed: 5 additions & 0 deletions b/python/src/passes.h
Lines changed: 5 additions & 0 deletions
diff --git a/test/TritonGPU/amd/amd-block-pingpong.mlir
Lines changed: 138 additions & 0 deletions b/test/TritonGPU/amd/amd-block-pingpong.mlir
Lines changed: 138 additions & 0 deletions
diff --git a/test/TritonGPU/amd/mfma-double-rate.mlir
Lines changed: 19 additions & 0 deletions b/test/TritonGPU/amd/mfma-double-rate.mlir
Lines changed: 19 additions & 0 deletions
diff --git a/third_party/amd/backend/compiler.py
Lines changed: 4 additions & 3 deletions b/third_party/amd/backend/compiler.py
Lines changed: 4 additions & 3 deletions
diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
Lines changed: 3 additions & 0 deletions b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
Lines changed: 3 additions & 0 deletions
@@ -1996,6 +1996,13 @@ SwizzledSharedEncodingAttr AMDMfmaEncodingAttr::composeSharedLayoutForOperand(
     ArrayRef<unsigned> sharedOrder, unsigned vectorSize, unsigned elemBitWidth,
     bool needTrans) const {
   int kDimIndex = operandIdx == 0 ? 1 : 0;
+
+  // Disable swizzling for scales
+  if (operandIdx >= 2) {
+    return SwizzledSharedEncodingAttr::get(getContext(), 1, 1, 1, sharedOrder,
+                                           ctaLayout);
+  }
+
   if (needTrans)
     kDimIndex = 1 - kDimIndex;
 
 
@@ -36,3 +36,8 @@
 #define ADD_PASS_OPTION_WRAPPER_4(name, builder, ty0, ty1, ty2, ty3)           \
   m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2,          \
                  ty3 val3) { pm.addPass(builder({val0, val1, val2, val3})); })
+
+#define ADD_PASS_OPTION_WRAPPER_5(name, builder, ty0, ty1, ty2, ty3, ty4)      \
+  m.def(name,                                                                  \
+        [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, ty3 val3,      \
+           ty4 val4) { pm.addPass(builder({val0, val1, val2, val3, val4})); })
@@ -1,4 +1,5 @@
 // RUN: triton-opt %s -split-input-file --tritonamdgpu-block-pingpong="num-stages=2" | FileCheck %s
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-block-pingpong="num-stages=3" | FileCheck %s --check-prefixes CHECK-NS3
 
 //CHECK-LABEL: pingpong_small
 //CHECK: ttg.local_load
@@ -1835,3 +1836,140 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+// CHECK-LABEL: async_ns3_gemm
+// CHECK-NOT: rocdl
+// CHECK-NS3-LABEL: async_ns3_gemm
+// CHECK-NS3: amdgpu.cond_barrier
+// CHECK-NS3: %[[LL0:.+]] = ttg.local_load
+// CHECK-NS3: %[[LL1:.+]] = ttg.local_load
+// CHECK-NS3: ttg.async_wait
+// CHECK-NS3: tt.dot %[[LL0]], %[[LL1]]
+// CHECK-NS3: amdgpu.cond_barrier
+
+#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [32, 2], warpsPerCTA = [1, 8], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [2, 4], instrShape = [16, 16], isTransposed = true}>
+#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [0, 1]}>
+#shared1 = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @async_ns3_gemm(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !tt.ptr<bf16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}, %arg9: i32, %arg10: tensor<256x32x!tt.ptr<bf16>, #blocked>, %arg11: tensor<32x256x!tt.ptr<bf16>, #blocked1>, %arg12: !ttg.memdesc<256x32xbf16, #shared, #smem, mutable>, %arg13: !ttg.memdesc<256x32xbf16, #shared, #smem, mutable>, %arg14: !ttg.async.token, %arg15: !ttg.memdesc<32x256xbf16, #shared1, #smem, mutable>, %arg16: !ttg.memdesc<32x256xbf16, #shared1, #smem, mutable>, %arg17: !ttg.async.token, %arg18: !ttg.async.token, %arg19: !ttg.async.token, %arg20: tensor<256x32xi32, #blocked>, %arg21: tensor<32x256xi32, #blocked1>, %arg22: !ttg.memdesc<3x256x32xbf16, #shared, #smem, mutable>, %arg23: !ttg.memdesc<3x32x256xbf16, #shared1, #smem, mutable>, %arg24: tensor<256x256x!tt.ptr<bf16>, #mma>, %arg25: tensor<256x256xi1, #mma>) attributes {noinline = false} {
+    %c3_i32 = arith.constant 3 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma>
+    %0:12 = scf.for %arg26 = %c0_i32 to %arg9 step %c1_i32 iter_args(%arg27 = %cst, %arg28 = %arg10, %arg29 = %arg11, %arg30 = %c1_i32, %arg31 = %arg12, %arg32 = %arg13, %arg33 = %arg14, %arg34 = %arg15, %arg35 = %arg16, %arg36 = %arg17, %arg37 = %arg18, %arg38 = %arg19) -> (tensor<256x256xf32, #mma>, tensor<256x32x!tt.ptr<bf16>, #blocked>, tensor<32x256x!tt.ptr<bf16>, #blocked1>, i32, !ttg.memdesc<256x32xbf16, #shared, #smem, mutable>, !ttg.memdesc<256x32xbf16, #shared, #smem, mutable>, !ttg.async.token, !ttg.memdesc<32x256xbf16, #shared1, #smem, mutable>, !ttg.memdesc<32x256xbf16, #shared1, #smem, mutable>, !ttg.async.token, !ttg.async.token, !ttg.async.token)  : i32 {
+      %4 = tt.addptr %arg28, %arg20 : tensor<256x32x!tt.ptr<bf16>, #blocked>, tensor<256x32xi32, #blocked>
+      %5 = tt.addptr %arg29, %arg21 : tensor<32x256x!tt.ptr<bf16>, #blocked1>, tensor<32x256xi32, #blocked1>
+      %6 = arith.addi %arg30, %c1_i32 : i32
+      %7 = arith.cmpi slt, %6, %c3_i32 : i32
+      %8 = arith.select %7, %6, %c0_i32 : i32
+      %9 = ttg.memdesc_subview %arg22[%8, %c0_i32, %c0_i32] : !ttg.memdesc<3x256x32xbf16, #shared, #smem, mutable> -> !ttg.memdesc<256x32xbf16, #shared, #smem, mutable>
+      %10 = ttg.async_copy_global_to_local %4, %9 : tensor<256x32x!tt.ptr<bf16>, #blocked> -> <256x32xbf16, #shared, #smem, mutable>
+      %11 = ttg.async_commit_group %10
+      %12 = ttg.local_load %arg31 token %arg33 : !ttg.memdesc<256x32xbf16, #shared, #smem, mutable> -> tensor<256x32xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+      %13 = ttg.memdesc_subview %arg23[%8, %c0_i32, %c0_i32] : !ttg.memdesc<3x32x256xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<32x256xbf16, #shared1, #smem, mutable>
+      %14 = ttg.async_copy_global_to_local %5, %13 : tensor<32x256x!tt.ptr<bf16>, #blocked1> -> <32x256xbf16, #shared1, #smem, mutable>
+      %15 = ttg.async_commit_group %14
+      %16 = ttg.local_load %arg34 token %arg36 : !ttg.memdesc<32x256xbf16, #shared1, #smem, mutable> -> tensor<32x256xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+      %17 = tt.dot %12, %16, %arg27 : tensor<256x32xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<32x256xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<256x256xf32, #mma>
+      %18 = ttg.async_wait %arg37 {num = 0 : i32}
+      %19 = ttg.async_wait %arg38 {num = 0 : i32}
+      scf.yield %17, %4, %5, %8, %arg32, %9, %18, %arg35, %13, %19, %11, %15 : tensor<256x256xf32, #mma>, tensor<256x32x!tt.ptr<bf16>, #blocked>, tensor<32x256x!tt.ptr<bf16>, #blocked1>, i32, !ttg.memdesc<256x32xbf16, #shared, #smem, mutable>, !ttg.memdesc<256x32xbf16, #shared, #smem, mutable>, !ttg.async.token, !ttg.memdesc<32x256xbf16, #shared1, #smem, mutable>, !ttg.memdesc<32x256xbf16, #shared1, #smem, mutable>, !ttg.async.token, !ttg.async.token, !ttg.async.token
+    }
+    %1 = ttg.async_wait %0#10 {num = 0 : i32}
+    %2 = ttg.async_wait %0#11 {num = 0 : i32}
+    ttg.local_dealloc %arg22 : !ttg.memdesc<3x256x32xbf16, #shared, #smem, mutable>
+    ttg.local_dealloc %arg23 : !ttg.memdesc<3x32x256xbf16, #shared1, #smem, mutable>
+    %3 = arith.truncf %0#0 : tensor<256x256xf32, #mma> to tensor<256x256xbf16, #mma>
+    tt.store %arg24, %3, %arg25 : tensor<256x256x!tt.ptr<bf16>, #mma>
+    tt.return
+  }
+}
+
+
+// -----
+// CHECK-LABEL: gemm_mxfp4
+// CHECK: amdgpu.cond_barrier
+// CHECK: %[[WAIT:.+]] = ttg.async_wait
+// CHECK: ttg.async_copy_global_to_local
+// CHECK: ttg.async_copy_global_to_local
+// CHECK: ttg.async_copy_global_to_local
+// CHECK: ttg.async_copy_global_to_local
+// CHECK: rocdl.sched.barrier 0
+// CHECK: rocdl.s.barrier
+// CHECK: rocdl.sched.barrier 0
+// CHECK: %[[LL0:.+]] = ttg.local_load
+// CHECK-SAME: %[[WAIT]]
+// CHECK: %[[LL1:.+]] = ttg.local_load
+// CHECK-SAME: %[[WAIT]]
+// CHECK: %[[LL2:.+]] = ttg.local_load
+// CHECK-SAME: %[[WAIT]]
+// CHECK: %[[LL3:.+]] = ttg.local_load
+// CHECK-SAME: %[[WAIT]]
+// CHECK: tt.dot_scaled %[[LL2]] scale %[[LL0]], %[[LL3]] scale %[[LL1]]
+// CHECK: amdgpu.cond_barrier
+
+#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 8], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 8], order = [0, 1]}>
+#linear = #ttg.linear<{register = [[0, 4], [32, 0], [64, 0], [128, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 1], [0, 2]], warp = [[0, 0], [0, 0], [16, 0]], block = []}>
+#linear1 = #ttg.linear<{register = [[0, 4], [64, 0], [128, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 1], [0, 2]], warp = [[16, 0], [32, 0], [0, 0]], block = []}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [2, 4], instrShape = [16, 16], isTransposed = true}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
+#shared1 = #ttg.swizzled_shared<{vec = 16, perPhase = 2, maxPhase = 8, order = [1, 0]}>
+#shared2 = #ttg.swizzled_shared<{vec = 16, perPhase = 2, maxPhase = 8, order = [0, 1]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @gemm_mxfp4(%arg0: !tt.ptr<i8> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<i8> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !tt.ptr<bf16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: !tt.ptr<i8> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg4: !tt.ptr<i8> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32}, %arg12: i32 {tt.divisibility = 16 : i32}, %arg13: i32 {tt.divisibility = 16 : i32}, %arg14: tensor<256x8x!tt.ptr<i8>, #blocked>, %arg15: tensor<256x8x!tt.ptr<i8>, #blocked>, %arg16: tensor<256x128x!tt.ptr<i8>, #blocked1>, %arg17: tensor<128x256x!tt.ptr<i8>, #blocked2>, %arg18: !ttg.async.token, %arg19: !ttg.async.token, %arg20: !ttg.async.token, %arg21: !ttg.async.token, %arg22: !ttg.memdesc<256x8xi8, #shared, #smem, mutable>, %arg23: !ttg.memdesc<256x8xi8, #shared, #smem, mutable>, %arg24: !ttg.memdesc<256x128xi8, #shared1, #smem, mutable>, %arg25: !ttg.memdesc<128x256xi8, #shared2, #smem, mutable>, %arg26: tensor<256x8xi32, #blocked>, %arg27: tensor<256x8xi32, #blocked>, %arg28: tensor<256x256x!tt.ptr<bf16>, #mma>, %arg29: tensor<256x256xi1, #mma>) attributes {noinline = false} {
+    %c63_i32 = arith.constant 63 : i32
+    %c2_i32 = arith.constant 2 : i32
+    %cst = arith.constant dense<128> : tensor<256x128xi32, #blocked1>
+    %cst_0 = arith.constant dense<128> : tensor<128x256xi32, #blocked2>
+    %c1_i32 = arith.constant 1 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma>
+    %0 = ttg.local_alloc : () -> !ttg.memdesc<2x256x128xi8, #shared1, #smem, mutable>
+    %1 = ttg.local_alloc : () -> !ttg.memdesc<2x128x256xi8, #shared2, #smem, mutable>
+    %2 = ttg.local_alloc : () -> !ttg.memdesc<2x256x8xi8, #shared, #smem, mutable>
+    %3 = ttg.local_alloc : () -> !ttg.memdesc<2x256x8xi8, #shared, #smem, mutable>
+    %4:14 = scf.for %arg30 = %c0_i32 to %c63_i32 step %c1_i32 iter_args(%arg31 = %cst_1, %arg32 = %arg14, %arg33 = %arg15, %arg34 = %arg16, %arg35 = %arg17, %arg36 = %c0_i32, %arg37 = %arg18, %arg38 = %arg19, %arg39 = %arg20, %arg40 = %arg21, %arg41 = %arg22, %arg42 = %arg23, %arg43 = %arg24, %arg44 = %arg25) -> (tensor<256x256xf32, #mma>, tensor<256x8x!tt.ptr<i8>, #blocked>, tensor<256x8x!tt.ptr<i8>, #blocked>, tensor<256x128x!tt.ptr<i8>, #blocked1>, tensor<128x256x!tt.ptr<i8>, #blocked2>, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.memdesc<256x8xi8, #shared, #smem, mutable>, !ttg.memdesc<256x8xi8, #shared, #smem, mutable>, !ttg.memdesc<256x128xi8, #shared1, #smem, mutable>, !ttg.memdesc<128x256xi8, #shared2, #smem, mutable>)  : i32 {
+      %7 = ttg.async_wait %arg37, %arg38, %arg39, %arg40 {num = 0 : i32}
+      %8 = tt.addptr %arg34, %cst : tensor<256x128x!tt.ptr<i8>, #blocked1>, tensor<256x128xi32, #blocked1>
+      %9 = tt.addptr %arg35, %cst_0 : tensor<128x256x!tt.ptr<i8>, #blocked2>, tensor<128x256xi32, #blocked2>
+      %10 = tt.addptr %arg32, %arg26 : tensor<256x8x!tt.ptr<i8>, #blocked>, tensor<256x8xi32, #blocked>
+      %11 = tt.addptr %arg33, %arg27 : tensor<256x8x!tt.ptr<i8>, #blocked>, tensor<256x8xi32, #blocked>
+      %12 = arith.addi %arg36, %c1_i32 : i32
+      %13 = arith.cmpi slt, %12, %c2_i32 : i32
+      %14 = arith.select %13, %12, %c0_i32 : i32
+      %15 = ttg.memdesc_subview %2[%14, %c0_i32, %c0_i32] : !ttg.memdesc<2x256x8xi8, #shared, #smem, mutable> -> !ttg.memdesc<256x8xi8, #shared, #smem, mutable>
+      %16 = ttg.async_copy_global_to_local %10, %15 : tensor<256x8x!tt.ptr<i8>, #blocked> -> <256x8xi8, #shared, #smem, mutable>
+      %17 = ttg.async_commit_group %16
+      %18 = ttg.local_load %arg41 token %7 : !ttg.memdesc<256x8xi8, #shared, #smem, mutable> -> tensor<256x8xi8, #linear>
+      %19 = ttg.memdesc_subview %3[%14, %c0_i32, %c0_i32] : !ttg.memdesc<2x256x8xi8, #shared, #smem, mutable> -> !ttg.memdesc<256x8xi8, #shared, #smem, mutable>
+      %20 = ttg.async_copy_global_to_local %11, %19 : tensor<256x8x!tt.ptr<i8>, #blocked> -> <256x8xi8, #shared, #smem, mutable>
+      %21 = ttg.async_commit_group %20
+      %22 = ttg.local_load %arg42 token %7 : !ttg.memdesc<256x8xi8, #shared, #smem, mutable> -> tensor<256x8xi8, #linear1>
+      %23 = ttg.memdesc_subview %0[%14, %c0_i32, %c0_i32] : !ttg.memdesc<2x256x128xi8, #shared1, #smem, mutable> -> !ttg.memdesc<256x128xi8, #shared1, #smem, mutable>
+      %24 = ttg.async_copy_global_to_local %8, %23 : tensor<256x128x!tt.ptr<i8>, #blocked1> -> <256x128xi8, #shared1, #smem, mutable>
+      %25 = ttg.async_commit_group %24
+      %26 = ttg.local_load %arg43 token %7 : !ttg.memdesc<256x128xi8, #shared1, #smem, mutable> -> tensor<256x128xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>
+      %27 = ttg.memdesc_subview %1[%14, %c0_i32, %c0_i32] : !ttg.memdesc<2x128x256xi8, #shared2, #smem, mutable> -> !ttg.memdesc<128x256xi8, #shared2, #smem, mutable>
+      %28 = ttg.async_copy_global_to_local %9, %27 : tensor<128x256x!tt.ptr<i8>, #blocked2> -> <128x256xi8, #shared2, #smem, mutable>
+      %29 = ttg.async_commit_group %28
+      %30 = ttg.local_load %arg44 token %7 : !ttg.memdesc<128x256xi8, #shared2, #smem, mutable> -> tensor<128x256xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>
+      %31 = tt.dot_scaled %26 scale %18, %30 scale %22, %arg31 lhs = e2m1 rhs = e2m1 {fastMath = false} : tensor<256x128xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>, tensor<256x8xi8, #linear> * tensor<128x256xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>, tensor<256x8xi8, #linear1> -> tensor<256x256xf32, #mma>
+      scf.yield %31, %10, %11, %8, %9, %14, %17, %21, %25, %29, %15, %19, %23, %27 : tensor<256x256xf32, #mma>, tensor<256x8x!tt.ptr<i8>, #blocked>, tensor<256x8x!tt.ptr<i8>, #blocked>, tensor<256x128x!tt.ptr<i8>, #blocked1>, tensor<128x256x!tt.ptr<i8>, #blocked2>, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.memdesc<256x8xi8, #shared, #smem, mutable>, !ttg.memdesc<256x8xi8, #shared, #smem, mutable>, !ttg.memdesc<256x128xi8, #shared1, #smem, mutable>, !ttg.memdesc<128x256xi8, #shared2, #smem, mutable>
+    }
+    %5 = ttg.async_wait %4#6, %4#7, %4#8, %4#9 {num = 0 : i32}
+    ttg.local_dealloc %0 : !ttg.memdesc<2x256x128xi8, #shared1, #smem, mutable>
+    ttg.local_dealloc %1 : !ttg.memdesc<2x128x256xi8, #shared2, #smem, mutable>
+    ttg.local_dealloc %2 : !ttg.memdesc<2x256x8xi8, #shared, #smem, mutable>
+    ttg.local_dealloc %3 : !ttg.memdesc<2x256x8xi8, #shared, #smem, mutable>
+    %6 = arith.truncf %4#0 : tensor<256x256xf32, #mma> to tensor<256x256xbf16, #mma>
+    tt.store %arg28, %6, %arg29 : tensor<256x256x!tt.ptr<bf16>, #mma>
+    tt.return
+  }
+}
@@ -132,3 +132,22 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     tt.return
  }
 }
+
+// -----
+
+// CHECK-LABEL:mxfp4_2step
+#linear = #ttg.linear<{register = [[0, 4], [32, 0], [64, 0], [128, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 1], [0, 2]], warp = [[0, 0], [0, 0], [16, 0]], block = []}>
+#linear1 = #ttg.linear<{register = [[0, 4], [64, 0], [128, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 1], [0, 2]], warp = [[16, 0], [32, 0], [0, 0]], block = []}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [2, 4], instrShape = [16, 16], isTransposed = true}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @mxfp4_2step(%arg0: tensor<256x128xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>, %arg1: tensor<256x8xi8, #linear>, %arg2: tensor<128x256xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>, %arg3: tensor<256x8xi8, #linear1>) {
+    // CHECK-COUNT-32: rocdl.mfma.scale.f32.16x16x128.f8f6f4
+    // CHECK: rocdl.sched.barrier 0
+    // CHECK: rocdl.s.barrier
+    // CHECK: rocdl.sched.barrier 0
+    // CHECK-COUNT-32: rocdl.mfma.scale.f32.16x16x128.f8f6f4
+    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma>
+    %dots = tt.dot_scaled %arg0 scale %arg1, %arg2 scale %arg3, %cst lhs = e2m1 rhs = e2m1 {fastMath = false, pingpong_2step} : tensor<256x128xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>, tensor<256x8xi8, #linear> * tensor<128x256xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>, tensor<256x8xi8, #linear1> -> tensor<256x256xf32, #mma>
+    tt.return
+ }
+}
@@ -218,8 +218,10 @@ def make_ttgir(mod, metadata, options):
         global_prefetch = knobs.amd.global_prefetch
         local_prefetch = knobs.amd.local_prefetch
         use_async_copy = knobs.amd.use_async_copy
+        use_block_pingpong = is_pingpong_schedule_enabled(options.arch)
 
-        amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch, use_async_copy)
+        amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch, use_async_copy,
+                                               use_block_pingpong)
         if use_async_copy:
             amd.passes.ttgpuir.add_coalesce_async_copy(pm, options.arch)
         passes.common.add_canonicalizer(pm)
@@ -232,8 +234,7 @@ def make_ttgir(mod, metadata, options):
             amd.passes.ttgpuir.add_in_thread_transpose(pm)
             passes.ttgpuir.add_remove_layout_conversions(pm)
         amd.passes.ttgpuir.add_reorder_instructions(pm)
-        use_block_pingpong = is_pingpong_schedule_enabled(options.arch)
-        if use_block_pingpong and options.num_stages == 2:
+        if use_block_pingpong and options.num_stages > 1:
             amd.passes.ttgpuir.add_block_pingpong(pm, options.num_stages)
 
         if knobs.amd.use_buffer_ops:
 
@@ -26,6 +26,9 @@ def TritonAMDGPUStreamPipeline : Pass<"tritonamdgpu-stream-pipeline", "mlir::Mod
     Option<"useAsyncCopy", "use_async_copy",
            "bool", /*default*/"false",
            "Use AsyncCopyGlobalToLocal to directly load to shared memory">,
+    Option<"usePingpong", "use_pingpong",
+           "bool", /*default*/"false",
+           "Use schedules to enable block ping-pong">,
   ];
 }
Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,9 @@ def TritonAMDGPUStreamPipeline : Pass<"tritonamdgpu-stream-pipeline", "mlir::Mod`
`26`	`26`	`Option<"useAsyncCopy", "use_async_copy",`
`27`	`27`	`"bool", /*default*/"false",`
`28`	`28`	`"Use AsyncCopyGlobalToLocal to directly load to shared memory">,`
	`29`	`+ Option<"usePingpong", "use_pingpong",`
	`30`	`+ "bool", /*default*/"false",`
	`31`	`+ "Use schedules to enable block ping-pong">,`
`29`	`32`	`];`
`30`	`33`	`}`
`31`	`34`