Skip to content

Commit c7a9f29

Browse files
Disabling mmav5 pipelining if there are two or more dots in the loop (#6036)
MMAv5 pipelining has a problem keeping track of async loads feeding into ops at different pipeline stages. All the corner cases are difficult to maintain, and problems with this keep arising. I am disabling support for pipelining when there is more than one dot op in the loop. This means that in most kernels all the loads will feed the dot op, or some operation preceding it, resolving the issue. It also means attention kernels won't be pipelined for now, until we finish integrating the mmav5 pipelining pass into the new pipeliner infrastructure.
1 parent 2f17e8b commit c7a9f29

File tree

2 files changed

+7
-13
lines changed

2 files changed

+7
-13
lines changed

lib/Dialect/TritonGPU/Transforms/Pipeliner/TC05MMAPipeline.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -615,7 +615,10 @@ FailureOr<scf::ForOp> preProcessLoopForTC05MMAPipelining(scf::ForOp forOp,
615615
}
616616
});
617617

618-
if (mmaOps.empty()) {
618+
// Temporarily disable mma pipelining if there are more than one mmaOp in the
619+
// loop. This is a workaround for difficult to solve scheduling issues with
620+
// loads feeding into non-0 stage ops.
621+
if (mmaOps.empty() || mmaOps.size() > 1) {
619622
return failure();
620623
}
621624

test/TritonGPU/mma-pipeline-blackwell.mlir

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -259,20 +259,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
259259
}
260260

261261
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
262-
// CHECK-LOWER-LABEL: @do_not_pipeline_second_dot
263-
// CHECK-LOWER: scf.for {{.*}}
264-
// CHECK-LOWER: ttng.tmem_store {{.*}} {triton.pipeline_stage = 0 : i32}
265-
// CHECK-LOWER: ttng.tc_gen5_mma {{.*}} {triton.pipeline_stage = 0 : i32}
266-
// CHECK-LOWER: ttng.tmem_load {{.*}} {triton.pipeline_stage = 1 : i32}
267-
// CHECK-LOWER: ttng.tmem_alloc
268-
// CHECK-LOWER-NOT: triton.pipeline_stage
269-
// CHECK-LOWER: ttng.tc_gen5_mma
270-
// CHECK-LOWER-NOT: triton.pipeline_stage
271-
// CHECK-LOWER: ttng.tmem_load
262+
// CHECK-LOWER-LABEL: @do_not_pipeline_two_dots
272263
// CHECK-LOWER-NOT: triton.pipeline_stage
273264

274-
// CHECK-LABEL: @do_not_pipeline_second_dot
275-
tt.func public @do_not_pipeline_second_dot(%A_ptr: tensor<128x128x!tt.ptr<f16>, #blocked1>, %B_ptr: tensor<128x128x!tt.ptr<f16>, #blocked1>, %acc_ptr: tensor<128x128x!tt.ptr<f32>, #blocked>, %res_ptr: tensor<128x128x!tt.ptr<f32>, #blocked>, %arg3: i32) attributes {noinline = false} {
265+
// CHECK-LABEL: @do_not_pipeline_two_dots
266+
tt.func public @do_not_pipeline_two_dots(%A_ptr: tensor<128x128x!tt.ptr<f16>, #blocked1>, %B_ptr: tensor<128x128x!tt.ptr<f16>, #blocked1>, %acc_ptr: tensor<128x128x!tt.ptr<f32>, #blocked>, %res_ptr: tensor<128x128x!tt.ptr<f32>, #blocked>, %arg3: i32) attributes {noinline = false} {
276267
%true = arith.constant true
277268
%cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
278269
%c0_i32 = arith.constant 0 : i32

0 commit comments

Comments
 (0)