[Backend] Try to fix infinite loop in membar (#5973)

Mogball · web-flow · commit d71421d3d5f2 · 2025-02-20T09:15:50.000-08:00
diff --git a/lib/Analysis/Membar.cpp b/lib/Analysis/Membar.cpp
@@ -60,8 +60,9 @@ void MembarAnalysis::resolve(FunctionOpInterface funcOp,
       // the outputBlockInfo, we skip the successors
       continue;
     }
-    // Update the current block
-    outputBlockInfoMap[block].join(inputBlockInfo);
+    // Update the current block. The block transfer function is not monotonic,
+    // so overwrite the output state entirely.
+    outputBlockInfoMap[block] = inputBlockInfo;
     // Update the successors
     for (auto *successor : successors) {
       inputBlockInfoMap[successor].join(outputBlockInfoMap[block]);
diff --git a/test/Analysis/test-membar.mlir b/test/Analysis/test-membar.mlir
@@ -828,3 +828,44 @@ tt.func @tma_special_cases_cf(%arg1: !tt.ptr<i8, 0>, %i1 : i1, %arg2: tensor<256
   tt.return %t : tensor<256x64xf16, #blocked>
 }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
+#smem = #ttg.shared_memory
+
+module attributes {"ttg.num-warps" = 4 : i32} {
+
+// CHECK-LABEL: @direct_backedge_within_loop
+tt.func @direct_backedge_within_loop(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr<f16>, %arg4: !tt.ptr<f16>, %arg5: i1) {
+  // CHECK-NEXT: constant
+  %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #blocked>
+  // CHECK-NEXT: local_alloc
+  %0 = ttg.local_alloc %cst : (tensor<128x32xf16, #blocked>) -> !ttg.memdesc<128x32xf16, #shared, #smem>
+  // CHECK-NEXT: barrier
+  // CHECK-NEXT: local_load
+  %1 = ttg.local_load %0 : !ttg.memdesc<128x32xf16, #shared, #smem> -> tensor<128x32xf16, #blocked>
+  // CHECK-NEXT: br
+  cf.br ^bb1(%arg0, %0 : index, !ttg.memdesc<128x32xf16, #shared, #smem>)
+^bb1(%2: index, %3: !ttg.memdesc<128x32xf16, #shared, #smem>):
+  cf.cond_br %arg5, ^bb2, ^bb3
+// CHECK: ^bb2:
+^bb2:
+  // CHECK-NEXT: barrier
+  // CHECK-NEXT: local_alloc
+  %4 = ttg.local_alloc %cst : (tensor<128x32xf16, #blocked>) -> !ttg.memdesc<128x32xf16, #shared, #smem>
+  // CHECK-NEXT: br
+  cf.br ^bb1(%arg1, %4 : index, !ttg.memdesc<128x32xf16, #shared, #smem>)
+// CHECK: ^bb3:
+^bb3:
+  // CHECK-NEXT: barrier
+  // CHECK-NEXT: local_load
+  %5 = ttg.local_load %3 : !ttg.memdesc<128x32xf16, #shared, #smem> -> tensor<128x32xf16, #blocked>
+  // CHECK-NEXT: cond_br
+  cf.cond_br %arg5, ^bb3, ^bb4
+^bb4:
+  tt.return
+}
+
+}