Commit 26781e4
Revert "Revert "Add back barrier after asserts (#5043)"" (#2657)
Closes #2644 The error (more details: #2644 (comment)) seems to be that the operation is incorrectly inserted into the block. My best guess is that we need to explicitly insert a barrier at the beginning of the `thenBlock`. However I don't know the exact reason why this code works for nvidia (maybe because of the different number of instructions that initially replace `"gpu.barrier"() : () -> ()` however I'm not sure). ```bash python: /home/runner/work/triton/triton/llvm-project/llvm/include/llvm/ADT/ilist_iterator.h:168: llvm::ilist_iterator::reference llvm::ilist_iterator<llvm::ilist_detail::node_options<mlir::Operation, true, false, void, false, void>, false, false>::operator*() const [OptionsT = llvm::ilist_detail::node_options<mlir::Operation, true, false, void, false, void>, IsReverse = false, IsConst = false]: Assertion `!NodePtr->isKnownSentinel()' failed. Aborted (core dumped) ``` --------- Signed-off-by: Anatoly Myachev <[email protected]>
1 parent ca95a70 commit 26781e4
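For intuition, the race the barrier guards against looks roughly like this from the user's side: a per-element assert over a tensor, immediately followed by an op that could trap if the asserted condition were violated. The kernel below is a hypothetical sketch (names, shapes, and the surrounding setup are illustrative, not taken from the linked issue):

```python
import triton
import triton.language as tl


@triton.jit
def guarded_load(ptr, n, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    # Per-element assert over a tensor condition (only active when the
    # kernel is compiled with debug enabled).
    tl.device_assert(offs < n, "index out of bounds")
    # Without a barrier after the assert, some threads could reach this
    # (potentially trapping) load while others are still executing the
    # assert, since the two ops may use different layouts for the tensor.
    x = tl.load(ptr + offs)
    tl.store(ptr + offs, x + 1)
```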

3 files changed: +11 -6 lines changed

lib/Conversion/TritonGPUToLLVM/AssertOpToLLVM.cpp

Lines changed: 9 additions & 2 deletions
```diff
@@ -35,15 +35,21 @@ struct AssertOpConversion : public ConvertOpToLLVMPattern<triton::AssertOp> {
       }
     }
     llAssert(op, condition, adaptor.getMessage(), rewriter);
+    if (isa<RankedTensorType>(op.getCondition().getType())) {
+      // Add a barrier to avoid a race condition in case an assert is followed
+      // by an op that may trap if the assert condition is true. Since the
+      // tensor in those two operations may have different layout we need to
+      // make sure all the threads are done executing the assert before going to
+      // the next op.
+      barrier();
+    }
     rewriter.eraseOp(op);
     return success();
   }
   // op: the op at which the assert is inserted. Unlike printf, we need to
   // know about the op to split the block.
   void llAssert(Operation *op, Value condition, StringRef message,
                 ConversionPatternRewriter &rewriter) const {
-    ConversionPatternRewriter::InsertionGuard guard(rewriter);
-
     auto ctx = rewriter.getContext();
     auto loc = op->getLoc();
 
@@ -79,6 +85,7 @@ struct AssertOpConversion : public ConvertOpToLLVMPattern<triton::AssertOp> {
     rewriter.create<cf::BranchOp>(loc, thenBlock);
     rewriter.setInsertionPointToEnd(prevBlock);
     rewriter.create<cf::CondBranchOp>(loc, condition, ifBlock, thenBlock);
+    rewriter.setInsertionPointToStart(thenBlock);
   }
 
 protected:
```
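The `isa<RankedTensorType>` guard above means the trailing `barrier()` is emitted only for tensor-valued assert conditions. A minimal sketch of the two condition shapes from the Python side (an illustrative kernel, assuming scalar conditions are accepted end-to-end after this change):

```python
import triton
import triton.language as tl


@triton.jit
def assert_shapes(ptr, n, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    # Tensor condition: lowers to a RankedTensorType, so the conversion
    # pattern appends a barrier right after the assert.
    tl.device_assert(offs < n, "offs out of range")
    # Scalar condition: not a RankedTensorType, so no trailing barrier.
    pid = tl.program_id(0)
    tl.device_assert(pid >= 0, "pid must be non-negative")
    tl.store(ptr + offs, offs)
```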

python/triton/language/semantic.py

Lines changed: 0 additions & 4 deletions
```diff
@@ -1729,10 +1729,6 @@ def device_print(prefix: str, args: List[tl.tensor], hex: bool, builder: ir.builder)
 def device_assert(cond: tl.tensor, msg: str, builder: ir.builder) -> tl.tensor:
     if not builder.options.debug:
         return
-    cond_ty = cond.type
-    if not cond_ty.is_block():
-        cond_ty = tl.block_type(cond_ty.scalar, (1, ))
-        cond = tl.tensor(builder.create_splat(cond.handle, (1, )), cond_ty)
     return tl.tensor(builder.create_assert(cond.handle, msg), tl.void)
 
 
```
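With the splat fallback deleted, a scalar condition now reaches `builder.create_assert` unchanged instead of being promoted to a 1-element block, matching the type check in the C++ lowering above. A hedged sketch of the user-visible call (the kernel and its arguments are illustrative):

```python
import triton
import triton.language as tl


@triton.jit
def scalar_assert(n):
    pid = tl.program_id(0)
    # The scalar i1 condition is no longer splatted into a <1xi1> tensor
    # inside device_assert; it is forwarded to create_assert as-is.
    tl.device_assert(pid < n, "program id exceeds n")
```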

test/Conversion/tritongpu_to_llvm.mlir

Lines changed: 2 additions & 0 deletions
```diff
@@ -1842,6 +1842,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
 // CHECK-DAG: llvm.mlir.global internal constant @assertFunc_0("unknown\00") {addr_space = 0 : i32}
 // CHECK-DAG: llvm.mlir.global internal constant @assertFile_0("inner_call\00") {addr_space = 0 : i32}
 // CHECK-DAG: llvm.mlir.global internal constant @assertMessage_0("assert text\00") {addr_space = 0 : i32}
+// CHECK: llvm.call @__assertfail
+// CHECK: nvvm.barrier0
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} {
   tt.func public @add_kernel(%arg0: tensor<1xi1, #blocked>) {
     tt.assert %arg0, "assert text" : tensor<1xi1, #blocked> loc(#loc5)
```
