[BACKEND] Improve perf of tensormap_fenceproxy_acquire (triton-lang#4720)

peterbell10 · web-flow · commit df26ec64aedd · 2024-09-12T14:26:26.000-07:00
The fence turns out to be fairly expensive, so it is cheaper to execute
it on a single warp and then use a barrier to synchronize the remaining
threads.
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TMAToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TMAToLLVM.cpp
@@ -4,12 +4,11 @@
 #include "PatternTritonGPUOpToLLVM.h"
 #include "TritonNVIDIAGPUToLLVM/PTXAsmFormat.h"
 
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/IR/Value.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
-#include "triton/Dialect/Triton/IR/Types.h"
-#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 
 using namespace mlir;
 using namespace mlir::triton;
@@ -195,12 +194,20 @@ struct ExperimentalTensormapFenceproxyAcquireOpConversion
     auto *sizeOpr = ptxBuilder.newConstantOperand(TMA_SIZE_BYTES);
 
     // Define the instruction opcode
+    constexpr int kWarpSize = 32;
+    Value threadId = getThreadId(rewriter, loc);
+    Value pred = icmp_slt(threadId, i32_val(kWarpSize));
     auto &fence =
         *ptxBuilder.create<>("fence.proxy.tensormap::generic.acquire.gpu");
-    fence(descAddrOpr, sizeOpr);
+    fence(descAddrOpr, sizeOpr).predicate(pred);
 
     ptxBuilder.launch(rewriter, loc, getVoidType());
 
+    // Only the first warp executes the fence; a barrier then synchronizes the
+    // remaining warps. This is faster than running the fence on every warp.
+    // TODO: Ideally, emit only one barrier after all fences have been issued.
+    rewriter.create<NVVM::Barrier0Op>(loc);
+
     rewriter.eraseOp(op);
     return success();
   }