Skip to content

Commit df26ec6

Browse files
authored
[BACKEND] Improve perf of tensormap_fenceproxy_acquire (triton-lang#4720)
The fence turns out to be fairly expensive, and it's cheaper to perform the fence on a single warp and use a barrier to synchronize the remaining threads.
1 parent c238af8 commit df26ec6

File tree

1 file changed

+10
-3
lines changed

1 file changed

+10
-3
lines changed

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TMAToLLVM.cpp

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,11 @@
44
#include "PatternTritonGPUOpToLLVM.h"
55
#include "TritonNVIDIAGPUToLLVM/PTXAsmFormat.h"
66

7+
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
78
#include "mlir/IR/Value.h"
89
#include "mlir/Transforms/DialectConversion.h"
910
#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
1011
#include "triton/Dialect/Triton/IR/Dialect.h"
11-
#include "triton/Dialect/Triton/IR/Types.h"
12-
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
1312

1413
using namespace mlir;
1514
using namespace mlir::triton;
@@ -195,12 +194,20 @@ struct ExperimentalTensormapFenceproxyAcquireOpConversion
195194
auto *sizeOpr = ptxBuilder.newConstantOperand(TMA_SIZE_BYTES);
196195

197196
// Define the instruction opcode
197+
constexpr int kWarpSize = 32;
198+
Value threadId = getThreadId(rewriter, loc);
199+
Value pred = icmp_slt(threadId, i32_val(kWarpSize));
198200
auto &fence =
199201
*ptxBuilder.create<>("fence.proxy.tensormap::generic.acquire.gpu");
200-
fence(descAddrOpr, sizeOpr);
202+
fence(descAddrOpr, sizeOpr).predicate(pred);
201203

202204
ptxBuilder.launch(rewriter, loc, getVoidType());
203205

206+
// We run the fence on a single warp, then use a barrier to synchronize the
207+
// rest. This ends up being faster than running the fence on each warp.
208+
// TODO: Ideally we only emit one barrier after all fences are issued
209+
rewriter.create<NVVM::Barrier0Op>(loc);
210+
204211
rewriter.eraseOp(op);
205212
return success();
206213
}

0 commit comments

Comments
 (0)