|
4 | 4 | #include "PatternTritonGPUOpToLLVM.h" |
5 | 5 | #include "TritonNVIDIAGPUToLLVM/PTXAsmFormat.h" |
6 | 6 |
|
| 7 | +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" |
7 | 8 | #include "mlir/IR/Value.h" |
8 | 9 | #include "mlir/Transforms/DialectConversion.h" |
9 | 10 | #include "triton/Conversion/TritonGPUToLLVM/Utility.h" |
10 | 11 | #include "triton/Dialect/Triton/IR/Dialect.h" |
11 | | -#include "triton/Dialect/Triton/IR/Types.h" |
12 | | -#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" |
13 | 12 |
|
14 | 13 | using namespace mlir; |
15 | 14 | using namespace mlir::triton; |
@@ -195,12 +194,20 @@ struct ExperimentalTensormapFenceproxyAcquireOpConversion |
195 | 194 | auto *sizeOpr = ptxBuilder.newConstantOperand(TMA_SIZE_BYTES); |
196 | 195 |
|
197 | 196 | // Define the instruction opcode |
| 197 | + constexpr int kWarpSize = 32; |
| 198 | + Value threadId = getThreadId(rewriter, loc); |
| 199 | + Value pred = icmp_slt(threadId, i32_val(kWarpSize)); |
198 | 200 | auto &fence = |
199 | 201 | *ptxBuilder.create<>("fence.proxy.tensormap::generic.acquire.gpu"); |
200 | | - fence(descAddrOpr, sizeOpr); |
| 202 | + fence(descAddrOpr, sizeOpr).predicate(pred); |
201 | 203 |
|
202 | 204 | ptxBuilder.launch(rewriter, loc, getVoidType()); |
203 | 205 |
|
| 206 | + // We run the fence on a single warp, then use a barrier to make the other |
| 207 | + // warps wait. This ends up faster than running the fence on each warp. |
| 208 | + // TODO: Ideally we would emit only one barrier after all fences are issued. |
| 209 | + rewriter.create<NVVM::Barrier0Op>(loc); |
| 210 | + |
204 | 211 | rewriter.eraseOp(op); |
205 | 212 | return success(); |
206 | 213 | } |
|
0 commit comments