
Commit 82b1f85

Merge commit 'c6abb226fddb11d3b72062239610e425577d025c'
2 parents: 65d0eb6 + c6abb22


50 files changed (+1442, -810 lines)

CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -116,6 +116,10 @@ if(NOT MLIR_DIR)
   set(MLIR_DIR ${LLVM_LIBRARY_DIR}/cmake/mlir)
 endif()
 
+if(NOT LLD_DIR)
+  set(LLD_DIR ${LLVM_LIBRARY_DIR}/cmake/lld)
+endif()
+
 # MLIR
 find_package(MLIR REQUIRED CONFIG PATHS ${MLIR_DIR})

Makefile

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ PYTHON ?= python
 BUILD_DIR := $(shell cd python; $(PYTHON) -c 'from build_helpers import get_cmake_dir; print(get_cmake_dir())')
 TRITON_OPT := $(BUILD_DIR)/bin/triton-opt
 PYTEST := $(PYTHON) -m pytest
-LLVM_BUILD_PATH ?= ".llvm-project/build"
+LLVM_BUILD_PATH ?= $(realpath .llvm-project/build)
 NUM_PROCS ?= 8
 
 # Incremental builds

include/triton/Analysis/Utility.h

Lines changed: 0 additions & 4 deletions

@@ -254,10 +254,6 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);
 
 bool atomicNeedsSharedMemory(Value result);
 
-// Return true if the src and dst layout match.
-bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
-                                   RankedTensorType dstTy);
-
 // Check if MFMA layout can be converted to the dot operand
 // layout using warp shuffle.
 bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 6 additions & 2 deletions

@@ -60,6 +60,11 @@ def TritonGPUHoistTMEMAlloc : Pass<"tritongpu-hoist-tmem-alloc", "mlir::ModuleOp
                              "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
                              "mlir::scf::SCFDialect",
                              "mlir::arith::ArithDialect"];
+  let options = [
+    Option<"hoistOutOfIf", "hoist-out-of-if",
+           "bool", /*default*/"false",
+           "Hoist TMEM allocations out of if statements">
+  ];
 }
 
 def TritonGPUTestPipelineLowerLoop : Pass<"tritongpu-test-pipeline-lower-loop", "mlir::ModuleOp"> {

@@ -130,8 +135,7 @@ def TritonGPURewritePartitionDependencies : Pass<"tritongpu-rewrite-partition-de
                              "mlir::triton::gpu::TritonGPUDialect",
                              "mlir::scf::SCFDialect",
                              "mlir::arith::ArithDialect",
-                             "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
-                             "mlir::triton::nvws::NVWSDialect"
+                             "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"
   ];
 }
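The new option plugs into the standard MLIR pass-option machinery, so, assuming the usual textual option syntax and the triton-opt binary referenced in the Makefile above, it would presumably be enabled as:

triton-opt --tritongpu-hoist-tmem-alloc='hoist-out-of-if=true' input.mlir

(`input.mlir` is a placeholder input file, not part of this commit.)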

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.td

Lines changed: 4 additions & 0 deletions

@@ -49,6 +49,10 @@ def MMAv5OpInterface : OpInterface<"MMAv5OpInterface"> {
     InterfaceMethod<"Get the produced write dependency of the accumulator.",
                     "::mlir::Value",
                     "getToken">,
+    InterfaceMethod<"Indicate that this MMA op executes asynchronously.",
+                    "void",
+                    "setIsAsync",
+                    (ins "bool":$isAsync)>,
   ];
 }
 #endif // TRITON_NVIDIAGPU_OP_INTERFACES
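A minimal C++ sketch, not taken from this commit, of how a transform might drive the new interface method; `op` stands for an arbitrary `mlir::Operation *`:

// Hypothetical usage: if the op implements MMAv5OpInterface,
// flag it as executing asynchronously.
if (auto mma = mlir::dyn_cast<mlir::triton::nvidia_gpu::MMAv5OpInterface>(op))
  mma.setIsAsync(true);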

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 17 additions & 8 deletions

@@ -421,8 +421,8 @@ def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [
 
   let description = [{
     $d += matrix_multiply($a, $b).
-    If no barrier is given the op is assumed to be synchronous otherwise the op will trigger a commit/arrive on the given barrier.
-    If there is a barrier the result will be safe to read after a barrier wait.
+    if is_async is false, the op executes synchronously. The barrier operands must not be present in that case.
+    Otherwise, if a barrier is given, the op will trigger a commit/arrive on it. The result will be safe to read after a barrier wait.
     If $two_ctas is set the op will execute a matmul across two contiguous CTAs, it will read the data distributed across the two CTAs.
     and syncronize both CTAs if the op is synchronous.

@@ -440,7 +440,8 @@ def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [
     I1:$pred,
     Variadic<TTG_MemDescType>:$barriers,
     Variadic<I1>:$barrier_preds,
-    OptionalAttr<UnitAttr>:$two_ctas
+    UnitAttr:$is_async,
+    UnitAttr:$two_ctas
   );
   let results = (outs Optional<TTG_AsyncToken>:$token);

@@ -449,7 +450,8 @@ def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [
     "Value":$a, "Value":$b, "Value":$d, "Value":$acc_dep, "Value":$useD,
     "Value":$pred, CArg<"bool", "false">:$two_ctas,
     CArg<"ValueRange", "{}">:$barriers,
-    CArg<"ValueRange", "{}">:$barrier_preds)>
+    CArg<"ValueRange", "{}">:$barrier_preds,
+    CArg<"bool", "false">:$is_async)>
   ];
 
   let assemblyFormat = [{

@@ -458,6 +460,8 @@ def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [
     attr-dict `:` qualified(type($a)) `,` qualified(type($b)) `,`
     qualified(type($d)) (`,` qualified(type($barriers))^)?
   }];
+
+  let hasVerifier = 1;
 }
 
 def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [

@@ -470,8 +474,9 @@ def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [
 
   let description = [{
     $d += matrix_multiply(scale($lhs, $lhs_scale), scale(rlhs, $rhs_scale))
-    If no barrier is given the op is assumed to be synchronous otherwise the op will trigger a commit/arrive on the given barrier.
-    If there is a barrier the result will be safe to read after a barrier wait.
+    if is_async is false, the op executes synchronously. The barrier operands must not be present in that case.
+    Otherwise, if a barrier is given, the op will trigger a commit/arrive on it.
+    The result will be safe to read after a barrier wait.
 
     This operation takes and produces an optional token to indicate TMEM read
     and write on its accumulator operand. When the tokens are present, they can

@@ -490,7 +495,8 @@ def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [
     I1:$useD,
     I1:$pred,
     Variadic<TTG_MemDescType>:$barriers,
-    Variadic<I1>:$barrier_preds
+    Variadic<I1>:$barrier_preds,
+    UnitAttr:$is_async
   );
   let results = (outs Optional<TTG_AsyncToken>:$token);

@@ -510,7 +516,8 @@ def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [
     "::mlir::triton::ScaleDotElemType":$b_type,
     "::mlir::Value":$useD, "::mlir::Value":$pred,
     CArg<"::mlir::ValueRange", "{}">:$barriers,
-    CArg<"::mlir::ValueRange", "{}">:$barrier_preds)>
+    CArg<"::mlir::ValueRange", "{}">:$barrier_preds,
+    CArg<"bool", "false">:$is_async)>
   ];
 
   let assemblyFormat = [{

@@ -521,6 +528,8 @@ def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [
     qualified(type($d)) `,` qualified(type($a_scale)) `,`
     qualified(type($b_scale)) (`,` qualified(type($barriers))^)?
   }];
+
+  let hasVerifier = 1;
 }
 
 def TTNG_TCGen5CommitOp : TTNG_Op<"tc_gen5_commit"> {
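A minimal C++ sketch, assuming the generated builder matches the TableGen builder declaration above, of creating a synchronous tc_gen5_mma; `builder`, `loc`, `a`, `b`, `d`, `accDep`, `useD`, and `pred` are hypothetical values, not part of this commit:

// Hypothetical builder call; with is_async unset, the new verifier
// (hasVerifier = 1) would be expected to reject barrier operands.
auto mma = builder.create<mlir::triton::nvidia_gpu::TCGen5MMAOp>(
    loc, a, b, d, accDep, useD, pred,
    /*two_ctas=*/false, /*barriers=*/mlir::ValueRange{},
    /*barrier_preds=*/mlir::ValueRange{}, /*is_async=*/false);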

lib/Analysis/Utility.cpp

Lines changed: 0 additions & 19 deletions

@@ -719,24 +719,6 @@ bool supportMMA(Value value, int version) {
          (elemTy.isInteger(8) && version >= 2);
 }
 
-// For MMAV3 dotOperand layout matches mma operand for f16 and bf16 cases.
-bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
-                                   RankedTensorType dstTy) {
-  auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(srcTy.getEncoding());
-  auto dotOperandLayout = dyn_cast<DotOperandEncodingAttr>(dstTy.getEncoding());
-  if (!mmaLayout || !dotOperandLayout) {
-    return false;
-  }
-  int elementTypeSize = srcTy.getElementType().getIntOrFloatBitWidth();
-  auto parentTy = srcTy.cloneWithEncoding(dotOperandLayout.getParent());
-  auto ans = mmaLayout.getVersionMajor() == 3 &&
-             dotOperandLayout.getOpIdx() == 0 &&
-             mmaLayout.getWarpsPerCTA()[1] == 1 &&
-             !cvtNeedsSharedMemory(parentTy, srcTy) && elementTypeSize == 8 &&
-             dotOperandLayout.getKWidth() == 32 / elementTypeSize;
-  return ans;
-}
-
 bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
                                        RankedTensorType dstTy) {
   auto mfmaLayout = dyn_cast<AMDMfmaEncodingAttr>(srcTy.getEncoding());

@@ -817,7 +799,6 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) {
   return !cvtReordersRegisters(srcTy, dstTy) &&
          !cvtNeedsWarpShuffle(srcTy, dstTy) &&
          !triton::gpu::intel::isDpasToDotShortcut(srcTy, dstTy) &&
-         !matchMmaV3AndDotOperandLayout(srcTy, dstTy) &&
          // to be removed when generalized warp shuffle conversions
          // are ready:
          !matchMFMAAndDotOperandShuffleCase(srcTy, dstTy);
