Skip to content

Commit f57b7bf

Browse files
committed
Merge branch 'main' into gregory/windows-support
2 parents ec7dff7 + 1bc283c commit f57b7bf

File tree

57 files changed

+1218
-675
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+1218
-675
lines changed

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
8888
mlir::registerTritonAMDGPUReorderInstructions();
8989
mlir::registerTritonAMDGPUStreamPipelineV2();
9090
mlir::registerTritonAMDGPUCanonicalizePointers();
91+
mlir::registerTritonAMDGPUConvertToBufferOps();
9192
#endif
9293

9394
// TODO: register Triton & TritonGPU passes

include/triton/Analysis/Utility.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "mlir/Support/LLVM.h"
77
#include "triton/Dialect/Triton/IR/Dialect.h"
88
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
9+
#include "triton/Tools/LinearLayout.h"
910

1011
namespace mlir {
1112

@@ -189,6 +190,14 @@ bool supportMMA(triton::DotOp op, int version);
189190

190191
bool supportMMA(Value value, int version);
191192

193+
// Conversion from `srcTy` to `dstTy` involving the minimum amount of data
194+
// transfer provided that both types can be converted to LL (if they can't it'll
195+
// return nullopt). The output will be such that layout.getInDimNames() ==
196+
// layout.getOutDimNames() and the conversion will not include kBlock (resp.
197+
// kWarp or kLane) if it can be avoided
198+
std::optional<mlir::triton::LinearLayout>
199+
minimalCvtLayout(RankedTensorType srcTy, RankedTensorType dstTy);
200+
192201
// Conversion from `srcTy` to `dstTy` only involves reordering of registers.
193202
// There is no need for data exchange across threads, warps, or blocks.
194203
bool cvtReordersRegisters(RankedTensorType srcTy, RankedTensorType dstTy);

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ class TargetInfoBase {
8282

8383
virtual int getSharedAddressSpace() const = 0;
8484

85+
virtual bool supportVectorizedAtomics() const = 0;
86+
8587
virtual ~TargetInfoBase() {}
8688
};
8789
} // namespace mlir::triton

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,6 @@ std::optional<LinearLayout>
4444
toLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
4545
std::optional<int32_t> elemBitWidth = std::nullopt);
4646

47-
// Given a linear layout with input dims and output dims containing a "block"
48-
// dimension, determines if the layout moves data across block boundaries.
49-
bool isCrossCTAConversion(const LinearLayout &layout);
50-
5147
// Given a linear layout where the input dimensions contain a "block" dimension,
5248
// this method sets the "block" dimension to 0 and removes the corresponding
5349
// output dimensions.

include/triton/Tools/LinearLayout.h

Lines changed: 18 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -575,29 +575,20 @@ class LinearLayout {
575575
return *this;
576576
}
577577

578-
// divideLeft and divideRight are the inverses of operator*.
579-
//
580-
// Consider `a = c.divideRight(b)`, where `a` is a linear layout with
581-
// `in-dims(a) == in-dims(b)` and `out-dims(a) == out-dims(c)`. We may remove
582-
// some empty dimensions from `a` to form `a'` and still have `a' * b == c`.
583-
// Therefore, there are multiple possible values that we could return for
584-
// `(a * b).divideRight(b)` which would satisfy
585-
// `((a * b).divideRight(b)) * b == a * b`.
586-
//
587-
// In the following example, we have `a * b == a' * b` when "in1" is an empty
588-
// dimension that maps everything to 0:
589-
//
590-
// a = L("in1", "in2") -> ("out1", "out2")
591-
// a' = L("in1") -> ("out1")
592-
// b = L("in2") -> ("out2")
593-
//
594-
// divideLeft and divideRight resolve this ambiguity by always returning the
595-
// "canonical" quotient, namely the one with the fewest possible size-zero
596-
// input and output dimensions.
597-
//
598-
// TODO(jlebar): Implement divideLeft.
599-
// std::optional<LinearLayout> divideLeft(const LinearLayout &divisor);
600-
std::optional<LinearLayout> divideRight(const LinearLayout &divisor) const;
578+
// Returns true if this layout acts trivially (as the identity) on the given
579+
// dimensions. This means that it's the identity on those dimensions, and it
580+
// does not map other dimensions onto those, nor those onto other dimensions.
581+
bool isTrivialOver(ArrayRef<StringAttr> dimNames) const;
582+
583+
// For an endomorphism on dimNames (linear map that maps dimNames to dimNames)
584+
// checks whether it is the identity map on these dimensions (i.e.
585+
// LinearLayout::isTrivialOver) and if so, returns the sublayout of the
586+
// remaining dimensions.
587+
// nb. The isTrivialOver condition is more restrictive than the usual
588+
// "leaves the subspace invariant" condition in maths.
589+
// We can always relax it if we know how to take advantage of a conversion
590+
// layout being block-diagonal in the future.
591+
std::optional<LinearLayout> quotient(ArrayRef<StringAttr> dimNames) const;
601592

602593
// Gets a layout with only these in/out dimensions.
603594
//
@@ -614,10 +605,10 @@ class LinearLayout {
614605
bool sublayoutIsZero(ArrayRef<StringAttr> inDimNames,
615606
ArrayRef<StringAttr> outDimNames) const;
616607

617-
// Is the sublayout restricted to inDimNames + outDimNames and then flattened
618-
// to 1D the identity layout (ignoring out-dim sizes)?
619-
bool sublayoutIsIdentity(ArrayRef<StringAttr> inDimNames,
620-
ArrayRef<StringAttr> outDimNames) const;
608+
// Is the sublayout defined from dimNames to dimNames the identity?
609+
// In particular, are the input and output sizes in these dimensions
610+
// the same, and are the bases the identity?
611+
bool squareSublayoutIsIdentity(ArrayRef<StringAttr> dimNames) const;
621612

622613
// Computes and returns L(x, y, z).
623614
//

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ namespace mlir::triton {
1313
inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
1414
// clang-format off
1515
"AMDGCN_ENABLE_DUMP",
16+
"AMDGCN_USE_BUFFER_OPS",
1617
"DISABLE_FAST_REDUCTION",
1718
"DISABLE_LLVM_OPT",
1819
"DISABLE_MMA_V3",

lib/Analysis/Utility.cpp

Lines changed: 38 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -647,57 +647,56 @@ bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
647647
return ans;
648648
}
649649

650-
bool cvtReordersRegisters(RankedTensorType srcTy, RankedTensorType dstTy) {
650+
// We get the smallest submap of srcTy^{-1} * dstTy that is not the identity
651+
// under kBlock, kWarp or kLane (in that order). The idea here is that if we
652+
// have a transformation that's the identity on kBlock, we don't need to use
653+
// distributed shared memory. If it's also the identity on kWarp, we can
654+
// transfer via warp-shuffles, and if it's the identity on kLane we just have to
655+
// reorder the registers
656+
std::optional<LinearLayout> minimalCvtLayout(RankedTensorType srcTy,
657+
RankedTensorType dstTy) {
651658
MLIRContext *ctx = srcTy.getContext();
652659
std::optional<LinearLayout> srcLayout =
653660
toLinearLayout(srcTy.getShape(), srcTy.getEncoding());
654661
std::optional<LinearLayout> dstLayout =
655662
toLinearLayout(dstTy.getShape(), dstTy.getEncoding());
656-
if (srcLayout.has_value() && dstLayout.has_value()) {
657-
// comp describes the layout function for converting from src to dst.
658-
LinearLayout comp = srcLayout->invertAndCompose(*dstLayout);
659-
StringAttr kLane = StringAttr::get(ctx, "lane");
660-
StringAttr kWarp = StringAttr::get(ctx, "warp");
661-
StringAttr kBlock = StringAttr::get(ctx, "block");
662-
// TODO(jlebar): These checks are overly-restrictive. For example, we can
663-
// transfer by shuffling registers (case 1) if and only if all of the bases
664-
// for `register` have 0s for lane, warp, and block. But the check below is
665-
// stronger than this, checking also that the choice of lane/warp/block does
666-
// not affect the permutation of registers. If we allow different
667-
// lane/warp/blocks to have different permutations, we can generalize this.
668-
if (comp.divideRight(LinearLayout::identity1D(comp.getInDimSize(kLane),
669-
kLane, kLane) *
670-
LinearLayout::identity1D(comp.getInDimSize(kWarp),
671-
kWarp, kWarp) *
672-
LinearLayout::identity1D(comp.getInDimSize(kBlock),
673-
kBlock, kBlock))
674-
.has_value()) {
675-
return true;
663+
if (!(srcLayout.has_value() && dstLayout.has_value()))
664+
return std::nullopt;
665+
// comp describes the layout function to create dst from src.
666+
LinearLayout comp = dstLayout->invertAndCompose(*srcLayout);
667+
// We try to quotient by the largest subspace first
668+
auto dims = SmallVector<StringRef>{"block", "warp", "lane", "register"};
669+
for (auto dim : dims) {
670+
auto quotient = comp.quotient(StringAttr::get(ctx, dim));
671+
if (!quotient.has_value()) {
672+
break;
676673
}
674+
comp = *quotient;
677675
}
678-
return false;
676+
return comp;
677+
}
678+
679+
bool cvtReordersRegisters(RankedTensorType srcTy, RankedTensorType dstTy) {
680+
auto layout = minimalCvtLayout(srcTy, dstTy);
681+
MLIRContext *ctx = srcTy.getContext();
682+
if (!layout.has_value()) {
683+
return false;
684+
}
685+
auto kRegister = StringAttr::get(ctx, "register");
686+
auto outDims = llvm::to_vector(layout->getOutDimNames());
687+
return outDims.empty() || ArrayRef(outDims) == ArrayRef({kRegister});
679688
}
680689

681690
bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy) {
691+
auto layout = minimalCvtLayout(srcTy, dstTy);
682692
MLIRContext *ctx = srcTy.getContext();
683-
std::optional<LinearLayout> srcLayout =
684-
toLinearLayout(srcTy.getShape(), srcTy.getEncoding());
685-
std::optional<LinearLayout> dstLayout =
686-
toLinearLayout(dstTy.getShape(), dstTy.getEncoding());
687-
if (srcLayout.has_value() && dstLayout.has_value()) {
688-
// comp describes the layout function for converting from src to dst.
689-
LinearLayout comp = srcLayout->invertAndCompose(*dstLayout);
690-
StringAttr kWarp = StringAttr::get(ctx, "warp");
691-
StringAttr kBlock = StringAttr::get(ctx, "block");
692-
if (comp.divideRight(LinearLayout::identity1D(comp.getInDimSize(kWarp),
693-
kWarp, kWarp) *
694-
LinearLayout::identity1D(comp.getInDimSize(kBlock),
695-
kBlock, kBlock))
696-
.has_value()) {
697-
return true;
698-
}
693+
if (!layout.has_value()) {
694+
return false;
699695
}
700-
return false;
696+
auto kRegister = StringAttr::get(ctx, "register");
697+
auto kLane = StringAttr::get(ctx, "lane");
698+
return llvm::to_vector(layout->getOutDimNames()) ==
699+
llvm::SmallVector<StringAttr, 2>{kRegister, kLane};
701700
}
702701

703702
bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) {

0 commit comments

Comments
 (0)