Skip to content

Commit c53a87f

Browse files
committed
Merge remote-tracking branch 'upstream/main' into rebase
2 parents 5feeb96 + 3889f3f commit c53a87f

File tree

61 files changed

+1373
-606
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+1373
-606
lines changed

.pre-commit-config.yaml

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
default_stages: [pre-commit, pre-push, manual]
12
repos:
23
- repo: https://github.com/pre-commit/pre-commit-hooks
3-
rev: v4.4.0
4+
rev: v5.0.0
45
hooks:
56
- id: check-symlinks
67
- id: destroyed-symlinks
@@ -17,12 +18,11 @@ repos:
1718
- id: debug-statements
1819

1920
- repo: https://github.com/astral-sh/ruff-pre-commit
20-
rev: v0.1.3
21+
rev: v0.7.1
2122
hooks:
2223
- id: ruff
2324
files: '^python/.*'
24-
args: ["--fix", "--line-length", "120"]
25-
stages: [pre-commit, pre-push, manual]
25+
args: ["--fix", "--exit-non-zero-on-fix"]
2626
exclude: |
2727
(?x)(
2828
^python/triton/runtime/.*|
@@ -31,18 +31,16 @@ repos:
3131
)
3232
3333
- repo: https://github.com/google/yapf
34-
rev: be72557
34+
rev: "7e21823"
3535
hooks:
3636
- id: yapf
3737
args: ["-p", "-i"]
38-
stages: [pre-commit, pre-push, manual]
3938
exclude: "python/test/unit/language/test_line_info.py"
4039

4140
- repo: https://github.com/pre-commit/mirrors-clang-format
42-
rev: v16.0.6
41+
rev: v19.1.2
4342
hooks:
4443
- id: clang-format
45-
stages: [pre-commit, pre-push, manual]
4644

4745
# Expand YAML anchors in files used by github workflows, because github can't
4846
# do this itself. This lets us use anchors, which avoids code duplication.

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
8787
mlir::registerTritonAMDGPUReorderInstructions();
8888
mlir::registerTritonAMDGPUStreamPipelineV2();
8989
mlir::registerTritonAMDGPUCanonicalizePointers();
90+
mlir::registerTritonAMDGPUConvertToBufferOps();
9091

9192
// TODO: register Triton & TritonGPU passes
9293
registry.insert<mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,

include/triton/Analysis/Utility.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "mlir/Support/LLVM.h"
77
#include "triton/Dialect/Triton/IR/Dialect.h"
88
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
9+
#include "triton/Tools/LinearLayout.h"
910

1011
namespace mlir {
1112

@@ -189,6 +190,14 @@ bool supportMMA(triton::DotOp op, int version);
189190

190191
bool supportMMA(Value value, int version);
191192

193+
// Conversion from `srcTy` to `dstTy` involving the minimum amount of data
194+
// transfer, provided that both types can be converted to LL (if either can't, it'll
195+
// return nullopt). The output will be such that layout.getInDimNames() ==
196+
// layout.getOutDimNames() and the conversion will not include kBlock (resp.
197+
// kWarp or kLane) if it can be avoided.
198+
std::optional<mlir::triton::LinearLayout>
199+
minimalCvtLayout(RankedTensorType srcTy, RankedTensorType dstTy);
200+
192201
// Conversion from `srcTy` to `dstTy` only involves reordering of registers.
193202
// There is no need for data exchange across threads, warps, or blocks.
194203
bool cvtReordersRegisters(RankedTensorType srcTy, RankedTensorType dstTy);
@@ -203,7 +212,7 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);
203212

204213
bool atomicNeedsSharedMemory(Value result);
205214

206-
bool isBlockedToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstT);
215+
bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy);
207216

208217
bool isMfmaToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy);
209218

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ class TargetInfoBase {
8282

8383
virtual int getSharedAddressSpace() const = 0;
8484

85+
virtual bool supportVectorizedAtomics() const = 0;
86+
8587
virtual ~TargetInfoBase() {}
8688
};
8789
} // namespace mlir::triton

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,6 @@ std::optional<LinearLayout>
4444
toLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
4545
std::optional<int32_t> elemBitWidth = std::nullopt);
4646

47-
// Given a linear layout with input dims and output dims containing a "block"
48-
// dimension, determines if the layout moves data across block boundaries.
49-
bool isCrossCTAConversion(const LinearLayout &layout);
50-
5147
// Given a linear layout where the input dimensions contain a "block" dimension,
5248
// this method sets the "block" dimension to 0 and removes the corresponding
5349
// output dimensions.

include/triton/Tools/LinearLayout.h

Lines changed: 25 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -575,29 +575,20 @@ class LinearLayout {
575575
return *this;
576576
}
577577

578-
// divideLeft and divideRight are the inverses of operator*.
579-
//
580-
// Consider `a = c.divideRight(b)`, where `a` is a linear layout with
581-
// `in-dims(a) == in-dims(b)` and `out-dims(a) == out-dims(c)`. We may remove
582-
// some empty dimensions from `a` to form `a'` and still have `a' * b == c`.
583-
// Therefore, there are multiple possible values that we could return for
584-
// `(a * b).divideRight(b)` which would satisfy
585-
// `((a * b).divideRight(b)) * b == a * b`.
586-
//
587-
// In the following example, we have `a * b == a' * b` when "in1" is an empty
588-
// dimension that maps everything to 0:
589-
//
590-
// a = L("in1", "in2") -> ("out1", "out2")
591-
// a' = L("in1") -> ("out1")
592-
// b = L("in2") -> ("out2")
593-
//
594-
// divideLeft and divideRight resolve this ambiguity by always returning the
595-
// "canonical" quotient, namely the one with the fewest possible size-zero
596-
// input and output dimensions.
597-
//
598-
// TODO(jlebar): Implement divideLeft.
599-
// std::optional<LinearLayout> divideLeft(const LinearLayout &divisor);
600-
std::optional<LinearLayout> divideRight(const LinearLayout &divisor) const;
578+
// Returns true if this layout acts trivially (as the identity) on the given
579+
// dimensions. This means that it's the identity on those dimensions, and it
580+
// does not map other dimensions onto these dimensions, nor these dimensions onto others.
581+
bool isTrivialOver(ArrayRef<StringAttr> dimNames) const;
582+
583+
// For an endomorphism on dimNames (linear map that maps dimNames to dimNames)
584+
// checks whether it is the identity map on these dimensions (i.e.,
585+
// LinearLayout::isTrivialOver) and, if so, returns the sublayout of the
586+
// remaining dimensions.
587+
// nb. The isTrivialOver condition is more restrictive than the usual
588+
// "leaves the subspace invariant" condition in maths.
589+
// We can always relax it if we know how to take advantage of a conversion
590+
// layout being block-diagonal in the future.
591+
std::optional<LinearLayout> quotient(ArrayRef<StringAttr> dimNames) const;
601592

602593
// Gets a layout with only these in/out dimensions.
603594
//
@@ -614,10 +605,10 @@ class LinearLayout {
614605
bool sublayoutIsZero(ArrayRef<StringAttr> inDimNames,
615606
ArrayRef<StringAttr> outDimNames) const;
616607

617-
// Is the sublayout restricted to inDimNames + outDimNames and then flattened
618-
// to 1D the identity layout (ignoring out-dim sizes)?
619-
bool sublayoutIsIdentity(ArrayRef<StringAttr> inDimNames,
620-
ArrayRef<StringAttr> outDimNames) const;
608+
// Is the sublayout defined from dimNames to dimNames the identity?
609+
// In particular, is the input and output size in these dimensions
610+
// the same, and are the bases the identity?
611+
bool squareSublayoutIsIdentity(ArrayRef<StringAttr> dimNames) const;
621612

622613
// Computes and returns L(x, y, z).
623614
//
@@ -688,6 +679,13 @@ class LinearLayout {
688679
// (i.e. every input bit affects the output).
689680
llvm::MapVector<StringAttr, int32_t> getFreeVariableMasks() const;
690681

682+
// Increase an input dimension without affecting the output dimension. The
683+
// added free variables are mapped to 0, ensuring that the new input
684+
// dimensions correspond directly to the existing output space. The function
685+
// errors out if `newInDimSize` is less than the current size or the new size
686+
// is not a power of 2.
687+
LinearLayout resize(StringAttr inDim, int32_t newInDimSize) const;
688+
691689
std::string toString() const;
692690

693691
friend bool operator==(LinearLayout lhs, LinearLayout rhs);

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ namespace mlir::triton {
1313
inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
1414
// clang-format off
1515
"AMDGCN_ENABLE_DUMP",
16+
"AMDGCN_USE_BUFFER_OPS",
1617
"DISABLE_FAST_REDUCTION",
1718
"DISABLE_LLVM_OPT",
1819
"DISABLE_MMA_V3",

lib/Analysis/Utility.cpp

Lines changed: 77 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ bool supportMMA(Value value, int version) {
543543
(elemTy.isInteger(8) && version >= 2);
544544
}
545545

546-
bool isBlockedToDotShortcut(RankedTensorType &srcTy, RankedTensorType &dstTy) {
546+
bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy) {
547547
auto blockedLayout = dyn_cast<BlockedEncodingAttr>(srcTy.getEncoding());
548548
auto dotOperandLayout = dyn_cast<DotOperandEncodingAttr>(dstTy.getEncoding());
549549
if (blockedLayout == nullptr || dotOperandLayout == nullptr)
@@ -647,57 +647,94 @@ bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
647647
return ans;
648648
}
649649

650-
bool cvtReordersRegisters(RankedTensorType srcTy, RankedTensorType dstTy) {
650+
// We get the smallest submap of srcTy^{-1} * dstTy that is not the identity
651+
// under kBlock, kWarp or kLane (in that order). The idea here is that if we
652+
// have a transformation that's the identity on kBlock, we don't need to use
653+
// distributed shared memory. If it's also the identity on kWarp, we can
654+
// transfer via warp-shuffles, and if it's the identity on kLane, we just have to
656+
// reorder the registers.
656+
std::optional<LinearLayout> minimalCvtLayout(RankedTensorType srcTy,
657+
RankedTensorType dstTy) {
651658
MLIRContext *ctx = srcTy.getContext();
652659
std::optional<LinearLayout> srcLayout =
653660
toLinearLayout(srcTy.getShape(), srcTy.getEncoding());
654661
std::optional<LinearLayout> dstLayout =
655662
toLinearLayout(dstTy.getShape(), dstTy.getEncoding());
656-
if (srcLayout.has_value() && dstLayout.has_value()) {
657-
// comp describes the layout function for converting from src to dst.
658-
LinearLayout comp = srcLayout->invertAndCompose(*dstLayout);
659-
StringAttr kLane = StringAttr::get(ctx, "lane");
660-
StringAttr kWarp = StringAttr::get(ctx, "warp");
661-
StringAttr kBlock = StringAttr::get(ctx, "block");
662-
// TODO(jlebar): These checks are overly-restrictive. For example, we can
663-
// transfer by shuffling registers (case 1) if and only if all of the bases
664-
// for `register` have 0s for lane, warp, and block. But the check below is
665-
// stronger than this, checking also that the choice of lane/warp/block does
666-
// not affect the permutation of registers. If we allow different
667-
// lane/warp/blocks to have different permutations, we can generalize this.
668-
if (comp.divideRight(LinearLayout::identity1D(comp.getInDimSize(kLane),
669-
kLane, kLane) *
670-
LinearLayout::identity1D(comp.getInDimSize(kWarp),
671-
kWarp, kWarp) *
672-
LinearLayout::identity1D(comp.getInDimSize(kBlock),
673-
kBlock, kBlock))
674-
.has_value()) {
675-
return true;
663+
if (!(srcLayout.has_value() && dstLayout.has_value()))
664+
return std::nullopt;
665+
StringAttr kRegister = StringAttr::get(ctx, "register");
666+
StringAttr kLane = StringAttr::get(ctx, "lane");
667+
StringAttr kWarp = StringAttr::get(ctx, "warp");
668+
StringAttr kBlock = StringAttr::get(ctx, "block");
669+
auto numSrcRegs = srcLayout->getInDimSize(kRegister);
670+
auto numDstRegs = dstLayout->getInDimSize(kRegister);
671+
// The `invertAndCompose` function will generate a layout that is injective
672+
// by assigning new output dimensions to free variables. For instance,
673+
// consider a scenario where `srcLayout` has a free variable in the lane
674+
// dimension, while `dstLayout` has two free variables in the lane
675+
// dimension and also a larger number of registers.
676+
// The injective form of `srcLayout` will add only a single additional row
677+
// to the transformation matrix, whereas the injective form of `dstLayout`
678+
// will add two additional rows. This discrepancy causes misleading results
679+
// because the matrices end up with a different number of rows.
680+
//
681+
// Take `dstLayout ⋅ srcLayout^-1` as an example:
682+
//
683+
// - `injective(dstLayout)`: [n, m] → [n + 2, m]
684+
// - `injective(srcLayout)`: [n, m] → [n + 1, m]
685+
// - `injective(srcLayout)^-1`: [n + 1, m] → [m, n + 1]
686+
// - `injective(dstLayout) ⋅ injective(srcLayout)^-1`: [n + 2, m] ⋅ [m, n +
687+
// 1] → [n + 2, n + 1]
688+
//
689+
// Here, the `(n + 1)`-th row added by `dstLayout` represents the free
690+
// variable in registers, and the `(n + 2)`-th row represents the free
691+
// variable in lanes. However, the `(n + 1)`-th row added by `srcLayout`
692+
// represents the free variable in lanes. As a result, the `(n + 1)`-th row
693+
// in two layouts do not correspond to the same free variable.
694+
//
695+
// To address this issue, we pad the free variables in `srcLayout` and
696+
// `dstLayout` to ensure they have the same number of registers. This
697+
// guarantees that the resulting matrices have the same number of rows,
698+
// ensuring consistency in the composition process.
699+
auto numRegs = std::max(numSrcRegs, numDstRegs);
700+
auto srcLayoutWithFreeRegs = srcLayout->resize(kRegister, numRegs);
701+
auto dstLayoutWithFreeRegs = dstLayout->resize(kRegister, numRegs);
702+
// comp describes the layout function to create dst from src.
703+
LinearLayout comp =
704+
dstLayoutWithFreeRegs.invertAndCompose(srcLayoutWithFreeRegs);
705+
// We try to quotient by the largest subspace first
706+
auto dims = SmallVector<StringRef>{"block", "warp", "lane", "register"};
707+
for (auto dim : dims) {
708+
auto quotient = comp.quotient(StringAttr::get(ctx, dim));
709+
if (!quotient.has_value()) {
710+
break;
676711
}
712+
comp = *quotient;
677713
}
678-
return false;
714+
return comp;
715+
}
716+
717+
bool cvtReordersRegisters(RankedTensorType srcTy, RankedTensorType dstTy) {
718+
auto layout = minimalCvtLayout(srcTy, dstTy);
719+
MLIRContext *ctx = srcTy.getContext();
720+
if (!layout.has_value()) {
721+
return false;
722+
}
723+
auto kRegister = StringAttr::get(ctx, "register");
724+
auto outDims = llvm::to_vector(layout->getOutDimNames());
725+
return outDims.empty() || ArrayRef(outDims) == ArrayRef({kRegister});
679726
}
680727

681728
bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy) {
729+
auto layout = minimalCvtLayout(srcTy, dstTy);
682730
MLIRContext *ctx = srcTy.getContext();
683-
std::optional<LinearLayout> srcLayout =
684-
toLinearLayout(srcTy.getShape(), srcTy.getEncoding());
685-
std::optional<LinearLayout> dstLayout =
686-
toLinearLayout(dstTy.getShape(), dstTy.getEncoding());
687-
if (srcLayout.has_value() && dstLayout.has_value()) {
688-
// comp describes the layout function for converting from src to dst.
689-
LinearLayout comp = srcLayout->invertAndCompose(*dstLayout);
690-
StringAttr kWarp = StringAttr::get(ctx, "warp");
691-
StringAttr kBlock = StringAttr::get(ctx, "block");
692-
if (comp.divideRight(LinearLayout::identity1D(comp.getInDimSize(kWarp),
693-
kWarp, kWarp) *
694-
LinearLayout::identity1D(comp.getInDimSize(kBlock),
695-
kBlock, kBlock))
696-
.has_value()) {
697-
return true;
698-
}
731+
if (!layout.has_value()) {
732+
return false;
699733
}
700-
return false;
734+
auto kRegister = StringAttr::get(ctx, "register");
735+
auto kLane = StringAttr::get(ctx, "lane");
736+
return llvm::to_vector(layout->getOutDimNames()) ==
737+
llvm::SmallVector<StringAttr, 2>{kRegister, kLane};
701738
}
702739

703740
bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) {

0 commit comments

Comments
 (0)