
Commit d702cad

Merge OpenAI Triton commit 72ec661 (#5052)
This PR changes the Triton base from f804bbc to 72ec661 (Sep 3). Pass rate: 98.73% -> 98.74%.
2 parents 548d1eb + eb617ae commit d702cad

File tree: 41 files changed, +1420 -517 lines


include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 1 addition & 0 deletions

@@ -569,6 +569,7 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
                 std::function<Value(Value)> calcPaddedOffset,
                 Value affineOffset, uint64_t maskSpanAffineOffset,
                 RewriterBase &rewriter, const TargetInfoBase &targetInfo,
+                std::optional<int> maybeMaxVecElems = {},
                 Operation *localLoadOp = nullptr);
 
 // Lower an ld/st-like operation given a layout and a callback that creates the

include/triton/Dialect/TritonGPU/IR/LayoutUtility.h

Lines changed: 0 additions & 4 deletions

@@ -5,8 +5,4 @@ namespace mlir::triton::gpu {
 
 CTALayoutAttr permuteCTALayout(MLIRContext *ctx, CTALayoutAttr layout,
                                ArrayRef<int> order);
-
-LinearLayout getPaddedRegToSharedLayout(const LinearLayout &regLayout,
-                                        PaddedSharedEncodingAttr paddedEnc);
-
 } // namespace mlir::triton::gpu

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 11 additions & 2 deletions

@@ -15,10 +15,9 @@ enum class ScaleDotElemType : uint32_t;
 namespace mlir::triton::gpu {
 class SwizzledSharedEncodingAttr;
 class NVMMASharedEncodingAttr;
-class AMDRotatingSharedEncodingAttr;
-class AMDMfmaEncodingAttr;
 class TensorOrMemDesc;
 class MemDescType;
+class CTALayoutAttr;
 
 // - BlockedEncodingAttrs have the following input dimensions.
 //
@@ -73,6 +72,16 @@ LinearLayout nvmmaSharedToLinearLayout(ArrayRef<int64_t> shape,
 // `inDimNames`. The latter does not modify the output sizes.
 LinearLayout getLayoutWithinBlock(const LinearLayout &layout);
 
+// Combines the layout of a CTA (input dims [register, lane, warp]) with the
+// layout of a CGA (i.e. a block), and ensures that the resulting layout has the
+// given shape.
+//
+// See the nomenclature note at the top of LinearLayoutConversions.cpp for why
+// the variable with type CTALayoutAttr is called cgaLayoutAttr.
+LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
+                                    CTALayoutAttr cgaLayoutAttr,
+                                    ArrayRef<int64_t> shape);
+
 // In this function, we construct a linear layout representing the
 // <shared memory offset, iteration, block> -> <tensor element index> mapping
 // for entire `src` and `dst` tensors. We determine the shape of the

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 69 additions & 23 deletions

@@ -6,7 +6,7 @@ include "triton/Dialect/Triton/IR/TritonInterfaces.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td"
 
 //===----------------------------------------------------------------------===//
-// Traits and Interfaces
+// Traits, Interfaces and shared Parameters
 //===----------------------------------------------------------------------===//
 
 def MemDescViewTrait : NativeOpTrait<"MemDescViewTrait">;
@@ -55,6 +55,11 @@ def SharedEncodingTrait : AttrInterface<"SharedEncodingTrait"> {
 def DeclareSharedEncodingMethods : DeclareAttrInterfaceMethods<
     SharedEncodingTrait, ["getAlignment"]>;
 
+def LinearLayoutParam : AttrOrTypeParameter<"LinearLayout",
+                                            "linear layout"> {
+  let cppAccessorType = "const LinearLayout &";
+}
+
 //===----------------------------------------------------------------------===//
 // Base Attribute
 //===----------------------------------------------------------------------===//
@@ -369,14 +374,15 @@ When vec=2, elements are swizzled in pairs of 2. In other words, the element at
 
 def PaddedSharedEncodingAttr
     : TritonGPU_Attr<"PaddedSharedEncoding", "padded_shared_encoding",
-                     [SharedEncodingTrait, LayoutEncodingTrait]> {
+                     [SharedEncodingTrait, DeclareLayoutEncodingMethods]> {
   let mnemonic = "padded_shared";
 
   let description = [{
     An encoding for tensors whose elements may be simultaneously accessed by
     different GPU threads in the programs, via shared memory. In other words,
     for all indices i \in Z^d, \mathcal{L}(i) = {0, 1, ..., 32*num_warps - 1}.
-    Compared to SwizzledSharedEncodingAttr, this encoding uses padding to avoid
+    Compared to SwizzledSharedEncodingAttr, this encoding combines padding with
+    element reordering via linear transformation (e.g. row permutation) to avoid
     shared memory bank conflicts.
 
     Formally, given a layout:
@@ -388,48 +394,93 @@ at index i, the corresponding shared memory location index is
     i + \sum_{k} (i / interval_k) * pad_k
     `<interval_i>` and `<pad_i>` all need to be power of two.
 
-    Some concrete examples, using `eM` to mean tensor elements and `pN` to mean
-    padding:
+    Some concrete examples ignoring the linear component, using `eM` to mean tensor
+    elements and `pN` to mean padding:
 
     1. Single interval-padding pair:
 
-       #ttg.padded_shared<[2:+2]>
+       #ttg.padded_shared<[2:+2], {...}>
       [e0, e1, p0, p1,
        e2, e3, p2, p3,
        ...]
 
     2. Double interval-padding pairs:
 
-       #ttg.padded_shared<[2:+1, 4:+2]>
+       #ttg.padded_shared<[2:+1, 4:+2], {...}>
       [e0, e1, p0,
        e2, e3, p1, p2, p3,
        e4, e5, p4,
        e6, e7, p5, p6, p7,
        ...]
 
-    In addition to interval-padding pairs, this encoding requires an `order` to
-    specify the logical tensor dimenions from the fastest-to slowest-varying.
-    It may optionally support CGA level organization like other encoding
-    attributes too, for example,
-      #ttg.padded_shared<[2:+1, 4:+2] {
-        order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1],
-        CTAOrder = [0, 1]}>
+    Furthermore this encoding allows for a linear remapping from the 1-D shared
+    memory offset to logical n-D tensor elements. The remapping is given in the form
+    of linear bases mapping from offset to [dim0, dim1...dimN-1].
+    See LinearLayout.h for more details how linear layouts are applied to remap
+    elements.
+    Some concrete examples using `xN` and `yN` to mean the logical n-D tensor elements
+    and `pN` to mean padding:
+
+    1. 1D Single interval-padding with strided elements
+
+       #ttg.padded_shared<[2:+2] {offset = [[2], [1]], block = []}>
+      [x0, x2, p0 p1,
+       x1, x3, p2, p3
+       ...]
+
+    2. 2D single interval-padding with rearanged rows.
+
+       #ttg.padded_shared<[16:+1] {offset = [[0, 1], [0, 2], /*gap, stride by 2 rows*/[2, 0], [4, 0], [1, 0]]], block = []}>
+      [
+       x0y0, x0y1, x0y2, x0y3,
+       x2y0, x2y1, x2y2, x2y3,
+       x4y0, x4y1, x4y2, x4y3,
+       x6y0, x6y1, x6y2, x6y3,
+       p0,
+       x1y0, x1y1, x1y2, x1y3,
+       x3y0, x3y1, x3y2, x3y3,
+       x5y0, x5y1, x5y2, x5y3,
+       x7y0, x7y1, x7y2, x7y3,
+       p1,
+      ]
+
+    For identity mappings a short form based on order and shape is used to increase readability. The following two encodings are the same:
+
+       #ttg.padded_shared<[2:+2] {order = [1, 0], shape = [16, 32]}>
+       #ttg.padded_shared<[2:+2] {offset = [[0, 1], [0, 2], [0, 4], [0, 8], [0, 16], [1, 0], [2, 0], [4, 0], [8, 0]], block = []}>
+
   }];
 
   let parameters = (ins
     ArrayRefParameter<"unsigned">:$intervals,
     ArrayRefParameter<"unsigned">:$paddings,
-    // Order of logical tensor dimensions; fastest-varying first.
-    ArrayRefParameter<"unsigned">:$order,
-    "CTALayoutAttr":$CTALayout
+    LinearLayoutParam:$linearComponent
   );
 
   let builders = [
     AttrBuilder<(ins "ArrayRef<std::pair<unsigned, unsigned>>":$intervalPads,
-                     "ArrayRef<unsigned>":$order, "CTALayoutAttr":$ctaLayout)>,
+                     "LinearLayout":$linearComponent)>,
+
+    // Builder to create an identity mapping as the linear component
+    AttrBuilder<(ins "ArrayRef<std::pair<unsigned, unsigned>>":$intervalPads,
+                     "ArrayRef<unsigned>":$order, "ArrayRef<int64_t>":$shape,
+                     "CTALayoutAttr":$ctaLayout)>,
   ];
 
   let extraClassDeclaration = extraBaseClassDeclaration # [{
+    // Returns the order of the dimensions `dimName` of the layout.
+    // If more than dimension is of size one, it uses defaultOrder to determine
+    // the order of the dimensions of size one.
+    SmallVector<unsigned> orderPerDim(StringAttr dimName,
                                       ArrayRef<unsigned> defaultOrder) const;
+    SmallVector<unsigned> getOrder() const;
+
+    // Returns the bases of the dimensions `dimName` of the linear_component.
+    // If skipBroadcast is false, we count a base zero
+    SmallVector<unsigned> basesPerDim(StringAttr dimName,
                                       bool skipBroadcast = true) const;
+
     unsigned getMinInterval() const {
       return *llvm::min_element(getIntervals());
     }
@@ -708,11 +759,6 @@ L(T) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
 // Linear Layout Encoding
 //===----------------------------------------------------------------------===//
 
-def LinearLayoutParam : AttrOrTypeParameter<"LinearLayout",
-                                            "linear layout"> {
-  let cppAccessorType = "const LinearLayout &";
-}
-
 def LinearEncodingAttr : DistributedEncoding<"LinearEncoding", "linear_encoding", [DeclareLayoutEncodingMethods]> {
   let mnemonic = "linear";
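Note: the padding formula and the offset bases in the description above can be checked numerically. The following standalone C++ sketch is illustrative only and not part of this commit; it assumes plain integer pairs for the interval-padding list and plain vectors for the linear bases.

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Padding: offset(u) = u + sum_k (u / interval_k) * pad_k (integer division).
    // For [2:+1, 4:+2] this yields 0, 1, 3, 4, 8, 9, 11, 12 for u = 0..7,
    // matching the second interval-padding example above.
    int64_t paddedOffset(int64_t u,
                         const std::vector<std::pair<int64_t, int64_t>> &intervalPads) {
      int64_t result = u;
      for (const auto &ip : intervalPads)
        result += (u / ip.first) * ip.second;
      return result;
    }

    // Linear component: bases[b] is the n-D coordinate contributed by bit b of the
    // unpadded offset; contributions of all set bits combine with XOR, as in
    // LinearLayout. For offset bases [[2], [1]] this maps offsets 0, 1, 2, 3 to
    // elements x0, x2, x1, x3, matching the 1-D strided example above.
    std::vector<int64_t>
    applyOffsetBases(int64_t u, const std::vector<std::vector<int64_t>> &bases) {
      std::vector<int64_t> coords(bases.empty() ? 0 : bases[0].size(), 0);
      for (size_t b = 0; b < bases.size(); ++b)
        if ((u >> b) & 1)
          for (size_t d = 0; d < coords.size(); ++d)
            coords[d] ^= bases[b][d];
      return coords;
    }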

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 3 additions & 2 deletions

@@ -213,8 +213,9 @@ std::optional<StringRef> getAMDArch(Operation *module);
 std::optional<mlir::triton::gpu::SwizzledSharedEncodingAttr>
 getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible);
 
-// Convert \param op operands and results to layout \param encoding.
-void convertOpEncoding(Attribute encoding, Operation *op);
+// Convert \param op to use \param encoding attribute.
+// Skips operands if they're in shared encoding.
+Operation *convertDistributedOpEncoding(Attribute encoding, Operation *op);
 
 // Returns the original memory allocation for a memdesc value
 triton::gpu::LocalAllocOp findShmemAlloc(Value operand);

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.td

Lines changed: 3 additions & 0 deletions

@@ -53,6 +53,9 @@ def MMAv5OpInterface : OpInterface<"MMAv5OpInterface"> {
                     "void",
                     "setIsAsync",
                     (ins "bool":$isAsync)>,
+    InterfaceMethod<"Return true if this MMA op executes asynchronously.",
+                    "bool",
+                    "isAsync">
   ];
 
   let verify = [{

lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 14 additions & 12 deletions

@@ -31,15 +31,16 @@ LogicalResult lowerLocalStore(Location loc, MLIRContext *ctx, Value regVal,
       dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(memDescTy.getEncoding());
   LinearLayout cvt = LinearLayout::empty();
   if (paddedEnc) {
-    cvt = getPaddedRegToSharedLayout(regLayout, paddedEnc);
+    const auto &sharedLL = paddedEnc.getLinearComponent();
+    cvt = regLayout.invertAndCompose(sharedLL);
   } else {
     auto sharedLayout = toLinearLayout(memDescTy);
     cvt = regLayout.invertAndCompose(sharedLayout);
-    auto kBlock = str_attr("block");
-    // NYI. We would need to emit a map.shared::cluster instruction.
-    if (!cvt.isTrivialOver({kBlock})) {
-      return failure();
-    }
+  }
+  auto kBlock = str_attr("block");
+  // NYI. We would need to emit a map.shared::cluster instruction.
+  if (!cvt.isTrivialOver({kBlock})) {
+    return failure();
   }
   cvt = cvt.sublayout({kReg, kLane, kWarp}, {kOffset});
   lowerLocalLdSt(loc, ctx, cvt, inVals, llvmElemTy, memDescTy, smemObj,
@@ -167,15 +168,16 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
     auto paddedEnc = dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedEnc);
     LinearLayout cvt = LinearLayout::empty();
     if (paddedEnc) {
-      cvt = getPaddedRegToSharedLayout(regLayout, paddedEnc);
+      const auto &sharedLL = paddedEnc.getLinearComponent();
+      cvt = regLayout.invertAndCompose(sharedLL);
     } else {
       auto sharedLayout = toLinearLayout(memDescTy);
       cvt = regLayout.invertAndCompose(sharedLayout);
-      auto kBlock = str_attr("block");
-      // NYI. We would need to emit a map.shared::cluster instruction.
-      if (!cvt.isTrivialOver({kBlock})) {
-        return failure();
-      }
+    }
+    auto kBlock = str_attr("block");
+    // NYI. We would need to emit a map.shared::cluster instruction.
+    if (!cvt.isTrivialOver({kBlock})) {
+      return failure();
     }
     cvt = cvt.sublayout({kReg, kLane, kWarp}, {kOffset});
 
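In both hunks the padded path now goes through the encoding's linear component instead of a dedicated helper. A minimal sketch of the direction of that composition, with an illustrative wrapper name (this wrapper is not part of the commit):

    // regLayout maps hardware indices [register, lane, warp, block] to logical
    // tensor coordinates, and the encoding's linear component maps shared memory
    // offsets to the same coordinates. A.invertAndCompose(B) yields a layout C
    // with B(C(x)) == A(x), so the result consumes hardware indices and produces
    // shared memory offsets, which is the conversion the store/load lowering needs.
    LinearLayout regToSharedCvt(const LinearLayout &regLayout,
                                triton::gpu::PaddedSharedEncodingAttr paddedEnc) {
      const LinearLayout &sharedLL = paddedEnc.getLinearComponent();
      return regLayout.invertAndCompose(sharedLL);
    }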

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 15 additions & 7 deletions

@@ -567,7 +567,7 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
                 std::function<Value(Value)> calcPaddedOffset,
                 Value affineOffset, uint64_t maskSpanAffineOffset,
                 RewriterBase &rewriter, const TargetInfoBase &targetInfo,
-                Operation *localLoadOp) {
+                std::optional<int> maybeMaxVecElems, Operation *localLoadOp) {
 
   bool isStore = !valsArray.empty();
   auto b = TritonLLVMOpBuilder(loc, rewriter);
@@ -593,7 +593,7 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
   auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
   return lowerLdSt(loc, ctx, cvt, valsArray, llvmElemTy, smemBase,
                    calcPaddedOffset, affineOffset, maskSpanAffineOffset, laneId,
-                   warpId, rewriter, targetInfo, {}, emitLdSt);
+                   warpId, rewriter, targetInfo, maybeMaxVecElems, emitLdSt);
 }
 
 SmallVector<Value> lowerLdSt(
@@ -728,9 +728,17 @@ lowerLocalLdSt(Location loc, MLIRContext *ctx,
   }
   auto affineOffset = smemObj.getShmemOffset(loc, rewriter, srcTy);
   auto maskSpanAffineOffset = smemObj.getMaskSpanOffsets(srcTy);
-  return lowerLdStShared(
-      loc, ctx, cvt, valsArray, llvmElemTy, smemObj.getBase(), calcPaddedOffset,
-      affineOffset, maskSpanAffineOffset, rewriter, targetInfo, localLoadOp);
+
+  std::optional<int> maybeMaxVecElems;
+  if (auto paddedEnc = dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(
+          srcTy.getEncoding())) {
+    maybeMaxVecElems = paddedEnc.getMinInterval();
+  }
+
+  return lowerLdStShared(loc, ctx, cvt, valsArray, llvmElemTy,
+                         smemObj.getBase(), calcPaddedOffset, affineOffset,
+                         maskSpanAffineOffset, rewriter, targetInfo,
+                         maybeMaxVecElems, localLoadOp);
 }
 
 bool emitTransferBetweenRegistersAndShared(
@@ -753,8 +761,8 @@ bool emitTransferBetweenRegistersAndShared(
       dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedTy.getEncoding());
   LinearLayout regToSharedLayout = LinearLayout::empty();
   if (paddedEnc) {
-    regToSharedLayout =
-        triton::gpu::getPaddedRegToSharedLayout(regLayout, paddedEnc);
+    const auto &sharedLL = paddedEnc.getLinearComponent();
+    regToSharedLayout = regLayout.invertAndCompose(sharedLL);
   } else {
     auto sharedLL = triton::gpu::toLinearLayout(sharedTy);
     regToSharedLayout = regLayout.invertAndCompose(sharedLL);
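The new maybeMaxVecElems plumbing caps ld/st vectorization for padded encodings at the smallest padding interval. A small standalone check, illustrative only and not part of this commit, shows why that bound is safe:

    #include <cstdint>
    #include <utility>
    #include <vector>

    // A run of `vec` consecutive unpadded offsets stays contiguous in shared
    // memory iff no (u / interval_k) term in the padding formula changes inside
    // the run. If `vec` divides every interval_k -- which holds whenever
    // vec <= min_k interval_k and all values are powers of two -- an aligned run
    // never crosses a padding insertion point.
    bool runStaysContiguous(int64_t start, int64_t vec,
                            const std::vector<std::pair<int64_t, int64_t>> &intervalPads) {
      for (const auto &ip : intervalPads)
        if (start / ip.first != (start + vec - 1) / ip.first)
          return false;
      return true;
    }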
