
Commit 03c746c

Merge OpenAI Triton commit e5e0081 (#5336)

This PR changes the Triton base from 6fce184 to e5e0081 (Oct 16). Pass rate: 94.11% -> 94.18%.
2 parents f049925 + fa15dc1 commit 03c746c

65 files changed: 3484 additions & 1605 deletions


CMakeLists.txt

Lines changed: 11 additions & 11 deletions
@@ -196,6 +196,17 @@ if(TRITON_BUILD_PYTHON_MODULE)
   find_package(Python3 REQUIRED COMPONENTS Development.Module Interpreter)
   find_package(pybind11 CONFIG REQUIRED HINTS "${Python3_SITELIB}")

+  foreach(CODEGEN_BACKEND ${TRITON_CODEGEN_BACKENDS})
+    add_subdirectory(third_party/${CODEGEN_BACKEND})
+  endforeach()
+
+  if (TRITON_BUILD_PROTON)
+    add_subdirectory(third_party/proton)
+  endif()
+  # We always build proton dialect
+  list(APPEND TRITON_PLUGIN_NAMES "proton")
+  add_subdirectory(third_party/proton/Dialect)
+
   if (DEFINED TRITON_PLUGIN_DIRS)
     foreach(PLUGIN_DIR ${TRITON_PLUGIN_DIRS})
       # Read the plugin name under dir/backend/name.conf
@@ -213,17 +224,6 @@ if(TRITON_BUILD_PYTHON_MODULE)
     endforeach()
   endif()

-  foreach(CODEGEN_BACKEND ${TRITON_CODEGEN_BACKENDS})
-    add_subdirectory(third_party/${CODEGEN_BACKEND})
-  endforeach()
-
-  if (TRITON_BUILD_PROTON)
-    add_subdirectory(third_party/proton)
-  endif()
-  # We always build proton dialect
-  list(APPEND TRITON_PLUGIN_NAMES "proton")
-  add_subdirectory(third_party/proton/Dialect)
-
   get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS)
   get_property(triton_plugins GLOBAL PROPERTY TRITON_PLUGINS)
   set(TRITON_LIBRARIES

Makefile

Lines changed: 16 additions & 16 deletions
@@ -31,50 +31,50 @@ test-cpp:

 .PHONY: test-unit
 test-unit: all
-	cd python/test/unit && $(PYTEST) -s -n $(NUM_PROCS) --ignore=language/test_line_info.py \
+	cd python/test/unit && $(PYTEST) --tb=short -s -n $(NUM_PROCS) --ignore=language/test_line_info.py \
 		--ignore=language/test_subprocess.py --ignore=test_debug.py
-	$(PYTEST) -s -n $(NUM_PROCS) python/test/unit/language/test_subprocess.py
-	$(PYTEST) -s -n $(NUM_PROCS) python/test/unit/test_debug.py --forked
-	$(PYTEST) -s -n 6 python/triton_kernels/tests/
-	TRITON_DISABLE_LINE_INFO=0 $(PYTEST) -s python/test/unit/language/test_line_info.py
+	$(PYTEST) --tb=short -s -n $(NUM_PROCS) python/test/unit/language/test_subprocess.py
+	$(PYTEST) --tb=short -s -n $(NUM_PROCS) python/test/unit/test_debug.py --forked
+	$(PYTEST) --tb=short -s -n 6 python/triton_kernels/tests/
+	TRITON_DISABLE_LINE_INFO=0 $(PYTEST) --tb=short -s python/test/unit/language/test_line_info.py
 	# Run attention separately to avoid out of gpu memory
-	$(PYTEST) -vs python/tutorials/06-fused-attention.py
-	$(PYTEST) -vs python/tutorials/gluon/01-intro.py python/tutorials/gluon/02-layouts.py python/tutorials/gluon/03-async-copy.py python/tutorials/gluon/04-tma.py python/tutorials/gluon/05-wgmma.py python/tutorials/gluon/06-tcgen05.py python/tutorials/gluon/07-persistence.py python/tutorials/gluon/08-warp-specialization.py
-	$(PYTEST) -vs python/examples/gluon/01-attention-forward.py
+	$(PYTEST) --tb=short -vs python/tutorials/06-fused-attention.py
+	$(PYTEST) --tb=short -vs python/tutorials/gluon/01-intro.py python/tutorials/gluon/02-layouts.py python/tutorials/gluon/03-async-copy.py python/tutorials/gluon/04-tma.py python/tutorials/gluon/05-wgmma.py python/tutorials/gluon/06-tcgen05.py python/tutorials/gluon/07-persistence.py python/tutorials/gluon/08-warp-specialization.py
+	$(PYTEST) --tb=short -vs python/examples/gluon/01-attention-forward.py
 	TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=python/triton/instrumentation/libGPUInstrumentationTestLib.so \
 		$(PYTEST) --capture=tee-sys -rfs -vvv python/test/unit/instrumentation/test_gpuhello.py
-	$(PYTEST) -s -n $(NUM_PROCS) python/test/gluon
+	$(PYTEST) --tb=short -s -n $(NUM_PROCS) python/test/gluon

 .PHONY: test-distributed
 test-distributed: all
 	$(PYTHON) -m pip install --upgrade pip
 	$(PYTHON) -m pip install python/triton_kernels -v
-	$(PYTEST) -s python/triton_kernels/bench/distributed.py
+	$(PYTEST) --tb=short -s python/triton_kernels/bench/distributed.py

 .PHONY: test-gluon
 test-gluon: all
-	$(PYTEST) -s -n $(NUM_PROCS) python/test/gluon
-	$(PYTEST) -vs python/examples/gluon/01-attention-forward.py
+	$(PYTEST) --tb=short -s -n $(NUM_PROCS) python/test/gluon
+	$(PYTEST) --tb=short -vs python/examples/gluon/01-attention-forward.py

 .PHONY: test-regression
 test-regression: all
-	$(PYTEST) -s -n $(NUM_PROCS) python/test/regression
+	$(PYTEST) --tb=short -s -n $(NUM_PROCS) python/test/regression

 .PHONY: test-microbenchmark
 test-microbenchmark: all
 	$(PYTHON) python/test/microbenchmark/launch_overhead.py

 .PHONY: test-interpret
 test-interpret: all
-	cd python/test/unit && TRITON_INTERPRET=1 $(PYTEST) -s -n 16 -m interpreter cuda language/test_core.py language/test_standard.py \
+	cd python/test/unit && TRITON_INTERPRET=1 $(PYTEST) --tb=short -s -n 16 -m interpreter cuda language/test_core.py language/test_standard.py \
 		language/test_random.py language/test_block_pointer.py language/test_subprocess.py language/test_line_info.py \
 		language/test_tuple.py runtime/test_autotuner.py::test_kwargs[False] \
 		../../tutorials/06-fused-attention.py::test_op --device=cpu

 .PHONY: test-proton
 test-proton: all
-	$(PYTEST) -s -n 8 third_party/proton/test --ignore=third_party/proton/test/test_override.py
-	$(PYTEST) -s third_party/proton/test/test_override.py
+	$(PYTEST) --tb=short -s -n 8 third_party/proton/test --ignore=third_party/proton/test/test_override.py
+	$(PYTEST) --tb=short -s third_party/proton/test/test_override.py

 .PHONY: test-python
 test-python: test-unit test-regression test-interpret test-proton

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 13 additions & 14 deletions
@@ -112,21 +112,10 @@ LinearLayout chooseShemLayoutForRegToRegConversion(

 // The primary goal of this function is to efficiently load 2D tiles of a
 // tensor from shared memory using the `ds_read_tr` instruction for AMD GPUs.
-LinearLayout chooseDsReadB64TrLayout(Attribute enc, ArrayRef<int64_t> shape,
-                                     int32_t elemBitWidth);
-
-LinearLayout getScaleTMEMStoreLinearLayout(RankedTensorType scaleType,
-                                           int numWarps);
-
 std::optional<LinearLayout>
-getTmemLoadStoreLayout16x256(int M, int N, RankedTensorType oldType,
-                             int numWarps);
-
-// Return a layout valid for TMemLoad op for a tmem layout of block MxN that
-// distribute the data long M for the warp groups. This doesn't affect the TMem
-// layout it just returns a distributed layout compatible for tmem_load.
-LinearLayout getTmemLoadLayoutSplitLongM(int M, int N, RankedTensorType oldType,
-                                         int numWarps);
+chooseDsReadTrLayout(Attribute enc, ArrayRef<int64_t> shape,
+                     int32_t elemBitWidth, unsigned instBitWidth,
+                     unsigned numLanesInShuffleGroup);

 // Create LinearLayout for scale in scaled mfma.
 LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
@@ -162,5 +151,15 @@ std::optional<LinearLayout> chooseMfmaLikeStoreLayout(RankedTensorType valType);
 LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared,
                                        bool disableSwizzle);

+// Make a LinearLayout that maps a block-id to an N-dimensional index.
+//
+// The tensor is split up into CTAsPerCGA pieces, which are distributed among
+// the CTAsPerCGA CTAs (i.e. blocks) in the CGA (i.e. groups).
+//
+// See the nomenclature note at the top of the LinearLayoutConversions.cpp file
+// for an explanation of why this is called makeCgaLayout when it accepts a
+// CTALayoutAttr.
+LinearLayout makeCgaLayout(CTALayoutAttr layout);
+
 } // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
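The new `makeCgaLayout` declaration is documented above as the mapping from a block-id to an N-dimensional index. A minimal usage sketch, not part of this commit: `tensorTy` is an assumed in-scope RankedTensorType, and the `getCTALayout` getter mirrors the call that appears in the RelayoutTritonGPU.cpp hunk later in this diff.

// Sketch only: derive the CGA (block-id -> N-D index) layout for a tensor
// whose encoding carries a CTALayoutAttr.
auto ctaLayout = triton::gpu::getCTALayout(tensorTy.getEncoding());
LinearLayout cgaLayout = triton::gpu::makeCgaLayout(ctaLayout);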

include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h

Lines changed: 52 additions & 7 deletions
@@ -29,6 +29,7 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
+#include "llvm/Support/ErrorHandling.h"

 // TritonNvidiaGPU depends on Triton
 #include "triton/Dialect/Triton/IR/Dialect.h"
@@ -61,24 +62,68 @@ struct TMemAllocation {
   int numCols;
 };

+// Used to describe the layout of the TMEM load/store instructions
+enum class TMemAccessAtom { I32x32b, I16x64b, I16x128b, I16x256b, I16x32bx2 };
+
+inline int getElementsPerThread(TMemAccessAtom atom) {
+  switch (atom) {
+  case TMemAccessAtom::I32x32b:
+  case TMemAccessAtom::I16x64b:
+  case TMemAccessAtom::I16x32bx2:
+    return 1;
+  case TMemAccessAtom::I16x128b:
+    return 2;
+  case TMemAccessAtom::I16x256b:
+    return 4;
+  }
+  llvm_unreachable("Unknown TMemAccessAtom");
+}
+
+inline const char *getOpShape(TMemAccessAtom atom) {
+  switch (atom) {
+  case TMemAccessAtom::I32x32b:
+    return "32x32b";
+  case TMemAccessAtom::I16x64b:
+    return "16x64b";
+  case TMemAccessAtom::I16x128b:
+    return "16x128b";
+  case TMemAccessAtom::I16x256b:
+    return "16x256b";
+  case TMemAccessAtom::I16x32bx2:
+    return "16x32bx2";
+  }
+  llvm_unreachable("Unknown TMemAccessAtom");
+}
+
+LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom,
+                           bool unpacked);
+
 TMemAllocation getTmemAllocSizes(gpu::MemDescType memDescType);

-gpu::DistributedEncodingTrait getTmemCompatibleLayout(unsigned M, unsigned N,
-                                                      RankedTensorType oltType,
-                                                      unsigned numWarps);
-gpu::DistributedEncodingTrait
+SmallVector<gpu::DistributedEncodingTrait>
+getTmemCompatibleLayouts(gpu::MemDescType memType, unsigned numWarps,
+                         ArrayRef<int64_t> ctaSplit = {1, 1});
+
+std::optional<gpu::DistributedEncodingTrait>
 getTmemLoadLayoutSplitLongM(RankedTensorType tensorType,
                             gpu::MemDescType memType, int numWarps);
+
 SmallVector<gpu::DistributedEncodingTrait>
 getTmemCompatibleLayouts(Operation *op, RankedTensorType tensorType,
                          gpu::MemDescType memType);

 bool isDistributedLayoutTMemCompatible(Operation *op,
                                        RankedTensorType tensorType,
                                        gpu::MemDescType memType);
-bool isDistributedLayoutSplitMTmemLoadStore(RankedTensorType tensorType,
-                                            gpu::MemDescType memType,
-                                            int numWarps);
+
+gpu::DistributedEncodingTrait
+getDefaultLayoutForTmemLdSt(gpu::MemDescType memType, unsigned numWarps,
+                            gpu::CTALayoutAttr ctaLayout);
+
+std::optional<LinearLayout>
+getDistributedLayoutForTmemLdSt(gpu::MemDescType memType, TMemAccessAtom atom,
+                                unsigned numWarps,
+                                gpu::CTALayoutAttr ctaLayout);

 } // namespace mlir::triton::nvidia_gpu
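The `TMemAccessAtom` helpers added above are self-contained inline functions, so their behavior can be shown directly. A minimal sketch, not part of the commit:

// Sketch only: a 16x256b access atom carries 4 elements per thread and maps
// to the "16x256b" shape string for that TMEM access.
TMemAccessAtom atom = TMemAccessAtom::I16x256b;
int elemsPerThread = getElementsPerThread(atom); // == 4
const char *shape = getOpShape(atom);            // == "16x256b"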

include/triton/Dialect/TritonNvidiaGPU/IR/TensorMemoryUtils.h

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
+#ifndef TRITON_DIALECT_TRITONNVIDIAGPU_IR_TENSORMEMORYUTILS_H_
+#define TRITON_DIALECT_TRITONNVIDIAGPU_IR_TENSORMEMORYUTILS_H_
+
+#include "mlir/IR/BuiltinTypes.h"
+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+#include "triton/Tools/LinearLayout.h"
+
+#include <cstdint>
+#include <functional>
+#include <optional>
+
+namespace mlir::triton::nvidia_gpu {
+
+// Get the maximum number of registers per thread based on the context. This is
+// by default 256, but it can be overridden by `ttg.maxnreg` set on the module
+// or a contextual register limit set by the compiler on partitions.
+int getContextualMaxNReg(Operation *op);
+struct TMemLdStEncodingInfo {
+  TMemAccessAtom atom;
+  LinearLayout reps;
+  ColumnAction perm;
+  int numRegsPerMessage;
+  std::optional<uint32_t> secondHalfOffset;
+  std::optional<ColumnAction> broadcast = std::nullopt;
+  bool unpacked = false;
+  unsigned vec = 1;
+  bool padding = false;
+};
+
+FailureOr<TMemLdStEncodingInfo>
+computeTMemLdStEncodingInfo(RankedTensorType regTy, gpu::MemDescType memTy,
+                            int maxnreg,
+                            std::function<InFlightDiagnostic()> emitError = {});
+
+} // namespace mlir::triton::nvidia_gpu
+
+#endif // TRITON_DIALECT_TRITONNVIDIAGPU_IR_TENSORMEMORYUTILS_H_
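A hedged sketch, not from the commit, of how a lowering might consume `computeTMemLdStEncodingInfo`: `regTy`, `memTy`, and `op` are assumed to be in scope, and the register cap comes from the `getContextualMaxNReg` helper declared above.

// Sketch only: regTy, memTy, and op are assumed from the surrounding pass.
FailureOr<TMemLdStEncodingInfo> info =
    computeTMemLdStEncodingInfo(regTy, memTy, getContextualMaxNReg(op));
if (failed(info))
  return failure();
int regsPerMessage = info->numRegsPerMessage; // registers moved per TMEM message
TMemAccessAtom atom = info->atom;             // access atom chosen for the transfer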

include/triton/Tools/LinearLayout.h

Lines changed: 12 additions & 0 deletions
@@ -558,6 +558,18 @@ class LinearLayout {
     return reshapeOuts({{*getOutDimNames().begin(), getTotalOutDimSize()}});
   }

+  [[nodiscard]] LinearLayout renameInDim(StringAttr oldDim,
+                                         StringAttr newDim) const {
+    auto bases = getBases();
+    auto it = bases.find(oldDim);
+    assert(it != bases.end());
+    auto value = std::move(it->second);
+    bases.erase(it);
+    bases.insert({newDim, std::move(value)});
+    return LinearLayout(bases, getOutDims(),
+                        /*requireSurjective=*/isSurjective());
+  }
+
   // Concatenates two layouts by their in (resp. out) dimensions. The layouts
   // must have the same output (resp. input) dimensions and sizes and different
   // input (resp. output) dimensions. The input dimensions of this layout are
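Since `renameInDim` is defined inline above, a short usage sketch suffices; it is not part of the commit, and `layout` and `ctx` are assumed to exist in the caller.

// Sketch only: rename the "lane" input dimension to "thread"; the basis
// vectors and output dimensions are left untouched.
StringAttr kLane = StringAttr::get(ctx, "lane");
StringAttr kThread = StringAttr::get(ctx, "thread");
LinearLayout renamed = layout.renameInDim(kLane, kThread);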

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 0 additions & 40 deletions
@@ -35,46 +35,6 @@ static int __builtin_ctz(unsigned x) {

 #endif

-// This reverts #5645, because it introduced increased register pressure in AMD
-// backend.
-// TODO: remove when new implementation performance reaches target level
-namespace {
-
-LinearLayout getRegToSharedLayout(MLIRContext *ctx, ArrayRef<int64_t> shape,
-                                  LinearLayout regLayout,
-                                  triton::gpu::SharedEncodingTrait dstEnc,
-                                  int elemBitWidth,
-                                  ArrayRef<int64_t> allocShape) {
-  StringAttr kBlock = StringAttr::get(ctx, ("block"));
-  int rank = shape.size();
-
-  LinearLayout sharedLayout =
-      triton::gpu::toLinearLayout(allocShape.take_back(rank), dstEnc);
-  auto sharedOrder = triton::gpu::getOrder(dstEnc, shape);
-
-  // sharedLayout's in-dims are currently (offset, block). Reshape to
-  // (offsetX1, offsetX2, ..., block) so that we can apply the N-dimensional
-  // shmem strides. (The offsetX's appear in minor-to-major order.)
-  auto sharedLegacy = cast<triton::gpu::SwizzledSharedEncodingAttr>(dstEnc);
-  SmallVector<std::pair<StringAttr, int32_t>> multiDimSharedSize;
-  for (int i = 0; i < rank; i++) {
-    int dim = sharedOrder[i];
-    int64_t size = std::max(
-        int64_t{1},
-        shape[dim] / sharedLegacy.getCTALayout().getCTASplitNum()[dim]);
-    multiDimSharedSize.push_back(
-        {StringAttr::get(ctx, ("offset" + std::to_string(dim))), size});
-  }
-  multiDimSharedSize.push_back({kBlock, sharedLayout.getInDimSize(kBlock)});
-  sharedLayout = sharedLayout.reshapeIns(multiDimSharedSize);
-
-  // regToSharedLayout maps from (register, lane, warp, block) to (offsetX1,
-  // ..., offsetXN, block), where the offsetX's are in minor-to-major order.
-  return regLayout.invertAndCompose(sharedLayout);
-}
-
-} // namespace
-
 namespace mlir {

 namespace triton::gpu {

lib/Conversion/TritonToTritonGPU/RelayoutTritonGPU.cpp

Lines changed: 3 additions & 9 deletions
@@ -21,16 +21,10 @@ namespace ttng = triton::nvidia_gpu;
 RankedTensorType getTMEMTensorLayout(const TypeConverter *tc,
                                      RankedTensorType type, MemDescType memdesc,
                                      unsigned numWarps) {
-  Attribute encoding;
   type = cast<RankedTensorType>(tc->convertType(type));
-  if (isa<ttng::TensorMemoryScalesEncodingAttr>(memdesc.getEncoding())) {
-    encoding = LinearEncodingAttr::get(
-        type.getContext(), getScaleTMEMStoreLinearLayout(type, numWarps));
-  } else {
-    auto tmemEnc = cast<ttng::TensorMemoryEncodingAttr>(memdesc.getEncoding());
-    encoding = ttng::getTmemCompatibleLayout(
-        tmemEnc.getBlockM(), tmemEnc.getBlockN(), type, numWarps);
-  }
+  auto ctaLayout = getCTALayout(type.getEncoding());
+  auto encoding =
+      ttng::getDefaultLayoutForTmemLdSt(memdesc, numWarps, ctaLayout);
   return type.cloneWithEncoding(encoding);
 }

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -577,17 +577,6 @@ static LogicalResult parseBool(AsmParser &parser, const NamedAttribute &attr,
577577
return parseBoolAttrValue(parser, attr.getValue(), value, desc);
578578
};
579579

580-
static LogicalResult parseType(AsmParser &parser, const NamedAttribute &attr,
581-
Type &value, StringRef desc) {
582-
auto typeAttr = mlir::dyn_cast<TypeAttr>(attr.getValue());
583-
if (!typeAttr) {
584-
parser.emitError(parser.getNameLoc(), "expected a Type in ") << desc;
585-
return failure();
586-
}
587-
value = typeAttr.getValue();
588-
return success();
589-
}
590-
591580
std::optional<LinearLayout>
592581
parseLinearLayout(const DictionaryAttr &dict, AsmParser &parser,
593582
ArrayRef<std::string> inDimNames) {
