Commit 1c11bc2

Merge OpenAI Triton commit 6b70e71 (#4836)
This PR changes the Triton base from 8dd7ccb to 6b70e71 (Jul 25). Pass rate: 98.62%.
2 parents: f8ce49d + d9edb64

151 files changed: +3999, -2027 lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -116,6 +116,10 @@ if(NOT MLIR_DIR)
   set(MLIR_DIR ${LLVM_LIBRARY_DIR}/cmake/mlir)
 endif()
 
+if(NOT LLD_DIR)
+  set(LLD_DIR ${LLVM_LIBRARY_DIR}/cmake/lld)
+endif()
+
 # MLIR
 find_package(MLIR REQUIRED CONFIG PATHS ${MLIR_DIR})
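
Presumably this mirrors the MLIR_DIR default directly above it so that a later find_package call can locate LLD's CMake package; the find_package(LLD ...) call itself is not shown in this hunk. A minimal sketch of overriding the default at configure time, with an illustrative install prefix:

cmake -DLLD_DIR=$HOME/llvm-install/lib/cmake/lld ..
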
Makefile

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ PYTHON ?= python
 BUILD_DIR := $(shell cd python; $(PYTHON) -c 'from build_helpers import get_cmake_dir; print(get_cmake_dir())')
 TRITON_OPT := $(BUILD_DIR)/bin/triton-opt
 PYTEST := $(PYTHON) -m pytest
-LLVM_BUILD_PATH ?= ".llvm-project/build"
+LLVM_BUILD_PATH ?= $(realpath .llvm-project/build)
 NUM_PROCS ?= 8
 
 # Incremental builds
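
For context, GNU Make's $(realpath ...) expands to an absolute, symlink-resolved path (and to the empty string if the path does not exist), whereas the old quoted form passed the literal string through, quotes included. A minimal sketch of inspecting the result; the target name is hypothetical, and the recipe line must start with a tab:

# Hypothetical debug target: prints an absolute path such as
# /home/user/triton/.llvm-project/build rather than ".llvm-project/build".
print-llvm-path:
	@echo $(LLVM_BUILD_PATH)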

docs/programming-guide/chapter-3/debugging.rst

Lines changed: 1 addition & 1 deletion

@@ -77,6 +77,6 @@ Using Third-party Tools
 For debugging on NVIDIA GPUs, `compute-sanitizer <https://docs.nvidia.com/cuda/compute-sanitizer/index.html>`_ is an effective tool for checking data races and memory access issues.
 To use it, prepend :code:`compute-sanitizer` to your command to run the Triton program.
 
-For debugging on AMD GPUs, you may want to try the LLVM `AddressSanitizer <https://rocm.docs.amd.com/en/latest/conceptual/using-gpu-sanitizer.html>`_ for ROCm.
+For debugging on AMD GPUs, you may want to try the LLVM `AddressSanitizer <https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/using-gpu-sanitizer.html>`_ for ROCm.
 
 For detailed visualization of memory access in Triton programs, consider using the `triton-viz <https://github.com/Deep-Learning-Profiling-Tools/triton-viz>`_ tool, which is agnostic to the underlying GPUs.
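
As the context lines above describe, compute-sanitizer is simply prepended to the usual launch command; a minimal sketch, where the script name is illustrative and memcheck is compute-sanitizer's default tool:

compute-sanitizer --tool memcheck python my_triton_kernel.py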

include/triton/Analysis/Allocation.h

Lines changed: 5 additions & 0 deletions

@@ -63,6 +63,11 @@ getScratchCvtInOutVecLengths(RankedTensorType srcTy, RankedTensorType dstTy);
 ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
                                      RankedTensorType dstTy);
 
+unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
+                                       RankedTensorType dstTy);
+
+unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
+                                     RankedTensorType dstTy);
 } // namespace triton
 
 /// Modified from llvm-15.0: llvm/ADT/AddressRanges.h
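
A hedged sketch of how a caller might use the two new helpers to size conversion scratch memory; the choice between the swizzled and padded strategies (useSwizzled) is made elsewhere and is an assumption here:

// Hypothetical sizing snippet: pick the element count for the chosen
// strategy, then convert to bytes using the source element type.
unsigned elems = useSwizzled
                     ? triton::getNumScratchElemsSwizzledCvt(srcTy, dstTy)
                     : triton::getNumScratchElemsPaddedCvt(srcTy, dstTy);
unsigned scratchBytes = elems * srcTy.getElementTypeBitWidth() / 8;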

include/triton/Analysis/Utility.h

Lines changed: 0 additions & 4 deletions

@@ -254,10 +254,6 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);
 
 bool atomicNeedsSharedMemory(Value result);
 
-// Return true if the src and dst layout match.
-bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
-                                   RankedTensorType dstTy);
-
 // Check if MFMA layout can be converted to the dot operand
 // layout using warp shuffle.
 bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 15 additions & 2 deletions

@@ -356,12 +356,12 @@ class SharedMemoryObject {
                          RewriterBase &rewriter) const;
 
   // Returns a mask representing all the bits of the memdesc offsets that
-  // may be modified by an affine offset coming from a memdesc_subview.
+  // may be modified by an affine offset coming from a memdesc_subslice.
   // The offsets are considered to be in the type of the memdesc.
   // For padded layouts, we return the offsets without padding.
   static uint64_t getMaskSpanOffsets(triton::gpu::MemDescType srcTy);
 
-  // Returns whether the shared memory access had a memdesc_subview
+  // Returns whether the shared memory access had a memdesc_subslice
   // that is rank-preserving (soon to be called memdesc_slice)
   static bool isAffineSharedMemoryAccess(triton::gpu::MemDescType srcTy) {
     return getMaskSpanOffsets(srcTy) != 0;

@@ -644,6 +644,19 @@ Value transferWithinBlockPadding(triton::gpu::ConvertLayoutOp op, Value src,
                                  const TargetInfoBase &targetInfo,
                                  const LLVMTypeConverter *typeConverter,
                                  RewriterBase &rewriter);
+
+SmallVector<Value> inlineRegionImpl(RewriterBase &rewriter, Region &region,
+                                    ArrayRef<Value> args,
+                                    mlir::TypeID terminatorTypeId,
+                                    Location loc);
+
+template <typename TerminatorOp>
+SmallVector<Value> inlineRegion(RewriterBase &rewriter, Region &region,
+                                ArrayRef<Value> args, Location loc) {
+  return inlineRegionImpl(rewriter, region, args,
+                          mlir::TypeID::get<TerminatorOp>(), loc);
+}
+
 } // namespace mlir
 
 #endif
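
A hedged sketch of calling the new inlineRegion helper from a lowering pattern; the op, its getScalarOp() region accessor, and the scalarArgs value list are illustrative assumptions, not call sites shown in this commit:

// Inline the op's scalar region, substituting scalarArgs for its block
// arguments and collecting the operands of its
// tt.map_elementwise.return terminator.
SmallVector<Value> results =
    mlir::inlineRegion<triton::MapElementwiseReturnOp>(
        rewriter, op.getScalarOp(), scalarArgs, op.getLoc());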

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 20 additions & 0 deletions

@@ -797,6 +797,26 @@ def TT_ScanReturnOp: TT_Op<"scan.return",
   let assemblyFormat = "$result attr-dict `:` type($result)";
 }
 
+//
+// Map Elementwise op
+//
+def TT_MapElementwiseOp: TT_Op<"map_elementwise", [SameOperandsAndResultEncoding,
+                                                   SameOperandsAndResultShape,
+                                                   RecursiveMemoryEffects]> {
+  let summary = "Map a scalar subregion over a tensor";
+  let arguments = (ins Variadic<TT_Tensor>:$srcs, I32Attr:$pack);
+  let results = (outs Variadic<TT_Tensor>:$result);
+  let regions = (region AnyRegion:$scalarOp);
+  let hasVerifier = 1;
+  let hasRegionVerifier = 1;
+}
+
+def TT_MapElementwiseReturnOp: TT_Op<"map_elementwise.return",
+                                     [HasParent<"MapElementwiseOp">, Pure, Terminator, ReturnLike]> {
+  let summary = "terminator for map elementwise operator";
+  let arguments = (ins Variadic<AnyType>:$result);
+  let assemblyFormat = "attr-dict ($result^ `:` type($result))?";
+}
 
 //
 // External Elementwise op
include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 38 additions & 19 deletions

@@ -200,38 +200,57 @@ def TTG_LocalDeallocOp : TTG_Op<"local_dealloc"> {
   // Use qualified() otherwise "!ttg.memdesc<X>" is printed as "<X>".
   let assemblyFormat = [{$src attr-dict `:` qualified(type($src))}];
 }
-
-def TTG_MemDescSubviewOp : TTG_Op<"memdesc_subview", [Pure, MemDescViewTrait]> {
+def TTG_MemDescIndexOp : TTG_Op<"memdesc_index", [Pure, MemDescViewTrait]> {
   let summary = "take a subview of the descriptor.";
 
   let description = [{
-    This operation returns a new descriptor representing a subview of the buffer.
+    This operation returns a new descriptor pointing to the `i`-th element of the
+    input descriptor along the 0-th dimension.
+
     It doesn't affect the underlying memory.
 
     For example, suppose that
    - the input shape is 2x4x16xf16,
    - the output shape is 4x16xf16, and
-    - offsets = [1, 0, 0].
+    - index = 1.
+    Then the output descriptor is equivalent to input[1], where input is the logical tensor.
 
-    Then in Python syntax, the subview covers input[1].
+    When the input is of rank 1 (i.e, shape=[k]), the output will have shape=[1].
+  }];
 
-    Just one dimension may be split (at most one non-zero offset).
+  let arguments = (ins TTG_MemDescType:$src, I32:$index);
 
-    When the input shape and the output shape have different rank:
-    Or the output shape is a tensor of 1D tensor of 1 element:
-    - The rank of the output must be 1D smaller than the input.
-    - We assume the input is split along the 0th dimension.
-    - The offset along the 0th dimension may be a runtime value.
-    When the input and the output have the same rank:
-    - The offset must be a compile-time constant
-    - Larger or equal to the tile of the tensor (or zero)
-    - That does not split the input along the swizzling pattern (if any)
-  }];
-  let arguments = (
-    ins TTG_MemDescType:$src, Variadic<I32>:$offsets);
+  let results = (outs TTG_MemDescType:$result);
+
+  let assemblyFormat = [{$src `,` $index attr-dict `:` qualified(type($src)) `->` qualified(type($result))}];
+
+  let hasVerifier = 1;
+}
 
+def TTG_MemDescSubsliceOp : TTG_Op<"memdesc_subslice", [Pure, MemDescViewTrait]> {
+  let summary = "take a subview of the descriptor.";
+
+  let description = [{
+    This operation returns a new descriptor representing a subview of the logical tensor.
+    It doesn't affect the underlying memory.
+
+    For example, suppose that
+    - the input shape is 32x16xf16,
+    - the output shape is 8x16xf16, and
+    - offsets = [2, 1].
+    Then in Python syntax, the subview covers input[2:8+2, 1:16+1] where input is
+    the logical tensor.
+
+    The offsets must be larger or equal to the tile of the tensor (or zero).
+  }];
+  let arguments = (ins TTG_MemDescType:$src, DenseI32ArrayAttr:$offsets);
   // Use qualified() otherwise "!ttg.memdesc<X>" is printed as "<X>".
-  let assemblyFormat = [{$src `[` $offsets `]` attr-dict `:` qualified(type($src)) `->` qualified(type($result))}];
+  // Render offsets inline as %src[0, 0] via a custom directive, but keep
+  // the overall parse/print generated from this assemblyFormat.
+  let assemblyFormat = [{
+    $src `[` custom<Offsets>($offsets) `]` attr-dict `:` qualified(type($src))
+    `->` qualified(type($result))
+  }];
 
   let results = (outs TTG_MemDescType:$result);
include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 5 additions & 0 deletions

@@ -60,6 +60,11 @@ def TritonGPUHoistTMEMAlloc : Pass<"tritongpu-hoist-tmem-alloc", "mlir::ModuleOp
                "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
                "mlir::scf::SCFDialect",
                "mlir::arith::ArithDialect"];
+  let options = [
+    Option<"hoistOutOfIf", "hoist-out-of-if",
+           "bool", /*default*/"false",
+           "Hoist TMEM allocations out of if statements">
+  ];
 }
 
 def TritonGPUTestPipelineLowerLoop : Pass<"tritongpu-test-pipeline-lower-loop", "mlir::ModuleOp"> {
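
With MLIR's standard pass-option syntax, the new knob would presumably be driven from triton-opt like this; the input file name is illustrative:

triton-opt kernel.mlir --tritongpu-hoist-tmem-alloc="hoist-out-of-if=true"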

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 11 additions & 2 deletions

@@ -68,6 +68,15 @@ bool isOuterLoop(scf::ForOp forOp);
 /// Function to mask operations during scheduling.
 Operation *predicateOp(RewriterBase &rewriter, Operation *op, Value pred);
 
+/// Wrap the operation into a MaskOp using the provided predicate, enabling high
+/// level predication abstraction during pipelining.
+Operation *wrapInMaskOp(RewriterBase &rewriter, Operation *op, Value pred);
+
+// Utilize high level predication abstraction to perform optimizations before
+// lowering to predicated operations
+void resolveMaskOp(ModuleOp moduleOp,
+                   DenseSet<triton::gpu::MaskOp> &peeledMaskOps);
+
 // Return true if the given ForOp has the attribute
 // `tt.disallow_acc_multi_buffer` set to true.
 bool getDisallowAccMultiBuffer(scf::ForOp forOp);

@@ -133,11 +142,11 @@ gpu::SharedEncodingTrait getSharedEncoding(Operation *loadOp);
 // specified.
 int getNumStagesOrDefault(scf::ForOp forOp, int defaultNumStages);
 
-// Given a result of MemDescSubview, or Alloca, create a MemDescSubview with a
+// Given a result of MemDescIndex, or Alloca, create a MemDescIndex with a
 // single buffer slice (leading dimension equal to 1), at the given index.
 TypedValue<triton::gpu::MemDescType>
 createSingleBufferView(OpBuilder &builder, Value alloc, Value idx);
-// Given a result of MemDescSubview, or Alloca, create a MemDescSubview with a
+// Given a result of MemDescIndex, or Alloca, create a MemDescIndex with a
 // single buffer slice (leading dimension equal to 1), at the given index.
 TypedValue<triton::gpu::MemDescType>
 createSingleBufferView(OpBuilder &builder, Value alloc, int idx);
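
A hedged sketch of the flow the two new declarations suggest: wrap operations in the mask abstraction while scheduling, then resolve the masks before lowering. The driver loop, the pred value, and the set of peeled masks are assumptions:

// Hypothetical pipelining driver fragment.
DenseSet<triton::gpu::MaskOp> peeledMaskOps;
for (Operation *op : opsToPredicate)
  wrapInMaskOp(rewriter, op, pred);      // keep predication abstract for now
// ... scheduling may peel iterations and record their masks here ...
resolveMaskOp(moduleOp, peeledMaskOps);  // optimize, then lower to predicated ops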
