Commit 023102d

Merge commit '8c97e9fbde705a0a9938852375cc45fda7dbd768'
2 parents 31623f7 + 8c97e9f commit 023102d

32 files changed: +65 / -446 lines changed

include/triton/Analysis/Membar.h

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ class OpBuilder;

 /// Callback to allow backend to provide more information on whether a barrier
 /// is needed between two operations. Even though two operations access the same
-/// shared memory thay may not require a barrier in between them.
+/// shared memory they may not require a barrier in between them.
 using MembarFilterFn = std::function<bool(Operation *, Operation *)>;

 struct BlockInfo {
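For context, MembarFilterFn is the hook a backend uses to suppress barriers the analysis would otherwise insert. A minimal sketch of such a filter, assuming the convention that returning true means "no barrier needed between this pair", and using a hypothetical op name purely for illustration:

#include "mlir/IR/Operation.h"

// Sketch only: `MyAsyncCopyOp` is a made-up op name and the policy below is
// invented; a real backend would encode its own knowledge of which pairs of
// shared-memory accesses are already ordered by other means.
MembarFilterFn skipBarrierForAsyncCopies =
    [](mlir::Operation *before, mlir::Operation *after) {
      return mlir::isa<MyAsyncCopyOp>(before) &&
             mlir::isa<MyAsyncCopyOp>(after);
    };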

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 0 additions & 368 deletions
Large diffs are not rendered by default.

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 1 addition & 1 deletion
@@ -854,7 +854,7 @@ def TT_ElementwiseInlineAsmOp : TT_Op<"elementwise_inline_asm", [
 // Histogram Op
 //
 def TT_HistogramOp : TT_Op<"histogram", [Pure]> {
-  let summary = "return a histgram of the inputs.";
+  let summary = "return a histogram of the inputs.";
   let description = [{
     Return the histogram of the input tensor. The number of bins is equal to
     the dimension of the output tensor. Each bins has a width of 1 and bins
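As a rough scalar illustration of the semantics quoted in the description above (one bin per output element, each bin of width 1), and not the actual lowering of tt.histogram:

#include <cstdint>
#include <vector>

// Minimal sketch, assuming bin i simply counts inputs equal to i.
std::vector<int32_t> histogram(const std::vector<int32_t> &input, int numBins) {
  std::vector<int32_t> bins(numBins, 0);
  for (int32_t v : input)
    if (v >= 0 && v < numBins)
      ++bins[v];
  return bins;
}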

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 1 addition & 1 deletion
@@ -950,7 +950,7 @@ available on AMD Radeon GPUs of RDNA architectures.
 - 1: gfx11
 - 2: gfx12
 - A `warpsPerCTA` parameter characterizes data distribution between warps.
-An important limitation of WMMA for layout is a shape for tiles proccessed
+An important limitation of WMMA for layout is a shape for tiles processed
 by a single warp. It is [16, 16].
 This encoding assumes specific access to matrix elements by threads.

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ def TTG_AsyncCopyGlobalToLocalOp : TTG_Op<"async_copy_global_to_local", [
   let description = [{
     This operation copies data from global memory to local memory asynchronously.
     This is analogue to tt.load except the data are copied to local memory pointed
-    by by the memory descriptor instread of a distributed tensor. The rest of the
+    by by the memory descriptor instead of a distributed tensor. The rest of the
     operands are the same as tt.load.
   }];

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 2 additions & 2 deletions
@@ -202,7 +202,7 @@ def TritonGPUReduceDataDuplication: Pass<"tritongpu-reduce-data-duplication", "m
 def TritonGPUCombineTensorSelectAndIf: Pass<"tritongpu-combine-tensor-select-and-if", "mlir::ModuleOp"> {
   let summary = "Combine tensor select and if";

-  let description = "For select instruction that uses the same condidtion as the if instruction in the same block "
+  let description = "For select instruction that uses the same condition as the if instruction in the same block "
                     "this pass combines the select into the if instruction, making the select operands returned by the "
                     "then/else yields.";
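A loose C++ analogue of the rewrite this pass description refers to; the real pass works on arith.select and scf.if in MLIR, so the snippet below only mirrors the idea, and all names are invented:

// Before: a select and an if in the same block use the same condition.
int beforeRewrite(bool cond, int a, int b) {
  int v = cond ? a : b;                       // select
  if (cond) { /* then-side work */ } else { /* else-side work */ }
  return v;
}

// After: the select's operands are returned by the then/else branches,
// so only the if remains.
int afterRewrite(bool cond, int a, int b) {
  int v;
  if (cond) { /* then-side work */ v = a; }
  else      { /* else-side work */ v = b; }
  return v;
}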

@@ -211,7 +211,7 @@ def TritonGPUCombineTensorSelectAndIf: Pass<"tritongpu-combine-tensor-select-and
 }

 def TritonGPUOptimizeAccumulatorInit: Pass<"tritongpu-optimize-accumulator-init", "mlir::ModuleOp"> {
-  let summary = "Replace accumulater zero-initialization with the flag indicating first use of the accumulator";
+  let summary = "Replace accumulator zero-initialization with the flag indicating first use of the accumulator";

   let description = "For the dot operations that support accumulator-use flag this pass replaces the zero-initialization "
                     "of the accumulator with the flag indicating the first use of the accumulator.";

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 3 additions & 3 deletions
@@ -136,7 +136,7 @@ def TTNG_InvalBarrierOp : TTNG_Op<"inval_barrier", [DeclareOpInterfaceMethods<Me

   let description = [{
     Invalidate a barrier allocation so that it can be re-used. According to PTX
-    spec this has to be done before any re-use of the memory used by mbarrier.
+    spec this has to be done before any reuse of the memory used by mbarrier.

     https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval
   }];
@@ -213,7 +213,7 @@ def TTNG_AsyncTMACopyGlobalToLocalOp : TTNG_Op<"async_tma_copy_global_to_local",
   let description = [{
     This operation copies data from global memory to local memory
     asynchronously. This is analogue to tt.load except the data are copied to
-    local memory pointed by the memory descriptor instread of a distributed
+    local memory pointed by the memory descriptor instead of a distributed
     tensor. The data copied depends on the global memory descriptor pointed to
     by `desc_ptr`.
   }];
@@ -243,7 +243,7 @@ def TTNG_AsyncTMACopyLocalToGlobalOp : TTNG_Op<"async_tma_copy_local_to_global",
   let description = [{
     This operation copies data from local memory to global memory
     asynchronously. This is analogue to tt.store except the data are copied from
-    local memory pointed by the memory descriptor instread of a distributed
+    local memory pointed by the memory descriptor instead of a distributed
     tensor. The data copied depends on the global memory descriptor pointed to
     by `desc_ptr`.
   }];

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp

Lines changed: 2 additions & 5 deletions
@@ -49,9 +49,6 @@ SmallVector<Value> swizzleIndices(ConversionPatternRewriter &rewriter,

   auto fastIdx = rawIndices[order[0]];
   auto secondIdx = rawIndices[order[1]];
-  // Original algorithm taken from getSwizzledSharedPtrs function
-  // (TritonGPUToLLVMBase.h)
-  //
   // phase = (secondIdx // perPhase) % maxPhase
   // swizzledGroup = ((fastIdx // vec) ^ phase) * vec
   // groupRemainder = fastIdx % vec
@@ -158,7 +155,7 @@ Value computeSwizzledOffset(ConversionPatternRewriter &rewriter, Location loc,
                             ArrayRef<int64_t> opTensorShape,
                             ArrayRef<Value> strides) {
   Value offset = i32_val(0);
-  // Compute unswizzled multi dim coordinates in shared memmory object
+  // Compute unswizzled multi dim coordinates in shared memory object
   SmallVector<Value> elemMultiDimIndices(3);
   elemMultiDimIndices[dim.batch] =
       add(bTileOffset, i32_val(i.bTile * shapePerCTABTile + i.b));
@@ -309,7 +306,7 @@ Value loadFMAOp(Value srcVal, Value llVal, BlockedEncodingAttr dLayout,
                 sizeNonKPerThread);

   // In swizzled memory case basePtr stores pointer to the beginning of shared
-  // memmory object.
+  // memory object.
   //
   // If memory is not swizzled, algorithm breaks element offset pointer into
   // constant and non-constant part. Non-constant (depends on thread id) part is
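The swizzleIndices comment in the first hunk above states the swizzle arithmetic in pseudocode (the // there reads as integer division). A standalone sketch that makes the formula concrete; the parameter values are made up for illustration, not taken from a real layout:

#include <cstdio>

// Mirrors the commented formula:
//   phase          = (secondIdx / perPhase) % maxPhase
//   swizzledGroup  = ((fastIdx / vec) ^ phase) * vec
//   groupRemainder = fastIdx % vec
int swizzleFastIdx(int fastIdx, int secondIdx, int vec, int perPhase, int maxPhase) {
  int phase = (secondIdx / perPhase) % maxPhase;
  int swizzledGroup = ((fastIdx / vec) ^ phase) * vec;
  int groupRemainder = fastIdx % vec;
  return swizzledGroup + groupRemainder;
}

int main() {
  // With illustrative vec = 4, perPhase = 2, maxPhase = 8:
  // phase = (3/2)%8 = 1, swizzledGroup = ((5/4)^1)*4 = 0, remainder = 5%4 = 1 -> 1
  printf("%d\n", swizzleFastIdx(/*fastIdx=*/5, /*secondIdx=*/3, 4, 2, 8));
}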

lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ using namespace mlir::triton;
 // NOTE: [Additional Function Arguments]
 // To support use of shared memory and global scratch memory inside of a
 // function, the caller allocates a single large block of the relevant memory
-// and calls the funciton with these extra arguments at the end.
+// and calls the function with these extra arguments at the end.
 // Specifically, the last argument is the global scratch memory allocation and
 // the second to last is the shared memory allocation.
 //
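A hedged illustration of the calling convention the NOTE describes; all names below are invented, and only the ordering of the trailing arguments (shared memory second to last, global scratch last) comes from the comment:

// Illustration only, not the real generated LLVM signature.
void kernelHelper(float *in, int n,
                  char *sharedMemBase,        // second to last: shared memory block
                  char *globalScratchBase) {  // last: global scratch allocation
  // ... the body would carve its workspaces out of these two base pointers ...
}

void kernelEntry(float *in, int n, char *sharedMemBase, char *globalScratchBase) {
  // The caller owns one large allocation of each kind and forwards it as the
  // two extra arguments appended at the end of the call.
  kernelHelper(in, n, sharedMemBase, globalScratchBase);
}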

lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 2 additions & 2 deletions
@@ -127,7 +127,7 @@ struct ReduceOpConversion
         emitOffsetForLayout(helper.getSrcLayout(), operandType);

     // Thread X might hold the same input value in two registers. Get the
-    // indices in `offsets` that hold unique values, and only accumualte over
+    // indices in `offsets` that hold unique values, and only accumulate over
     // those.
     llvm::MapVector<ArrayRef<unsigned>, int> uniqueOffsets;
     for (int i = 0; i < offsets.size(); ++i) {
@@ -221,7 +221,7 @@ struct ReduceOpConversion
   // For slice layout some ids are duplicated on multiple lanes, so we need to
   // handle the delinearization of laneId in a special way. We need to
   // generalize this part of the logic to work on any kind of linear layout
-  // uniformely.
+  // uniformly.
   SmallVector<Value>
   getMultiDimLaneId(ReduceOpHelper &helper, Value &laneId, Location &loc,
                     ConversionPatternRewriter &rewriter) const {
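A small standalone sketch of the deduplication mentioned in the first hunk above, using std::map in place of llvm::MapVector and plain vectors for the offsets; it only mirrors the intent of keeping one register index per distinct per-thread offset:

#include <map>
#include <vector>

// Keep only the first register index seen for each distinct offset, so a
// value that a thread holds in two registers is accumulated exactly once.
std::map<std::vector<unsigned>, int>
uniqueOffsetIndices(const std::vector<std::vector<unsigned>> &offsets) {
  std::map<std::vector<unsigned>, int> unique;
  for (int i = 0; i < (int)offsets.size(); ++i)
    unique.insert({offsets[i], i}); // insert keeps the first occurrence
  return unique;
}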
