
Commit f5d04d5

Revert "[LAYOUTS] Generate distributed layouts for tcgen05.ld/st generically (#8421)" (#8469)
This reverts commit ea01a7e.
1 parent 34676a2 · commit f5d04d5

32 files changed (+1091, -1289 lines)

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 13 additions & 10 deletions
@@ -117,6 +117,19 @@ chooseDsReadTrLayout(Attribute enc, ArrayRef<int64_t> shape,
                      int32_t elemBitWidth, unsigned instBitWidth,
                      unsigned numLanesInShuffleGroup);

+LinearLayout getScaleTMEMStoreLinearLayout(RankedTensorType scaleType,
+                                           int numWarps);
+
+std::optional<LinearLayout>
+getTmemLoadStoreLayout16x256(int M, int N, RankedTensorType oldType,
+                             int numWarps);
+
+// Return a layout valid for TMemLoad op for a tmem layout of block MxN that
+// distribute the data long M for the warp groups. This doesn't affect the TMem
+// layout it just returns a distributed layout compatible for tmem_load.
+LinearLayout getTmemLoadLayoutSplitLongM(int M, int N, RankedTensorType oldType,
+                                         int numWarps);
+
 // Create LinearLayout for scale in scaled mfma.
 LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                          ArrayRef<int64_t> dotOperandShape,
@@ -151,15 +164,5 @@ std::optional<LinearLayout> chooseMfmaLikeStoreLayout(RankedTensorType valType);
 LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared,
                                        bool disableSwizzle);

-// Make a LinearLayout that maps a block-id to an N-dimensional index.
-//
-// The tensor is split up into CTAsPerCGA pieces, which are distributed among
-// the CTAsPerCGA CTAs (i.e. blocks) in the CGA (i.e. groups).
-//
-// See the nomenclature note at the top of the LinearLayoutConversions.cpp file
-// for an explanation of why this is called makeCgaLayout when it accepts a
-// CTALayoutAttr.
-LinearLayout makeCgaLayout(CTALayoutAttr layout);
-
 } // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
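
Note (not part of the diff): the comment restored above is the only documentation these helpers carry, so here is a minimal, hypothetical sketch of how they might compose. Only the two declarations come from the header; the wrapper name pickTmemLoadLayout and the selection order are illustrative assumptions.

#include "mlir/IR/BuiltinTypes.h"
#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"

namespace ttg = mlir::triton::gpu;

// Illustrative helper (not in the tree): try the 16x256 tmem_load/st message
// shape first; if no such layout exists for this block/tensor combination,
// fall back to the layout that splits the MxN block along M across warp groups.
static mlir::triton::LinearLayout
pickTmemLoadLayout(int M, int N, mlir::RankedTensorType oldType, int numWarps) {
  if (auto layout = ttg::getTmemLoadStoreLayout16x256(M, N, oldType, numWarps))
    return *layout;
  return ttg::getTmemLoadLayoutSplitLongM(M, N, oldType, numWarps);
}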

include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h

Lines changed: 7 additions & 52 deletions
@@ -29,7 +29,6 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
-#include "llvm/Support/ErrorHandling.h"

 // TritonNvidiaGPU depends on Triton
 #include "triton/Dialect/Triton/IR/Dialect.h"
@@ -62,68 +61,24 @@ struct TMemAllocation {
   int numCols;
 };

-// Used to describe the layout of the TMEM load/store instructions
-enum class TMemAccessAtom { I32x32b, I16x64b, I16x128b, I16x256b, I16x32bx2 };
-
-inline int getElementsPerThread(TMemAccessAtom atom) {
-  switch (atom) {
-  case TMemAccessAtom::I32x32b:
-  case TMemAccessAtom::I16x64b:
-  case TMemAccessAtom::I16x32bx2:
-    return 1;
-  case TMemAccessAtom::I16x128b:
-    return 2;
-  case TMemAccessAtom::I16x256b:
-    return 4;
-  }
-  llvm_unreachable("Unknown TMemAccessAtom");
-}
-
-inline const char *getOpShape(TMemAccessAtom atom) {
-  switch (atom) {
-  case TMemAccessAtom::I32x32b:
-    return "32x32b";
-  case TMemAccessAtom::I16x64b:
-    return "16x64b";
-  case TMemAccessAtom::I16x128b:
-    return "16x128b";
-  case TMemAccessAtom::I16x256b:
-    return "16x256b";
-  case TMemAccessAtom::I16x32bx2:
-    return "16x32bx2";
-  }
-  llvm_unreachable("Unknown TMemAccessAtom");
-}
-
-LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom,
-                           bool unpacked);
-
 TMemAllocation getTmemAllocSizes(gpu::MemDescType memDescType);

-SmallVector<gpu::DistributedEncodingTrait>
-getTmemCompatibleLayouts(gpu::MemDescType memType, unsigned numWarps,
-                         ArrayRef<int64_t> ctaSplit = {1, 1});
-
-std::optional<gpu::DistributedEncodingTrait>
+gpu::DistributedEncodingTrait getTmemCompatibleLayout(unsigned M, unsigned N,
+                                                      RankedTensorType oltType,
+                                                      unsigned numWarps);
+gpu::DistributedEncodingTrait
 getTmemLoadLayoutSplitLongM(RankedTensorType tensorType,
                             gpu::MemDescType memType, int numWarps);
-
 SmallVector<gpu::DistributedEncodingTrait>
 getTmemCompatibleLayouts(Operation *op, RankedTensorType tensorType,
                          gpu::MemDescType memType);

 bool isDistributedLayoutTMemCompatible(Operation *op,
                                        RankedTensorType tensorType,
                                        gpu::MemDescType memType);
-
-gpu::DistributedEncodingTrait
-getDefaultLayoutForTmemLdSt(gpu::MemDescType memType, unsigned numWarps,
-                            gpu::CTALayoutAttr ctaLayout);
-
-std::optional<LinearLayout>
-getDistributedLayoutForTmemLdSt(gpu::MemDescType memType, TMemAccessAtom atom,
-                                unsigned numWarps,
-                                gpu::CTALayoutAttr ctaLayout);
+bool isDistributedLayoutSplitMTmemLoadStore(RankedTensorType tensorType,
+                                            gpu::MemDescType memType,
+                                            int numWarps);

 } // namespace mlir::triton::nvidia_gpu
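
Note (not part of the diff): the restored interface pairs a compatibility check with a layout chooser. The following is a hypothetical usage sketch under that reading; only the declarations above come from the header, while the wrapper name and the fallback policy are illustrative assumptions.

#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"

namespace ttg = mlir::triton::gpu;
namespace ttng = mlir::triton::nvidia_gpu;

// Illustrative policy (not in the tree): keep the tensor's current distributed
// layout when tmem_load/tmem_store can consume it directly, otherwise ask for
// a layout compatible with an MxN TMEM block.
static ttg::DistributedEncodingTrait
pickDistributedEncoding(mlir::Operation *op, mlir::RankedTensorType tensorType,
                        ttg::MemDescType memType, unsigned blockM,
                        unsigned blockN, unsigned numWarps) {
  if (ttng::isDistributedLayoutTMemCompatible(op, tensorType, memType))
    return mlir::cast<ttg::DistributedEncodingTrait>(tensorType.getEncoding());
  return ttng::getTmemCompatibleLayout(blockM, blockN, tensorType, numWarps);
}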

include/triton/Dialect/TritonNvidiaGPU/IR/TensorMemoryUtils.h

Lines changed: 0 additions & 37 deletions
This file was deleted.

include/triton/Tools/LinearLayout.h

Lines changed: 0 additions & 12 deletions
@@ -558,18 +558,6 @@ class LinearLayout {
     return reshapeOuts({{*getOutDimNames().begin(), getTotalOutDimSize()}});
   }

-  [[nodiscard]] LinearLayout renameInDim(StringAttr oldDim,
-                                         StringAttr newDim) const {
-    auto bases = getBases();
-    auto it = bases.find(oldDim);
-    assert(it != bases.end());
-    auto value = std::move(it->second);
-    bases.erase(it);
-    bases.insert({newDim, std::move(value)});
-    return LinearLayout(bases, getOutDims(),
-                        /*requireSurjective=*/isSurjective());
-  }
-
   // Concatenates two layouts by their in (resp. out) dimensions. The layouts
   // must have the same output (resp. input) dimensions and sizes and different
   // input (resp. output) dimensions. The input dimensions of this layout are

lib/Conversion/TritonToTritonGPU/RelayoutTritonGPU.cpp

Lines changed: 9 additions & 3 deletions
@@ -21,10 +21,16 @@ namespace ttng = triton::nvidia_gpu;
 RankedTensorType getTMEMTensorLayout(const TypeConverter *tc,
                                      RankedTensorType type, MemDescType memdesc,
                                      unsigned numWarps) {
+  Attribute encoding;
   type = cast<RankedTensorType>(tc->convertType(type));
-  auto ctaLayout = getCTALayout(type.getEncoding());
-  auto encoding =
-      ttng::getDefaultLayoutForTmemLdSt(memdesc, numWarps, ctaLayout);
+  if (isa<ttng::TensorMemoryScalesEncodingAttr>(memdesc.getEncoding())) {
+    encoding = LinearEncodingAttr::get(
+        type.getContext(), getScaleTMEMStoreLinearLayout(type, numWarps));
+  } else {
+    auto tmemEnc = cast<ttng::TensorMemoryEncodingAttr>(memdesc.getEncoding());
+    encoding = ttng::getTmemCompatibleLayout(
+        tmemEnc.getBlockM(), tmemEnc.getBlockN(), type, numWarps);
+  }
   return type.cloneWithEncoding(encoding);
 }
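
Note (not part of the diff): in the scales branch restored above, getScaleTMEMStoreLinearLayout returns a LinearLayout, a plain C++ object, so it is wrapped in a LinearEncodingAttr before being attached to the tensor type. The sketch below isolates just that step; relayoutScales is a hypothetical name, and the includes are the headers assumed to declare these symbols.

#include "triton/Dialect/TritonGPU/IR/Dialect.h"  // assumed to declare LinearEncodingAttr
#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"

using namespace mlir;
using namespace mlir::triton::gpu;

// Wrap the scale-store layout in an encoding attribute and re-type the tensor.
static RankedTensorType relayoutScales(RankedTensorType scaleType, int numWarps) {
  auto ll = getScaleTMEMStoreLinearLayout(scaleType, numWarps);
  auto encoding = LinearEncodingAttr::get(scaleType.getContext(), ll);
  return scaleType.cloneWithEncoding(encoding);
}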
