@@ -29,6 +29,7 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
+#include "llvm/Support/ErrorHandling.h"
 
 // TritonNvidiaGPU depends on Triton
 #include "triton/Dialect/Triton/IR/Dialect.h"
@@ -61,24 +62,68 @@ struct TMemAllocation {
   int numCols;
 };
 
+// Used to describe the layout of the TMEM load/store instructions
+enum class TMemAccessAtom { I32x32b, I16x64b, I16x128b, I16x256b, I16x32bx2 };
+
+inline int getElementsPerThread(TMemAccessAtom atom) {
+  switch (atom) {
+  case TMemAccessAtom::I32x32b:
+  case TMemAccessAtom::I16x64b:
+  case TMemAccessAtom::I16x32bx2:
+    return 1;
+  case TMemAccessAtom::I16x128b:
+    return 2;
+  case TMemAccessAtom::I16x256b:
+    return 4;
+  }
+  llvm_unreachable("Unknown TMemAccessAtom");
+}
+
+inline const char *getOpShape(TMemAccessAtom atom) {
+  switch (atom) {
+  case TMemAccessAtom::I32x32b:
+    return "32x32b";
+  case TMemAccessAtom::I16x64b:
+    return "16x64b";
+  case TMemAccessAtom::I16x128b:
+    return "16x128b";
+  case TMemAccessAtom::I16x256b:
+    return "16x256b";
+  case TMemAccessAtom::I16x32bx2:
+    return "16x32bx2";
+  }
+  llvm_unreachable("Unknown TMemAccessAtom");
+}
+
+LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom,
+                           bool unpacked);
+
 TMemAllocation getTmemAllocSizes(gpu::MemDescType memDescType);
 
-gpu::DistributedEncodingTrait getTmemCompatibleLayout(unsigned M, unsigned N,
-                                                      RankedTensorType oltType,
-                                                      unsigned numWarps);
-gpu::DistributedEncodingTrait
+SmallVector<gpu::DistributedEncodingTrait>
+getTmemCompatibleLayouts(gpu::MemDescType memType, unsigned numWarps,
+                         ArrayRef<int64_t> ctaSplit = {1, 1});
+
+std::optional<gpu::DistributedEncodingTrait>
 getTmemLoadLayoutSplitLongM(RankedTensorType tensorType,
                             gpu::MemDescType memType, int numWarps);
+
 SmallVector<gpu::DistributedEncodingTrait>
 getTmemCompatibleLayouts(Operation *op, RankedTensorType tensorType,
                          gpu::MemDescType memType);
 
 bool isDistributedLayoutTMemCompatible(Operation *op,
                                        RankedTensorType tensorType,
                                        gpu::MemDescType memType);
-bool isDistributedLayoutSplitMTmemLoadStore(RankedTensorType tensorType,
-                                            gpu::MemDescType memType,
-                                            int numWarps);
+
+gpu::DistributedEncodingTrait
+getDefaultLayoutForTmemLdSt(gpu::MemDescType memType, unsigned numWarps,
+                            gpu::CTALayoutAttr ctaLayout);
+
+std::optional<LinearLayout>
+getDistributedLayoutForTmemLdSt(gpu::MemDescType memType, TMemAccessAtom atom,
+                                unsigned numWarps,
+                                gpu::CTALayoutAttr ctaLayout);
 
 } // namespace mlir::triton::nvidia_gpu
 
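The two new helpers compose: getOpShape supplies the shape suffix of a tcgen05.ld/tcgen05.st PTX opcode, while getElementsPerThread gives the number of 32-bit values each thread receives per message of that shape. A minimal sketch of that pairing, assuming it is built inside the Triton tree; the include path and the opcode string are illustrative, since this header does not itself emit PTX:

```cpp
// Sketch: pairing getOpShape with getElementsPerThread. The include path
// and the tcgen05 opcode string are illustrative assumptions.
#include <cstdio>
#include <string>

#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" // assumed path

int main() {
  using namespace mlir::triton::nvidia_gpu;
  TMemAccessAtom atom = TMemAccessAtom::I16x256b;
  // A "16x256b" message with an .x1 repeat yields 4 x 32-bit values/thread.
  std::string opcode =
      std::string("tcgen05.ld.sync.aligned.") + getOpShape(atom) + ".x1.b32";
  std::printf("%s -> %d elements/thread\n", opcode.c_str(),
              getElementsPerThread(atom));
}
```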
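On the query side, the refactored API returns candidate register layouts instead of a single encoding. A sketch of how a lowering pass might consume it, assuming memType, numWarps, and ctaLayout are already in scope; the preference order shown (first compatible candidate, else the default layout) is an assumption, not a policy this header prescribes:

```cpp
// Sketch: choosing a register layout for a TMEM load/store.
// The fallback policy below is hypothetical.
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" // assumed path

namespace ttg = mlir::triton::gpu;
namespace ttng = mlir::triton::nvidia_gpu;

static ttg::DistributedEncodingTrait
pickTmemLayout(ttg::MemDescType memType, unsigned numWarps,
               ttg::CTALayoutAttr ctaLayout) {
  // Every register layout the TMEM message shapes can service here.
  llvm::SmallVector<ttg::DistributedEncodingTrait> candidates =
      ttng::getTmemCompatibleLayouts(memType, numWarps);
  if (!candidates.empty())
    return candidates.front();
  // Otherwise fall back to the canonical layout for tcgen05 ld/st.
  return ttng::getDefaultLayoutForTmemLdSt(memType, numWarps, ctaLayout);
}
```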