
Commit f5d04d5

Revert "[LAYOUTS] Generate distributed layouts for tcgen05.ld/st generically (#8421)" (#8469)
This reverts commit ea01a7e.
1 parent 34676a2 · commit f5d04d5

32 files changed (+1091, -1289 lines)

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 13 additions & 10 deletions
@@ -117,6 +117,19 @@ chooseDsReadTrLayout(Attribute enc, ArrayRef<int64_t> shape,
                      int32_t elemBitWidth, unsigned instBitWidth,
                      unsigned numLanesInShuffleGroup);

+LinearLayout getScaleTMEMStoreLinearLayout(RankedTensorType scaleType,
+                                           int numWarps);
+
+std::optional<LinearLayout>
+getTmemLoadStoreLayout16x256(int M, int N, RankedTensorType oldType,
+                             int numWarps);
+
+// Return a layout valid for TMemLoad op for a tmem layout of block MxN that
+// distribute the data long M for the warp groups. This doesn't affect the TMem
+// layout it just returns a distributed layout compatible for tmem_load.
+LinearLayout getTmemLoadLayoutSplitLongM(int M, int N, RankedTensorType oldType,
+                                         int numWarps);
+
 // Create LinearLayout for scale in scaled mfma.
 LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                          ArrayRef<int64_t> dotOperandShape,
@@ -151,15 +164,5 @@ std::optional<LinearLayout> chooseMfmaLikeStoreLayout(RankedTensorType valType);
 LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared,
                                        bool disableSwizzle);

-// Make a LinearLayout that maps a block-id to an N-dimensional index.
-//
-// The tensor is split up into CTAsPerCGA pieces, which are distributed among
-// the CTAsPerCGA CTAs (i.e. blocks) in the CGA (i.e. groups).
-//
-// See the nomenclature note at the top of the LinearLayoutConversions.cpp file
-// for an explanation of why this is called makeCgaLayout when it accepts a
-// CTALayoutAttr.
-LinearLayout makeCgaLayout(CTALayoutAttr layout);
-
 } // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
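
Note (not part of the diff): the comment restored above is the only documentation these helpers carry, so here is a minimal, hypothetical sketch of how they might compose. Only the two declarations come from the header; the wrapper name pickTmemLoadLayout and the selection order are illustrative assumptions.

#include "mlir/IR/BuiltinTypes.h"
#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"

namespace ttg = mlir::triton::gpu;

// Illustrative helper (not in the tree): try the 16x256 tmem_load/st message
// shape first; if no such layout exists for this block/tensor combination,
// fall back to the layout that splits the MxN block along M across warp groups.
static mlir::triton::LinearLayout
pickTmemLoadLayout(int M, int N, mlir::RankedTensorType oldType, int numWarps) {
  if (auto layout = ttg::getTmemLoadStoreLayout16x256(M, N, oldType, numWarps))
    return *layout;
  return ttg::getTmemLoadLayoutSplitLongM(M, N, oldType, numWarps);
}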

include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h

Lines changed: 7 additions & 52 deletions
@@ -29,7 +29,6 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
-#include "llvm/Support/ErrorHandling.h"

 // TritonNvidiaGPU depends on Triton
 #include "triton/Dialect/Triton/IR/Dialect.h"
@@ -62,68 +61,24 @@ struct TMemAllocation {
   int numCols;
 };

-// Used to describe the layout of the TMEM load/store instructions
-enum class TMemAccessAtom { I32x32b, I16x64b, I16x128b, I16x256b, I16x32bx2 };
-
-inline int getElementsPerThread(TMemAccessAtom atom) {
-  switch (atom) {
-  case TMemAccessAtom::I32x32b:
-  case TMemAccessAtom::I16x64b:
-  case TMemAccessAtom::I16x32bx2:
-    return 1;
-  case TMemAccessAtom::I16x128b:
-    return 2;
-  case TMemAccessAtom::I16x256b:
-    return 4;
-  }
-  llvm_unreachable("Unknown TMemAccessAtom");
-}
-
-inline const char *getOpShape(TMemAccessAtom atom) {
-  switch (atom) {
-  case TMemAccessAtom::I32x32b:
-    return "32x32b";
-  case TMemAccessAtom::I16x64b:
-    return "16x64b";
-  case TMemAccessAtom::I16x128b:
-    return "16x128b";
-  case TMemAccessAtom::I16x256b:
-    return "16x256b";
-  case TMemAccessAtom::I16x32bx2:
-    return "16x32bx2";
-  }
-  llvm_unreachable("Unknown TMemAccessAtom");
-}
-
-LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom,
-                           bool unpacked);
-
 TMemAllocation getTmemAllocSizes(gpu::MemDescType memDescType);

-SmallVector<gpu::DistributedEncodingTrait>
-getTmemCompatibleLayouts(gpu::MemDescType memType, unsigned numWarps,
-                         ArrayRef<int64_t> ctaSplit = {1, 1});
-
-std::optional<gpu::DistributedEncodingTrait>
+gpu::DistributedEncodingTrait getTmemCompatibleLayout(unsigned M, unsigned N,
+                                                      RankedTensorType oltType,
+                                                      unsigned numWarps);
+gpu::DistributedEncodingTrait
 getTmemLoadLayoutSplitLongM(RankedTensorType tensorType,
                             gpu::MemDescType memType, int numWarps);
-
 SmallVector<gpu::DistributedEncodingTrait>
 getTmemCompatibleLayouts(Operation *op, RankedTensorType tensorType,
                          gpu::MemDescType memType);

 bool isDistributedLayoutTMemCompatible(Operation *op,
                                        RankedTensorType tensorType,
                                        gpu::MemDescType memType);
-
-gpu::DistributedEncodingTrait
-getDefaultLayoutForTmemLdSt(gpu::MemDescType memType, unsigned numWarps,
-                            gpu::CTALayoutAttr ctaLayout);
-
-std::optional<LinearLayout>
-getDistributedLayoutForTmemLdSt(gpu::MemDescType memType, TMemAccessAtom atom,
-                                unsigned numWarps,
-                                gpu::CTALayoutAttr ctaLayout);
+bool isDistributedLayoutSplitMTmemLoadStore(RankedTensorType tensorType,
+                                            gpu::MemDescType memType,
+                                            int numWarps);

 } // namespace mlir::triton::nvidia_gpu
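
Note (not part of the diff): the restored interface pairs a compatibility check with a layout chooser. The following is a hypothetical usage sketch under that reading; only the declarations above come from the header, while the wrapper name and the fallback policy are illustrative assumptions.

#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"

namespace ttg = mlir::triton::gpu;
namespace ttng = mlir::triton::nvidia_gpu;

// Illustrative policy (not in the tree): keep the tensor's current distributed
// layout when tmem_load/tmem_store can consume it directly, otherwise ask for
// a layout compatible with an MxN TMEM block.
static ttg::DistributedEncodingTrait
pickDistributedEncoding(mlir::Operation *op, mlir::RankedTensorType tensorType,
                        ttg::MemDescType memType, unsigned blockM,
                        unsigned blockN, unsigned numWarps) {
  if (ttng::isDistributedLayoutTMemCompatible(op, tensorType, memType))
    return mlir::cast<ttg::DistributedEncodingTrait>(tensorType.getEncoding());
  return ttng::getTmemCompatibleLayout(blockM, blockN, tensorType, numWarps);
}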

include/triton/Dialect/TritonNvidiaGPU/IR/TensorMemoryUtils.h

Lines changed: 0 additions & 37 deletions
This file was deleted.

include/triton/Tools/LinearLayout.h

Lines changed: 0 additions & 12 deletions
@@ -558,18 +558,6 @@ class LinearLayout {
     return reshapeOuts({{*getOutDimNames().begin(), getTotalOutDimSize()}});
   }

-  [[nodiscard]] LinearLayout renameInDim(StringAttr oldDim,
-                                         StringAttr newDim) const {
-    auto bases = getBases();
-    auto it = bases.find(oldDim);
-    assert(it != bases.end());
-    auto value = std::move(it->second);
-    bases.erase(it);
-    bases.insert({newDim, std::move(value)});
-    return LinearLayout(bases, getOutDims(),
-                        /*requireSurjective=*/isSurjective());
-  }
-
   // Concatenates two layouts by their in (resp. out) dimensions. The layouts
   // must have the same output (resp. input) dimensions and sizes and different
   // input (resp. output) dimensions. The input dimensions of this layout are

lib/Conversion/TritonToTritonGPU/RelayoutTritonGPU.cpp

Lines changed: 9 additions & 3 deletions
@@ -21,10 +21,16 @@ namespace ttng = triton::nvidia_gpu;
 RankedTensorType getTMEMTensorLayout(const TypeConverter *tc,
                                      RankedTensorType type, MemDescType memdesc,
                                      unsigned numWarps) {
+  Attribute encoding;
   type = cast<RankedTensorType>(tc->convertType(type));
-  auto ctaLayout = getCTALayout(type.getEncoding());
-  auto encoding =
-      ttng::getDefaultLayoutForTmemLdSt(memdesc, numWarps, ctaLayout);
+  if (isa<ttng::TensorMemoryScalesEncodingAttr>(memdesc.getEncoding())) {
+    encoding = LinearEncodingAttr::get(
+        type.getContext(), getScaleTMEMStoreLinearLayout(type, numWarps));
+  } else {
+    auto tmemEnc = cast<ttng::TensorMemoryEncodingAttr>(memdesc.getEncoding());
+    encoding = ttng::getTmemCompatibleLayout(
+        tmemEnc.getBlockM(), tmemEnc.getBlockN(), type, numWarps);
+  }
   return type.cloneWithEncoding(encoding);
 }
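
Note (not part of the diff): in the scales branch restored above, getScaleTMEMStoreLinearLayout returns a LinearLayout, a plain C++ object, so it is wrapped in a LinearEncodingAttr before being attached to the tensor type. The sketch below isolates just that step; relayoutScales is a hypothetical name, and the includes are the headers assumed to declare these symbols.

#include "triton/Dialect/TritonGPU/IR/Dialect.h"  // assumed to declare LinearEncodingAttr
#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"

using namespace mlir;
using namespace mlir::triton::gpu;

// Wrap the scale-store layout in an encoding attribute and re-type the tensor.
static RankedTensorType relayoutScales(RankedTensorType scaleType, int numWarps) {
  auto ll = getScaleTMEMStoreLinearLayout(scaleType, numWarps);
  auto encoding = LinearEncodingAttr::get(scaleType.getContext(), ll);
  return scaleType.cloneWithEncoding(encoding);
}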
