
Commit 03c746c

Merge OpenAI Triton commit e5e0081 (#5336)

This PR changes the Triton base from 6fce184 to e5e0081 (Oct 16). Pass rate: 94.11% -> 94.18%.
2 parents f049925 + fa15dc1 commit 03c746c

65 files changed: 3484 additions & 1605 deletions


CMakeLists.txt

Lines changed: 11 additions & 11 deletions
@@ -196,6 +196,17 @@ if(TRITON_BUILD_PYTHON_MODULE)
   find_package(Python3 REQUIRED COMPONENTS Development.Module Interpreter)
   find_package(pybind11 CONFIG REQUIRED HINTS "${Python3_SITELIB}")

+  foreach(CODEGEN_BACKEND ${TRITON_CODEGEN_BACKENDS})
+    add_subdirectory(third_party/${CODEGEN_BACKEND})
+  endforeach()
+
+  if (TRITON_BUILD_PROTON)
+    add_subdirectory(third_party/proton)
+  endif()
+  # We always build proton dialect
+  list(APPEND TRITON_PLUGIN_NAMES "proton")
+  add_subdirectory(third_party/proton/Dialect)
+
   if (DEFINED TRITON_PLUGIN_DIRS)
     foreach(PLUGIN_DIR ${TRITON_PLUGIN_DIRS})
       # Read the plugin name under dir/backend/name.conf
@@ -213,17 +224,6 @@ if(TRITON_BUILD_PYTHON_MODULE)
     endforeach()
   endif()

-  foreach(CODEGEN_BACKEND ${TRITON_CODEGEN_BACKENDS})
-    add_subdirectory(third_party/${CODEGEN_BACKEND})
-  endforeach()
-
-  if (TRITON_BUILD_PROTON)
-    add_subdirectory(third_party/proton)
-  endif()
-  # We always build proton dialect
-  list(APPEND TRITON_PLUGIN_NAMES "proton")
-  add_subdirectory(third_party/proton/Dialect)
-
   get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS)
   get_property(triton_plugins GLOBAL PROPERTY TRITON_PLUGINS)
   set(TRITON_LIBRARIES

Makefile

Lines changed: 16 additions & 16 deletions
@@ -31,50 +31,50 @@ test-cpp:

 .PHONY: test-unit
 test-unit: all
-	cd python/test/unit && $(PYTEST) -s -n $(NUM_PROCS) --ignore=language/test_line_info.py \
+	cd python/test/unit && $(PYTEST) --tb=short -s -n $(NUM_PROCS) --ignore=language/test_line_info.py \
 		--ignore=language/test_subprocess.py --ignore=test_debug.py
-	$(PYTEST) -s -n $(NUM_PROCS) python/test/unit/language/test_subprocess.py
-	$(PYTEST) -s -n $(NUM_PROCS) python/test/unit/test_debug.py --forked
-	$(PYTEST) -s -n 6 python/triton_kernels/tests/
-	TRITON_DISABLE_LINE_INFO=0 $(PYTEST) -s python/test/unit/language/test_line_info.py
+	$(PYTEST) --tb=short -s -n $(NUM_PROCS) python/test/unit/language/test_subprocess.py
+	$(PYTEST) --tb=short -s -n $(NUM_PROCS) python/test/unit/test_debug.py --forked
+	$(PYTEST) --tb=short -s -n 6 python/triton_kernels/tests/
+	TRITON_DISABLE_LINE_INFO=0 $(PYTEST) --tb=short -s python/test/unit/language/test_line_info.py
 	# Run attention separately to avoid out of gpu memory
-	$(PYTEST) -vs python/tutorials/06-fused-attention.py
-	$(PYTEST) -vs python/tutorials/gluon/01-intro.py python/tutorials/gluon/02-layouts.py python/tutorials/gluon/03-async-copy.py python/tutorials/gluon/04-tma.py python/tutorials/gluon/05-wgmma.py python/tutorials/gluon/06-tcgen05.py python/tutorials/gluon/07-persistence.py python/tutorials/gluon/08-warp-specialization.py
-	$(PYTEST) -vs python/examples/gluon/01-attention-forward.py
+	$(PYTEST) --tb=short -vs python/tutorials/06-fused-attention.py
+	$(PYTEST) --tb=short -vs python/tutorials/gluon/01-intro.py python/tutorials/gluon/02-layouts.py python/tutorials/gluon/03-async-copy.py python/tutorials/gluon/04-tma.py python/tutorials/gluon/05-wgmma.py python/tutorials/gluon/06-tcgen05.py python/tutorials/gluon/07-persistence.py python/tutorials/gluon/08-warp-specialization.py
+	$(PYTEST) --tb=short -vs python/examples/gluon/01-attention-forward.py
 	TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=python/triton/instrumentation/libGPUInstrumentationTestLib.so \
 		$(PYTEST) --capture=tee-sys -rfs -vvv python/test/unit/instrumentation/test_gpuhello.py
-	$(PYTEST) -s -n $(NUM_PROCS) python/test/gluon
+	$(PYTEST) --tb=short -s -n $(NUM_PROCS) python/test/gluon

 .PHONY: test-distributed
 test-distributed: all
 	$(PYTHON) -m pip install --upgrade pip
 	$(PYTHON) -m pip install python/triton_kernels -v
-	$(PYTEST) -s python/triton_kernels/bench/distributed.py
+	$(PYTEST) --tb=short -s python/triton_kernels/bench/distributed.py

 .PHONY: test-gluon
 test-gluon: all
-	$(PYTEST) -s -n $(NUM_PROCS) python/test/gluon
-	$(PYTEST) -vs python/examples/gluon/01-attention-forward.py
+	$(PYTEST) --tb=short -s -n $(NUM_PROCS) python/test/gluon
+	$(PYTEST) --tb=short -vs python/examples/gluon/01-attention-forward.py

 .PHONY: test-regression
 test-regression: all
-	$(PYTEST) -s -n $(NUM_PROCS) python/test/regression
+	$(PYTEST) --tb=short -s -n $(NUM_PROCS) python/test/regression

 .PHONY: test-microbenchmark
 test-microbenchmark: all
 	$(PYTHON) python/test/microbenchmark/launch_overhead.py

 .PHONY: test-interpret
 test-interpret: all
-	cd python/test/unit && TRITON_INTERPRET=1 $(PYTEST) -s -n 16 -m interpreter cuda language/test_core.py language/test_standard.py \
+	cd python/test/unit && TRITON_INTERPRET=1 $(PYTEST) --tb=short -s -n 16 -m interpreter cuda language/test_core.py language/test_standard.py \
 		language/test_random.py language/test_block_pointer.py language/test_subprocess.py language/test_line_info.py \
 		language/test_tuple.py runtime/test_autotuner.py::test_kwargs[False] \
 		../../tutorials/06-fused-attention.py::test_op --device=cpu

 .PHONY: test-proton
 test-proton: all
-	$(PYTEST) -s -n 8 third_party/proton/test --ignore=third_party/proton/test/test_override.py
-	$(PYTEST) -s third_party/proton/test/test_override.py
+	$(PYTEST) --tb=short -s -n 8 third_party/proton/test --ignore=third_party/proton/test/test_override.py
+	$(PYTEST) --tb=short -s third_party/proton/test/test_override.py

 .PHONY: test-python
 test-python: test-unit test-regression test-interpret test-proton

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 13 additions & 14 deletions
@@ -112,21 +112,10 @@ LinearLayout chooseShemLayoutForRegToRegConversion(

 // The primary goal of this function is to efficiently load 2D tiles of a
 // tensor from shared memory using the `ds_read_tr` instruction for AMD GPUs.
-LinearLayout chooseDsReadB64TrLayout(Attribute enc, ArrayRef<int64_t> shape,
-                                     int32_t elemBitWidth);
-
-LinearLayout getScaleTMEMStoreLinearLayout(RankedTensorType scaleType,
-                                           int numWarps);
-
 std::optional<LinearLayout>
-getTmemLoadStoreLayout16x256(int M, int N, RankedTensorType oldType,
-                             int numWarps);
-
-// Return a layout valid for TMemLoad op for a tmem layout of block MxN that
-// distribute the data long M for the warp groups. This doesn't affect the TMem
-// layout it just returns a distributed layout compatible for tmem_load.
-LinearLayout getTmemLoadLayoutSplitLongM(int M, int N, RankedTensorType oldType,
-                                         int numWarps);
+chooseDsReadTrLayout(Attribute enc, ArrayRef<int64_t> shape,
+                     int32_t elemBitWidth, unsigned instBitWidth,
+                     unsigned numLanesInShuffleGroup);

 // Create LinearLayout for scale in scaled mfma.
 LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
@@ -162,5 +151,15 @@ std::optional<LinearLayout> chooseMfmaLikeStoreLayout(RankedTensorType valType);
 LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared,
                                        bool disableSwizzle);

+// Make a LinearLayout that maps a block-id to an N-dimensional index.
+//
+// The tensor is split up into CTAsPerCGA pieces, which are distributed among
+// the CTAsPerCGA CTAs (i.e. blocks) in the CGA (i.e. groups).
+//
+// See the nomenclature note at the top of the LinearLayoutConversions.cpp file
+// for an explanation of why this is called makeCgaLayout when it accepts a
+// CTALayoutAttr.
+LinearLayout makeCgaLayout(CTALayoutAttr layout);
+
 } // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
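The new `makeCgaLayout` declaration is documented above as the mapping from a block-id to an N-dimensional index. A minimal usage sketch, not part of this commit: `tensorTy` is an assumed in-scope RankedTensorType, and the `getCTALayout` getter mirrors the call that appears in the RelayoutTritonGPU.cpp hunk later in this diff.

// Sketch only: derive the CGA (block-id -> N-D index) layout for a tensor
// whose encoding carries a CTALayoutAttr.
auto ctaLayout = triton::gpu::getCTALayout(tensorTy.getEncoding());
LinearLayout cgaLayout = triton::gpu::makeCgaLayout(ctaLayout);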

include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h

Lines changed: 52 additions & 7 deletions
@@ -29,6 +29,7 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
+#include "llvm/Support/ErrorHandling.h"

 // TritonNvidiaGPU depends on Triton
 #include "triton/Dialect/Triton/IR/Dialect.h"
@@ -61,24 +62,68 @@ struct TMemAllocation {
   int numCols;
 };

+// Used to describe the layout of the TMEM load/store instructions
+enum class TMemAccessAtom { I32x32b, I16x64b, I16x128b, I16x256b, I16x32bx2 };
+
+inline int getElementsPerThread(TMemAccessAtom atom) {
+  switch (atom) {
+  case TMemAccessAtom::I32x32b:
+  case TMemAccessAtom::I16x64b:
+  case TMemAccessAtom::I16x32bx2:
+    return 1;
+  case TMemAccessAtom::I16x128b:
+    return 2;
+  case TMemAccessAtom::I16x256b:
+    return 4;
+  }
+  llvm_unreachable("Unknown TMemAccessAtom");
+}
+
+inline const char *getOpShape(TMemAccessAtom atom) {
+  switch (atom) {
+  case TMemAccessAtom::I32x32b:
+    return "32x32b";
+  case TMemAccessAtom::I16x64b:
+    return "16x64b";
+  case TMemAccessAtom::I16x128b:
+    return "16x128b";
+  case TMemAccessAtom::I16x256b:
+    return "16x256b";
+  case TMemAccessAtom::I16x32bx2:
+    return "16x32bx2";
+  }
+  llvm_unreachable("Unknown TMemAccessAtom");
+}
+
+LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom,
+                           bool unpacked);
+
 TMemAllocation getTmemAllocSizes(gpu::MemDescType memDescType);

-gpu::DistributedEncodingTrait getTmemCompatibleLayout(unsigned M, unsigned N,
-                                                      RankedTensorType oltType,
-                                                      unsigned numWarps);
-gpu::DistributedEncodingTrait
+SmallVector<gpu::DistributedEncodingTrait>
+getTmemCompatibleLayouts(gpu::MemDescType memType, unsigned numWarps,
+                         ArrayRef<int64_t> ctaSplit = {1, 1});
+
+std::optional<gpu::DistributedEncodingTrait>
 getTmemLoadLayoutSplitLongM(RankedTensorType tensorType,
                             gpu::MemDescType memType, int numWarps);
+
 SmallVector<gpu::DistributedEncodingTrait>
 getTmemCompatibleLayouts(Operation *op, RankedTensorType tensorType,
                          gpu::MemDescType memType);

 bool isDistributedLayoutTMemCompatible(Operation *op,
                                        RankedTensorType tensorType,
                                        gpu::MemDescType memType);
-bool isDistributedLayoutSplitMTmemLoadStore(RankedTensorType tensorType,
-                                            gpu::MemDescType memType,
-                                            int numWarps);
+
+gpu::DistributedEncodingTrait
+getDefaultLayoutForTmemLdSt(gpu::MemDescType memType, unsigned numWarps,
+                            gpu::CTALayoutAttr ctaLayout);
+
+std::optional<LinearLayout>
+getDistributedLayoutForTmemLdSt(gpu::MemDescType memType, TMemAccessAtom atom,
+                                unsigned numWarps,
+                                gpu::CTALayoutAttr ctaLayout);

 } // namespace mlir::triton::nvidia_gpu
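The `TMemAccessAtom` helpers added above are self-contained inline functions, so their behavior can be shown directly. A minimal sketch, not part of the commit:

// Sketch only: a 16x256b access atom carries 4 elements per thread and maps
// to the "16x256b" shape string for that TMEM access.
TMemAccessAtom atom = TMemAccessAtom::I16x256b;
int elemsPerThread = getElementsPerThread(atom); // == 4
const char *shape = getOpShape(atom);            // == "16x256b"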

include/triton/Dialect/TritonNvidiaGPU/IR/TensorMemoryUtils.h

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
+#ifndef TRITON_DIALECT_TRITONNVIDIAGPU_IR_TENSORMEMORYUTILS_H_
+#define TRITON_DIALECT_TRITONNVIDIAGPU_IR_TENSORMEMORYUTILS_H_
+
+#include "mlir/IR/BuiltinTypes.h"
+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+#include "triton/Tools/LinearLayout.h"
+
+#include <cstdint>
+#include <functional>
+#include <optional>
+
+namespace mlir::triton::nvidia_gpu {
+
+// Get the maximum number of registers per thread based on the context. This is
+// by default 256, but it can be overridden by `ttg.maxnreg` set on the module
+// or a contextual register limit set by the compiler on partitions.
+int getContextualMaxNReg(Operation *op);
+struct TMemLdStEncodingInfo {
+  TMemAccessAtom atom;
+  LinearLayout reps;
+  ColumnAction perm;
+  int numRegsPerMessage;
+  std::optional<uint32_t> secondHalfOffset;
+  std::optional<ColumnAction> broadcast = std::nullopt;
+  bool unpacked = false;
+  unsigned vec = 1;
+  bool padding = false;
+};
+
+FailureOr<TMemLdStEncodingInfo>
+computeTMemLdStEncodingInfo(RankedTensorType regTy, gpu::MemDescType memTy,
+                            int maxnreg,
+                            std::function<InFlightDiagnostic()> emitError = {});
+
+} // namespace mlir::triton::nvidia_gpu
+
+#endif // TRITON_DIALECT_TRITONNVIDIAGPU_IR_TENSORMEMORYUTILS_H_
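A hedged sketch, not from the commit, of how a lowering might consume `computeTMemLdStEncodingInfo`: `regTy`, `memTy`, and `op` are assumed to be in scope, and the register cap comes from the `getContextualMaxNReg` helper declared above.

// Sketch only: regTy, memTy, and op are assumed from the surrounding pass.
FailureOr<TMemLdStEncodingInfo> info =
    computeTMemLdStEncodingInfo(regTy, memTy, getContextualMaxNReg(op));
if (failed(info))
  return failure();
int regsPerMessage = info->numRegsPerMessage; // registers moved per TMEM message
TMemAccessAtom atom = info->atom;             // access atom chosen for the transfer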

include/triton/Tools/LinearLayout.h

Lines changed: 12 additions & 0 deletions
@@ -558,6 +558,18 @@ class LinearLayout {
     return reshapeOuts({{*getOutDimNames().begin(), getTotalOutDimSize()}});
   }

+  [[nodiscard]] LinearLayout renameInDim(StringAttr oldDim,
+                                         StringAttr newDim) const {
+    auto bases = getBases();
+    auto it = bases.find(oldDim);
+    assert(it != bases.end());
+    auto value = std::move(it->second);
+    bases.erase(it);
+    bases.insert({newDim, std::move(value)});
+    return LinearLayout(bases, getOutDims(),
+                        /*requireSurjective=*/isSurjective());
+  }
+
   // Concatenates two layouts by their in (resp. out) dimensions. The layouts
   // must have the same output (resp. input) dimensions and sizes and different
   // input (resp. output) dimensions. The input dimensions of this layout are
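Since `renameInDim` is defined inline above, a short usage sketch suffices; it is not part of the commit, and `layout` and `ctx` are assumed to exist in the caller.

// Sketch only: rename the "lane" input dimension to "thread"; the basis
// vectors and output dimensions are left untouched.
StringAttr kLane = StringAttr::get(ctx, "lane");
StringAttr kThread = StringAttr::get(ctx, "thread");
LinearLayout renamed = layout.renameInDim(kLane, kThread);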

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 0 additions & 40 deletions
@@ -35,46 +35,6 @@ static int __builtin_ctz(unsigned x) {

 #endif

-// This reverts #5645, because it introduced increased register pressure in AMD
-// backend.
-// TODO: remove when new implementation performance reaches target level
-namespace {
-
-LinearLayout getRegToSharedLayout(MLIRContext *ctx, ArrayRef<int64_t> shape,
-                                  LinearLayout regLayout,
-                                  triton::gpu::SharedEncodingTrait dstEnc,
-                                  int elemBitWidth,
-                                  ArrayRef<int64_t> allocShape) {
-  StringAttr kBlock = StringAttr::get(ctx, ("block"));
-  int rank = shape.size();
-
-  LinearLayout sharedLayout =
-      triton::gpu::toLinearLayout(allocShape.take_back(rank), dstEnc);
-  auto sharedOrder = triton::gpu::getOrder(dstEnc, shape);
-
-  // sharedLayout's in-dims are currently (offset, block). Reshape to
-  // (offsetX1, offsetX2, ..., block) so that we can apply the N-dimensional
-  // shmem strides. (The offsetX's appear in minor-to-major order.)
-  auto sharedLegacy = cast<triton::gpu::SwizzledSharedEncodingAttr>(dstEnc);
-  SmallVector<std::pair<StringAttr, int32_t>> multiDimSharedSize;
-  for (int i = 0; i < rank; i++) {
-    int dim = sharedOrder[i];
-    int64_t size = std::max(
-        int64_t{1},
-        shape[dim] / sharedLegacy.getCTALayout().getCTASplitNum()[dim]);
-    multiDimSharedSize.push_back(
-        {StringAttr::get(ctx, ("offset" + std::to_string(dim))), size});
-  }
-  multiDimSharedSize.push_back({kBlock, sharedLayout.getInDimSize(kBlock)});
-  sharedLayout = sharedLayout.reshapeIns(multiDimSharedSize);
-
-  // regToSharedLayout maps from (register, lane, warp, block) to (offsetX1,
-  // ..., offsetXN, block), where the offsetX's are in minor-to-major order.
-  return regLayout.invertAndCompose(sharedLayout);
-}
-
-} // namespace
-
 namespace mlir {

 namespace triton::gpu {

lib/Conversion/TritonToTritonGPU/RelayoutTritonGPU.cpp

Lines changed: 3 additions & 9 deletions
@@ -21,16 +21,10 @@ namespace ttng = triton::nvidia_gpu;
 RankedTensorType getTMEMTensorLayout(const TypeConverter *tc,
                                      RankedTensorType type, MemDescType memdesc,
                                      unsigned numWarps) {
-  Attribute encoding;
   type = cast<RankedTensorType>(tc->convertType(type));
-  if (isa<ttng::TensorMemoryScalesEncodingAttr>(memdesc.getEncoding())) {
-    encoding = LinearEncodingAttr::get(
-        type.getContext(), getScaleTMEMStoreLinearLayout(type, numWarps));
-  } else {
-    auto tmemEnc = cast<ttng::TensorMemoryEncodingAttr>(memdesc.getEncoding());
-    encoding = ttng::getTmemCompatibleLayout(
-        tmemEnc.getBlockM(), tmemEnc.getBlockN(), type, numWarps);
-  }
+  auto ctaLayout = getCTALayout(type.getEncoding());
+  auto encoding =
+      ttng::getDefaultLayoutForTmemLdSt(memdesc, numWarps, ctaLayout);
   return type.cloneWithEncoding(encoding);
 }

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -577,17 +577,6 @@ static LogicalResult parseBool(AsmParser &parser, const NamedAttribute &attr,
577577
return parseBoolAttrValue(parser, attr.getValue(), value, desc);
578578
};
579579

580-
static LogicalResult parseType(AsmParser &parser, const NamedAttribute &attr,
581-
Type &value, StringRef desc) {
582-
auto typeAttr = mlir::dyn_cast<TypeAttr>(attr.getValue());
583-
if (!typeAttr) {
584-
parser.emitError(parser.getNameLoc(), "expected a Type in ") << desc;
585-
return failure();
586-
}
587-
value = typeAttr.getValue();
588-
return success();
589-
}
590-
591580
std::optional<LinearLayout>
592581
parseLinearLayout(const DictionaryAttr &dict, AsmParser &parser,
593582
ArrayRef<std::string> inDimNames) {
