Commit 3f4fdd1

Merge commit '4d2e9e5de96a5d6ea163f2de04ae5c5b6be45825'
2 parents: 492ea92 + 4d2e9e5

32 files changed: +378, -168 lines

bin/CMakeLists.txt

Lines changed: 0 additions & 17 deletions
@@ -7,13 +7,7 @@ add_llvm_executable(triton-opt triton-opt.cpp PARTIAL_SOURCES_INTENDED)
 # TODO: what's this?
 llvm_update_compile_flags(triton-opt)
 target_link_libraries(triton-opt PRIVATE
-  TritonLLVMIR
-  TritonAnalysis
-  TritonTransforms
-  TritonGPUTransforms
-  TritonNvidiaGPUTransforms
   TritonIntelLLVMIR
-  MLIRGPUToROCDLTransforms
   ${dialect_libs}
   ${conversion_libs}
   ${triton_libs}
@@ -32,11 +26,6 @@ mlir_check_all_link_libraries(triton-reduce)

 llvm_update_compile_flags(triton-reduce)
 target_link_libraries(triton-reduce PRIVATE
-  TritonLLVMIR
-  TritonAnalysis
-  TritonTransforms
-  TritonGPUTransforms
-  TritonNvidiaGPUTransforms
   ${dialect_libs}
   ${conversion_libs}
   ${triton_libs}
@@ -54,10 +43,6 @@ add_llvm_executable(triton-lsp triton-lsp.cpp PARTIAL_SOURCES_INTENDED)

 llvm_update_compile_flags(triton-lsp)
 target_link_libraries(triton-lsp PRIVATE
-  TritonAnalysis
-  TritonTransforms
-  TritonGPUTransforms
-  TritonNvidiaGPUTransforms
   ${dialect_libs}
   ${conversion_libs}
   ${triton_libs}
@@ -96,8 +81,6 @@ export_executable_symbols_for_plugins(triton-llvm-opt)

 add_llvm_executable(triton-tensor-layout triton-tensor-layout.cpp PARTIAL_SOURCES_INTENDED)
 target_link_libraries(triton-tensor-layout PRIVATE
-  TritonGPUIR
-  TritonNvidiaGPUIR
   ${triton_libs}
   ${conversion_libs}
   ${dialect_libs}

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 6 additions & 6 deletions
@@ -1154,15 +1154,15 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
 // Returns true on success.
 [[nodiscard]] bool emitTransferBetweenRegistersAndShared(
     RankedTensorType registerTy, triton::gpu::MemDescType sharedTy,
-    Type elemLlvmTy, std::optional<int32_t> maxVecElems, Value shmemBase,
-    ArrayRef<Value> shmemStrides, Location loc, RewriterBase &rewriter,
+    Type elemLlvmTy, std::optional<int32_t> maxVecElems,
+    const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
     const TargetInfoBase &target,
     std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);

 inline DenseMap<unsigned, Value> getSwizzledSharedPtrs(
     Location loc, const TargetInfoBase &target, unsigned inVec,
     RankedTensorType srcTy, triton::gpu::SharedEncodingAttr resSharedLayout,
-    Type resElemTy, SharedMemoryObject smemObj, RewriterBase &rewriter,
+    Type resElemTy, const SharedMemoryObject &smemObj, RewriterBase &rewriter,
     ArrayRef<Value> offsetVals, ArrayRef<Value> srcStrides) {
   // This utility computes the pointers for accessing the provided swizzled
   // shared memory layout `resSharedLayout`. More specifically, it computes,
@@ -1324,14 +1324,14 @@ inline DenseMap<unsigned, Value> getSwizzledSharedPtrs(
 SmallVector<Value> loadSharedToDistributed(RankedTensorType dstTy,
                                            triton::gpu::MemDescType srcTy,
                                            Type elemLlvmTy,
-                                           SharedMemoryObject smemObj,
+                                           const SharedMemoryObject &smemObj,
                                            Location loc, RewriterBase &rewriter,
                                            const TargetInfoBase &target);

 void storeDistributedToShared(
     triton::gpu::MemDescType dstTy, RankedTensorType srcTy, Type elemLlvmTy,
-    ArrayRef<Value> srcVals, Value smemBase, ArrayRef<Value> dstStrides,
-    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
+    ArrayRef<Value> srcVals, const SharedMemoryObject &smemObj, Location loc,
+    RewriterBase &rewriter, const TargetInfoBase &target,
     std::pair<size_t, Type> *const llvmOpCount = nullptr);

 inline Value getStructFromSharedMemoryObject(Location loc,
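
These signatures now take the whole SharedMemoryObject rather than a separate base pointer and stride list; the object also carries the view's offsets, which the new swizzled-subview path further down depends on. A minimal self-contained toy of the idea, with plain integers standing in for MLIR values (all names here are illustrative, not Triton's types):

#include <cassert>
#include <vector>

// Toy stand-in for Triton's SharedMemoryObject: a base address plus
// per-dimension strides and the offsets of this view into the allocation.
struct SmemObjModel {
  int base;
  std::vector<int> strides;
  std::vector<int> offsets; // nonzero after a subview
};

// Address of a multi-dim element, using everything the object carries.
// With only (base, strides), the view's offsets would be lost.
int elementAddr(const SmemObjModel &o, const std::vector<int> &idx) {
  int addr = o.base;
  for (size_t d = 0; d < idx.size(); ++d)
    addr += (o.offsets[d] + idx[d]) * o.strides[d];
  return addr;
}

int main() {
  // A 4x8 allocation; a subview starting at row 2, column 4.
  SmemObjModel view{/*base=*/0, /*strides=*/{8, 1}, /*offsets=*/{2, 4}};
  assert(elementAddr(view, {0, 0}) == 2 * 8 + 4); // view origin
  assert(elementAddr(view, {1, 2}) == 3 * 8 + 6);
}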

include/triton/Tools/LinearLayout.h

Lines changed: 7 additions & 0 deletions
@@ -414,6 +414,10 @@ class LinearLayout {

   bool isSurjective() const { return surjective; }

+  bool isInvertible() const {
+    return surjective && getTotalInDimSize() == getTotalOutDimSize();
+  }
+
   const BasesT &getBases() const { return bases; }

   // Get the pos'th basis vector for the inDim -> outDim mapping.
@@ -673,6 +677,9 @@ class LinearLayout {
   // don't place any guarantees on the choices made by this function.
   [[nodiscard]] LinearLayout invertAndCompose(const LinearLayout &outer) const;

+  // Get the layout that is the inverse of this layout.
+  [[nodiscard]] LinearLayout invert() const;
+
   // For each in-dim, returns a bitmask of the "free variables" in the layout
   // function.
   //
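
isInvertible() reduces invertibility to two checks because a LinearLayout is a linear map over GF(2): it is invertible exactly when it is surjective (its bases span the full output space) and square (total in-dim size equals total out-dim size). A self-contained toy of that rank condition, not Triton's implementation:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Rank over GF(2) of the images of the input basis vectors,
// by Gaussian elimination with XOR as row addition.
int rankGF2(std::vector<uint32_t> basis) {
  int rank = 0;
  for (int bit = 31; bit >= 0; --bit) {
    auto it = std::find_if(basis.begin() + rank, basis.end(),
                           [&](uint32_t v) { return (v >> bit) & 1u; });
    if (it == basis.end())
      continue;
    std::swap(*it, basis[rank]);
    for (size_t j = 0; j < basis.size(); ++j)
      if ((int)j != rank && ((basis[j] >> bit) & 1u))
        basis[j] ^= basis[rank];
    ++rank;
  }
  return rank;
}

int main() {
  // 3 input bits -> 3 output bits; each entry is the image of one input bit.
  std::vector<uint32_t> bases = {0b001, 0b011, 0b100}; // full rank
  int outBits = 3;
  bool surjective = rankGF2(bases) == outBits;   // "isSurjective"
  bool square = bases.size() == (size_t)outBits; // in size == out size
  std::cout << "invertible: " << (surjective && square) << "\n"; // prints 1
}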

lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 5 additions & 12 deletions
@@ -25,11 +25,9 @@ void lowerDistributedToShared(
   auto outOrd = mlir::cast<SharedEncodingAttr>(dstTy.getEncoding()).getOrder();
   auto elemTy = typeConverter->convertType(srcTy.getElementType());

-  auto smemBase = smemObj.getBase();
-  auto dstStrides = smemObj.getStrides();
   auto inVals = unpackLLElements(loc, adaptorSrc, rewriter);
-  storeDistributedToShared(dstTy, srcTy, elemTy, inVals, smemBase, dstStrides,
-                           loc, rewriter, targetInfo, llvmOpCount);
+  storeDistributedToShared(dstTy, srcTy, elemTy, inVals, smemObj, loc, rewriter,
+                           targetInfo, llvmOpCount);
 }

 struct GlobalScratchAllocOpConversion
@@ -157,14 +155,9 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
       // If we remove this one, ldmatrix will IMA. It can probably be relaxed
       // though
       canUseLdmatrix &=
-          srcTy.getShape()[0] >= 8 && srcTy.getShape()[1] >= 4 * kWidth;
-      // To be removed in https://github.com/triton-lang/triton/pull/5154
-      bool legacyLoweringIsBuggy =
-          (kWidth >= 8 || (kWidth == 4 && bitwidth == 32) ||
-           dstTy.getRank() == 3) &&
-          mma.isAmpere();
-      return (mma.isHopper() && !canUseLdmatrix) ||
-             (mma.isAmpere() && legacyLoweringIsBuggy);
+          srcTy.getShape()[0] >= 8 &&
+          srcTy.getShape()[1] >= 4 * kWidth && dstTy.getRank() <= 2;
+      return !canUseLdmatrix;
     }
     if (isa<AMDMfmaEncodingAttr>(dot.getParent()))
       return true;
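
The rewritten guard drops the Ampere-only legacy-lowering workaround and keeps a single canUseLdmatrix predicate over the tile shape. A standalone restatement of the new condition with sample values (toy function; the surrounding code also folds in earlier encoding and kWidth checks before this point):

#include <cassert>

// Restated guard: ldmatrix needs at least an 8-row tile, a contiguous
// dimension covering 4 * kWidth elements, and rank at most 2.
bool canUseLdmatrix(int rows, int cols, int kWidth, int rank) {
  return rows >= 8 && cols >= 4 * kWidth && rank <= 2;
}

int main() {
  assert(canUseLdmatrix(16, 16, 4, 2));  // typical 16x16 tile: ok
  assert(!canUseLdmatrix(4, 16, 4, 2));  // too few rows
  assert(!canUseLdmatrix(16, 8, 4, 2));  // contiguous dim < 4 * kWidth
  assert(!canUseLdmatrix(16, 16, 4, 3)); // rank-3 tiles not supported
}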

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 141 additions & 29 deletions
@@ -169,10 +169,139 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
   return ret;
 }

+namespace {
+
+Value getSmemVecAddr(RankedTensorType registerTy,
+                     triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
+                     Location loc, RewriterBase &rewriter,
+                     const LinearLayout &regToSharedLayout, Value regId,
+                     Value laneId, Value warpId,
+                     const SharedMemoryObject &smemObj) {
+  MLIRContext *ctx = rewriter.getContext();
+  StringAttr kBlock = str_attr("block");
+  StringAttr kRegister = str_attr("register");
+  StringAttr kLane = str_attr("lane");
+  StringAttr kWarp = str_attr("warp");
+  auto shape = sharedTy.getShape();
+  auto rank = shape.size();
+  auto allocShape = sharedTy.getAllocShape();
+  auto sharedEnc =
+      dyn_cast<triton::gpu::SharedEncodingAttr>(sharedTy.getEncoding());
+
+  auto smemBase = smemObj.getBase();
+  auto sharedOrder = triton::gpu::getOrder(sharedTy.getEncoding());
+  auto smemOffsets = smemObj.getOffsets();
+  auto smemStrides = smemObj.getStrides();
+  Value smemOffset;
+  // When loading or storing to shared memory, we consider two cases for
+  // performance reasons:
+  //
+  // 1. Non-swizzled shared memory.
+  // 2. Swizzled shared memory.
+  //
+  // Consider lowering `ttg.local_load %a`. In the first case, we can
+  // directly construct a linear layout using `%a`'s shape and shared memory
+  // encoding, irrespective of `%a`'s rank or whether it represents a slice of a
+  // larger tensor.
+  //
+  // The method does not apply for swizzled shared memory in some scenarios.
+  // Key properties of swizzling in Triton are:
+  //
+  // - Swizzling applies only to tensors with rank >= 2.
+  // - It is restricted to the last two dimensions of the tensor.
+  // - These last two dimensions are always treated as the most "minor."
+  //
+  // An important edge case arises when `%a` results from `%a = ttg.subview %b`,
+  // where `%b` is swizzled (and so is `%a`). In this case, constructing a
+  // layout and determining shared memory offsets using `%a`'s shape is
+  // incorrect. This is because swizzling depends on the original shape of `%b`,
+  // which differs from `%a`'s shape. As a result, some locations may fall
+  // outside `%a`'s contiguous view of memory. Specifically, an element `[i
+  // (row_idx), j (col_idx)]` in `%a` might map to `[i, j']` after swizzling,
+  // where `j'` lies outside `%a`'s shape but still within `%b`'s shape.
+  //
+  // We propose case 2 (see comments below), which provides a more general
+  // solution for all swizzled shared memory scenarios, including the edge case
+  // mentioned above.
+  if (/*no swizzling*/ sharedEnc.getMaxPhase() == 1 ||
+      /*swizzling but same shape*/ shape == allocShape ||
+      /*swizzling and rank-reduced and rank >= 2*/
+      (shape == allocShape.take_back(rank) && rank >= 2)) { // Case 1
+    // Get the address to load/store. The multi-dim address is (offsetX1, ...,
+    // offsetXN, block), where the offsets appear in minor-to-major order, and
+    // we drop_end to drop block, which we know from above will be 0.
+    smemOffsets = llvm::to_vector(llvm::drop_end(llvm::make_second_range(
+        applyLinearLayout(loc, rewriter, regToSharedLayout,
+                          {{kRegister, regId},
+                           {kLane, laneId},
+                           {kWarp, warpId},
+                           {kBlock, i32_val(0)}}))));
+    // Reorder strides according to `order`. This way they match the
+    // multi-dimensional offsets in regToSharedLayout.
+    smemOffset = dot(rewriter, loc, smemOffsets,
+                     applyPermutation(smemStrides, sharedOrder));
+  } else { // Case 2 -> rank-reduced swizzling
+    assert(rank >= 2 && "Swizzling only applies to tensors with rank >= 2");
+    // We define both tensor offsets and shared memory offsets:
+    //
+    // - Tensor offsets: Relative offsets within a given tensor.
+    // - Shared memory offsets: Absolute offsets within the shared memory.
+    //
+    // In Triton, the shared memory layout provides an invertible, one-to-one
+    // mapping between tensor offsets and shared memory offsets. The `base`
+    // field of any shared memory object represents both the shared memory
+    // offset and the tensor offset relative to the original tensor at
+    // allocation, prior to any subview operations.
+    //
+    // To determine the shared memory offsets for a specific register when
+    // dealing with swizzled and sliced tensors, the process involves:
+    //
+    // 1. Retrieving the original tensor's `invertAllocSharedLayout`, which
+    //    maps the allocated tensor's offsets back to shared memory offsets.
+    // 2. Reconstructing the register's offsets in the allocated tensor by
+    //    summing:
+    //    - The shared memory offsets of the current view's base, and
+    //    - The relative tensor offsets of the register.
+    //
+    // This approach ensures that "absolute" tensor offsets can be
+    // mapped to the correct shared memory addresses using
+    // `invertAllocSharedLayout`.
+    std::optional<LinearLayout> regLayout =
+        triton::gpu::toLinearLayout(shape, registerTy.getEncoding());
+    auto allocSharedLayout = triton::gpu::toLinearLayout(
+        allocShape.take_back(rank), sharedTy.getEncoding(),
+        elemLlvmTy.getIntOrFloatBitWidth());
+    assert(allocSharedLayout.has_value() &&
+           "Failed to convert layout to linear layout");
+    auto invertAllocSharedLayout = allocSharedLayout->invert();
+    auto multiDimTensorOffsets =
+        llvm::to_vector(applyLinearLayout(loc, rewriter, *regLayout,
+                                          {{kRegister, regId},
+                                           {kLane, laneId},
+                                           {kWarp, warpId},
+                                           {kBlock, i32_val(0)}}));
+    for (auto i = 0; i < rank; i++) {
+      multiDimTensorOffsets[i].second =
+          add(multiDimTensorOffsets[i].second, smemOffsets[i]);
+    }
+    smemOffset = applyLinearLayout(loc, rewriter, invertAllocSharedLayout,
+                                   multiDimTensorOffsets)[0]
+                     .second;
+    Value baseToAllocBaseDist = dot(rewriter, loc, smemOffsets, smemStrides);
+    smemOffset = sub(smemOffset, baseToAllocBaseDist);
+  }
+  auto ptrTy = smemBase.getType();
+  auto vecAddr = gep(ptrTy, elemLlvmTy, smemBase, smemOffset);
+  vecAddr.setInbounds(true);
+  return vecAddr;
+}
+
+} // namespace
+
 bool emitTransferBetweenRegistersAndShared(
     RankedTensorType registerTy, triton::gpu::MemDescType sharedTy,
-    Type elemLlvmTy, std::optional<int32_t> maxVecElems, Value shmemBase,
-    ArrayRef<Value> shmemStrides, Location loc, RewriterBase &rewriter,
+    Type elemLlvmTy, std::optional<int32_t> maxVecElems,
+    const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
     const TargetInfoBase &target,
    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
   MLIRContext *ctx = rewriter.getContext();
@@ -230,28 +359,12 @@ bool emitTransferBetweenRegistersAndShared(

   int numElems = regToSharedLayout->getInDimSize(kRegister);
   auto vecTy = vec_ty(elemLlvmTy, vecElems);
-  auto ptrTy = shmemBase.getType();
   Value zero = i32_val(0);
   SmallVector<Value> ret;
   for (int i = 0; i < numElems / vecElems; i++) {
-    // Get the address to load/store. The multi-dim address is (offsetX1, ...,
-    // offsetXN, block), where the offsets appear in minor-to-major order, and
-    // we drop_end to drop block, which we know from above will be 0.
-    auto multiDimShmemOffset =
-        llvm::to_vector(llvm::drop_end(llvm::make_second_range(
-            applyLinearLayout(loc, rewriter, *regToSharedLayout,
-                              {{kRegister, i32_val(i * vecElems)},
-                               {kLane, laneId},
-                               {kWarp, warpId},
-                               {kBlock, zero}}))));
-
-    // Reorder strides according to `order`. This way they match the
-    // multi-dimensional offsets in regToSharedLayout.
-    auto sharedOrder = triton::gpu::getOrder(sharedTy.getEncoding());
-    Value shmemOffset = dot(rewriter, loc, multiDimShmemOffset,
-                            applyPermutation(shmemStrides, sharedOrder));
-    auto vecAddr = gep(ptrTy, elemLlvmTy, shmemBase, shmemOffset);
-    vecAddr.setInbounds(true);
+    auto vecAddr = getSmemVecAddr(
+        registerTy, sharedTy, elemLlvmTy, loc, rewriter, *regToSharedLayout,
+        i32_val(i * vecElems), laneId, warpId, smemObj);

     perVectorCallback(vecTy, vecAddr);
   }
@@ -261,14 +374,13 @@ bool emitTransferBetweenRegistersAndShared(
 SmallVector<Value> loadSharedToDistributed(RankedTensorType dstTy,
                                            triton::gpu::MemDescType srcTy,
                                            Type elemLlvmTy,
-                                           SharedMemoryObject smemObj,
+                                           const SharedMemoryObject &smemObj,
                                            Location loc, RewriterBase &rewriter,
                                            const TargetInfoBase &target) {
   SmallVector<Value> ret;
   bool success = emitTransferBetweenRegistersAndShared(
-      dstTy, srcTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemObj.getBase(),
-      smemObj.getStrides(), loc, rewriter, target,
-      [&](VectorType vecTy, Value vecAddr) {
+      dstTy, srcTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemObj, loc,
+      rewriter, target, [&](VectorType vecTy, Value vecAddr) {
        auto vecVal = load(vecTy, vecAddr);
        vecVal.setAlignment(vecTy.getNumElements() *
                            elemLlvmTy.getIntOrFloatBitWidth() / 8);
@@ -285,14 +397,14 @@ SmallVector<Value> loadSharedToDistributed(RankedTensorType dstTy,

 void storeDistributedToShared(triton::gpu::MemDescType dstTy,
                               RankedTensorType srcTy, Type elemLlvmTy,
-                              ArrayRef<Value> srcVals, Value smemBase,
-                              ArrayRef<Value> dstStrides, Location loc,
+                              ArrayRef<Value> srcVals,
+                              const SharedMemoryObject &smemObj, Location loc,
                               RewriterBase &rewriter,
                               const TargetInfoBase &target,
                               std::pair<size_t, Type> *const llvmOpCount) {
   bool success = emitTransferBetweenRegistersAndShared(
-      srcTy, dstTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemBase,
-      dstStrides, loc, rewriter, target, [&](VectorType vecTy, Value vecAddr) {
+      srcTy, dstTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemObj, loc,
+      rewriter, target, [&](VectorType vecTy, Value vecAddr) {
        ArrayRef<Value> vals = srcVals.take_front(vecTy.getNumElements());
        srcVals = srcVals.drop_front(vecTy.getNumElements());

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -394,7 +394,7 @@ struct MemDescSubviewOpConversion
     int rankReduced = srcTy.getRank() - destRank;
     for (int i = rankReduced; i < opOffsetVals.size(); i++) {
       strides.push_back(smemObj.strides[i]);
-      offsetVals.push_back(opOffsetVals[i]);
+      offsetVals.push_back(add(opOffsetVals[i], smemObj.offsets[i]));
     }
     // Compute the offset based on the original strides of the shared memory
     // object
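
This one-line change makes offsets compose across nested subviews: a subview now adds its own offsets to those recorded in the source object, so getSmemVecAddr can recover a view's absolute position in the allocation. A toy model of the accumulation (plain integers, illustrative names):

#include <cassert>
#include <vector>

struct ViewModel {
  std::vector<int> offsets; // position of this view in the allocation
};

// Before the fix: child.offsets = opOffsets (parent position lost).
// After the fix:  child.offsets = parent.offsets + opOffsets.
ViewModel subview(const ViewModel &parent, const std::vector<int> &opOffsets) {
  ViewModel child;
  for (size_t i = 0; i < opOffsets.size(); ++i)
    child.offsets.push_back(parent.offsets[i] + opOffsets[i]);
  return child;
}

int main() {
  ViewModel alloc{{0, 0}};
  ViewModel a = subview(alloc, {2, 4}); // view at (2, 4)
  ViewModel b = subview(a, {1, 1});     // nested view
  assert(b.offsets[0] == 3 && b.offsets[1] == 5); // absolute, not relative
}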

lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp

Lines changed: 1 addition & 1 deletion
@@ -272,7 +272,7 @@ createTMAAsyncCopy(scf::ForOp &forOp, tt::ExperimentalDescriptorLoadOp loadOp,

   builder.setInsertionPointAfter(viewLoad);
   auto sharedLoad = builder.createWithStage<ttg::LocalLoadOp>(
-      loc, stage, clusterId, loadOp.getType(),
+      loc, stageForFirstUse, clusterForFirstUse, loadOp.getType(),
       viewLoad /*,wait->getResult(0)*/);
   auto result = sharedLoad->getResults();
   loadOp->replaceAllUsesWith(result);

lib/Tools/LinearLayout.cpp

Lines changed: 12 additions & 0 deletions
@@ -337,6 +337,7 @@ LinearLayout::checkInvariants(bool requireSurjective) {
            "can be reached by some `in` coordinate, but was not:" +
            toString();
   }
+
   return std::nullopt;
 }

@@ -918,6 +919,17 @@ LinearLayout LinearLayout::invertAndCompose(const LinearLayout &outer) const {
   return flatComposed.reshapeIns(retInDims).reshapeOuts(retOutDims);
 }

+LinearLayout LinearLayout::invert() const {
+  // A^-1(x) = A^-1(I(x)), thus A.invert() = I.invertAndCompose(A)
+  assert(isInvertible() &&
+         "A linear layout must be surjective and square to be invertible");
+  LinearLayout identity = LinearLayout::empty();
+  for (auto outDim : getOutDimNames()) {
+    identity *= LinearLayout::identity1D(getOutDimSize(outDim), outDim, outDim);
+  }
+  return identity.invertAndCompose(*this);
+}
+
 llvm::MapVector<StringAttr, int32_t>
 LinearLayout::getFreeVariableMasks() const {
   std::unique_ptr<uint64_t[]> mat = getMatrix(*this);
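
invert() leans on the contract the in-diff comment states: X.invertAndCompose(A) yields A^-1 composed with X, so composing the identity with A leaves exactly A^-1. A self-contained GF(2) round-trip check on a small XOR swizzle (toy bit-matrix code, not Triton's LinearLayout):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Apply a GF(2) linear map: XOR the images of the set input bits.
uint32_t apply(const std::vector<uint32_t> &bases, uint32_t x) {
  uint32_t y = 0;
  for (size_t b = 0; b < bases.size(); ++b)
    if ((x >> b) & 1u)
      y ^= bases[b];
  return y;
}

// Invert a square, full-rank GF(2) map by Gauss-Jordan elimination,
// applying the same elementary operations to an identity matrix.
std::vector<uint32_t> invert(std::vector<uint32_t> a) {
  size_t n = a.size();
  std::vector<uint32_t> inv(n);
  for (size_t i = 0; i < n; ++i)
    inv[i] = 1u << i;
  for (size_t col = 0; col < n; ++col) {
    size_t pivot = col;
    while (!((a[pivot] >> col) & 1u))
      ++pivot; // full rank => a pivot exists
    std::swap(a[col], a[pivot]);
    std::swap(inv[col], inv[pivot]);
    for (size_t row = 0; row < n; ++row)
      if (row != col && ((a[row] >> col) & 1u)) {
        a[row] ^= a[col];
        inv[row] ^= inv[col];
      }
  }
  return inv;
}

int main() {
  // 3-bit swizzle: out = in XOR (in >> 2); bases are the images of e0..e2.
  std::vector<uint32_t> A = {0b001, 0b010, 0b101};
  auto Ainv = invert(A);
  for (uint32_t x = 0; x < 8; ++x)
    assert(apply(Ainv, apply(A, x)) == x); // A^-1 o A = I
}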
