
Commit 6b06242

[BACKEND] Define the semantics of memdesc_subview (#6886)
We strictly define the semantics of `memdesc_subview`: when the subview is rank-reducing, it allows arbitrary (even runtime) skips along the 0-th dimension; otherwise, offsets must be compile-time constants that don't touch the swizzling pattern. We implement a generic lowering that handles arbitrary layouts under these conditions. The conditions can be relaxed and generalised in the future if needed.

1 parent: 08973b1

File tree: 9 files changed, +195 −80 lines

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 17 additions & 5 deletions
@@ -208,14 +208,26 @@ def TTG_MemDescSubviewOp : TTG_Op<"memdesc_subview", [Pure, MemDescViewTrait]> {

   let description = [{
     This operation returns a new descriptor representing a subview of the buffer.
-    It doesn't affect the underlying memory. The subview can be rank-reduced.
+    It doesn't affect the underlying memory.

     For example, suppose that
     - the input shape is 2x4x16xf16,
-    - the output shape is 4x4xf16, and
-    - offsets = [1, 0, 4].
-
-    Then in Python syntax, the subview covers input[1][0:4][4:8].
+    - the output shape is 4x16xf16, and
+    - offsets = [1, 0, 0].
+
+    Then in Python syntax, the subview covers input[1].
+
+    Just one dimension may be split (at most one non-zero offset).
+
+    When the input shape and the output shape have different rank,
+    or the output shape is a 1D tensor of 1 element:
+    - The rank of the output must be one smaller than that of the input.
+    - We assume the input is split along the 0th dimension.
+    - The offset along the 0th dimension may be a runtime value.
+    When the input and the output have the same rank:
+    - The offset must be a compile-time constant
+    - Larger or equal to the tile of the tensor (or zero)
+    - That does not split the input along the swizzling pattern (if any)
   }];
   let arguments = (
       ins TTG_MemDescType:$src, Variadic<I32>:$offsets);
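To make the two cases concrete, here is a rough sketch in ttg IR (the #shared/#smem attributes and SSA names are placeholders in the spirit of the tests below, not part of this diff):

    %c0 = arith.constant 0 : i32
    %c16 = arith.constant 16 : i32
    // Rank-reducing: the runtime index %i picks one 4x16 slice, i.e. input[%i].
    %slice = ttg.memdesc_subview %buf[%i, %c0, %c0] : !ttg.memdesc<2x4x16xf16, #shared, #smem, mutable> -> !ttg.memdesc<4x16xf16, #shared, #smem, mutable, 2x4x16>
    // Same rank: one dimension split by a compile-time constant that is a
    // multiple of the tile and doesn't cut through the swizzling pattern.
    %half = ttg.memdesc_subview %buf2[%c0, %c16] : !ttg.memdesc<64x64xf16, #shared, #smem, mutable> -> !ttg.memdesc<64x16xf16, #shared, #smem, mutable, 64x64>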

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 30 additions & 1 deletion
@@ -3,6 +3,7 @@
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Types.h"
+#include "triton/Tools/LayoutUtils.h"

 using namespace mlir;
 using namespace mlir::triton;
@@ -421,6 +422,7 @@ struct MemDescSubviewOpConversion
   matchAndRewrite(triton::gpu::MemDescSubviewOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
+    auto *ctx = op->getContext();
     auto b = TritonLLVMOpBuilder(loc, rewriter);
     auto srcTy = op.getSrc().getType();
     auto destTy = op.getResult().getType();
@@ -433,15 +435,42 @@ struct MemDescSubviewOpConversion
                                 llvmElemTy, rewriter);
     auto smemStrides = smemObj.getStrides(srcTy, loc, rewriter);
     SmallVector<Value> opOffsetVals = op.getOffsets();
+    // We assume we always create a subview of the last dimensions
     SmallVector<Value> opSmemStrides(smemStrides.end() - opOffsetVals.size(),
                                      smemStrides.end());
+    // Compute total offset
     SmallVector<Value> offsetVals;
    auto destRank = op.getResult().getType().getRank();
    auto rankReduced = srcTy.getRank() - destRank;
    for (int i = rankReduced; i < opOffsetVals.size(); i++) {
      offsetVals.push_back(b.add(opOffsetVals[i], smemObj.getOffsets()[i]));
    }
-    Value offset = dot(rewriter, loc, opOffsetVals, opSmemStrides);
+
+    Value offset;
+    if (rankReduced || (destTy.getRank() == 1 && destTy.getDimSize(0) == 1)) {
+      // We are splitting the pipelining dimension which may not be a power
+      // of 2 so we can't use LinearLayouts
+      offset = dot(rewriter, loc, opOffsetVals, opSmemStrides);
+    } else {
+      auto dimNames = standardOutDimNames(ctx, opOffsetVals.size());
+      SmallVector<std::pair<StringAttr, Value>> logicalOffsets;
+      // This assumes the subviews are additive, in the sense that we can
+      // compute the offset of one and add it to the offset of the previous
+      // one we computed. We check for this in the verifier.
+      for (int i = 0; i < rankReduced; i++) {
+        logicalOffsets.push_back({dimNames[i], b.i32_val(0)});
+      }
+      for (int i = rankReduced; i < opOffsetVals.size(); i++) {
+        logicalOffsets.push_back({dimNames[i], offsetVals[i - rankReduced]});
+      }
+      // The order gives us the honest-to-goodness layout rank
+      auto srcAllocShape =
+          srcTy.getAllocShape().take_back(getOrder(srcTy).size());
+      auto llInv = toLinearLayout(srcAllocShape, srcTy.getEncoding()).invert();
+      offset =
+          applyLinearLayout(loc, rewriter, llInv, logicalOffsets)[0].second;
+    }
+
     auto base = smemObj.getBase();
     auto elemPtrTy = base.getType();
     smemObj = SharedMemoryObject(b.gep(elemPtrTy, llvmElemTy, base, offset),
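A standalone sketch (not the Triton API) of what the rank-reducing branch computes: the element offset is the dot product of the subview offsets and the shared-memory strides, which the `dot(rewriter, loc, opOffsetVals, opSmemStrides)` call above emits as LLVM mul/add instructions over i32 Values. Plain integers stand in for the SSA values:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Dot product of logical offsets and strides: the linear element offset.
    int64_t subviewOffset(const std::vector<int64_t> &offsets,
                          const std::vector<int64_t> &strides) {
      assert(offsets.size() == strides.size());
      int64_t off = 0;
      for (size_t i = 0; i < offsets.size(); ++i)
        off += offsets[i] * strides[i];
      return off;
    }

    int main() {
      // A 2x4x16 buffer with row-major strides {64, 16, 1}: offsets [1, 0, 0]
      // select input[1], i.e. element offset 64. The front offset may be a
      // runtime value, which is why this path avoids LinearLayouts.
      assert(subviewOffset({1, 0, 0}, {64, 16, 1}) == 64);
      return 0;
    }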

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 99 additions & 6 deletions
@@ -9,6 +9,7 @@
 #include "triton/Dialect/TritonGPU/IR/Types.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+#include "triton/Tools/LayoutUtils.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/LogicalResult.h"

@@ -600,15 +601,107 @@ LogicalResult MemDescSubviewOp::verify() {
             "offsets other than the first one must be constant zeros");
       }
     }
+    return success();
   }

-  // TODO(jlebar): Currently we generate illegal encodings, so we can't add a
-  // verifier for them. In particular, we use the same encoding for the src and
-  // dst of a subview op, when the subview removes a dimension. That generates
-  // an illegal shared encoding (because the size of `order` doesn't match the
-  // rank of the tensor), but it's not checked anywhere, and we believe the
-  // resulting code ultimately works.
+  assert(isa<SharedEncodingTrait>(srcEnc));

+  // corner case: 1D -> 1D into a 1 element tensor (we don't have 0D tensors)
+  if (srcTy.getRank() == 1 && dstTy.getRank() == 1 &&
+      dstTy.getDimSize(0) == 1) {
+    return success();
+  }
+
+  // There are two cases:
+  // 1. The subview is rank-reducing
+  //    - We split along the first dimension. It can be with non-constant
+  //      offsets
+  if (srcTy.getRank() != dstTy.getRank()) {
+    if (srcTy.getRank() - dstTy.getRank() != 1) {
+      return emitError(
+          "only nD -> (n-1)D rank-reducing subviews are supported");
+    }
+    for (auto offset : getOffsets().take_back(dstTy.getRank())) {
+      if (auto constOp = offset.getDefiningOp<arith::ConstantOp>()) {
+        if (auto offsetInt = dyn_cast<IntegerAttr>(constOp.getValue())) {
+          if (offsetInt.getInt() != 0) {
+            return emitError("only first offset can be non-zero for a "
+                             "rank-reducing subview");
+          }
+        } else {
+          return emitError(
+              "only integer constant values are allowed for the split");
+        }
+      } else {
+        return emitError("only constant values are allowed outside the front "
+                         "dimension in a rank-reducing subview");
+      }
+    }
+    return success();
+  }
+  assert(srcTy.getRank() == dstTy.getRank());
+  // 2. The subview is non-rank-reducing
+  //    - We split along at most one dim, but just with constant values
+  //    - The values where the split happens must not be within the swizzling
+  //      pattern
+  // Check which dimension we are splitting along
+  int dim = -1;
+  for (int i = 0; i < srcTy.getRank(); i++) {
+    if (srcTy.getDimSize(i) != dstTy.getDimSize(i)) {
+      if (dim != -1) {
+        return emitError(
+            "We don't allow subviews that split along multiple dimensions");
+      }
+      dim = i;
+    }
+  }
+  SmallVector<int64_t> offsets;
+  for (auto offset : getOffsets()) {
+    if (auto constOp = offset.getDefiningOp<arith::ConstantOp>()) {
+      if (auto offsetInt = dyn_cast<IntegerAttr>(constOp.getValue())) {
+        offsets.push_back(offsetInt.getInt());
+      } else {
+        return emitError(
+            "only integer constant values are allowed for the split");
+      }
+    } else {
+      return emitError("only constant values are allowed for the split");
+    }
+  }
+  // Identity subview
+  if (dim == -1) {
+    return success();
+  }
+
+  for (auto [i, offset] : llvm::enumerate(offsets)) {
+    if (i != dim) {
+      if (offset != 0) {
+        return emitError("A non-zero offset found in a dimension that is "
+                         "not being split");
+      }
+    } else {
+      if (offset & (dstTy.getDimSize(dim) - 1)) {
+        return emitError("The split offset may not touch the tile");
+      }
+    }
+  }
+  auto ctx = getContext();
+  // The order gives us the honest-to-goodness layout rank
+  auto srcAllocShape = srcTy.getAllocShape().take_back(getOrder(srcTy).size());
+  auto llInv =
+      triton::gpu::toLinearLayout(srcAllocShape, srcTy.getEncoding()).invert();
+  auto kDim = mlir::StringAttr::get(ctx, "dim" + llvm::Twine(dim));
+  llvm::SmallVector<std::pair<mlir::StringAttr, int32_t>> namedOffsets;
+  for (auto d : standardOutDimNames(ctx, srcTy.getRank())) {
+    namedOffsets.push_back({d, 0});
+  }
+  for (int dimSize = dstTy.getDimSize(dim); dimSize < srcTy.getDimSize(dim);
+       dimSize *= 2) {
+    namedOffsets[dim] = {kDim, dimSize};
+    if (!llvm::isPowerOf2_32(llInv.apply(namedOffsets)[0].second)) {
+      return emitError(
+          "We don't support splitting along the swizzling pattern");
+    }
+  }
   return success();
 }
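A sketch of the tile-alignment test the verifier applies to the split dimension, with plain integers instead of MLIR attributes. It assumes dstDimSize is a power of two (tile sizes in these layouts are), so `offset & (dstDimSize - 1)` equals `offset % dstDimSize`: a non-zero remainder means the subview would start in the middle of a tile.

    #include <cassert>
    #include <cstdint>

    // Mirrors the `offset & (dstTy.getDimSize(dim) - 1)` check above.
    bool splitOffsetOk(int64_t offset, int64_t dstDimSize) {
      return (offset & (dstDimSize - 1)) == 0;
    }

    int main() {
      assert(splitOffsetOk(0, 32));   // zero offset: identity-like, always fine
      assert(splitOffsetOk(64, 32));  // multiple of the tile: allowed
      assert(!splitOffsetOk(16, 32)); // lands mid-tile: rejected by the verifier
      return 0;
    }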

lib/Dialect/TritonGPU/Transforms/Pipeliner/WGMMAPipeline.cpp

Lines changed: 6 additions & 34 deletions
@@ -15,8 +15,6 @@
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h"
-#include "triton/Tools/LayoutUtils.h"
-#include "triton/Tools/LinearLayout.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
@@ -287,50 +285,24 @@ SmallVector<Value> splitLhs(OpBuilder &builder,
 SmallVector<Value> splitRhs(OpBuilder &builder,
                             TypedValue<ttg::MemDescType> rhs, int64_t newK) {
   auto loc = rhs.getLoc();
-  auto *ctx = builder.getContext();
   auto type = rhs.getType();
   auto rank = type.getRank();
   auto kDim = rank - 2;
   auto nSplits = type.getShape()[kDim] / newK;
-  // offset -> matrix
-  auto ll = ttg::toLinearLayout(type.getShape(), type.getEncoding());
-  auto llInv = ll.invert();
-
-  // Split into
-  auto kOffset = StringAttr::get(ctx, "offset");
-  assert(llInv.getOutDimSize(kOffset) == product(type.getShape()));
-  auto dimNames = tt::standardOutDimNames(ctx, rank);
-  SmallVector<std::pair<StringAttr, int32_t>> newOutDims;
-  for (auto d : getOrder(type)) {
-    newOutDims.push_back({dimNames[d], type.getShape()[d]});
-  }
-  // Split into shmem shape and invert
-  llInv = llInv.reshapeOuts(newOutDims);
-  llInv = llInv.transposeOuts(dimNames);
-  auto toOffsets = [&](const SmallVector<std::pair<StringAttr, int32_t>>
-                           &shape) {
-    return llvm::to_vector(
-        llvm::map_range(llvm::make_second_range(shape), [&](int32_t v) {
-          return builder.create<arith::ConstantIntOp>(loc, v, 32).getResult();
-        }));
-  };
-  // New Shape
   auto shape = llvm::to_vector(type.getShape());
   shape[kDim] = newK;
+  SmallVector<Value> offsetsVal;
+  for (int i = 0; i < rank; i++) {
+    offsetsVal.push_back(builder.create<arith::ConstantIntOp>(loc, 0, 32));
+  }
   auto newType = ttg::MemDescType::get(
       shape, type.getElementType(), type.getEncoding(), type.getMemorySpace(),
       /*isMutable=*/false, type.getAllocShape());
   SmallVector<Value> ret;
-  SmallVector<std::pair<StringAttr, int32_t>> logicalOffsets;
-  for (int i = 0; i < rank; i++) {
-    logicalOffsets.push_back({StringAttr::get(ctx, "dim" + Twine(i)), 0});
-  }
   for (int i = 0; i < nSplits; i++) {
-    logicalOffsets[kDim].second = i * newK;
-    auto shmemOffsets = toOffsets(llInv.apply(logicalOffsets));
-
+    offsetsVal[kDim] = builder.create<arith::ConstantIntOp>(loc, i * newK, 32);
     Value newSmem = builder.create<triton::gpu::MemDescSubviewOp>(
-        loc, newType, rhs, shmemOffsets);
+        loc, newType, rhs, offsetsVal);
     ret.push_back(newSmem);
   }
   return ret;
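After this simplification, splitRhs builds constant offsets directly and relies on the generic subview lowering to invert the swizzled layout. Roughly, for a hypothetical 128x64 rhs split into two K=64 halves (shapes and attribute names are placeholders, not from this diff), it now emits:

    %c0 = arith.constant 0 : i32
    %half0 = ttg.memdesc_subview %rhs[%c0, %c0] : !ttg.memdesc<128x64xf16, #shared, #smem, 128x64> -> !ttg.memdesc<64x64xf16, #shared, #smem, 128x64>
    %c64 = arith.constant 64 : i32
    %half1 = ttg.memdesc_subview %rhs[%c64, %c0] : !ttg.memdesc<128x64xf16, #shared, #smem, 128x64> -> !ttg.memdesc<64x64xf16, #shared, #smem, 128x64>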

test/Analysis/test-alias.mlir

Lines changed: 2 additions & 1 deletion
@@ -116,9 +116,10 @@ tt.func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B :
   %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) ->
     (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>) {
     scf.if %i1 {
+      %zero = arith.constant 0 : i32
       %index = arith.constant 8 : i32
       // expected-remark @below {{%4 -> %0,%1}}
-      %cst0 = ttg.memdesc_subview %a_shared[%index, %index] : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<32xf16, #A_SHARED, #ttg.shared_memory, mutable>
+      %cst0 = ttg.memdesc_subview %a_shared[%index, %zero] : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<32xf16, #A_SHARED, #ttg.shared_memory, mutable>
       scf.yield
     }
     scf.yield %b_shared, %a_shared, %a_shared : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>

test/Analysis/test-allocation.mlir

Lines changed: 2 additions & 1 deletion
@@ -440,8 +440,9 @@ tt.func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>
   %c_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
   %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>) {
   scf.if %i1 {
+    %zero = arith.constant 0 : i32
     %index = arith.constant 8 : i32
-    %cst0 = ttg.memdesc_subview %a_shared[%index, %index] : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<32xf16, #A_SHARED, #ttg.shared_memory, mutable>
+    %cst0 = ttg.memdesc_subview %a_shared[%index, %zero] : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<32xf16, #A_SHARED, #ttg.shared_memory, mutable>
     scf.yield
   }
   scf.yield %b_shared, %a_shared, %a_shared : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>

test/Conversion/amd/tritongpu_to_llvm.mlir

Lines changed: 3 additions & 3 deletions
@@ -373,11 +373,11 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n
   %c16_i32 = arith.constant 16 : i32
   // CHECK-COUNT-16: llvm.store {{.*}} : vector<1xf16>, !llvm.ptr<3>
   %0 = ttg.local_alloc %arg0 : (tensor<64x64xf16, #blocked>) -> !ttg.memdesc<64x64xf16, #shared, #smem, mutable>
-  %1 = ttg.memdesc_subview %0[%c16_i32, %c0_i32] : !ttg.memdesc<64x64xf16, #shared, #smem, mutable> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 64x64>
+  %1 = ttg.memdesc_subview %0[%c0_i32, %c16_i32] : !ttg.memdesc<64x64xf16, #shared, #smem, mutable> -> !ttg.memdesc<64x16xf16, #shared, #smem, mutable, 64x64>
   // CHECK-COUNT-4: llvm.load {{.*}} : !llvm.ptr<3> -> vector<1xf16>
-  %2 = ttg.local_load %1 : !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 64x64> -> tensor<16x64xf16, #blocked>
+  %2 = ttg.local_load %1 : !ttg.memdesc<64x16xf16, #shared, #smem, mutable, 64x64> -> tensor<64x16xf16, #blocked>
   // CHECK-COUNT-4: llvm.store {{.*}} : vector<1xf16>, !llvm.ptr<3>
-  ttg.local_store %2, %1 : tensor<16x64xf16, #blocked> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 64x64>
+  ttg.local_store %2, %1 : tensor<64x16xf16, #blocked> -> !ttg.memdesc<64x16xf16, #shared, #smem, mutable, 64x64>
   tt.return
   }
 }

test/Conversion/tritongpu_to_llvm.mlir

Lines changed: 2 additions & 29 deletions
@@ -549,8 +549,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK: llvm.mlir.global external @global_smem
-  // CHECK-LABEL: basic_subview
-  tt.func @basic_subview() {
+  // CHECK-LABEL: rank_reducing_subview
+  tt.func @rank_reducing_subview() {
     // CHECK: llvm.mlir.addressof @global_smem
     // CHECK: llvm.extractvalue
     // CHECK-NEXT: llvm.extractvalue
@@ -579,33 +579,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {

 // -----

-#shared0 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 32}>
-#smem = #ttg.shared_memory
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
-  // CHECK: llvm.mlir.global external @global_smem
-  // CHECK-LABEL: nvmma_subview
-  tt.func @nvmma_subview() {
-    // CHECK: llvm.mlir.addressof @global_smem
-    // CHECK: llvm.mlir.constant(1 : i32) : i32
-    // CHECK-NEXT: llvm.mlir.constant(128 : i32) : i32
-    // CHECK-NEXT: llvm.add
-    // CHECK-NEXT: llvm.add
-    // CHECK-NEXT: llvm.mlir.constant(0 : i32) : i32
-    // CHECK-NEXT: llvm.mul
-    // CHECK-NEXT: llvm.add
-    // CHECK-NEXT: llvm.mul
-    // CHECK-NEXT: llvm.add
-    // CHECK-NEXT: llvm.getelementptr
-    %index = arith.constant 1 : i32
-    %zero = arith.constant 0 : i32
-    %0 = ttg.local_alloc : () -> !ttg.memdesc<16x128xf32, #shared0, #smem, mutable>
-    %1 = ttg.memdesc_subview %0[%zero, %zero] : !ttg.memdesc<16x128xf32, #shared0, #smem, mutable> -> !ttg.memdesc<16x32xf32, #shared0, #smem, mutable>
-    tt.return
-  }
-}
-
-// -----
-
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: basic_async_wait
   tt.func @basic_async_wait() {
