[BACKEND] Reshape the allocShape within MemDescReshapeOp (#7495)

lezcano · web-flow · commit 322cd5b4a72a · 2025-07-13T20:38:42.000Z
This follows the same pattern as `MemDescTransOp`. To do so, we align the
op more closely with `MemDescTransOp` by inferring the output type
automatically.
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
@@ -273,6 +273,27 @@ def TTG_MemDescReshapeOp : TTG_Op<"memdesc_reshape", [Pure,
   }];
 
   let arguments = (ins TTG_MemDescType:$src);
+
+  let builders = [
+    OpBuilder<(ins "Value":$src, "ArrayRef<int64_t>":$shape),
+              [{
+                MemDescType dstTy;
+                auto srcTy = cast<MemDescType>(src.getType());
+                auto result = inferReturnTypes($_builder.getContext(),
+                                           $_builder.getUnknownLoc(),
+                                           srcTy, shape, dstTy);
+                assert(succeeded(result) && "failed to infer return types");
+                build($_builder, $_state, dstTy, src);
+              }]>
+  ];
+  let extraClassDeclaration = [{
+      static LogicalResult inferReturnTypes(MLIRContext *context,
+                                        std::optional<Location> loc,
+                                        MemDescType srcTy,
+                                        ArrayRef<int64_t> dstShape,
+                                        MemDescType &inferredReturnType);
+  }];
+
   let results = (outs TTG_MemDescType:$result);
 
   let assemblyFormat = "$src attr-dict `:` qualified(type($src)) `->` qualified(type($result))";
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -2466,6 +2466,38 @@ struct TritonGPUInferLayoutInterface
                                              Attribute srcEnc,
                                              ArrayRef<int64_t> dstShape,
                                              Attribute &dstEnc) const {
+    if (auto mmaEncoding = dyn_cast<NVMMASharedEncodingAttr>(srcEnc)) {
+      // TODO: supporting reshape of CTA layouts is non-trivial.
+      if (getNumCTAs(mmaEncoding) > 1)
+        return failure();
+      int innerDimDst =
+          mmaEncoding.getTransposed() ? dstShape.front() : dstShape.back();
+      int innerDimSrc =
+          mmaEncoding.getTransposed() ? srcShape.front() : srcShape.back();
+      // For now disallow reshape of the inner dimension.
+      if (innerDimDst != innerDimSrc)
+        return failure();
+      auto *ctx = srcEnc.getContext();
+
+      // CTALayout can be all 1's because we bailed on multi-CTA layouts above.
+      auto CTALayout = CTALayoutAttr::get(
+          ctx,
+          /*CTAsPerCGA=*/SmallVector<unsigned>(dstShape.size(), 1),
+          /*CTASplitNum=*/SmallVector<unsigned>(dstShape.size(), 1),
+          /*CTAOrder=*/llvm::to_vector(llvm::seq<unsigned>(dstShape.size())));
+      dstEnc = NVMMASharedEncodingAttr::get(
+          ctx, mmaEncoding.getSwizzlingByteWidth(), mmaEncoding.getTransposed(),
+          mmaEncoding.getElementBitWidth(), mmaEncoding.getFp4Padded(),
+          CTALayout);
+      // Heavyweight check: verify that the linear layouts are equivalent.
+      // We disallow reshaping memdesc_subviews in the verifier
+      auto srcLL = toLinearLayout(srcShape, srcEnc);
+      auto dstLL = toLinearLayout(dstShape, dstEnc);
+      if (reshapeLayout(ctx, srcLL, dstShape) != dstLL) {
+        return failure();
+      }
+      return success();
+    }
     auto src = mlir::dyn_cast<BlockedEncodingAttr>(srcEnc);
     if (!src) {
       return failure();
@@ -2713,6 +2745,10 @@ struct TritonGPUInferLayoutInterface
     if (succeeded(result)) {
       return result;
     }
+    if (!isa<DistributedEncodingTrait>(srcEnc)) {
+      return emitOptionalError(loc,
+                               "Failed MemDescReshapeOp encoding inference");
+    }
     // If the legacy encoding failed use LinearLayouts.
     // Once LinearLayouts are more widely used, we can remove
     // inferReshapeOpLegacyEncoding and simply use LLs.
diff --git a/lib/Dialect/TritonGPU/IR/Ops.cpp b/lib/Dialect/TritonGPU/IR/Ops.cpp
@@ -474,19 +474,48 @@ LogicalResult MemDescReshapeOp::verify() {
     return emitError("result element type must match src element type");
   }
 
-  // Infer the dst layout from the source and verify that it is equivalent.
-  auto srcEncoding = srcType.getEncoding();
-  Attribute inferedDstEncoding;
-
-  LinearLayout ll = inferReshapeLinearLayout(cast<TensorOrMemDesc>(srcType),
-                                             dstType.getShape());
-  LinearLayout llDst = triton::gpu::toLinearLayout(dstType);
-  if (ll != llDst) {
+  MemDescType expectedTy;
+  if (failed(inferReturnTypes(getContext(), getLoc(), srcType,
+                              dstType.getShape(), expectedTy)))
+    return failure();
+  // Check the alloc shape separately to give a cleaner error, given that
+  // it's the most likely source of the error.
+  if (expectedTy.getAllocShape() != dstType.getAllocShape()) {
+    return emitError(
+        "The result alloc shape does not match the expected alloc shape.");
+  }
+  if (expectedTy != dstType) {
     return emitError("source and destination layout are incompatible.");
   }
   return success();
 }
 
+LogicalResult MemDescReshapeOp::inferReturnTypes(
+    MLIRContext *context, std::optional<Location> loc, MemDescType srcTy,
+    ArrayRef<int64_t> dstShape, MemDescType &inferredReturnType) {
+  if (product<int64_t>(dstShape) != product<int64_t>(srcTy.getShape()))
+    return emitOptionalError(
+        loc, "dst shape has different number of elements than src");
+
+  Attribute dstEncoding;
+  if (Attribute srcEnc = srcTy.getEncoding()) {
+    auto *inferLayout = cast<DialectInferLayoutInterface>(&srcEnc.getDialect());
+    if (failed(inferLayout->inferReshapeOpEncoding(srcTy.getShape(), srcEnc,
+                                                   dstShape, dstEncoding, loc)))
+      return failure();
+  }
+
+  SmallVector<int64_t> dstAllocShape =
+      to_vector(srcTy.getAllocShape().take_front(srcTy.getAllocShape().size() -
+                                                 srcTy.getShape().size()));
+  dstAllocShape.append(dstShape.begin(), dstShape.end());
+
+  inferredReturnType = MemDescType::get(
+      dstShape, srcTy.getElementType(), dstEncoding, srcTy.getMemorySpace(),
+      srcTy.getMutableMemory(), dstAllocShape);
+  return success();
+}
+
 // MemDescReinterpretOp
 LogicalResult MemDescReinterpretOp::verify() {
   if (getSrc().getType().getMemorySpace() != getType().getMemorySpace())
diff --git a/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp b/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp
@@ -143,44 +143,6 @@ class FuseTransMMAV3Plus : public OpRewritePattern<LocalAllocOp> {
   }
 };
 
-static Attribute inferSrcEncodingMemDescReshape(ArrayRef<int64_t> srcShape,
-                                                MemDescType dstType) {
-  auto dstEncoding = dstType.getEncoding();
-  auto dstShape = dstType.getShape();
-  auto mmaEncoding = dyn_cast<NVMMASharedEncodingAttr>(dstEncoding);
-  if (!mmaEncoding)
-    return {};
-  // TODO: supporting reshape of CTA layouts is non-trivial.
-  if (getNumCTAs(mmaEncoding) > 1)
-    return {};
-  int innerDimDst =
-      mmaEncoding.getTransposed() ? dstShape.front() : dstShape.back();
-  int innerDimSrc =
-      mmaEncoding.getTransposed() ? srcShape.front() : srcShape.back();
-  // For now disallow reshape of the inner dimension.
-  if (innerDimDst != innerDimSrc)
-    return {};
-
-  // CTALayout can be all 1's because we bailed on multi-CTA layouts above.
-  auto CTALayout = CTALayoutAttr::get(
-      dstEncoding.getContext(),
-      /*CTAsPerCGA=*/SmallVector<unsigned>(srcShape.size(), 1),
-      /*CTASplitNum=*/SmallVector<unsigned>(srcShape.size(), 1),
-      /*CTAOrder=*/llvm::to_vector(llvm::seq<unsigned>(srcShape.size())));
-  auto srcEncoding = NVMMASharedEncodingAttr::get(
-      dstEncoding.getContext(), mmaEncoding.getSwizzlingByteWidth(),
-      mmaEncoding.getTransposed(), mmaEncoding.getElementBitWidth(),
-      mmaEncoding.getFp4Padded(), CTALayout);
-  // Big guns, check linear layouts are equivalent
-  auto srcLL = toLinearLayout(srcShape, srcEncoding);
-  auto dstLL = toLinearLayout(dstShape, dstEncoding);
-  auto ctx = dstEncoding.getContext();
-  if (reshapeLayout(ctx, srcLL, dstShape) != dstLL) {
-    return {};
-  }
-  return srcEncoding;
-}
-
 // Rewrite
 //
 //   alloc(reshape(), #shared1) ->
@@ -204,18 +166,21 @@ class ReshapeMemDesc : public OpRewritePattern<LocalAllocOp> {
     auto allocEncoding = allocType.getEncoding();
 
     RankedTensorType srcTy = reshapeOp.getSrc().getType();
-    auto newAllocEncoding =
-        inferSrcEncodingMemDescReshape(srcTy.getShape(), allocType);
-    if (!newAllocEncoding)
+    auto srcShape = srcTy.getShape();
+    auto dstShape = allocType.getShape();
+
+    // We use the fact that forward and backward inference are the same for
+    // MemDescReshapeOp to infer the source MemDescType that would produce
+    // `allocType` after a reshape.
+    MemDescType innerTy;
+    if (failed(MemDescReshapeOp::inferReturnTypes(
+            getContext(), allocOp.getLoc(), allocType, srcShape, innerTy)))
       return failure();
 
-    MemDescType innerTy =
-        MemDescType::get(srcTy.getShape(), srcTy.getElementType(),
-                         newAllocEncoding, allocType.getMemorySpace());
     auto newAlloc = rewriter.create<LocalAllocOp>(allocOp.getLoc(), innerTy,
                                                   reshapeOp.getSrc());
-    rewriter.replaceOpWithNewOp<MemDescReshapeOp>(allocOp, allocOp.getType(),
-                                                  newAlloc);
+    rewriter.replaceOpWithNewOp<MemDescReshapeOp>(allocOp, newAlloc,
+                                                  allocOp.getType().getShape());
     return success();
   }
 };
diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -1500,8 +1500,9 @@ void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
       newVal = builder.create<ttg::MemDescTransOp>(trans.getLoc(), val,
                                                    trans.getOrder());
     } else if (auto reshape = dyn_cast<ttg::MemDescReshapeOp>(user)) {
-      newVal = builder.create<ttg::MemDescReshapeOp>(reshape.getLoc(),
-                                                     reshape.getType(), val);
+      auto shape = reshape.getType().getShape();
+      newVal =
+          builder.create<ttg::MemDescReshapeOp>(reshape.getLoc(), val, shape);
     }
     assert(newVal && "unhandled memdesc view");
     newVal.getDefiningOp()->setAttrs(user->getAttrs());
diff --git a/python/src/gluon_ir.cc b/python/src/gluon_ir.cc
@@ -396,8 +396,9 @@ void init_gluon_ir(py::module &&m) {
              return self.create<ttg::MemDescTransOp>(src, order);
            })
       .def("create_memdesc_reshape",
-           [](GluonOpBuilder &self, Type resultType, Value src) -> Value {
-             return self.create<ttg::MemDescReshapeOp>(resultType, src);
+           [](GluonOpBuilder &self, Value src,
+              std::vector<int64_t> &shape) -> Value {
+             return self.create<ttg::MemDescReshapeOp>(src, shape);
            })
       .def("create_memdesc_reinterpret",
            [](GluonOpBuilder &self, Type resultType, Value src) -> Value {
diff --git a/python/test/gluon/test_frontend.py b/python/test/gluon/test_frontend.py
@@ -309,10 +309,8 @@ def shared_memory_cast_kernel():
 
     layout_b: ttgl.constexpr = ttgl.NVMMASharedLayout(swizzle_byte_width=64, transposed=False, element_bitwidth=16,
                                                       rank=4, cta_order=[3, 2, 1, 0])
-    layout_reshape: ttgl.constexpr = ttgl.NVMMASharedLayout(swizzle_byte_width=64, transposed=False,
-                                                            element_bitwidth=16, rank=2)
     smem = ttgl.allocate_shared_memory(ttgl.float16, [32, 1, 4, 64], layout_b)
-    smem.reshape((128, 64), layout_reshape)
+    smem.reshape((128, 64))
 
     smem._reinterpret(ttgl.int8, [1024], ttgl.SwizzledSharedLayout(1, 1, 1, [0, 1]))
 
@@ -336,7 +334,7 @@ def test_shared_memory_cast(fresh_knobs):
     %2 = ttg.memdesc_trans %1 {order = array<i32: 1, 0>} : !ttg.memdesc<256x128xi8, #shared, #smem, mutable, 2x256x128> -> !ttg.memdesc<128x256xi8, #shared1, #smem, mutable, 2x128x256>
     tt.call @"test_frontend.anchor_noinline__MDi8S128_256SLNVMMA_64_8_True_False_NVMMALAS[2, 128, 256]ASMD__"(%2) : (!ttg.memdesc<128x256xi8, #shared1, #smem, mutable, 2x128x256>) -> ()
     %3 = ttg.local_alloc : () -> !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable>
-    %4 = ttg.memdesc_reshape %3 : !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable> -> !ttg.memdesc<128x64xf16, #shared3, #smem, mutable, 32x1x4x64>
+    %4 = ttg.memdesc_reshape %3 : !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable> -> !ttg.memdesc<128x64xf16, #shared3, #smem, mutable>
     %5 = ttg.memdesc_reinterpret %3 : !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable> -> !ttg.memdesc<1024xi8, #shared4, #smem, mutable>
     tt.return
   }
diff --git a/python/triton/experimental/gluon/language/_core.py b/python/triton/experimental/gluon/language/_core.py
@@ -293,21 +293,19 @@ def permute(self, order, _semantic: GluonSemantic) -> shared_memory_descriptor:
         return _semantic.memdesc_trans(self, order)
 
     @builtin
-    def reshape(self, shape, layout, _semantic: GluonSemantic) -> shared_memory_descriptor:
+    def reshape(self, shape, _semantic: GluonSemantic) -> shared_memory_descriptor:
         """
         Reshape the shared memory descriptor to a new shape and layout.
 
         Args:
             shape (List[int]): The target shape.
-            layout (SharedLayout): The new layout for the descriptor.
 
         Returns:
             shared_memory_descriptor: Descriptor with the new shape and layout.
         """
         shape = [_unwrap_if_constexpr(s) for s in shape]
-        layout = _unwrap_if_constexpr(layout)
 
-        return _semantic.memdesc_reshape(self, shape, layout)
+        return _semantic.memdesc_reshape(self, shape)
 
     @builtin
     def _reinterpret(self, dtype, shape, layout, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
diff --git a/python/triton/experimental/gluon/language/_semantic.py b/python/triton/experimental/gluon/language/_semantic.py
@@ -1,4 +1,5 @@
 from typing import Sequence, List, TypeVar, Tuple, Callable
+import math
 from triton.language.semantic import TritonSemantic
 from . import _core as ttgl
 from ._layouts import SliceLayout, AutoLayout
@@ -213,10 +214,26 @@ def memdesc_trans(self, mem_desc, order):
         return ttgl.shared_memory_descriptor(handle, element_ty=mem_desc.dtype, shape=shape,
                                              alloc_shape=new_alloc_shape, layout=layout)
 
-    def memdesc_reshape(self, mem_desc, shape, layout):
-        ty = ttgl.shared_memory_descriptor_type(mem_desc.dtype, shape, layout, mem_desc.type.alloc_shape)
-        handle = self.builder.create_memdesc_reshape(ty.to_ir(self.builder), mem_desc.handle)
-        return ttgl.shared_memory_descriptor(handle, **ty.__dict__)
+    def memdesc_reshape(self, mem_desc, shape):
+        _check(
+            math.prod(shape) == math.prod(mem_desc.shape),
+            lambda: (f"memdesc_reshape total elements mismatch: "
+                     f"{mem_desc.shape} -> {shape}"),
+        )
+
+        handle = self.builder.create_memdesc_reshape(mem_desc.handle, shape)
+        layout = self.builder.get_gluon_layout_from_memdesc(handle)
+        alloc_shape = mem_desc.type.alloc_shape
+        prefix_len = len(alloc_shape) - mem_desc.rank
+        new_alloc_shape = alloc_shape[:prefix_len] + list(shape)
+
+        return ttgl.shared_memory_descriptor(
+            handle,
+            element_ty=mem_desc.dtype,
+            shape=shape,
+            alloc_shape=new_alloc_shape,
+            layout=layout,
+        )
 
     def memdesc_reinterpret(self, mem_desc, dtype, shape, layout):
         ty = ttgl.shared_memory_descriptor_type(dtype, shape, layout, shape)
diff --git a/test/TritonGPU/ops.mlir b/test/TritonGPU/ops.mlir
@@ -66,6 +66,21 @@ module attributes {"ttg.target" = "cuda:0", "ttg.num-ctas" = 1 : i32, "ttg.num-w
   }
 }
 
+// -----
+
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16,  CTAsPerCGA = [1,1,1,1], CTASplitNum = [1,1,1,1], CTAOrder = [3, 2, 1, 0]}>
+#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.target" = "cuda:0", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: memdesc_reshape
+  // CHECK: !ttg.memdesc<128x64xf16, #{{.+}}, mutable>
+  tt.func @memdesc_reshape(%d : !ttg.memdesc<32x1x4x64xf16, #shared, #smem, mutable>){
+    %1 = ttg.memdesc_reshape %d : !ttg.memdesc<32x1x4x64xf16, #shared, #smem, mutable> -> !ttg.memdesc<128x64xf16, #shared1, #smem, mutable>
+    tt.return
+  }
+}
+
+
 // -----
 
 // CHECK-LABEL: @warp_specialize_nothing