[BACKEND] Error when using CGA>1 on memdesc_subview (#7288)

lezcano · web-flow · commit 056ad7f9e341 · 2025-06-24T14:08:37.000+01:00
Previously, the lowering silently discarded the block dimension of the layout; the verifier now rejects a non-trivial block dimension (CGA > 1) instead.
diff --git a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp
@@ -504,9 +504,13 @@ struct MemDescSubviewOpConversion
       // The order gives us the honest-to-goodness layout rank
       auto srcAllocShape =
           srcTy.getAllocShape().take_back(getOrder(srcTy).size());
-      auto llInv = toLinearLayout(srcAllocShape, srcTy.getEncoding()).invert();
-      offset =
-          applyLinearLayout(loc, rewriter, llInv, logicalOffsets)[0].second;
+      auto ll = toLinearLayout(srcAllocShape, srcTy.getEncoding());
+      // Checked in the verifier.
+      assert(ll.getInDimSize(str_attr("block")) == 1);
+      auto kOffset = str_attr("offset");
+      ll = ll.reshapeIns({{kOffset, ll.getTotalInDimSize()}});
+      offset = applyLinearLayout(loc, rewriter, ll.invert(), logicalOffsets)[0]
+                   .second;
     }
 
     auto base = smemObj.getBase();
diff --git a/lib/Dialect/TritonGPU/IR/Ops.cpp b/lib/Dialect/TritonGPU/IR/Ops.cpp
@@ -721,8 +721,14 @@ LogicalResult MemDescSubviewOp::verify() {
   auto ctx = getContext();
   // The order gives us the honest-to-goodness layout rank
   auto srcAllocShape = srcTy.getAllocShape().take_back(getOrder(srcTy).size());
-  auto llInv =
-      triton::gpu::toLinearLayout(srcAllocShape, srcTy.getEncoding()).invert();
+  auto ll = triton::gpu::toLinearLayout(srcAllocShape, srcTy.getEncoding());
+  // NYI: We don't support non-trivial block dimension for now.
+  auto kBlock = mlir::StringAttr::get(getContext(), "block");
+  if (ll.getInDimSize(kBlock) != 1) {
+    return emitError("non-trivial block dimension not supported");
+  }
+
+  auto llInv = ll.invert();
   auto kDim = mlir::StringAttr::get(ctx, "dim" + llvm::Twine(dim));
   llvm::SmallVector<std::pair<mlir::StringAttr, int32_t>> namedOffsets;
   for (auto d : standardOutDimNames(ctx, srcTy.getRank())) {
diff --git a/test/TritonGPU/invalid.mlir b/test/TritonGPU/invalid.mlir
@@ -1,5 +1,16 @@
 // RUN: triton-opt --split-input-file %s --verify-diagnostics
 
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [2, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#smem = #ttg.shared_memory
+tt.func public @non_trivial_block(%arg0: !ttg.memdesc<8x16xf32, #shared, #smem>) {
+    %zero = arith.constant 0 : i32
+    // expected-error @+1 {{non-trivial block}}
+    %a = ttg.memdesc_subview %arg0[%zero, %zero] : !ttg.memdesc<8x16xf32, #shared, #smem> -> !ttg.memdesc<8x8xf32, #shared, #smem>
+    tt.return
+}
+
+// -----
+
 #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1]}>
 #smem = #ttg.shared_memory
 tt.func public @miss_encoding(%arg0: !ttg.memdesc<8x16xf32, #shared, #smem>) {