@@ -90,3 +90,19 @@ tt.func @tma_scatter(%arg0: !tt.tensordesc<tensor<1x128xbf16, #nvmma_128>>, %arg
 }
 
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 2, 0]}>
+// CHECK: #[[$SHARED:.+]] = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: @rank_reducing_load
+  tt.func public @rank_reducing_load(%arg0: !tt.tensordesc<tensor<1x256x32xf32, #shared>>) -> tensor<256x32xf32, #blocked> {
+    %c32_i32 = arith.constant 32 : i32
+    // CHECK: %[[A:.+]] = ttg.local_alloc : () -> !ttg.memdesc<256x32xf32, #[[$SHARED]], #smem, mutable>
+    // CHECK: ttng.async_tma_copy_global_to_local %{{.+}}[%{{.+}}, %{{.+}}, %{{.+}}] %[[A]],
+    %l = tt.descriptor_load %arg0[%c32_i32, %c32_i32, %c32_i32] : !tt.tensordesc<tensor<1x256x32xf32, #shared>> -> tensor<256x32xf32, #blocked>
+    tt.return %l : tensor<256x32xf32, #blocked>
+  }
+}