
Commit 1239887

[AMD] Enable subview for amd rotating shared attribute (#6160)
This PR fixes an assert in the converter and adds a relevant lit test.
Parent: fa8b7bb

2 files changed (+23, -1 lines)

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 2 additions & 1 deletion
@@ -353,7 +353,8 @@ Value getSmemVecAddr(const LinearLayout &regLayout,
     }
   } else { // Case 2 -> rank-reduced swizzling
     assert(rank >= 2 && "Swizzling only applies to tensors with rank >= 2");
-    assert(isa<triton::gpu::SwizzledSharedEncodingAttr>(sharedEnc) &&
+    assert((isa<triton::gpu::SwizzledSharedEncodingAttr,
+                triton::gpu::AMDRotatingSharedEncodingAttr>(sharedEnc)) &&
            "NVMMA layout not supported for sliced tensors");
     // We define both tensor offsets and shared memory offsets:
     //
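For context, the relaxed assert relies on the variadic form of llvm::isa<>, which returns true when the value is an instance of any of the listed types. Below is a minimal sketch of that check in isolation; the helper name and the exact includes are assumptions made for a standalone example, not code from this commit.

    // Sketch only: illustrates the variadic llvm::isa<> used by the relaxed assert.
    #include "mlir/IR/Attributes.h"                  // mlir::Attribute
    #include "triton/Dialect/TritonGPU/IR/Dialect.h" // shared encoding attributes
    #include "llvm/Support/Casting.h"                // llvm::isa

    // Accept either the generic swizzled shared layout or the AMD rotating
    // shared layout enabled by this commit; any other shared encoding
    // (e.g. NVMMA) still fails the check.
    static bool isSupportedSlicedSharedEncoding(mlir::Attribute sharedEnc) {
      return llvm::isa<triton::gpu::SwizzledSharedEncodingAttr,
                       triton::gpu::AMDRotatingSharedEncodingAttr>(sharedEnc);
    }

Listing both attribute types as template arguments is what lets a memdesc_subview of an #ttg.amd_rotating_shared buffer take the rank-reduced swizzling path instead of tripping the assert.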

test/Conversion/amd/tritongpu_to_llvm.mlir

Lines changed: 21 additions & 0 deletions
@@ -359,3 +359,24 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n
     tt.return
   }
 }
+
+// -----
+
+// CHECK-LABEL: amd_rotating_subview_shared_layout
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#shared = #ttg.amd_rotating_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @amd_rotating_subview_shared_layout(%arg0: tensor<64x64xf16, #blocked>) {
+    %c0_i32 = arith.constant 0 : i32
+    %c16_i32 = arith.constant 16 : i32
+    // CHECK-COUNT-16: llvm.store {{.*}} : vector<1xf16>, !llvm.ptr<3>
+    %0 = ttg.local_alloc %arg0 : (tensor<64x64xf16, #blocked>) -> !ttg.memdesc<64x64xf16, #shared, #smem, mutable>
+    %1 = ttg.memdesc_subview %0[%c16_i32, %c0_i32] : !ttg.memdesc<64x64xf16, #shared, #smem, mutable> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 64x64>
+    // CHECK-COUNT-4: llvm.load {{.*}} : !llvm.ptr<3> -> vector<1xf16>
+    %2 = ttg.local_load %1 : !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 64x64> -> tensor<16x64xf16, #blocked>
+    // CHECK-COUNT-4: llvm.store {{.*}} : vector<1xf16>, !llvm.ptr<3>
+    ttg.local_store %2, %1 : tensor<16x64xf16, #blocked> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable, 64x64>
+    tt.return
+  }
+}
