[mlir][amdgpu] Properly handle mismatching memref ranks in amdgpu.gather_to_lds (#149407)

Hardcode84 · web-flow · commit 6b29ee9d9a8d · 2025-07-18T00:42:25.000+03:00
This op doesn't have any rank or indices restrictions on src/dst
memrefs, but was using `SameVariadicOperandSize` which was causing
issues. Also fix some other issues while we at it.
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -127,7 +127,7 @@ def AMDGPU_ScaledExtPackedOp
   let summary = "Extend a vector of packed floating point values";
 
   let description = [{
-    Extend and scale two packed floats in `source[index]` to two floats and 
+    Extend and scale two packed floats in `source[index]` to two floats and
     return them.
 
     This rather unusual signature arises from the fact that AMD GPUs cannot
@@ -861,7 +861,7 @@ def AMDGPU_WMMAOp :
 }
 
 def AMDGPU_GatherToLDSOp :
-    AMDGPU_Op<"gather_to_lds", [SameVariadicOperandSize]>,
+    AMDGPU_Op<"gather_to_lds", [AttrSizedOperandSegments]>,
     Arguments<(ins
                    Arg<AnyMemRef, "buffer to gather from", [MemRead]>:$src,
                    Variadic<Index>:$srcIndices,
@@ -966,13 +966,13 @@ def AMDGPU_ScaledMFMAOp :
     order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on).
 
     This wrapper takes inspiration from `amdgpu.mfma`, but has some key differences:
-    - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and 
-    fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile 
-    size. 
-    - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp` 
+    - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and
+    fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile
+    size.
+    - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp`
     are omitted from this wrapper.
-    - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for 
-    double-precision operations on gfx94x and so are not included here. 
+    - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for
+    double-precision operations on gfx94x and so are not included here.
   }];
   let assemblyFormat = [{
     `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -134,6 +134,8 @@ static bool hasGlobalMemorySpace(Attribute memorySpace) {
 }
 
 static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
+  if (!memorySpace)
+    return false;
   if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
     return intMemorySpace.getInt() == 3;
   if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
@@ -142,6 +144,8 @@ static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
 }
 
 static bool hasFatRawBufferMemorySpace(Attribute memorySpace) {
+  if (!memorySpace)
+    return false;
   if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
     return intMemorySpace.getInt() == 7;
   if (auto gpuMemorySpace = dyn_cast<amdgpu::AddressSpaceAttr>(memorySpace))
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
@@ -127,12 +127,15 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g
   // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
   // CHECK: %[[ALLOC:.*]] = memref.alloc()
   // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[C0_I64:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
   // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
   // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
   // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
   // CHECK: %[[DSTIDX:.*]] = llvm.mul %[[DSTIDX_CAST]], %[[C64]] : i64
-  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX]]]
+  // CHECK: %[[DSTIDX1:.*]] = llvm.add %[[DSTIDX]], %[[C0_I64]] : i64
+  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX1]]]
   // CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4
   %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace>
   %c0 = arith.constant 0 : index
@@ -151,7 +154,7 @@ func.func @fat_buffer_load_to_rocdl_f32(%global : memref<128x72xf32, #amdgpu_fat
   // CHECK: %[[BUFFER_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
 
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64
+  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
   // CHECK: %[[C12:.*]] = arith.constant 12 : index
   // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]]
   // CHECK: %[[C32:.*]] = arith.constant 32 : index
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -222,3 +222,11 @@ func.func @transpose_load_vector_size_i8(%idx1 : index, %idx2 : index, %mem : me
   %0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xi6, 3> -> vector<8xi6>
   func.return %0 : vector<8xi6>
 }
+
+// -----
+
+func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : memref<32xf16>) {
+  // expected-error@+1 {{'amdgpu.gather_to_lds' op destination memory address space must be Workgroup}}
+  amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16>
+  func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -493,3 +493,14 @@ func.func @transpose_load(%idx1 : index, %idx2 : index, %mem : memref<128x32xf16
   %0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xf16, 3> -> vector<4xf16>
   func.return %0 : vector<4xf16>
 }
+
+// CHECK-LABEL: func @gather_to_lds
+func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %mem2 : memref<32x32xf16>, %smem1 : memref<32xf16, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf16, #gpu.address_space<workgroup>>) {
+  // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
+  // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}]
+  // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}],          %{{.*}}[%{{.*}}, %{{.*}}]
+  amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32x32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>>
+  amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem1[%idx1]        : vector<2xf16>, memref<32x32xf16>, memref<32xf16,    #gpu.address_space<workgroup>>
+  amdgpu.gather_to_lds %mem1[%idx1],        %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>,    memref<32x32xf16, #gpu.address_space<workgroup>>
+  func.return
+}