Final touch.

lialan · lialan · commit d68db39da215 · 2025-04-07T13:59:06.000-04:00
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -784,7 +784,8 @@ def AMDGPU_GatherToLDSOp :
     * `$srcIndices`: indices into `$src` to read from for this thread.
     * `$dst`: LDS memory memref to write to.
     * `$dstIndices`: base indices into `$dst` to write to for the subgroup of this thread.
-      number of subgroup size of elements will be written contiguously to `$dst[$dstIndices]`.
+      The elements gathered by the subgroup will be written contiguously, in order of lane ID,
+      starting at `$dst[$dstIndices]`.
     * `$transferType`: type of the data to be transferred by each thread. This is used to determine
       the size of the data to be transferred and the number of threads in the subgroup.
       The transfer type must be a scalar type or a vector type with a single element type.
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -925,11 +925,12 @@ struct GatherToLDSOpLowering : public ConvertOpToLLVMPattern<GatherToLDSOp> {
     // `global_load_lds` instructions.
     size_t loadWidth;
     Type transferType = op.getTransferType();
-    if (auto transferVectorType = dyn_cast<VectorType>(transferType))
+    if (auto transferVectorType = dyn_cast<VectorType>(transferType)) {
       loadWidth = transferVectorType.getNumElements() *
-                  transferVectorType.getElementTypeBitWidth() / 8;
-    else
+                  (transferVectorType.getElementTypeBitWidth() / 8);
+    } else {
       loadWidth = transferType.getIntOrFloatBitWidth() / 8;
+    }
 
     // Currently only 1, 2, and 4 byte loads are supported.
     if (loadWidth != 1 && loadWidth != 2 && loadWidth != 4)
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -117,17 +117,17 @@ LogicalResult FatRawBufferCastOp::verify() {
 static bool hasGlobalMemorySpace(Attribute memorySpace) {
   if (!memorySpace)
     return true;
-  if (auto intMemorySpace = llvm::dyn_cast<IntegerAttr>(memorySpace))
+  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
     return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1;
-  if (auto gpuMemorySpace = llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
+  if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
     return gpuMemorySpace.getValue() == gpu::AddressSpace::Global;
   return false;
 }
 
 static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
-  if (auto intMemorySpace = llvm::dyn_cast<IntegerAttr>(memorySpace))
+  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
     return intMemorySpace.getInt() == 3;
-  if (auto gpuMemorySpace = llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
+  if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
     return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup;
   return false;
 }
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
@@ -148,4 +148,4 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g
   amdgpu.gather_to_lds %global[%src_idx], %alloc[%dst_idx, %c0]
     : i32, memref<512xi32, #gpu_global_addrspace>, memref<4x64xi32, #gpu_lds_addrspace>
   func.return
-}
+}