[mlir][amdgpu] Properly handle mismatching memref ranks in amdgpu.gather_to_lds
#149407
Conversation
This op doesn't have any rank or index restrictions on its src/dst memrefs, but was using `SameVariadicOperandSize`, which was causing issues. Also fixes some other issues while we're at it.

@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu

Author: Ivan Butygin (Hardcode84)

Full diff: https://github.com/llvm/llvm-project/pull/149407.diff

5 Files Affected:
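For illustration, a minimal sketch of the kind of IR this change legalizes, mirroring the new `ops.mlir` test further down (the specific shapes are just an assumption for the example):

```mlir
// Hypothetical example: gather from a 2-D global memref into a 1-D LDS
// memref. `SameVariadicOperandSize` forced `srcIndices` and `dstIndices`
// to have equal length; `AttrSizedOperandSegments` lets them differ.
func.func @mixed_rank_gather(%idx : index, %src : memref<32x32xf16>,
    %dst : memref<32xf16, #gpu.address_space<workgroup>>) {
  amdgpu.gather_to_lds %src[%idx, %idx], %dst[%idx]
    : vector<2xf16>, memref<32x32xf16>, memref<32xf16, #gpu.address_space<workgroup>>
  func.return
}
```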
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index eadb5d9326798..80959ffbaf426 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -127,7 +127,7 @@ def AMDGPU_ScaledExtPackedOp
let summary = "Extend a vector of packed floating point values";
let description = [{
- Extend and scale two packed floats in `source[index]` to two floats and
+ Extend and scale two packed floats in `source[index]` to two floats and
return them.
This rather unusual signature arises from the fact that AMD GPUs cannot
@@ -861,7 +861,7 @@ def AMDGPU_WMMAOp :
}
def AMDGPU_GatherToLDSOp :
- AMDGPU_Op<"gather_to_lds", [SameVariadicOperandSize]>,
+ AMDGPU_Op<"gather_to_lds", [AttrSizedOperandSegments]>,
Arguments<(ins
Arg<AnyMemRef, "buffer to gather from", [MemRead]>:$src,
Variadic<Index>:$srcIndices,
@@ -966,13 +966,13 @@ def AMDGPU_ScaledMFMAOp :
order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on).
This wrapper takes inspiration from `amdgpu.mfma`, but has some key differences:
- - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and
- fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile
- size.
- - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp`
+ - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and
+ fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile
+ size.
+ - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp`
are omitted from this wrapper.
- - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for
- double-precision operations on gfx94x and so are not included here.
+ - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for
+ double-precision operations on gfx94x and so are not included here.
}];
let assemblyFormat = [{
`(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC
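A note on the trait swap above: `SameVariadicOperandSize` requires every variadic operand group on the op to have the same length, while `AttrSizedOperandSegments` attaches an explicit per-group size list, so `srcIndices` and `dstIndices` can have different lengths. Below is a sketch of what the generic form looks like under the new trait; the exact property spelling is an assumption based on current MLIR conventions:

```mlir
// Generic form with explicit segment sizes: 1 src, 2 srcIndices, 1 dst,
// 1 dstIndex. The size list is what lets the parser split the flat
// operand list unambiguously when the index lists have different lengths.
"amdgpu.gather_to_lds"(%src, %i, %j, %dst, %k)
    <{operandSegmentSizes = array<i32: 1, 2, 1, 1>,
      transferType = vector<2xf16>}>
    : (memref<32x32xf16>, index, index,
       memref<32xf16, #gpu.address_space<workgroup>>, index) -> ()
```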
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index acaf6a2f8792a..88c2eb3326d96 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -134,6 +134,8 @@ static bool hasGlobalMemorySpace(Attribute memorySpace) {
}
static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
+ if (!memorySpace)
+ return false;
if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
return intMemorySpace.getInt() == 3;
if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
@@ -142,6 +144,8 @@ static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
}
static bool hasFatRawBufferMemorySpace(Attribute memorySpace) {
+ if (!memorySpace)
+ return false;
if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
return intMemorySpace.getInt() == 7;
if (auto gpuMemorySpace = dyn_cast<amdgpu::AddressSpaceAttr>(memorySpace))
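The two null checks added above matter because a memref in the default memory space carries no memory-space attribute at all: `getMemorySpace()` returns a null `Attribute`, and `dyn_cast` asserts on null values. With the guard, the verifier can reject such operands cleanly, as the new `invalid.mlir` case below exercises. In isolation (a restatement of that test):

```mlir
// `%dst` uses the default (empty) memory space, so its memory-space
// attribute is null; the early return lets the verifier emit this
// diagnostic instead of crashing in `hasWorkgroupMemorySpace`.
func.func @no_lds(%i : index, %src : memref<32xf16>, %dst : memref<32xf16>) {
  // expected-error @+1 {{'amdgpu.gather_to_lds' op destination memory address space must be Workgroup}}
  amdgpu.gather_to_lds %src[%i], %dst[%i]
      : vector<2xf16>, memref<32xf16>, memref<32xf16>
  func.return
}
```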
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
index 77103fa5c25f1..e48c94195ea56 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
@@ -127,12 +127,15 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g
// CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
// CHECK: %[[ALLOC:.*]] = memref.alloc()
// CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[C0_I64:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
// CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
// CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]]
// CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
// CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
// CHECK: %[[DSTIDX:.*]] = llvm.mul %[[DSTIDX_CAST]], %[[C64]] : i64
- // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX]]]
+ // CHECK: %[[DSTIDX1:.*]] = llvm.add %[[DSTIDX]], %[[C0_I64]] : i64
+ // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX1]]]
// CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4
%alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace>
%c0 = arith.constant 0 : index
@@ -151,7 +154,7 @@ func.func @fat_buffer_load_to_rocdl_f32(%global : memref<128x72xf32, #amdgpu_fat
// CHECK: %[[BUFFER_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
// CHECK: %[[C0:.*]] = arith.constant 0 : index
- // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64
+ // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
// CHECK: %[[C12:.*]] = arith.constant 12 : index
// CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]]
// CHECK: %[[C32:.*]] = arith.constant 32 : index
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 6d55583f8bc7c..0d2fd245af9e2 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -222,3 +222,11 @@ func.func @transpose_load_vector_size_i8(%idx1 : index, %idx2 : index, %mem : me
%0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xi6, 3> -> vector<8xi6>
func.return %0 : vector<8xi6>
}
+
+// -----
+
+func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : memref<32xf16>) {
+ // expected-error@+1 {{'amdgpu.gather_to_lds' op destination memory address space must be Workgroup}}
+ amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16>
+ func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 51f3bbd9ae45c..5559ac8f1a5c3 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -493,3 +493,14 @@ func.func @transpose_load(%idx1 : index, %idx2 : index, %mem : memref<128x32xf16
%0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xf16, 3> -> vector<4xf16>
func.return %0 : vector<4xf16>
}
+
+// CHECK-LABEL: func @gather_to_lds
+func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %mem2 : memref<32x32xf16>, %smem1 : memref<32xf16, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf16, #gpu.address_space<workgroup>>) {
+ // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
+ // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}]
+ // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
+ amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32x32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>>
+ amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem1[%idx1] : vector<2xf16>, memref<32x32xf16>, memref<32xf16, #gpu.address_space<workgroup>>
+ amdgpu.gather_to_lds %mem1[%idx1], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>>
+ func.return
+}
krzysz00 left a comment:
Approved, thanks for the catch!
Add some basic functional support for `global_load_lds` instruction generation. See the docs for details. Currently doesn't support buffer ops (needs llvm/llvm-project#149407), masking (needs buffer ops and for Ivan to think real hard), or actual gathers (need for Ivan to think even harder). Scheduling still doesn't work either, but some immediate issues are fixed. --------- Signed-off-by: nithinsubbiah <[email protected]> Signed-off-by: Ivan Butygin <[email protected]> Co-authored-by: nithinsubbiah <[email protected]>

Contains a fix for the `gather_to_lds` mismatched-ranks issue (llvm/llvm-project#149407). Signed-off-by: Ivan Butygin <[email protected]>