Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def AMDGPU_ScaledExtPackedOp
let summary = "Extend a vector of packed floating point values";

let description = [{
Extend and scale two packed floats in `source[index]` to two floats and
Extend and scale two packed floats in `source[index]` to two floats and
return them.

This rather unusual signature arises from the fact that AMD GPUs cannot
Expand Down Expand Up @@ -861,7 +861,7 @@ def AMDGPU_WMMAOp :
}

def AMDGPU_GatherToLDSOp :
AMDGPU_Op<"gather_to_lds", [SameVariadicOperandSize]>,
AMDGPU_Op<"gather_to_lds", [AttrSizedOperandSegments]>,
Arguments<(ins
Arg<AnyMemRef, "buffer to gather from", [MemRead]>:$src,
Variadic<Index>:$srcIndices,
Expand Down Expand Up @@ -966,13 +966,13 @@ def AMDGPU_ScaledMFMAOp :
order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on).

This wrapper takes inspiration from `amdgpu.mfma`, but has some key differences:
- `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and
fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile
size.
- `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp`
- `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and
fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile
size.
- `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp`
are omitted from this wrapper.
- The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for
double-precision operations on gfx94x and so are not included here.
- The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for
double-precision operations on gfx94x and so are not included here.
}];
let assemblyFormat = [{
`(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC
Expand Down
4 changes: 4 additions & 0 deletions mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ static bool hasGlobalMemorySpace(Attribute memorySpace) {
}

static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
if (!memorySpace)
return false;
if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
return intMemorySpace.getInt() == 3;
if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
Expand All @@ -142,6 +144,8 @@ static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
}

static bool hasFatRawBufferMemorySpace(Attribute memorySpace) {
if (!memorySpace)
return false;
if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
return intMemorySpace.getInt() == 7;
if (auto gpuMemorySpace = dyn_cast<amdgpu::AddressSpaceAttr>(memorySpace))
Expand Down
7 changes: 5 additions & 2 deletions mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,15 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g
// CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
// CHECK: %[[ALLOC:.*]] = memref.alloc()
// CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[C0_I64:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
// CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
// CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]]
// CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
// CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
// CHECK: %[[DSTIDX:.*]] = llvm.mul %[[DSTIDX_CAST]], %[[C64]] : i64
// CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX]]]
// CHECK: %[[DSTIDX1:.*]] = llvm.add %[[DSTIDX]], %[[C0_I64]] : i64
// CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX1]]]
// CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4
%alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace>
%c0 = arith.constant 0 : index
Expand All @@ -151,7 +154,7 @@ func.func @fat_buffer_load_to_rocdl_f32(%global : memref<128x72xf32, #amdgpu_fat
// CHECK: %[[BUFFER_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]

// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64
// CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
// CHECK: %[[C12:.*]] = arith.constant 12 : index
// CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]]
// CHECK: %[[C32:.*]] = arith.constant 32 : index
Expand Down
8 changes: 8 additions & 0 deletions mlir/test/Dialect/AMDGPU/invalid.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,11 @@ func.func @transpose_load_vector_size_i8(%idx1 : index, %idx2 : index, %mem : me
%0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xi6, 3> -> vector<8xi6>
func.return %0 : vector<8xi6>
}

// -----

func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : memref<32xf16>) {
// expected-error@+1 {{'amdgpu.gather_to_lds' op destination memory address space must be Workgroup}}
amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16>
func.return
}
11 changes: 11 additions & 0 deletions mlir/test/Dialect/AMDGPU/ops.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -493,3 +493,14 @@ func.func @transpose_load(%idx1 : index, %idx2 : index, %mem : memref<128x32xf16
%0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xf16, 3> -> vector<4xf16>
func.return %0 : vector<4xf16>
}

// CHECK-LABEL: func @gather_to_lds
func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %mem2 : memref<32x32xf16>, %smem1 : memref<32xf16, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf16, #gpu.address_space<workgroup>>) {
// CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
// CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}]
// CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32x32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>>
amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem1[%idx1] : vector<2xf16>, memref<32x32xf16>, memref<32xf16, #gpu.address_space<workgroup>>
amdgpu.gather_to_lds %mem1[%idx1], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>>
func.return
}
Loading