Skip to content

Commit b483701

Browse files
committed
Update tests.
1 parent 73629f4 commit b483701

File tree

3 files changed

+139
-27
lines changed

3 files changed

+139
-27
lines changed

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -935,24 +935,10 @@ struct GatherToLDSOpLowering : public ConvertOpToLLVMPattern<GatherToLDSOp> {
935935
if (loadWidth != 1 && loadWidth != 2 && loadWidth != 4)
936936
return op.emitOpError("chipset unsupported element size");
937937

938-
auto convertIndices = [&](ValueRange indices) -> SmallVector<Value, 4> {
939-
SmallVector<Value, 4> convertedIndices;
940-
941-
for (Value index : indices) {
942-
Type convertedType = getTypeConverter()->convertType(index.getType());
943-
auto convertedIndex = rewriter.create<LLVM::ConstantOp>(
944-
loc, convertedType, rewriter.getIntegerAttr(convertedType, 0));
945-
convertedIndices.push_back(convertedIndex);
946-
}
947-
return convertedIndices;
948-
};
949-
950-
Value srcPtr =
951-
getStridedElementPtr(loc, srcMemRefType, adaptor.getSrc(),
952-
convertIndices(op.getSrcIndices()), rewriter);
953-
Value dstPtr =
954-
getStridedElementPtr(loc, dstMemRefType, adaptor.getDst(),
955-
convertIndices(op.getDstIndices()), rewriter);
938+
Value srcPtr = getStridedElementPtr(loc, srcMemRefType, adaptor.getSrc(),
939+
(adaptor.getSrcIndices()), rewriter);
940+
Value dstPtr = getStridedElementPtr(loc, dstMemRefType, adaptor.getDst(),
941+
(adaptor.getDstIndices()), rewriter);
956942

957943
rewriter.replaceOpWithNewOp<ROCDL::GlobalLoadLDSOp>(
958944
op, srcPtr, dstPtr, createI32Constant(rewriter, loc, loadWidth),

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,17 +117,18 @@ LogicalResult FatRawBufferCastOp::verify() {
117117
/// Reports whether `memorySpace` designates global memory.
///
/// Accepted forms are: no memory space at all (null attribute), an integer
/// address space equal to 0 or 1, or a `gpu::AddressSpaceAttr` whose value is
/// `gpu::AddressSpace::Global`. Any other attribute yields false.
static bool hasGlobalMemorySpace(Attribute memorySpace) {
  // A missing memory space attribute is accepted as global. This check must
  // come first: the casts below require a non-null attribute.
  if (!memorySpace)
    return true;

  if (auto asInteger = llvm::dyn_cast<IntegerAttr>(memorySpace)) {
    const auto space = asInteger.getInt();
    return space == 0 || space == 1;
  }

  auto asGpuSpace = llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace);
  return asGpuSpace && asGpuSpace.getValue() == gpu::AddressSpace::Global;
}
127126

128127
/// Reports whether `memorySpace` designates workgroup memory.
///
/// Accepted forms are an integer address space equal to 3 or a
/// `gpu::AddressSpaceAttr` whose value is `gpu::AddressSpace::Workgroup`.
/// A null attribute (no memory space specified) and any other attribute
/// yield false.
static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
  // `llvm::dyn_cast` must not be invoked on a null attribute (it asserts on a
  // non-present value), so reject the "no memory space" case up front, in the
  // same way hasGlobalMemorySpace handles it before casting.
  if (!memorySpace)
    return false;
  if (auto intMemorySpace = llvm::dyn_cast<IntegerAttr>(memorySpace))
    return intMemorySpace.getInt() == 3;
  if (auto gpuMemorySpace = llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
    return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup;
  return false;
}
133134

mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir

Lines changed: 131 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,142 @@ func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_add
1010
%c12 = arith.constant 12 : index
1111
%c32 = arith.constant 32 : index
1212
%alloc = memref.alloc() : memref<64x64xf32, #gpu_lds_addrspace>
13-
// CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<128x72xf32, 1> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
14-
// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<64x64xf32, 3>
15-
// CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]] : memref<64x64xf32, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
13+
// CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
14+
15+
// CHECK: %[[C0:.*]] = arith.constant 0 : index
16+
// CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
17+
// CHECK: %[[C12:.*]] = arith.constant 12 : index
18+
// CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]]
19+
// CHECK: %[[C32:.*]] = arith.constant 32 : index
20+
// CHECK: %[[IC32:.*]] = builtin.unrealized_conversion_cast %[[C32]]
21+
22+
// CHECK: %[[ALLOC:.*]] = memref.alloc()
23+
// CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast
24+
// CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
25+
26+
// CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
27+
// CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
28+
// CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64
29+
30+
// CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
31+
// CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
32+
33+
// CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
34+
// CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C64]] : i64
35+
// CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
36+
37+
// CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
38+
// CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
39+
// CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
40+
// CHECK: %[[C0_2:.*]] = llvm.mlir.constant(0 : i32) : i32
41+
// CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]], %[[C0]], %[[C0_2]]
42+
amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] {transferType = f32}
43+
: memref<128x72xf32, #gpu_global_addrspace>, memref<64x64xf32, #gpu_lds_addrspace>
44+
func.return
45+
}
46+
47+
// Lowering of an i8 gather_to_lds with constant indices: the source and
// destination offsets must be linearized from the real indices using each
// memref's own row stride, and the transfer width must be 1 byte.
// CHECK-LABEL: func @global_load_to_rocdl_i8
// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi8, 1>)
func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrspace>) {
  // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]

  // CHECK: %[[C0:.*]] = arith.constant 0 : index
  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
  // CHECK: %[[C12:.*]] = arith.constant 12 : index
  // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]]
  // CHECK: %[[C32:.*]] = arith.constant 32 : index
  // CHECK: %[[IC32:.*]] = builtin.unrealized_conversion_cast %[[C32]]

  // CHECK: %[[ALLOC:.*]] = memref.alloc()
  // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
  // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]

  // Source is 128x72, so its row stride is 72.
  // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
  // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
  // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64

  // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
  // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]

  // Destination is 64x64, so its row stride is 64 (not the source's 72).
  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C64]] : i64
  // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64

  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
  // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
  // CHECK: %[[C0_I32:.*]] = llvm.mlir.constant(0 : i32) : i32
  // CHECK: %[[C0_I32_2:.*]] = llvm.mlir.constant(0 : i32) : i32
  // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C1]], %[[C0_I32]], %[[C0_I32_2]]
  %c0 = arith.constant 0 : index
  %c12 = arith.constant 12 : index
  %c32 = arith.constant 32 : index
  %alloc = memref.alloc() : memref<64x64xi8, #gpu_lds_addrspace>
  amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] {transferType = i8}
    : memref<128x72xi8, #gpu_global_addrspace>, memref<64x64xi8, #gpu_lds_addrspace>
  func.return
}
87+
88+
// Lowering of a vector<2 x i16> gather_to_lds with constant indices: the
// transfer is 2 * 16 bits = 4 bytes wide, and each memref's offset must be
// linearized with its own row stride.
// CHECK-LABEL: func @global_load_to_rocdl_vec
// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi16, 1>)
func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_addrspace>) {
  // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]

  // CHECK: %[[C0:.*]] = arith.constant 0 : index
  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
  // CHECK: %[[C12:.*]] = arith.constant 12 : index
  // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]]
  // CHECK: %[[C32:.*]] = arith.constant 32 : index
  // CHECK: %[[IC32:.*]] = builtin.unrealized_conversion_cast %[[C32]]

  // CHECK: %[[ALLOC:.*]] = memref.alloc()
  // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
  // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]

  // Source is 128x72, so its row stride is 72.
  // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
  // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
  // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64

  // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
  // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]

  // Destination is 64x128, so its row stride is 128 (not the source's 72).
  // CHECK: %[[C128:.*]] = llvm.mlir.constant(128 : index) : i64
  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C128]] : i64
  // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64

  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
  // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
  // CHECK: %[[C0_I32:.*]] = llvm.mlir.constant(0 : i32) : i32
  // CHECK: %[[C0_I32_2:.*]] = llvm.mlir.constant(0 : i32) : i32
  // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]], %[[C0_I32]], %[[C0_I32_2]]
  %c0 = arith.constant 0 : index
  %c12 = arith.constant 12 : index
  %c32 = arith.constant 32 : index
  %alloc = memref.alloc() : memref<64x128xi16, #gpu_lds_addrspace>
  amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] {transferType = vector<2 x i16>}
    : memref<128x72xi16, #gpu_global_addrspace>, memref<64x128xi16, #gpu_lds_addrspace>
  func.return
}
128+
129+
130+
// Lowering of gather_to_lds with indices passed as function arguments: the
// dynamic index values themselves (not zero placeholders) must feed the
// address computation of both the source and the destination.
// CHECK-LABEL: func @global_load_to_rocdl_dynamic_indices
// CHECK-SAME: (%[[ARG0:.*]]: memref<512xi32, 1>, %[[SRC_IDX:.*]]: index, %[[DST_IDX:.*]]: index)
func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_global_addrspace>, %src_idx : index, %dst_idx : index) {
  // CHECK: %[[DSTIDX_CAST:.*]] = builtin.unrealized_conversion_cast %[[DST_IDX]]
  // CHECK: %[[SRCIDX_CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC_IDX]]
  // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
  // CHECK: %[[ALLOC:.*]] = memref.alloc()
  // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
  // CHECK: %[[C0:.*]] = arith.constant 0 : index
  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]]
  // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]

  // The source is 1-D, so the cast source index is used as the offset directly.
  // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]]
  // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]

  // The destination is 4x64 indexed by [%dst_idx, %c0], so the offset is
  // dst_idx * 64 + 0 rather than dst_idx alone.
  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
  // CHECK: %[[DST_ROW:.*]] = llvm.mul %[[DSTIDX_CAST]], %[[C64]] : i64
  // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[DST_ROW]], %[[IC0]] : i64

  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
  // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
  // CHECK: %[[C0_I32:.*]] = llvm.mlir.constant(0 : i32) : i32
  // CHECK: %[[C0_I32_2:.*]] = llvm.mlir.constant(0 : i32) : i32
  // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]], %[[C0_I32]], %[[C0_I32_2]]
  %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace>
  %c0 = arith.constant 0 : index
  amdgpu.gather_to_lds %global[%src_idx], %alloc[%dst_idx, %c0] {transferType = i32}
    : memref<512xi32, #gpu_global_addrspace>, memref<4x64xi32, #gpu_lds_addrspace>
  func.return
}

0 commit comments

Comments
 (0)