@@ -10,17 +10,145 @@ func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_add
   %c12 = arith.constant 12 : index
   %c32 = arith.constant 32 : index
   %alloc = memref.alloc() : memref<64x64xf32, #gpu_lds_addrspace>
-  // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<128x72xf32, 1> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
-  // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<64x64xf32, 3>
-  // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]] : memref<64x64xf32, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+  // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
+
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
+  // CHECK: %[[C12:.*]] = arith.constant 12 : index
+  // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]]
+  // CHECK: %[[C32:.*]] = arith.constant 32 : index
+  // CHECK: %[[IC32:.*]] = builtin.unrealized_conversion_cast %[[C32]]
+
+  // CHECK: %[[ALLOC:.*]] = memref.alloc()
+  // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast
+  // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
+
+  // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
+  // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
+  // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64
+
+  // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
+  // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
+
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C64]] : i64
+  // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
+
+  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
+  // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK: %[[C0_2:.*]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]], %[[C0]], %[[C0_2]]
+  amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] {transferType = f32}
+    : memref<128x72xf32, #gpu_global_addrspace>, memref<64x64xf32, #gpu_lds_addrspace>
+  func.return
+}
+
+// CHECK-LABEL: func @global_load_to_rocdl_i8
+// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi8, 1>)
+func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrspace>) {
+  // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
+
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
+  // CHECK: %[[C12:.*]] = arith.constant 12 : index
+  // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]]
+  // CHECK: %[[C32:.*]] = arith.constant 32 : index
+  // CHECK: %[[IC32:.*]] = builtin.unrealized_conversion_cast %[[C32]]
+
+  // CHECK: %[[ALLOC:.*]] = memref.alloc()
+  // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
+  // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
+
+  // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
+  // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
+  // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64
+
+  // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
+  // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
+
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C64]] : i64
+  // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
+
+  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
+  // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK: %[[C0_2:.*]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C1]], %[[C0]], %[[C0_2]]
+  %c0 = arith.constant 0 : index
+  %c12 = arith.constant 12 : index
+  %c32 = arith.constant 32 : index
+  %alloc = memref.alloc() : memref<64x64xi8, #gpu_lds_addrspace>
+  amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] {transferType = i8}
+    : memref<128x72xi8, #gpu_global_addrspace>, memref<64x64xi8, #gpu_lds_addrspace>
+  func.return
+}
+
+// CHECK-LABEL: func @global_load_to_rocdl_vec
+// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi16, 1>)
+func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_addrspace>) {
+  // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
+
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
+  // CHECK: %[[C12:.*]] = arith.constant 12 : index
+  // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]]
+  // CHECK: %[[C32:.*]] = arith.constant 32 : index
+  // CHECK: %[[IC32:.*]] = builtin.unrealized_conversion_cast %[[C32]]
+
+  // CHECK: %[[ALLOC:.*]] = memref.alloc()
+  // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
   // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
-  // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]]
+
+  // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
+  // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
+  // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64
+
+  // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
-  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]]
+
+  // CHECK: %[[C128:.*]] = llvm.mlir.constant(128 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C128]] : i64
+  // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
+
+  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
   // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: %[[C0_2:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]], %[[C0]], %[[C0_2]]
-  amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] {transferType = f32} : memref<128x72xf32, #gpu_global_addrspace>, memref<64x64xf32, #gpu_lds_addrspace>
+  %c0 = arith.constant 0 : index
+  %c12 = arith.constant 12 : index
+  %c32 = arith.constant 32 : index
+  %alloc = memref.alloc() : memref<64x128xi16, #gpu_lds_addrspace>
+  amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] {transferType = vector<2xi16>}
+    : memref<128x72xi16, #gpu_global_addrspace>, memref<64x128xi16, #gpu_lds_addrspace>
   func.return
 }
+
+
+// CHECK-LABEL: func @global_load_to_rocdl_dynamic_indices
+// CHECK-SAME: (%[[ARG0:.*]]: memref<512xi32, 1>, %[[SRC_IDX:.*]]: index, %[[DST_IDX:.*]]: index)
+func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_global_addrspace>, %src_idx : index, %dst_idx : index) {
+  // CHECK: %[[DSTIDX_CAST:.*]] = builtin.unrealized_conversion_cast %[[DST_IDX]]
+  // CHECK: %[[SRCIDX_CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC_IDX]]
+  // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
+  // CHECK: %[[ALLOC:.*]] = memref.alloc()
+  // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
+  // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
+  // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]]
+  // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
+  // CHECK: %[[MUL:.*]] = llvm.mul %[[DSTIDX_CAST]], %[[C64]] : i64
+  // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL]], %{{.*}} : i64
+  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
+  // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK: %[[C0_2:.*]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]], %[[C0]], %[[C0_2]]
+  %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace>
+  %c0 = arith.constant 0 : index
+  amdgpu.gather_to_lds %global[%src_idx], %alloc[%dst_idx, %c0] {transferType = i32}
+    : memref<512xi32, #gpu_global_addrspace>, memref<4x64xi32, #gpu_lds_addrspace>
+  func.return
+}
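
Note: the hunk starts at line 10 of the test file, so the lit preamble is not shown in this diff. For context, a conversion test of this shape is normally driven by a header roughly like the sketch below; the exact RUN flags and chipset are assumptions (any gfx9 target with global-load-to-LDS support would do), while the two address-space aliases are implied by the `memref<..., 1>` and `memref<..., 3>` patterns checked above:

// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s

#gpu_global_addrspace = 1
#gpu_lds_addrspace = 3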