@@ -21,8 +21,8 @@ func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_add
2121
2222 // CHECK: %[[ALLOC:.*]] = memref.alloc()
2323 // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast
24- // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
25-
24+ // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
25+
2626 // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
2727 // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
2828 // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64
@@ -35,8 +35,7 @@ func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_add
3535 // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
3636
3737 // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
38- // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
39- // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]]
38+ // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4
4039 amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0]
4140 : f32, memref<128x72xf32, #gpu_global_addrspace>, memref<64x64xf32, #gpu_lds_addrspace>
4241 func.return
@@ -56,8 +55,8 @@ func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrs
5655
5756 // CHECK: %[[ALLOC:.*]] = memref.alloc()
5857 // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
59- // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
60-
58+ // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
59+
6160 // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
6261 // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
6362 // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64
@@ -70,8 +69,7 @@ func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrs
7069 // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
7170
7271 // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
73- // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
74- // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C1]]
72+ // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 1
7573 %c0 = arith.constant 0 : index
7674 %c12 = arith.constant 12 : index
7775 %c32 = arith.constant 32 : index
@@ -85,7 +83,7 @@ func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrs
8583// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi16, 1>)
8684func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_addrspace>) {
8785 // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
88-
86+
8987 // CHECK: %[[C0:.*]] = arith.constant 0 : index
9088 // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64
9189 // CHECK: %[[C12:.*]] = arith.constant 12 : index
@@ -95,8 +93,8 @@ func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_add
9593
9694 // CHECK: %[[ALLOC:.*]] = memref.alloc()
9795 // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
98- // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
99-
96+ // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
97+
10098 // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
10199 // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
102100 // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64
@@ -109,8 +107,7 @@ func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_add
109107 // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
110108
111109 // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
112- // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
113- // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]]
110+ // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4
114111 %c0 = arith.constant 0 : index
115112 %c12 = arith.constant 12 : index
116113 %c32 = arith.constant 32 : index
@@ -129,12 +126,11 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g
129126 // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
130127 // CHECK: %[[ALLOC:.*]] = memref.alloc()
131128 // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
132- // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
129+ // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
133130 // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]]
134131 // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
135132 // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX_CAST]]]
136- // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
137- // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]]
133+ // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4
138134 %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace>
139135 %c0 = arith.constant 0 : index
140136 amdgpu.gather_to_lds %global[%src_idx], %alloc[%dst_idx, %c0]
0 commit comments