@@ -13,7 +13,8 @@ gpu.module @load_store_check {
13
13
// CHECK: %[[LD_DESC_3:.*]] = vector.insert {{.*}}, %[[LD_DESC_2]] [3] : i32 into vector<8xi32>
14
14
// CHECK: %[[LD_DESC_4:.*]] = vector.insert {{.*}}, %[[LD_DESC_3]] [4] : i32 into vector<8xi32>
15
15
// CHECK: %[[LD_DESC:.*]] = vector.insert {{.*}}, %[[LD_DESC_4]] [5] : i32 into vector<8xi32>
16
- %src_tdesc = xegpu.create_nd_tdesc %srcce [0 , 0 ] : memref <8 x16 xf32 > -> !xegpu.tensor_desc <8 x16 xf32 , #xegpu.block_tdesc_attr <memory_space = global >, #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
16
+ %src_tdesc = xegpu.create_nd_tdesc %srcce [0 , 0 ] : memref <8 x16 xf32 > -> !xegpu.tensor_desc <8 x16 xf32 >
17
+
17
18
18
19
//CHECK: %[[LD_DESC_I64:.*]] = vector.bitcast %[[LD_DESC]] : vector<8xi32> to vector<4xi64>
19
20
//CHECK: %[[LD_INTPTR:.*]] = vector.extract %[[LD_DESC_I64]][0] : i64 from vector<4xi64>
@@ -25,15 +26,14 @@ gpu.module @load_store_check {
25
26
//CHECK: %[[LD_SIZEOF_F32:.*]] = arith.constant 4 : i32
26
27
//CHECK: %[[LD_BASE_ROW_IN_BYTES:.*]] = arith.muli %[[LD_BASE_W]], %[[LD_SIZEOF_F32]] : i32
27
28
//CHECK: %[[LD_LOADED_I32:.*]] = xevm.blockload2d %[[LD_LLVMPTR]], %[[LD_BASE_ROW_IN_BYTES]], %[[LD_BASE_H]], %[[LD_BASE_ROW_IN_BYTES]], %[[LD_TILE_W]], %[[LD_TILE_H]] {elem_size_in_bits = 32, tile_width = 16, tile_height = 8, v_blocks = 1, transpose = false, vnni_transform = false, l1_cache_control = C, l3_cache_control = UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
28
- %loaded = xegpu.load_nd %src_tdesc <{l1_hint = #xegpu.cache_hint <cached >, l2_hint = #xegpu.cache_hint <uncached >}> : !xegpu.tensor_desc <8 x16 xf32 , #xegpu.block_tdesc_attr < memory_space = global >, #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>> -> vector <8 x 1 x f32 >
29
+ %loaded = xegpu.load_nd %src_tdesc <{l1_hint = #xegpu.cache_hint <cached >, l2_hint = #xegpu.cache_hint <uncached >}> : !xegpu.tensor_desc <8 x16 xf32 > -> vector <8 x f32 >
29
30
//CHECK: %[[LD_LOADED_F32:.*]] = vector.bitcast %[[LD_LOADED_I32]] : vector<8xi32> to vector<8xf32>
30
- //CHECK: %[[LD_LOADED_F32_DISTRIBUTED:.*]] = vector.shape_cast %[[LD_LOADED_F32]] : vector<8xf32> to vector<8x1xf32>
31
31
32
32
%tid_x = gpu.thread_id x
33
33
%tid_x_i32 = arith.index_cast %tid_x : index to i32
34
34
%tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32
35
- //CHECK: %[[LOADED_F32_DISTRIBUTED_MODIFIED:.*]] = vector.insert %{{.*}}, %[[LD_LOADED_F32_DISTRIBUTED]] [0, 0] : f32 into vector<8x1xf32>
36
- %loaded_modified = vector.insert %tid_x_f32 , %loaded [0 , 0 ] : f32 into vector <8 x 1 x f32 >
35
+ //CHECK: %[[LOADED_F32_MODIFIED:.*]] = vector.insert %{{.*}}, %[[LD_LOADED_F32]] [0] : f32 into vector<8xf32>
36
+ %loaded_modified = vector.insert %tid_x_f32 , %loaded [0 ] : f32 into vector <8 x f32 >
37
37
38
38
// CHECK: %[[PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64
39
39
// CHECK: %[[CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64>
@@ -43,7 +43,7 @@ gpu.module @load_store_check {
43
43
// CHECK: %[[DESC_3:.*]] = vector.insert {{.*}}, %[[DESC_2]] [3] : i32 into vector<8xi32>
44
44
// CHECK: %[[DESC_4:.*]] = vector.insert {{.*}}, %[[DESC_3]] [4] : i32 into vector<8xi32>
45
45
// CHECK: %[[DESC:.*]] = vector.insert {{.*}}, %[[DESC_4]] [5] : i32 into vector<8xi32>
46
- %dst_tdesc = xegpu.create_nd_tdesc %dstte [0 , 0 ] : memref <8 x16 xf32 > -> !xegpu.tensor_desc <8 x16 xf32 , #xegpu.block_tdesc_attr <memory_space = global >, #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]> >
46
+ %dst_tdesc = xegpu.create_nd_tdesc %dstte [0 , 0 ] : memref <8 x16 xf32 > -> !xegpu.tensor_desc <8 x16 xf32 , #xegpu.block_tdesc_attr <memory_space = global >>
47
47
48
48
//CHECK: %[[DESC_I64:.*]] = vector.bitcast %[[DESC]] : vector<8xi32> to vector<4xi64>
49
49
//CHECK: %[[INTPTR:.*]] = vector.extract %[[DESC_I64]][0] : i64 from vector<4xi64>
@@ -54,10 +54,9 @@ gpu.module @load_store_check {
54
54
//CHECK: %[[LLVMPTR:.*]] = llvm.inttoptr %[[INTPTR]] : i64 to !llvm.ptr<1>
55
55
//CHECK: %[[SIZEOF_F32:.*]] = arith.constant 4 : i32
56
56
//CHECK: %[[BASE_ROW_IN_BYTES:.*]] = arith.muli %[[BASE_W]], %[[SIZEOF_F32]] : i32
57
- //CHECK: %[[FLAT_VALUE:.*]] = vector.shape_cast %[[LOADED_F32_DISTRIBUTED_MODIFIED]] : vector<8x1xf32> to vector<8xf32>
58
- //CHECK: %[[FLAT_VALUE_I32:.*]] = vector.bitcast %[[FLAT_VALUE]] : vector<8xf32> to vector<8xi32>
57
+ //CHECK: %[[FLAT_VALUE_I32:.*]] = vector.bitcast %[[LOADED_F32_MODIFIED]] : vector<8xf32> to vector<8xi32>
59
58
//CHECK: xevm.blockstore2d %[[LLVMPTR]], %[[BASE_ROW_IN_BYTES]], %[[BASE_H]], %[[BASE_ROW_IN_BYTES]], %[[TILE_W]], %[[TILE_H]], %[[FLAT_VALUE_I32]] {elem_size_in_bits = 32, tile_width = 16, tile_height = 8, v_blocks = 1, l1_cache_control = WB, l3_cache_control = UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
60
- xegpu.store_nd %loaded_modified , %dst_tdesc <{l1_hint = #xegpu.cache_hint <write_back >, l2_hint = #xegpu.cache_hint <uncached >}>: vector <8 x 1 x f32 >, !xegpu.tensor_desc <8 x16 xf32 , #xegpu.block_tdesc_attr <memory_space = global >, #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ] >>
59
+ xegpu.store_nd %loaded_modified , %dst_tdesc <{l1_hint = #xegpu.cache_hint <write_back >, l2_hint = #xegpu.cache_hint <uncached >}>: vector <8 x f32 >, !xegpu.tensor_desc <8 x16 xf32 , #xegpu.block_tdesc_attr <memory_space = global >>
61
60
gpu.return
62
61
}
63
62
}
0 commit comments