
Commit 88ab9aa

Add more high rank base memory test cases.
1 parent e053be1 commit 88ab9aa
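
The two test files below exercise lowering of xegpu.create_nd_tdesc with a rank-4 base buffer through -convert-xegpu-to-xevm: the first uses a memref<?x?x?x?xf32> with constant shape and strides, the second a raw i64 base pointer with fully dynamic shape and stride operands. Both load and store an 8x16xf32 tile via xevm.blockload2d / xevm.blockstore2d.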

2 files changed, +106 -0 lines changed
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize %s | FileCheck %s

gpu.module @load_store_check {
  // CHECK-LABEL: gpu.func @load_store
  // CHECK-SAME: %[[ARG0:.*]]: memref<?x?x?x?xf32>, %[[ARG1:.*]]: memref<?x?x?x?xf32>) kernel {
  gpu.func @load_store(%src: memref<?x?x?x?xf32>, %dst: memref<?x?x?x?xf32>) kernel {
    // CHECK: %[[C32_I32:.*]] = arith.constant 32 : i32
    // CHECK: %[[C64_I32:.*]] = arith.constant 64 : i32
    // CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32
    // CHECK: %[[C72_I32:.*]] = arith.constant 72 : i32
    // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<?x?x?x?xf32> -> index
    // CHECK: %[[VAR0:.*]] = arith.index_castui %[[INTPTR]] : index to i64
    // CHECK: %[[INTPTR_0:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<?x?x?x?xf32> -> index
    // CHECK: %[[VAR1:.*]] = arith.index_castui %[[INTPTR_0]] : index to i64
    %dim0 = arith.constant 3 : index
    %dim1 = arith.constant 3 : index
    %dim2 = arith.constant 8 : index
    %dim3 = arith.constant 16 : index
    %stride3 = arith.constant 1 : index
    %stride2 = arith.constant 16 : index
    %stride1 = arith.constant 128 : index
    %stride0 = arith.constant 384 : index

    %src_tdesc = xegpu.create_nd_tdesc %src, shape:[%dim0, %dim1, %dim2, %dim3],
      strides:[%stride0, %stride1, %stride2, %stride3] : memref<?x?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32>

    // CHECK: %[[VAR2:.*]] = llvm.inttoptr %[[VAR1]] : i64 to !llvm.ptr<1>
    // CHECK: %[[LOADED:.*]] = xevm.blockload2d %[[VAR2]], %[[C64_I32]], %[[C72_I32]], %[[C64_I32]],
    // CHECK-SAME: %[[C0_I32]], %[[C64_I32]] <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>,
    // CHECK-SAME: elem_size_in_bits = 32 : i32, pack_register = false, tile_height = 8 : i32,
    // CHECK-SAME: tile_width = 16 : i32, transpose = false, v_blocks = 1 : i32}>
    // CHECK: %[[LOADED_F32:.*]] = vector.bitcast %[[LOADED]] : vector<8xi32> to vector<8xf32>
    %loaded = xegpu.load_nd %src_tdesc[2, 2, 0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
      : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>

    %tid_x = gpu.thread_id x
    %tid_x_i32 = arith.index_cast %tid_x : index to i32
    %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32
    // CHECK: %[[LOADED_MODIFIED:.*]] = vector.insert
    %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<8xf32>

    %dst_tdesc = xegpu.create_nd_tdesc %dst, shape:[%dim0, %dim1, %dim2, %dim3],
      strides:[%stride0, %stride1, %stride2, %stride3] : memref<?x?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>

    // CHECK: %[[VAR8:.*]] = llvm.inttoptr %[[VAR0]] : i64 to !llvm.ptr<1>
    // CHECK: %[[LOADED_MODIFIED_BC:.*]] = vector.bitcast %[[LOADED_MODIFIED]] : vector<8xf32> to vector<8xi32>
    // CHECK: xevm.blockstore2d %[[VAR8]], %[[C64_I32]], %[[C72_I32]], %[[C64_I32]],
    // CHECK-SAME: %[[C0_I32]], %[[C32_I32]], %[[LOADED_MODIFIED_BC]] <{cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>,
    // CHECK-SAME: elem_size_in_bits = 32 : i32, tile_height = 8 : i32, tile_width = 16 : i32}>
    xegpu.store_nd %loaded_modified, %dst_tdesc[1, 1, 0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
      : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
    gpu.return
  }
}
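
A note on where the expected constants come from, derived from the static shape and strides in the test above: the rank-4 base collapses to a 2D surface whose width is the innermost dimension, 16 elements * 4 bytes = 64 (%[[C64_I32]]), and whose height is the product of the outer dimensions, 3 * 3 * 8 = 72 rows (%[[C72_I32]]). The load at offsets [2, 2, 0, 0] starts at row (2*384 + 2*128) / 16 = 64 and the store at [1, 1, 0, 0] at row (384 + 128) / 16 = 32, matching the y-offset operands %[[C64_I32]] and %[[C32_I32]] passed to blockload2d and blockstore2d.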
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize -cse %s | FileCheck %s

gpu.module @load_store_check {
  // CHECK-LABEL: gpu.func @load_store
  // CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: i64,
  // CHECK-SAME: %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index, %[[ARG5:.*]]: index, %[[ARG6:.*]]: index, %[[ARG7:.*]]: index, %[[ARG8:.*]]: index, %[[ARG9:.*]]: index
  gpu.func @load_store(%src: i64, %dst: i64, %dim0: index, %dim1: index, %dim2: index, %dim3: index,
                       %stride0: index, %stride1: index, %stride2: index, %stride3: index) kernel {
    // CHECK: %[[C2:.*]] = arith.constant 2 : index
    // CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32
    // CHECK: %[[C4_I32:.*]] = arith.constant 4 : i32
    // CHECK: %[[VAR0:.*]] = arith.index_cast %[[ARG5]] : index to i32
    // CHECK: %[[VAR1:.*]] = arith.index_cast %[[ARG2]] : index to i64
    // CHECK: %[[VAR2:.*]] = arith.index_cast %[[ARG3]] : index to i64
    // CHECK: %[[VAR3:.*]] = arith.muli %[[VAR1]], %[[VAR2]] : i64
    // CHECK: %[[VAR4:.*]] = arith.index_cast %[[ARG4]] : index to i64
    // CHECK: %[[VAR5:.*]] = arith.muli %[[VAR3]], %[[VAR4]] : i64
    // CHECK: %[[VAR6:.*]] = arith.trunci %[[VAR5]] : i64 to i32
    // CHECK: %[[VAR7:.*]] = arith.muli %[[ARG4]], %[[C2]] : index
    // CHECK: %[[VAR8:.*]] = arith.muli %[[ARG4]], %[[ARG3]] : index
    // CHECK: %[[VAR9:.*]] = arith.muli %[[VAR8]], %[[C2]] : index
    // CHECK: %[[VAR10:.*]] = arith.addi %[[VAR7]], %[[VAR9]] : index
    // CHECK: %[[VAR11:.*]] = arith.index_cast %[[VAR10]] : index to i32
    %src_tdesc = xegpu.create_nd_tdesc %src, shape:[%dim0, %dim1, %dim2, %dim3],
      strides:[%stride0, %stride1, %stride2, %stride3] : i64 -> !xegpu.tensor_desc<8x16xf32>

    // CHECK: %[[SRC_PTR:.*]] = llvm.inttoptr %[[ARG0]] : i64 to !llvm.ptr<1>
    // CHECK: %[[VAR13:.*]] = arith.muli %[[VAR0]], %[[C4_I32]] : i32
    // CHECK: %[[LOADED:.*]] = xevm.blockload2d %[[SRC_PTR]], %[[VAR13]], %[[VAR6]], %[[VAR13]], %[[C0_I32]], %[[VAR11]] <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>, elem_size_in_bits = 32 : i32, pack_register = false, tile_height = 8 : i32, tile_width = 16 : i32, transpose = false, v_blocks = 1 : i32}>
    // CHECK: %[[VAR15:.*]] = vector.bitcast %[[LOADED]] : vector<8xi32> to vector<8xf32>
    %loaded = xegpu.load_nd %src_tdesc[2, 2, 0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
      : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>

    %tid_x = gpu.thread_id x
    %tid_x_i32 = arith.index_cast %tid_x : index to i32
    %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32
    // CHECK: %[[LOADED_MODIFIED:.*]] = vector.insert
    %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<8xf32>

    // CHECK: %[[VAR19:.*]] = arith.addi %[[ARG4]], %[[VAR8]] : index
    // CHECK: %[[VAR20:.*]] = arith.index_cast %[[VAR19]] : index to i32
    %dst_tdesc = xegpu.create_nd_tdesc %dst, shape:[%dim0, %dim1, %dim2, %dim3],
      strides:[%stride0, %stride1, %stride2, %stride3] : i64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>

    // CHECK: %[[DST_PTR:.*]] = llvm.inttoptr %[[ARG1]] : i64 to !llvm.ptr<1>
    // CHECK: %[[LOADED_MODIFIED_BITCAST:.*]] = vector.bitcast %[[LOADED_MODIFIED]] : vector<8xf32> to vector<8xi32>
    // CHECK: xevm.blockstore2d %[[DST_PTR]], %[[VAR13]], %[[VAR6]], %[[VAR13]], %[[C0_I32]], %[[VAR20]], %[[LOADED_MODIFIED_BITCAST]] <{cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>, elem_size_in_bits = 32 : i32, tile_height = 8 : i32, tile_width = 16 : i32}>
    xegpu.store_nd %loaded_modified, %dst_tdesc[1, 1, 0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
      : vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
    gpu.return
  }
}
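
With a dynamic shape, the same quantities are computed at runtime, as the CHECK lines above show: the surface width and pitch are dim3 * 4 bytes (%[[VAR13]]), the flattened height is dim0 * dim1 * dim2 truncated to i32 (%[[VAR6]]), and the row offsets are 2*dim2 + 2*dim1*dim2 for the load at [2, 2, 0, 0] (%[[VAR10]]/%[[VAR11]]) and dim2 + dim1*dim2 for the store at [1, 1, 0, 0] (%[[VAR19]]/%[[VAR20]]).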
