|
// RUN: mlir-opt -convert-xegpu-to-xevm %s | FileCheck %s

gpu.module @prefetch_nd_check {
  // CHECK-LABEL: gpu.func @prefetch_nd(
  // CHECK-SAME: %[[ARG0:.*]]: memref<8x16xf32, 1>, %[[ARG1:.*]]: memref<8x16xf32, 1>) kernel {
  gpu.func @prefetch_nd(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel {
    // Lowering materializes the base pointer of the (flat) source memref as an i64.
    // CHECK: %[[MEMSPACECAST:.*]] = memref.memory_space_cast %[[ARG0]] : memref<8x16xf32, 1> to memref<8x16xf32>
    // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[MEMSPACECAST]] : memref<8x16xf32> -> index
    // CHECK: %[[VAR0:.*]] = arith.index_castui %[[INTPTR]] : index to i64
    %srcce = memref.memory_space_cast %src : memref<8x16xf32, 1> to memref<8x16xf32>
    // CHECK: %[[MEMSPACECAST_0:.*]] = memref.memory_space_cast %[[ARG1]] : memref<8x16xf32, 1> to memref<8x16xf32>
    %dstte = memref.memory_space_cast %dst : memref<8x16xf32, 1> to memref<8x16xf32>

    %src_tdesc = xegpu.create_nd_tdesc %srcce : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32,
      #xegpu.block_tdesc_attr<memory_space = global>>
    // Shape (16 x 8) and zero offsets are truncated to the i32 operands xevm expects.
    // CHECK: %[[C16_I64:.*]] = arith.constant 16 : i64
    // CHECK: %[[VAR1:.*]] = arith.trunci %[[C16_I64]] : i64 to i32
    // CHECK: %[[C8_I64:.*]] = arith.constant 8 : i64
    // CHECK: %[[VAR2:.*]] = arith.trunci %[[C8_I64]] : i64 to i32
    // CHECK: %[[C0_I64:.*]] = arith.constant 0 : i64
    // CHECK: %[[VAR3:.*]] = arith.trunci %[[C0_I64]] : i64 to i32
    // CHECK: %[[C0_I64_1:.*]] = arith.constant 0 : i64
    // CHECK: %[[VAR4:.*]] = arith.trunci %[[C0_I64_1]] : i64 to i32
    // CHECK: %[[VAR5:.*]] = llvm.inttoptr %[[VAR0]] : i64 to !llvm.ptr<1>
    // Row pitch in bytes = width (16) * sizeof(f32) (4).
    // CHECK: %[[C4_I32:.*]] = arith.constant 4 : i32
    // CHECK: %[[VAR6:.*]] = arith.muli %[[VAR1]], %[[C4_I32]] : i32
    // CHECK: xevm.blockprefetch2d %[[VAR5]], %[[VAR6]], %[[VAR2]], %[[VAR6]], %[[VAR3]], %[[VAR4]]
    // CHECK-SAME: <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>, elem_size_in_bits = 32 : i32,
    // CHECK-SAME: tile_height = 8 : i32, tile_width = 16 : i32, v_blocks = 1 : i32}>
    xegpu.prefetch_nd %src_tdesc[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
      : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>

    gpu.return
  }
|
0 commit comments