// RUN: mlir-opt --xegpu-instructionlize -split-input-file %s | FileCheck %s
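
// Verifies instruction-level blocking for XeGPU: the pass is expected to
// decompose each workgroup-sized op into the instruction-sized tiles named
// by the inst_data field of its layout attribute.
//
// #a/#b/#c are full layouts (inst_data plus a per-lane distribution), so the
// blocked ops keep a lane-level layout attribute; #l1/#l2 carry inst_data
// only, so the blocked ops drop to plain vector/tensor_desc types.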
#a = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
#b = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>
#c = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>

#l1 = #xegpu.layout<inst_data = [8, 16]>
#l2 = #xegpu.layout<inst_data = [16, 16]>

gpu.module @test_kernel {
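  // test_gemm: A and C use inst_data = [8, 16], B uses [16, 16]. The 16x32
  // A tile should split into (16/8)x(32/16) = 4 pieces, the 32x32 B tile
  // into 4, and the 16x32 dpas (reducing over k = 32) into 2x2x2 = 8 dpas
  // of shape m8 x n16 x k16.
  //CHECK-LABEL: @test_gemm(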
  gpu.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
    %c0 = arith.constant 0 : index
    %c16 = arith.constant 16 : index
    %c32 = arith.constant 32 : index
    %c1024 = arith.constant 1024 : index
    %block_id_x = gpu.block_id x
    %block_id_y = gpu.block_id y
    %m = arith.muli %block_id_x, %c16 : index
    %n = arith.muli %block_id_y, %c32 : index

    %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
    %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>

    %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
    %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
    %out:3 = scf.for %k = %c0 to %c1024 step %c32
        iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
        -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
      //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
      %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
      //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
      %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
      //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
      %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #c} : vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
      //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
      %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
      //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
      %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b>
      scf.yield %a_next_tdesc, %b_next_tdesc, %c
          : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
    }
    //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
    xegpu.store_nd %out#2, %c_tdesc : vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
    gpu.return
  }

  //-----

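  // test_gemm_simple: same GEMM, but #l1/#l2 specify inst_data without a
  // lane layout, so the blocked 8x16/16x16 pieces should appear as plain
  // vector and tensor_desc types with no layout attribute.
  //CHECK-LABEL: @test_gemm_simple(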
  gpu.func @test_gemm_simple(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
    %c0 = arith.constant 0 : index
    %c16 = arith.constant 16 : index
    %c32 = arith.constant 32 : index
    %c1024 = arith.constant 1024 : index
    %block_id_x = gpu.block_id x
    %block_id_y = gpu.block_id y
    %m = arith.muli %block_id_x, %c16 : index
    %n = arith.muli %block_id_y, %c32 : index

    %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #l1>
    %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32>

    %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l1>
    %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #l2>
    %out:3 = scf.for %k = %c0 to %c1024 step %c32
        iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
        -> (!xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>) {
      //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
      %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
      //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
      %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
      //CHECK-COUNT-8: xegpu.dpas {{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
      %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #l1} : vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
      //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
      %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l1>
      //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16>
      %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #l2>
      scf.yield %a_next_tdesc, %b_next_tdesc, %c
          : !xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>
    }
    //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
    xegpu.store_nd %out#2, %c_tdesc : vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
    gpu.return
  }

  //-----

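  // test_gemm_a_preop: an elementwise math.exp is applied to the A operand
  // before the dpas. It carries the same [8, 16] inst_data through
  // layout_result_0, so it should be blocked into 4 exps on vector<8x16xf16>,
  // just like the A loads.
  //CHECK-LABEL: @test_gemm_a_preop(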
  gpu.func @test_gemm_a_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
    %c0 = arith.constant 0 : index
    %c16 = arith.constant 16 : index
    %c32 = arith.constant 32 : index
    %c1024 = arith.constant 1024 : index
    %block_id_x = gpu.block_id x
    %block_id_y = gpu.block_id y
    %m = arith.muli %block_id_x, %c16 : index
    %n = arith.muli %block_id_y, %c32 : index

    %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
    %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>

    %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
    %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
    %out:3 = scf.for %k = %c0 to %c1024 step %c32
        iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
        -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
      //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
      %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
      //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
      %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
      //CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16>
      %e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16>
      //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
      %c = xegpu.dpas %e, %b, %arg2 {layout_result_0 = #c} : vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
      //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
      %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
      //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
      %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b>
      scf.yield %a_next_tdesc, %b_next_tdesc, %c
          : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
    }
    //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
    xegpu.store_nd %out#2, %c_tdesc : vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
    gpu.return
  }
}