// RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s

// CHECK-LABEL: gpu.func @load_dpas_postop_store
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
-// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
-// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32>
-// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>,
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32>
+// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
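+// The CHECK lines above capture the expected per-lane shapes after distribution: the 8x16 A
+// tile becomes vector<8xf16>, the packed 16x16 B tile becomes vector<16xf16>, and the
+// dpas/exp result is stored as vector<8xf32>.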
gpu.module @xevm_module{
  gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
    %c0 = arith.constant 0 : index
@@ -47,26 +48,29 @@ gpu.module @xevm_module{

// -----
// CHECK-LABEL: gpu.func @gemm
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
-// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x
-// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y
-// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index
-// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
-// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32>
-// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) {
-// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
-// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
-// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
-// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
-// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32>
-// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>,
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
+// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x
+// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y
+// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index
+// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]])
+// CHECK-SAME: -> (vector<8x1xf32>) {
+// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]]
+// CHECK-SAME: : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
+// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32>
+// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
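+// Note that the per-lane accumulator is carried through the K loop as vector<8x1xf32> and
+// reshaped to vector<8xf32> around each dpas, as reflected by the iter_args and shape_cast
+// CHECK lines above.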
gpu.module @xevm_module{
  gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
    %c0 = arith.constant 0 : index
@@ -109,3 +113,104 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
    gpu.return
  }
}
+
+// -----
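+// Checks that a value yielded from scf.if keeps its scatter layout: the subgroup-level
+// vector<16x8xf16> load (chunk_size = 8) is expected to become a per-lane vector<1x8xf16>,
+// with shape_casts inserted around the scf.if result.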
+// CHECK-LABEL: gpu.func @scatter_ops_scf_yield
+// CHECK: (%{{.*}}: memref<256xf16>, %[[PREDICATE:[a-zA-Z0-9]+]]: i1) {
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.200000e+01> : vector<1x8xf16>
+// CHECK-DAG: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
+// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
+// CHECK: %[[IF:.*]] = scf.if %[[PREDICATE]] -> (vector<1x8xf16>) {
+// CHECK-NEXT: %[[LD:.*]] = xegpu.load %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK-NEXT: %[[LD_CAST:.*]] = vector.shape_cast %[[LD]] : vector<8xf16> to vector<1x8xf16>
+// CHECK-NEXT: scf.yield %[[LD_CAST]] : vector<1x8xf16>
+// CHECK-NEXT: } else {
+// CHECK-NEXT: scf.yield %[[CST]] : vector<1x8xf16>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[IF_CAST:.*]] = vector.shape_cast %[[IF]] : vector<1x8xf16> to vector<8xf16>
+// CHECK-NEXT: xegpu.store %[[IF_CAST]], %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+gpu.module @xevm_module{
+  gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) {
+    %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1> : vector<16xi1>
+    %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+    %loaded = scf.if %pred -> (vector<16x8xf16>) {
+      %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
+        layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+      } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+      scf.yield %3 : vector<16x8xf16>
+    } else {
+      %3 = arith.constant {
+        layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+      } dense<12.> : vector<16x8xf16>
+      scf.yield %3 : vector<16x8xf16>
+    } { layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> }
+    xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+    gpu.return
+  }
+}
+
+// -----
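+// Same scatter load/store pattern, but inside an scf.if with no results; the body is expected
+// to be distributed in place (per-lane vector<8xf16>) with no yield handling required.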
+// CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) {
+// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
+// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
+// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1
+// CHECK: scf.if %[[PREDICATE]] {
+// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+// CHECK-NEXT: }
+gpu.module @xevm_module{
+  gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) {
+    %pred = llvm.mlir.poison : i1
+    %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1> : vector<16xi1>
+    %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+    scf.if %pred {
+      %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
+        layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+      } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+      xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+    }
+    gpu.return
+  }
+}
+
+// -----
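+// B is loaded as i32 with a transposed layout; after distribution the transpose is expected to
+// fold into the load (transpose = array<i64: 1, 0>), and the bitcast to f16 is applied to the
+// per-lane value (vector<8xi32> reshaped and bitcast to vector<16xf16>) before the dpas.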
+// CHECK-LABEL: gpu.func @mma_transpose_b(
+// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32>
+// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array<i64: 1, 0>}>
+// CHECK-SAME: !xegpu.tensor_desc<16x8xi32> -> vector<8xi32>
+// CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32>
+// CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16>
+// CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16>
+// CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+gpu.module @xevm_module{
+  gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) {
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
+      -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+    %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32>
+      -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+    %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
+    %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
+      : vector<16x8xi32> to vector<16x16xf16>
+    %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+      : vector<16x16xf16> to vector<16x16xf16>
+    %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+    %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32>
+      -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %6, %7[%c0, %c0] : vector<8x16xf32>,
+      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}