@@ -263,4 +263,45 @@ gpu.module @test_distribution {
263263 } {sg_id_range = #xegpu.range <[3 , 19 ]>}
264264 gpu.return
265265 }
266+
// Test case: xegpu.load with explicit offsets and mask (chunk_size = 1) on a
// dynamically sized f16 memref.
// The 256x16 offset, mask and result vectors all carry sg_layout = [8, 4] with
// sg_data = [32, 4] (8*32 = 256, 4*4 = 16); the expected output operates on
// 32x4 vectors, i.e. the sg_data shape.
// NOTE(review): "sg" presumably stands for subgroup - confirm against the XeGPU dialect docs.
267+ // CHECK-LABEL: @load_gather
268+ // CHECK-SAME: %[[ARG0:.*]]: memref<?xf16>
269+ gpu.func @load_gather (%src : memref <?xf16 >) {
// Expected output: an all-zero offset constant, an all-true mask constant, and
// a load that lists only chunk_size and l1_hint in its property dictionary,
// every vector typed 32x4.
270+ // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<32x4xindex>
271+ // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<32x4xi1>
272+ // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : memref<?xf16>, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16>
// Input IR: 256x16 operands. dense<1> on an i1 vector is the all-true mask; the
// expected output above spells the same value as dense<true>.
273+ %offset = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [32 , 4 ]>} dense <0 > : vector <256 x16 xindex >
274+ %mask = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [32 , 4 ]>} dense <1 > : vector <256 x16 xi1 >
275+ %load = xegpu.load %src [%offset ], %mask {chunk_size = 1 , layout_result_0 = #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [32 , 4 ]>, l1_hint = #xegpu.cache_hint <cached >}
276+ : memref <?xf16 >, vector <256 x16 xindex >, vector <256 x16 xi1 > -> vector <256 x16 xf16 >
277+ gpu.return
278+ }
279+
// Test case: xegpu.store with explicit offsets and mask (chunk_size = 1) into a
// statically shaped memref<256xf16>.
// The value, offset and mask vectors are 1-D with 256 elements and carry
// sg_layout = [32] with sg_data = [8] (32*8 = 256); the expected output uses
// 8-element vectors, i.e. the sg_data shape, while the memref type is unchanged.
// NOTE(review): "sg" presumably stands for subgroup - confirm against the XeGPU dialect docs.
280+ // CHECK-LABEL: @store_scatter
281+ // CHECK-SAME: %[[ARG0:.*]]: memref<256xf16>
282+ gpu.func @store_scatter (%dest : memref <256 xf16 >) {
// Expected output: the three constants narrowed to 8 elements (25.5 is printed
// as 2.550000e+01) and a store that lists only chunk_size and l1_hint in its
// property dictionary.
283+ // CHECK: %[[VAL:.*]] = arith.constant dense<2.550000e+01> : vector<8xf16>
284+ // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex>
285+ // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1>
286+ // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
// Input IR: 256-element operands, each annotated with the same layout. The
// store below has no result but still carries a layout_result_0 attribute.
287+ %val = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>} dense <25.5 > : vector <256 xf16 >
288+ %offset = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>} dense <0 > : vector <256 xindex >
289+ %mask = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>} dense <1 > : vector <256 xi1 >
290+ xegpu.store %val , %dest [%offset ], %mask {chunk_size = 1 , layout_result_0 = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>, l1_hint = #xegpu.cache_hint <cached >}
291+ : vector <256 xf16 >, memref <256 xf16 >, vector <256 xindex >, vector <256 xi1 >
292+ gpu.return
293+ }
294+
// Test case: xegpu.load with chunk_size = 4, where the result has one more
// dimension than the offsets and mask.
// Offsets and mask are 1-D (256 elements, sg_layout = [32], sg_data = [8]); the
// result is 256x4 with the 2-D layout sg_layout = [32, 1], sg_data = [8, 4].
// The expected output uses 8-element offsets/mask and an 8x4 result.
// NOTE(review): "sg" presumably stands for subgroup - confirm against the XeGPU dialect docs.
295+ // CHECK-LABEL: @load_with_chunk_size
296+ // CHECK-SAME: %[[ARG0:.*]]: memref<?xf16>
297+ gpu.func @load_with_chunk_size (%src : memref <?xf16 >) {
// Expected output: chunk_size = 4 and l1_hint remain on the load; the trailing
// dimension of size 4 appears only on the loaded value.
298+ // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex>
299+ // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1>
300+ // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 4 : i64, l1_hint = #xegpu.cache_hint<cached>}> : memref<?xf16>, vector<8xindex>, vector<8xi1> -> vector<8x4xf16>
// Input IR: note the load's layout is rank 2 while the operand layouts are rank 1.
301+ %offset = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>} dense <0 > : vector <256 xindex >
302+ %mask = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>} dense <1 > : vector <256 xi1 >
303+ %load = xegpu.load %src [%offset ], %mask {chunk_size = 4 , layout_result_0 = #xegpu.layout <sg_layout = [32 , 1 ], sg_data = [8 , 4 ]>, l1_hint = #xegpu.cache_hint <cached >}
304+ : memref <?xf16 >, vector <256 xindex >, vector <256 xi1 > -> vector <256 x4 xf16 >
305+ gpu.return
306+ }
266307}
0 commit comments