@@ -269,7 +269,8 @@ gpu.module @test_distribution {
269269 gpu.func @load_gather (%src : memref <?xf16 >) {
270270 // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<32x4xindex>
271271 // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<32x4xi1>
272- // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : memref<?xf16>, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16>
272+ // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}>
273+ // CHECK-SAME: : memref<?xf16>, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16>
273274 %offset = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [32 , 4 ]>} dense <0 > : vector <256 x16 xindex >
274275 %mask = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [32 , 4 ]>} dense <1 > : vector <256 x16 xi1 >
275276 %load = xegpu.load %src [%offset ], %mask {chunk_size = 1 , layout_result_0 = #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [32 , 4 ]>, l1_hint = #xegpu.cache_hint <cached >}
@@ -283,21 +284,23 @@ gpu.module @test_distribution {
283284 // CHECK: %[[VAL:.*]] = arith.constant dense<2.550000e+01> : vector<8xf16>
284285 // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex>
285286 // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1>
286- // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
287+ // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}>
288+ // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
287289 %val = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>} dense <25.5 > : vector <256 xf16 >
288290 %offset = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>} dense <0 > : vector <256 xindex >
289291 %mask = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>} dense <1 > : vector <256 xi1 >
290- xegpu.store %val , %dest [%offset ], %mask {chunk_size = 1 , layout_result_0 = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>, l1_hint = #xegpu.cache_hint <cached >}
292+ xegpu.store %val , %dest [%offset ], %mask {chunk_size = 1 , layout = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>, l1_hint = #xegpu.cache_hint <cached >}
291293 : vector <256 xf16 >, memref <256 xf16 >, vector <256 xindex >, vector <256 xi1 >
292294 gpu.return
293295 }
294296
295- // CHECK-LABEL: @load_with_chunk_size
297+ // CHECK-LABEL: @load_with_non_unit_chunk_size
296298 // CHECK-SAME: %[[ARG0:.*]]: memref<?xf16>
297- gpu.func @load_with_chunk_size (%src : memref <?xf16 >) {
299+ gpu.func @load_with_non_unit_chunk_size (%src : memref <?xf16 >) {
298300 // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex>
299301 // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1>
300- // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 4 : i64, l1_hint = #xegpu.cache_hint<cached>}> : memref<?xf16>, vector<8xindex>, vector<8xi1> -> vector<8x4xf16>
302+ // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 4 : i64, l1_hint = #xegpu.cache_hint<cached>}>
303+ // CHECK-SAME: : memref<?xf16>, vector<8xindex>, vector<8xi1> -> vector<8x4xf16>
301304 %offset = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>} dense <0 > : vector <256 xindex >
302305 %mask = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [32 ], sg_data = [8 ]>} dense <1 > : vector <256 xi1 >
303306 %load = xegpu.load %src [%offset ], %mask {chunk_size = 4 , layout_result_0 = #xegpu.layout <sg_layout = [32 , 1 ], sg_data = [8 , 4 ]>, l1_hint = #xegpu.cache_hint <cached >}
0 commit comments