@@ -249,3 +249,144 @@ gpu.func @non_unit_inner_stride_3D(
249249// CHECK: %[[RES:.+]] = arith.select %[[MASK]], %[[V]], %[[PASS]] : vector<8xi1>, vector<8xf32>
250250// CHECK: gpu.return %[[RES]] : vector<8xf32>
251251}
252+
253+ // -----
254+
// Test case: vector.gather on a 2-D dynamically shaped memref where every
// layout is spelled out on the gather itself: layout_result_0 = [0] and, in
// textual operand order, operand 3 (%indices) = [1], operand 4 (%mask) = [2],
// operand 5 (%pass_thru) = [3].
// The expected-output patterns below require those layouts to reappear at the
// operand positions of the generated ops:
//   xegpu.load   : layout_operand_1 = [1], layout_operand_2 = [2], layout_result_0 = [0]
//   arith.select : layout_operand_0 = [2], layout_operand_2 = [3], layout_result_0 = [0]
// NOTE(review): the RUN line is outside this excerpt; presumably this exercises
// the vector-to-XeGPU gather lowering -- confirm against the file header.
255+ gpu.module @xevm_module {
256+ gpu.func @load_dynamic_layout_operands (%source: memref <?x?xf32 >,
257+ %off0: index , %off1: index ,
258+ %indices: vector <8 x16 xindex >, %mask: vector <8 x16 xi1 >,
259+ %pass_thru: vector <8 x16 xf32 >) -> vector <8 x16 xf32 > {
260+ %res = vector.gather %source [%off0 , %off1 ][%indices ], %mask ,
261+ %pass_thru {
262+ layout_result_0 = #xegpu.layout <sg_layout = [0 ]>,
263+ layout_operand_3 = #xegpu.layout <sg_layout = [1 ]>,
264+ layout_operand_4 = #xegpu.layout <sg_layout = [2 ]>,
265+ layout_operand_5 = #xegpu.layout <sg_layout = [3 ]>
266+ } : memref <?x?xf32 >, vector <8 x16 xindex >, vector <8 x16 xi1 >, vector <8 x16 xf32 > into vector <8 x16 xf32 >
267+ gpu.return %res : vector <8 x16 xf32 >
268+ }
269+ // CHECK-LABEL: @load_dynamic_layout_operands(
270+ // CHECK-SAME: %[[SRC:.+]]: memref<?x?xf32>,
271+ // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index,
272+ // CHECK-SAME: %[[INDICES:.+]]: vector<8x16xindex>, %[[MASK:.+]]: vector<8x16xi1>, %[[PASS:.+]]: vector<8x16xf32>) -> vector<8x16xf32> {
273+ // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} : index to vector<8x16xindex>
274+ // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<8x16xindex>
275+ // CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
276+ // CHECK-SAME: {layout_operand_1 = #xegpu.layout<sg_layout = [1]>, layout_operand_2 = #xegpu.layout<sg_layout = [2]>,
277+ // CHECK-SAME: layout_result_0 = #xegpu.layout<sg_layout = [0]>}
278+ // CHECK: %[[RES:.+]] = arith.select {{[^{]*}}
279+ // CHECK-SAME: {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [2]>,
280+ // CHECK-SAME: {{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [3]>,
281+ // CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [0]>} : vector<8x16xi1>, vector<8x16xf32>
282+ }
283+
284+ // -----
285+
// Test case: vector.gather on a 3-D dynamically shaped memref where layouts
// come from mixed places. The gather itself only carries layout_result_0 = [6]
// and layout_operand_5 = [7] (operand 5 is %mask in textual operand order).
// The index vector %2 has no operand layout on the gather; the only layout
// attached to it is layout_result_0 = [5] on the arith.addi that defines it.
// The expected-output patterns below require:
//   generated vector.broadcast / arith.addi : layout_result_0 = [5]
//   xegpu.load   : layout_operand_2 = [7], layout_result_0 = [6]
//   arith.select : layout_operand_0 = [7], layout_result_0 = [6]
// No layout is required for the pass-through operand, which has none on the
// gather either.
286+ gpu.module @xevm_module {
287+ gpu.func @load_dynamic_layout_mixed (%source: memref <?x?x?xf32 >,
288+ %off0: index , %off1: index , %off2: index ,
289+ %mask: vector <8 x16 xi1 >) -> vector <8 x16 xf32 > {
290+ %pass_thru = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [0 ]>} dense <0.000000e+00 > : vector <8 x16 xf32 >
291+ %cst_1 = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [1 ]>} dense <[[0 ], [32 ], [64 ], [96 ], [128 ], [160 ], [192 ], [224 ]]> : vector <8 x1 xindex >
292+ %cst_2 = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [2 ]>} dense <[[0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 ]]> : vector <1 x16 xindex >
293+ %0 = vector.broadcast %cst_1 {layout_result_0 = #xegpu.layout <sg_layout = [3 ]>} : vector <8 x1 xindex > to vector <8 x16 xindex >
294+ %1 = vector.broadcast %cst_2 {layout_result_0 = #xegpu.layout <sg_layout = [4 ]>} : vector <1 x16 xindex > to vector <8 x16 xindex >
295+ %2 = arith.addi %0 , %1 {layout_result_0 = #xegpu.layout <sg_layout = [5 ]>} : vector <8 x16 xindex >
296+
297+ %res = vector.gather %source [%off0 , %off1 , %off2 ][%2 ], %mask ,
298+ %pass_thru {
299+ layout_result_0 = #xegpu.layout <sg_layout = [6 ]>,
300+ layout_operand_5 = #xegpu.layout <sg_layout = [7 ]>
301+ } : memref <?x?x?xf32 >, vector <8 x16 xindex >, vector <8 x16 xi1 >, vector <8 x16 xf32 > into vector <8 x16 xf32 >
302+ %res2 = arith.addf %res , %pass_thru : vector <8 x16 xf32 >
303+ gpu.return %res2 : vector <8 x16 xf32 >
304+ }
305+ // CHECK-LABEL: @load_dynamic_layout_mixed(
306+ // CHECK-SAME: %[[SRC:.+]]: memref<?x?x?xf32>,
307+ // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
308+ // CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
309+ // CHECK: %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
310+ // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
311+ // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
312+ // CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
313+ // CHECK-SAME: {{{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [7]>
314+ // CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>}
315+ // CHECK: %[[RES:.+]] = arith.select {{[^{]*}}
316+ // CHECK-SAME: {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
317+ // CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
318+ }
319+
320+
321+ // -----
322+
// Test case: statically shaped counterpart of @load_dynamic_layout_mixed.
// The source is memref<8x16x32xf32>; the index computation and the gather's
// layout attributes (layout_result_0 = [6], layout_operand_5 = [7] on %mask)
// are the same as in that test, and %2 again only has layout_result_0 = [5]
// on its defining arith.addi.
// The expected-output patterns below require:
//   generated vector.broadcast / arith.addi : layout_result_0 = [5]
//   xegpu.load   : layout_operand_2 = [7], layout_result_0 = [6]
//   arith.select : layout_operand_0 = [7], layout_result_0 = [6]
323+ gpu.module @xevm_module {
324+ gpu.func @load_static_layout_mixed (%source: memref <8 x16 x32 xf32 >,
325+ %off0: index , %off1: index , %off2: index ,
326+ %mask: vector <8 x16 xi1 >) -> vector <8 x16 xf32 > {
327+ %pass_thru = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [0 ]>} dense <0.000000e+00 > : vector <8 x16 xf32 >
328+ %cst_1 = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [1 ]>} dense <[[0 ], [32 ], [64 ], [96 ], [128 ], [160 ], [192 ], [224 ]]> : vector <8 x1 xindex >
329+ %cst_2 = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [2 ]>} dense <[[0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 ]]> : vector <1 x16 xindex >
330+ %0 = vector.broadcast %cst_1 {layout_result_0 = #xegpu.layout <sg_layout = [3 ]>} : vector <8 x1 xindex > to vector <8 x16 xindex >
331+ %1 = vector.broadcast %cst_2 {layout_result_0 = #xegpu.layout <sg_layout = [4 ]>} : vector <1 x16 xindex > to vector <8 x16 xindex >
332+ %2 = arith.addi %0 , %1 {layout_result_0 = #xegpu.layout <sg_layout = [5 ]>} : vector <8 x16 xindex >
333+
334+ %res = vector.gather %source [%off0 , %off1 , %off2 ][%2 ], %mask ,
335+ %pass_thru {
336+ layout_result_0 = #xegpu.layout <sg_layout = [6 ]>,
337+ layout_operand_5 = #xegpu.layout <sg_layout = [7 ]>
338+ } : memref <8 x16 x32 xf32 >, vector <8 x16 xindex >, vector <8 x16 xi1 >, vector <8 x16 xf32 > into vector <8 x16 xf32 >
339+ %res2 = arith.addf %res , %pass_thru : vector <8 x16 xf32 >
340+ gpu.return %res2 : vector <8 x16 xf32 >
341+ }
342+ // CHECK-LABEL: @load_static_layout_mixed(
343+ // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>,
344+ // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
345+ // CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
346+ // CHECK: %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
347+ // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
348+ // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
349+ // CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
350+ // CHECK-SAME: {{{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [7]>
351+ // CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>}
352+ // CHECK: %[[RES:.+]] = arith.select {{[^{]*}}
353+ // CHECK-SAME: {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
354+ // CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
355+ }
356+
357+ // -----
358+
// Test case: same input as @load_dynamic_layout_mixed, except that the gather
// additionally carries layout_operand_4 = [99] for the index vector %2, whose
// defining arith.addi is annotated with layout_result_0 = [5].
// The expected-output patterns below require:
//   generated vector.broadcast / arith.addi : layout_result_0 = [5]
//   xegpu.load   : layout_operand_1 = [99], layout_operand_2 = [7], layout_result_0 = [6]
//   arith.select : layout_operand_0 = [7], layout_result_0 = [6]
// i.e. the explicit operand layout [99] is what is expected on the load's
// offsets operand, while the generated index arithmetic is still expected to
// carry [5].
// NOTE(review): confirm that keeping [5] on the generated broadcast/addi (rather
// than [99]) is the intended behaviour of the pass and not a copy of the
// patterns from the non-override test.
359+ gpu.module @xevm_module {
360+ gpu.func @load_dynamic_layout_mixed_override (%source: memref <?x?x?xf32 >,
361+ %off0: index , %off1: index , %off2: index ,
362+ %mask: vector <8 x16 xi1 >) -> vector <8 x16 xf32 > {
363+ %pass_thru = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [0 ]>} dense <0.000000e+00 > : vector <8 x16 xf32 >
364+ %cst_1 = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [1 ]>} dense <[[0 ], [32 ], [64 ], [96 ], [128 ], [160 ], [192 ], [224 ]]> : vector <8 x1 xindex >
365+ %cst_2 = arith.constant {layout_result_0 = #xegpu.layout <sg_layout = [2 ]>} dense <[[0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 ]]> : vector <1 x16 xindex >
366+ %0 = vector.broadcast %cst_1 {layout_result_0 = #xegpu.layout <sg_layout = [3 ]>} : vector <8 x1 xindex > to vector <8 x16 xindex >
367+ %1 = vector.broadcast %cst_2 {layout_result_0 = #xegpu.layout <sg_layout = [4 ]>} : vector <1 x16 xindex > to vector <8 x16 xindex >
368+ %2 = arith.addi %0 , %1 {layout_result_0 = #xegpu.layout <sg_layout = [5 ]>} : vector <8 x16 xindex >
369+
370+ %res = vector.gather %source [%off0 , %off1 , %off2 ][%2 ], %mask ,
371+ %pass_thru {
372+ layout_result_0 = #xegpu.layout <sg_layout = [6 ]>,
373+ layout_operand_4 = #xegpu.layout <sg_layout = [99 ]>, // overriding %2's layout
374+ layout_operand_5 = #xegpu.layout <sg_layout = [7 ]>
375+ } : memref <?x?x?xf32 >, vector <8 x16 xindex >, vector <8 x16 xi1 >, vector <8 x16 xf32 > into vector <8 x16 xf32 >
376+ %res2 = arith.addf %res , %pass_thru : vector <8 x16 xf32 >
377+ gpu.return %res2 : vector <8 x16 xf32 >
378+ }
379+ // CHECK-LABEL: @load_dynamic_layout_mixed_override(
380+ // CHECK-SAME: %[[SRC:.+]]: memref<?x?x?xf32>,
381+ // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
382+ // CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
383+ // CHECK: %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
384+ // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
385+ // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
386+ // CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
387+ // CHECK-SAME: {layout_operand_1 = #xegpu.layout<sg_layout = [99]>, layout_operand_2 = #xegpu.layout<sg_layout = [7]>
388+ // CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>}
389+ // CHECK: %[[RES:.+]] = arith.select {{[^{]*}}
390+ // CHECK-SAME: {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
391+ // CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
392+ }
0 commit comments