@@ -253,6 +253,7 @@ gpu.func @non_unit_inner_stride_3D(
 // -----

 gpu.module @xevm_module {
+// Layouts are only specified for the gather op itself.
 gpu.func @load_dynamic_layout_operands(%source: memref<?x?xf32>,
     %off0: index, %off1: index,
     %indices: vector<8x16xindex>, %mask: vector<8x16xi1>,
@@ -270,6 +271,7 @@ gpu.func @load_dynamic_layout_operands(%source: memref<?x?xf32>,
 // CHECK-SAME: %[[SRC:.+]]: memref<?x?xf32>,
 // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index,
 // CHECK-SAME: %[[INDICES:.+]]: vector<8x16xindex>, %[[MASK:.+]]: vector<8x16xi1>, %[[PASS:.+]]: vector<8x16xf32>) -> vector<8x16xf32> {
+// The %indices producer doesn't have a layout, and neither do the 'broadcast/add' ops computing the linear index.
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} : index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<8x16xindex>
 // CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
@@ -307,6 +309,7 @@ gpu.func @load_dynamic_layout_mixed(%source: memref<?x?x?xf32>,
 // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
 // CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
 // CHECK: %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
+// Verify that the linear-index computation uses the layout from the 'indices' producer op (%2).
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
 // CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
@@ -344,6 +347,7 @@ gpu.func @load_static_layout_mixed(%source: memref<8x16x32xf32>,
 // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
 // CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
 // CHECK: %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
+// Verify that the linear-index computation uses the layout from the 'indices' producer op (%2).
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
 // CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
@@ -381,6 +385,8 @@ gpu.func @load_dynamic_layout_mixed_override(%source: memref<?x?x?xf32>,
 // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
 // CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
 // CHECK: %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
+// Verify that the linear-index computation uses the layout from the 'indices' producer op (%2)
+// and not its overridden version from the scatter_op (sg_layout = [99]).
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
 // CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
@@ -390,3 +396,41 @@ gpu.func @load_dynamic_layout_mixed_override(%source: memref<?x?x?xf32>,
 // CHECK-SAME: {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
 // CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
 }
+
+// -----
+
+gpu.module @xevm_module {
+gpu.func @load_with_cache_hints(%source: memref<8x16x32xf32>,
+    %off1: index, %off2: index, %off3: index,
+    %indices: vector<8xindex>, %mask: vector<8xi1>,
+    %pass_thru: vector<8xf32>) -> vector<8xf32> {
+  %0 = vector.gather %source[%off1, %off2, %off3][%indices], %mask,
+       %pass_thru {
+    l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>,
+    l3_hint = #xegpu.cache_hint<streaming>
+  } : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32> into vector<8xf32>
+  gpu.return %0 : vector<8xf32>
+}
+// CHECK-LABEL: @load_with_cache_hints(
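+// Verify that all three cache hints from the gather op are carried over to the generated xegpu.load.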
+// CHECK: xegpu.load {{[^<]*}}
+// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, l3_hint = #xegpu.cache_hint<streaming>}>
+}
+
+// -----
+
+gpu.module @xevm_module {
+gpu.func @load_with_partial_cache_hints(%source: memref<8x16x32xf32>,
+    %off1: index, %off2: index, %off3: index,
+    %indices: vector<8xindex>, %mask: vector<8xi1>,
+    %pass_thru: vector<8xf32>) -> vector<8xf32> {
+  %0 = vector.gather %source[%off1, %off2, %off3][%indices], %mask,
+       %pass_thru {
+    l1_hint = #xegpu.cache_hint<cached>,
+    l3_hint = #xegpu.cache_hint<streaming>
+  } : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32> into vector<8xf32>
+  gpu.return %0 : vector<8xf32>
+}
+// CHECK-LABEL: @load_with_partial_cache_hints(
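+// Only the hints present on the gather op (l1 and l3) are expected on the xegpu.load; no l2_hint is emitted.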
+// CHECK: xegpu.load {{[^<]*}}
+// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<streaming>}>
+}