
Commit 76c8129

add cache hint propagation

Signed-off-by: dchigarev <[email protected]>
1 parent a22b251 commit 76c8129
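
In short: vector.gather and vector.scatter ops may carry optional l1_hint / l2_hint / l3_hint cache-policy attributes, and this commit makes the VectorToXeGPU lowering forward them onto the generated xegpu.load / xegpu.store ops instead of dropping them. A minimal before/after sketch, adapted from the tests added below (the surrounding shapes and value names are illustrative, not part of the patch):

  // Input: a gather annotated with cache hints.
  %0 = vector.gather %src[%off1, %off2, %off3][%indices], %mask, %pass_thru {
    l1_hint = #xegpu.cache_hint<cached>,
    l3_hint = #xegpu.cache_hint<streaming>
  } : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32> into vector<8xf32>

  // After lowering, the same hints appear on the generated load, and the
  // unset l2_hint stays absent:
  // xegpu.load ... <{l1_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<streaming>}>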

File tree

3 files changed: +110, -6 lines changed

mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp

Lines changed: 29 additions & 6 deletions
@@ -97,6 +97,20 @@ static LogicalResult transferPreconditions(PatternRewriter &rewriter,
   return success();
 }
 
+// Extract cache hints from the op attributes if available.
+static void getOpCacheHints(Operation *op,
+                            SmallVector<xegpu::CachePolicyAttr, 3> &hints) {
+  assert(hints.size() == 3 &&
+         "Expecting a vector of size 3 for l1, l2, l3 hints.");
+  // get l1, l2, l3 hints from attributes if available.
+  if (auto l1Attr = op->getAttrOfType<xegpu::CachePolicyAttr>("l1_hint"))
+    hints[0] = l1Attr;
+  if (auto l2Attr = op->getAttrOfType<xegpu::CachePolicyAttr>("l2_hint"))
+    hints[1] = l2Attr;
+  if (auto l3Attr = op->getAttrOfType<xegpu::CachePolicyAttr>("l3_hint"))
+    hints[2] = l3Attr;
+}
+
 static xegpu::CreateNdDescOp
 createNdDescriptor(PatternRewriter &rewriter, Location loc,
                    xegpu::TensorDescType descType, TypedValue<MemRefType> src,
@@ -631,12 +645,17 @@ struct GatherLowering : public OpRewritePattern<vector::GatherOp> {
         gatherOp->getOpOperand(numOffsets + 2));
     auto layoutPassThru = mlir::xegpu::getDistributeLayoutAttr(
         gatherOp->getOpOperand(numOffsets + 3));
+
+    SmallVector<xegpu::CachePolicyAttr, 3> cacheHints{xegpu::CachePolicyAttr{},
+                                                      xegpu::CachePolicyAttr{},
+                                                      xegpu::CachePolicyAttr{}};
+    getOpCacheHints(gatherOp, cacheHints);
     auto xeGatherOp = xegpu::LoadGatherOp::create(
         rewriter, loc, vectorType, flatMemref, localOffsets, gatherOp.getMask(),
         /*chunk_size=*/IntegerAttr{},
-        /*l1_hint=*/xegpu::CachePolicyAttr{},
-        /*l2_hint=*/xegpu::CachePolicyAttr{},
-        /*l3_hint=*/xegpu::CachePolicyAttr{});
+        /*l1_hint=*/cacheHints[0],
+        /*l2_hint=*/cacheHints[1],
+        /*l3_hint=*/cacheHints[2]);
     mlir::xegpu::setDistributeLayoutAttr(xeGatherOp->getOpResult(0), layoutRes);
     mlir::xegpu::setDistributeLayoutAttr(xeGatherOp->getOpOperand(1),
                                          layoutIndices);
@@ -682,13 +701,17 @@ struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> {
         scatterOp->getOpOperand(numOffsets + 2));
     auto layoutVal = mlir::xegpu::getDistributeLayoutAttr(
         scatterOp->getOpOperand(numOffsets + 3));
+    SmallVector<xegpu::CachePolicyAttr, 3> cacheHints{xegpu::CachePolicyAttr{},
+                                                      xegpu::CachePolicyAttr{},
+                                                      xegpu::CachePolicyAttr{}};
+    getOpCacheHints(scatterOp, cacheHints);
    auto storeOp = xegpu::StoreScatterOp::create(
         rewriter, loc, scatterOp.getValueToStore(), flatMemref, localOffsets,
         scatterOp.getMask(),
         /*chunk_size=*/IntegerAttr{},
-        /*l1_hint=*/xegpu::CachePolicyAttr{},
-        /*l2_hint=*/xegpu::CachePolicyAttr{},
-        /*l3_hint=*/xegpu::CachePolicyAttr{});
+        /*l1_hint=*/cacheHints[0],
+        /*l2_hint=*/cacheHints[1],
+        /*l3_hint=*/cacheHints[2]);
     mlir::xegpu::setDistributeLayoutAttr(storeOp->getOpOperand(0), layoutVal);
     mlir::xegpu::setDistributeLayoutAttr(storeOp->getOpOperand(2),
                                          layoutIndices);
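
A note on the defaults above: the three-element hints vector is seeded with null xegpu::CachePolicyAttr values, and a null optional attribute is never attached to the created op. So a gather or scatter with no hint attributes keeps lowering exactly as before; a sketch of the unannotated case, assuming the same shapes as the tests below:

  %0 = vector.gather %src[%off1, %off2, %off3][%indices], %mask, %pass_thru
      : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32> into vector<8xf32>
  // Lowers to an xegpu.load with no <{...}> hint dictionary at all.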

mlir/test/Conversion/VectorToXeGPU/gather-to-xegpu.mlir

Lines changed: 44 additions & 0 deletions
@@ -253,6 +253,7 @@ gpu.func @non_unit_inner_stride_3D(
 // -----
 
 gpu.module @xevm_module {
+// Layouts are only specified for the gather op itself.
 gpu.func @load_dynamic_layout_operands(%source: memref<?x?xf32>,
     %off0: index, %off1: index,
     %indices: vector<8x16xindex>, %mask: vector<8x16xi1>,
@@ -270,6 +271,7 @@ gpu.func @load_dynamic_layout_operands(%source: memref<?x?xf32>,
 // CHECK-SAME: %[[SRC:.+]]: memref<?x?xf32>,
 // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index,
 // CHECK-SAME: %[[INDICES:.+]]: vector<8x16xindex>, %[[MASK:.+]]: vector<8x16xi1>, %[[PASS:.+]]: vector<8x16xf32>) -> vector<8x16xf32> {
+// The %indices producer doesn't have a layout, and neither do the 'broadcast/add' ops computing the linear index.
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} : index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<8x16xindex>
 // CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
@@ -307,6 +309,7 @@ gpu.func @load_dynamic_layout_mixed(%source: memref<?x?x?xf32>,
 // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
 // CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
 // CHECK: %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
+// Verify that the linear-index computation uses the layout from the 'indices' producer op (%2).
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
 // CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
@@ -344,6 +347,7 @@ gpu.func @load_static_layout_mixed(%source: memref<8x16x32xf32>,
 // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
 // CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
 // CHECK: %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
+// Verify that the linear-index computation uses the layout from the 'indices' producer op (%2).
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
 // CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
@@ -381,6 +385,8 @@ gpu.func @load_dynamic_layout_mixed_override(%source: memref<?x?x?xf32>,
 // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
 // CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
 // CHECK: %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
+// Verify that the linear-index computation uses the layout from the 'indices' producer op (%2)
+// and not its overridden version from the gather op (sg_layout = [99]).
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
 // CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
@@ -390,3 +396,41 @@ gpu.func @load_dynamic_layout_mixed_override(%source: memref<?x?x?xf32>,
 // CHECK-SAME: {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
 // CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
 }
+
+// -----
+
+gpu.module @xevm_module {
+gpu.func @load_with_cache_hints(%source: memref<8x16x32xf32>,
+    %off1: index, %off2: index, %off3: index,
+    %indices: vector<8xindex>, %mask: vector<8xi1>,
+    %pass_thru: vector<8xf32>) -> vector<8xf32> {
+  %0 = vector.gather %source[%off1, %off2, %off3][%indices], %mask,
+      %pass_thru {
+        l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>,
+        l3_hint = #xegpu.cache_hint<streaming>
+      } : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32> into vector<8xf32>
+  gpu.return %0 : vector<8xf32>
+}
+// CHECK-LABEL: @load_with_cache_hints(
+// CHECK: xegpu.load {{[^<]*}}
+// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, l3_hint = #xegpu.cache_hint<streaming>}>
+}
+
+// -----
+
+gpu.module @xevm_module {
+gpu.func @load_with_partial_cache_hints(%source: memref<8x16x32xf32>,
+    %off1: index, %off2: index, %off3: index,
+    %indices: vector<8xindex>, %mask: vector<8xi1>,
+    %pass_thru: vector<8xf32>) -> vector<8xf32> {
+  %0 = vector.gather %source[%off1, %off2, %off3][%indices], %mask,
+      %pass_thru {
+        l1_hint = #xegpu.cache_hint<cached>,
+        l3_hint = #xegpu.cache_hint<streaming>
+      } : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32> into vector<8xf32>
+  gpu.return %0 : vector<8xf32>
+}
+// CHECK-LABEL: @load_with_partial_cache_hints(
+// CHECK: xegpu.load {{[^<]*}}
+// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<streaming>}>
+}

mlir/test/Conversion/VectorToXeGPU/scatter-to-xegpu.mlir

Lines changed: 37 additions & 0 deletions
@@ -222,6 +222,7 @@ gpu.func @store_dynamic_layout_operands(%vec: vector<8x16xf32>, %source: memref<
 // CHECK-SAME: %[[VEC:.+]]: vector<8x16xf32>, %[[SRC:.+]]: memref<?x?xf32>,
 // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index,
 // CHECK-SAME: %[[INDICES:.+]]: vector<8x16xindex>, %[[MASK:.+]]: vector<8x16xi1>) {
+// The %indices producer doesn't have a layout, and neither do the 'broadcast/add' ops computing the linear index.
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} : index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<8x16xindex>
 // CHECK: xegpu.store %[[VEC]], %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
@@ -251,6 +252,7 @@ gpu.func @store_dynamic_layout_mixed(%source: memref<?x?x?xf32>,
 // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
 // CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) {
 // CHECK: %[[VEC:.+]] = arith.constant {layout_operand_0 = #xegpu.layout<sg_layout = [0]>} dense<1.000000e+00> : vector<8x16xf32>
+// Verify that the linear-index computation uses the layout from the 'indices' producer op (%2).
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
 // CHECK: xegpu.store %[[VEC]], %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
@@ -280,6 +282,7 @@ gpu.func @store_static_layout_mixed(%source: memref<8x16x32xf32>,
 // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
 // CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) {
 // CHECK: %[[VEC:.+]] = arith.constant {layout_operand_0 = #xegpu.layout<sg_layout = [0]>} dense<1.000000e+00> : vector<8x16xf32>
+// Verify that the linear-index computation uses the layout from the 'indices' producer op (%2).
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
 // CHECK: xegpu.store %[[VEC]], %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
@@ -310,9 +313,43 @@ gpu.func @store_dynamic_layout_mixed_override(%source: memref<?x?x?xf32>,
 // CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
 // CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) {
 // CHECK: %[[VEC:.+]] = arith.constant {layout_operand_0 = #xegpu.layout<sg_layout = [0]>} dense<1.000000e+00> : vector<8x16xf32>
+// Verify that the linear-index computation uses the layout from the 'indices' producer op (%2)
+// and not its overridden version from the scatter op (sg_layout = [99]).
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
 // CHECK: xegpu.store %[[VEC]], %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
 // CHECK-SAME: {{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [99]>,
 // CHECK-SAME: {{[^}]*}}layout_operand_3 = #xegpu.layout<sg_layout = [6]>}
 }
+
+// -----
+
+gpu.module @xevm_module {
+gpu.func @store_with_cache_hints(%vec: vector<8xf32>, %source: memref<8x16x32xf32>,
+    %off1: index, %off2: index, %off3: index,
+    %indices: vector<8xindex>, %mask: vector<8xi1>) {
+  vector.scatter %source[%off1, %off2, %off3][%indices], %mask, %vec {
+    l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, l3_hint = #xegpu.cache_hint<write_back>
+  } : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32>
+  gpu.return
+}
+// CHECK-LABEL: @store_with_cache_hints(
+// CHECK: xegpu.store {{[^<]*}}
+// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, l3_hint = #xegpu.cache_hint<write_back>}>
+}
+
+// -----
+
+gpu.module @xevm_module {
+gpu.func @store_with_partial_cache_hints(%vec: vector<8xf32>, %source: memref<8x16x32xf32>,
+    %off1: index, %off2: index, %off3: index,
+    %indices: vector<8xindex>, %mask: vector<8xi1>) {
+  vector.scatter %source[%off1, %off2, %off3][%indices], %mask, %vec {
+    l1_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<write_back>
+  } : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32>
+  gpu.return
+}
+// CHECK-LABEL: @store_with_partial_cache_hints(
+// CHECK: xegpu.store {{[^<]*}}
+// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<write_back>}>
+}
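
For reference, each of these .mlir test files is driven by a RUN line at the top of the file, which this diff doesn't touch. Given the // ----- split markers used above, it plausibly resembles the following; the exact flags are an assumption, not shown in the commit:

  // RUN: mlir-opt %s --convert-vector-to-xegpu --split-input-file | FileCheck %s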
