[mlir][XeGPU][VectorToXeGPU] Propagate vector layouts to xegpu ops

dchigarev · dchigarev · commit a22b251e9ab9 · 2025-10-12T13:00:28.000Z
Signed-off-by: dchigarev &lt;dmitry.chigarev@intel.com&gt;
diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
@@ -374,22 +374,29 @@ static Value computeOffsets(PatternRewriter &rewriter, OpType gatScatOp,
         arith::AddIOp::create(rewriter, loc, baseOffset, offsetContrib);
   }
   Value indices = gatScatOp.getIndices();
+  auto indicesLayout = mlir::xegpu::getDistributeLayoutAttr(indices);
   VectorType vecType = cast<VectorType>(indices.getType());
 
-  Value strideVector =
-      vector::BroadcastOp::create(rewriter, loc, vecType, strides.back())
-          .getResult();
-  Value stridedIndices =
-      arith::MulIOp::create(rewriter, loc, strideVector, indices).getResult();
-
-  Value baseVector =
-      vector::BroadcastOp::create(
-          rewriter, loc,
-          VectorType::get(vecType.getShape(), rewriter.getIndexType()),
-          baseOffset)
-          .getResult();
-  return arith::AddIOp::create(rewriter, loc, baseVector, stridedIndices)
-      .getResult();
+  auto strideVector =
+      vector::BroadcastOp::create(rewriter, loc, vecType, strides.back());
+  mlir::xegpu::setDistributeLayoutAttr(strideVector->getOpResult(0),
+                                       indicesLayout);
+
+  auto stridedIndices =
+      arith::MulIOp::create(rewriter, loc, strideVector.getResult(), indices);
+  mlir::xegpu::setDistributeLayoutAttr(stridedIndices->getOpResult(0),
+                                       indicesLayout);
+
+  auto baseVector = vector::BroadcastOp::create(
+      rewriter, loc,
+      VectorType::get(vecType.getShape(), rewriter.getIndexType()), baseOffset);
+  mlir::xegpu::setDistributeLayoutAttr(baseVector->getOpResult(0),
+                                       indicesLayout);
+
+  auto result = arith::AddIOp::create(rewriter, loc, baseVector.getResult(),
+                                      stridedIndices.getResult());
+  mlir::xegpu::setDistributeLayoutAttr(result->getOpResult(0), indicesLayout);
+  return result.getResult();
 }
 
 template <
@@ -616,16 +623,34 @@ struct GatherLowering : public OpRewritePattern<vector::GatherOp> {
         computeOffsets(rewriter, gatherOp, meta.first, meta.second);
     Value flatMemref = memrefToIndexPtr(gatherOp, rewriter);
 
+    auto numOffsets = gatherOp.getOffsets().size();
+    auto layoutRes = mlir::xegpu::getDistributeLayoutAttr(gatherOp.getResult());
+    auto layoutIndices = mlir::xegpu::getDistributeLayoutAttr(
+        gatherOp->getOpOperand(numOffsets + 1));
+    auto layoutMask = mlir::xegpu::getDistributeLayoutAttr(
+        gatherOp->getOpOperand(numOffsets + 2));
+    auto layoutPassThru = mlir::xegpu::getDistributeLayoutAttr(
+        gatherOp->getOpOperand(numOffsets + 3));
     auto xeGatherOp = xegpu::LoadGatherOp::create(
         rewriter, loc, vectorType, flatMemref, localOffsets, gatherOp.getMask(),
         /*chunk_size=*/IntegerAttr{},
         /*l1_hint=*/xegpu::CachePolicyAttr{},
         /*l2_hint=*/xegpu::CachePolicyAttr{},
         /*l3_hint=*/xegpu::CachePolicyAttr{});
+    mlir::xegpu::setDistributeLayoutAttr(xeGatherOp->getOpResult(0), layoutRes);
+    mlir::xegpu::setDistributeLayoutAttr(xeGatherOp->getOpOperand(1),
+                                         layoutIndices);
+    mlir::xegpu::setDistributeLayoutAttr(xeGatherOp->getOpOperand(2),
+                                         layoutMask);
 
     auto selectOp =
         arith::SelectOp::create(rewriter, loc, gatherOp.getMask(),
                                 xeGatherOp.getResult(), gatherOp.getPassThru());
+    mlir::xegpu::setDistributeLayoutAttr(selectOp->getOpOperand(0), layoutMask);
+    mlir::xegpu::setDistributeLayoutAttr(selectOp->getOpOperand(2),
+                                         layoutPassThru);
+    mlir::xegpu::setDistributeLayoutAttr(selectOp->getOpResult(0), layoutRes);
+
     rewriter.replaceOp(gatherOp, selectOp.getResult());
     return success();
   }
@@ -650,12 +675,24 @@ struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> {
         computeOffsets(rewriter, scatterOp, meta.first, meta.second);
     Value flatMemref = memrefToIndexPtr(scatterOp, rewriter);
 
-    xegpu::StoreScatterOp::create(rewriter, loc, scatterOp.getValueToStore(),
-                                  flatMemref, localOffsets, scatterOp.getMask(),
-                                  /*chunk_size=*/IntegerAttr{},
-                                  /*l1_hint=*/xegpu::CachePolicyAttr{},
-                                  /*l2_hint=*/xegpu::CachePolicyAttr{},
-                                  /*l3_hint=*/xegpu::CachePolicyAttr{});
+    auto numOffsets = scatterOp.getOffsets().size();
+    auto layoutIndices = mlir::xegpu::getDistributeLayoutAttr(
+        scatterOp->getOpOperand(numOffsets + 1));
+    auto layoutMask = mlir::xegpu::getDistributeLayoutAttr(
+        scatterOp->getOpOperand(numOffsets + 2));
+    auto layoutVal = mlir::xegpu::getDistributeLayoutAttr(
+        scatterOp->getOpOperand(numOffsets + 3));
+    auto storeOp = xegpu::StoreScatterOp::create(
+        rewriter, loc, scatterOp.getValueToStore(), flatMemref, localOffsets,
+        scatterOp.getMask(),
+        /*chunk_size=*/IntegerAttr{},
+        /*l1_hint=*/xegpu::CachePolicyAttr{},
+        /*l2_hint=*/xegpu::CachePolicyAttr{},
+        /*l3_hint=*/xegpu::CachePolicyAttr{});
+    mlir::xegpu::setDistributeLayoutAttr(storeOp->getOpOperand(0), layoutVal);
+    mlir::xegpu::setDistributeLayoutAttr(storeOp->getOpOperand(2),
+                                         layoutIndices);
+    mlir::xegpu::setDistributeLayoutAttr(storeOp->getOpOperand(3), layoutMask);
     rewriter.eraseOp(scatterOp);
     return success();
   }
diff --git a/mlir/test/Conversion/VectorToXeGPU/gather-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/gather-to-xegpu.mlir
@@ -249,3 +249,144 @@ gpu.func @non_unit_inner_stride_3D(
 // CHECK:        %[[RES:.+]] = arith.select %[[MASK]], %[[V]], %[[PASS]] : vector<8xi1>, vector<8xf32>
 // CHECK:        gpu.return %[[RES]] : vector<8xf32>
 }
+
+// -----
+
+gpu.module @xevm_module {
+gpu.func @load_dynamic_layout_operands(%source: memref<?x?xf32>,
+    %off0: index, %off1: index,
+    %indices: vector<8x16xindex>, %mask: vector<8x16xi1>,
+    %pass_thru: vector<8x16xf32>) -> vector<8x16xf32> {
+  %res = vector.gather %source[%off0, %off1][%indices], %mask,
+       %pass_thru {
+        layout_result_0 = #xegpu.layout<sg_layout = [0]>,
+        layout_operand_3 = #xegpu.layout<sg_layout = [1]>,
+        layout_operand_4 = #xegpu.layout<sg_layout = [2]>,
+        layout_operand_5 = #xegpu.layout<sg_layout = [3]>
+  } : memref<?x?xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
+  gpu.return %res : vector<8x16xf32>
+}
+// CHECK-LABEL:  @load_dynamic_layout_operands(
+// CHECK-SAME:   %[[SRC:.+]]: memref<?x?xf32>,
+// CHECK-SAME:   %[[OFF1:.+]]: index, %[[OFF2:.+]]: index,
+// CHECK-SAME:   %[[INDICES:.+]]: vector<8x16xindex>, %[[MASK:.+]]: vector<8x16xi1>, %[[PASS:.+]]: vector<8x16xf32>) -> vector<8x16xf32> {
+// CHECK:        %[[SPLAT:.+]] = vector.broadcast {{.*}} :  index to vector<8x16xindex>
+// CHECK:        %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<8x16xindex>
+// CHECK:        %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
+// CHECK-SAME:   {layout_operand_1 = #xegpu.layout<sg_layout = [1]>, layout_operand_2 = #xegpu.layout<sg_layout = [2]>,
+// CHECK-SAME:   layout_result_0 = #xegpu.layout<sg_layout = [0]>}
+// CHECK:        %[[RES:.+]] = arith.select {{[^{]*}}
+// CHECK-SAME:   {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [2]>,
+// CHECK-SAME:   {{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [3]>,
+// CHECK-SAME:   {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [0]>} : vector<8x16xi1>, vector<8x16xf32>
+}
+
+// -----
+
+gpu.module @xevm_module {
+gpu.func @load_dynamic_layout_mixed(%source: memref<?x?x?xf32>,
+    %off0: index, %off1: index, %off2: index,
+    %mask: vector<8x16xi1>) -> vector<8x16xf32> {
+  %pass_thru = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
+  %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1]>} dense<[[0], [32], [64], [96], [128], [160], [192], [224]]> : vector<8x1xindex>
+  %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
+  %0 = vector.broadcast %cst_1 {layout_result_0 = #xegpu.layout<sg_layout = [3]>} : vector<8x1xindex> to vector<8x16xindex>
+  %1 = vector.broadcast %cst_2 {layout_result_0 = #xegpu.layout<sg_layout = [4]>} : vector<1x16xindex> to vector<8x16xindex>
+  %2 = arith.addi %0, %1 {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
+
+  %res = vector.gather %source[%off0, %off1, %off2][%2], %mask,
+       %pass_thru {
+        layout_result_0 = #xegpu.layout<sg_layout = [6]>,
+        layout_operand_5 = #xegpu.layout<sg_layout = [7]>
+  } : memref<?x?x?xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
+  %res2 = arith.addf %res, %pass_thru : vector<8x16xf32>
+  gpu.return %res2 : vector<8x16xf32>
+}
+// CHECK-LABEL:  @load_dynamic_layout_mixed(
+// CHECK-SAME:   %[[SRC:.+]]: memref<?x?x?xf32>,
+// CHECK-SAME:   %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
+// CHECK-SAME:   %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
+// CHECK:        %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
+// CHECK:        %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} :  index to vector<8x16xindex>
+// CHECK:        %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
+// CHECK:        %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
+// CHECK-SAME:   {{{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [7]>
+// CHECK-SAME:   {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>}
+// CHECK:        %[[RES:.+]] = arith.select {{[^{]*}}
+// CHECK-SAME:   {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
+// CHECK-SAME:   {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
+}
+
+
+// -----
+
+gpu.module @xevm_module {
+gpu.func @load_static_layout_mixed(%source: memref<8x16x32xf32>,
+    %off0: index, %off1: index, %off2: index,
+    %mask: vector<8x16xi1>) -> vector<8x16xf32> {
+  %pass_thru = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
+  %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1]>} dense<[[0], [32], [64], [96], [128], [160], [192], [224]]> : vector<8x1xindex>
+  %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
+  %0 = vector.broadcast %cst_1 {layout_result_0 = #xegpu.layout<sg_layout = [3]>} : vector<8x1xindex> to vector<8x16xindex>
+  %1 = vector.broadcast %cst_2 {layout_result_0 = #xegpu.layout<sg_layout = [4]>} : vector<1x16xindex> to vector<8x16xindex>
+  %2 = arith.addi %0, %1 {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
+
+  %res = vector.gather %source[%off0, %off1, %off2][%2], %mask,
+       %pass_thru {
+        layout_result_0 = #xegpu.layout<sg_layout = [6]>,
+        layout_operand_5 = #xegpu.layout<sg_layout = [7]>
+  } : memref<8x16x32xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
+  %res2 = arith.addf %res, %pass_thru : vector<8x16xf32>
+  gpu.return %res2 : vector<8x16xf32>
+}
+// CHECK-LABEL:  @load_static_layout_mixed(
+// CHECK-SAME:   %[[SRC:.+]]: memref<8x16x32xf32>,
+// CHECK-SAME:   %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
+// CHECK-SAME:   %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
+// CHECK:        %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
+// CHECK:        %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} :  index to vector<8x16xindex>
+// CHECK:        %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
+// CHECK:        %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
+// CHECK-SAME:   {{{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [7]>
+// CHECK-SAME:   {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>}
+// CHECK:        %[[RES:.+]] = arith.select {{[^{]*}}
+// CHECK-SAME:   {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
+// CHECK-SAME:   {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
+}
+
+// -----
+
+gpu.module @xevm_module {
+gpu.func @load_dynamic_layout_mixed_override(%source: memref<?x?x?xf32>,
+    %off0: index, %off1: index, %off2: index,
+    %mask: vector<8x16xi1>) -> vector<8x16xf32> {
+  %pass_thru = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
+  %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1]>} dense<[[0], [32], [64], [96], [128], [160], [192], [224]]> : vector<8x1xindex>
+  %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
+  %0 = vector.broadcast %cst_1 {layout_result_0 = #xegpu.layout<sg_layout = [3]>} : vector<8x1xindex> to vector<8x16xindex>
+  %1 = vector.broadcast %cst_2 {layout_result_0 = #xegpu.layout<sg_layout = [4]>} : vector<1x16xindex> to vector<8x16xindex>
+  %2 = arith.addi %0, %1 {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
+
+  %res = vector.gather %source[%off0, %off1, %off2][%2], %mask,
+       %pass_thru {
+        layout_result_0 = #xegpu.layout<sg_layout = [6]>,
+        layout_operand_4 = #xegpu.layout<sg_layout = [99]>, // overriding %2's layout
+        layout_operand_5 = #xegpu.layout<sg_layout = [7]>
+  } : memref<?x?x?xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
+  %res2 = arith.addf %res, %pass_thru : vector<8x16xf32>
+  gpu.return %res2 : vector<8x16xf32>
+}
+// CHECK-LABEL:  @load_dynamic_layout_mixed_override(
+// CHECK-SAME:   %[[SRC:.+]]: memref<?x?x?xf32>,
+// CHECK-SAME:   %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
+// CHECK-SAME:   %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
+// CHECK:        %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
+// CHECK:        %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} :  index to vector<8x16xindex>
+// CHECK:        %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
+// CHECK:        %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
+// CHECK-SAME:   {layout_operand_1 = #xegpu.layout<sg_layout = [99]>, layout_operand_2 = #xegpu.layout<sg_layout = [7]>
+// CHECK-SAME:   {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>}
+// CHECK:        %[[RES:.+]] = arith.select {{[^{]*}}
+// CHECK-SAME:   {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
+// CHECK-SAME:   {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
+}
diff --git a/mlir/test/Conversion/VectorToXeGPU/scatter-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/scatter-to-xegpu.mlir