98 changes: 75 additions & 23 deletions mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
@@ -97,6 +97,21 @@ static LogicalResult transferPreconditions(PatternRewriter &rewriter,
return success();
}

// Extract cache hints from the op attributes if available.
static SmallVector<xegpu::CachePolicyAttr, 3> getOpCacheHints(Operation *op) {
SmallVector<xegpu::CachePolicyAttr, 3> cacheHints{xegpu::CachePolicyAttr{},
xegpu::CachePolicyAttr{},
xegpu::CachePolicyAttr{}};
// get l1, l2, l3 hints from attributes if available.
if (auto l1Attr = op->getAttrOfType<xegpu::CachePolicyAttr>("l1_hint"))
cacheHints[0] = l1Attr;
if (auto l2Attr = op->getAttrOfType<xegpu::CachePolicyAttr>("l2_hint"))
cacheHints[1] = l2Attr;
if (auto l3Attr = op->getAttrOfType<xegpu::CachePolicyAttr>("l3_hint"))
cacheHints[2] = l3Attr;
return cacheHints;
}
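
// For reference, a sketch of how these hints arrive on the source vector op as
// discardable attributes (illustrative SSA names; mirrors the new tests added
// below):
//
//   %0 = vector.gather %src[%i, %j, %k][%indices], %mask, %pass_thru {
//       l1_hint = #xegpu.cache_hint<cached>,
//       l2_hint = #xegpu.cache_hint<uncached>,
//       l3_hint = #xegpu.cache_hint<streaming>
//     } : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32>
//         into vector<8xf32>
//
// A missing hint stays a null CachePolicyAttr, i.e. the corresponding optional
// attribute is simply not set on the created XeGPU op.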

static xegpu::CreateNdDescOp
createNdDescriptor(PatternRewriter &rewriter, Location loc,
xegpu::TensorDescType descType, TypedValue<MemRefType> src,
@@ -374,22 +389,27 @@ static Value computeOffsets(PatternRewriter &rewriter, OpType gatScatOp,
arith::AddIOp::create(rewriter, loc, baseOffset, offsetContrib);
}
Value indices = gatScatOp.getIndices();
// Extract indices layout and propagate it to all 'vector' ops created here
auto indicesLayout = xegpu::getDistributeLayoutAttr(indices);
VectorType vecType = cast<VectorType>(indices.getType());

Value strideVector =
vector::BroadcastOp::create(rewriter, loc, vecType, strides.back())
.getResult();
Value stridedIndices =
arith::MulIOp::create(rewriter, loc, strideVector, indices).getResult();

Value baseVector =
vector::BroadcastOp::create(
rewriter, loc,
VectorType::get(vecType.getShape(), rewriter.getIndexType()),
baseOffset)
.getResult();
return arith::AddIOp::create(rewriter, loc, baseVector, stridedIndices)
.getResult();
auto strideVector =
vector::BroadcastOp::create(rewriter, loc, vecType, strides.back());
xegpu::setDistributeLayoutAttr(strideVector->getOpResult(0), indicesLayout);

auto stridedIndices =
arith::MulIOp::create(rewriter, loc, strideVector.getResult(), indices);
xegpu::setDistributeLayoutAttr(stridedIndices->getOpResult(0), indicesLayout);

auto baseVector = vector::BroadcastOp::create(
rewriter, loc,
VectorType::get(vecType.getShape(), rewriter.getIndexType()), baseOffset);
xegpu::setDistributeLayoutAttr(baseVector->getOpResult(0), indicesLayout);

auto result = arith::AddIOp::create(rewriter, loc, baseVector.getResult(),
stridedIndices.getResult());
xegpu::setDistributeLayoutAttr(result->getOpResult(0), indicesLayout);
return result.getResult();
}
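
// A minimal sketch of the IR this helper emits for 2-D vector<8x16xindex>
// indices (illustrative SSA names; the concrete layout comes from the indices'
// producer and is mirrored on every op created here, as checked by the new
// tests):
//
//   %stride = vector.broadcast %innerStride {layout_result_0 = #xegpu.layout<...>} : index to vector<8x16xindex>
//   %scaled = arith.muli %stride, %indices  {layout_result_0 = #xegpu.layout<...>} : vector<8x16xindex>
//   %base   = vector.broadcast %baseOffset  {layout_result_0 = #xegpu.layout<...>} : index to vector<8x16xindex>
//   %linear = arith.addi %base, %scaled     {layout_result_0 = #xegpu.layout<...>} : vector<8x16xindex>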

template <
@@ -616,16 +636,35 @@ struct GatherLowering : public OpRewritePattern<vector::GatherOp> {
computeOffsets(rewriter, gatherOp, meta.first, meta.second);
Value flatMemref = memrefToIndexPtr(gatherOp, rewriter);

auto numOffsets = gatherOp.getOffsets().size();
auto layoutRes = xegpu::getDistributeLayoutAttr(gatherOp.getResult());
Contributor:

All three layouts should be exactly the same.
We can take the layout from the result vector and set it as the store op's layout attribute (this needs the op definition to be expanded so the attribute is persistent and not lost after optimization passes).

Contributor Author:

I found at least one scenario in our test cases where not all operands of xegpu.store have the same layout.

It also looks quite confusing, in the case of xegpu.load, to have mixed layout_operand_* and layout attributes:

%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}>
  {
    layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
    layout = #xegpu.layout<
            lane_layout = [16, 1], lane_data = [1, 2]> // layout for what??
  }
  : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>

// -----

%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}>
  {
    layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
    layout_result_0 = #xegpu.layout<
            lane_layout = [16, 1], lane_data = [1, 2]> // the purpose of the layout is very clear
  }
  : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>

Given that, I don't think we need a separate layout attribute for the xegpu load/store ops (or we would have to rethink it once again and make it different from the load_matrix/store_matrix layout attribute).

Contributor:

At the entry XeGPU IR, we need to set the layout on the anchor ops and rely on the propagation pass to propagate the layout to all values. The anchor ops include load/store, load_matrix/store_matrix, load_nd/store_nd, dpas, and convert_layout. The layout describes the tensor tile: for memory operations it describes the memory operand (e.g. memref, mem_desc, nd_tdesc); for dpas it describes the vector operands.

The propagated layouts are temporary attributes and can be lost during IR transformation or lowering. The layout attributes defined on anchor ops, however, are permanent, and we rely on them being set up properly so that the layouts can be recovered when users compose xegpu passes with their own passes.

The example you show above actually looks fine to me, as long as the propagation rules decide to drop one dimension for the mask and index and keep the tensor tile layout as 2D. It does look a bit complex, but I don't expect this kind of layout to be exposed to users, since chunked load should be a lower-level representation for memory coalescing or load_matrix lowering.

So I still suggest changing this PR to set the layout attribute, according to the tensor tile's layout, on the anchor ops only, and to remove the attribute setting for other ops and operands.
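
For illustration, a rough sketch of that anchor-only form, reusing the chunked-load example above (hypothetical: it assumes xegpu.load is extended with a persistent layout attribute, which the current op definition does not have):

%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}>
  {
    layout = #xegpu.layout<
            lane_layout = [16, 1], lane_data = [1, 2]> // permanent; describes the tensor tile
  }
  : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>

The propagation pass would then derive the temporary layouts for the index and mask operands and for downstream consumers; no layout_operand_* attributes would be set at op-creation time.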

auto layoutIndices =
xegpu::getDistributeLayoutAttr(gatherOp->getOpOperand(numOffsets + 1));
auto layoutMask =
xegpu::getDistributeLayoutAttr(gatherOp->getOpOperand(numOffsets + 2));
auto layoutPassThru =
xegpu::getDistributeLayoutAttr(gatherOp->getOpOperand(numOffsets + 3));

SmallVector<xegpu::CachePolicyAttr, 3> cacheHints =
getOpCacheHints(gatherOp);
auto xeGatherOp = xegpu::LoadGatherOp::create(
rewriter, loc, vectorType, flatMemref, localOffsets, gatherOp.getMask(),
/*chunk_size=*/IntegerAttr{},
/*l1_hint=*/xegpu::CachePolicyAttr{},
/*l2_hint=*/xegpu::CachePolicyAttr{},
/*l3_hint=*/xegpu::CachePolicyAttr{});
/*l1_hint=*/cacheHints[0],
/*l2_hint=*/cacheHints[1],
/*l3_hint=*/cacheHints[2]);
xegpu::setDistributeLayoutAttr(xeGatherOp->getOpResult(0), layoutRes);
Contributor:

We should expand xegpu.load/store with an optional DistributeLayoutAttr attribute (like the load_matrix op definition) and set only the operation's attribute. The propagation process will propagate it to other ops.

xegpu::setDistributeLayoutAttr(xeGatherOp->getOpOperand(1), layoutIndices);
xegpu::setDistributeLayoutAttr(xeGatherOp->getOpOperand(2), layoutMask);

auto selectOp =
arith::SelectOp::create(rewriter, loc, gatherOp.getMask(),
xeGatherOp.getResult(), gatherOp.getPassThru());
Contributor:

Just to double-check: the layout isn't assigned to the second operand (LoadGather) since it's already on the producer's result.
I assume it's left to the propagation to fill the gap? Any drawbacks to assigning the layout in both places?

Contributor Author:

> Just to double-check: the layout isn't assigned to the second operand (LoadGather) since it's already on the producer's result.

Right, I thought that was enough. There seem to be no drawbacks, though, to assigning the layout in both places. Applied the layout in both places in the last commit just in case.

Contributor:

Setting the operand layout is done by the client of the layout propagation result, so I think there is no need to update layout_operand_* here.

Contributor Author:

Is that always the case? Say the value to store comes in as a function argument (our func is not inlined yet), so it's impossible to determine the producer, but it is possible to determine the layout from the {layout_operand_i} attribute. Should we support such cases? Are there any negative side effects from setting layout_operand_i even when we can access the layout from the producer's layout_result_i?

xegpu::setDistributeLayoutAttr(selectOp->getOpOperand(0), layoutMask);
xegpu::setDistributeLayoutAttr(selectOp->getOpOperand(1), layoutRes);
xegpu::setDistributeLayoutAttr(selectOp->getOpOperand(2), layoutPassThru);
xegpu::setDistributeLayoutAttr(selectOp->getOpResult(0), layoutRes);

rewriter.replaceOp(gatherOp, selectOp.getResult());
return success();
}
@@ -650,12 +689,25 @@ struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> {
computeOffsets(rewriter, scatterOp, meta.first, meta.second);
Value flatMemref = memrefToIndexPtr(scatterOp, rewriter);

xegpu::StoreScatterOp::create(rewriter, loc, scatterOp.getValueToStore(),
flatMemref, localOffsets, scatterOp.getMask(),
/*chunk_size=*/IntegerAttr{},
/*l1_hint=*/xegpu::CachePolicyAttr{},
/*l2_hint=*/xegpu::CachePolicyAttr{},
/*l3_hint=*/xegpu::CachePolicyAttr{});
auto numOffsets = scatterOp.getOffsets().size();
auto layoutIndices =
xegpu::getDistributeLayoutAttr(scatterOp->getOpOperand(numOffsets + 1));
auto layoutMask =
xegpu::getDistributeLayoutAttr(scatterOp->getOpOperand(numOffsets + 2));
auto layoutVal =
xegpu::getDistributeLayoutAttr(scatterOp->getOpOperand(numOffsets + 3));
SmallVector<xegpu::CachePolicyAttr, 3> cacheHints =
getOpCacheHints(scatterOp);
auto storeOp = xegpu::StoreScatterOp::create(
rewriter, loc, scatterOp.getValueToStore(), flatMemref, localOffsets,
scatterOp.getMask(),
/*chunk_size=*/IntegerAttr{},
/*l1_hint=*/cacheHints[0],
/*l2_hint=*/cacheHints[1],
/*l3_hint=*/cacheHints[2]);
xegpu::setDistributeLayoutAttr(storeOp->getOpOperand(0), layoutVal);
xegpu::setDistributeLayoutAttr(storeOp->getOpOperand(2), layoutIndices);
xegpu::setDistributeLayoutAttr(storeOp->getOpOperand(3), layoutMask);
rewriter.eraseOp(scatterOp);
return success();
}
185 changes: 185 additions & 0 deletions mlir/test/Conversion/VectorToXeGPU/gather-to-xegpu.mlir
@@ -249,3 +249,188 @@ gpu.func @non_unit_inner_stride_3D(
// CHECK: %[[RES:.+]] = arith.select %[[MASK]], %[[V]], %[[PASS]] : vector<8xi1>, vector<8xf32>
// CHECK: gpu.return %[[RES]] : vector<8xf32>
}

// -----

gpu.module @xevm_module {
// Layouts are only specified for the gather op itself.
gpu.func @load_dynamic_layout_operands(%source: memref<?x?xf32>,
%off0: index, %off1: index,
%indices: vector<8x16xindex>, %mask: vector<8x16xi1>,
%pass_thru: vector<8x16xf32>) -> vector<8x16xf32> {
%res = vector.gather %source[%off0, %off1][%indices], %mask,
%pass_thru {
layout_result_0 = #xegpu.layout<sg_layout = [0]>,
layout_operand_3 = #xegpu.layout<sg_layout = [1]>,
layout_operand_4 = #xegpu.layout<sg_layout = [2]>,
layout_operand_5 = #xegpu.layout<sg_layout = [3]>
} : memref<?x?xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
gpu.return %res : vector<8x16xf32>
}
// CHECK-LABEL: @load_dynamic_layout_operands(
// CHECK-SAME: %[[SRC:.+]]: memref<?x?xf32>,
// CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index,
// CHECK-SAME: %[[INDICES:.+]]: vector<8x16xindex>, %[[MASK:.+]]: vector<8x16xi1>, %[[PASS:.+]]: vector<8x16xf32>) -> vector<8x16xf32> {
// The %indices producer doesn't have a layout, so neither do the 'broadcast/add' ops computing the linear index.
// CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} : index to vector<8x16xindex>
// CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<8x16xindex>
// CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
// CHECK-SAME: {layout_operand_1 = #xegpu.layout<sg_layout = [1]>, layout_operand_2 = #xegpu.layout<sg_layout = [2]>,
// CHECK-SAME: layout_result_0 = #xegpu.layout<sg_layout = [0]>}
// CHECK: %[[RES:.+]] = arith.select {{[^{]*}}
// CHECK-SAME: {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [2]>,
// CHECK-SAME: {{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [3]>,
// CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [0]>} : vector<8x16xi1>, vector<8x16xf32>
}

// -----

gpu.module @xevm_module {
gpu.func @load_dynamic_layout_mixed(%source: memref<?x?x?xf32>,
%off0: index, %off1: index, %off2: index,
%mask: vector<8x16xi1>) -> vector<8x16xf32> {
%pass_thru = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
%cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1]>} dense<[[0], [32], [64], [96], [128], [160], [192], [224]]> : vector<8x1xindex>
%cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
%0 = vector.broadcast %cst_1 {layout_result_0 = #xegpu.layout<sg_layout = [3]>} : vector<8x1xindex> to vector<8x16xindex>
%1 = vector.broadcast %cst_2 {layout_result_0 = #xegpu.layout<sg_layout = [4]>} : vector<1x16xindex> to vector<8x16xindex>
%2 = arith.addi %0, %1 {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>

%res = vector.gather %source[%off0, %off1, %off2][%2], %mask,
%pass_thru {
layout_result_0 = #xegpu.layout<sg_layout = [6]>,
layout_operand_5 = #xegpu.layout<sg_layout = [7]>
} : memref<?x?x?xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
%res2 = arith.addf %res, %pass_thru : vector<8x16xf32>
gpu.return %res2 : vector<8x16xf32>
}
// CHECK-LABEL: @load_dynamic_layout_mixed(
// CHECK-SAME: %[[SRC:.+]]: memref<?x?x?xf32>,
// CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
// CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
// CHECK: %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
// Verify that the linear-indices computation uses the layout from the 'indices' producer op (%2).
// CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
// CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
// CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
// CHECK-SAME: {{{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [7]>
Contributor:

Similar to my previous comment: operand layouts are not needed.

// CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>}
// CHECK: %[[RES:.+]] = arith.select {{[^{]*}}
// CHECK-SAME: {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
// CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
}


// -----

gpu.module @xevm_module {
gpu.func @load_static_layout_mixed(%source: memref<8x16x32xf32>,
%off0: index, %off1: index, %off2: index,
%mask: vector<8x16xi1>) -> vector<8x16xf32> {
%pass_thru = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
%cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1]>} dense<[[0], [32], [64], [96], [128], [160], [192], [224]]> : vector<8x1xindex>
%cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
%0 = vector.broadcast %cst_1 {layout_result_0 = #xegpu.layout<sg_layout = [3]>} : vector<8x1xindex> to vector<8x16xindex>
%1 = vector.broadcast %cst_2 {layout_result_0 = #xegpu.layout<sg_layout = [4]>} : vector<1x16xindex> to vector<8x16xindex>
%2 = arith.addi %0, %1 {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>

%res = vector.gather %source[%off0, %off1, %off2][%2], %mask,
%pass_thru {
layout_result_0 = #xegpu.layout<sg_layout = [6]>,
layout_operand_5 = #xegpu.layout<sg_layout = [7]>
} : memref<8x16x32xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
%res2 = arith.addf %res, %pass_thru : vector<8x16xf32>
gpu.return %res2 : vector<8x16xf32>
}
// CHECK-LABEL: @load_static_layout_mixed(
// CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>,
// CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
// CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
// CHECK: %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
// Verify that the linear-indices computation uses the layout from the 'indices' producer op (%2).
// CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
// CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
// CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
// CHECK-SAME: {{{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [7]>
// CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>}
// CHECK: %[[RES:.+]] = arith.select {{[^{]*}}
// CHECK-SAME: {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
// CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
}

// -----

gpu.module @xevm_module {
gpu.func @load_dynamic_layout_mixed_override(%source: memref<?x?x?xf32>,
%off0: index, %off1: index, %off2: index,
%mask: vector<8x16xi1>) -> vector<8x16xf32> {
%pass_thru = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
%cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1]>} dense<[[0], [32], [64], [96], [128], [160], [192], [224]]> : vector<8x1xindex>
%cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
%0 = vector.broadcast %cst_1 {layout_result_0 = #xegpu.layout<sg_layout = [3]>} : vector<8x1xindex> to vector<8x16xindex>
%1 = vector.broadcast %cst_2 {layout_result_0 = #xegpu.layout<sg_layout = [4]>} : vector<1x16xindex> to vector<8x16xindex>
%2 = arith.addi %0, %1 {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>

%res = vector.gather %source[%off0, %off1, %off2][%2], %mask,
%pass_thru {
layout_result_0 = #xegpu.layout<sg_layout = [6]>,
layout_operand_4 = #xegpu.layout<sg_layout = [99]>, // overriding %2's layout
layout_operand_5 = #xegpu.layout<sg_layout = [7]>
} : memref<?x?x?xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
%res2 = arith.addf %res, %pass_thru : vector<8x16xf32>
gpu.return %res2 : vector<8x16xf32>
}
// CHECK-LABEL: @load_dynamic_layout_mixed_override(
// CHECK-SAME: %[[SRC:.+]]: memref<?x?x?xf32>,
// CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
// CHECK-SAME: %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
// CHECK: %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
// Verify that the linear-indices computation uses the layout from the 'indices' producer op (%2)
// and not its overridden version from the gather op (sg_layout = [99]).
// CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
// CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
// CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
// CHECK-SAME: {layout_operand_1 = #xegpu.layout<sg_layout = [99]>, layout_operand_2 = #xegpu.layout<sg_layout = [7]>
// CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>}
// CHECK: %[[RES:.+]] = arith.select {{[^{]*}}
// CHECK-SAME: {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
// CHECK-SAME: {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
}

// -----

gpu.module @xevm_module {
gpu.func @load_with_cache_hints(%source: memref<8x16x32xf32>,
%off1: index, %off2: index, %off3: index,
%indices: vector<8xindex>, %mask: vector<8xi1>,
%pass_thru: vector<8xf32>) -> vector<8xf32> {
%0 = vector.gather %source[%off1, %off2, %off3][%indices], %mask,
%pass_thru {
l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<streaming>
} : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32> into vector<8xf32>
gpu.return %0 : vector<8xf32>
}
// CHECK-LABEL: @load_with_cache_hints(
// CHECK: xegpu.load {{[^<]*}}
// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, l3_hint = #xegpu.cache_hint<streaming>}>
}

// -----

gpu.module @xevm_module {
gpu.func @load_with_partial_cache_hints(%source: memref<8x16x32xf32>,
%off1: index, %off2: index, %off3: index,
%indices: vector<8xindex>, %mask: vector<8xi1>,
%pass_thru: vector<8xf32>) -> vector<8xf32> {
%0 = vector.gather %source[%off1, %off2, %off3][%indices], %mask,
%pass_thru {
l1_hint = #xegpu.cache_hint<cached>,
l3_hint = #xegpu.cache_hint<streaming>
} : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32> into vector<8xf32>
gpu.return %0 : vector<8xf32>
}
// CHECK-LABEL: @load_with_partial_cache_hints(
// CHECK: xegpu.load {{[^<]*}}
// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<streaming>}>
}