[mlir][XeGPU][VectorToXeGPU] Propagate vector layouts to xegpu ops #163071
@@ -97,6 +97,20 @@ static LogicalResult transferPreconditions(PatternRewriter &rewriter,
   return success();
 }
 
+// Extract cache hints from the op attributes if available.
+static void getOpCacheHints(Operation *op,
+                            SmallVector<xegpu::CachePolicyAttr, 3> &hints) {
+  assert(hints.size() == 3 &&
+         "Expecting a vector of size 3 for l1, l2, l3 hints.");
+  // get l1, l2, l3 hints from attributes if available.
+  if (auto l1Attr = op->getAttrOfType<xegpu::CachePolicyAttr>("l1_hint"))
+    hints[0] = l1Attr;
+  if (auto l2Attr = op->getAttrOfType<xegpu::CachePolicyAttr>("l2_hint"))
+    hints[1] = l2Attr;
+  if (auto l3Attr = op->getAttrOfType<xegpu::CachePolicyAttr>("l3_hint"))
+    hints[2] = l3Attr;
+}
+
 static xegpu::CreateNdDescOp
 createNdDescriptor(PatternRewriter &rewriter, Location loc,
                    xegpu::TensorDescType descType, TypedValue<MemRefType> src,
@@ -374,22 +388,30 @@ static Value computeOffsets(PatternRewriter &rewriter, OpType gatScatOp,
         arith::AddIOp::create(rewriter, loc, baseOffset, offsetContrib);
   }
   Value indices = gatScatOp.getIndices();
+  // Extract indices layout and propagate it to all 'vector' ops created here
+  auto indicesLayout = mlir::xegpu::getDistributeLayoutAttr(indices);
   VectorType vecType = cast<VectorType>(indices.getType());
 
-  Value strideVector =
-      vector::BroadcastOp::create(rewriter, loc, vecType, strides.back())
-          .getResult();
-  Value stridedIndices =
-      arith::MulIOp::create(rewriter, loc, strideVector, indices).getResult();
-
-  Value baseVector =
-      vector::BroadcastOp::create(
-          rewriter, loc,
-          VectorType::get(vecType.getShape(), rewriter.getIndexType()),
-          baseOffset)
-          .getResult();
-  return arith::AddIOp::create(rewriter, loc, baseVector, stridedIndices)
-      .getResult();
+  auto strideVector =
+      vector::BroadcastOp::create(rewriter, loc, vecType, strides.back());
+  mlir::xegpu::setDistributeLayoutAttr(strideVector->getOpResult(0),
+                                       indicesLayout);
+
+  auto stridedIndices =
+      arith::MulIOp::create(rewriter, loc, strideVector.getResult(), indices);
+  mlir::xegpu::setDistributeLayoutAttr(stridedIndices->getOpResult(0),
+                                       indicesLayout);
+
+  auto baseVector = vector::BroadcastOp::create(
+      rewriter, loc,
+      VectorType::get(vecType.getShape(), rewriter.getIndexType()), baseOffset);
+  mlir::xegpu::setDistributeLayoutAttr(baseVector->getOpResult(0),
+                                       indicesLayout);
+
+  auto result = arith::AddIOp::create(rewriter, loc, baseVector.getResult(),
+                                      stridedIndices.getResult());
+  mlir::xegpu::setDistributeLayoutAttr(result->getOpResult(0), indicesLayout);
+  return result.getResult();
 }
 
 template <
@@ -616,16 +638,39 @@ struct GatherLowering : public OpRewritePattern<vector::GatherOp> {
         computeOffsets(rewriter, gatherOp, meta.first, meta.second);
     Value flatMemref = memrefToIndexPtr(gatherOp, rewriter);
 
+    auto numOffsets = gatherOp.getOffsets().size();
+    auto layoutRes = mlir::xegpu::getDistributeLayoutAttr(gatherOp.getResult());
+    auto layoutIndices = mlir::xegpu::getDistributeLayoutAttr(
+        gatherOp->getOpOperand(numOffsets + 1));
+    auto layoutMask = mlir::xegpu::getDistributeLayoutAttr(
+        gatherOp->getOpOperand(numOffsets + 2));
+    auto layoutPassThru = mlir::xegpu::getDistributeLayoutAttr(
+        gatherOp->getOpOperand(numOffsets + 3));
+
+    SmallVector<xegpu::CachePolicyAttr, 3> cacheHints{xegpu::CachePolicyAttr{},
+                                                      xegpu::CachePolicyAttr{},
+                                                      xegpu::CachePolicyAttr{}};
+    getOpCacheHints(gatherOp, cacheHints);
     auto xeGatherOp = xegpu::LoadGatherOp::create(
         rewriter, loc, vectorType, flatMemref, localOffsets, gatherOp.getMask(),
         /*chunk_size=*/IntegerAttr{},
-        /*l1_hint=*/xegpu::CachePolicyAttr{},
-        /*l2_hint=*/xegpu::CachePolicyAttr{},
-        /*l3_hint=*/xegpu::CachePolicyAttr{});
+        /*l1_hint=*/cacheHints[0],
+        /*l2_hint=*/cacheHints[1],
+        /*l3_hint=*/cacheHints[2]);
+    mlir::xegpu::setDistributeLayoutAttr(xeGatherOp->getOpResult(0), layoutRes);
+    mlir::xegpu::setDistributeLayoutAttr(xeGatherOp->getOpOperand(1),
+                                         layoutIndices);
+    mlir::xegpu::setDistributeLayoutAttr(xeGatherOp->getOpOperand(2),
+                                         layoutMask);
 
     auto selectOp =
         arith::SelectOp::create(rewriter, loc, gatherOp.getMask(),
                                 xeGatherOp.getResult(), gatherOp.getPassThru());
[Review thread on the arith.select above]
Reviewer: Just to double-check, the layout isn't assigned to the second operand (the LoadGather result) as it's already set on the producer's result.
Author: Right, I thought that was enough. There seem to be no drawbacks, though, to assigning the layout in both places. Applied the layout in both places in the last commit just in case.
Reviewer: Setting the operand layout is done at the client of the layout-propagation result, so I think there is no need to update it here.
Author: Is it always the case? Say, the value to store comes in as a function argument (our func is not inlined yet): it's impossible to determine the producer, but it's possible to determine the layout from ...
+    mlir::xegpu::setDistributeLayoutAttr(selectOp->getOpOperand(0), layoutMask);
+    mlir::xegpu::setDistributeLayoutAttr(selectOp->getOpOperand(2),
+                                         layoutPassThru);
+    mlir::xegpu::setDistributeLayoutAttr(selectOp->getOpResult(0), layoutRes);
+
     rewriter.replaceOp(gatherOp, selectOp.getResult());
     return success();
   }
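To make the function-argument case from the review thread concrete, here is a minimal sketch (not taken from the patch; it mirrors the `@load_dynamic_layout_operands` test added below, with illustrative function name and layout values). The gather's vector operands are block arguments with no producing op, so the only place a layout can be recovered from is the `layout_operand_*` attributes on the `vector.gather` itself, which is why the lowering also copies layouts onto the new op's operands:

```mlir
gpu.func @gather_from_args(%src: memref<?x?xf32>, %off: index,
                           %idx: vector<8x16xindex>, %mask: vector<8x16xi1>,
                           %pass: vector<8x16xf32>) -> vector<8x16xf32> {
  // %idx, %mask and %pass are block arguments: there is no producer whose
  // result attribute could carry their layouts, only the operand attributes below.
  %res = vector.gather %src[%off, %off][%idx], %mask, %pass {
           layout_result_0  = #xegpu.layout<sg_layout = [8, 4]>,
           layout_operand_3 = #xegpu.layout<sg_layout = [8, 4]>,
           layout_operand_4 = #xegpu.layout<sg_layout = [8, 4]>,
           layout_operand_5 = #xegpu.layout<sg_layout = [8, 4]>
         } : memref<?x?xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
  gpu.return %res : vector<8x16xf32>
}
```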
@@ -650,12 +695,28 @@ struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> {
         computeOffsets(rewriter, scatterOp, meta.first, meta.second);
     Value flatMemref = memrefToIndexPtr(scatterOp, rewriter);
 
-    xegpu::StoreScatterOp::create(rewriter, loc, scatterOp.getValueToStore(),
-                                  flatMemref, localOffsets, scatterOp.getMask(),
-                                  /*chunk_size=*/IntegerAttr{},
-                                  /*l1_hint=*/xegpu::CachePolicyAttr{},
-                                  /*l2_hint=*/xegpu::CachePolicyAttr{},
-                                  /*l3_hint=*/xegpu::CachePolicyAttr{});
+    auto numOffsets = scatterOp.getOffsets().size();
+    auto layoutIndices = mlir::xegpu::getDistributeLayoutAttr(
+        scatterOp->getOpOperand(numOffsets + 1));
+    auto layoutMask = mlir::xegpu::getDistributeLayoutAttr(
+        scatterOp->getOpOperand(numOffsets + 2));
+    auto layoutVal = mlir::xegpu::getDistributeLayoutAttr(
+        scatterOp->getOpOperand(numOffsets + 3));
+    SmallVector<xegpu::CachePolicyAttr, 3> cacheHints{xegpu::CachePolicyAttr{},
+                                                      xegpu::CachePolicyAttr{},
+                                                      xegpu::CachePolicyAttr{}};
+    getOpCacheHints(scatterOp, cacheHints);
+    auto storeOp = xegpu::StoreScatterOp::create(
+        rewriter, loc, scatterOp.getValueToStore(), flatMemref, localOffsets,
+        scatterOp.getMask(),
+        /*chunk_size=*/IntegerAttr{},
+        /*l1_hint=*/cacheHints[0],
+        /*l2_hint=*/cacheHints[1],
+        /*l3_hint=*/cacheHints[2]);
+    mlir::xegpu::setDistributeLayoutAttr(storeOp->getOpOperand(0), layoutVal);
+    mlir::xegpu::setDistributeLayoutAttr(storeOp->getOpOperand(2),
+                                         layoutIndices);
+    mlir::xegpu::setDistributeLayoutAttr(storeOp->getOpOperand(3), layoutMask);
     rewriter.eraseOp(scatterOp);
     return success();
   }
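In summary, the gather path now carries both the layout and the cache-hint attributes through to the generated ops. A condensed before/after sketch, distilled from the test cases below (SSA names, the layout value, and the exact printed form of the lowered ops are illustrative, not output of the patch):

```mlir
// Input: vector.gather annotated by an earlier layout-propagation step.
%res = vector.gather %src[%off0, %off1][%indices], %mask, %pass_thru {
         layout_result_0 = #xegpu.layout<sg_layout = [8, 4]>,
         l1_hint = #xegpu.cache_hint<cached>
       } : memref<?x?xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>

// Output (schematic): the result layout reappears on the xegpu.load and on the
// arith.select that reapplies the mask, and the l1 hint is forwarded to the load.
// %v   = xegpu.load %base[%lin_idx], %mask <{l1_hint = #xegpu.cache_hint<cached>}>
//          {layout_result_0 = #xegpu.layout<sg_layout = [8, 4]>} : ...
// %res = arith.select %mask, %v, %pass_thru
//          {layout_result_0 = #xegpu.layout<sg_layout = [8, 4]>} : vector<8x16xi1>, vector<8x16xf32>
```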
@@ -249,3 +249,188 @@ gpu.func @non_unit_inner_stride_3D(
 // CHECK: %[[RES:.+]] = arith.select %[[MASK]], %[[V]], %[[PASS]] : vector<8xi1>, vector<8xf32>
 // CHECK: gpu.return %[[RES]] : vector<8xf32>
 }

// -----

gpu.module @xevm_module {
// Layouts are only specified for the gather op itself.
gpu.func @load_dynamic_layout_operands(%source: memref<?x?xf32>,
    %off0: index, %off1: index,
    %indices: vector<8x16xindex>, %mask: vector<8x16xi1>,
    %pass_thru: vector<8x16xf32>) -> vector<8x16xf32> {
  %res = vector.gather %source[%off0, %off1][%indices], %mask,
       %pass_thru {
         layout_result_0 = #xegpu.layout<sg_layout = [0]>,
         layout_operand_3 = #xegpu.layout<sg_layout = [1]>,
         layout_operand_4 = #xegpu.layout<sg_layout = [2]>,
         layout_operand_5 = #xegpu.layout<sg_layout = [3]>
       } : memref<?x?xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
  gpu.return %res : vector<8x16xf32>
}
// CHECK-LABEL: @load_dynamic_layout_operands(
// CHECK-SAME:  %[[SRC:.+]]: memref<?x?xf32>,
// CHECK-SAME:  %[[OFF1:.+]]: index, %[[OFF2:.+]]: index,
// CHECK-SAME:  %[[INDICES:.+]]: vector<8x16xindex>, %[[MASK:.+]]: vector<8x16xi1>, %[[PASS:.+]]: vector<8x16xf32>) -> vector<8x16xf32> {
// The %indices producer has no layout, so neither do the 'broadcast'/'addi' ops computing the linear index.
// CHECK:       %[[SPLAT:.+]] = vector.broadcast {{.*}} : index to vector<8x16xindex>
// CHECK:       %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<8x16xindex>
// CHECK:       %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
// CHECK-SAME:    {layout_operand_1 = #xegpu.layout<sg_layout = [1]>, layout_operand_2 = #xegpu.layout<sg_layout = [2]>,
// CHECK-SAME:     layout_result_0 = #xegpu.layout<sg_layout = [0]>}
// CHECK:       %[[RES:.+]] = arith.select {{[^{]*}}
// CHECK-SAME:    {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [2]>,
// CHECK-SAME:    {{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [3]>,
// CHECK-SAME:    {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [0]>} : vector<8x16xi1>, vector<8x16xf32>
}

// -----

gpu.module @xevm_module {
gpu.func @load_dynamic_layout_mixed(%source: memref<?x?x?xf32>,
    %off0: index, %off1: index, %off2: index,
    %mask: vector<8x16xi1>) -> vector<8x16xf32> {
  %pass_thru = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
  %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1]>} dense<[[0], [32], [64], [96], [128], [160], [192], [224]]> : vector<8x1xindex>
  %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
  %0 = vector.broadcast %cst_1 {layout_result_0 = #xegpu.layout<sg_layout = [3]>} : vector<8x1xindex> to vector<8x16xindex>
  %1 = vector.broadcast %cst_2 {layout_result_0 = #xegpu.layout<sg_layout = [4]>} : vector<1x16xindex> to vector<8x16xindex>
  %2 = arith.addi %0, %1 {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>

  %res = vector.gather %source[%off0, %off1, %off2][%2], %mask,
       %pass_thru {
         layout_result_0 = #xegpu.layout<sg_layout = [6]>,
         layout_operand_5 = #xegpu.layout<sg_layout = [7]>
       } : memref<?x?x?xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
  %res2 = arith.addf %res, %pass_thru : vector<8x16xf32>
  gpu.return %res2 : vector<8x16xf32>
}
// CHECK-LABEL: @load_dynamic_layout_mixed(
// CHECK-SAME:  %[[SRC:.+]]: memref<?x?x?xf32>,
// CHECK-SAME:  %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
// CHECK-SAME:  %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
// CHECK:       %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
// Verify that the linear-indices computation uses the layout from the 'indices' producer op (%2).
// CHECK:       %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
// CHECK:       %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
// CHECK:       %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
// CHECK-SAME:    {{{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [7]>
// CHECK-SAME:    {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>}
// CHECK:       %[[RES:.+]] = arith.select {{[^{]*}}
// CHECK-SAME:    {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
// CHECK-SAME:    {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
}

Reviewer (on the layout_operand_2 check above): Similar to my previous comment, operand layouts are not needed.

// -----

gpu.module @xevm_module {
gpu.func @load_static_layout_mixed(%source: memref<8x16x32xf32>,
    %off0: index, %off1: index, %off2: index,
    %mask: vector<8x16xi1>) -> vector<8x16xf32> {
  %pass_thru = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
  %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1]>} dense<[[0], [32], [64], [96], [128], [160], [192], [224]]> : vector<8x1xindex>
  %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
  %0 = vector.broadcast %cst_1 {layout_result_0 = #xegpu.layout<sg_layout = [3]>} : vector<8x1xindex> to vector<8x16xindex>
  %1 = vector.broadcast %cst_2 {layout_result_0 = #xegpu.layout<sg_layout = [4]>} : vector<1x16xindex> to vector<8x16xindex>
  %2 = arith.addi %0, %1 {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>

  %res = vector.gather %source[%off0, %off1, %off2][%2], %mask,
       %pass_thru {
         layout_result_0 = #xegpu.layout<sg_layout = [6]>,
         layout_operand_5 = #xegpu.layout<sg_layout = [7]>
       } : memref<8x16x32xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
  %res2 = arith.addf %res, %pass_thru : vector<8x16xf32>
  gpu.return %res2 : vector<8x16xf32>
}
// CHECK-LABEL: @load_static_layout_mixed(
// CHECK-SAME:  %[[SRC:.+]]: memref<8x16x32xf32>,
// CHECK-SAME:  %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
// CHECK-SAME:  %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
// CHECK:       %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
// Verify that the linear-indices computation uses the layout from the 'indices' producer op (%2).
// CHECK:       %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
// CHECK:       %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
// CHECK:       %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
// CHECK-SAME:    {{{[^}]*}}layout_operand_2 = #xegpu.layout<sg_layout = [7]>
// CHECK-SAME:    {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>}
// CHECK:       %[[RES:.+]] = arith.select {{[^{]*}}
// CHECK-SAME:    {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
// CHECK-SAME:    {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
}

// -----

gpu.module @xevm_module {
gpu.func @load_dynamic_layout_mixed_override(%source: memref<?x?x?xf32>,
    %off0: index, %off1: index, %off2: index,
    %mask: vector<8x16xi1>) -> vector<8x16xf32> {
  %pass_thru = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
  %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1]>} dense<[[0], [32], [64], [96], [128], [160], [192], [224]]> : vector<8x1xindex>
  %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
  %0 = vector.broadcast %cst_1 {layout_result_0 = #xegpu.layout<sg_layout = [3]>} : vector<8x1xindex> to vector<8x16xindex>
  %1 = vector.broadcast %cst_2 {layout_result_0 = #xegpu.layout<sg_layout = [4]>} : vector<1x16xindex> to vector<8x16xindex>
  %2 = arith.addi %0, %1 {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>

  %res = vector.gather %source[%off0, %off1, %off2][%2], %mask,
       %pass_thru {
         layout_result_0 = #xegpu.layout<sg_layout = [6]>,
         layout_operand_4 = #xegpu.layout<sg_layout = [99]>, // overriding %2's layout
         layout_operand_5 = #xegpu.layout<sg_layout = [7]>
       } : memref<?x?x?xf32>, vector<8x16xindex>, vector<8x16xi1>, vector<8x16xf32> into vector<8x16xf32>
  %res2 = arith.addf %res, %pass_thru : vector<8x16xf32>
  gpu.return %res2 : vector<8x16xf32>
}
// CHECK-LABEL: @load_dynamic_layout_mixed_override(
// CHECK-SAME:  %[[SRC:.+]]: memref<?x?x?xf32>,
// CHECK-SAME:  %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
// CHECK-SAME:  %[[MASK:.+]]: vector<8x16xi1>) -> vector<8x16xf32> {
// CHECK:       %[[PASS_THRU:.+]] = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [0]>} dense<0.000000e+00> : vector<8x16xf32>
// Verify that the linear-indices computation uses the layout from the 'indices' producer op (%2)
// and not its overridden version from the gather op (sg_layout = [99]).
// CHECK:       %[[SPLAT:.+]] = vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : index to vector<8x16xindex>
// CHECK:       %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [5]>} : vector<8x16xindex>
// CHECK:       %[[VEC:.+]] = xegpu.load %[[BASE_I64:.+]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]]
// CHECK-SAME:    {layout_operand_1 = #xegpu.layout<sg_layout = [99]>, layout_operand_2 = #xegpu.layout<sg_layout = [7]>
// CHECK-SAME:    {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>}
// CHECK:       %[[RES:.+]] = arith.select {{[^{]*}}
// CHECK-SAME:    {{{[^}]*}}layout_operand_0 = #xegpu.layout<sg_layout = [7]>,
// CHECK-SAME:    {{[^}]*}}layout_result_0 = #xegpu.layout<sg_layout = [6]>} : vector<8x16xi1>, vector<8x16xf32>
}

// -----

gpu.module @xevm_module {
gpu.func @load_with_cache_hints(%source: memref<8x16x32xf32>,
    %off1: index, %off2: index, %off3: index,
    %indices: vector<8xindex>, %mask: vector<8xi1>,
    %pass_thru: vector<8xf32>) -> vector<8xf32> {
  %0 = vector.gather %source[%off1, %off2, %off3][%indices], %mask,
       %pass_thru {
         l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>,
         l3_hint = #xegpu.cache_hint<streaming>
       } : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32> into vector<8xf32>
  gpu.return %0 : vector<8xf32>
}
// CHECK-LABEL: @load_with_cache_hints(
// CHECK:       xegpu.load {{[^<]*}}
// CHECK-SAME:  <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, l3_hint = #xegpu.cache_hint<streaming>}>
}

// -----

gpu.module @xevm_module {
gpu.func @load_with_partial_cache_hints(%source: memref<8x16x32xf32>,
    %off1: index, %off2: index, %off3: index,
    %indices: vector<8xindex>, %mask: vector<8xi1>,
    %pass_thru: vector<8xf32>) -> vector<8xf32> {
  %0 = vector.gather %source[%off1, %off2, %off3][%indices], %mask,
       %pass_thru {
         l1_hint = #xegpu.cache_hint<cached>,
         l3_hint = #xegpu.cache_hint<streaming>
       } : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32> into vector<8xf32>
  gpu.return %0 : vector<8xf32>
}
// CHECK-LABEL: @load_with_partial_cache_hints(
// CHECK:       xegpu.load {{[^<]*}}
// CHECK-SAME:  <{l1_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<streaming>}>
}