Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 50 additions & 4 deletions mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -843,7 +843,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size,
OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint);
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
OptionalAttr<DistributeLayoutAttr>:$layout);
let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value);

let extraClassDeclaration = extraBaseClassDeclaration # [{
Expand All @@ -852,6 +853,16 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
return getSource().getType();
}

xegpu::DistributeLayoutAttr getDistributeLayout() {
xegpu::DistributeLayoutAttr layout = nullptr;
if (auto tdescType = getTensorDescType()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are deprecating the load_gather with the tdesc format, so there is no need to check it here.

layout = tdescType.getLayoutAttr();
}
if (!layout)
layout = getLayoutAttr();
return layout;
}

TypedValue<xegpu::TensorDescType> getTensorDesc() {
if (auto tdescType = getTensorDescType()) {
return llvm::cast<TypedValue<xegpu::TensorDescType>>(getSource());
Expand Down Expand Up @@ -895,7 +906,19 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
"IntegerAttr": $chunk_size,
"xegpu::CachePolicyAttr": $l1_hint,
"xegpu::CachePolicyAttr": $l2_hint,
"xegpu::CachePolicyAttr": $l3_hint)>
"xegpu::CachePolicyAttr": $l3_hint)>,
OpBuilder<(ins "Type": $value, "Value": $source, "Value": $mask,
"xegpu::CachePolicyAttr": $l1_hint,
"xegpu::CachePolicyAttr": $l2_hint,
"xegpu::CachePolicyAttr": $l3_hint,
"xegpu::DistributeLayoutAttr": $layout)>,
OpBuilder<(ins "Type": $value, "Value": $source,
"ArrayRef<OpFoldResult>": $offsets, "Value": $mask,
"IntegerAttr": $chunk_size,
"xegpu::CachePolicyAttr": $l1_hint,
"xegpu::CachePolicyAttr": $l2_hint,
"xegpu::CachePolicyAttr": $l3_hint,
"xegpu::DistributeLayoutAttr": $layout)>
];

let hasVerifier = 1;
Expand Down Expand Up @@ -979,7 +1002,8 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size,
OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint);
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
OptionalAttr<DistributeLayoutAttr>:$layout);

let extraClassDeclaration = extraBaseClassDeclaration#[{
Type getDestType() {
Expand All @@ -993,6 +1017,16 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
return TypedValue<xegpu::TensorDescType>();
}

xegpu::DistributeLayoutAttr getDistributeLayout() {
xegpu::DistributeLayoutAttr layout = nullptr;
if (auto tdescType = getTensorDescType()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no need to support the tdesc form.

layout = tdescType.getLayoutAttr();
}
if (!layout)
layout = getLayoutAttr();
return layout;
}

xegpu::TensorDescType getTensorDescType() {
return dyn_cast<xegpu::TensorDescType>(getDestType());
}
Expand Down Expand Up @@ -1030,7 +1064,19 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
"IntegerAttr": $chunk_size,
"xegpu::CachePolicyAttr": $l1_hint,
"xegpu::CachePolicyAttr": $l2_hint,
"xegpu::CachePolicyAttr": $l3_hint)>
"xegpu::CachePolicyAttr": $l3_hint)>,
OpBuilder<(ins "Value": $value, "Value": $dest, "Value": $mask,
"xegpu::CachePolicyAttr": $l1_hint,
"xegpu::CachePolicyAttr": $l2_hint,
"xegpu::CachePolicyAttr": $l3_hint,
"xegpu::DistributeLayoutAttr": $layout)>,
OpBuilder<(ins "Value": $value, "Value": $dest,
"ArrayRef<OpFoldResult>": $offsets, "Value": $mask,
"IntegerAttr": $chunk_size,
"xegpu::CachePolicyAttr": $l1_hint,
"xegpu::CachePolicyAttr": $l2_hint,
"xegpu::CachePolicyAttr": $l3_hint,
"xegpu::DistributeLayoutAttr": $layout)>
];

let hasVerifier = 1;
Expand Down
61 changes: 57 additions & 4 deletions mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -816,7 +816,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
xegpu::CachePolicyAttr l2_hint,
xegpu::CachePolicyAttr l3_hint) {
build(builder, state, valueType, source, Value(), mask, IntegerAttr(),
l1_hint, l2_hint, l3_hint);
l1_hint, l2_hint, l3_hint, /*layout=*/nullptr);
}

void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
Expand All @@ -832,7 +832,34 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
auto offset = vector::FromElementsOp::create(builder, loc, type, values);

build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint,
l2_hint, l3_hint);
l2_hint, l3_hint, /*layout=*/nullptr);
}

void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
Type valueType, Value source, Value mask,
xegpu::CachePolicyAttr l1_hint,
xegpu::CachePolicyAttr l2_hint,
xegpu::CachePolicyAttr l3_hint,
DistributeLayoutAttr layout) {
build(builder, state, valueType, source, Value(), mask, IntegerAttr(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we have this form — a load without offsets?

l1_hint, l2_hint, l3_hint, layout);
}

/// Builder overload for LoadGatherOp that takes the per-lane gather offsets as
/// a list of OpFoldResults (mixed static/dynamic) together with an explicit
/// distribute-layout attribute. The offsets are materialized into a single
/// index-vector operand before delegating to the generated builder.
///
/// \param valueType  result type of the loaded value.
/// \param source     memory source to gather from.
/// \param offsets    per-lane offsets; attributes are turned into constants.
/// \param chunk_size optional number of contiguous elements per lane.
/// \param l1_hint/l2_hint/l3_hint optional cache-policy hints.
/// \param layout     distribute-layout attribute carried on the op
///                   (may be null).
void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
                         Type valueType, Value source,
                         ArrayRef<OpFoldResult> offsets, Value mask,
                         IntegerAttr chunk_size, xegpu::CachePolicyAttr l1_hint,
                         xegpu::CachePolicyAttr l2_hint,
                         xegpu::CachePolicyAttr l3_hint,
                         DistributeLayoutAttr layout) {
  auto loc = source.getLoc();
  int64_t size = static_cast<int64_t>(offsets.size());
  // One index element per provided offset.
  auto type = VectorType::get(size, builder.getIndexType());
  // Convert each OpFoldResult to an SSA Value, creating arith.constant ops
  // for the attribute-held (static) offsets.
  auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
  // Pack the individual index values into a single vector operand.
  auto offset = vector::FromElementsOp::create(builder, loc, type, values);

  build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint,
        l2_hint, l3_hint, layout);
}

//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -883,7 +910,7 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
xegpu::CachePolicyAttr l2_hint,
xegpu::CachePolicyAttr l3_hint) {
build(builder, state, value, dest, Value(), mask, IntegerAttr(), l1_hint,
l2_hint, l3_hint);
l2_hint, l3_hint, /*layout=*/nullptr);
}

void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
Expand All @@ -901,7 +928,33 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state,

// Call the correct builder overload that does not expect result types.
build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint,
l3_hint);
l3_hint, /*layout=*/nullptr);
}

void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
Value value, Value dest, Value mask,
xegpu::CachePolicyAttr l1_hint,
xegpu::CachePolicyAttr l2_hint,
xegpu::CachePolicyAttr l3_hint,
DistributeLayoutAttr layout) {
build(builder, state, value, dest, Value(), mask, IntegerAttr(), l1_hint,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also no offsets?

l2_hint, l3_hint, layout);
}

/// Builder overload for StoreScatterOp that takes the per-lane scatter offsets
/// as a list of OpFoldResults (mixed static/dynamic) together with an explicit
/// distribute-layout attribute. The offsets are materialized into a single
/// index-vector operand before delegating to the generated builder.
///
/// \param value      data to scatter into memory.
/// \param dest       memory destination to scatter into.
/// \param offsets    per-lane offsets; attributes are turned into constants.
/// \param chunk_size optional number of contiguous elements per lane.
/// \param l1_hint/l2_hint/l3_hint optional cache-policy hints.
/// \param layout     distribute-layout attribute carried on the op
///                   (may be null).
void StoreScatterOp::build(
    OpBuilder &builder, OperationState &state, Value value, Value dest,
    ArrayRef<OpFoldResult> offsets, Value mask, IntegerAttr chunk_size,
    xegpu::CachePolicyAttr l1_hint, xegpu::CachePolicyAttr l2_hint,
    xegpu::CachePolicyAttr l3_hint, DistributeLayoutAttr layout) {
  auto loc = dest.getLoc();
  int64_t size = static_cast<int64_t>(offsets.size());
  // One index element per provided offset.
  auto type = VectorType::get(size, builder.getIndexType());
  // Convert each OpFoldResult to an SSA Value, creating arith.constant ops
  // for the attribute-held (static) offsets.
  auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
  // Pack the individual index values into a single vector operand.
  auto offset = vector::FromElementsOp::create(builder, loc, type, values);

  // Call the correct builder overload that does not expect result types.
  build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint,
        l3_hint, layout);
}

//===----------------------------------------------------------------------===//
Expand Down
4 changes: 2 additions & 2 deletions mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -687,7 +687,7 @@ struct UnrollLoadGatherOpWithOffset
auto newOp = xegpu::LoadGatherOp::create(
rewriter, loc, newValueTy, op.getSource(), o, m,
rewriter.getI64IntegerAttr(chunkSize), op.getL1HintAttr(),
op.getL2HintAttr(), op.getL3HintAttr());
op.getL2HintAttr(), op.getL3HintAttr(), op.getLayoutAttr());
newOps.push_back(newOp);
}

Expand Down Expand Up @@ -783,7 +783,7 @@ struct UnrollStoreScatterOpWithOffsets
xegpu::StoreScatterOp::create(rewriter, loc, v, op.getDest(), o, m,
rewriter.getI64IntegerAttr(chunkSize),
op.getL1HintAttr(), op.getL2HintAttr(),
op.getL3HintAttr());
op.getL3HintAttr(), op.getLayoutAttr());
}

rewriter.eraseOp(op);
Expand Down
22 changes: 10 additions & 12 deletions mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -913,7 +913,8 @@ struct WgToSgLoadGatherOpWithOffset
llvm::zip(adaptor.getOffsets(), adaptor.getMask())) {
auto newLoadOp = xegpu::LoadGatherOp::create(
rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr,
op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr());
op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(),
/*layout*/ nullptr);
xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0),
layout.dropSgLayoutAndData());
newLoadOps.push_back(newLoadOp);
Expand Down Expand Up @@ -961,19 +962,16 @@ struct WgToSgStoreScatterOpWithOffset
auto chunkSizeAttr = rewriter.getI64IntegerAttr(chunkSize);
for (auto [val, offs, mask] : llvm::zip(
adaptor.getValue(), adaptor.getOffsets(), adaptor.getMask())) {
xegpu::DistributeLayoutAttr newLayout = nullptr;
if (!layout.getEffectiveLaneLayoutAsInt().empty() ||
!layout.getEffectiveInstDataAsInt().empty())
// Update the layout attribute to drop sg_layout and sg_data.
newLayout = layout.dropSgLayoutAndData();

auto store = xegpu::StoreScatterOp::create(
rewriter, loc, val, op.getDest(), offs, mask, chunkSizeAttr,
op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr());
// Update the layout attribute to drop sg_layout and sg_data.
if (!layout.getEffectiveLaneLayoutAsInt().empty() ||
!layout.getEffectiveInstDataAsInt().empty()) {
for (OpOperand &operand : store->getOpOperands()) {
// Skip for operand one (memref)
if (operand.getOperandNumber() == 1)
continue;
xegpu::setDistributeLayoutAttr(operand, layout.dropSgLayoutAndData());
}
}
op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(),
/*layout*/ newLayout);
}
rewriter.eraseOp(op);
return success();
Expand Down
17 changes: 16 additions & 1 deletion mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ std::string xegpu::getLayoutName(const OpOperand &operand) {
}

std::string xegpu::getLayoutName(const OpResult result) {
if (isa<xegpu::LoadGatherOp, xegpu::StoreScatterOp>(result.getOwner()))
return "layout";
const StringRef prefix = "layout_result_";
return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
}
Expand Down Expand Up @@ -141,6 +143,9 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
return storeOp.getLayoutAttr();

if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp))
return loadGatherOp.getLayoutAttr();

std::string layoutName = getLayoutName(result);
if (defOp->hasAttr(layoutName))
return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
Expand Down Expand Up @@ -168,6 +173,12 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
return storeOp.getLayoutAttr();

// if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(op))
// return loadGatherOp.getDistributeLayout();

if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
return storeScatterOp.getDistributeLayout();

std::string layoutName = xegpu::getLayoutName(opr);
if (op->hasAttr(layoutName))
return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
Expand Down Expand Up @@ -196,7 +207,8 @@ template void xegpu::setDistributeLayoutAttr<mlir::OpOperand>(
void xegpu::setDistributeLayoutAttrs(
Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl) {
op->walk([&](Operation *nestOp) {
if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(nestOp))
if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp, xegpu::LoadGatherOp,
xegpu::StoreScatterOp>(nestOp))
return;

for (OpOperand &opr : nestOp->getOpOperands()) {
Expand All @@ -216,6 +228,9 @@ void xegpu::removeLayoutAttr(const T &operandOrResult) {
std::string name = xegpu::getLayoutName(operandOrResult);
if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
owner->removeAttr(name);
if (isa<xegpu::StoreMatrixOp, xegpu::LoadGatherOp>(owner) &&
owner->hasAttrOfType<DistributeLayoutAttr>("layout"))
owner->removeAttr("layout");
}

// Explicit instantiation for OpResult
Expand Down
10 changes: 5 additions & 5 deletions mlir/test/Dialect/XeGPU/propagate-layout.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor
// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> ->
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you need to change the layout here.
It is checking whether the propagation set the temporarily layout attribute for the load result correct.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You may create separate test that test how the propagation honor the user's setting. Say, user to set a different layout like
layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [1, 2]
for store and expect it propagating from store to load.

Once user set it, the propagation should honor user's setting instead of using its default one.

Note that these xegpu.load variant is to be deprecated. Please just focus on xegpu.load variant that has memref as input.
Also the test may not use chunk_size. We don't really expect user to use the chunk load.

// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>, vector<16xi1> -> vector<16x16xf16>
func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
Expand All @@ -122,7 +122,7 @@ func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256
// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> ->
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1> -> vector<16xf32>
func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
Expand Down Expand Up @@ -167,8 +167,8 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}>
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64,
// CHECK-SAME: layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
Expand All @@ -186,7 +186,7 @@ func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
// CHECK-SAME: <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
Expand Down
12 changes: 6 additions & 6 deletions mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,9 @@ gpu.module @xevm_module{
%1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
%offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
%loaded = scf.if %pred -> (vector<16x8xf16>) {
%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
%3 = xegpu.load %src[%offset], %1 <{chunk_size=8,
layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
Copy link
Contributor

@Jianhui-Li Jianhui-Li Oct 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would leave these two tests as is.

}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
scf.yield %3 : vector<16x8xf16>
} else {
%3 = arith.constant {
Expand Down Expand Up @@ -168,9 +168,9 @@ gpu.module @xevm_module{
%1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
%offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
scf.if %pred {
%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
%3 = xegpu.load %src[%offset], %1 <{chunk_size=8,
layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
}
gpu.return
Expand Down
Loading
Loading