
Commit f291f33

[MLIR][XeGPU] Support order attribute and add pattern for vector.transpose in WgToSg Pass (#165307)
This PR does the following:
1. Handles the order attribute when delinearizing the linear subgroup id into a multi-dimensional id.
2. Adds a transformation pattern for vector.transpose to the wg-to-sg pass.
3. Updates the CHECK lines in the wg-to-sg tests.
1 parent 952d4b4 commit f291f33

File tree: 9 files changed, +387 −286 lines

mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp

Lines changed: 70 additions & 15 deletions
@@ -280,27 +280,82 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
 FailureOr<SmallVector<Value>>
 LayoutAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) {
 
-  // TODO: handle order attribute
-  auto hasDefaultOrder = [&]() {
-    DenseI32ArrayAttr order = getOrder();
-    return !order || isIdentityPermutation(llvm::to_vector_of<int64_t>(
-                         llvm::reverse(order.asArrayRef())));
-  };
-  if (!hasDefaultOrder())
-    return mlir::emitError(loc, "order attribute is currently not supported.");
-  SmallVector<int64_t> layout;
+  SmallVector<int64_t> sgLayoutInt;
   if (isForWorkgroup()) {
-    layout = getEffectiveSgLayoutAsInt();
+    sgLayoutInt = getEffectiveSgLayoutAsInt();
   } else if (isForSubgroup()) {
-    layout = getEffectiveLaneLayoutAsInt();
+    sgLayoutInt = getEffectiveLaneLayoutAsInt();
   } else {
     return failure();
   }
-  auto dims = llvm::map_to_vector(layout, [&](int64_t d) -> Value {
-    return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
-  });
 
-  return affine::delinearizeIndex(builder, loc, linearId, dims);
+  DenseI32ArrayAttr orderAttr = getOrder();
+
+  // Handle order attribute
+  SmallVector<int64_t> order;
+  if (orderAttr && !orderAttr.empty()) {
+    order = llvm::to_vector(
+        llvm::map_range(orderAttr.asArrayRef(),
+                        [](int32_t idx) { return static_cast<int64_t>(idx); }));
+  } else {
+    // Default order: [1, 0] for 2D (row-major), [2, 1, 0] for 3D, etc.
+    order = llvm::to_vector(
+        llvm::reverse(llvm::seq<int64_t>(0, sgLayoutInt.size())));
+  }
+
+  if (order.size() != sgLayoutInt.size()) {
+    return failure();
+  }
+
+  SmallVector<Value> result(sgLayoutInt.size());
+  Value remaining = linearId;
+
+  /// Process dimensions in the order they appear in the order array.
+  /// The first dimension in order is the fastest-changing.
+  ///
+  /// Example walkthrough for linearId=22, sgLayout=[2,4,4], order=[2,1,0]:
+  ///
+  /// Initial: remaining=22, dimIdx = order[i], dimSize = sgLayout[dimIdx],
+  /// result=[?,?,?]
+  ///
+  /// i=0 (process columns, dimIdx=2, dimSize=4):
+  ///   result[2] = 22 % 4 = 2 (column coordinate)
+  ///   remaining = 22 / 4 = 5 (5 complete groups of 4 columns processed)
+  ///
+  /// i=1 (process rows, dimIdx=1, dimSize=4):
+  ///   result[1] = 5 % 4 = 1 (row coordinate)
+  ///   remaining = 5 / 4 = 1 (1 complete group of 4 rows processed)
+  ///
+  /// i=2 (process layers, dimIdx=0, dimSize=2):
+  ///   result[0] = 1 % 2 = 1 (layer coordinate)
+  ///   (no remaining update - last iteration)
+  ///
+  /// Final result: [1,1,2] = Layer 1, Row 1, Column 2
+  for (size_t i = 0; i < order.size(); ++i) {
+    int64_t dimIdx = order[i];
+    int64_t dimSize = sgLayoutInt[dimIdx];
+
+    Value dimSizeVal =
+        builder.createOrFold<arith::ConstantIndexOp>(loc, dimSize);
+
+    /// Extract the coordinate for this dimension using a modulo operation.
+    /// This gives us "how far within this dimension" we are,
+    /// e.g., linearId=22, dimSize=4: 22 % 4 = 2 (we're at position 2 within
+    /// this dimension).
+    result[dimIdx] =
+        builder.createOrFold<index::RemUOp>(loc, remaining, dimSizeVal);
+
+    /// Update remaining for the next dimension by removing what we've already
+    /// processed. Division tells us "how many complete groups of this
+    /// dimension we've gone through", e.g., linearId=22, dimSize=4:
+    /// 22 / 4 = 5 (we've completed 5 groups of 4). Skip this for the last
+    /// iteration since there's no next dimension to process.
+    if (i < order.size() - 1) {
+      remaining =
+          builder.createOrFold<index::DivUOp>(loc, remaining, dimSizeVal);
+    }
+  }
+  return result;
 }
 
 /// Implements DistributeLayoutAttr::computeDistributedCoords to generate
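To make the new index math concrete, here is a minimal, self-contained C++ sketch of the same order-aware delinearization on plain integers (the function delinearize and its names are illustrative only; the committed code builds index.remu/index.divu ops rather than doing the arithmetic directly):

#include <cassert>
#include <cstdint>
#include <vector>

// order[0] names the fastest-changing dimension of the layout.
std::vector<std::int64_t> delinearize(std::int64_t linearId,
                                      const std::vector<std::int64_t> &layout,
                                      const std::vector<std::int64_t> &order) {
  std::vector<std::int64_t> result(layout.size());
  std::int64_t remaining = linearId;
  for (std::int64_t dim : order) {
    result[dim] = remaining % layout[dim]; // coordinate within this dimension
    remaining /= layout[dim];              // peel off the processed groups
  }
  return result;
}

int main() {
  // The walkthrough from the patch: default order [2,1,0] maps id 22 in a
  // [2,4,4] layout to [layer 1, row 1, column 2].
  assert((delinearize(22, {2, 4, 4}, {2, 1, 0}) ==
          std::vector<std::int64_t>{1, 1, 2}));
  // A non-default order makes dimension 0 the fastest-changing instead.
  assert((delinearize(22, {2, 4, 4}, {0, 1, 2}) ==
          std::vector<std::int64_t>{0, 3, 2}));
}

In both cases, re-linearizing the coordinates with the matching strides recovers 22, which is a quick way to sanity-check any order.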

mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp

Lines changed: 69 additions & 12 deletions
@@ -1219,6 +1219,70 @@ struct WgToSgMultiDimReductionOp
   }
 };
 
+// This pattern transforms vector.transpose ops to work at subgroup level.
+struct WgToSgVectorTransposeOp
+    : public OpConversionPattern<vector::TransposeOp> {
+  using OpConversionPattern<vector::TransposeOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::TransposeOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    VectorType resultType = op.getResultVectorType();
+
+    ArrayRef<int64_t> wgShape = resultType.getShape();
+    xegpu::DistributeLayoutAttr layout =
+        xegpu::getDistributeLayoutAttr(op.getResult());
+    if (!layout || !layout.isForWorkgroup())
+      return failure();
+
+    xegpu::DistributeLayoutAttr sourceLayout =
+        xegpu::getDistributeLayoutAttr(op.getVector());
+    if (!sourceLayout || !sourceLayout.isForWorkgroup())
+      return failure();
+
+    SmallVector<int64_t> sourceSgLayout =
+        sourceLayout.getEffectiveSgLayoutAsInt();
+    SmallVector<int64_t> resultSgLayout = layout.getEffectiveSgLayoutAsInt();
+    DenseI32ArrayAttr sourceOrder = sourceLayout.getOrder();
+    DenseI32ArrayAttr resultOrder = layout.getOrder();
+
+    if (!sourceOrder || !resultOrder) {
+      return rewriter.notifyMatchFailure(
+          op, "Both source and result must have order attributes");
+    }
+
+    ArrayRef<int64_t> permutation = op.getPermutation();
+    size_t permutationSize = permutation.size();
+    if (sourceSgLayout.size() != permutationSize ||
+        resultSgLayout.size() != permutationSize) {
+      return rewriter.notifyMatchFailure(
+          op, "Layouts and permutation must have the same rank");
+    }
+
+    // Check that sgLayout, sgData & order are properly transposed for source
+    // and result
+    if (!layout.isTransposeOf(sourceLayout, permutation))
+      return rewriter.notifyMatchFailure(
+          op, "Result layout is not a valid transpose of source layout "
+              "according to permutation");
+
+    SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
+    VectorType newResultType =
+        VectorType::get(sgShape, resultType.getElementType());
+    SmallVector<Value> newTransposeOps;
+    for (auto src : adaptor.getVector()) {
+      auto newTranspose = vector::TransposeOp::create(
+          rewriter, op.getLoc(), newResultType, src, permutation);
+      xegpu::setDistributeLayoutAttr(newTranspose->getResult(0),
+                                     layout.dropSgLayoutAndData());
+      newTransposeOps.push_back(newTranspose.getResult());
+    }
+
+    rewriter.replaceOpWithMultiple(op, {newTransposeOps});
+    return success();
+  }
+};
+
 } // namespace
 
 namespace mlir {
@@ -1233,7 +1297,8 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
                WgToSgArithConstantOp, WgToSgLoadGatherOpWithOffset,
                WgToSgStoreScatterOpWithOffset, WgToSgLoadMatrixOp,
                WgToSgStoreMatrixOp, WgToSgVectorStepOp, WgToSgVectorShapeCastOp,
-               WgToSgMultiDimReductionOp>(patterns.getContext());
+               WgToSgMultiDimReductionOp, WgToSgVectorTransposeOp>(
+      patterns.getContext());
 }
 } // namespace xegpu
 } // namespace mlir
@@ -1360,7 +1425,9 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
         return isLegal(layout);
       });
 
-  target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp>(
+  target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp,
+                               vector::TransposeOp, vector::BroadcastOp,
+                               vector::MultiDimReductionOp>(
       [=](Operation *op) -> bool {
         // Check for either a SliceAttr or LayoutAttr on the result.
        auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
@@ -1379,16 +1446,6 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
        return isLegal(layout);
      });
 
-  target.addDynamicallyLegalOp<vector::BroadcastOp>(
-      [=](vector::BroadcastOp op) -> bool {
-        return isLegal(xegpu::getDistributeLayoutAttr(op.getResult()));
-      });
-
-  target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
-      [=](vector::MultiDimReductionOp op) -> bool {
-        return isLegal(xegpu::getDistributeLayoutAttr(op.getResult()));
-      });
-
   target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
       [=](xegpu::ConvertLayoutOp op) -> bool {
         return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
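The pattern above gates on layout.isTransposeOf(sourceLayout, permutation). As a rough, hypothetical C++ sketch of what that check means for the sg_layout component (the dialect's actual check also covers sg_data and order; isTransposedLayout below is an illustrative helper, not the real API):

#include <cstddef>
#include <cstdint>
#include <vector>

// Result dimension i must come from source dimension perm[i], mirroring
// vector.transpose semantics applied to the sg_layout.
bool isTransposedLayout(const std::vector<std::int64_t> &src,
                        const std::vector<std::int64_t> &dst,
                        const std::vector<std::int64_t> &perm) {
  if (src.size() != dst.size() || src.size() != perm.size())
    return false;
  for (std::size_t i = 0; i < perm.size(); ++i)
    if (dst[i] != src[perm[i]])
      return false;
  return true;
}

int main() {
  // A transpose with permutation [1, 0]: source sg_layout [8, 4] must pair
  // with result sg_layout [4, 8] for the pattern to fire.
  return isTransposedLayout({8, 4}, {4, 8}, {1, 0}) ? 0 : 1;
}

Each rewritten subgroup-level vector.transpose then operates on the sgShape chunk (roughly, wgShape divided elementwise by sg_layout), with sg_layout and sg_data dropped from its layout.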

mlir/test/Dialect/XeGPU/subgroup-distribute.mlir

Lines changed: 22 additions & 20 deletions
@@ -268,15 +268,16 @@ gpu.module @xevm_module{
 
 // -----
 // CHECK-LABEL: gpu.func @load_store_matrix_1({{.*}}) {
-// CHECK: %[[LAYOUT_X:.*]] = arith.constant 8 : index
-// CHECK: %[[LAYOUT_Y:.*]] = arith.constant 2 : index
+// CHECK: %[[C2:.*]] = arith.constant 2 : index
+// CHECK: %[[C8:.*]] = arith.constant 8 : index
 // CHECK: %[[LANE_ID:.*]] = gpu.lane_id
-// CHECK: %[[DELINEARIZED_LANE_Y:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]]
-// CHECK: %[[DELINEARIZED_LANE_X:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]]
-// CHECK: %[[LANE_Y_OFFSET:.*]] = index.remu %[[DELINEARIZED_LANE_Y]], %[[LAYOUT_Y]]
-// CHECK: %[[LANE_X_OFFSET:.*]] = index.remu %[[DELINEARIZED_LANE_X]], %[[LAYOUT_X]]
-// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32>
-// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+// CHECK: %[[REMU1:.*]] = index.remu %[[LANE_ID]], %[[C8]]
+// CHECK: %[[DIVU:.*]] = index.divu %[[LANE_ID]], %[[C8]]
+// CHECK: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C2]]
+// CHECK: %[[REMU3:.*]] = index.remu %[[REMU2]], %[[C2]]
+// CHECK: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C8]]
+// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[REMU4]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32>
+// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[REMU4]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index
 gpu.module @xevm_module{
 gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) {
   %c0 = arith.constant 0 : index
@@ -288,19 +289,20 @@ gpu.module @xevm_module{
 
 // -----
 // CHECK-LABEL: gpu.func @load_store_matrix_2({{.*}}) {
-// CHECK: %[[DIST_UNIT_HEIGHT_X:.*]] = arith.constant 4 : index
-// CHECK: %[[DIST_UNIT_HEIGHT_Y:.*]] = arith.constant 8 : index
-// CHECK: %[[LANE_DATA_Y:.*]] = arith.constant 2 : index
-// CHECK: %[[USER_OFFSET_X:.*]] = arith.constant 1 : index
+// CHECK: %[[C8:.*]] = arith.constant 8 : index
+// CHECK: %[[C2:.*]] = arith.constant 2 : index
+// CHECK: %[[C4:.*]] = arith.constant 4 : index
+// CHECK: %[[C1:.*]] = arith.constant 1 : index
 // CHECK: %[[LANE_ID:.*]] = gpu.lane_id
-// CHECK: %[[DELINEARIZED_LANE_Y:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]]
-// CHECK: %[[DELINEARIZED_LANE_X:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]]
-// CHECK: %[[LANE_Y_OFFSET_1:.*]] = index.mul %[[DELINEARIZED_LANE_Y]], %[[LANE_DATA_Y]]
-// CHECK: %[[LANE_Y_OFFSET:.*]] = index.remu %[[LANE_Y_OFFSET_1]], %[[DIST_UNIT_HEIGHT_Y]]
-// CHECK: %[[LANE_X_OFFSET_1:.*]] = index.remu %[[DELINEARIZED_LANE_X]], %[[DIST_UNIT_HEIGHT_X]]
-// CHECK: %[[LANE_X_OFFSET:.*]] = index.add %[[LANE_X_OFFSET_1]], %[[USER_OFFSET_X]]
-// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32>
-// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+// CHECK: %[[REMU1:.*]] = index.remu %[[LANE_ID]], %[[C4]]
+// CHECK: %[[DIVU:.*]] = index.divu %[[LANE_ID]], %[[C4]]
+// CHECK: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C4]]
+// CHECK: %[[MUL:.*]] = index.mul %[[REMU2]], %[[C2]]
+// CHECK: %[[REMU3:.*]] = index.remu %[[MUL]], %[[C8]]
+// CHECK: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C4]]
+// CHECK: %[[ADD:.*]] = index.add %[[REMU4]], %[[C1]]
+// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[ADD]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32>
+// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[ADD]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index
 gpu.module @xevm_module{
 gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) {
   %c0 = arith.constant 0 : index
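The updated CHECK chains are simply the remu/divu form of the new delinearization. A quick hand check for load_store_matrix_1, assuming a lane layout of [2, 8] (read off the emitted constants) and picking lane id 13 for illustration:

#include <cassert>

int main() {
  int lane = 13;
  int x = lane % 8;       // %[[REMU1]]: column coordinate (fastest-changing)
  int y = (lane / 8) % 2; // %[[DIVU]] then %[[REMU2]]: row coordinate
  assert(x == 5 && y == 1);
  // %[[REMU3]] and %[[REMU4]] then wrap the coordinates into the
  // distribution unit; a no-op here since y < 2 and x < 8 already.
}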
Lines changed: 19 additions & 20 deletions
@@ -1,33 +1,32 @@
 // RUN: mlir-opt --test-xegpu-layout-interface --cse -split-input-file %s | FileCheck %s
 
-//CHECk: #map = affine_map<()[s0] -> (s0 floordiv 8)>
 gpu.module @test {
 gpu.func @slice_attr() -> vector<128xindex> {
-  //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
-  //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
-  //CHECK: [[c32:%.+]] = arith.constant 32 : index
-  //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
-  //CHECK: [[c128:%.+]] = arith.constant 128 : index
-  //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
-  //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
-  //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
-  //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
+  // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
+  // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C8:.*]]
+  // CHECK-DAG: %[[REMU:.*]] = index.remu %[[DIVU]], %[[C4:.*]]
+  // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C32:.*]]
+  // CHECK-DAG: %[[MOD:.*]] = index.remu %[[MUL]], %[[C128:.*]]
+  // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex>
+  // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex>
+  // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex>
   %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex>
   gpu.return %step : vector<128xindex>
 }
 
 gpu.func @nested_slice_attr() -> vector<128xindex> {
-  //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
-  //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
-  //CHECK: [[c32:%.+]] = arith.constant 32 : index
-  //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
-  //CHECK: [[c128:%.+]] = arith.constant 128 : index
-  //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
-  //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
-  //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
-  //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
+  // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
+  // CHECK-DAG: %[[DIVU1:.*]] = index.divu %[[SGID]], %[[C1:.*]]
+  // CHECK-DAG: %[[DIVU2:.*]] = index.divu %[[DIVU1]], %[[C8:.*]]
+  // CHECK-DAG: %[[REMU:.*]] = index.remu %[[DIVU2]], %[[C4:.*]]
+  // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C32:.*]]
+  // CHECK-DAG: %[[MOD:.*]] = index.remu %[[MUL]], %[[C128:.*]]
+  // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex>
+  // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex>
+  // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex>
   %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 1], sg_data = [32, 32, 1]>, dims = [2]>, dims = [1]>} : vector<128xindex>
   gpu.return %0 : vector<128xindex>
 }
 
-}
+}
+
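Same arithmetic for the slice_attr test above: sg_layout is [4, 8] with dimension 1 sliced away, so only the row offset survives. For a hypothetical subgroup id of 13:

#include <cassert>

int main() {
  int sgid = 13;
  int row = (sgid / 8) % 4;   // index.divu by 8, then index.remu by 4
  int off = (row * 32) % 128; // index.mul by sg_data, index.remu by wg size
  assert(row == 1 && off == 32);
}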
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir

Lines changed: 2 additions & 4 deletions
@@ -166,14 +166,12 @@ gpu.module @test_elementwise_ops {
     %load_b = xegpu.load_nd %tdesc_b
       : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
       -> vector<24x32xf32>
-    // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
-    // CHECK-SAME-COUNT-12: : vector<2x2xf32>
+    // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
     // CHECK-NOT: arith.negf
     %negf = arith.negf %load_a
       {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
       : vector<24x32xf32>
-    // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
-    // CHECK-SAME-COUNT-12: : vector<2x2xf32>
+    // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
     // CHECK-NOT: math.powf
     %powf = math.powf %load_a, %load_b
       {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}