Commit 8ea5e20

[MLIR][XeGPU] Disable block count usage in layout propagation (llvm#168504)
1 parent a088e74 commit 8ea5e20

File tree: 3 files changed, +39 −4 lines

mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td (13 additions, 0 deletions)

@@ -37,6 +37,19 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
     propagate the layouts required for their operands to the producers. With
     this propagated layout information, pass will then update op result type
     with the layout information.
+
+    `layout-kind` option values:
+    - `inst`
+      Propagate the `inst_data` field of the layout attribute. The default is chosen to
+      maximize instruction-level granularity so that the user shape can be processed
+      with the fewest instructions. For N-D operations, this granularity depends on
+      W (width) and H (height) of the instruction shape.
+      The B (block) dimension (or array length) is not included in the default
+      configuration and must be enabled via a separate optimization pass.
+
+    - `lane`
+      Propagate the `lane_layout` and `lane_data` fields of the layout attribute.
+      Default values are selected to align with hardware.
   }];
   let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
                            "vector::VectorDialect"];

mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp (2 additions, 4 deletions)

@@ -495,8 +495,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
   auto [bWidth, bHeight, bCount] = blockWHC.value();
   SmallVector<int> instData;
   int instWidth = xegpu::getLargestDivisor(
-      static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth,
-      bCount);
+      static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth);
   if (instWidth == -1)
     prefetch.emitWarning(
         "No suitable instruction multiple found for the given shape.");
@@ -702,8 +701,7 @@ void LayoutInfoPropagation::visitStoreNdOp(
   auto [bWidth, bHeight, bCount] = blockWHC.value();
   SmallVector<int> instData;
   int instWidth = xegpu::getLargestDivisor(
-      static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth,
-      bCount);
+      static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth);
   if (instWidth == -1)
     store.emitWarning(
         "No suitable instruction multiple found for the given shape.");

mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir (24 additions, 0 deletions)

@@ -1,5 +1,29 @@
 // RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s
 
+
+// CHECK-LABEL: func.func @load_store_no_array_len(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x32xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<8x32xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+// CHECK: %[[TDESC_SRC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: %[[TDESC_DST:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x32xf32>
+// CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]] : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+gpu.module @test {
+// Although the uArch allows 8x32 inst data using block count (or array_len),
+// it is up to optimization passes to decide on the block count usage.
+func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf32>) {
+  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
+  %1 = xegpu.create_nd_tdesc %arg1 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
+  %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xf32> -> vector<8x32xf32>
+  xegpu.store_nd %2, %1 : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32>
+  return
+}
+}
+
+// -----
+
 // CHECK-LABEL: func.func @dpas_f16(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32>
