Commit 8ea5e20

[MLIR][XeGPU] Disable block count usage in layout propagation (llvm#168504)
1 parent a088e74 commit 8ea5e20

File tree: 3 files changed, +39 −4 lines

mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td (13 additions, 0 deletions)

@@ -37,6 +37,19 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
     propagate the layouts required for their operands to the producers. With
     this propagated layout information, pass will then update op result type
     with the layout information.
+
+    `layout-kind` option values:
+    - `inst`
+      Propagate the `inst_data` field of the layout attribute. The default is chosen to
+      maximize instruction-level granularity so that the user shape can be processed
+      with the fewest instructions. For N-D operations, this granularity depends on
+      W (width) and H (height) of the instruction shape.
+      The B (block) dimension (or array length) is not included in the default
+      configuration and must be enabled via a separate optimization pass.
+
+    - `lane`
+      Propagate the `lane_layout` and `lane_data` fields of the layout attribute.
+      Default values are selected to align with hardware.
   }];
   let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
                            "vector::VectorDialect"];

mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp (2 additions, 4 deletions)

@@ -495,8 +495,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
   auto [bWidth, bHeight, bCount] = blockWHC.value();
   SmallVector<int> instData;
   int instWidth = xegpu::getLargestDivisor(
-      static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth,
-      bCount);
+      static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth);
   if (instWidth == -1)
     prefetch.emitWarning(
         "No suitable instruction multiple found for the given shape.");
@@ -702,8 +701,7 @@ void LayoutInfoPropagation::visitStoreNdOp(
   auto [bWidth, bHeight, bCount] = blockWHC.value();
   SmallVector<int> instData;
   int instWidth = xegpu::getLargestDivisor(
-      static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth,
-      bCount);
+      static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth);
   if (instWidth == -1)
     store.emitWarning(
         "No suitable instruction multiple found for the given shape.");

mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir (24 additions, 0 deletions)

@@ -1,5 +1,29 @@
 // RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s
 
+
+// CHECK-LABEL: func.func @load_store_no_array_len(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x32xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<8x32xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+// CHECK: %[[TDESC_SRC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: %[[TDESC_DST:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x32xf32>
+// CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]] : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+gpu.module @test {
+// Although the uArch allows 8x32 inst data using block count (or array_len),
+// it is up to optimization passes to decide on the block count usage.
+func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf32>) {
+  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
+  %1 = xegpu.create_nd_tdesc %arg1 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
+  %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xf32> -> vector<8x32xf32>
+  xegpu.store_nd %2, %1 : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32>
+  return
+}
+}
+
+// -----
+
 // CHECK-LABEL: func.func @dpas_f16(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32>
