//
//===----------------------------------------------------------------------===//

+ #include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/DialectImplementation.h"
@@ -30,6 +31,61 @@ void XeGPUDialect::initialize() {
      >();
}

+ bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef<int64_t> shape,
+                                          xegpu::LayoutAttr attr) {
+   assert(attr && "Layout attribute is missing.");
+
+   auto getSubShapeOrNull =
+       [&](llvm::ArrayRef<int64_t> shape, DenseI32ArrayAttr layout,
+           DenseI32ArrayAttr data,
+           bool use_rr = true) -> std::optional<SmallVector<int64_t>> {
+     llvm::SmallVector<int64_t> newShape(shape);
+     if (layout) {
+       auto vec = llvm::to_vector_of<int64_t>(layout.asArrayRef());
+       if (vec.size() != shape.size())
+         return std::nullopt;
+       auto ratio = computeShapeRatio(shape, vec);
+       if (!ratio.has_value())
+         return std::nullopt;
+       newShape = ratio.value();
+     }
+
+     if (data) {
+       auto vec = llvm::to_vector_of<int64_t>(data.asArrayRef());
+       if (vec.size() != shape.size())
+         return std::nullopt;
+       auto ratio = computeShapeRatio(newShape, vec);
+       if (!ratio.has_value() && use_rr)
+         ratio = computeShapeRatio(vec, newShape);
+       if (!ratio.has_value())
+         return std::nullopt;
+
+       // If data is present, always return it as the shape for the next phase.
+       newShape = vec;
+     }
+     return newShape;
+   };
+
+   // Check sgLayout and sgData.
+   auto maybeSgShape =
+       getSubShapeOrNull(shape, attr.getSgLayout(), attr.getSgData());
+   if (!maybeSgShape)
+     return false;
+   auto sgShape = maybeSgShape.value();
+
+   // Check instData; it has neither a layout nor a round-robin fallback.
+   auto maybeInstShape =
+       getSubShapeOrNull(sgShape, nullptr, attr.getInstData(), false);
+   if (!maybeInstShape)
+     return false;
+   auto instShape = maybeInstShape.value();
+
+   // Check laneLayout and laneData.
+   auto maybeLaneShape = getSubShapeOrNull(instShape, attr.getLaneLayout(),
+                                           attr.getLaneData(), false);
+   return maybeLaneShape.has_value();
+ }
+
//===----------------------------------------------------------------------===//
// XeGPU_BlockTensorDescAttr
//===----------------------------------------------------------------------===//
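
To make the phased check in isEvenlyDistributable easier to follow, here is a small self-contained sketch of the same idea written against the standard library rather than the MLIR utilities. The shapes and layout factors are made-up values for illustration; the instData phase and the round-robin fallback are only noted in comments, so this is an approximation of the logic above, not the dialect implementation.

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Element-wise division that succeeds only when every dimension divides
// evenly, similar in spirit to mlir::computeShapeRatio.
static std::optional<std::vector<int64_t>>
shapeRatio(const std::vector<int64_t> &shape, const std::vector<int64_t> &div) {
  if (shape.size() != div.size())
    return std::nullopt;
  std::vector<int64_t> ratio;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (div[i] == 0 || shape[i] % div[i] != 0)
      return std::nullopt;
    ratio.push_back(shape[i] / div[i]);
  }
  return ratio;
}

int main() {
  // Hypothetical tensor shape and layout factors (not taken from any test).
  std::vector<int64_t> shape = {128, 128};
  std::vector<int64_t> sgLayout = {4, 8}, sgData = {32, 16};
  std::vector<int64_t> laneLayout = {1, 16}, laneData = {1, 1};

  // Phase 1: split across subgroups, then check each subgroup tile against
  // sgData. When sgData is present it becomes the shape for the next phase.
  // (The real lambda also tries the inverse ratio as a round-robin fallback.)
  auto perSg = shapeRatio(shape, sgLayout);                   // {32, 16}
  bool ok = perSg && shapeRatio(*perSg, sgData).has_value();  // {1, 1}
  std::vector<int64_t> sgShape = sgData;

  // The instData phase is skipped here. The lane phase splits the subgroup
  // tile across lanes and checks the per-lane tile against laneData.
  auto perLane = shapeRatio(sgShape, laneLayout);             // {32, 1}
  ok = ok && perLane && shapeRatio(*perLane, laneData).has_value();

  std::cout << (ok ? "evenly distributable\n" : "not distributable\n");
  return 0;
}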
@@ -241,7 +297,7 @@ LogicalResult TensorDescType::verify(
    llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
    mlir::Attribute encoding, mlir::Attribute layout) {
  size_t rank = shape.size();
-   // Low-pressure types are packed in 32-bit units.
+   // Low-precision types are packed in 32-bit units.
  int32_t packingFactor = 32 / elementType.getIntOrFloatBitWidth();
  if (rank != 1 && rank != 2)
    return emitError() << "expected 1D or 2D tensor";
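
As a quick aside on the reworded comment: packingFactor is simply 32 divided by the element bit width, i.e. how many narrow elements share one 32-bit unit. A minimal standalone illustration, assuming generic bit widths rather than specific XeGPU element types:

#include <cstdint>
#include <initializer_list>
#include <iostream>

int main() {
  // packingFactor = 32 / element bit width: the number of low-precision
  // elements that fit into one 32-bit unit.
  for (int32_t bits : {8, 16, 32}) {
    int32_t packingFactor = 32 / bits;
    std::cout << bits << "-bit elements -> packing factor " << packingFactor
              << "\n"; // prints 4, 2, 1
  }
  return 0;
}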
@@ -268,23 +324,21 @@ LogicalResult TensorDescType::verify(
    }
  }

-   if (auto blockAttr =
-           mlir::dyn_cast_if_present<BlockTensorDescAttr>(encoding)) {
+   auto blockAttr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(encoding);
+   if (blockAttr) {
    MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace();
    if (rank == 2 && memorySpaceAttr &&
        memorySpaceAttr.getValue() == MemorySpace::SLM)
      return emitError() << "SLM is not supported for 2D block tensor";
  }

-   if (auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout)) {
-
+   auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout);
+   if (layoutAttr) {
    if (rank != (size_t)layoutAttr.getRank())
      return emitError() << "expected layout rank to match tensor rank";

-     ArrayRef<int32_t> laneLayout = layoutAttr.getLaneLayout().asArrayRef();
-     ArrayRef<int32_t> laneData = layoutAttr.getLaneData().asArrayRef();
-
-     if (scatterAttr) {
+     auto laneData = layoutAttr.getLaneData();
+     if (scatterAttr && laneData) {
      // Validate subgroup mapping rules for scattered tensors.
      // A work-item's slice of the tensor with shape [sg_size] or
      // [sg_size, chunk_size] will be [1] or [1, 32/element_ty_bit_width]
@@ -294,20 +348,19 @@ LogicalResult TensorDescType::verify(
      if (rank > 1 && laneData[0] != 1)
        return emitError()
               << "cannot map over non-contiguous scattered row elements";
-       if (laneData.back() != packingFactor)
+       if (laneData[rank - 1] != packingFactor)
        return emitError() << "work item data mapping must match the number of "
                              "contiguous elements";
    }

-     for (size_t i = 0; i < shape.size(); ++i) {
-       uint32_t numElemPerWi = laneLayout[i] * laneData[i];
-       if (shape[i] < numElemPerWi || shape[i] % numElemPerWi != 0)
-         return emitError() << "cannot distribute " << shape[i] << " over "
-                            << laneLayout[i] << " work items with "
-                            << laneData[i] << " elements each";
+     if (!XeGPUDialect::isEvenlyDistributable(shape, layoutAttr)) {
+       std::string shapeStr;
+       llvm::raw_string_ostream stream(shapeStr);
+       llvm::interleaveComma(shape, stream);
+       return emitError() << "cannot distribute [" << shapeStr << "] using "
+                          << layoutAttr;
    }
  }
-
  return success();
}

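For reference, the new diagnostic prints the whole tensor shape rather than a single offending dimension. A minimal sketch of the resulting message text, using std::ostringstream in place of llvm::raw_string_ostream/interleaveComma and a placeholder for the streamed layout attribute:

#include <cstdint>
#include <iostream>
#include <sstream>
#include <vector>

// Joins the shape with ", " the way llvm::interleaveComma would.
static std::string joinShape(const std::vector<int64_t> &shape) {
  std::ostringstream os;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (i)
      os << ", ";
    os << shape[i];
  }
  return os.str();
}

int main() {
  std::vector<int64_t> shape = {30, 64}; // hypothetical, not evenly divisible
  // "#xegpu.layout<...>" is a placeholder; the verifier streams the actual
  // LayoutAttr instead.
  std::cout << "cannot distribute [" << joinShape(shape) << "] using "
            << "#xegpu.layout<...>\n";
  return 0;
}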