3838#include " llvm/ADT/TypeSwitch.h"
3939#include " llvm/ADT/bit.h"
4040#include " llvm/Support/Casting.h"
41+ #include " llvm/Support/Format.h"
42+ #include " llvm/Support/FormatVariadic.h"
4143#include " llvm/Support/LogicalResult.h"
4244#include " llvm/Support/raw_ostream.h"
4345
@@ -63,6 +65,8 @@ constexpr unsigned packedSizeInBitsForDefault =
6365 16 ; // Minimum packing size per register for DPAS A.
6466constexpr unsigned packedSizeInBitsForDpasB =
6567 32 ; // Minimum packing size per register for DPAS B.
/// Attribute-name prefixes used to temporarily tag xegpu layout information on
/// op operands ("layout_operand_<n>") and results ("layout_result_<n>") while
/// distribution runs; filterTemporaryLayoutAttributes strips them again before
/// ops are finalized.
static const char *const operandLayoutNamePrefix = "layout_operand_";
static const char *const resultLayoutNamePrefix = "layout_result_";
6670
6771namespace {
6872
@@ -686,7 +690,8 @@ void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
686690 continue ;
687691 }
688692 // / For every other user, use a generic attribute name.
689- std::string attrName = " op" + std::to_string (operandNumber);
693+ std::string attrName =
694+ operandLayoutNamePrefix + std::to_string (operandNumber);
690695 owner->setAttr (attrName, layout);
691696 }
692697}
@@ -746,7 +751,7 @@ static LogicalResult attachLayoutAttributes(
746751 for (auto [i, r] : llvm::enumerate (op->getResults ())) {
747752 auto layoutInfo = getLayoutInfoForResult (r);
748753 if (layoutInfo) {
749- auto attrName = " r " + std::to_string (i);
754+ auto attrName = resultLayoutNamePrefix + std::to_string (i);
750755 op->setAttr (attrName, layoutInfo);
751756 // / Attach the layout attribute to the users of the result.
752757 attachLayoutAttributeToUsers (r, layoutInfo);
@@ -819,16 +824,29 @@ static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
819824 return distVecTyOrFailure.value ();
820825}
821826
822- static Value reconcileDistribtedVecType (Value orig, VectorType expected,
823- PatternRewriter &rewriter) {
827+ static Value reshapeDistributedVecType (Value orig, VectorType expected,
828+ PatternRewriter &rewriter) {
824829 assert (isa<VectorType>(orig.getType ()) && " expecting vector type" );
825830 auto origVecType = cast<VectorType>(orig.getType ());
826831 // / No need to reconcile if the types are the same.
827832 if (origVecType == expected)
828833 return orig;
829- auto castOp = rewriter.create <UnrealizedConversionCastOp>(orig.getLoc (),
830- expected, orig);
831- return castOp.getResult (0 );
834+ auto castOp =
835+ rewriter.create <vector::ShapeCastOp>(orig.getLoc (), expected, orig);
836+ return castOp.getResult ();
837+ }
838+
839+ static SmallVector<NamedAttribute>
840+ filterTemporaryLayoutAttributes (ArrayRef<NamedAttribute> attrs) {
841+ SmallVector<NamedAttribute> newAttrs;
842+ for (auto attr : attrs) {
843+ if (attr.getName ().strref ().contains (operandLayoutNamePrefix) ||
844+ attr.getName ().strref ().contains (resultLayoutNamePrefix)) {
845+ continue ;
846+ }
847+ newAttrs.push_back (attr);
848+ }
849+ return newAttrs;
832850}
833851
834852// / Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
@@ -903,11 +921,11 @@ struct MoveFuncBodyToWarpExecuteOnLane0
903921};
904922
905923// / Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
906- // / `gpu.warp_execute_on_lane_0` and put it after the warp op. The warp op will
907- // / still contain the original op that will not be used by the yield op (and
908- // / should be cleaned up later with dce). The yield op will bypass the
909- // / create_nd_tdesc's arguments. Tensor descriptor is not distributed because it
910- // / is a uniform value accorss all work items within the subgroup.
924+ // / `gpu.warp_execute_on_lane_0` and put it after the warp op. The warp op
925+ // / will still contain the original op that will not be used by the yield op
926+ // / (and should be cleaned up later with dce). The yield op will bypass the
927+ // / create_nd_tdesc's arguments. Tensor descriptor is not distributed because
928+ // / it is a uniform value across all work items within the subgroup.
911929// /
912930// / Example:
913931// /
@@ -985,10 +1003,10 @@ struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
9851003 }
9861004};
9871005
988- // / Sink a store_nd op at the end of enclosing `gpu.warp_execute_on_lane_0`. In
989- // / case arguments for the store are passed through the warp op interface they
990- // / would be propagated as returned values. Only the source vector for the store
991- // / is distributed according to sg_map attribute.
1006+ // / Sink a store_nd op at the end of enclosing `gpu.warp_execute_on_lane_0`.
1007+ // / In case arguments for the store are passed through the warp op interface
1008+ // / they would be propagated as returned values. Only the source vector for
1009+ // / the store is distributed according to sg_map attribute.
9921010// /
9931011// / Example:
9941012// /
@@ -1033,7 +1051,6 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
10331051 " Failed to distribute the type" );
10341052 VectorType distributedTypeByWarpOp =
10351053 distributedTypeByWarpOpOrFailure.value ();
1036- llvm::errs () << " distributed type: " << distributedTypeByWarpOp << " \n " ;
10371054
10381055 SmallVector<size_t > newRetIndices;
10391056 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns (
@@ -1050,21 +1067,21 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
10501067
10511068 // / For the value operand, there can be a conflict between the vector type
10521069 // / distributed by the warp op and (xegpu-specific) distributed type
1053- // / supported by the store op. We reconcile these mismatches by inserting a
1054- // / cast. These gets cancelled out later.
1070+ // / supported by the store op. We reconcile these mismatches by inserting
1071+ // / a cast. These get cancelled out later.
10551072 auto storeNdDistributedValueTyOrFailure =
10561073 storeOp.getTensorDescType ().getDistributedVectorType ();
10571074 if (failed (storeNdDistributedValueTyOrFailure))
10581075 return rewriter.notifyMatchFailure (
10591076 storeOp, " Failed to get distributed vector type for the store op" );
1060- newStoreOperands.push_back (reconcileDistribtedVecType (
1077+ newStoreOperands.push_back (reshapeDistributedVecType (
10611078 newWarpOp.getResult (newRetIndices[0 ]),
10621079 storeNdDistributedValueTyOrFailure.value (), rewriter));
10631080 newStoreOperands.push_back (newWarpOp.getResult (newRetIndices[1 ]));
10641081
1065- rewriter.create <xegpu::StoreNdOp>(newWarpOp. getLoc (), TypeRange{},
1066- newStoreOperands);
1067- storeOp-> setDialectAttrs (storeOp->getDialectAttrs ( ));
1082+ rewriter.create <xegpu::StoreNdOp>(
1083+ newWarpOp. getLoc (), TypeRange{}, newStoreOperands,
1084+ filterTemporaryLayoutAttributes (storeOp->getAttrs () ));
10681085 rewriter.eraseOp (storeOp);
10691086 return success ();
10701087 }
@@ -1074,8 +1091,9 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
10741091// / `gpu.warp_execute_on_lane_0` and put it after the warp op.
10751092// / The warp op will still contain the original op that will not be used by
10761093// / the yield op (and should be cleaned up later with dce). The yield op will
1077- // / bypass the load's arguments. Only the loaded vector is distributed according
1078- // / to sg_map attribute and, tensor descriptor types is not distributed.
1094+ // / bypass the load's arguments. Only the loaded vector is distributed
1095+ // / according to sg_map attribute and the tensor descriptor type is not
1096+ // / distributed.
10791097// /
10801098// / Example:
10811099// /
@@ -1122,7 +1140,8 @@ struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
11221140
11231141 SmallVector<size_t > newRetIndices;
11241142 gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns (
1125- rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc (),
1143+ rewriter, subgroupOp,
1144+ /* new yielded values = */ loadOp.getTensorDesc (),
11261145 /* new yielded types = */ tensorDescTy, newRetIndices);
11271146
11281147 // / Create a new load op outside the warp op with the distributed vector
@@ -1135,13 +1154,14 @@ struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
11351154 loadOp, " Failed to get distributed vector type for the load op" );
11361155 Value newLoadOp = rewriter.create <xegpu::LoadNdOp>(
11371156 newWarpOp.getLoc (), loadNdDistValueTyOrFailure.value (),
1138- newWarpOp->getResult (newRetIndices[0 ]), loadOp->getAttrs ());
1157+ newWarpOp->getResult (newRetIndices[0 ]),
1158+ filterTemporaryLayoutAttributes (loadOp->getAttrs ()));
11391159 Value distributedVal = newWarpOp.getResult (operandIdx);
1140- // / There can be a conflict between the vector type distributed by the warp
1141- // / op and (xegpu-specific) distributed type supported by the load op. We
1142- // / reconcile these mismatches by inserting a cast.
1143- newLoadOp = reconcileDistribtedVecType (newLoadOp, distributedTypeByWarpOp,
1144- rewriter);
1160+ // / There can be a conflict between the vector type distributed by the
1161+ // / warp op and (xegpu-specific) distributed type supported by the load
1162+ // / op. We reconcile these mismatches by inserting a cast.
1163+ newLoadOp =
1164+ reshapeDistributedVecType (newLoadOp, distributedTypeByWarpOp, rewriter);
11451165 rewriter.replaceAllUsesWith (distributedVal, newLoadOp);
11461166 return success ();
11471167 }
@@ -1161,8 +1181,9 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
11611181 unsigned operandIdx = operand->getOperandNumber ();
11621182 xegpu::LayoutAttr layoutA = dpasOp.getALayoutAttr ();
11631183 xegpu::LayoutAttr layoutB = dpasOp.getBLayoutAttr ();
1184+ auto layoutCName = llvm::formatv (" {0}{1}" , resultLayoutNamePrefix, 0 ).str ();
11641185 xegpu::LayoutAttr layoutOut =
1165- dpasOp->getAttrOfType <xegpu::LayoutAttr>(" r0 " );
1186+ dpasOp->getAttrOfType <xegpu::LayoutAttr>(layoutCName );
11661187 if (!layoutA || !layoutB || !layoutOut)
11671188 return rewriter.notifyMatchFailure (
11681189 dpasOp,
@@ -1211,7 +1232,7 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
12111232 }
12121233
12131234 for (auto i : newRetIndices) {
1214- newDpasOperands.push_back (reconcileDistribtedVecType (
1235+ newDpasOperands.push_back (reshapeDistributedVecType (
12151236 newWarpOp.getResult (i),
12161237 newDpasOperandExpectedTypes[newDpasOperands.size ()], rewriter));
12171238 }
@@ -1220,7 +1241,7 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
12201241 newDpasOperands, dpasOp->getAttrs ());
12211242 Value disributedVal = newWarpOp.getResult (operandIdx);
12221243 // / Reconcile the output type.
1223- disributedVal = reconcileDistribtedVecType (
1244+ disributedVal = reshapeDistributedVecType (
12241245 disributedVal,
12251246 getDistributedVectorType (layoutOut, dpasOp.getResultType ()), rewriter);
12261247 rewriter.replaceAllUsesWith (disributedVal, newDpasOp);
0 commit comments