Address review comments: add xegpu::addElementwise and use it for matrix op unrolling

chencha3 · chencha3 · commit a0512d9f95f0 · 2025-09-03T15:09:44.000Z
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -144,6 +144,11 @@ void doSCFStructuralTypeConversionWithTensorType(Operation *op,
 /// if no GPU module parent or XeVM target attribute exists.
 std::optional<std::string> getChipStr(Operation *op);
 
+/// Adds two equal-length arrays element-wise, folding index adds when possible.
+SmallVector<OpFoldResult> addElementwise(OpBuilder &builder, Location loc,
+                                         ArrayRef<OpFoldResult> lhs,
+                                         ArrayRef<OpFoldResult> rhs);
+
 /// Generates element-wise addition ops of two arrays with automatic alignment.
 /// When the input arrays have different sizes, the shorter array is
 /// right-aligned with the longer array, and the unmatched leading elements from
@@ -157,7 +162,6 @@ std::optional<std::string> getChipStr(Operation *op);
 SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc,
                                               ArrayRef<OpFoldResult> lhs,
                                               ArrayRef<OpFoldResult> rhs);
-
 } // namespace xegpu
 
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -686,12 +686,12 @@ struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
   using UnrollPattern<xegpu::LoadMatrixOp>::UnrollPattern;
   LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op,
                                 PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    VectorType valueTy = op.getType();
     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
-    if (!targetShape)
+    if (!targetShape || targetShape->size() != (size_t)valueTy.getRank())
       return failure();
 
-    Location loc = op.getLoc();
-    VectorType valueTy = op.getType();
     Type elemTy = valueTy.getElementType();
     ArrayRef<int64_t> shape = valueTy.getShape();
     auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
@@ -702,17 +702,17 @@ struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
     SmallVector<SmallVector<OpFoldResult>> offsetsList;
     for (SmallVector<int64_t> offsets :
          StaticTileOffsetRange(shape, *targetShape)) {
-      auto adds = xegpu::addWithRightAligned(
+      auto adds = xegpu::addElementwise(
           rewriter, loc, mixedOffsets,
           getAsIndexOpFoldResult(op.getContext(), offsets));
       offsetsList.push_back(adds);
     }
 
     SmallVector<Value> newOps;
+    layout = layout.dropInstData();
     for (SmallVector<OpFoldResult> offsets : offsetsList) {
       auto newOp = rewriter.create<xegpu::LoadMatrixOp>(
-          op.getLoc(), newValueTy, op.getMemDesc(), offsets,
-          layout.dropInstData());
+          op.getLoc(), newValueTy, op.getMemDesc(), offsets, layout);
       newOps.push_back(newOp);
     }
     Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
@@ -743,7 +743,7 @@ struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
     SmallVector<SmallVector<OpFoldResult>> offsetsList;
     for (SmallVector<int64_t> offsets :
          StaticTileOffsetRange(shape, *targetShape)) {
-      auto adds = xegpu::addWithRightAligned(
+      auto adds = xegpu::addElementwise(
           rewriter, loc, mixedOffsets,
           getAsIndexOpFoldResult(op.getContext(), offsets));
       offsetsList.push_back(adds);
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -447,6 +447,21 @@ std::optional<std::string> xegpu::getChipStr(Operation *op) {
   return std::nullopt;
 }
 
+/// Adds two equal-length arrays element-wise, folding index adds when possible.
+SmallVector<OpFoldResult> xegpu::addElementwise(OpBuilder &builder,
+                                                Location loc,
+                                                ArrayRef<OpFoldResult> lhs,
+                                                ArrayRef<OpFoldResult> rhs) {
+  assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");
+  SmallVector<OpFoldResult> results;
+  for (auto [l, r] : llvm::zip_equal(lhs, rhs)) {
+    auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
+    auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
+    results.push_back(builder.createOrFold<index::AddOp>(loc, lval, rval));
+  }
+  return results;
+}
+
 /// Generates element-wise addition ops of two arrays with automatic alignment.
 /// When the input arrays have different sizes, the shorter array is
 /// right-aligned with the longer array, and the unmatched leading elements from
@@ -466,10 +481,6 @@ xegpu::addWithRightAligned(OpBuilder &builder, Location loc,
   ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
   SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
   a = a.slice(a.size() - b.size());
-  for (auto [l, r] : llvm::zip(a, b)) {
-    auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
-    auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
-    results.push_back(builder.createOrFold<index::AddOp>(loc, lval, rval));
-  }
+  results.append(addElementwise(builder, loc, a, b));
   return results;
 }