Add pattern for constant mask

nbpatel · nbpatel · commit be59f467b838 · 2025-11-14T21:20:44.000Z
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1283,57 +1283,74 @@ struct WgToSgVectorTransposeOp
   }
 };
 
-/// Pattern for lowering vector.create_mask and vector.constant_mask ops to
-/// subgroup level.
-template <typename MaskOpType>
-struct WgToSgVectorMaskOp : public OpConversionPattern<MaskOpType> {
-  using OpConversionPattern<MaskOpType>::OpConversionPattern;
-
-  LogicalResult matchAndRewrite(
-      MaskOpType op,
-      typename OpConversionPattern<MaskOpType>::OneToNOpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    VectorType resultType = op.getResult().getType();
-    ArrayRef<int64_t> wgShape = resultType.getShape();
+// This pattern distributes the vector.constant_mask ops to work at subgroup
+// level.
+struct WgToSgVectorConstantMaskOp
+    : public OpConversionPattern<vector::ConstantMaskOp> {
+  using OpConversionPattern<vector::ConstantMaskOp>::OpConversionPattern;
 
+  LogicalResult
+  matchAndRewrite(vector::ConstantMaskOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     xegpu::DistributeLayoutAttr layout =
         xegpu::getDistributeLayoutAttr(op.getResult());
     if (!layout || !layout.isForWorkgroup())
       return failure();
 
-    SmallVector<int64_t> sgShape;
-    int count;
-    std::tie(sgShape, count) = getSgShapeAndCount(wgShape, layout);
-    VectorType newResultType =
-        VectorType::get(sgShape, resultType.getElementType());
+    Location loc = op.getLoc();
+    VectorType type = op.getResult().getType();
+    auto wgShape = type.getShape();
+
+    ArrayRef<int64_t> originalMaskDimSizes = op.getMaskDimSizes();
 
-    SmallVector<Value> newMaskOps;
-    for (int i = 0; i < count; ++i) {
-      Value newMaskOp;
-      if constexpr (std::is_same_v<MaskOpType, vector::CreateMaskOp>) {
-        newMaskOp = vector::CreateMaskOp::create(
-            rewriter, op.getLoc(), newResultType, op.getOperands());
-      } else if constexpr (std::is_same_v<MaskOpType, vector::ConstantMaskOp>) {
-        newMaskOp = vector::ConstantMaskOp::create(
-            rewriter, op.getLoc(), newResultType, op.getMaskDimSizes());
-      } else {
-        return rewriter.notifyMatchFailure(op,
-                                           "Unsupported mask operation type");
+    // Get subgroup ID.
+    Value sgId =
+        gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
+    auto sgOffsets =
+        layout.computeDistributedCoords(rewriter, loc, sgId, wgShape);
+    if (failed(sgOffsets))
+      return failure();
+
+    SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
+    VectorType resultType = VectorType::get(sgShape, type.getElementType());
+
+    SmallVector<Value> newCreateMaskOps;
+    for (auto offsetSet : *sgOffsets) {
+      SmallVector<Value> maskOperands;
+
+      for (auto [i, originalMaskSize] : llvm::enumerate(originalMaskDimSizes)) {
+        Value originalMaskSizeVal =
+            arith::ConstantIndexOp::create(rewriter, loc, originalMaskSize);
+        Value dimSizeVal =
+            arith::ConstantIndexOp::create(rewriter, loc, sgShape[i]);
+        Value offset = offsetSet[i];
+        // Compute: originalMaskSize - offset.
+        Value adjustedMaskSize =
+            arith::SubIOp::create(rewriter, loc, originalMaskSizeVal, offset);
+        // Clamp to [0, dimSize]: max(0, min(adjustedMaskSize,
+        // dimSize))
+        Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
+        Value clampedLow =
+            arith::MaxSIOp::create(rewriter, loc, adjustedMaskSize, zero);
+        Value clampedHigh =
+            arith::MinSIOp::create(rewriter, loc, clampedLow, dimSizeVal);
+        maskOperands.push_back(clampedHigh);
       }
-      xegpu::setDistributeLayoutAttr(cast<OpResult>(newMaskOp),
-                                     layout.dropSgLayoutAndData());
 
-      newMaskOps.push_back(newMaskOp);
+      auto newCreateMaskOp =
+          vector::CreateMaskOp::create(rewriter, loc, resultType, maskOperands);
+      if (!layout.getEffectiveLaneLayoutAsInt().empty() ||
+          !layout.getEffectiveInstDataAsInt().empty())
+        xegpu::setDistributeLayoutAttr(newCreateMaskOp->getResult(0),
+                                       layout.dropSgLayoutAndData());
+      newCreateMaskOps.push_back(newCreateMaskOp.getResult());
     }
 
-    rewriter.replaceOpWithMultiple(op, {newMaskOps});
+    rewriter.replaceOpWithMultiple(op, {newCreateMaskOps});
     return success();
   }
 };
 
-using WgToSgVectorCreateMaskOp = WgToSgVectorMaskOp<vector::CreateMaskOp>;
-using WgToSgVectorConstantMaskOp = WgToSgVectorMaskOp<vector::ConstantMaskOp>;
-
 } // namespace
 
 namespace mlir {
@@ -1349,8 +1366,7 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
            WgToSgStoreScatterOpWithOffset, WgToSgLoadMatrixOp,
            WgToSgStoreMatrixOp, WgToSgVectorStepOp, WgToSgVectorShapeCastOp,
            WgToSgMultiDimReductionOp, WgToSgVectorTransposeOp,
-           WgToSgVectorConstantMaskOp, WgToSgVectorCreateMaskOp>(
-          patterns.getContext());
+           WgToSgVectorConstantMaskOp>(patterns.getContext());
 }
 } // namespace xegpu
 } // namespace mlir
@@ -1477,10 +1493,9 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
         return isLegal(layout);
       });
 
-  target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp,
-                               vector::TransposeOp, vector::BroadcastOp,
-                               vector::MultiDimReductionOp,
-                               vector::ConstantMaskOp, vector::CreateMaskOp>(
+  target.addDynamicallyLegalOp<
+      vector::ShapeCastOp, vector::StepOp, vector::TransposeOp,
+      vector::BroadcastOp, vector::MultiDimReductionOp, vector::ConstantMaskOp>(
       [=](Operation *op) -> bool {
         // Check for either a SliceAttr or LayoutAttr on the result.
         auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
@@ -134,12 +134,8 @@ gpu.module @test_distribution {
   // CHECK-LABEL: vector_mask_2D
   gpu.func @vector_mask_2D() {
     %cst16 = arith.constant 16 : index
-    // CHECK: %[[CST16:.*]] = arith.constant 16 : index
-    // CHECK-COUNT-4: vector.create_mask %[[CST16:.*]], %[[CST16]] : vector<16x16xi1>
+    // CHECK-COUNT-4: vector.create_mask {{.*}}, {{.*}} : vector<16x16xi1>
     // CHECK-NOT: vector.create_mask
-    // CHECK-COUNT-4: vector.constant_mask [16, 16] : vector<16x16xi1>
-    // CHECK-NOT: vector.constant_mask
-    %create_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>} : vector<256x128xi1>
     %constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>} : vector<256x128xi1>
     gpu.return
   }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -550,20 +550,37 @@ gpu.module @test_distribution {
 
   // CHECK-LABEL: vector_mask_1D
   gpu.func @vector_mask_1D() {
+    // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
+    // CHECK-DAG: %[[REMU:.*]] = index.remu %[[SGID]], %[[C2:.*]]
+    // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C16:.*]]
+    // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[MUL]], %[[C32:.*]]
+    // CHECK-DAG: %[[SUB:.*]] = arith.subi %[[C8:.*]], %[[REMU2]] : index
+    // CHECK-DAG: %[[MAX:.*]] = arith.maxsi %[[SUB]], %[[C0:.*]] : index
+    // CHECK-DAG: %[[MIN:.*]] = arith.minsi %[[MAX]], %[[C16:.*]] : index
+    // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1>
     %cst8 = arith.constant 8 : index
-    // CHECK: vector.create_mask {{.*}} : vector<16xi1>
-    %create_mask = vector.create_mask %cst8 {layout_result_0 = #xegpu.layout<sg_layout = [2], sg_data = [16]>} : vector<16xi1>
-    // CHECK: vector.constant_mask [8] : vector<16xi1>
     %constant_mask = vector.constant_mask [8] {layout_result_0 = #xegpu.layout<sg_layout = [2], sg_data = [16]>} : vector<32xi1>
     gpu.return
   }
 
   // CHECK-LABEL: vector_mask_2D
   gpu.func @vector_mask_2D() {
+    // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
+    // CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %[[C4:.*]]
+    // CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %[[C4:.*]]
+    // CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %[[C8:.*]]
+    // CHECK-DAG: %[[ROW:.*]] = index.mul %[[SGIDY]], %[[C32:.*]]
+    // CHECK-DAG: %[[COL:.*]] = index.mul %[[SGIDX]], %[[C32:.*]]
+    // CHECK-DAG: %[[MODROW:.*]] = index.remu %[[ROW]], %[[C256:.*]]
+    // CHECK-DAG: %[[MODCOL:.*]] = index.remu %[[COL]], %[[C128:.*]]
+    // CHECK-DAG: %[[SUBROW:.*]] = arith.subi %[[C16:.*]], %[[MODROW]] : index
+    // CHECK-DAG: %[[MAXROW:.*]] = arith.maxsi %[[SUBROW]], %[[C4:.*]] : index
+    // CHECK-DAG: %[[MINROW:.*]] = arith.minsi %[[MAXROW]], %[[C32:.*]] : index
+    // CHECK-DAG: %[[SUBCOL:.*]] = arith.subi %[[C16:.*]], %[[MODCOL]] : index
+    // CHECK-DAG: %[[MAXCOL:.*]] = arith.maxsi %[[SUBCOL]], %[[C7:.*]] : index
+    // CHECK-DAG: %[[MINCOL:.*]] = arith.minsi %[[MAXCOL]], %[[C32:.*]] : index
+    // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1>
     %cst16 = arith.constant 16 : index
-    // CHECK: vector.create_mask {{.*}}, {{.*}} : vector<32x32xi1>
-    %create_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xi1>
-    // CHECK: vector.constant_mask [16, 16] : vector<32x32xi1>
     %constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xi1>
     gpu.return
   }