Skip to content

Commit bcc9d85

Browse files
committed
Address feedback
1 parent daa143f commit bcc9d85

File tree

3 files changed

+39
-53
lines changed

3 files changed

+39
-53
lines changed

mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ static LayoutInfo getDefaultSIMTLayoutInfo(unsigned rank) {
195195

196196
/// Helper to get the default layout for a vector type.
197197
static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
198-
bool scattered = false) {
198+
bool isScattered = false) {
199199
// Expecting a 1D or 2D vector.
200200
assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
201201
"Expected 1D or 2D vector.");
@@ -208,7 +208,7 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
208208
// Packing factor is determined by the element type bitwidth.
209209
int packingFactor = 1;
210210
unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
211-
if (scattered) {
211+
if (isScattered) {
212212
packingFactor =
213213
bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter
214214
? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth
@@ -224,7 +224,7 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
224224

225225
/// Helper to get the default layout for a tensor descriptor type.
226226
static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
227-
bool scattered = false) {
227+
bool isScattered = false) {
228228
// Expecting a 1D or 2D TensorDesc.
229229
assert((tdescTy.getRank() == 1 || tdescTy.getRank() == 2) &&
230230
"Expected 1D or 2D TensorDesc.");
@@ -237,7 +237,7 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
237237
// Packing factor is determined by the element type bitwidth.
238238
unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth();
239239

240-
if (scattered) {
240+
if (isScattered) {
241241
int packingFactor =
242242
bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter
243243
? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth
@@ -558,7 +558,10 @@ void LayoutInfoPropagation::visitLoadGatherOp(
558558
ArrayRef<const LayoutInfoLattice *> results) {
559559
// The layout is strictly determined by the payload type.
560560
auto payloadTy = dyn_cast<VectorType>(load.getValueType());
561-
assert(payloadTy && "Only vector payload distribution is supported");
561+
if (!payloadTy) {
562+
load.emitWarning("Not propagating, non-vector payload supplied.");
563+
return;
564+
}
562565
LayoutInfo layout = getDefaultSIMTLayoutInfo(payloadTy, /*scattered*/ true);
563566

564567
// Mask operand should have 1D default layout.
@@ -569,9 +572,8 @@ void LayoutInfoPropagation::visitLoadGatherOp(
569572
propagateIfChanged(operands[0], operands[0]->meet(layout));
570573
// Propagate the new layout to the mask and optional offset operand.
571574
propagateIfChanged(operands[1], operands[1]->meet(maskLayout));
572-
if (load.getOffsets()) {
575+
if (load.getOffsets())
573576
propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
574-
}
575577
}
576578

577579
/// Propagate the layout of the descriptor to the vector offset operand in
@@ -597,7 +599,10 @@ void LayoutInfoPropagation::visitStoreScatterOp(
597599
// the tensor descriptor is equal to the subgroup size. This is ensured by
598600
// the op verifier.
599601
auto payloadTy = dyn_cast<VectorType>(storeScatter.getValueType());
600-
assert(payloadTy && "Only vector payload distribution is supported");
602+
if (!payloadTy) {
603+
storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
604+
return;
605+
}
601606
auto payloadShape = payloadTy.getShape();
602607
if (payloadShape.size() > 1)
603608
assert(

mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp

Lines changed: 17 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -849,18 +849,10 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
849849
return rewriter.notifyMatchFailure(storeScatterOp,
850850
"Expected 1D offsets and mask vector");
851851
VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
852-
assert(storeVecTy.getRank() <= 2 &&
853-
"Expected at most 2D result at SG level");
854-
VectorType distStoreVecTy;
855-
if (storeVecTy.getRank() == 2)
856-
distStoreVecTy = VectorType::Builder(storeVecTy).dropDim(0);
857-
else // rank 1
858-
distStoreVecTy = VectorType::Builder(storeVecTy).setDim(0, 1);
859-
// Assume offset and mask producers will be distributed as well.
860-
VectorType distOffsetsTy =
861-
VectorType::get({1}, getElementTypeOrSelf(offsetsTy));
862-
VectorType distMaskTy = VectorType::get(
863-
{1}, getElementTypeOrSelf(storeScatterOp.getMask().getType()));
852+
if (storeVecTy.getRank() > 2)
853+
return rewriter.notifyMatchFailure(
854+
storeScatterOp, "Expected at most 2D result at SG level");
855+
864856
std::string layoutPayloadName =
865857
xegpu::getLayoutName(storeScatterOp->getOpOperand(0));
866858
std::string layoutOffsetsName =
@@ -884,17 +876,20 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
884876
if (failed(distStoreVecByWarpOpOrFailure) ||
885877
failed(distOffsetsByWarpOpOrFailure) ||
886878
failed(distMaskByWarpOpOrFailure)) {
887-
storeScatterOp.emitWarning(
879+
return rewriter.notifyMatchFailure(
880+
storeScatterOp,
888881
"Some vector operands have no layouts, using defaults instead.");
889882
}
890-
distStoreVecTy = distStoreVecByWarpOpOrFailure.value_or(distStoreVecTy);
891-
distOffsetsTy = distOffsetsByWarpOpOrFailure.value_or(distOffsetsTy);
892-
distMaskTy = distMaskByWarpOpOrFailure.value_or(distMaskTy);
883+
VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
884+
VectorType expectedPayloadTy = VectorType::get(
885+
{distPayloadTy.getNumElements()}, distPayloadTy.getElementType());
893886

894887
SmallVector<size_t> newRetIndices;
895888
SmallVector<Value> operands = storeScatterOp->getOperands();
896889
SmallVector<Type> operandTypesToYield = {
897-
distStoreVecTy, operands[1].getType(), distOffsetsTy, distMaskTy};
890+
expectedPayloadTy, operands[1].getType(),
891+
distOffsetsByWarpOpOrFailure.value(),
892+
distMaskByWarpOpOrFailure.value()};
898893

899894
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
900895
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
@@ -958,10 +953,6 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
958953
return rewriter.notifyMatchFailure(loadGatherOp,
959954
"Expected 1D offsets and mask vector");
960955
// Assume offset and mask producers will be distributed as well.
961-
VectorType distOffsetsTy =
962-
VectorType::get({1}, getElementTypeOrSelf(offsetsTy));
963-
VectorType distMaskTy = VectorType::get({1}, getElementTypeOrSelf(maskTy));
964-
965956
std::string layoutOffsetsName =
966957
xegpu::getLayoutName(loadGatherOp->getOpOperand(1));
967958
std::string layoutMaskName =
@@ -978,16 +969,16 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
978969
getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
979970
if (failed(distOffsetsByWarpOpOrFailure) ||
980971
failed(distMaskByWarpOpOrFailure)) {
981-
loadGatherOp.emitWarning(
972+
return rewriter.notifyMatchFailure(
973+
loadGatherOp,
982974
"Some vector operands have no layouts, using defaults instead.");
983975
}
984-
distOffsetsTy = distOffsetsByWarpOpOrFailure.value_or(distOffsetsTy);
985-
distMaskTy = distMaskByWarpOpOrFailure.value_or(distMaskTy);
986976

987977
SmallVector<size_t> newRetIndices;
988978
SmallVector<Value> operands = loadGatherOp->getOperands();
989-
SmallVector<Type> operandTypesToYield = {operands[0].getType(),
990-
distOffsetsTy, distMaskTy};
979+
SmallVector<Type> operandTypesToYield = {
980+
operands[0].getType(), distOffsetsByWarpOpOrFailure.value(),
981+
distMaskByWarpOpOrFailure.value()};
991982

992983
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
993984
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
@@ -998,7 +989,6 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
998989
const unsigned operandIdx = producedByLastLoad->getOperandNumber();
999990
VectorType loadVecTy =
1000991
cast<VectorType>(warpOp.getResult(operandIdx).getType());
1001-
assert(loadVecTy.getRank() == 1 && "Expected a distributed vector");
1002992

1003993
rewriter.setInsertionPointAfter(newWarpOp);
1004994
xegpu::LoadGatherOp newOp = rewriter.create<xegpu::LoadGatherOp>(

mlir/test/Dialect/XeGPU/subgroup-distribute.mlir

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -330,24 +330,15 @@ gpu.module @test {
330330
gpu.func @scatter_ops_chunksize(%src: memref<256xf16>) {
331331
%1 = arith.constant dense<1>: vector<16xi1>
332332
%offset = arith.constant dense<12> : vector<16xindex>
333-
%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
334-
xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
335-
gpu.return
336-
}
337-
}
338-
339-
// -----
340-
// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) {
341-
// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
342-
// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
343-
// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
344-
// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
345-
gpu.module @test {
346-
gpu.func @scatter_ops(%src: memref<256xf16>) {
347-
%1 = arith.constant dense<1>: vector<16xi1>
348-
%offset = arith.constant dense<12> : vector<16xindex>
349-
%3 = xegpu.load %src[%offset], %1 : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
350-
xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
333+
%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
334+
layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
335+
layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
336+
} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
337+
xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> {
338+
layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>,
339+
layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
340+
layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
341+
} : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
351342
gpu.return
352343
}
353344
}

0 commit comments

Comments
 (0)