Address Feedback

nbpatel · nbpatel · commit a092bd34d2d2 · 2025-05-20T02:49:31.000Z
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -81,7 +81,8 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
   calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
                          const SmallVector<OpFoldResult> &originalOffsets,
                          const SmallVector<Value> &localOffset,
-                         const SmallVector<int64_t> &distUnitBaseAddr) const {
+                         const SmallVector<int64_t> &distUnitBaseAddr,
+                         const SmallVector<int64_t> &distUnitShape) const {
     assert(localOffset.size() == distUnitBaseAddr.size() &&
            "localOffset and distUnitBaseAddr must have the same rank");
 
@@ -105,9 +106,13 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
           rewriter.create<arith::ConstantIndexOp>(loc, distUnitBaseAddr[i]);
       Value offset =
           rewriter.createOrFold<index::AddOp>(loc, localOffset[i], constOffset);
+      Value modValue =
+          rewriter.create<arith::ConstantIndexOp>(loc, distUnitShape[i]);
+      Value offsetMod =
+          rewriter.createOrFold<index::RemUOp>(loc, offset, modValue);
       Value origOffset = getValueFromOpFoldResult(originalOffsets[dimIdx]);
       Value globalOffset =
-          rewriter.createOrFold<index::AddOp>(loc, origOffset, offset);
+          rewriter.createOrFold<index::AddOp>(loc, origOffset, offsetMod);
       globalOffsets[dimIdx] = globalOffset;
     }
 
@@ -125,10 +130,27 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
       return failure();
     Type elemTy = tdescTy.getElementType();
     ArrayRef<int64_t> wgShape = tdescTy.getShape();
-    SmallVector<int64_t> sgShape =
-        llvm::to_vector_of<int64_t>(layout.getSgData().asArrayRef());
-    SmallVector<int64_t> sgLayout =
-        llvm::to_vector_of<int64_t>(layout.getSgLayout().asArrayRef());
+    SmallVector<int64_t> sgLayout;
+    if (auto sgLayoutAttr = layout.getSgLayout()) {
+      sgLayout = llvm::to_vector_of<int64_t>(sgLayoutAttr.asArrayRef());
+    } else {
+      // sgLayout must be present for workgroup-level distribution.
+      op.emitError("sgLayout attribute is required in layout");
+      return failure();
+    }
+
+    SmallVector<int64_t> sgShape;
+    if (auto sgDataAttr = layout.getSgData()) {
+      sgShape = llvm::to_vector_of<int64_t>(sgDataAttr.asArrayRef());
+    } else {
+      assert(wgShape.size() == sgLayout.size() &&
+             "sgLayout and wgShape must have the same rank");
+      sgShape.reserve(wgShape.size());
+      for (size_t i = 0; i < wgShape.size(); ++i) {
+        assert(sgLayout[i] != 0 && "sgLayout elements must be non-zero");
+        sgShape.push_back(wgShape[i] / sgLayout[i]);
+      }
+    }
 
     // TODO : Handle order attribute
     // Get the subgroup ID
@@ -168,8 +190,9 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
     SmallVector<Value> newCreateNdOps;
     for (SmallVector<int64_t> distUnitBaseAddr :
          StaticTileOffsetRange(wgShape, distUnitShape)) {
-      SmallVector<OpFoldResult> globalOffsets = calculateGlobalOffsets(
-          rewriter, loc, originalOffsets, localOffset, distUnitBaseAddr);
+      SmallVector<OpFoldResult> globalOffsets =
+          calculateGlobalOffsets(rewriter, loc, originalOffsets, localOffset,
+                                 distUnitBaseAddr, distUnitShape);
 
       auto newCreateNdOp = rewriter.create<xegpu::CreateNdDescOp>(
           loc, newTdescTy, op.getSource(), globalOffsets, op.getMixedSizes(),
@@ -258,11 +281,10 @@ struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
     if (!originalLayout)
       return failure();
 
-    size_t i = 0;
     SmallVector<Value> newDpasOps;
+    size_t i = 0;
     for (auto aVec : adaptor.getLhs()) {
       for (auto bVec : adaptor.getRhs()) {
-
         llvm::SmallVector<Value> operands({aVec, bVec});
         Value tmpC;
         if (op.getAcc()) {
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -14,9 +14,14 @@ gpu.module @test_1_1_assignment {
   // CHECK: %[[REM:.*]] = affine.apply #map1()[%[[SGID]]]
   // CHECK: %[[MUL1:.*]] = index.mul %[[DIV]], %[[C12]]
   // CHECK: %[[MUL2:.*]] = index.mul %[[REM]], %[[C8]]
+  // CHECK: %[[C24:.*]] = arith.constant 24 : index
+  // CHECK: %[[MOD:.*]] = index.remu %[[MUL1]], %[[C24]]
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK: %[[ADD1:.*]] = index.add %[[MUL1]], %[[C0]]
-  // CHECK: %[[ADD2:.*]] = index.add %[[MUL2]], %[[C0]]
+  // CHECK: %[[ADD1:.*]] = index.add %[[MOD]], %[[C0]]
+  // CHECK: %[[C32:.*]] = arith.constant 32 : index
+  // CHECK: %[[MOD1:.*]] = index.remu %[[MUL2]], %[[C32]]
+  // CHECK: %[[C0_1:.*]] = arith.constant 0 : index
+  // CHECK: %[[ADD2:.*]] = index.add %[[MOD1]], %[[C0_1]]
   // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[ADD1]], %[[ADD2]]] : memref<24x32xf32>
   // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
   // CHECK: gpu.return
@@ -108,6 +113,40 @@ gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
+
+// CHECK-LABEL: test_dpas_no_sg_data
+// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
+// CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32>
+gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
+    // CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
+    // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
+    // CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]]
+    // CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
+    // CHECK-SAME: -> vector<12x8xf32>
+    // CHECK: %[[TDESC_B:.*]] = xegpu.create_nd_tdesc %[[ARG_1]][{{%.*}}, {{%.*}}] : memref<32x24xf32>
+    // CHECK-SAME: -> !xegpu.tensor_desc<8x12xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
+    // CHECK: %[[LOAD_B:.*]] = xegpu.load_nd %[[TDESC_B]]
+    // CHECK-SAME: : !xegpu.tensor_desc<8x12xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
+    // CHECK-SAME: -> vector<8x12xf32>
+    // CHECK: %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]]
+    // CHECK-SAME: {layout_result_0 =  #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xf32>, vector<8x12xf32> -> vector<12x12xf32>
+    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %load_a =  xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<32x24xf32>
+      -> !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], lane_layout = [8, 2], lane_data = [1, 1]>>
+    %load_b =  xegpu.load_nd %tdesc_b
+      : !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], lane_layout = [8, 2], lane_data = [1, 1]>>
+      -> vector<32x24xf32>
+    %dpas = xegpu.dpas %load_a, %load_b
+      {layout =  #xegpu.layout<sg_layout = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
+      : vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
+    gpu.return
+  }
+
   // CHECK-LABEL: test_prefetch_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
   gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) {