Address feedback

nbpatel · nbpatel · commit b3ba670e96a0 · 2025-05-16T18:24:47.000Z
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -33,7 +33,7 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
       "Print the result of the subgroup map propagation analysis and exit.">];
 }
 
-def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute", "::mlir::gpu::GPUModuleOp"> {
+def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> {
   let summary = "Transform WorkGroup level XeGPU code to SubGroup level";
   let description = [{
     This transform pass distributes the workgroup level computation to
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -7,14 +7,15 @@
 //===----------------------------------------------------------------------===//
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 
+#include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
 #include "mlir/Transforms/DialectConversion.h"
-#include <mlir/Dialect/GPU/IR/GPUDialect.h>
-#include <mlir/Dialect/Index/IR/IndexOps.h>
 
 namespace mlir {
 namespace xegpu {
@@ -70,15 +71,6 @@ namespace {
 struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
   using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
 
-  // Convert linear subgroup ID to 2D coordinates
-  // TODO: Delinearize for nD
-  SmallVector<Value> delinearizeSubgroupId(ConversionPatternRewriter &rewriter,
-                                           Location loc, Value sgID,
-                                           Value sgDimX, Value sgDimY) const {
-    return {rewriter.create<index::DivUOp>(loc, sgID, sgDimY),
-            rewriter.create<index::RemUOp>(loc, sgID, sgDimY)};
-  }
-
   // Calculate offset for each subgroup
   SmallVector<OpFoldResult>
   calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
@@ -144,7 +136,8 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
 
     // TODO : Handle order attribute
     // Get the subgroup ID
-    auto linearSgId = rewriter.create<gpu::SubgroupIdOp>(loc, nullptr);
+    auto linearSgId =
+        rewriter.create<gpu::SubgroupIdOp>(loc, /*upper_bound=*/nullptr);
 
     // Create constants for layout dimensions
     SmallVector<Value> sgLayoutDim(sgLayout.size());
@@ -156,9 +149,11 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
       sgDataDim[i] = rewriter.create<arith::ConstantIndexOp>(loc, sgShape[i]);
     }
 
-    // Delinearize the 1D subgroup id into 2d
-    SmallVector<Value> sgIds = delinearizeSubgroupId(
-        rewriter, loc, linearSgId, sgLayoutDim[0], sgLayoutDim[1]);
+    auto deLinearizeSgId =
+        affine::delinearizeIndex(rewriter, loc, linearSgId, sgLayoutDim);
+    if (failed(deLinearizeSgId))
+      return failure();
+    SmallVector<Value> sgIds = *deLinearizeSgId;
 
     // Calculate distribution unit shape and local offsets for subgroup
     SmallVector<int64_t> distUnitShape(sgLayout.size());
@@ -267,9 +262,9 @@ struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
     if (!originalLayout)
       return failure();
 
+    size_t i = 0;
     SmallVector<Value> newDpasOps;
     for (auto aVec : adaptor.getLhs()) {
-      size_t i = 0;
       for (auto bVec : adaptor.getRhs()) {
 
         llvm::SmallVector<Value> operands({aVec, bVec});
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -6,7 +6,9 @@ gpu.module @test_round_robin_assignment {
   gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {
       // CHECK-COUNT-12: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<24x32xf32>
       // CHECK-SAME: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
-      %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+      // CHECK-NOT: xegpu.create_nd_tdesc
+      %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
+        -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
       gpu.return
     }
 
@@ -17,18 +19,26 @@ gpu.module @test_round_robin_assignment {
       // CHECK-COUNT-12: xegpu.load_nd %{{.*}}
       // CHECK-SAME-COUNT-12: : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
       // CHECK-SAME-COUNT-12: -> vector<2x2xf32>
-      %load =  xegpu.load_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>> -> vector<24x32xf32>
+      // CHECK-NOT: xegpu.load_nd
+      %load =  xegpu.load_nd %tdesc
+        : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+        -> vector<24x32xf32>
       gpu.return
     }
 
   // CHECK-LABEL: test_store_nd
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
   gpu.func @test_store_nd(%src: memref<24x32xf32>) {
-      %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+      %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
+        -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
       // CHECK-COUNT-12: xegpu.store_nd %{{.*}}, %{{.*}}
       // CHECK-SAME-COUNT-12: : vector<2x2xf32>, !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
-      %load = xegpu.load_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>> -> vector<24x32xf32>
-      xegpu.store_nd %load, %tdesc: vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+      // CHECK-NOT : xegpu.store_nd
+      %load = xegpu.load_nd %tdesc
+        : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+        -> vector<24x32xf32>
+      xegpu.store_nd %load, %tdesc
+        : vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
       gpu.return
   }
 
@@ -38,7 +48,9 @@ gpu.module @test_round_robin_assignment {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->  !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
     // CHECK-COUNT-12: xegpu.update_nd_offset %{{.*}}, [0, 16]
     // CHECK-SAME-COUNT-12: : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
-    %update = xegpu.update_nd_offset %tdesc, [0, 16] :  !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+    // CHECK-NOT: xegpu.update_nd_offset
+    %update = xegpu.update_nd_offset %tdesc, [0, 16]
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
     gpu.return
   }
 
@@ -47,28 +59,45 @@ gpu.module @test_round_robin_assignment {
   gpu.func @test_dpas(%a: memref<8x8xf32>, %b: memref<8x8xf32>, %c: memref<8x8xf32>) {
     // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<8x8xf32>
     // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
+    // CHECK-NOT: xegpu.create_nd_tdesc
     // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<8x8xf32>
     // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
+    // CHECK-NOT: xegpu.create_nd_tdesc
     // CHECK-COUNT-4:  xegpu.create_nd_tdesc %{{.*}}[%{{.*}}, %{{.*}}] : memref<8x8xf32>
     // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
+    // CHECK-NOT: xegpu.create_nd_tdesc
     // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}}
     // CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
     // CHECK-SAME-COUNT-16: : vector<2x2xf32>, vector<2x2xf32> -> vector<2x2xf32>
-    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<8x8xf32> -> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    %load_a =  xegpu.load_nd %tdesc_a: !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>> -> vector<8x8xf32>
-    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<8x8xf32> -> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    %load_b =  xegpu.load_nd %tdesc_b:  !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>> -> vector<8x8xf32>
-    %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<8x8xf32> -> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    %dpas = xegpu.dpas %load_a, %load_b {layout =  #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>} : vector<8x8xf32>, vector<8x8xf32> -> vector<8x8xf32>
+    // CHECK-NOT: xegpu.dpas
+    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<8x8xf32>
+      -> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+    %load_a =  xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+      -> vector<8x8xf32>
+    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<8x8xf32>
+      -> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+    %load_b =  xegpu.load_nd %tdesc_b
+      : !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+      -> vector<8x8xf32>
+    %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<8x8xf32>
+      -> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+    %dpas = xegpu.dpas %load_a, %load_b
+      {layout =  #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
+      : vector<8x8xf32>, vector<8x8xf32> -> vector<8x8xf32>
     gpu.return
   }
 
   // CHECK-LABEL: test_prefetch_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
   gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) {
-    // CHECK-COUNT-12: xegpu.prefetch_nd %{{.*}} : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
-    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    xegpu.prefetch_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+    // CHECK-COUNT-12: xegpu.prefetch_nd %{{.*}}
+    // CHECK-SAME-COUNT-12 : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
+    // CHECK-NOT: xegpu.prefetch_nd
+    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+    xegpu.prefetch_nd %tdesc
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
     gpu.return
   }
 }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir

Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {`
`33`	`33`	`"Print the result of the subgroup map propagation analysis and exit.">];`
`34`	`34`	`}`
`35`	`35`
`36`		`-def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute", "::mlir::gpu::GPUModuleOp"> {`
	`36`	`+def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> {`
`37`	`37`	`let summary = "Transform WorkGroup level XeGPU code to SubGroup level";`
`38`	`38`	`let description = [{`
`39`	`39`	`This transform pass distributes the workgroup level computation to`