[MLIR][XeGPU] Add support for subgroup_id_range #148661
Conversation
@llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-mlir

Author: Nishant Patel (nbpatel)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/148661.diff

2 Files Affected:
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index be7b860dd1729..56dc132d8083d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -174,8 +174,46 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
sgDataDim[i] = rewriter.create<arith::ConstantIndexOp>(loc, sgShape[i]);
}
+ // Check if there is warp specialization.
+ auto isWarpSpecialized = [](Operation *op, int64_t &startRange,
+ int64_t &endRange) -> bool {
+ Operation *parent = op->getParentOp();
+ // Find the outermost scf::IfOp with xegpu.sg_id_range.
+ while (parent) {
+ if (auto ifOp = dyn_cast<scf::IfOp>(parent)) {
+ if (Attribute attr = ifOp->getAttr("xegpu.sg_id_range")) {
+ if (auto denseAttr = dyn_cast<DenseI32ArrayAttr>(attr)) {
+ auto values = denseAttr.asArrayRef();
+ if (values.size() == 2) {
+ startRange = values[0];
+ endRange = values[1];
+ }
+ }
+ break;
+ }
+ }
+ parent = parent->getParentOp();
+ }
+ // Return false if startRange is 0
+ return (startRange > 0 && endRange > startRange);
+ };
+
+ int64_t startRange = -1, endRange = -1;
+ bool warpSpecialized = isWarpSpecialized(op, startRange, endRange);
+
+ // If warp specialization is detected, adjust the subgroup id accordingly
+ Value adjustedSgId = linearSgId;
+ if (warpSpecialized) {
+ // Subtract startRange from the original subgroup id to get the adjusted
+ // sg id
+ Value startRangeVal =
+ rewriter.create<arith::ConstantIndexOp>(loc, startRange);
+ adjustedSgId =
+ rewriter.createOrFold<index::SubOp>(loc, linearSgId, startRangeVal);
+ }
+
auto deLinearizeSgId =
- affine::delinearizeIndex(rewriter, loc, linearSgId, sgLayoutDim);
+ affine::delinearizeIndex(rewriter, loc, adjustedSgId, sgLayoutDim);
if (failed(deLinearizeSgId))
return failure();
SmallVector<Value> sgIds = *deLinearizeSgId;
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index 44b11c304cc80..71eb732ac4953 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -296,5 +296,88 @@ gpu.func @dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
gpu.return
}
+ // CHECK-LABEL: @warp_specialized
+ gpu.func @warp_specialized(%src: memref<256x128xf32>, %src1: memref<128x256xf32>, %src2: memref<128x64xf32>) {
+ %sg_id = gpu.subgroup_id : index
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %c31 = arith.constant 31 : index
+ %c3 = arith.constant 3 : index
+ %cond1 = arith.cmpi sge, %sg_id, %c0 : index
+ %cond2 = arith.cmpi slt, %sg_id, %c1 : index
+ %cond = arith.andi %cond1, %cond2 : i1
+ scf.if %cond {
+ // CHECK-NOT: index.sub
+ %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
+ -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+ %load = xegpu.load_nd %tdesc
+ : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+ -> vector<256x128xf32>
+ } {xegpu.sg_id_range = array<i32: 0, 1>}
+ %cond3 = arith.cmpi sge, %sg_id, %c1 : index
+ %cond4 = arith.cmpi slt, %sg_id, %c2 : index
+ %cond5 = arith.andi %cond3, %cond4 : i1
+ scf.if %cond5 {
+ // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index
+ // CHECK: %[[C1:.*]] = arith.constant 1 : index
+ // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C1]]
+ %tdesc_a = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
+ -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+ %load_a = xegpu.load_nd %tdesc_a
+ : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+ -> vector<256x128xf32>
+ %tdesc_b = xegpu.create_nd_tdesc %src1[0, 0] : memref<128x256xf32>
+ -> !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32], lane_layout = [4, 8], lane_data = [1, 1]>>
+ %load_b = xegpu.load_nd %tdesc_b
+ : !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32], lane_layout = [4, 8], lane_data = [1, 1]>>
+ -> vector<128x256xf32>
+ %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], lane_layout = [4, 8], lane_data = [1, 1]>} : vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32>
+ }{xegpu.sg_id_range = array<i32: 1, 2>}
+ %cond6 = arith.cmpi sge, %sg_id, %c2 : index
+ %cond7 = arith.cmpi slt, %sg_id, %c31 : index
+ %cond8 = arith.andi %cond6, %cond7 : i1
+ scf.if %cond8 {
+ // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index
+ // CHECK: %[[C2:.*]] = arith.constant 2 : index
+ // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]]
+ %tdesc = xegpu.create_nd_tdesc %src2[0, 0] : memref<128x64xf32>
+ -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
+ %load = xegpu.load_nd %tdesc
+ : !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
+ -> vector<128x64xf32>
+ %exp = math.exp %load {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
+ }{xegpu.sg_id_range = array<i32: 2, 32>}
+ gpu.return
+ }
+ // CHECK-LABEL: @subgroup_id_range_nested_if
+ gpu.func @subgroup_id_range_nested_if(%src: memref<256x128xf32>, %src1: memref<128x64xf32>) {
+ %sg_id = gpu.subgroup_id : index
+ %c1 = arith.constant 1 : i1
+ %c3 = arith.constant 3 : index
+ %c32 = arith.constant 32 : index
+ %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
+ -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+ %load = xegpu.load_nd %tdesc
+ : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+ -> vector<256x128xf32>
+ %cond1 = arith.cmpi sge, %sg_id, %c3 : index
+ %cond2 = arith.cmpi slt, %sg_id, %c32 : index
+ %cond = arith.andi %cond1, %cond2 : i1
+ scf.if %c1 {
+ scf.if %cond {
+ // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index
+ // CHECK: %[[C3:.*]] = arith.constant 3 : index
+ // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C3]]
+ %td = xegpu.create_nd_tdesc %src1[0, 0] : memref<128x64xf32>
+ -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
+ %ld = xegpu.load_nd %td
+ : !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
+ -> vector<128x64xf32>
+ %exp = math.exp %ld {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
+ }
+ } {xegpu.sg_id_range = array<i32: 3, 8>}
+ gpu.return
+ }
}
}

// Check if there is warp specialization.
auto isWarpSpecialized = [](Operation *op, int64_t &startOfRange,
consider taking this out as a separate utility function for wg-to-sg distribution.
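For reference, a minimal sketch of what such a standalone helper might look like (the name `getSgIdRange` and its exact placement are assumptions, not part of this PR; it mirrors the lambda above):

```c++
#include <cstdint>
#include <optional>
#include <utility>

#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"

using namespace mlir;

// Walk the parent ops of `op` and return the [start, end) pair from the
// closest enclosing scf.if that carries an "xegpu.sg_id_range" attribute.
static std::optional<std::pair<int64_t, int64_t>> getSgIdRange(Operation *op) {
  for (Operation *parent = op->getParentOp(); parent;
       parent = parent->getParentOp()) {
    auto ifOp = dyn_cast<scf::IfOp>(parent);
    if (!ifOp)
      continue;
    auto attr = ifOp->getAttrOfType<DenseI32ArrayAttr>("xegpu.sg_id_range");
    if (!attr)
      continue;
    auto values = attr.asArrayRef();
    if (values.size() == 2)
      return std::pair<int64_t, int64_t>(values[0], values[1]);
    break; // Malformed range attribute; stop searching, as the lambda does.
  }
  return std::nullopt;
}
```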
ok will change it
// If warp specialization is detected, adjust the subgroup id accordingly
Value adjustedSgId = linearSgId;
if (warpSpecialized) {
You also need to verify that the sg id ranges match with xegpu.sg_layout
Verify that the number of subgroups in the sg_layout is equal to the number of subgroups specified by the sg_id_range?
    }
    parent = parent->getParentOp();
  }
  // Return false if startOfRange is 0
why can't startOfRange be 0?
It can be 0, but if the starting subgroup id is 0 we don't need to adjust the ids, so the check returns false.
%load_b = xegpu.load_nd %tdesc_b
  : !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32], lane_layout = [4, 8], lane_data = [1, 1]>>
  -> vector<128x256xf32>
%dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], lane_layout = [4, 8], lane_data = [1, 1]>} : vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32>
sg_layout size should match the range size
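One way this could be expressed inside the pattern (a sketch only; `sgLayout`, `warpSpecialized`, `startRange`, and `endRange` are assumed to be the corresponding values from the surrounding code and may be named differently in the actual change):

```c++
// Sketch: fail the match if the subgroup count implied by sg_layout does not
// equal the number of subgroup ids covered by the half-open range.
if (warpSpecialized) {
  int64_t sgCount = 1;
  for (int64_t dim : sgLayout)
    sgCount *= dim;
  if (sgCount != endRange - startRange)
    return rewriter.notifyMatchFailure(
        op, "sg_id_range size does not match sg_layout size");
}
```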
Hi @adam-smnk, do you have any comments on this?
adam-smnk left a comment:
Overall looks neat - minor comments
startOfRange = attr.getStart().getInt();
endOfRange = attr.getEnd().getInt();
General suggestion, non-blocker here: getting int value directly would make for a nice attribute helper method
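A rough illustration of what such a helper could look like (the accessor names `getStartValue`/`getEndValue` are invented for this sketch; in-tree they would presumably be declared through the attribute's extraClassDeclaration):

```c++
// Hypothetical convenience accessors on RangeAttr (names are illustrative).
int64_t RangeAttr::getStartValue() { return getStart().getInt(); }
int64_t RangeAttr::getEndValue() { return getEnd().getInt(); }

// The call site above would then read:
//   startOfRange = attr.getStartValue();
//   endOfRange = attr.getEndValue();
```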
RangeAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
                  IntegerAttr startOfRange, IntegerAttr endOfRange) {
  if (startOfRange.getInt() >= endOfRange.getInt())
    return emitError() << "EndOfRange must be greater than StartOfRange";
nit: in the error, I'd refer to the values by their attribute names end and start; it should improve error readability
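For example, the diagnostic could read along these lines (a sketch of one possible wording, not the wording adopted in the PR):

```c++
if (startOfRange.getInt() >= endOfRange.getInt())
  return emitError() << "'end' (" << endOfRange.getInt()
                     << ") must be greater than 'start' ("
                     << startOfRange.getInt() << ")";
return success();
```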
    AttrBuilder<(ins "int":$start, "int":$end)>
  ];

  let assemblyFormat = "`<` `[`$start ```,` $end `]``>`";
Suggested change:
- let assemblyFormat = "`<` `[`$start ```,` $end `]``>`";
+ let assemblyFormat = "`<` `[`$start `,` $end `]` `>`";
nit: minor cleanup
//===----------------------------------------------------------------------===//

LogicalResult
RangeAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
Could you add one invalid test case?
Not sure if it's possible to add a negative test case with this pass, because it will always give a legalization error for the create_nd_desc op if the pattern returns a failure in this case.
```mlir
scf.if %cond {
  // some operations
}{sg_id_range = #xegpu.range<[2, 4]>}
```
Suggested change:
- }{sg_id_range = #xegpu.range<[2, 4]>}
+ } {sg_id_range = #xegpu.range<[2, 4]>}
This PR adds a new attribute to the xegpu dialect called xegpu.range. One use case of this attribute is to attach a subgroup_id_range to scf.if to drive execution.