[mlir][gpu] Add known subgroup size

FMarno · FMarno · commit aca998e11239 · 2024-10-22T10:47:38.000+01:00
Also use it for lowering in GPUToLLVMSPV
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -64,7 +64,8 @@ def GPU_Dialect : Dialect {
 
   let discardableAttrs = (ins
     "::mlir::DenseI32ArrayAttr":$known_block_size,
-    "::mlir::DenseI32ArrayAttr":$known_grid_size
+    "::mlir::DenseI32ArrayAttr":$known_grid_size,
+    "::mlir::IntegerAttr" : $known_subgroup_size
   );
 
   let dependentDialects = ["arith::ArithDialect"];
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -388,6 +388,12 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
     by using `gpu.known_block_size` or `gpu.known_grid_size`, but this carries
     the risk that they will de discarded.
 
+    A function may optionally be annotated with the subgroup size that will be
+    used when it is launched using the `known_subgroup_size` attribute. If set,
+    this attribute is a single positive integer (i.e. > 0). Launching a function
+    with this annotation, using a subgroup size other than specified is
+    undefined behaviour.
+
     Syntax:
 
     ```
@@ -431,7 +437,8 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
                        OptionalAttr<DictArrayAttr>:$workgroup_attrib_attrs,
                        OptionalAttr<DictArrayAttr>:$private_attrib_attrs,
                        GPU_OptionalDimSizeHintAttr:$known_block_size,
-                       GPU_OptionalDimSizeHintAttr:$known_grid_size);
+                       GPU_OptionalDimSizeHintAttr:$known_grid_size,
+                       OptionalAttr<I32Attr>:$known_subgroup_size);
   let regions = (region AnyRegion:$body);
 
   let skipDefaultBuilders = 1;
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -272,23 +272,36 @@ struct GPUShuffleConversion final : ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   }
 
   /// Get the subgroup size from the target or return a default.
-  static int getSubgroupSize(Operation *op) {
-    return spirv::lookupTargetEnvOrDefault(op)
-        .getResourceLimits()
-        .getSubgroupSize();
+  static std::optional<uint32_t> getSubgroupSize(Operation *op) {
+    // TODO check for intel_reqd_sub_group_size
+
+    FunctionOpInterface func = op->getParentOfType<FunctionOpInterface>();
+    if (!func)
+      return {};
+
+    IntegerAttr knownSubgroupSizeAttr =
+        mlir::gpu::GPUDialect::KnownSubgroupSizeAttrHelper(op->getContext())
+            .getAttr(func);
+    if (!knownSubgroupSizeAttr)
+      return {};
+    return knownSubgroupSizeAttr.getInt();
   }
 
-  static bool hasValidWidth(gpu::ShuffleOp op) {
+  static bool hasValidWidth(gpu::ShuffleOp op, uint32_t subgroupSize) {
     llvm::APInt val;
     Value width = op.getWidth();
-    return matchPattern(width, m_ConstantInt(&val)) &&
-           val == getSubgroupSize(op);
+    return matchPattern(width, m_ConstantInt(&val)) && val == subgroupSize;
   }
 
   LogicalResult
   matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const final {
-    if (!hasValidWidth(op))
+    auto maybeSubgroupSize = getSubgroupSize(op);
+    if (!maybeSubgroupSize)
+      return rewriter.notifyMatchFailure(
+          op, "subgroup size not specified. Should be specified with "
+              "known_subgroup_size.");
+    if (!hasValidWidth(op, maybeSubgroupSize.value()))
       return rewriter.notifyMatchFailure(
           op, "shuffle width and subgroup size mismatch");
 
diff --git a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
@@ -260,7 +260,7 @@ gpu.module @shuffles {
   func.func @gpu_shuffles(%val0: i32, %id: i32,
                           %val1: i64, %mask: i32,
                           %val2: f32, %delta_up: i32,
-                          %val3: f64, %delta_down: i32) {
+                          %val3: f64, %delta_down: i32) attributes {gpu.known_subgroup_size = 32 : i32} {
     %width = arith.constant 32 : i32
     // CHECK:         llvm.call spir_funccc @_Z17sub_group_shuffleij(%[[VAL_0]], %[[VAL_1]]) {
     // CHECK-SAME-DAG:  no_unwind
@@ -302,9 +302,7 @@ gpu.module @shuffles {
 
 // Check `gpu.shuffle` conversion with explicit subgroup size.
 
-gpu.module @shuffles attributes {
-  spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Kernel, Addresses, GroupNonUniformShuffle, Int64], []>, #spirv.resource_limits<subgroup_size = 16>>
-} {
+gpu.module @shuffles {
   // CHECK:           llvm.func spir_funccc @_Z22sub_group_shuffle_downdj(f64, i32) -> f64 attributes {
   // CHECK-SAME-DAG:  no_unwind
   // CHECK-SAME-DAG:  convergent
@@ -352,15 +350,15 @@ gpu.module @shuffles attributes {
   // CHECK-SAME:              (%[[I8_VAL:.*]]: i8, %[[I16_VAL:.*]]: i16,
   // CHECK-SAME:               %[[I32_VAL:.*]]: i32, %[[I64_VAL:.*]]: i64,
   // CHECK-SAME:               %[[F16_VAL:.*]]: f16, %[[F32_VAL:.*]]: f32,
-  // CHECK-SAME:               %[[F64_VAL:.*]]: f64,  %[[OFFSET:.*]]: i32) {
+  // CHECK-SAME:               %[[F64_VAL:.*]]: f64,  %[[OFFSET:.*]]: i32)
   func.func @gpu_shuffles(%i8_val: i8,
                           %i16_val: i16,
                           %i32_val: i32,
                           %i64_val: i64,
                           %f16_val: f16,
                           %f32_val: f32,
                           %f64_val: f64,
-                          %offset: i32) {
+                          %offset: i32) attributes {gpu.known_subgroup_size = 16 : i32} {
     %width = arith.constant 16 : i32
     // CHECK:         llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[I8_VAL]], %[[OFFSET]])
     // CHECK:         llvm.mlir.constant(true) : i1