diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 2c646934c11c2..b0b94ed49f2e5 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -9,12 +9,13 @@
 #ifndef AMDGPU
 #define AMDGPU
 
+include "mlir/IR/EnumAttr.td"
+include "mlir/IR/OpBase.td"
+include "mlir/IR/Properties.td"
+include "mlir/Interfaces/InferIntRangeInterface.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/ViewLikeInterface.td"
-include "mlir/IR/EnumAttr.td"
-include "mlir/IR/Properties.td"
-include "mlir/IR/OpBase.td"
 
 def AMDGPU_Dialect : Dialect {
   let name = "amdgpu";
@@ -635,6 +636,37 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp",
   let hasVerifier = 1;
 }
 
+def AMDGPU_AssumeSubgroupUniformOp : AMDGPU_Op<"assume_subgroup_uniform",
+    [NoMemoryEffect, AllTypesMatch<["result", "src"]>,
+     DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
+     DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] #
+    ElementwiseMappable.traits>,
+  Arguments<(ins AnyType:$src, DefaultValuedAttr<UnitAttr, "false">:$all_lanes)> {
+  let summary = "Assumes value is uniform across the lanes in subgroup";
+  let description = [{
+    This op is a compiler hint to help backend put values into scalar registers.
+
+    If `src` value is uniform across all the active subgroup lanes it is
+    returned unchanged, otherwise result is poison.
+
+    If `all_lanes` is set, the value is assumed to be uniform across all the
+    subgroup lanes, this can allow to speculate it out of control flow, which
+    may change the current active lanes, i.e:
+    ```
+    // %value must be uniform at this point
+    %value = ...
+    scf.if lane_id < 13 {
+      %uniform = amdgpu.assume_subgroup_uniform all_lanes %value
+    }
+    ```
+  }];
+  let results = (outs AnyType:$result);
+  let assemblyFormat = [{
+    (`all_lanes` $all_lanes^)?
+    $src attr-dict `:` type($result)
+  }];
+}
+
 def AMDGPU_SwizzleBitModeOp : AMDGPU_Op<"swizzle_bitmode",
     [Pure, AllTypesMatch<["result", "src"]>]>,
   Arguments<(ins AnyIntegerOrFloatOr1DVector:$src,
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
index 3de57c923178a..196ce08b5954c 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
@@ -18,6 +18,7 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/InferIntRangeInterface.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 64720bfe6cf50..3f52309005690 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1876,6 +1876,19 @@ struct AMDGPUSwizzleBitModeLowering
   }
 };
 
+struct AMDGPUAssumeSubgroupUniformLowering
+    : public ConvertOpToLLVMPattern<AssumeSubgroupUniformOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(AssumeSubgroupUniformOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value src = adaptor.getSrc();
+    rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(), src);
+    return success();
+  }
+};
+
 struct ConvertAMDGPUToROCDLPass
     : public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
   using Base::Base;
@@ -1945,5 +1958,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
            PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
            PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
            TransposeLoadOpLowering>(converter, chipset);
-  patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
+  patterns
+      .add<AMDGPUSwizzleBitModeLowering, AMDGPUAssumeSubgroupUniformLowering>(
+          converter);
 }
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index d7ffdcb58ddb5..0115a85ba0bfe 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -510,6 +510,22 @@ LogicalResult DPPOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// AssumeSubgroupUniformOp
+//===----------------------------------------------------------------------===//
+
+void AssumeSubgroupUniformOp::inferResultRanges(
+    ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
+  setResultRange(getResult(), argRanges.front());
+}
+
+Speculation::Speculatability AssumeSubgroupUniformOp::getSpeculatability() {
+  if (getAllLanes())
+    return Speculation::Speculatable;
+
+  return Speculation::NotSpeculatable;
+}
+
 //===----------------------------------------------------------------------===//
 // GatherToLDSOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index cc1162d8b0de8..6eaf68f84e38f 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -461,3 +461,17 @@ func.func @sched_barrier() {
   amdgpu.sched_barrier allow = <none>
   func.return
 }
+
+// CHECK-LABEL: func @assume_subgroup_uniform
+// CHECK-SAME: (%[[ARG:.*]]: index)
+func.func @assume_subgroup_uniform(%arg0 : index) -> (index, index) {
+// CHECK: %[[SRC:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : index to i64
+// CHECK: %[[V1:.*]] = rocdl.readfirstlane %[[SRC]] : i64
+// CHECK: %[[RES1:.*]] = builtin.unrealized_conversion_cast %[[V1]] : i64 to index
+// CHECK: %[[V2:.*]] = rocdl.readfirstlane %[[SRC]] : i64
+// CHECK: %[[RES2:.*]] = builtin.unrealized_conversion_cast %[[V2]] : i64 to index
+// CHECK: return %[[RES1]], %[[RES2]] : index, index
+  %0 = amdgpu.assume_subgroup_uniform %arg0 : index
+  %1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : index
+  func.return %0, %1 : index, index
+}
diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
index 5501ad42dbd90..141bd3f459738 100644
--- a/mlir/test/Dialect/AMDGPU/canonicalize.mlir
+++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
@@ -159,3 +159,13 @@ func.func @fold_gather_to_lds_of_cast_dest(%global: memref<128x72xf32, 1>, %lds:
     : f32, memref<128x72xf32, 1>, memref<?x?xf32, 3>
   func.return
 }
+
+// -----
+
+// CHECK-LABEL: func @assume_subgroup_uniform_unused
+func.func @assume_subgroup_uniform_unused(%arg0 : f32) {
+// CHECK-NOT: amdgpu.assume_subgroup_uniform
+  %0 = amdgpu.assume_subgroup_uniform %arg0 : f32
+  %1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : f32
+  func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 87e11c028c62a..97b4d5f54506f 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -517,6 +517,16 @@ func.func @wmma(%arg0 : vector<16xf16>, %arg1 : vector<8xf16>) -> vector<8xf16>
   func.return %0 : vector<8xf16>
 }
 
+// CHECK-LABEL: func @assume_subgroup_uniform
+// CHECK-SAME: (%[[ARG:.*]]: f32)
+func.func @assume_subgroup_uniform(%arg0 : f32) -> (f32, f32) {
+  // CHECK: amdgpu.assume_subgroup_uniform %[[ARG]] : f32
+  %0 = amdgpu.assume_subgroup_uniform %arg0 : f32
+  // CHECK: amdgpu.assume_subgroup_uniform all_lanes %[[ARG]] : f32
+  %1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : f32
+  func.return %0, %1 : f32, f32
+}
+
 // CHECK-LABEL: func @swizzle_bitmode
 func.func @swizzle_bitmode(%arg0 : f32) -> f32 {
   // CHECK: amdgpu.swizzle_bitmode
diff --git a/mlir/test/Dialect/AMDGPU/subgroup-uniform-int-range.mlir b/mlir/test/Dialect/AMDGPU/subgroup-uniform-int-range.mlir
new file mode 100644
index 0000000000000..be20bfdba3baf
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/subgroup-uniform-int-range.mlir
@@ -0,0 +1,13 @@
+// RUN: mlir-opt
--arith-int-range-narrowing="int-bitwidths-supported=32" --split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: func @narrow
+// CHECK: %[[SRC:.*]] = test.with_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index} : index
+// CHECK: %[[CAST1:.*]] = arith.index_castui %[[SRC]] : index to i32
+// CHECK: %[[VAL:.*]] = amdgpu.assume_subgroup_uniform %[[CAST1]] : i32
+// CHECK: %[[CAST2:.*]] = arith.index_castui %[[VAL]] : i32 to index
+// CHECK: return %[[CAST2]] : index
+func.func @narrow() -> index {
+  %0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
+  %1 = amdgpu.assume_subgroup_uniform %0 : index
+  return %1: index
+}
diff --git a/mlir/test/Dialect/AMDGPU/subgroup-uniform-speculability.mlir b/mlir/test/Dialect/AMDGPU/subgroup-uniform-speculability.mlir
new file mode 100644
index 0000000000000..9be2b5dda267e
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/subgroup-uniform-speculability.mlir
@@ -0,0 +1,21 @@
+// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s
+
+func.func private @side_effect(%arg0 : f32, %arg1 : f32)
+
+// CHECK-LABEL: func @assume_subgroup_uniform_hoisting
+// CHECK-SAME: (%[[ARG:.*]]: f32)
+func.func @assume_subgroup_uniform_hoisting(%arg0 : f32) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+// CHECK: %[[V1:.*]] = amdgpu.assume_subgroup_uniform all_lanes %[[ARG]] : f32
+// CHECK: scf.for
+// CHECK: %[[V0:.*]] = amdgpu.assume_subgroup_uniform %[[ARG]] : f32
+// CHECK: func.call @side_effect(%[[V0]], %[[V1]])
+  scf.for %i = %c0 to %c10 step %c1 {
+    %0 = amdgpu.assume_subgroup_uniform %arg0 : f32
+    %1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : f32
+    func.call @side_effect(%0, %1) : (f32, f32) -> ()
+  }
+  func.return
+}