Skip to content

[mlir][amdgpu] Introduce assume_subgroup_uniform op #152740

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 35 additions & 3 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@
#ifndef AMDGPU
#define AMDGPU

include "mlir/IR/EnumAttr.td"
include "mlir/IR/OpBase.td"
include "mlir/IR/Properties.td"
include "mlir/Interfaces/InferIntRangeInterface.td"
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ViewLikeInterface.td"
include "mlir/IR/EnumAttr.td"
include "mlir/IR/Properties.td"
include "mlir/IR/OpBase.td"

def AMDGPU_Dialect : Dialect {
let name = "amdgpu";
Expand Down Expand Up @@ -635,6 +636,37 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp",
let hasVerifier = 1;
}

def AMDGPU_AssumeSubgroupUniformOp : AMDGPU_Op<"assume_subgroup_uniform",
[NoMemoryEffect, AllTypesMatch<["result", "src"]>,
DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] #
ElementwiseMappable.traits>,
Arguments<(ins AnyType:$src,
DefaultValuedAttr<UnitAttr, "false">:$all_lanes)> {
let summary = "Assumes value is uniform across the lanes in subgroup";
let description = [{
This op is a compiler hint that helps the backend place values into scalar registers.

If the `src` value is uniform across all the active subgroup lanes, it is
returned unchanged; otherwise the result is poison.

If `all_lanes` is set, the value is assumed to be uniform across all the
subgroup lanes; this allows the op to be speculated out of control flow, which
Comment on lines +653 to +654
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why this isn't the only allowed scenario? If it does not have to be uniform, I think we should call it get_first_lane

may change the current active lanes, i.e:
```
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
```
```mlir

// %value must be uniform at this point
%value = ...
scf.if lane_id < 13 {
%uniform = amdgpu.assume_subgroup_uniform all_lanes %value
}
```
}];
let results = (outs AnyType:$result);
let assemblyFormat = [{
(`all_lanes` $all_lanes^)? $src attr-dict `:` type($result)
}];
}

def AMDGPU_SwizzleBitModeOp : AMDGPU_Op<"swizzle_bitmode",
[Pure, AllTypesMatch<["result", "src"]>]>,
Arguments<(ins AnyIntegerOrFloatOr1DVector:$src,
Expand Down
1 change: 1 addition & 0 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/Interfaces/InferIntRangeInterface.h"
#include "mlir/Interfaces/InferTypeOpInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Interfaces/ViewLikeInterface.h"
Expand Down
17 changes: 16 additions & 1 deletion mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1876,6 +1876,19 @@ struct AMDGPUSwizzleBitModeLowering
}
};

/// Rewrites `amdgpu.assume_subgroup_uniform` into `rocdl.readfirstlane`.
///
/// The replacement op takes the already-converted source operand and produces
/// a result of that same converted type. The `all_lanes` attribute is not
/// consulted here, so both forms of the op lower identically.
struct AMDGPUAssumeSubgroupUniformLowering
    : public ConvertOpToLLVMPattern<AssumeSubgroupUniformOp> {
  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(AssumeSubgroupUniformOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Operand as seen after type conversion.
    Value convertedSrc = adaptor.getSrc();
    // Result type mirrors the converted operand type.
    Type convertedType = convertedSrc.getType();
    rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, convertedType,
                                                        convertedSrc);
    return success();
  }
};

struct ConvertAMDGPUToROCDLPass
: public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
using Base::Base;
Expand Down Expand Up @@ -1945,5 +1958,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
TransposeLoadOpLowering>(converter, chipset);
patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
patterns
.add<AMDGPUSwizzleBitModeLowering, AMDGPUAssumeSubgroupUniformLowering>(
converter);
}
16 changes: 16 additions & 0 deletions mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,22 @@ LogicalResult DPPOp::verify() {
return success();
}

//===----------------------------------------------------------------------===//
// AssumeSubgroupUniformOp
//===----------------------------------------------------------------------===//

/// Integer range inference: the result carries exactly the range of the first
/// (and only declared) operand, since the op forwards its source value.
void AssumeSubgroupUniformOp::inferResultRanges(
    ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
  const ConstantIntRanges &srcRange = argRanges.front();
  setResultRange(getResult(), srcRange);
}

/// Reports the op as speculatable only when `all_lanes` is set; without it the
/// op is reported as not speculatable.
Speculation::Speculatability AssumeSubgroupUniformOp::getSpeculatability() {
  return getAllLanes() ? Speculation::Speculatable
                       : Speculation::NotSpeculatable;
}

//===----------------------------------------------------------------------===//
// GatherToLDSOp
//===----------------------------------------------------------------------===//
Expand Down
14 changes: 14 additions & 0 deletions mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -461,3 +461,17 @@ func.func @sched_barrier() {
amdgpu.sched_barrier allow = <valu|all_vmem>
func.return
}

// Lowering test: both the plain and the `all_lanes` form of
// amdgpu.assume_subgroup_uniform are expected to become rocdl.readfirstlane
// applied to the same converted operand (index cast to i64), with each result
// cast back to index before being returned.
// CHECK-LABEL: func @assume_subgroup_uniform
// CHECK-SAME: (%[[ARG:.*]]: index)
func.func @assume_subgroup_uniform(%arg0 : index) -> (index, index) {
// The index argument is materialized as i64 once and reused by both ops.
// CHECK: %[[SRC:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : index to i64
// CHECK: %[[V1:.*]] = rocdl.readfirstlane %[[SRC]] : i64
// CHECK: %[[RES1:.*]] = builtin.unrealized_conversion_cast %[[V1]] : i64 to index
// CHECK: %[[V2:.*]] = rocdl.readfirstlane %[[SRC]] : i64
// CHECK: %[[RES2:.*]] = builtin.unrealized_conversion_cast %[[V2]] : i64 to index
// CHECK: return %[[RES1]], %[[RES2]] : index, index
%0 = amdgpu.assume_subgroup_uniform %arg0 : index
%1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : index
func.return %0, %1 : index, index
}
10 changes: 10 additions & 0 deletions mlir/test/Dialect/AMDGPU/canonicalize.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,13 @@ func.func @fold_gather_to_lds_of_cast_dest(%global: memref<128x72xf32, 1>, %lds:
: f32, memref<128x72xf32, 1>, memref<?x?xf32, 3>
func.return
}

// -----

// Neither result below is used; the output is expected to contain no
// amdgpu.assume_subgroup_uniform op in either form.
// NOTE(review): presumably driven by the canonicalizer, given the file name;
// the RUN line is not visible in this hunk - confirm.
// CHECK-LABEL: func @assume_subgroup_uniform_unused
func.func @assume_subgroup_uniform_unused(%arg0 : f32) {
// CHECK-NOT: amdgpu.assume_subgroup_uniform
%0 = amdgpu.assume_subgroup_uniform %arg0 : f32
%1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : f32
func.return
}
10 changes: 10 additions & 0 deletions mlir/test/Dialect/AMDGPU/ops.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,16 @@ func.func @wmma(%arg0 : vector<16xf16>, %arg1 : vector<8xf16>) -> vector<8xf16>
func.return %0 : vector<8xf16>
}

// Syntax test: checks the printed form of amdgpu.assume_subgroup_uniform both
// without and with the optional `all_lanes` keyword.
// CHECK-LABEL: func @assume_subgroup_uniform
// CHECK-SAME: (%[[ARG:.*]]: f32)
func.func @assume_subgroup_uniform(%arg0 : f32) -> (f32, f32) {
// Plain form: no `all_lanes` keyword.
// CHECK: amdgpu.assume_subgroup_uniform %[[ARG]] : f32
%0 = amdgpu.assume_subgroup_uniform %arg0 : f32
// `all_lanes` form: keyword is printed before the operand.
// CHECK: amdgpu.assume_subgroup_uniform all_lanes %[[ARG]] : f32
%1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : f32
func.return %0, %1 : f32, f32
}

// CHECK-LABEL: func @swizzle_bitmode
func.func @swizzle_bitmode(%arg0 : f32) -> f32 {
// CHECK: amdgpu.swizzle_bitmode
Expand Down
13 changes: 13 additions & 0 deletions mlir/test/Dialect/AMDGPU/subgroup-uniform-int-range.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// RUN: mlir-opt --arith-int-range-narrowing="int-bitwidths-supported=32" --split-input-file %s | FileCheck %s

// Int-range narrowing test: the operand is bounded to [0, 10], and with 32-bit
// integers listed as supported the op is expected to be rewritten to operate
// on i32, with index casts inserted before and after it.
// CHECK-LABEL: func @narrow
// CHECK: %[[SRC:.*]] = test.with_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index} : index
// CHECK: %[[CAST1:.*]] = arith.index_castui %[[SRC]] : index to i32
// CHECK: %[[VAL:.*]] = amdgpu.assume_subgroup_uniform %[[CAST1]] : i32
// CHECK: %[[CAST2:.*]] = arith.index_castui %[[VAL]] : i32 to index
// CHECK: return %[[CAST2]] : index
func.func @narrow() -> index {
// Source value with known signed/unsigned bounds of [0, 10].
%0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
%1 = amdgpu.assume_subgroup_uniform %0 : index
return %1: index
}
21 changes: 21 additions & 0 deletions mlir/test/Dialect/AMDGPU/subgroup-uniform-speculability.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s

func.func private @side_effect(%arg0 : f32, %arg1 : f32)

// Speculatability test under loop-invariant code motion: both ops take a
// loop-invariant operand, but only the `all_lanes` form is expected to be
// hoisted above the scf.for; the plain form is expected to stay in the loop.
// CHECK-LABEL: func @assume_subgroup_uniform_hoisting
// CHECK-SAME: (%[[ARG:.*]]: f32)
func.func @assume_subgroup_uniform_hoisting(%arg0 : f32) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c10 = arith.constant 10 : index
// Expected order after the pass: `all_lanes` op, then the loop, then the
// plain op inside the loop body.
// CHECK: %[[V1:.*]] = amdgpu.assume_subgroup_uniform all_lanes %[[ARG]] : f32
// CHECK: scf.for
// CHECK: %[[V0:.*]] = amdgpu.assume_subgroup_uniform %[[ARG]] : f32
// CHECK: func.call @side_effect(%[[V0]], %[[V1]])
scf.for %i = %c0 to %c10 step %c1 {
%0 = amdgpu.assume_subgroup_uniform %arg0 : f32
%1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : f32
// The call uses both results, so neither op is dead.
func.call @side_effect(%0, %1) : (f32, f32) -> ()
}
func.return
}