Skip to content

Commit d5dfd6f

Browse files
committed
[mlir][amdgpu] Introduce assume_subgroup_uniform op
`assume_subgroup_uniform` works as a compiler hint to force a specific value into a scalar register. It is currently implemented via the `readfirstlane` intrinsic. Unlike a direct `readfirstlane` call, this op is potentially speculatable and has the usual arith and int range interfaces.
1 parent e9d71ef commit d5dfd6f

File tree

9 files changed

+136
-4
lines changed

9 files changed

+136
-4
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,13 @@
99
#ifndef AMDGPU
1010
#define AMDGPU
1111

12+
include "mlir/IR/EnumAttr.td"
13+
include "mlir/IR/OpBase.td"
14+
include "mlir/IR/Properties.td"
15+
include "mlir/Interfaces/InferIntRangeInterface.td"
1216
include "mlir/Interfaces/InferTypeOpInterface.td"
1317
include "mlir/Interfaces/SideEffectInterfaces.td"
1418
include "mlir/Interfaces/ViewLikeInterface.td"
15-
include "mlir/IR/EnumAttr.td"
16-
include "mlir/IR/Properties.td"
17-
include "mlir/IR/OpBase.td"
1819

1920
def AMDGPU_Dialect : Dialect {
2021
let name = "amdgpu";
@@ -635,6 +636,37 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp",
635636
let hasVerifier = 1;
636637
}
637638

639+
def AMDGPU_AssumeSubgroupUniformOp : AMDGPU_Op<"assume_subgroup_uniform",
640+
[NoMemoryEffect, AllTypesMatch<["result", "src"]>,
641+
DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
642+
DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] #
643+
ElementwiseMappable.traits>,
644+
Arguments<(ins AnyType:$src,
645+
DefaultValuedAttr<UnitAttr, "false">:$all_lanes)> {
646+
let summary = "Assumes value is uniform across the lanes in subgroup";
647+
let description = [{
648+
This op is a compiler hint to help the backend put values into scalar registers.
649+
650+
If the `src` value is uniform across all the active subgroup lanes, it is
651+
returned unchanged; otherwise the result is poison.
652+
653+
If `all_lanes` is set, the value is assumed to be uniform across all the
654+
subgroup lanes; this allows the op to be speculated out of control flow that
655+
may change the set of currently active lanes, e.g.:
656+
```
657+
// %value must be uniform at this point
658+
%value = ...
659+
scf.if lane_id < 13 {
660+
%uniform = amdgpu.assume_subgroup_uniform all_lanes %value
661+
}
662+
```
663+
}];
664+
let results = (outs AnyType:$result);
665+
let assemblyFormat = [{
666+
(`all_lanes` $all_lanes^)? $src attr-dict `:` type($result)
667+
}];
668+
}
669+
638670
def AMDGPU_SwizzleBitModeOp : AMDGPU_Op<"swizzle_bitmode",
639671
[Pure, AllTypesMatch<["result", "src"]>]>,
640672
Arguments<(ins AnyIntegerOrFloatOr1DVector:$src,

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "mlir/IR/BuiltinTypes.h"
1919
#include "mlir/IR/Dialect.h"
2020
#include "mlir/IR/OpDefinition.h"
21+
#include "mlir/Interfaces/InferIntRangeInterface.h"
2122
#include "mlir/Interfaces/InferTypeOpInterface.h"
2223
#include "mlir/Interfaces/SideEffectInterfaces.h"
2324
#include "mlir/Interfaces/ViewLikeInterface.h"

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1876,6 +1876,19 @@ struct AMDGPUSwizzleBitModeLowering
18761876
}
18771877
};
18781878

1879+
struct AMDGPUAssumeSubgroupUniformLowering
1880+
: public ConvertOpToLLVMPattern<AssumeSubgroupUniformOp> {
1881+
using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
1882+
1883+
LogicalResult
1884+
matchAndRewrite(AssumeSubgroupUniformOp op, OpAdaptor adaptor,
1885+
ConversionPatternRewriter &rewriter) const override {
1886+
Value src = adaptor.getSrc();
1887+
rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(), src);
1888+
return success();
1889+
}
1890+
};
1891+
18791892
struct ConvertAMDGPUToROCDLPass
18801893
: public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
18811894
using Base::Base;
@@ -1945,5 +1958,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
19451958
PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
19461959
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
19471960
TransposeLoadOpLowering>(converter, chipset);
1948-
patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
1961+
patterns
1962+
.add<AMDGPUSwizzleBitModeLowering, AMDGPUAssumeSubgroupUniformLowering>(
1963+
converter);
19491964
}

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,22 @@ LogicalResult DPPOp::verify() {
510510
return success();
511511
}
512512

513+
//===----------------------------------------------------------------------===//
514+
// AssumeSubgroupUniformOp
515+
//===----------------------------------------------------------------------===//
516+
517+
void AssumeSubgroupUniformOp::inferResultRanges(
518+
ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
519+
setResultRange(getResult(), argRanges.front());
520+
}
521+
522+
Speculation::Speculatability AssumeSubgroupUniformOp::getSpeculatability() {
523+
if (getAllLanes())
524+
return Speculation::Speculatable;
525+
526+
return Speculation::NotSpeculatable;
527+
}
528+
513529
//===----------------------------------------------------------------------===//
514530
// GatherToLDSOp
515531
//===----------------------------------------------------------------------===//

mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,3 +461,17 @@ func.func @sched_barrier() {
461461
amdgpu.sched_barrier allow = <valu|all_vmem>
462462
func.return
463463
}
464+
465+
// CHECK-LABEL: func @assume_subgroup_uniform
466+
// CHECK-SAME: (%[[ARG:.*]]: index)
467+
func.func @assume_subgroup_uniform(%arg0 : index) -> (index, index) {
468+
// CHECK: %[[SRC:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : index to i64
469+
// CHECK: %[[V1:.*]] = rocdl.readfirstlane %[[SRC]] : i64
470+
// CHECK: %[[RES1:.*]] = builtin.unrealized_conversion_cast %[[V1]] : i64 to index
471+
// CHECK: %[[V2:.*]] = rocdl.readfirstlane %[[SRC]] : i64
472+
// CHECK: %[[RES2:.*]] = builtin.unrealized_conversion_cast %[[V2]] : i64 to index
473+
// CHECK: return %[[RES1]], %[[RES2]] : index, index
474+
%0 = amdgpu.assume_subgroup_uniform %arg0 : index
475+
%1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : index
476+
func.return %0, %1 : index, index
477+
}

mlir/test/Dialect/AMDGPU/canonicalize.mlir

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,13 @@ func.func @fold_gather_to_lds_of_cast_dest(%global: memref<128x72xf32, 1>, %lds:
159159
: f32, memref<128x72xf32, 1>, memref<?x?xf32, 3>
160160
func.return
161161
}
162+
163+
// -----
164+
165+
// CHECK-LABEL: func @assume_subgroup_uniform_unused
166+
func.func @assume_subgroup_uniform_unused(%arg0 : f32) {
167+
// CHECK-NOT: amdgpu.assume_subgroup_uniform
168+
%0 = amdgpu.assume_subgroup_uniform %arg0 : f32
169+
%1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : f32
170+
func.return
171+
}

mlir/test/Dialect/AMDGPU/ops.mlir

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -517,6 +517,16 @@ func.func @wmma(%arg0 : vector<16xf16>, %arg1 : vector<8xf16>) -> vector<8xf16>
517517
func.return %0 : vector<8xf16>
518518
}
519519

520+
// CHECK-LABEL: func @assume_subgroup_uniform
521+
// CHECK-SAME: (%[[ARG:.*]]: f32)
522+
func.func @assume_subgroup_uniform(%arg0 : f32) -> (f32, f32) {
523+
// CHECK: amdgpu.assume_subgroup_uniform %[[ARG]] : f32
524+
%0 = amdgpu.assume_subgroup_uniform %arg0 : f32
525+
// CHECK: amdgpu.assume_subgroup_uniform all_lanes %[[ARG]] : f32
526+
%1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : f32
527+
func.return %0, %1 : f32, f32
528+
}
529+
520530
// CHECK-LABEL: func @swizzle_bitmode
521531
func.func @swizzle_bitmode(%arg0 : f32) -> f32 {
522532
// CHECK: amdgpu.swizzle_bitmode
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// RUN: mlir-opt --arith-int-range-narrowing="int-bitwidths-supported=32" --split-input-file %s | FileCheck %s
2+
3+
// CHECK-LABEL: func @narrow
4+
// CHECK: %[[SRC:.*]] = test.with_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index} : index
5+
// CHECK: %[[CAST1:.*]] = arith.index_castui %[[SRC]] : index to i32
6+
// CHECK: %[[VAL:.*]] = amdgpu.assume_subgroup_uniform %[[CAST1]] : i32
7+
// CHECK: %[[CAST2:.*]] = arith.index_castui %[[VAL]] : i32 to index
8+
// CHECK: return %[[CAST2]] : index
9+
func.func @narrow() -> index {
10+
%0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
11+
%1 = amdgpu.assume_subgroup_uniform %0 : index
12+
return %1: index
13+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s
2+
3+
func.func private @side_effect(%arg0 : f32, %arg1 : f32)
4+
5+
// CHECK-LABEL: func @assume_subgroup_uniform_hoisting
6+
// CHECK-SAME: (%[[ARG:.*]]: f32)
7+
func.func @assume_subgroup_uniform_hoisting(%arg0 : f32) {
8+
%c0 = arith.constant 0 : index
9+
%c1 = arith.constant 1 : index
10+
%c10 = arith.constant 10 : index
11+
// CHECK: %[[V1:.*]] = amdgpu.assume_subgroup_uniform all_lanes %[[ARG]] : f32
12+
// CHECK: scf.for
13+
// CHECK: %[[V0:.*]] = amdgpu.assume_subgroup_uniform %[[ARG]] : f32
14+
// CHECK: func.call @side_effect(%[[V0]], %[[V1]])
15+
scf.for %i = %c0 to %c10 step %c1 {
16+
%0 = amdgpu.assume_subgroup_uniform %arg0 : f32
17+
%1 = amdgpu.assume_subgroup_uniform all_lanes %arg0 : f32
18+
func.call @side_effect(%0, %1) : (f32, f32) -> ()
19+
}
20+
func.return
21+
}

0 commit comments

Comments
 (0)