Skip to content

Commit 2b3d3fc

Browse files
authored
[mlir][gpu] Revert gpu.subgroup_broadcast with any_lane (#157373)
This partially reverts #152808. Post-commit comments revealed that the `any_lane` variant had not been fully agreed upon at the time of landing.
1 parent fd93dc5 commit 2b3d3fc

File tree

7 files changed

+15
-38
lines changed

7 files changed

+15
-38
lines changed

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3219,8 +3219,7 @@ def GPU_BroadcastType : I32EnumAttr<"BroadcastType",
32193219
"a lane to broadcast from",
32203220
[
32213221
I32EnumAttrCase<"first_active_lane", 0>,
3222-
I32EnumAttrCase<"any_lane", 1>,
3223-
I32EnumAttrCase<"specific_lane", 2>
3222+
I32EnumAttrCase<"specific_lane", 1>
32243223
]>{
32253224
let genSpecializedAttr = 0;
32263225
let cppNamespace = "::mlir::gpu";
@@ -3248,13 +3247,6 @@ def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
32483247
must be uniform and within the subgroup size. The result is poison if the
32493248
lane index is invalid, non subgroup-uniform, or if the source lane is not
32503249
active.
3251-
* `any_lane` - broadcasts the value from any lane of the subgroup,
3252-
assuming the input is already subgroup uniform. The result is poison if
3253-
the input is not uniform. This is useful to convey uniformity to the
3254-
compiler to enable more optimizations. Also, it allows more speculation
3255-
opportunities than `first_active_lane` since `first_active_lane` results
3256-
can depend on active lanes which may change during speculation across
3257-
control flow.
32583250
}];
32593251
let results = (outs AnyType:$result);
32603252
let assemblyFormat = [{

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -193,9 +193,7 @@ struct GPUSubgroupBroadcastOpToROCDL
193193
if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
194194
rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
195195
adaptor.getLane());
196-
} else { // first_active_lane or any_lane
197-
// any_lane is lowered to readfirstlane too, to force value into scalar
198-
// register.
196+
} else { // first_active_lane
199197
rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
200198
src);
201199
}

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2529,8 +2529,6 @@ Speculation::Speculatability gpu::SubgroupBroadcastOp::getSpeculatability() {
25292529
// Cannot speculate first_active_lane broadcast, because speculating it across
25302530
// control flow can change the active lanes.
25312531
return Speculation::NotSpeculatable;
2532-
case BroadcastType::any_lane:
2533-
LLVM_FALLTHROUGH;
25342532
case BroadcastType::specific_lane:
25352533
// Speculation should be safe as long as we are inside structured control flow.
25362534
return Speculation::Speculatable;
@@ -2540,8 +2538,6 @@ Speculation::Speculatability gpu::SubgroupBroadcastOp::getSpeculatability() {
25402538
LogicalResult gpu::SubgroupBroadcastOp::verify() {
25412539
switch (getBroadcastType()) {
25422540
case BroadcastType::first_active_lane:
2543-
LLVM_FALLTHROUGH;
2544-
case BroadcastType::any_lane:
25452541
if (getLane())
25462542
return emitOpError()
25472543
<< "lane can only be specified for `specific_lane` broadcast";

mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -808,13 +808,11 @@ gpu.module @test_module {
808808
gpu.module @test_module {
809809
// CHECK-LABEL: func @broadcast
810810
// CHECK-SAME: (%[[ARG:.*]]: i64, %[[IDX:.*]]: i32)
811-
func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index, index) {
812-
// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
811+
func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index) {
813812
// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
814813
// CHECK: %{{.*}} = rocdl.readlane %[[ARG]], %[[IDX]] : (i64, i32) -> i64
815814
%0 = gpu.subgroup_broadcast %arg0, first_active_lane : index
816-
%1 = gpu.subgroup_broadcast %arg0, any_lane : index
817-
%2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : index
818-
func.return %0, %1, %2 : index, index, index
815+
%1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : index
816+
func.return %0, %1 : index, index
819817
}
820818
}
Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,22 @@
11
// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s
22

3-
func.func private @side_effect(%arg0 : f32, %arg1 : f32, %arg2 : f32)
3+
func.func private @side_effect(%arg0 : f32, %arg1 : f32)
44

55
// CHECK-LABEL: func @broadcast_hoisting
66
// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32, {{.*}}: index)
77
func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32, %arg2 : index) {
88
%c0 = arith.constant 0 : index
99
%c1 = arith.constant 1 : index
10-
// `any_lane` and `specific_lane` can be speculated across the control flow, but
10+
// `specific_lane` can be speculated across the control flow, but
1111
// `first_active_lane` cannot as active lanes can change.
12-
// CHECK: %[[V1:.*]] = gpu.subgroup_broadcast %[[ARG]], any_lane : f32
13-
// CHECK: %[[V2:.*]] = gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
12+
// CHECK: %[[V1:.*]] = gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
1413
// CHECK: scf.for
1514
// CHECK: %[[V0:.*]] = gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
16-
// CHECK: func.call @side_effect(%[[V0]], %[[V1]], %[[V2]])
15+
// CHECK: func.call @side_effect(%[[V0]], %[[V1]])
1716
scf.for %i = %c0 to %arg2 step %c1 {
1817
%0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
19-
%1 = gpu.subgroup_broadcast %arg0, any_lane : f32
20-
%2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
21-
func.call @side_effect(%0, %1, %2) : (f32, f32, f32) -> ()
18+
%1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
19+
func.call @side_effect(%0, %1) : (f32, f32) -> ()
2220
}
2321
func.return
2422
}

mlir/test/Dialect/GPU/int-range-interface.mlir

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -336,15 +336,12 @@ module attributes {gpu.container_module} {
336336
func.func @broadcast(%idx: i32) {
337337
%0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
338338
%1 = gpu.subgroup_broadcast %0, first_active_lane : index
339-
%2 = gpu.subgroup_broadcast %0, any_lane : index
340-
%3 = gpu.subgroup_broadcast %0, specific_lane %idx : index
339+
%2 = gpu.subgroup_broadcast %0, specific_lane %idx : index
341340

342-
// CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
343341
// CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
344342
// CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
345343

346344
%4 = test.reflect_bounds %1 : index
347345
%5 = test.reflect_bounds %2 : index
348-
%6 = test.reflect_bounds %3 : index
349346
return
350347
}

mlir/test/Dialect/GPU/ops.mlir

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -545,12 +545,10 @@ func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4
545545

546546
// CHECK-LABEL: func @subgroup_broadcast
547547
// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
548-
func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32, f32) {
548+
func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32) {
549549
// CHECK: gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
550550
%0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
551-
// CHECK: gpu.subgroup_broadcast %[[ARG]], any_lane : f32
552-
%1 = gpu.subgroup_broadcast %arg0, any_lane : f32
553551
// CHECK: gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
554-
%2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
555-
func.return %0, %1, %2 : f32, f32, f32
552+
%1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
553+
func.return %0, %1 : f32, f32
556554
}

0 commit comments

Comments
 (0)