Skip to content

Commit e08fba5

Browse files
committed
Also hoist permlane64 through binary ops
1 parent 2a75f55 commit e08fba5

File tree

2 files changed

+67
-20
lines changed

2 files changed

+67
-20
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -499,7 +499,6 @@ GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
499499
return nullptr;
500500

501501
const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
502-
const bool IsPermLane = (IID == Intrinsic::amdgcn_permlane64);
503502

504503
// If this is a readlane, check that the second operand is a constant, or is
505504
// defined before Op so we know it's safe to move this intrinsic higher.
@@ -545,12 +544,8 @@ GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
545544
return DoIt(0, Remangled);
546545
}
547546

548-
// Don't hoist through a binary operator for permlane64. It doesn't
549-
// achieve anything and we'd need to repeat the call on every operand.
550-
//
551-
// We can do it for read(first)lane if other operands are already scalar
552-
// because then we don't need to repeat the call.
553-
if (!IsPermLane && isa<BinaryOperator>(Op)) {
547+
// We can also hoist through binary operators if the other operand is uniform.
548+
if (isa<BinaryOperator>(Op)) {
554549
// FIXME: If we had access to UniformityInfo here we could just check
555550
// if the operand is uniform.
556551
if (isTriviallyUniform(Op->getOperandUse(0)))

llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.permlane64.ll

Lines changed: 65 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
22
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 -passes=instcombine -S < %s | FileCheck %s
33

4-
; The readfirstlane version of this test covers all the interesting cases of the
4+
; The readfirstlane version of this test covers all the interesting cases of the
55
; shared logic. This testcase focuses on permlane64 specific pitfalls.
66

77
; test unary
@@ -10,27 +10,27 @@ define float @hoist_fneg_f32(float %arg) {
1010
; CHECK-LABEL: define float @hoist_fneg_f32(
1111
; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
1212
; CHECK-NEXT: [[BB:.*:]]
13-
; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]])
13+
; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[ARG]])
1414
; CHECK-NEXT: [[RFL:%.*]] = fneg float [[TMP0]]
1515
; CHECK-NEXT: ret float [[RFL]]
1616
;
1717
bb:
1818
%val = fneg float %arg
19-
%pl = call float @llvm.amdgcn.readfirstlane.f32(float %val)
19+
%pl = call float @llvm.amdgcn.permlane64.f32(float %val)
2020
ret float %pl
2121
}
2222

2323
define double @hoist_fneg_f64(double %arg) {
2424
; CHECK-LABEL: define double @hoist_fneg_f64(
2525
; CHECK-SAME: double [[ARG:%.*]]) #[[ATTR0]] {
2626
; CHECK-NEXT: [[BB:.*:]]
27-
; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[ARG]])
27+
; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.amdgcn.permlane64.f64(double [[ARG]])
2828
; CHECK-NEXT: [[RFL:%.*]] = fneg double [[TMP0]]
2929
; CHECK-NEXT: ret double [[RFL]]
3030
;
3131
bb:
3232
%val = fneg double %arg
33-
%pl = call double @llvm.amdgcn.readfirstlane.f64(double %val)
33+
%pl = call double @llvm.amdgcn.permlane64.f64(double %val)
3434
ret double %pl
3535
}
3636

@@ -40,27 +40,27 @@ define i32 @hoist_trunc(i64 %arg) {
4040
; CHECK-LABEL: define i32 @hoist_trunc(
4141
; CHECK-SAME: i64 [[ARG:%.*]]) #[[ATTR0]] {
4242
; CHECK-NEXT: [[BB:.*:]]
43-
; CHECK-NEXT: [[RFL:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[ARG]])
43+
; CHECK-NEXT: [[RFL:%.*]] = call i64 @llvm.amdgcn.permlane64.i64(i64 [[ARG]])
4444
; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[RFL]] to i32
4545
; CHECK-NEXT: ret i32 [[TMP0]]
4646
;
4747
bb:
4848
%val = trunc i64 %arg to i32
49-
%pl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val)
49+
%pl = call i32 @llvm.amdgcn.permlane64.i32(i32 %val)
5050
ret i32 %pl
5151
}
5252

5353
define i64 @hoist_zext(i32 %arg) {
5454
; CHECK-LABEL: define i64 @hoist_zext(
5555
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
5656
; CHECK-NEXT: [[BB:.*:]]
57-
; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]])
57+
; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[ARG]])
5858
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[RFL]] to i64
5959
; CHECK-NEXT: ret i64 [[TMP0]]
6060
;
6161
bb:
6262
%val = zext i32 %arg to i64
63-
%pl = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %val)
63+
%pl = call i64 @llvm.amdgcn.permlane64.i64(i64 %val)
6464
ret i64 %pl
6565
}
6666

@@ -70,8 +70,8 @@ define i32 @hoist_add_i32(i32 %arg) {
7070
; CHECK-LABEL: define i32 @hoist_add_i32(
7171
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
7272
; CHECK-NEXT: [[BB:.*:]]
73-
; CHECK-NEXT: [[VAL:%.*]] = add i32 [[ARG]], 16777215
74-
; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[VAL]])
73+
; CHECK-NEXT: [[PL:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[ARG]])
74+
; CHECK-NEXT: [[RFL:%.*]] = add i32 [[PL]], 16777215
7575
; CHECK-NEXT: ret i32 [[RFL]]
7676
;
7777
bb:
@@ -84,8 +84,8 @@ define float @hoist_fadd_f32(float %arg) {
8484
; CHECK-LABEL: define float @hoist_fadd_f32(
8585
; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
8686
; CHECK-NEXT: [[BB:.*:]]
87-
; CHECK-NEXT: [[VAL:%.*]] = fadd float [[ARG]], 1.280000e+02
88-
; CHECK-NEXT: [[RFL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[VAL]])
87+
; CHECK-NEXT: [[PL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[ARG]])
88+
; CHECK-NEXT: [[RFL:%.*]] = fadd float [[PL]], 1.280000e+02
8989
; CHECK-NEXT: ret float [[RFL]]
9090
;
9191
bb:
@@ -94,8 +94,60 @@ bb:
9494
ret float %pl
9595
}
9696

97+
; test multiple iterations
98+
99+
define i32 @hoist_multiple_times(i32 %arg) {
100+
; CHECK-LABEL: define i32 @hoist_multiple_times(
101+
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
102+
; CHECK-NEXT: [[BB:.*:]]
103+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[ARG]])
104+
; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 2
105+
; CHECK-NEXT: [[TMP2:%.*]] = sub i32 16777215, [[TMP1]]
106+
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 4242
107+
; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP3]], 6
108+
; CHECK-NEXT: ret i32 [[RFL]]
109+
;
110+
bb:
111+
%val.0 = shl i32 %arg, 2
112+
%val.1 = sub i32 16777215, %val.0
113+
%val.2 = xor i32 %val.1, 4242
114+
%val.3 = add i32 %val.2, 6
115+
%rfl = call i32 @llvm.amdgcn.permlane64.i32(i32 %val.3)
116+
ret i32 %rfl
117+
}
118+
97119
; test cases where hoisting isn't possible
98120

121+
define i32 @operand_is_instr(i32 %arg, ptr %src) {
122+
; CHECK-LABEL: define i32 @operand_is_instr(
123+
; CHECK-SAME: i32 [[ARG:%.*]], ptr [[SRC:%.*]]) #[[ATTR0]] {
124+
; CHECK-NEXT: [[BB:.*:]]
125+
; CHECK-NEXT: [[OTHER:%.*]] = load i32, ptr [[SRC]], align 4
126+
; CHECK-NEXT: [[VAL:%.*]] = add i32 [[ARG]], [[OTHER]]
127+
; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[VAL]])
128+
; CHECK-NEXT: ret i32 [[RFL]]
129+
;
130+
bb:
131+
%other = load i32, ptr %src
132+
%val = add i32 %arg, %other
133+
%rfl = call i32 @llvm.amdgcn.permlane64.i32(i32 %val)
134+
ret i32 %rfl
135+
}
136+
137+
define i32 @operand_is_arg(i32 %arg, i32 %other) {
138+
; CHECK-LABEL: define i32 @operand_is_arg(
139+
; CHECK-SAME: i32 [[ARG:%.*]], i32 [[OTHER:%.*]]) #[[ATTR0]] {
140+
; CHECK-NEXT: [[BB:.*:]]
141+
; CHECK-NEXT: [[VAL:%.*]] = add i32 [[ARG]], [[OTHER]]
142+
; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[VAL]])
143+
; CHECK-NEXT: ret i32 [[RFL]]
144+
;
145+
bb:
146+
%val = add i32 %arg, %other
147+
%rfl = call i32 @llvm.amdgcn.permlane64.i32(i32 %val)
148+
ret i32 %rfl
149+
}
150+
99151
define float @cross_block_hoisting(i1 %cond, float %arg) {
100152
; CHECK-LABEL: define float @cross_block_hoisting(
101153
; CHECK-SAME: i1 [[COND:%.*]], float [[ARG:%.*]]) #[[ATTR0]] {

0 commit comments

Comments
 (0)