Skip to content

Commit e9cbeee

Browse files
committed
Comments
1 parent f7e30b7 commit e9cbeee

File tree

2 files changed

+77
-66
lines changed

2 files changed

+77
-66
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
#include "llvm/IR/Dominators.h"
2222
#include "llvm/IR/IntrinsicsAMDGPU.h"
2323
#include "llvm/Transforms/InstCombine/InstCombiner.h"
24-
#include "llvm/Transforms/Utils/CodeMoverUtils.h"
2524
#include <optional>
2625

2726
using namespace llvm;
@@ -483,6 +482,16 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
483482
return false;
484483
}
485484

485+
/// Build a replacement for the call \p Old that targets \p NewCallee.
///
/// The new call is created through \p B with \p Ops as its arguments and
/// with the operand bundles of \p Old copied onto it. The new call then takes
/// over the name of \p Old.
///
/// \param B          Builder used to create the new call.
/// \param Old        Call whose operand bundles and name are transferred.
/// \param NewCallee  Function the new call invokes.
/// \param Ops        Arguments of the new call.
/// \returns the newly created call. \p Old is not erased or otherwise
/// replaced here.
static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
486+
Function &NewCallee, ArrayRef<Value *> Ops) {
487+
// Copy the operand bundles of the old call so that they are preserved on the
// new one (e.g. "convergencectrl" tokens).
SmallVector<OperandBundleDef, 2> OpBundles;
488+
Old.getOperandBundlesAsDefs(OpBundles);
489+
490+
CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
491+
NewCall->takeName(&Old);
492+
return NewCall;
493+
}
494+
486495
Instruction *
487496
GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
488497
IntrinsicInst &II) const {
@@ -491,53 +500,54 @@ GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
491500
IID == Intrinsic::amdgcn_readfirstlane ||
492501
IID == Intrinsic::amdgcn_permlane64);
493502

494-
Instruction *Op = dyn_cast<Instruction>(II.getOperand(0));
503+
Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
495504

496505
// Only do this if both instructions are in the same block
497506
// (so the exec mask won't change) and the readlane is the only user of its
498507
// operand.
499-
if (!Op || !Op->hasOneUser() || Op->getParent() != II.getParent())
508+
if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
500509
return nullptr;
501510

502511
const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
503512

504513
// If this is a readlane, check that the second operand is a constant, or is
505-
// defined before Op so we know it's safe to move this intrinsic higher.
514+
// defined before OpInst so we know it's safe to move this intrinsic higher.
506515
Value *LaneID = nullptr;
507516
if (IsReadLane) {
508517
LaneID = II.getOperand(1);
509-
// Check LaneID is available at Op, otherwise we can't move the readlane
510-
// higher.
518+
519+
// readlane takes an extra operand for the lane ID, so we must check if that
520+
// LaneID value can be used at the point where we want to move the
521+
// intrinsic.
511522
if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
512-
if (!isSafeToMoveBefore(*LaneIDInst, *Op, IC.getDominatorTree()))
523+
if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
513524
return nullptr;
514525
}
515526
}
516527

528+
// Hoist the intrinsic (II) through OpInst.
529+
//
530+
// (II (OpInst x)) -> (OpInst (II x))
517531
const auto DoIt = [&](unsigned OpIdx,
518532
Function *NewIntrinsic) -> Instruction * {
519-
SmallVector<Value *, 2> Ops{Op->getOperand(OpIdx)};
533+
SmallVector<Value *, 2> Ops{OpInst->getOperand(OpIdx)};
520534
if (IsReadLane)
521535
Ops.push_back(LaneID);
522536

523-
// Make sure convergence tokens are preserved.
524-
// TODO: CreateIntrinsic should allow directly copying bundles
525-
SmallVector<OperandBundleDef, 2> OpBundles;
526-
II.getOperandBundlesAsDefs(OpBundles);
527-
528-
CallInst *NewII = IC.Builder.CreateCall(NewIntrinsic, Ops, OpBundles);
529-
NewII->takeName(&II);
537+
// Rewrite the intrinsic call.
538+
CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
530539

531-
Instruction &NewOp = *Op->clone();
540+
// Rewrite OpInst so it takes the result of the intrinsic now.
541+
Instruction &NewOp = *OpInst->clone();
532542
NewOp.setOperand(OpIdx, NewII);
533543
return &NewOp;
534544
};
535545

536-
if (isa<UnaryOperator>(Op))
546+
if (isa<UnaryOperator>(OpInst))
537547
return DoIt(0, II.getCalledFunction());
538548

539-
if (isa<CastInst>(Op)) {
540-
Value *Src = Op->getOperand(0);
549+
if (isa<CastInst>(OpInst)) {
550+
Value *Src = OpInst->getOperand(0);
541551
Type *SrcTy = Src->getType();
542552
if (!isTypeLegal(SrcTy))
543553
return nullptr;
@@ -548,12 +558,12 @@ GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
548558
}
549559

550560
// We can also hoist through binary operators if the other operand is uniform.
551-
if (isa<BinaryOperator>(Op)) {
561+
if (isa<BinaryOperator>(OpInst)) {
552562
// FIXME: If we had access to UniformityInfo here we could just check
553563
// if the operand is uniform.
554-
if (isTriviallyUniform(Op->getOperandUse(0)))
564+
if (isTriviallyUniform(OpInst->getOperandUse(0)))
555565
return DoIt(1, II.getCalledFunction());
556-
if (isTriviallyUniform(Op->getOperandUse(1)))
566+
if (isTriviallyUniform(OpInst->getOperandUse(1)))
557567
return DoIt(0, II.getCalledFunction());
558568
}
559569

llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readlane.ll

Lines changed: 45 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ define float @hoist_fneg_f32(float %arg, i32 %lane) {
1010
; CHECK-LABEL: define float @hoist_fneg_f32(
1111
; CHECK-SAME: float [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0:[0-9]+]] {
1212
; CHECK-NEXT: [[BB:.*:]]
13-
; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]])
14-
; CHECK-NEXT: [[RFL:%.*]] = fneg float [[TMP0]]
13+
; CHECK-NEXT: [[RL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]])
14+
; CHECK-NEXT: [[RFL:%.*]] = fneg float [[RL]]
1515
; CHECK-NEXT: ret float [[RFL]]
1616
;
1717
bb:
@@ -24,8 +24,8 @@ define double @hoist_fneg_f64(double %arg, i32 %lane) {
2424
; CHECK-LABEL: define double @hoist_fneg_f64(
2525
; CHECK-SAME: double [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
2626
; CHECK-NEXT: [[BB:.*:]]
27-
; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[ARG]], i32 [[LANE]])
28-
; CHECK-NEXT: [[RFL:%.*]] = fneg double [[TMP0]]
27+
; CHECK-NEXT: [[RL:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[ARG]], i32 [[LANE]])
28+
; CHECK-NEXT: [[RFL:%.*]] = fneg double [[RL]]
2929
; CHECK-NEXT: ret double [[RFL]]
3030
;
3131
bb:
@@ -40,8 +40,8 @@ define i32 @hoist_trunc(i64 %arg, i32 %lane) {
4040
; CHECK-LABEL: define i32 @hoist_trunc(
4141
; CHECK-SAME: i64 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
4242
; CHECK-NEXT: [[BB:.*:]]
43-
; CHECK-NEXT: [[RFL:%.*]] = call i64 @llvm.amdgcn.readlane.i64(i64 [[ARG]], i32 [[LANE]])
44-
; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[RFL]] to i32
43+
; CHECK-NEXT: [[RL:%.*]] = call i64 @llvm.amdgcn.readlane.i64(i64 [[ARG]], i32 [[LANE]])
44+
; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[RL]] to i32
4545
; CHECK-NEXT: ret i32 [[TMP0]]
4646
;
4747
bb:
@@ -54,8 +54,8 @@ define i64 @hoist_zext(i32 %arg, i32 %lane) {
5454
; CHECK-LABEL: define i64 @hoist_zext(
5555
; CHECK-SAME: i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
5656
; CHECK-NEXT: [[BB:.*:]]
57-
; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
58-
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[RFL]] to i64
57+
; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
58+
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[RL]] to i64
5959
; CHECK-NEXT: ret i64 [[TMP0]]
6060
;
6161
bb:
@@ -70,8 +70,8 @@ define i32 @hoist_add_i32(i32 %arg, i32 %lane) {
7070
; CHECK-LABEL: define i32 @hoist_add_i32(
7171
; CHECK-SAME: i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
7272
; CHECK-NEXT: [[BB:.*:]]
73-
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
74-
; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP0]], 16777215
73+
; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
74+
; CHECK-NEXT: [[RFL:%.*]] = add i32 [[RL]], 16777215
7575
; CHECK-NEXT: ret i32 [[RFL]]
7676
;
7777
bb:
@@ -84,8 +84,8 @@ define float @hoist_fadd_f32(float %arg, i32 %lane) {
8484
; CHECK-LABEL: define float @hoist_fadd_f32(
8585
; CHECK-SAME: float [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
8686
; CHECK-NEXT: [[BB:.*:]]
87-
; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]])
88-
; CHECK-NEXT: [[RFL:%.*]] = fadd float [[TMP0]], 1.280000e+02
87+
; CHECK-NEXT: [[RL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]])
88+
; CHECK-NEXT: [[RFL:%.*]] = fadd float [[RL]], 1.280000e+02
8989
; CHECK-NEXT: ret float [[RFL]]
9090
;
9191
bb:
@@ -100,8 +100,8 @@ define i64 @hoist_and_i64(i64 %arg, i32 %lane) {
100100
; CHECK-LABEL: define i64 @hoist_and_i64(
101101
; CHECK-SAME: i64 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
102102
; CHECK-NEXT: [[BB:.*:]]
103-
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.readlane.i64(i64 [[ARG]], i32 [[LANE]])
104-
; CHECK-NEXT: [[RFL:%.*]] = and i64 [[TMP0]], 16777215
103+
; CHECK-NEXT: [[RL:%.*]] = call i64 @llvm.amdgcn.readlane.i64(i64 [[ARG]], i32 [[LANE]])
104+
; CHECK-NEXT: [[RFL:%.*]] = and i64 [[RL]], 16777215
105105
; CHECK-NEXT: ret i64 [[RFL]]
106106
;
107107
bb:
@@ -114,8 +114,8 @@ define double @hoist_fadd_f64(double %arg, i32 %lane) {
114114
; CHECK-LABEL: define double @hoist_fadd_f64(
115115
; CHECK-SAME: double [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
116116
; CHECK-NEXT: [[BB:.*:]]
117-
; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[ARG]], i32 [[LANE]])
118-
; CHECK-NEXT: [[RFL:%.*]] = fadd double [[TMP0]], 1.280000e+02
117+
; CHECK-NEXT: [[RL:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[ARG]], i32 [[LANE]])
118+
; CHECK-NEXT: [[RFL:%.*]] = fadd double [[RL]], 1.280000e+02
119119
; CHECK-NEXT: ret double [[RFL]]
120120
;
121121
bb:
@@ -130,8 +130,8 @@ define i32 @hoist_sub_i32_lhs(i32 %arg, i32 %lane) {
130130
; CHECK-LABEL: define i32 @hoist_sub_i32_lhs(
131131
; CHECK-SAME: i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
132132
; CHECK-NEXT: [[BB:.*:]]
133-
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
134-
; CHECK-NEXT: [[RFL:%.*]] = sub i32 16777215, [[TMP0]]
133+
; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
134+
; CHECK-NEXT: [[RFL:%.*]] = sub i32 16777215, [[RL]]
135135
; CHECK-NEXT: ret i32 [[RFL]]
136136
;
137137
bb:
@@ -144,8 +144,8 @@ define float @hoist_fsub_f32_lhs(float %arg, i32 %lane) {
144144
; CHECK-LABEL: define float @hoist_fsub_f32_lhs(
145145
; CHECK-SAME: float [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
146146
; CHECK-NEXT: [[BB:.*:]]
147-
; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]])
148-
; CHECK-NEXT: [[RFL:%.*]] = fsub float 1.280000e+02, [[TMP0]]
147+
; CHECK-NEXT: [[RL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]])
148+
; CHECK-NEXT: [[RFL:%.*]] = fsub float 1.280000e+02, [[RL]]
149149
; CHECK-NEXT: ret float [[RFL]]
150150
;
151151
bb:
@@ -154,36 +154,18 @@ bb:
154154
ret float %rl
155155
}
156156

157-
; Check cases where we can't move the readlane higher
158-
159-
define float @cannot_move_readlane(float %arg, i32 %base) {
160-
; CHECK-LABEL: define float @cannot_move_readlane(
161-
; CHECK-SAME: float [[ARG:%.*]], i32 [[BASE:%.*]]) #[[ATTR0]] {
162-
; CHECK-NEXT: [[BB:.*:]]
163-
; CHECK-NEXT: [[VAL:%.*]] = fsub float 1.280000e+02, [[ARG]]
164-
; CHECK-NEXT: [[LANE:%.*]] = add i32 [[BASE]], 2
165-
; CHECK-NEXT: [[RFL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE]])
166-
; CHECK-NEXT: ret float [[RFL]]
167-
;
168-
bb:
169-
%val = fsub float 128.0, %arg
170-
%lane = add i32 %base, 2
171-
%rl = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane)
172-
ret float %rl
173-
}
174-
175157
define i32 @readlane_lane_op_in_other_block(i1 %cond, i32 %arg, i32 %base) {
176158
; CHECK-LABEL: define i32 @readlane_lane_op_in_other_block(
177159
; CHECK-SAME: i1 [[COND:%.*]], i32 [[ARG:%.*]], i32 [[BASE:%.*]]) #[[ATTR0]] {
178160
; CHECK-NEXT: [[BB:.*]]:
179161
; CHECK-NEXT: [[LANE:%.*]] = add i32 [[BASE]], 2
180162
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]]
181163
; CHECK: [[THEN]]:
182-
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
183-
; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP0]], 16777215
164+
; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
165+
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[RL]], 16777215
184166
; CHECK-NEXT: br label %[[END]]
185167
; CHECK: [[END]]:
186-
; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RFL]], %[[THEN]] ], [ [[LANE]], %[[BB]] ]
168+
; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[TMP0]], %[[THEN]] ], [ [[LANE]], %[[BB]] ]
187169
; CHECK-NEXT: ret i32 [[RES]]
188170
;
189171
bb:
@@ -200,6 +182,25 @@ end:
200182
ret i32 %res
201183
}
202184

185+
; Check cases where we can't move the readlane higher
186+
187+
define float @cannot_move_readlane(float %arg, i32 %base) {
188+
; CHECK-LABEL: define float @cannot_move_readlane(
189+
; CHECK-SAME: float [[ARG:%.*]], i32 [[BASE:%.*]]) #[[ATTR0]] {
190+
; CHECK-NEXT: [[BB:.*:]]
191+
; CHECK-NEXT: [[VAL:%.*]] = fsub float 1.280000e+02, [[ARG]]
192+
; CHECK-NEXT: [[LANE:%.*]] = add i32 [[BASE]], 2
193+
; CHECK-NEXT: [[RFL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE]])
194+
; CHECK-NEXT: ret float [[RFL]]
195+
;
196+
bb:
197+
%val = fsub float 128.0, %arg
198+
%lane = add i32 %base, 2
199+
%rl = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane)
200+
ret float %rl
201+
}
202+
203+
203204
; test that convergence tokens are preserved
204205

205206
define i32 @hoist_preserves_convergence_token(i1 %cond, i32 %arg, i32 %lane) convergent {
@@ -209,11 +210,11 @@ define i32 @hoist_preserves_convergence_token(i1 %cond, i32 %arg, i32 %lane) con
209210
; CHECK-NEXT: [[ENTRY:%.*]] = call token @llvm.experimental.convergence.entry()
210211
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]]
211212
; CHECK: [[THEN]]:
212-
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) [ "convergencectrl"(token [[ENTRY]]) ]
213-
; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP0]], 16777215
213+
; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) [ "convergencectrl"(token [[ENTRY]]) ]
214+
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[RL]], 16777215
214215
; CHECK-NEXT: br label %[[END]]
215216
; CHECK: [[END]]:
216-
; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RFL]], %[[THEN]] ], [ [[ARG]], %[[BB]] ]
217+
; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[TMP0]], %[[THEN]] ], [ [[ARG]], %[[BB]] ]
217218
; CHECK-NEXT: ret i32 [[RES]]
218219
;
219220
bb:

0 commit comments

Comments
 (0)