Comments

Pierre-vh · Pierre-vh · commit e9cbeee4ce49 · 2025-05-01T11:59:59.000+02:00
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -21,7 +21,6 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/CodeMoverUtils.h"
 #include <optional>
 
 using namespace llvm;
@@ -483,6 +482,16 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
   return false;
 }
 
+static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
+                             Function &NewCallee, ArrayRef<Value *> Ops) {
+  SmallVector<OperandBundleDef, 2> OpBundles;
+  Old.getOperandBundlesAsDefs(OpBundles);
+
+  CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
+  NewCall->takeName(&Old);
+  return NewCall;
+}
+
 Instruction *
 GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                              IntrinsicInst &II) const {
@@ -491,53 +500,54 @@ GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
          IID == Intrinsic::amdgcn_readfirstlane ||
          IID == Intrinsic::amdgcn_permlane64);
 
-  Instruction *Op = dyn_cast<Instruction>(II.getOperand(0));
+  Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
 
   // Only do this if both instructions are in the same block
   // (so the exec mask won't change) and the readlane is the only user of its
   // operand.
-  if (!Op || !Op->hasOneUser() || Op->getParent() != II.getParent())
+  if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
     return nullptr;
 
   const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
 
   // If this is a readlane, check that the second operand is a constant, or is
-  // defined before Op so we know it's safe to move this intrinsic higher.
+  // defined before OpInst so we know it's safe to move this intrinsic higher.
   Value *LaneID = nullptr;
   if (IsReadLane) {
     LaneID = II.getOperand(1);
-    // Check LaneID is available at Op, otherwise we can't move the readlane
-    // higher.
+
+    // readlane takes an extra operand for the lane ID, so we must check if that
+    // LaneID value can be used at the point where we want to move the
+    // intrinsic.
     if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
-      if (!isSafeToMoveBefore(*LaneIDInst, *Op, IC.getDominatorTree()))
+      if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
         return nullptr;
     }
   }
 
+  // Hoist the intrinsic (II) through OpInst.
+  //
+  // (II (OpInst x)) -> (OpInst (II x))
   const auto DoIt = [&](unsigned OpIdx,
                         Function *NewIntrinsic) -> Instruction * {
-    SmallVector<Value *, 2> Ops{Op->getOperand(OpIdx)};
+    SmallVector<Value *, 2> Ops{OpInst->getOperand(OpIdx)};
     if (IsReadLane)
       Ops.push_back(LaneID);
 
-    // Make sure convergence tokens are preserved.
-    // TODO: CreateIntrinsic should allow directly copying bundles
-    SmallVector<OperandBundleDef, 2> OpBundles;
-    II.getOperandBundlesAsDefs(OpBundles);
-
-    CallInst *NewII = IC.Builder.CreateCall(NewIntrinsic, Ops, OpBundles);
-    NewII->takeName(&II);
+    // Rewrite the intrinsic call.
+    CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
 
-    Instruction &NewOp = *Op->clone();
+    // Rewrite OpInst so it takes the result of the intrinsic now.
+    Instruction &NewOp = *OpInst->clone();
     NewOp.setOperand(OpIdx, NewII);
     return &NewOp;
   };
 
-  if (isa<UnaryOperator>(Op))
+  if (isa<UnaryOperator>(OpInst))
     return DoIt(0, II.getCalledFunction());
 
-  if (isa<CastInst>(Op)) {
-    Value *Src = Op->getOperand(0);
+  if (isa<CastInst>(OpInst)) {
+    Value *Src = OpInst->getOperand(0);
     Type *SrcTy = Src->getType();
     if (!isTypeLegal(SrcTy))
       return nullptr;
@@ -548,12 +558,12 @@ GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
   }
 
   // We can also hoist through binary operators if the other operand is uniform.
-  if (isa<BinaryOperator>(Op)) {
+  if (isa<BinaryOperator>(OpInst)) {
     // FIXME: If we had access to UniformityInfo here we could just check
     // if the operand is uniform.
-    if (isTriviallyUniform(Op->getOperandUse(0)))
+    if (isTriviallyUniform(OpInst->getOperandUse(0)))
       return DoIt(1, II.getCalledFunction());
-    if (isTriviallyUniform(Op->getOperandUse(1)))
+    if (isTriviallyUniform(OpInst->getOperandUse(1)))
       return DoIt(0, II.getCalledFunction());
   }
 
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readlane.ll
@@ -10,8 +10,8 @@ define float @hoist_fneg_f32(float %arg, i32 %lane) {
 ; CHECK-LABEL: define float @hoist_fneg_f32(
 ; CHECK-SAME: float [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]])
-; CHECK-NEXT:    [[RFL:%.*]] = fneg float [[TMP0]]
+; CHECK-NEXT:    [[RL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]])
+; CHECK-NEXT:    [[RFL:%.*]] = fneg float [[RL]]
 ; CHECK-NEXT:    ret float [[RFL]]
 ;
 bb:
@@ -24,8 +24,8 @@ define double @hoist_fneg_f64(double %arg, i32 %lane) {
 ; CHECK-LABEL: define double @hoist_fneg_f64(
 ; CHECK-SAME: double [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[ARG]], i32 [[LANE]])
-; CHECK-NEXT:    [[RFL:%.*]] = fneg double [[TMP0]]
+; CHECK-NEXT:    [[RL:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[ARG]], i32 [[LANE]])
+; CHECK-NEXT:    [[RFL:%.*]] = fneg double [[RL]]
 ; CHECK-NEXT:    ret double [[RFL]]
 ;
 bb:
@@ -40,8 +40,8 @@ define i32 @hoist_trunc(i64 %arg, i32 %lane) {
 ; CHECK-LABEL: define i32 @hoist_trunc(
 ; CHECK-SAME: i64 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[RFL:%.*]] = call i64 @llvm.amdgcn.readlane.i64(i64 [[ARG]], i32 [[LANE]])
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[RFL]] to i32
+; CHECK-NEXT:    [[RL:%.*]] = call i64 @llvm.amdgcn.readlane.i64(i64 [[ARG]], i32 [[LANE]])
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[RL]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 bb:
@@ -54,8 +54,8 @@ define i64 @hoist_zext(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: define i64 @hoist_zext(
 ; CHECK-SAME: i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[RFL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[RFL]] to i64
+; CHECK-NEXT:    [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[RL]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 bb:
@@ -70,8 +70,8 @@ define i32 @hoist_add_i32(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: define i32 @hoist_add_i32(
 ; CHECK-SAME: i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
-; CHECK-NEXT:    [[RFL:%.*]] = add i32 [[TMP0]], 16777215
+; CHECK-NEXT:    [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
+; CHECK-NEXT:    [[RFL:%.*]] = add i32 [[RL]], 16777215
 ; CHECK-NEXT:    ret i32 [[RFL]]
 ;
 bb:
@@ -84,8 +84,8 @@ define float @hoist_fadd_f32(float %arg, i32 %lane) {
 ; CHECK-LABEL: define float @hoist_fadd_f32(
 ; CHECK-SAME: float [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]])
-; CHECK-NEXT:    [[RFL:%.*]] = fadd float [[TMP0]], 1.280000e+02
+; CHECK-NEXT:    [[RL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]])
+; CHECK-NEXT:    [[RFL:%.*]] = fadd float [[RL]], 1.280000e+02
 ; CHECK-NEXT:    ret float [[RFL]]
 ;
 bb:
@@ -100,8 +100,8 @@ define i64 @hoist_and_i64(i64 %arg, i32 %lane) {
 ; CHECK-LABEL: define i64 @hoist_and_i64(
 ; CHECK-SAME: i64 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.readlane.i64(i64 [[ARG]], i32 [[LANE]])
-; CHECK-NEXT:    [[RFL:%.*]] = and i64 [[TMP0]], 16777215
+; CHECK-NEXT:    [[RL:%.*]] = call i64 @llvm.amdgcn.readlane.i64(i64 [[ARG]], i32 [[LANE]])
+; CHECK-NEXT:    [[RFL:%.*]] = and i64 [[RL]], 16777215
 ; CHECK-NEXT:    ret i64 [[RFL]]
 ;
 bb:
@@ -114,8 +114,8 @@ define double @hoist_fadd_f64(double %arg, i32 %lane) {
 ; CHECK-LABEL: define double @hoist_fadd_f64(
 ; CHECK-SAME: double [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[ARG]], i32 [[LANE]])
-; CHECK-NEXT:    [[RFL:%.*]] = fadd double [[TMP0]], 1.280000e+02
+; CHECK-NEXT:    [[RL:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[ARG]], i32 [[LANE]])
+; CHECK-NEXT:    [[RFL:%.*]] = fadd double [[RL]], 1.280000e+02
 ; CHECK-NEXT:    ret double [[RFL]]
 ;
 bb:
@@ -130,8 +130,8 @@ define i32 @hoist_sub_i32_lhs(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: define i32 @hoist_sub_i32_lhs(
 ; CHECK-SAME: i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
-; CHECK-NEXT:    [[RFL:%.*]] = sub i32 16777215, [[TMP0]]
+; CHECK-NEXT:    [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
+; CHECK-NEXT:    [[RFL:%.*]] = sub i32 16777215, [[RL]]
 ; CHECK-NEXT:    ret i32 [[RFL]]
 ;
 bb:
@@ -144,8 +144,8 @@ define float @hoist_fsub_f32_lhs(float %arg, i32 %lane) {
 ; CHECK-LABEL: define float @hoist_fsub_f32_lhs(
 ; CHECK-SAME: float [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]])
-; CHECK-NEXT:    [[RFL:%.*]] = fsub float 1.280000e+02, [[TMP0]]
+; CHECK-NEXT:    [[RL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]])
+; CHECK-NEXT:    [[RFL:%.*]] = fsub float 1.280000e+02, [[RL]]
 ; CHECK-NEXT:    ret float [[RFL]]
 ;
 bb:
@@ -154,36 +154,18 @@ bb:
   ret float %rl
 }
 
-; Check cases where we can't move the readlane higher
-
-define float @cannot_move_readlane(float %arg, i32 %base) {
-; CHECK-LABEL: define float @cannot_move_readlane(
-; CHECK-SAME: float [[ARG:%.*]], i32 [[BASE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[VAL:%.*]] = fsub float 1.280000e+02, [[ARG]]
-; CHECK-NEXT:    [[LANE:%.*]] = add i32 [[BASE]], 2
-; CHECK-NEXT:    [[RFL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE]])
-; CHECK-NEXT:    ret float [[RFL]]
-;
-bb:
-  %val = fsub float 128.0, %arg
-  %lane = add i32 %base, 2
-  %rl = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane)
-  ret float %rl
-}
-
 define i32 @readlane_lane_op_in_other_block(i1 %cond, i32 %arg, i32 %base) {
 ; CHECK-LABEL: define i32 @readlane_lane_op_in_other_block(
 ; CHECK-SAME: i1 [[COND:%.*]], i32 [[ARG:%.*]], i32 [[BASE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[BB:.*]]:
 ; CHECK-NEXT:    [[LANE:%.*]] = add i32 [[BASE]], 2
 ; CHECK-NEXT:    br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]]
 ; CHECK:       [[THEN]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
-; CHECK-NEXT:    [[RFL:%.*]] = add i32 [[TMP0]], 16777215
+; CHECK-NEXT:    [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[RL]], 16777215
 ; CHECK-NEXT:    br label %[[END]]
 ; CHECK:       [[END]]:
-; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[RFL]], %[[THEN]] ], [ [[LANE]], %[[BB]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[TMP0]], %[[THEN]] ], [ [[LANE]], %[[BB]] ]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
 bb:
@@ -200,6 +182,25 @@ end:
   ret i32 %res
 }
 
+; Check cases where we can't move the readlane higher
+
+define float @cannot_move_readlane(float %arg, i32 %base) {
+; CHECK-LABEL: define float @cannot_move_readlane(
+; CHECK-SAME: float [[ARG:%.*]], i32 [[BASE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[VAL:%.*]] = fsub float 1.280000e+02, [[ARG]]
+; CHECK-NEXT:    [[LANE:%.*]] = add i32 [[BASE]], 2
+; CHECK-NEXT:    [[RFL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE]])
+; CHECK-NEXT:    ret float [[RFL]]
+;
+bb:
+  %val = fsub float 128.0, %arg
+  %lane = add i32 %base, 2
+  %rl = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane)
+  ret float %rl
+}
+
+
 ; test that convergence tokens are preserved
 
 define i32 @hoist_preserves_convergence_token(i1 %cond, i32 %arg, i32 %lane) convergent {
@@ -209,11 +210,11 @@ define i32 @hoist_preserves_convergence_token(i1 %cond, i32 %arg, i32 %lane) con
 ; CHECK-NEXT:    [[ENTRY:%.*]] = call token @llvm.experimental.convergence.entry()
 ; CHECK-NEXT:    br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]]
 ; CHECK:       [[THEN]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) [ "convergencectrl"(token [[ENTRY]]) ]
-; CHECK-NEXT:    [[RFL:%.*]] = add i32 [[TMP0]], 16777215
+; CHECK-NEXT:    [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) [ "convergencectrl"(token [[ENTRY]]) ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[RL]], 16777215
 ; CHECK-NEXT:    br label %[[END]]
 ; CHECK:       [[END]]:
-; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[RFL]], %[[THEN]] ], [ [[ARG]], %[[BB]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[TMP0]], %[[THEN]] ], [ [[ARG]], %[[BB]] ]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
 bb: