llvm · GinShio · Nov 4, 2024 · Nov 11, 2024 · Nov 13, 2024 · Nov 14, 2024
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
@@ -1204,6 +1204,9 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>,
                                                    i64, double, pointers, multiples of the 32-bit vectors.
 
+  llvm.amdgcn.readanylane                          Similar to readfirstlane. But marks value that is uniform when used.
+                                                   The result is undefined if the value is actual divergent.
+
   llvm.amdgcn.readlane                             Provides direct access to v_readlane_b32. Returns the value in the
                                                    specified lane of the first input operand. The second operand specifies
                                                    the lane to read from. Currently implemented for i16, i32, float, half,

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2152,6 +2152,12 @@ def int_amdgcn_readfirstlane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+// This is similar to readfirstlane, but marks value that is uniform when used, allowed sunk
+// into control flow. The result is undefined if the value is actual divergent.
+def int_amdgcn_readanylane :
+  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
+            [IntrConvergent, IntrNoCallback, IntrNoFree, IntrNoMem, IntrWillReturn]>;
+
 // The lane argument must be uniform across the currently active threads of the
 // current wave. Otherwise, the result is undefined.
 def int_amdgcn_readlane :

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2775,6 +2775,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
   case Intrinsic::amdgcn_strict_wqm:
     Opcode = AMDGPU::STRICT_WQM;
     break;
+  case Intrinsic::amdgcn_readanylane:
+    Opcode = AMDGPU::SI_READANYLANE;
+    break;
   case Intrinsic::amdgcn_interp_p1_f16:
     SelectInterpP1F16(N);
     return;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1081,6 +1081,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   }
   case Intrinsic::amdgcn_permlane64:
   case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_readanylane:
   case Intrinsic::amdgcn_readlane: {
     // If the first argument is uniform these intrinsics return it unchanged.
     const Use &Src = II.getArgOperandUse(0);

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -25,6 +25,7 @@
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include <optional>
@@ -112,7 +113,8 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
   const TargetRegisterClass *SrcRC
     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
-  if (!DstRC || DstRC != SrcRC)
+  // READANYLANE allows input is vgpr and output is sgpr.
+  if (!DstRC || (NewOpc != AMDGPU::SI_READANYLANE && DstRC != SrcRC))
     return false;
 
   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
@@ -1061,6 +1063,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
     return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
   case Intrinsic::amdgcn_writelane:
     return selectWritelane(I);
+  case Intrinsic::amdgcn_readanylane:
+    return constrainCopyLikeIntrin(I, AMDGPU::SI_READANYLANE);
   case Intrinsic::amdgcn_div_scale:
     return selectDivScale(I);
   case Intrinsic::amdgcn_icmp:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5475,6 +5475,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
     switch (IID) {
     case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_readanylane:
     case Intrinsic::amdgcn_permlane64:
       return LaneOp.getReg(0);
     case Intrinsic::amdgcn_readlane:
@@ -7561,6 +7562,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_readlane:
   case Intrinsic::amdgcn_writelane:
   case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_readanylane:
   case Intrinsic::amdgcn_permlane16:
   case Intrinsic::amdgcn_permlanex16:
   case Intrinsic::amdgcn_permlane64:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -137,7 +137,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
              Opcode == AMDGPU::SI_TCRETURN_GFX) {
     // TODO: How to use branch immediate and avoid register+add?
     Opcode = AMDGPU::S_SETPC_B64;
-  }
+  } else if (Opcode == AMDGPU::SI_READANYLANE)
+    Opcode = AMDGPU::V_READFIRSTLANE_B32;
 
   int MCOpcode = TII->pseudoToMCOpcode(Opcode);
   if (MCOpcode == -1) {

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4658,6 +4658,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
       [[fallthrough]];
     }
+    case Intrinsic::amdgcn_readanylane:
     case Intrinsic::amdgcn_readfirstlane: {
       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -366,6 +366,7 @@ def UniformIntrinsics : GenericTable {
 }
 
 def : AlwaysUniform<int_amdgcn_readfirstlane>;
+def : AlwaysUniform<int_amdgcn_readanylane>;
 def : AlwaysUniform<int_amdgcn_readlane>;
 def : AlwaysUniform<int_amdgcn_icmp>;
 def : AlwaysUniform<int_amdgcn_fcmp>;

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1116,6 +1116,7 @@ void SIFoldOperandsImpl::foldOperand(
 
     unsigned UseOpc = UseMI->getOpcode();
     if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
+        UseOpc == AMDGPU::SI_READANYLANE ||
         (UseOpc == AMDGPU::V_READLANE_B32 &&
          (int)UseOpIdx ==
          AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6186,6 +6186,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
       Operands.push_back(Src1);
       [[fallthrough]];
     case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_readanylane:
     case Intrinsic::amdgcn_permlane64:
       Operands.push_back(Src0);
       break;
@@ -8837,6 +8838,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return lowerADDRSPACECAST(Op, DAG);
   case Intrinsic::amdgcn_readlane:
   case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_readanylane:
   case Intrinsic::amdgcn_writelane:
   case Intrinsic::amdgcn_permlane16:
   case Intrinsic::amdgcn_permlanex16:

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4159,7 +4159,8 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
   if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
       Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
       Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
-      Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
+      Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR ||
+      Opcode == AMDGPU::SI_READANYLANE)
     return true;
 
   return false;
@@ -9619,6 +9620,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
   unsigned opcode = MI.getOpcode();
   if (opcode == AMDGPU::V_READLANE_B32 ||
       opcode == AMDGPU::V_READFIRSTLANE_B32 ||
+      opcode == AMDGPU::SI_READANYLANE ||
       opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
     return InstructionUniformity::AlwaysUniform;
 

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -546,6 +546,15 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
   let maybeAtomic = 0;
 }
 
+def SI_READANYLANE : VPseudoInstSI <(outs SReg_32:$dst), (ins VGPR_32:$src)> {
+  let Uses = [EXEC];
+  let VALU = 1;
+  let hasSideEffects = 0;
+  let isConvergent = 1;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
 // Used as an isel pseudo to directly emit initialization with an
 // s_mov_b32 rather than a copy of another initialized
 // register. MachineCSE skips copies, and we don't want to have to
@@ -3504,6 +3513,11 @@ def : GCNPat<
   (S_MOV_B32 SReg_32:$src)
 >;
 
+def : GCNPat<
+  (i32 (int_amdgcn_readanylane (i32 imm:$src))),
+  (S_MOV_B32 SReg_32:$src)
+>;
+
 multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> {
   def : GCNPat <
     (vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),