Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1204,6 +1204,9 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>,
i64, double, pointers, multiples of the 32-bit vectors.

llvm.amdgcn.readanylane Similar to readfirstlane, but marks a value that is uniform at its use.
The result is undefined if the value is actually divergent.

llvm.amdgcn.readlane Provides direct access to v_readlane_b32. Returns the value in the
specified lane of the first input operand. The second operand specifies
the lane to read from. Currently implemented for i16, i32, float, half,
Expand Down
6 changes: 6 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2152,6 +2152,12 @@ def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Similar to readfirstlane, but marks a value that is uniform at its use, which
// allows the intrinsic to be sunk into control flow. The result is undefined if
// the value is actually divergent.
def int_amdgcn_readanylane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrConvergent, IntrNoCallback, IntrNoFree, IntrNoMem, IntrWillReturn]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2775,6 +2775,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_strict_wqm:
Opcode = AMDGPU::STRICT_WQM;
break;
case Intrinsic::amdgcn_readanylane:
Opcode = AMDGPU::SI_READANYLANE;
break;
case Intrinsic::amdgcn_interp_p1_f16:
SelectInterpP1F16(N);
return;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1081,6 +1081,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
}
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_readlane: {
// If the first argument is uniform these intrinsics return it unchanged.
const Use &Src = II.getArgOperandUse(0);
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>
Expand Down Expand Up @@ -112,7 +113,8 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
= TRI.getConstrainedRegClassForOperand(Dst, *MRI);
const TargetRegisterClass *SrcRC
= TRI.getConstrainedRegClassForOperand(Src, *MRI);
if (!DstRC || DstRC != SrcRC)
// SI_READANYLANE takes a VGPR input and produces an SGPR output, so its
// source and destination register classes are allowed to differ.
if (!DstRC || (NewOpc != AMDGPU::SI_READANYLANE && DstRC != SrcRC))
return false;

return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
Expand Down Expand Up @@ -1061,6 +1063,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
case Intrinsic::amdgcn_writelane:
return selectWritelane(I);
case Intrinsic::amdgcn_readanylane:
return constrainCopyLikeIntrin(I, AMDGPU::SI_READANYLANE);
case Intrinsic::amdgcn_div_scale:
return selectDivScale(I);
case Intrinsic::amdgcn_icmp:
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5475,6 +5475,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_permlane64:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
Expand Down Expand Up @@ -7561,6 +7562,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
Opcode == AMDGPU::SI_TCRETURN_GFX) {
// TODO: How to use branch immediate and avoid register+add?
Opcode = AMDGPU::S_SETPC_B64;
}
} else if (Opcode == AMDGPU::SI_READANYLANE)
Opcode = AMDGPU::V_READFIRSTLANE_B32;

int MCOpcode = TII->pseudoToMCOpcode(Opcode);
if (MCOpcode == -1) {
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4658,6 +4658,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
[[fallthrough]];
}
case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_readfirstlane: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ def UniformIntrinsics : GenericTable {
}

def : AlwaysUniform<int_amdgcn_readfirstlane>;
def : AlwaysUniform<int_amdgcn_readanylane>;
def : AlwaysUniform<int_amdgcn_readlane>;
def : AlwaysUniform<int_amdgcn_icmp>;
def : AlwaysUniform<int_amdgcn_fcmp>;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1116,6 +1116,7 @@ void SIFoldOperandsImpl::foldOperand(

unsigned UseOpc = UseMI->getOpcode();
if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
UseOpc == AMDGPU::SI_READANYLANE ||
(UseOpc == AMDGPU::V_READLANE_B32 &&
(int)UseOpIdx ==
AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6186,6 +6186,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Operands.push_back(Src1);
[[fallthrough]];
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_permlane64:
Operands.push_back(Src0);
break;
Expand Down Expand Up @@ -8837,6 +8838,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return lowerADDRSPACECAST(Op, DAG);
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4159,7 +4159,8 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR ||
Opcode == AMDGPU::SI_READANYLANE)
return true;

return false;
Expand Down Expand Up @@ -9619,6 +9620,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
unsigned opcode = MI.getOpcode();
if (opcode == AMDGPU::V_READLANE_B32 ||
opcode == AMDGPU::V_READFIRSTLANE_B32 ||
opcode == AMDGPU::SI_READANYLANE ||
opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
return InstructionUniformity::AlwaysUniform;

Expand Down
14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,15 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
let maybeAtomic = 0;
}

// Pseudo selected for llvm.amdgcn.readanylane: copies a 32-bit value from a
// VGPR source into an SGPR destination. The source is expected to be uniform;
// AMDGPUMCInstLower emits this pseudo as V_READFIRSTLANE_B32.
def SI_READANYLANE : VPseudoInstSI <(outs SReg_32:$dst), (ins VGPR_32:$src)> {
// Declares an implicit read of EXEC.
let Uses = [EXEC];
let VALU = 1;
// No memory access and no unmodeled side effects (see mayLoad/mayStore below).
let hasSideEffects = 0;
// Matches the IntrConvergent attribute on int_amdgcn_readanylane.
let isConvergent = 1;
let mayLoad = 0;
let mayStore = 0;
}

// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
Expand Down Expand Up @@ -3504,6 +3513,11 @@ def : GCNPat<
(S_MOV_B32 SReg_32:$src)
>;

// readanylane of a 32-bit immediate is selected directly to an S_MOV_B32 of
// that immediate instead of going through a lane-read instruction.
def : GCNPat<
(i32 (int_amdgcn_readanylane (i32 imm:$src))),
(S_MOV_B32 SReg_32:$src)
>;

multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> {
def : GCNPat <
(vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
Expand Down
Loading
Loading