Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2152,6 +2152,13 @@ def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// This is similar to readfirstlane, but marks value that is uniform, allowed sunk into
// control flow. The result is undefined if the value is actual divergent.
def int_amdgcn_readanylane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrConvergent, IntrNoCallback, IntrNoFree, IntrNoMem, IntrSpeculatable,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you add IntrSpeculatable deliberately? If so, you should add a test showing what sort of transform it enables.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's removed.

IntrWillReturn]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2775,6 +2775,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_strict_wqm:
Opcode = AMDGPU::STRICT_WQM;
break;
case Intrinsic::amdgcn_readanylane:
Opcode = AMDGPU::SI_READANYLANE;
break;
case Intrinsic::amdgcn_interp_p1_f16:
SelectInterpP1F16(N);
return;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1081,6 +1081,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
}
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_readlane: {
// If the first argument is uniform these intrinsics return it unchanged.
const Use &Src = II.getArgOperandUse(0);
Expand Down
9 changes: 7 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>
Expand Down Expand Up @@ -97,9 +98,11 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
unsigned NewOpc) const {
const bool NeedExec = NewOpc != AMDGPU::SI_READANYLANE;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why doesn't readanylane need exec?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the 1st version, it's SAlu and I thought it's uniform when defined. So...
Now, the intrinsic depends on exec.

MI.setDesc(TII.get(NewOpc));
MI.removeOperand(1); // Remove intrinsic ID.
MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
if (NeedExec)
MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

MachineOperand &Dst = MI.getOperand(0);
MachineOperand &Src = MI.getOperand(1);
Expand All @@ -112,7 +115,7 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
= TRI.getConstrainedRegClassForOperand(Dst, *MRI);
const TargetRegisterClass *SrcRC
= TRI.getConstrainedRegClassForOperand(Src, *MRI);
if (!DstRC || DstRC != SrcRC)
if (!DstRC || (NeedExec && DstRC != SrcRC))
return false;

return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
Expand Down Expand Up @@ -1061,6 +1064,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
case Intrinsic::amdgcn_writelane:
return selectWritelane(I);
case Intrinsic::amdgcn_readanylane:
return constrainCopyLikeIntrin(I, AMDGPU::SI_READANYLANE);
case Intrinsic::amdgcn_div_scale:
return selectDivScale(I);
case Intrinsic::amdgcn_icmp:
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5475,6 +5475,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_permlane64:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
Expand Down Expand Up @@ -7561,6 +7562,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
Opcode == AMDGPU::SI_TCRETURN_GFX) {
// TODO: How to use branch immediate and avoid register+add?
Opcode = AMDGPU::S_SETPC_B64;
}
} else if (Opcode == AMDGPU::SI_READANYLANE)
Opcode = AMDGPU::V_READFIRSTLANE_B32;

int MCOpcode = TII->pseudoToMCOpcode(Opcode);
if (MCOpcode == -1) {
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4658,6 +4658,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
[[fallthrough]];
}
case Intrinsic::amdgcn_readanylane:
[[fallthrough]];
case Intrinsic::amdgcn_readfirstlane: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ def UniformIntrinsics : GenericTable {
}

def : AlwaysUniform<int_amdgcn_readfirstlane>;
def : AlwaysUniform<int_amdgcn_readanylane>;
def : AlwaysUniform<int_amdgcn_readlane>;
def : AlwaysUniform<int_amdgcn_icmp>;
def : AlwaysUniform<int_amdgcn_fcmp>;
Expand Down
8 changes: 6 additions & 2 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1116,9 +1116,11 @@ void SIFoldOperandsImpl::foldOperand(

unsigned UseOpc = UseMI->getOpcode();
if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
UseOpc == AMDGPU::SI_READANYLANE ||
(UseOpc == AMDGPU::V_READLANE_B32 &&
(int)UseOpIdx ==
AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
const bool NeedExec = UseOpc != AMDGPU::SI_READANYLANE;
// %vgpr = V_MOV_B32 imm
// %sgpr = V_READFIRSTLANE_B32 %vgpr
// =>
Expand All @@ -1136,7 +1138,8 @@ void SIFoldOperandsImpl::foldOperand(
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
else
UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
if (NeedExec)
UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
return;
}

Expand All @@ -1155,7 +1158,8 @@ void SIFoldOperandsImpl::foldOperand(
UseMI->getOperand(1).setReg(OpToFold.getReg());
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
UseMI->getOperand(1).setIsKill(false);
UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
if (NeedExec)
UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
return;
}
}
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6186,6 +6186,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Operands.push_back(Src1);
[[fallthrough]];
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_permlane64:
Operands.push_back(Src0);
break;
Expand Down Expand Up @@ -8837,6 +8838,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return lowerADDRSPACECAST(Op, DAG);
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readanylane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4159,7 +4159,8 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR ||
Opcode == AMDGPU::SI_READANYLANE)
return true;

return false;
Expand Down Expand Up @@ -9619,6 +9620,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
unsigned opcode = MI.getOpcode();
if (opcode == AMDGPU::V_READLANE_B32 ||
opcode == AMDGPU::V_READFIRSTLANE_B32 ||
opcode == AMDGPU::SI_READANYLANE ||
opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
return InstructionUniformity::AlwaysUniform;

Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,10 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
let maybeAtomic = 0;
}

def SI_READANYLANE : SPseudoInstSI <(outs SReg_32:$dst), (ins VGPR_32:$src)> {
let SALU = 1;
}

// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
Expand Down Expand Up @@ -3504,6 +3508,11 @@ def : GCNPat<
(S_MOV_B32 SReg_32:$src)
>;

def : GCNPat<
(i32 (int_amdgcn_readanylane (i32 imm:$src))),
(S_MOV_B32 SReg_32:$src)
>;

multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> {
def : GCNPat <
(vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
Expand Down
Loading
Loading