Skip to content

Commit f3f133f

Browse files
committed
PR feedback, implement in GISel
1 parent 741566b commit f3f133f

File tree

8 files changed

+342
-136
lines changed

8 files changed

+342
-136
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2719,13 +2719,15 @@ def int_amdgcn_call_whole_wave:
27192719
[IntrConvergent]>;
27202720

27212721
// <result>
2722-
// llvm.amdgcn.subgroup.shuffle <value> <id>
2723-
// value and result can be any scalar of floating-point, integer,
2724-
// or Boolean types, but must be the same type
2725-
def int_amdgcn_subgroup_shuffle :
2726-
Intrinsic<[llvm_any_ty], // return types
2727-
[LLVMMatchType<0>, llvm_i32_ty], // arg types
2728-
[IntrConvergent, IntrNoMem, IntrNoFree, IntrWillReturn, IntrNoCallback]>; // flags
2722+
// llvm.amdgcn.wave.shuffle <value> <id>
2723+
// value and result can be 32-bit floating-point, integer,
2724+
// or Boolean types, and must be the same type. Any index
2725+
// value that's outside the valid range will wrap around,
2726+
// and reading from an inactive lane will return 0.
2727+
def int_amdgcn_wave_shuffle :
2728+
DefaultAttrsIntrinsic<[llvm_any_ty], // return types
2729+
[LLVMMatchType<0>, llvm_i32_ty], // arg types: <value> (same type as the result), i32 <id>
2730+
[IntrConvergent, IntrNoMem, IntrNoFree, IntrWillReturn, IntrNoCallback]>; // flags
27292731

27302732
//===----------------------------------------------------------------------===//
27312733
// CI+ Intrinsics

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1216,6 +1216,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
12161216
case Intrinsic::amdgcn_permlane16_swap:
12171217
case Intrinsic::amdgcn_permlane32_swap:
12181218
return selectPermlaneSwapIntrin(I, IntrinsicID);
1219+
case Intrinsic::amdgcn_wave_shuffle:
1220+
return selectWaveShuffleIntrin(I);
12191221
default:
12201222
return selectImpl(I, *CoverageInfo);
12211223
}
@@ -3852,6 +3854,129 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
38523854
return true;
38533855
}
38543856

3857+
/// Select llvm.amdgcn.wave.shuffle: each lane reads the 32-bit value held by
/// the lane named by its index operand.
///
/// Operands of \p MI used here: 0 = result, 2 = value, 3 = lane index.
///
/// When the subtarget can bpermute across the whole wave a single
/// DS_BPERMUTE_B32 is emitted. Otherwise (wave64) both halves are permuted in
/// whole wave mode and the correct half is picked per lane.
///
/// \returns true if \p MI was replaced, false if it cannot be selected (the
/// result is not a 32-bit scalar or its register cannot be constrained).
bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
    MachineInstr &MI) const {
  assert(MI.getNumOperands() == 4);
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  // Only 32-bit scalars are handled. Fail selection instead of asserting so
  // that builds without assertions do not silently emit wrong code.
  const LLT DstTy = MRI->getType(DstReg);
  if (DstTy != LLT::scalar(32))
    return false;

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstTy.getSizeInBits(), *DstRB);

  // The result is defined by the instructions built below, so give it a
  // concrete register class before emitting anything.
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  // ds_bpermute requires index to be multiplied by 4. Both paths need this.
  Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
      .addImm(2)
      .addReg(IdxReg);

  // If we can bpermute across the whole wave, then just do that
  if (Subtarget->supportsWaveWideBPermute()) {
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
        .addReg(ShiftIdxReg)
        .addReg(ValReg)
        .addImm(0);

    MI.eraseFromParent();
    return true;
  }

  // Otherwise, we need to make use of whole wave mode
  assert(Subtarget->isWave64());

  // Set inactive lanes to poison
  Register UndefValReg =
      MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);

  Register UndefExecReg = MRI->createVirtualRegister(
      TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);

  Register PoisonValReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
      .addImm(0)
      .addReg(ValReg)
      .addImm(0)
      .addReg(UndefValReg)
      .addReg(UndefExecReg);

  Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
      .addImm(0)
      .addReg(ShiftIdxReg)
      .addImm(0)
      .addReg(UndefValReg)
      .addReg(UndefExecReg);

  // Get permutation of each half, then we'll select which one to use
  Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
      .addReg(PoisonIdxReg)
      .addReg(PoisonValReg)
      .addImm(0);

  Register SwappedValReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
      .addReg(PoisonValReg);

  Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
      .addReg(PoisonIdxReg)
      .addReg(SwappedValReg)
      .addImm(0);

  Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
      .addReg(OppSidePermReg);

  // Select which side to take the permute from
  // We can get away with only using mbcnt_lo here since we're only
  // trying to detect which side of 32 each lane is on, and mbcnt_lo
  // returns 32 for lanes 32-63.
  Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
      .addImm(-1)
      .addImm(0);

  // Compare against the *unshifted* lane index: bit 5 (value 32) of the lane
  // index says which half the source lane is in. In the shifted index that
  // bit sits at position 7, so masking it with 32 would test the wrong bit.
  Register XORReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
      .addReg(ThreadIDReg)
      .addReg(IdxReg);

  Register ANDReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
      .addReg(XORReg)
      .addImm(32);

  // CompareReg is set for lanes whose source lane is in their own half.
  Register CompareReg = MRI->createVirtualRegister(
      TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
      .addReg(ANDReg)
      .addImm(0);

  // Finally do the selection
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)
      .addReg(WWMSwapPermReg)
      .addImm(0)
      .addReg(SameSidePermReg)
      .addReg(CompareReg);

  MI.eraseFromParent();
  return true;
}
3979+
38553980
// Match BITOP3 operation and return a number of matched instructions plus
38563981
// truth table.
38573982
static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
156156
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
157157
bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const;
158158
bool selectSBarrierLeave(MachineInstr &I) const;
159+
bool selectWaveShuffleIntrin(MachineInstr &I) const;
159160

160161
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
161162
bool IsCanonicalizing = true,

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5230,11 +5230,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
52305230
OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
52315231
break;
52325232
}
5233-
case Intrinsic::amdgcn_s_bitreplicate:
5233+
case Intrinsic::amdgcn_s_bitreplicate: {
52345234
Register MaskReg = MI.getOperand(2).getReg();
52355235
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
52365236
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
52375237
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
5238+
break;
5239+
}
5240+
case Intrinsic::amdgcn_wave_shuffle: {
5241+
unsigned OpSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5242+
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
5243+
OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
5244+
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
5245+
break;
5246+
}
52385247
}
52395248
break;
52405249
}

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1896,7 +1896,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18961896
}
18971897

18981898
/// Reports whether a bpermute can be done across the whole wave: true when
/// the generation is GFX12 or the wave size is 32.
bool supportsWaveWideBPermute() const {
1899-
return ((getGeneration() == AMDGPUSubtarget::GFX12) || isWave32());
1899+
return getGeneration() == AMDGPUSubtarget::GFX12 || isWave32();
19001900
}
19011901
};
19021902

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 53 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -7280,24 +7280,25 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
72807280
return DAG.getBitcast(VT, UnrolledLaneOp);
72817281
}
72827282

7283-
static SDValue lowerSubgroupShuffle(const SITargetLowering &TLI, SDNode *N,
7283+
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N,
72847284
SelectionDAG &DAG) {
72857285
EVT VT = N->getValueType(0);
72867286
unsigned ValSize = VT.getSizeInBits();
7287+
assert(ValSize == 32);
72877288
SDLoc SL(N);
72887289

72897290
SDValue Value = N->getOperand(1);
72907291
SDValue Index = N->getOperand(2);
72917292

72927293
// ds_bpermute requires index to be multiplied by 4
7293-
SDValue ShiftAmount = DAG.getTargetConstant(2, SL, MVT::i32);
7294+
SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
72947295
SDValue ShiftedIndex = DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index,
72957296
ShiftAmount);
72967297

72977298
// Intrinsics will require i32 to operate on
7298-
SDValue Value32 = Value;
7299-
if ((ValSize != 32) || (VT.isFloatingPoint()))
7300-
Value32 = DAG.getBitcast(MVT::i32, Value);
7299+
SDValue ValueI32 = Value;
7300+
if (VT.isFloatingPoint())
7301+
ValueI32 = DAG.getBitcast(MVT::i32, Value);
73017302

73027303
auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
73037304
SmallVector<SDValue> IntrinArgs) -> SDValue {
@@ -7307,54 +7308,55 @@ static SDValue lowerSubgroupShuffle(const SITargetLowering &TLI, SDNode *N,
73077308
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
73087309
};
73097310

7311+
// If we can bpermute across the whole wave, then just do that
73107312
if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
7311-
// If we can bpermute across the whole wave, then just do that
73127313
SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7313-
{ShiftedIndex, Value32});
7314+
{ShiftedIndex, ValueI32});
73147315
return DAG.getBitcast(VT, BPermute);
7315-
} else {
7316-
assert(TLI.getSubtarget()->isWave64());
7317-
7318-
// Otherwise, we need to make use of whole wave mode
7319-
SDValue PoisonVal = DAG.getPOISON(Value32->getValueType(0));
7320-
SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
7321-
7322-
// Set inactive lanes to poison
7323-
SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7324-
{Value32, PoisonVal});
7325-
SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7326-
{ShiftedIndex, PoisonIndex});
7327-
7328-
SDValue Swapped =
7329-
MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7330-
7331-
// Get permutation of each half, then we'll select which one to use
7332-
SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7333-
MVT::i32, {WWMIndex, WWMValue});
7334-
SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7335-
MVT::i32, {WWMIndex, Swapped});
7336-
SDValue BPermOtherHalfWWM =
7337-
MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7338-
7339-
// Select which side to take the permute from
7340-
SDValue ThreadIDMask = DAG.getTargetConstant(UINT32_MAX, SL, MVT::i32);
7341-
SDValue ThreadIDLo =
7342-
MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7343-
{ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
7344-
SDValue ThreadID = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_hi, MVT::i32,
7345-
{ThreadIDMask, ThreadIDLo});
7346-
7347-
SDValue SameOrOtherHalf =
7348-
DAG.getNode(ISD::AND, SL, MVT::i32,
7349-
DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
7350-
DAG.getTargetConstant(32, SL, MVT::i32));
7351-
SDValue UseSameHalf =
7352-
DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
7353-
DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
7354-
SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
7355-
BPermOtherHalfWWM);
7356-
return DAG.getBitcast(VT, Result);
73577316
}
7317+
7318+
assert(TLI.getSubtarget()->isWave64());
7319+
7320+
// Otherwise, we need to make use of whole wave mode
7321+
SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));
7322+
SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
7323+
7324+
// Set inactive lanes to poison
7325+
SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7326+
{ValueI32, PoisonVal});
7327+
SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7328+
{ShiftedIndex, PoisonIndex});
7329+
7330+
SDValue Swapped =
7331+
MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7332+
7333+
// Get permutation of each half, then we'll select which one to use
7334+
SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7335+
MVT::i32, {WWMIndex, WWMValue});
7336+
SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7337+
MVT::i32, {WWMIndex, Swapped});
7338+
SDValue BPermOtherHalfWWM =
7339+
MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7340+
7341+
// Select which side to take the permute from
7342+
SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
7343+
// We can get away with only using mbcnt_lo here since we're only
7344+
// trying to detect which side of 32 each lane is on, and mbcnt_lo
7345+
// returns 32 for lanes 32-63.
7346+
SDValue ThreadID =
7347+
MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7348+
{ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
7349+
7350+
SDValue SameOrOtherHalf =
7351+
DAG.getNode(ISD::AND, SL, MVT::i32,
7352+
DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
7353+
DAG.getTargetConstant(32, SL, MVT::i32));
7354+
SDValue UseSameHalf =
7355+
DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
7356+
DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
7357+
SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
7358+
BPermOtherHalfWWM);
7359+
return DAG.getBitcast(VT, Result);
73587360
}
73597361

73607362
void SITargetLowering::ReplaceNodeResults(SDNode *N,
@@ -10264,8 +10266,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1026410266
Poisons.push_back(DAG.getPOISON(ValTy));
1026510267
return DAG.getMergeValues(Poisons, SDLoc(Op));
1026610268
}
10267-
case Intrinsic::amdgcn_subgroup_shuffle:
10268-
return lowerSubgroupShuffle(*this, Op.getNode(), DAG);
10269+
case Intrinsic::amdgcn_wave_shuffle:
10270+
return lowerWaveShuffle(*this, Op.getNode(), DAG);
1026910271
default:
1027010272
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1027110273
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))

0 commit comments

Comments
 (0)