@@ -5030,12 +5030,18 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register SrcReg = MI.getOperand(1).getReg();
   bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
   Register DstReg = MI.getOperand(0).getReg();
+  bool isDstSGPR = TRI->isSGPRClass(MRI.getRegClass(DstReg));
+
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
     // These operations with a uniform value i.e. SGPR are idempotent.
     // Reduced value will be same as given sgpr.
     // clang-format off
-    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
+    if (isDstSGPR)
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
+          .addReg(SrcReg);
+    else
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), DstReg)
         .addReg(SrcReg);
     // clang-format on
     RetBB = &BB;
@@ -5051,22 +5057,24 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     // so that we will get the next active lane for next iteration.
     MachineBasicBlock::iterator I = BB.end();
     Register SrcReg = MI.getOperand(1).getReg();
-
     // Create Control flow for loop
     // Split MI's Machine Basic block into For loop
     auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
 
     // Create virtual registers required for lowering.
     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
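+    // When the destination is a VGPR, accumulate in a scalar register class;
+    // the reduced value is moved into DstReg inside the loop below.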
+    const TargetRegisterClass *AccumRegClass =
+        isDstSGPR ? DstRegClass : &AMDGPU::SReg_32RegClass;
+    Register NewAccumReg = MRI.createVirtualRegister(AccumRegClass);
     Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
+    Register InitalValReg = MRI.createVirtualRegister(AccumRegClass);
 
-    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+    Register AccumulatorReg = MRI.createVirtualRegister(AccumRegClass);
     Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
     Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
 
-    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
+    Register FF1Reg = MRI.createVirtualRegister(AccumRegClass);
     Register LaneValueReg =
         MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 
@@ -5106,10 +5114,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                              TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                          .addReg(SrcReg)
                          .addReg(FF1->getOperand(0).getReg());
-    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), NewAccumReg)
                               .addReg(Accumulator->getOperand(0).getReg())
                               .addReg(LaneValue->getOperand(0).getReg());
 
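+    // Make the reduced value visible in the destination register class.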
+    BuildMI(*ComputeLoop, I, DL,
+            TII->get(isDstSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32),
+            DstReg)
+        .addReg(NewAccumReg);
     // Manipulate the iterator to get the next active lane
     unsigned BITSETOpc =
         IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
@@ -5138,6 +5150,171 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   return RetBB;
 }
 
+static MachineBasicBlock *
+lowerWaveReduceWrtDivergentMask(MachineInstr &MI, MachineBasicBlock &BB,
+                                const GCNSubtarget &ST, unsigned Opc) {
+  MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Reduction operations depend on whether the input operand is SGPR or
+  // VGPR.
+  Register SrcReg = MI.getOperand(1).getReg();
+  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
+  Register DstReg = MI.getOperand(0).getReg();
+  bool isDstSGPR = TRI->isSGPRClass(MRI.getRegClass(DstReg));
+  Register DivergentMaskReg = MI.getOperand(2).getReg();
+
+  MachineBasicBlock *RetBB = nullptr;
+  if (isSGPR) {
+    // With a uniform source, every group reduces to the same value: the
+    // source itself.
+    BuildMI(BB, MI, DL,
+            TII->get(isDstSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32),
+            DstReg)
+        .addReg(SrcReg);
+    RetBB = &BB;
+  } else {
+    MachineBasicBlock::iterator I = BB.end();
+
+    auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+
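+    // Loop-carried state lives in three sets of registers: the remaining
+    // active-lane mask, the per-lane accumulator VGPR, and the per-lane
+    // DS_BPERMUTE address VGPR.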
+    auto SReg32XM0RegClass = &AMDGPU::SReg_32_XM0RegClass;
+    auto SReg32RegClass = &AMDGPU::SReg_32RegClass;
+
+    const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+    const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+    Register ExecCopyReg = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register AccSGPRReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+    Register UpdatedAccSGPRReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register AccReg1 = MRI.createVirtualRegister(DstRegClass);
+    Register AccReg = MRI.createVirtualRegister(DstRegClass);
+    Register BPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register UpdatedBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register InitialBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register UpdatedAccReg = MRI.createVirtualRegister(DstRegClass);
+    Register ActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register UpdatedActiveLanesReg =
+        MRI.createVirtualRegister(WaveMaskRegClass);
+    Register FF1ActiveLanesReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register FF1MaskReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register FF1MaskX4Reg = MRI.createVirtualRegister(SReg32RegClass);
+    Register ValReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+    Register MaskReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+
+    bool IsWave32 = ST.isWave32();
+
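+    // Identity element of the reduction: UINT32_MAX for umin, 0 for umax.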
+    uint32_t IdentityValue =
+        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+
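+    // Save EXEC; the copy is consumed as the loop iterator over the wave's
+    // active lanes.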
+    BuildMI(BB, I, DL,
+            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+            ExecCopyReg)
+        .addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+
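+    // Seed every lane's accumulator with the identity value and its
+    // bpermute address with zero.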
+    BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), AccReg)
+        .addImm(IdentityValue);
+    BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), InitialBPermAddrReg)
+        .addImm(0);
+    BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+    I = ComputeLoop->end();
+
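+    // Loop-carried PHIs: remaining active lanes, per-lane accumulators, and
+    // per-lane bpermute addresses. Back-edge operands are added once the
+    // loop body has been built.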
+    auto PhiActiveLanesInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveLanesReg)
+            .addReg(ExecCopyReg)
+            .addMBB(&BB);
+    auto PhiAccInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccReg1)
+            .addReg(AccReg)
+            .addMBB(&BB);
+    auto PhiBPermAddrInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), BPermAddrReg)
+            .addReg(InitialBPermAddrReg)
+            .addMBB(&BB);
+
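+    // Pick the lowest remaining active lane as the current lane.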
+    BuildMI(*ComputeLoop, I, DL,
+            TII->get(IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64),
+            FF1ActiveLanesReg)
+        .addReg(ActiveLanesReg);
+
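+    // Read the current lane's source value and its group mask.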
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), ValReg)
+        .addReg(SrcReg)
+        .addReg(FF1ActiveLanesReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), MaskReg)
+        .addReg(DivergentMaskReg)
+        .addReg(FF1ActiveLanesReg);
+
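+    // The lowest set bit of the mask names the group's leader lane, whose
+    // accumulator slot holds the group's running result.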
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B32), FF1MaskReg)
+        .addReg(MaskReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), AccSGPRReg)
+        .addReg(AccReg1)
+        .addReg(FF1MaskReg);
+
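+    // Fold the current lane's value into the group's accumulator.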
+    BuildMI(*ComputeLoop, I, DL, TII->get(Opc), UpdatedAccSGPRReg)
+        .addReg(AccSGPRReg)
+        .addReg(ValReg);
+
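+    // Write the updated accumulator back to the leader lane;
+    // V_WRITELANE_B32 takes the lane select in M0.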
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(FF1MaskReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32),
+            UpdatedAccReg)
+        .addReg(UpdatedAccSGPRReg)
+        .addReg(AMDGPU::M0)
+        .addReg(AccReg1);
+
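+    // DS_BPERMUTE_B32 addresses lanes in bytes, so scale the leader lane
+    // index by 4.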
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_LSHL_B32), FF1MaskX4Reg)
+        .addReg(FF1MaskReg)
+        .addImm(2);
+
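+    // Record the leader's byte address in the current lane's slot of the
+    // bpermute address vector.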
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(FF1ActiveLanesReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32),
+            UpdatedBPermAddrReg)
+        .addReg(FF1MaskX4Reg)
+        .addReg(AMDGPU::M0)
+        .addReg(BPermAddrReg);
+
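+    // Clear the processed lane from the active-lane mask.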
+    unsigned BITSETOpc =
+        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), UpdatedActiveLanesReg)
+        .addReg(FF1ActiveLanesReg)
+        .addReg(ActiveLanesReg);
+
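+    // Complete the PHIs with the back-edge values.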
+    PhiActiveLanesInst.addReg(UpdatedActiveLanesReg).addMBB(ComputeLoop);
+    PhiAccInst.addReg(UpdatedAccReg).addMBB(ComputeLoop);
+    PhiBPermAddrInst.addReg(UpdatedBPermAddrReg).addMBB(ComputeLoop);
+
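+    // Loop while any active lane remains.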
+    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+    BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+        .addReg(UpdatedActiveLanesReg)
+        .addImm(0);
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+        .addMBB(ComputeLoop);
+
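+    // Broadcast each group's result: every lane fetches the accumulator
+    // from its leader lane.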
+    BuildMI(*ComputeEnd, ComputeEnd->begin(), DL,
+            TII->get(AMDGPU::DS_BPERMUTE_B32), DstReg)
+        .addReg(UpdatedBPermAddrReg)
+        .addReg(UpdatedAccReg)
+        .addImm(0);
+
+    RetBB = ComputeEnd;
+  }
+  MI.eraseFromParent();
+  return RetBB;
+}
+
 MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
@@ -5151,6 +5328,21 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMIN_PSEUDO_U32:
+  case AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMAX_PSEUDO_U32: {
+    unsigned Opc = (MI.getOpcode() ==
+                    AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMIN_PSEUDO_U32)
+                       ? AMDGPU::S_MIN_U32
+                       : AMDGPU::S_MAX_U32;
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    bool isMaskRegUniform = getSubtarget()->getRegisterInfo()->isSGPRClass(
+        MRI.getRegClass(MI.getOperand(2).getReg()));
+
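+    // A uniform mask means all lanes belong to a single group, which is the
+    // plain whole-wave reduction.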
+    if (isMaskRegUniform)
+      return lowerWaveReduce(MI, *BB, *getSubtarget(), Opc);
+
+    return lowerWaveReduceWrtDivergentMask(MI, *BB, *getSubtarget(), Opc);
+  }
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();