@@ -4970,114 +4970,149 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49704970 const SIRegisterInfo *TRI = ST.getRegisterInfo();
49714971 const DebugLoc &DL = MI.getDebugLoc();
49724972 const SIInstrInfo *TII = ST.getInstrInfo();
4973-
4973+ // const MachineFunction *MF = BB.getParent();
4974+ // const TargetRegisterInfo *TrgtRegInfo = MF->getSubtarget().getRegisterInfo();
49744975 // Reduction operations depend on whether the input operand is SGPR or VGPR.
49754976 Register SrcReg = MI.getOperand(1).getReg();
4976- bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4977+ auto SrcRegClass = MRI.getRegClass(SrcReg);
4978+ // llvm::errs() << TrgtRegInfo->getRegClassName(SrcRegClass) << "\n";
4979+ bool isSGPR = TRI->isSGPRClass(SrcRegClass);
49774980 Register DstReg = MI.getOperand(0).getReg();
4981+ // llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DstReg)) << "\n";
4982+ Register DivergentMaskReg = MI.getOperand(2).getReg();
4983+ // llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DivergentMaskReg)) << "\n";
4984+
49784985 MachineBasicBlock *RetBB = nullptr;
49794986 if (isSGPR) {
4980- // These operations with a uniform value i.e. SGPR are idempotent.
4981- // Reduced value will be same as given sgpr.
4982- // clang-format off
49834987 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
49844988 .addReg(SrcReg);
4985- // clang-format on
49864989 RetBB = &BB;
49874990 } else {
4988- // TODO: Implement DPP Strategy and switch based on immediate strategy
4989- // operand. For now, for all the cases (default, Iterative and DPP we use
4990- // iterative approach by default.)
4991-
4992- // To reduce the VGPR using iterative approach, we need to iterate
4993- // over all the active lanes. Lowering consists of ComputeLoop,
4994- // which iterate over only active lanes. We use copy of EXEC register
4995- // as induction variable and every active lane modifies it using bitset0
4996- // so that we will get the next active lane for next iteration.
4991+
49974992 MachineBasicBlock::iterator I = BB.end();
4998- Register SrcReg = MI.getOperand(1).getReg();
49994993
5000- // Create Control flow for loop
5001- // Split MI's Machine Basic block into For loop
50024994 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
50034995
5004- // Create virtual registers required for lowering.
4996+ auto SReg32XM0RegClass = &AMDGPU::SReg_32_XM0RegClass;
4997+ auto SReg32RegClass = &AMDGPU::SReg_32RegClass;
4998+
50054999 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
50065000 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5007- Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5008- Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
5009-
5010- Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5011- Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5012- Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5013-
5014- Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5015- Register LaneValueReg =
5016- MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5001+ Register ExecCopyReg = MRI.createVirtualRegister(WaveMaskRegClass);
5002+ Register ExecCopyReg1 = MRI.createVirtualRegister(WaveMaskRegClass);
5003+ Register AccSGPRReg = MRI.createVirtualRegister(SReg32XM0RegClass);
5004+ Register UpdatedAccSGPRReg = MRI.createVirtualRegister(SReg32RegClass);
5005+ Register AccReg1 = MRI.createVirtualRegister(DstRegClass);
5006+ Register AccReg = MRI.createVirtualRegister(DstRegClass);
5007+ Register BPermAddrReg = MRI.createVirtualRegister(DstRegClass);
5008+ Register UpdatedBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
5009+ Register InitialBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
5010+ Register UpdatedAccReg = MRI.createVirtualRegister(DstRegClass);
5011+ Register ActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
5012+ Register UpdatedActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
5013+ Register FF1ActiveLanesReg = MRI.createVirtualRegister(SReg32RegClass);
5014+ Register FF1MaskReg = MRI.createVirtualRegister(SReg32RegClass);
5015+ Register FF1MaskX4Reg = MRI.createVirtualRegister(SReg32RegClass);
5016+ Register ValReg = MRI.createVirtualRegister(SReg32XM0RegClass);
5017+ Register MaskReg = MRI.createVirtualRegister(SReg32XM0RegClass);
50175018
50185019 bool IsWave32 = ST.isWave32();
5019- unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5020- unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
50215020
5022- // Create initail values of induction variable from Exec, Accumulator and
5023- // insert branch instr to newly created ComputeBlockk
5024- uint32_t InitalValue =
5021+ uint32_t IdentityValue =
50255022 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
5026- auto TmpSReg =
5027- BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5028- BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5029- .addImm(InitalValue);
5030- // clang-format off
5023+
5024+ BuildMI(BB, I, DL, TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), ExecCopyReg).addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); //%19:sreg_64_xexec = S_MOV_B64 $exec
5025+
5026+ BuildMI(BB, I, DL, TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), ExecCopyReg1).addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); //%19:sreg_64_xexec = S_MOV_B64 $exec
5027+
5028+ BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), AccReg)
5029+ .addImm(IdentityValue);// %24:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
5030+ BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), InitialBPermAddrReg)
5031+ .addImm(0);
50315032 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
50325033 .addMBB(ComputeLoop);
5033- // clang-format on
50345034
5035- // Start constructing ComputeLoop
50365035 I = ComputeLoop->end();
5037- auto Accumulator =
5038- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5039- .addReg(InitalValReg)
5040- .addMBB(&BB);
5041- auto ActiveBits =
5042- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5043- .addReg(TmpSReg->getOperand(0).getReg())
5044- .addMBB(&BB);
50455036
5037+ auto PhiActiveLanesInst =
5038+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveLanesReg)
5039+ .addReg(ExecCopyReg)
5040+ .addMBB(&BB);// %25:sreg_64_xexec = PHI %19:sreg_64_xexec, %bb.0, %26:sreg_64_xexec, %bb.1
5041+ auto PhiAccInst =
5042+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccReg1)
5043+ .addReg(AccReg)
5044+ .addMBB(&BB);//%23:vgpr_32 = PHI %24:vgpr_32, %bb.0, %22:vgpr_32, %bb.1
5045+ auto PhiBPermAddrInst =
5046+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), BPermAddrReg)
5047+ .addReg(InitialBPermAddrReg)
5048+ .addMBB(&BB);//%23:vgpr_32 = PHI %24:vgpr_32, %bb.0, %22:vgpr_32, %bb.1
50465049 // Perform the computations
5047- unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5048- auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5049- .addReg(ActiveBits->getOperand(0).getReg());
5050- auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5051- TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5050+ BuildMI(*ComputeLoop, I, DL, TII->get(IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64), FF1ActiveLanesReg)
5051+ .addReg(ActiveLanesReg);//%27:sreg_32 = S_FF1_I32_B64 %25:sreg_64_xexec
5052+
5053+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), ValReg)
50525054 .addReg(SrcReg)
5053- .addReg(FF1->getOperand(0).getReg());
5054- auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5055- .addReg(Accumulator->getOperand(0).getReg())
5056- .addReg(LaneValue->getOperand(0).getReg());
5055+ .addReg(FF1ActiveLanesReg);//%29:sreg_32_xm0 = V_READLANE_B32 %10:vgpr_32, %27:sreg_32
5056+
5057+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), MaskReg)
5058+ .addReg(DivergentMaskReg)
5059+ .addReg(FF1ActiveLanesReg);
5060+
5061+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B32), FF1MaskReg).addReg(MaskReg);
5062+
5063+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), AccSGPRReg)
5064+ .addReg(AccReg1)
5065+ .addReg(FF1MaskReg);
5066+
5067+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), UpdatedAccSGPRReg).addReg(AccSGPRReg).addReg(ValReg);
5068+
5069+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5070+ .addReg(FF1MaskReg);
5071+
5072+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32), UpdatedAccReg)
5073+ .addReg(UpdatedAccSGPRReg)
5074+ .addReg(AMDGPU::M0)
5075+ .addReg(AccReg1);
5076+
5077+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_LSHL_B32), FF1MaskX4Reg)
5078+ .addReg(FF1MaskReg)
5079+ .addImm(2);
5080+
5081+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5082+ .addReg(FF1ActiveLanesReg);
5083+
5084+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32), UpdatedBPermAddrReg)
5085+ .addReg(FF1MaskX4Reg)
5086+ .addReg(AMDGPU::M0)
5087+ .addReg(BPermAddrReg);
50575088
5058- // Manipulate the iterator to get the next active lane
50595089 unsigned BITSETOpc =
50605090 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5061- auto NewActiveBits =
5062- BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5063- .addReg(FF1->getOperand(0).getReg())
5064- .addReg(ActiveBits->getOperand(0).getReg());
5091+ BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), UpdatedActiveLanesReg)
5092+ .addReg(FF1ActiveLanesReg)
5093+ .addReg(ActiveLanesReg);
50655094
5066- // Add phi nodes
5067- Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5095+ PhiActiveLanesInst.addReg(UpdatedActiveLanesReg)
50685096 .addMBB(ComputeLoop);
5069- ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5097+ PhiAccInst.addReg(UpdatedAccReg)
5098+ .addMBB(ComputeLoop);
5099+ PhiBPermAddrInst.addReg(UpdatedBPermAddrReg)
50705100 .addMBB(ComputeLoop);
50715101
5072- // Creating branching
50735102 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
50745103 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5075- .addReg(NewActiveBits->getOperand(0).getReg() )
5104+ .addReg(UpdatedActiveLanesReg )
50765105 .addImm(0);
50775106 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
50785107 .addMBB(ComputeLoop);
50795108
5109+ BuildMI(*ComputeEnd, ComputeEnd->begin(), DL, TII->get(AMDGPU::DS_BPERMUTE_B32), DstReg)
5110+ .addReg(UpdatedBPermAddrReg)
5111+ .addReg(UpdatedAccReg)
5112+ .addImm(0);
5113+
50805114 RetBB = ComputeEnd;
5115+
50815116 }
50825117 MI.eraseFromParent();
50835118 return RetBB;
0 commit comments