@@ -4846,6 +4846,26 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
48464846 return LoopBB;
48474847}
48484848
4849+ static uint32_t getInitialValueForWaveReduction(unsigned Opc){
4850+ switch(Opc){
4851+ case AMDGPU::S_MIN_U32:
4852+ return std::numeric_limits<uint32_t>::max();
4853+ case AMDGPU::S_MIN_I32:
4854+ return std::numeric_limits<int32_t>::max();
4855+ case AMDGPU::S_MAX_U32:
4856+ return 0;
4857+ case AMDGPU::S_MAX_I32:
4858+ return std::numeric_limits<int32_t>::min();
4859+ case AMDGPU::S_ADD_I32:
4860+ case AMDGPU::S_SUB_I32:
4861+ case AMDGPU::S_OR_B32:
4862+ case AMDGPU::S_XOR_B32:
4863+ return 0x00000000;
4864+ case AMDGPU::S_AND_B32:
4865+ return 0xFFFFFFFF;
4866+ }
4867+ }
4868+
48494869static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
48504870 MachineBasicBlock &BB,
48514871 const GCNSubtarget &ST,
@@ -4861,10 +4881,75 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
48614881 Register DstReg = MI.getOperand(0).getReg();
48624882 MachineBasicBlock *RetBB = nullptr;
48634883 if (isSGPR) {
4864- // These operations with a uniform value i.e. SGPR are idempotent.
4865- // Reduced value will be same as given sgpr.
4866- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4867- RetBB = &BB;
4884+ switch(Opc){
4885+ case AMDGPU::S_MIN_U32:
4886+ case AMDGPU::S_MIN_I32:
4887+ case AMDGPU::S_MAX_U32:
4888+ case AMDGPU::S_MAX_I32:
4889+ case AMDGPU::S_AND_B32:
4890+ case AMDGPU::S_OR_B32:{
4891+ // Idempotent operations.
4892+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4893+ RetBB = &BB;
4894+ break;
4895+ }
4896+ case AMDGPU::S_XOR_B32:
4897+ case AMDGPU::S_ADD_I32:
4898+ case AMDGPU::S_SUB_I32:{
4899+ const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4900+ const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4901+ Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
4902+ Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);
4903+
4904+ bool IsWave32 = ST.isWave32();
4905+ unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4906+ unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4907+ unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
4908+
4909+          // Copy the active-lane mask out of EXEC and count the number
4910+          // of set bits (active lanes) with S_BCNT1; the reduction of a
4910+          // uniform value is then derived from that count.
4911+ auto Exec =
4912+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
4913+
4914+ auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg)
4915+ .addReg(Exec->getOperand(0).getReg());
4916+
4917+ switch(Opc){
4918+ case AMDGPU::S_XOR_B32:{
4919+ // Performing an XOR operation on a uniform value
4920+ // depends on the parity of the number of active lanes.
4921+ // For even parity, the result will be 0, for odd
4922+ // parity the result will be the same as the input value.
4923+ Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
4924+
4925+ auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
4926+ .addReg(NewAccumulator->getOperand(0).getReg())
4927+ .addImm(1);
4928+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
4929+ .addReg(SrcReg)
4930+ .addReg(ParityReg->getOperand(0).getReg()) ;
4931+ break;
4932+ }
4933+ case AMDGPU::S_SUB_I32:{
4934+ Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
4935+
4936+ // Take the negation of the source operand.
4937+ auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
4938+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
4939+ .addReg(InvertedValReg->getOperand(0).getReg())
4940+ .addReg(NewAccumulator->getOperand(0).getReg());
4941+ break;
4942+ }
4943+ case AMDGPU::S_ADD_I32:{
4944+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
4945+ .addReg(SrcReg)
4946+ .addReg(NewAccumulator->getOperand(0).getReg());
4947+ break;
4948+ }
4949+ }
4950+ RetBB = &BB;
4951+ }
4952+ }
48684953 } else {
48694954 // TODO: Implement DPP Strategy and switch based on immediate strategy
48704955 // operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4900,9 +4985,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49004985 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
49014986
49024987    // Create initial values of induction variable from Exec, Accumulator and
4903- // insert branch instr to newly created ComputeBlockk
4904- uint32_t InitalValue =
4905- (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4988+ // insert branch instr to newly created ComputeBlock
4989+ uint32_t InitalValue = getInitialValueForWaveReduction(Opc);
4990+
49064991 auto TmpSReg =
49074992 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
49084993 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -4970,8 +5055,26 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
49705055 switch (MI.getOpcode()) {
49715056 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
49725057 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5058+ case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5059+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
49735060 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
49745061 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5062+ case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5063+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5064+ case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
5065+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5066+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5067+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5068+ case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
5069+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5070+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5071+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5072+ case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5073+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5074+ case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5075+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5076+ case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5077+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
49755078 case AMDGPU::S_UADDO_PSEUDO:
49765079 case AMDGPU::S_USUBO_PSEUDO: {
49775080 const DebugLoc &DL = MI.getDebugLoc();
0 commit comments