@@ -4940,6 +4940,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
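+// Return the identity element for the given wave-reduction opcode, i.e. the
+// seed value that leaves the reduction result unchanged.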
+static uint32_t getInitialValueForWaveReduction(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::S_MIN_U32:
+    return std::numeric_limits<uint32_t>::max();
+  case AMDGPU::S_MIN_I32:
+    return std::numeric_limits<int32_t>::max();
+  case AMDGPU::S_MAX_U32:
+    return std::numeric_limits<uint32_t>::min();
+  case AMDGPU::S_MAX_I32:
+    return std::numeric_limits<int32_t>::min();
+  case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_XOR_B32:
+    return 0x00000000;
+  case AMDGPU::S_AND_B32:
+    return 0xFFFFFFFF;
+  default:
+    llvm_unreachable("Unexpected opcode in getInitialValueForWaveReduction");
+  }
+}
+
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                           MachineBasicBlock &BB,
                                           const GCNSubtarget &ST,
@@ -4955,13 +4977,73 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
-    // These operations with a uniform value i.e. SGPR are idempotent.
-    // Reduced value will be same as given sgpr.
-    // clang-format off
-    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
-        .addReg(SrcReg);
-    // clang-format on
-    RetBB = &BB;
+    switch (Opc) {
+    case AMDGPU::S_MIN_U32:
+    case AMDGPU::S_MIN_I32:
+    case AMDGPU::S_MAX_U32:
+    case AMDGPU::S_MAX_I32:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32: {
+      // Idempotent operations: reducing a uniform (SGPR) value yields the
+      // value itself.
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      RetBB = &BB;
+      break;
+    }
+    case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_ADD_I32:
+    case AMDGPU::S_SUB_I32: {
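+      // These operations on a uniform value depend on the number of active
+      // lanes, obtained by copying EXEC and counting its set bits.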
+      const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+      const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
+      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+
+      bool IsWave32 = ST.isWave32();
+      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      unsigned CountReg =
+          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+
+      auto Exec =
+          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+                                .addReg(Exec->getOperand(0).getReg());
+
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // Performing an XOR operation on a uniform value depends on the
+        // parity of the number of active lanes: for even parity the result
+        // is 0, for odd parity it is the input value itself.
+        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+
+        auto ParityReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+                .addReg(NewAccumulator->getOperand(0).getReg())
+                .addImm(1);
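+        // Multiply the input by the parity bit (0 or 1) to select between
+        // zero and the uniform value.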
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityReg->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_SUB_I32: {
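+        // A uniform subtraction folds to -(ActiveLanes * SrcReg): negate the
+        // source once, then scale it by the active-lane count.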
+        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
+        // Take the negation of the source operand.
+        auto InvertedValReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
+                .addImm(-1)
+                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_ADD_I32: {
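+        // A uniform addition folds to ActiveLanes * SrcReg.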
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      }
+      RetBB = &BB;
+    }
+    }
   } else {
     // TODO: Implement DPP Strategy and switch based on immediate strategy
     // operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4997,9 +5079,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
     // Create initial values of induction variable from Exec, Accumulator and
-    // insert branch instr to newly created ComputeBlockk
-    uint32_t InitalValue =
-        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+    // insert branch instr to newly created ComputeBlock
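+    // Seed the accumulator with the identity element for this reduction.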
+    uint32_t InitalValue = getInitialValueForWaveReduction(Opc);
     auto TmpSReg =
         BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5071,8 +5152,26 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();