@@ -5038,6 +5038,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
50385038 return LoopBB;
50395039}
50405040
5041+ static uint32_t getInitialValueForWaveReduction(unsigned Opc) {
5042+ switch (Opc) {
5043+ case AMDGPU::S_MIN_U32:
5044+ return std::numeric_limits<uint32_t>::max();
5045+ case AMDGPU::S_MIN_I32:
5046+ return std::numeric_limits<int32_t>::max();
5047+ case AMDGPU::S_MAX_U32:
5048+ return std::numeric_limits<uint32_t>::min();
5049+ case AMDGPU::S_MAX_I32:
5050+ return std::numeric_limits<int32_t>::min();
5051+ case AMDGPU::S_ADD_I32:
5052+ case AMDGPU::S_SUB_I32:
5053+ case AMDGPU::S_OR_B32:
5054+ case AMDGPU::S_XOR_B32:
5055+ return std::numeric_limits<uint32_t>::min();
5056+ case AMDGPU::S_AND_B32:
5057+ return std::numeric_limits<uint32_t>::max();
5058+ default:
5059+ llvm_unreachable("Unexpected opcode in getInitialValueForWaveReduction");
5060+ }
5061+ }
5062+
50415063static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50425064 MachineBasicBlock &BB,
50435065 const GCNSubtarget &ST,
@@ -5053,13 +5075,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50535075 Register DstReg = MI.getOperand(0).getReg();
50545076 MachineBasicBlock *RetBB = nullptr;
50555077 if (isSGPR) {
5056- // These operations with a uniform value i.e. SGPR are idempotent.
5057- // Reduced value will be same as given sgpr.
5058- // clang-format off
5059- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
5060- .addReg(SrcReg);
5061- // clang-format on
5062- RetBB = &BB;
5078+ switch (Opc) {
5079+ case AMDGPU::S_MIN_U32:
5080+ case AMDGPU::S_MIN_I32:
5081+ case AMDGPU::S_MAX_U32:
5082+ case AMDGPU::S_MAX_I32:
5083+ case AMDGPU::S_AND_B32:
5084+ case AMDGPU::S_OR_B32: {
5085+ // Idempotent operations.
5086+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5087+ RetBB = &BB;
5088+ break;
5089+ }
5090+ case AMDGPU::S_XOR_B32:
5091+ case AMDGPU::S_ADD_I32:
5092+ case AMDGPU::S_SUB_I32: {
5093+ const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5094+ const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5095+ Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5096+ Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5097+
5098+ bool IsWave32 = ST.isWave32();
5099+ unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5100+ unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5101+ unsigned CountReg =
5102+ IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5103+
5104+ auto Exec =
5105+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5106+
5107+ auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5108+ .addReg(Exec->getOperand(0).getReg());
5109+
5110+ switch (Opc) {
5111+ case AMDGPU::S_XOR_B32: {
5112+ // Performing an XOR operation on a uniform value
5113+ // depends on the parity of the number of active lanes.
5114+ // For even parity, the result will be 0, for odd
5115+ // parity the result will be the same as the input value.
5116+ Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5117+
5118+ auto ParityReg =
5119+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5120+ .addReg(NewAccumulator->getOperand(0).getReg())
5121+ .addImm(1);
5122+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5123+ .addReg(SrcReg)
5124+ .addReg(ParityReg->getOperand(0).getReg());
5125+ break;
5126+ }
5127+ case AMDGPU::S_SUB_I32: {
5128+ Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5129+
5130+ // Take the negation of the source operand.
5131+ auto InvertedValReg =
5132+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5133+ .addImm(-1)
5134+ .addReg(SrcReg);
5135+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5136+ .addReg(InvertedValReg->getOperand(0).getReg())
5137+ .addReg(NewAccumulator->getOperand(0).getReg());
5138+ break;
5139+ }
5140+ case AMDGPU::S_ADD_I32: {
5141+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5142+ .addReg(SrcReg)
5143+ .addReg(NewAccumulator->getOperand(0).getReg());
5144+ break;
5145+ }
5146+ }
5147+ RetBB = &BB;
5148+ }
5149+ }
50635150 } else {
50645151 // TODO: Implement DPP Strategy and switch based on immediate strategy
50655152 // operand. For now, for all the cases (default, Iterative and DPP we use
@@ -5096,9 +5183,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50965183 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
50975184
50985185 // Create initial values of induction variable from Exec, Accumulator and
5099- // insert branch instr to newly created ComputeBlockk
5100- uint32_t InitalValue =
5101- (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
5186+ // insert branch instr to newly created ComputeBlock
5187+ uint32_t InitalValue = getInitialValueForWaveReduction(Opc);
51025188 auto TmpSReg =
51035189 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
51045190 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5170,8 +5256,26 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
51705256 switch (MI.getOpcode()) {
51715257 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
51725258 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5259+ case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5260+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
51735261 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
51745262 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5263+ case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5264+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5265+ case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
5266+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5267+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5268+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5269+ case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
5270+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5271+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5272+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5273+ case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5274+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5275+ case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5276+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5277+ case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5278+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
51755279 case AMDGPU::S_UADDO_PSEUDO:
51765280 case AMDGPU::S_USUBO_PSEUDO: {
51775281 const DebugLoc &DL = MI.getDebugLoc();
0 commit comments