@@ -5040,6 +5040,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
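+// Returns the identity element for a 32-bit scalar wave reduction: the
+// value that leaves any operand unchanged when combined under the given op.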
+static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::S_MIN_U32:
+    return std::numeric_limits<uint32_t>::max();
+  case AMDGPU::S_MIN_I32:
+    return std::numeric_limits<int32_t>::max();
+  case AMDGPU::S_MAX_U32:
+    return std::numeric_limits<uint32_t>::min();
+  case AMDGPU::S_MAX_I32:
+    return std::numeric_limits<int32_t>::min();
+  case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_XOR_B32:
+    return std::numeric_limits<uint32_t>::min();
+  case AMDGPU::S_AND_B32:
+    return std::numeric_limits<uint32_t>::max();
+  default:
+    llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
+  }
+}
+
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                           MachineBasicBlock &BB,
                                           const GCNSubtarget &ST,
@@ -5055,13 +5077,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
-    // These operations with a uniform value i.e. SGPR are idempotent.
-    // Reduced value will be same as given sgpr.
-    // clang-format off
-    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
-        .addReg(SrcReg);
-    // clang-format on
-    RetBB = &BB;
+    switch (Opc) {
+    case AMDGPU::S_MIN_U32:
+    case AMDGPU::S_MIN_I32:
+    case AMDGPU::S_MAX_U32:
+    case AMDGPU::S_MAX_I32:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32: {
+      // These operations are idempotent: reducing a uniform value across the
+      // wave with min/max/and/or simply yields that value.
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      RetBB = &BB;
+      break;
+    }
+    case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_ADD_I32:
+    case AMDGPU::S_SUB_I32: {
+      const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+      const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
+      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+
+      bool IsWave32 = ST.isWave32();
+      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      unsigned CountReg =
+          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+
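+      // Copy EXEC and popcount it to obtain the number of active lanes.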
+      auto Exec =
+          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+                                .addReg(Exec->getOperand(0).getReg());
+
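+      // With a uniform source, each of these reductions folds to a
+      // closed-form expression of the active-lane count instead of a loop.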
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // XOR of a uniform value depends only on the parity of the number
+        // of active lanes: even parity gives 0, odd parity gives the input
+        // value itself. E.g., with 5 active lanes, Dst = Src * (5 & 1) = Src.
+        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+
+        auto ParityReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+                .addReg(NewAccumulator->getOperand(0).getReg())
+                .addImm(1);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityReg->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_SUB_I32: {
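+        // Subtracting a uniform value N times yields -Src * N, where N is
+        // the number of active lanes.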
+        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
+        // Take the negation of the source operand.
+        auto InvertedValReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
+                .addImm(-1)
+                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_ADD_I32: {
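+        // Adding a uniform value N times yields Src * N.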
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      }
+      RetBB = &BB;
+    }
+    }
   } else {
     // TODO: Implement DPP Strategy and switch based on immediate strategy
     // operand. For now, for all the cases (default, Iterative and DPP we use
@@ -5097,10 +5184,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
-    // Create initail values of induction variable from Exec, Accumulator and
-    // insert branch instr to newly created ComputeBlockk
-    uint32_t InitalValue =
-        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+    // Create the initial values of the induction variable from Exec and the
+    // Accumulator, and insert a branch instr to the newly created ComputeBlock.
+    uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
     auto TmpSReg =
         BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5172,8 +5258,22 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();