@@ -4940,6 +4940,33 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
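+// Returns the identity element for the given wave-reduction opcode, i.e.
+// the accumulator start value that leaves the running result unchanged:
+// min starts at the type's maximum, max at the type's minimum, add/sub/
+// or/xor start at 0, and the bitwise 'and' starts at all-ones, since
+// e.g. min(x, UINT32_MAX) == x and (x & 0xffffffff) == x.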
+static uint32_t getInitialValueForWaveReduction(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::S_MIN_U32:
+    return std::numeric_limits<uint32_t>::max();
+  case AMDGPU::S_MIN_I32:
+    return std::numeric_limits<int32_t>::max();
+  case AMDGPU::S_MAX_U32:
+    return std::numeric_limits<uint32_t>::min();
+  case AMDGPU::S_MAX_I32:
+    return std::numeric_limits<int32_t>::min();
+  case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_XOR_B32:
+    return std::numeric_limits<uint32_t>::min();
+  case AMDGPU::S_AND_B32:
+    return std::numeric_limits<uint32_t>::max();
+  default:
+    llvm_unreachable("Unexpected opcode in getInitialValueForWaveReduction");
+  }
+}
+
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                           MachineBasicBlock &BB,
                                           const GCNSubtarget &ST,
@@ -4955,13 +4982,87 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
-    // These operations with a uniform value i.e. SGPR are idempotent.
-    // Reduced value will be same as given sgpr.
-    // clang-format off
-    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
-        .addReg(SrcReg);
-    // clang-format on
-    RetBB = &BB;
+    switch (Opc) {
+    case AMDGPU::S_MIN_U32:
+    case AMDGPU::S_MIN_I32:
+    case AMDGPU::S_MAX_U32:
+    case AMDGPU::S_MAX_I32:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32: {
+      // These operations are idempotent: combining a uniform value with
+      // itself any number of times yields the value itself, so the
+      // reduction of an SGPR input is a plain copy.
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      RetBB = &BB;
+      break;
+    }
+    case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_ADD_I32:
+    case AMDGPU::S_SUB_I32: {
+      const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+      const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
+      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+
+      bool IsWave32 = ST.isWave32();
+      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      unsigned CountReg =
+          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+
+      auto Exec =
+          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+                                .addReg(Exec->getOperand(0).getReg());
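+      // ActiveLanes now holds S_BCNT1(EXEC), i.e. the number of lanes
+      // that are currently executing this instruction.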
+
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // XOR-reducing a uniform value depends only on the parity of the
+        // number of active lanes: for even parity the result is 0, for
+        // odd parity it is the input value itself.
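+        // For example, x ^ x ^ x == x (three active lanes), while
+        // x ^ x ^ x ^ x == 0 (four), so DstReg = SrcReg * (Count & 1).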
+        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+
+        auto ParityReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+                .addReg(NewAccumulator->getOperand(0).getReg())
+                .addImm(1);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityReg->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_SUB_I32: {
+        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
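+        // Subtracting a uniform value once per active lane amounts to
+        // DstReg = -SrcReg * ActiveLanes, so negate, then multiply.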
+        // Take the negation of the source operand.
+        auto InvertedValReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
+                .addImm(-1)
+                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_ADD_I32: {
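+        // Adding a uniform value once per active lane reduces to a single
+        // multiply: DstReg = SrcReg * ActiveLanes.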
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      }
+      RetBB = &BB;
+    }
+    }
   } else {
     // TODO: Implement DPP Strategy and switch based on immediate strategy
     // operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4997,9 +5098,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
     // Create initail values of induction variable from Exec, Accumulator and
-    // insert branch instr to newly created ComputeBlockk
-    uint32_t InitalValue =
-        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+    // insert branch instr to newly created ComputeBlock
+    uint32_t InitalValue = getInitialValueForWaveReduction(Opc);
     auto TmpSReg =
         BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5071,8 +5171,26 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();