@@ -4940,26 +4940,26 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
-static uint32_t getInitialValueForWaveReduction(unsigned Opc){
-  switch(Opc){
-  case AMDGPU::S_MIN_U32:
-    return std::numeric_limits<uint32_t>::max();
-  case AMDGPU::S_MIN_I32:
-    return std::numeric_limits<int32_t>::max();
-  case AMDGPU::S_MAX_U32:
-    return std::numeric_limits<u_int32_t >::lowest ();
-  case AMDGPU::S_MAX_I32:
-    return std::numeric_limits<int32_t>::min();
-  case AMDGPU::S_ADD_I32:
-  case AMDGPU::S_SUB_I32:
-  case AMDGPU::S_OR_B32:
-  case AMDGPU::S_XOR_B32:
-    return 0x00000000 ;
-  case AMDGPU::S_AND_B32:
-    return 0xFFFFFFFF ;
-  default:
-    llvm_unreachable("Unexpected opcode in getInitialValueForWaveReduction");
-  }
+static uint32_t getInitialValueForWaveReduction(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::S_MIN_U32:
+    return std::numeric_limits<uint32_t>::max();
+  case AMDGPU::S_MIN_I32:
+    return std::numeric_limits<int32_t>::max();
+  case AMDGPU::S_MAX_U32:
+    return std::numeric_limits<uint32_t>::min();
+  case AMDGPU::S_MAX_I32:
+    return std::numeric_limits<int32_t>::min();
+  case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_XOR_B32:
+    return std::numeric_limits<uint32_t>::min();
+  case AMDGPU::S_AND_B32:
+    return std::numeric_limits<uint32_t>::max();
+  default:
+    llvm_unreachable("Unexpected opcode in getInitialValueForWaveReduction");
+  }
 }
 
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
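Note (not part of the patch): `getInitialValueForWaveReduction` returns each reduction's identity element, so seeding the accumulator with it leaves the first real input unchanged: `UINT32_MAX` for unsigned min, `INT32_MAX` for signed min, `0` for unsigned max / add / sub / or / xor, `INT32_MIN` for signed max, and all-ones for and. A minimal host-side sketch of that property (plain C++, illustrative only):

```cpp
// Sanity check of the identity elements above; not part of the patch.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  const uint32_t v = 42;
  // min: identity is the largest value; max: identity is the smallest.
  assert(std::min(v, std::numeric_limits<uint32_t>::max()) == v);
  assert(std::max(v, std::numeric_limits<uint32_t>::min()) == v);
  // add/sub/or/xor: identity is 0.
  assert((v + 0u) == v && (v - 0u) == v);
  assert((v | 0u) == v && (v ^ 0u) == v);
  // and: identity is all ones.
  assert((v & std::numeric_limits<uint32_t>::max()) == v);
  return 0;
}
```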
@@ -4977,72 +4977,77 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
-    switch(Opc){
-    case AMDGPU::S_MIN_U32:
-    case AMDGPU::S_MIN_I32:
-    case AMDGPU::S_MAX_U32:
-    case AMDGPU::S_MAX_I32:
-    case AMDGPU::S_AND_B32:
-    case AMDGPU::S_OR_B32:{
-      // Idempotent operations.
-      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
-      RetBB = &BB;
-      break;
-    }
-    case AMDGPU::S_XOR_B32:
-    case AMDGPU::S_ADD_I32:
-    case AMDGPU::S_SUB_I32:{
-      const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
-      const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
-      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
-      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
-
-      bool IsWave32 = ST.isWave32();
-      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-      unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-      unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
-
-      auto Exec =
-          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
-
-      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
-                                .addReg(Exec->getOperand(0).getReg());
-
-      switch(Opc){
-      case AMDGPU::S_XOR_B32:{
-        // Performing an XOR operation on a uniform value
-        // depends on the parity of the number of active lanes.
-        // For even parity, the result will be 0, for odd
-        // parity the result will be the same as the input value.
-        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
-        auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+    switch (Opc) {
+    case AMDGPU::S_MIN_U32:
+    case AMDGPU::S_MIN_I32:
+    case AMDGPU::S_MAX_U32:
+    case AMDGPU::S_MAX_I32:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32: {
+      // Idempotent operations.
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      RetBB = &BB;
+      break;
+    }
+    case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_ADD_I32:
+    case AMDGPU::S_SUB_I32: {
+      const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+      const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
+      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+
+      bool IsWave32 = ST.isWave32();
+      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      unsigned CountReg =
+          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+
+      auto Exec =
+          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+                                .addReg(Exec->getOperand(0).getReg());
+
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // Performing an XOR operation on a uniform value
+        // depends on the parity of the number of active lanes.
+        // For even parity, the result will be 0, for odd
+        // parity the result will be the same as the input value.
+        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+
+        auto ParityReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
                 .addReg(NewAccumulator->getOperand(0).getReg())
                 .addImm(1);
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityReg->getOperand(0).getReg()) ;
-        break;
-      }
-      case AMDGPU::S_SUB_I32:{
-        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
-
-        // Take the negation of the source operand.
-        auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg )
-            .addReg(InvertedValReg->getOperand(0).getReg() )
-            .addReg(NewAccumulator->getOperand(0).getReg() );
-        break;
-      }
-      case AMDGPU::S_ADD_I32:{
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(NewAccumulator->getOperand(0).getReg());
-        break;
-      }
-      }
-      RetBB = &BB ;
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityReg->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_SUB_I32: {
+        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
+        // Take the negation of the source operand.
+        auto InvertedValReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
+                .addImm(-1)
+                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_ADD_I32: {
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
       }
+      }
+      RetBB = &BB;
+    }
     }
   } else {
     // TODO: Implement DPP Strategy and switch based on immediate strategy
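Note (not part of the patch): for a wave-uniform SGPR input `v`, this lowering replaces the per-lane loop with scalar arithmetic on `n`, the active-lane count obtained from `popcount(EXEC)` via `S_BCNT1`: min/max/and/or are idempotent and simply return `v`; add yields `v * n`; sub yields `-v * n` (negate, then scale); and xor reduces to the parity of `n`, giving `v` for odd `n` and `0` for even `n`. A small host-side sketch of that algebra, with hypothetical helper names:

```cpp
// Models the uniform-value fast path on the host; not part of the
// patch. n stands for the active-lane count (popcount of EXEC).
#include <cassert>
#include <cstdint>

uint32_t reduceAdd(uint32_t v, uint32_t n) { return v * n; }       // S_MUL_I32 by lane count
uint32_t reduceSub(uint32_t v, uint32_t n) { return -v * n; }      // negate, then scale
uint32_t reduceXor(uint32_t v, uint32_t n) { return v * (n & 1); } // parity select

int main() {
  // A wave with 5 active lanes, each contributing v = 7:
  assert(reduceAdd(7, 5) == 35);                         // 7+7+7+7+7
  assert(reduceSub(7, 5) == static_cast<uint32_t>(-35)); // 0-7-7-7-7-7
  assert(reduceXor(7, 5) == 7);                          // odd parity keeps v
  assert(reduceXor(7, 4) == 0);                          // even parity cancels
  return 0;
}
```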