@@ -4861,10 +4861,80 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
-    // These operations with a uniform value i.e. SGPR are idempotent.
-    // Reduced value will be same as given sgpr.
-    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
-    RetBB = &BB;
+    switch (Opc) {
+    case AMDGPU::S_MIN_U32:
+    case AMDGPU::S_MIN_I32:
+    case AMDGPU::S_MAX_U32:
+    case AMDGPU::S_MAX_I32:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32: {
+      // These operations applied to a uniform value, i.e. an SGPR, are
+      // idempotent: min(x, x, ..., x) == x, so the reduced value is simply
+      // the given SGPR.
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      RetBB = &BB;
+      break;
+    }
+    case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_ADD_I32:
+    case AMDGPU::S_SUB_I32: {
+      const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+      const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
+      Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);
+
+      bool IsWave32 = ST.isWave32();
+      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      unsigned CountReg =
+          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+
+      // Take a copy of EXEC and count the number of active lanes in it.
+      auto Exec =
+          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+      auto NewAccumulator =
+          BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg)
+              .addReg(Exec->getOperand(0).getReg());
+
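+      // CountOfActiveLanesReg now holds popcount(EXEC): at most 32 or 64
+      // active lanes for wave32 and wave64 respectively.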
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // XOR of a uniform value depends only on the parity of the number
+        // of active lanes: with an even count the lanes cancel out and the
+        // result is 0, with an odd count the result is the value in the
+        // SGPR itself. This follows from A^A = 0 and A^0 = A.
+        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+
+        auto ParityReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+                .addReg(NewAccumulator->getOperand(0).getReg())
+                .addImm(1);
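+        // Dst = Src * (popcount(EXEC) & 1), i.e. 0 for an even number of
+        // active lanes and Src for an odd number.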
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityReg->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_SUB_I32: {
+        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
+        // Negate the source operand, then scale it by the number of active
+        // lanes: subtracting a uniform value once per active lane yields
+        // -Src * popcount(EXEC).
+        auto InvertedValReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
+                .addImm(-1)
+                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_ADD_I32: {
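+        // Adding a uniform value once per active lane yields
+        // Src * popcount(EXEC).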
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      }
+      RetBB = &BB;
+      break;
+    }
+    }
   } else {
     // TODO: Implement DPP Strategy and switch based on immediate strategy
     // operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4900,9 +4970,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
     // Create initial values of induction variable from Exec, Accumulator and
-    // insert branch instr to newly created ComputeBlockk
-    uint32_t InitalValue =
-        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+    // insert branch instr to newly created ComputeBlock
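+    // Seed the accumulator with the identity element of the reduction:
+    // UINT32_MAX for umin, INT32_MAX for smin, INT32_MIN for smax,
+    // all-ones for and, and zero for the remaining operations.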
+    uint32_t InitalValue;
+    switch (Opc) {
+    case AMDGPU::S_MIN_U32:
+      InitalValue = std::numeric_limits<uint32_t>::max();
+      break;
+    case AMDGPU::S_MIN_I32:
+      InitalValue = std::numeric_limits<int32_t>::max();
+      break;
+    case AMDGPU::S_MAX_U32:
+      InitalValue = 0;
+      break;
+    case AMDGPU::S_MAX_I32:
+      InitalValue = std::numeric_limits<int32_t>::min();
+      break;
+    case AMDGPU::S_ADD_I32:
+    case AMDGPU::S_SUB_I32:
+    case AMDGPU::S_OR_B32:
+    case AMDGPU::S_XOR_B32:
+      InitalValue = 0x00000000;
+      break;
+    case AMDGPU::S_AND_B32:
+      InitalValue = 0xFFFFFFFF;
+      break;
+    }
     auto TmpSReg =
         BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -4968,10 +5059,28 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 
   switch (MI.getOpcode()) {
-  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32 :
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
-  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();
@@ -6859,7 +6968,7 @@ SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
 
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
-  // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
+  // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
   // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
   // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
   KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);