@@ -5203,7 +5203,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   case AMDGPU::S_MAX_I32:
     return std::numeric_limits<int32_t>::min();
   case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_SUB_U64_PSEUDO:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_XOR_B32:
     return std::numeric_limits<uint32_t>::min();
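
The new U64 pseudos can fall through to the existing 32-bit return because 0 is the identity element of ADD, SUB, OR, and XOR alike, and a 32-bit zero extends unchanged to the 64-bit identity. A minimal sketch of that invariant (illustrative only, not part of the patch):

    #include <cstdint>
    #include <limits>

    // 0 is the identity for +, -, |, ^ in 32 and 64 bits alike, which is
    // why S_ADD_U64_PSEUDO/S_SUB_U64_PSEUDO share the uint32_t return above.
    constexpr bool identityHolds(uint64_t x) {
      return (x + 0 == x) && (x - 0 == x) && ((x | 0) == x) && ((x ^ 0) == x);
    }
    static_assert(std::numeric_limits<uint32_t>::min() == 0u, "shared zero");
    static_assert(identityHolds(0xDEADBEEFDEADBEEFull), "0 is the identity");
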
@@ -5269,51 +5271,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     }
     case AMDGPU::S_XOR_B32:
     case AMDGPU::S_ADD_I32:
-    case AMDGPU::S_SUB_I32: {
+    case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::S_SUB_I32:
+    case AMDGPU::S_SUB_U64_PSEUDO: {
       const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
       const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
       Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
-      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+      Register ActiveLanes =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

       bool IsWave32 = ST.isWave32();
       unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
       MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
       unsigned CountReg =
           IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

-      auto Exec =
       BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);

-      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
-                                .addReg(Exec->getOperand(0).getReg());
+      auto NewAccumulator =
+          BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+              .addReg(ExecMask);
+
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // XOR'ing a uniform value across the active lanes reduces to a
+        // parity check on the lane count: for an even number of lanes the
+        // result is 0; for an odd number it equals the input value.
+        Register ParityRegister =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

-      switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
-        // Performing an XOR operation on a uniform value
-        // depends on the parity of the number of active lanes.
-        // For even parity, the result will be 0, for odd
-        // parity the result will be the same as the input value.
-        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
-        auto ParityReg =
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
             .addReg(NewAccumulator->getOperand(0).getReg())
-            .addImm(1);
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityReg->getOperand(0).getReg());
-        break;
-      }
+            .addImm(1)
+            .setOperandDead(3); // Dead scc
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityRegister);
+        break;
+      }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);

         // Take the negation of the source operand.
-        auto InvertedValReg =
-            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
-                .addImm(-1)
-                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+            .addImm(0)
+            .addReg(SrcReg);
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NegatedVal)
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
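
In this SGPR path the input is uniform across the active lanes, so the whole reduction collapses to arithmetic on the lane count delivered by S_BCNT1: ADD multiplies the value by the count, SUB multiplies its negation by the count, and XOR keeps the value only when the count is odd. A scalar model of these fast paths (hypothetical names, not from the patch):

    #include <cstdint>

    enum class ReduceOp { Add, Sub, Xor };

    // `count` plays the role of S_BCNT1 on the saved exec mask,
    // `val` is the uniform source operand.
    uint32_t reduceUniform(ReduceOp op, uint32_t val, uint32_t count) {
      switch (op) {
      case ReduceOp::Xor:
        return val * (count & 1);  // parity: even -> 0, odd -> val
      case ReduceOp::Add:
        return val * count;        // one S_MUL_I32
      case ReduceOp::Sub:
        return (0u - val) * count; // negate (S_SUB_I32 0, val), then scale
      }
      return 0; // unreachable; silences -Wreturn-type
    }
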
@@ -5323,6 +5328,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1H_Op0L_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1L_Op0H_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValLo =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValHi =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+        const TargetRegisterClass *Src1SubRC =
+            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+        MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+        MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+              .addImm(0)
+              .addReg(NewAccumulator->getOperand(0).getReg());
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+              .addReg(NegatedValLo)
+              .addImm(31)
+              .setOperandDead(3); // Dead scc
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+              .add(Op1L)
+              .addReg(NegatedValHi);
+        }
+        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
+                                 ? NegatedValLo
+                                 : NewAccumulator->getOperand(0).getReg();
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+            .add(Op1H)
+            .addReg(LowOpcode);
+
+        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+            .addReg(CarryReg)
+            .addReg(Op1H_Op0L_Reg)
+            .setOperandDead(3); // Dead scc
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(HiVal)
+              .addReg(Op1L_Op0H_Reg)
+              .setOperandDead(3); // Dead scc
+        }
+        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+            .addReg(DestSub0)
+            .addImm(AMDGPU::sub0)
+            .addReg(DestSub1)
+            .addImm(AMDGPU::sub1);
+        break;
+      }
       }
       RetBB = &BB;
     }
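
The U64 case above is a 64 x 32 bit multiply assembled from 32-bit pieces: Op1L times the count gives the low half, S_MUL_HI_U32 supplies the carry into the high half, and Op1H times the count is added on top; for SUB the count is first negated and sign-extended, which introduces the extra Op1L * NegatedValHi partial product. A scalar sketch of the same decomposition (hypothetical helper, not part of the patch; assumes arithmetic right shift on the host, matching S_ASHR_I32):

    #include <cstdint>

    uint64_t mulByCount(uint64_t Val, uint32_t Count, bool IsSub) {
      uint32_t Op1L = uint32_t(Val), Op1H = uint32_t(Val >> 32);
      uint32_t CntLo = Count, CntHi = 0, Op1L_Op0H = 0;
      if (IsSub) {
        CntLo = 0u - Count;                     // S_SUB_I32 0, count
        CntHi = uint32_t(int32_t(CntLo) >> 31); // S_ASHR_I32 ..., 31
        Op1L_Op0H = Op1L * CntHi;               // extra partial product
      }
      uint32_t DestSub0 = Op1L * CntLo;                          // S_MUL_I32
      uint32_t Carry = uint32_t((uint64_t(Op1L) * CntLo) >> 32); // S_MUL_HI_U32
      uint32_t DestSub1 = Carry + Op1H * CntLo;                  // S_ADD_U32
      if (IsSub)
        DestSub1 += Op1L_Op0H;                  // second S_ADD_U32
      return (uint64_t(DestSub1) << 32) | DestSub0; // REG_SEQUENCE sub0/sub1
    }

A quick sanity check of the negated path: for any Val and Count, mulByCount(Val, Count, true) equals 0 - mulByCount(Val, Count, false) modulo 2^64.
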
@@ -5475,6 +5548,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
           .addReg(Accumulator->getOperand(0).getReg());
       break;
     }
+    case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::S_SUB_U64_PSEUDO: {
+      unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
+                                                         : AMDGPU::S_SUB_U32;
+      unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
+                                                         : AMDGPU::S_SUBB_U32;
+      Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
+          MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
+          &AMDGPU::SReg_32RegClass);
+      MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
+          MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
+          &AMDGPU::SReg_32RegClass);
+      BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
+          .add(Accumlo)
+          .addReg(LaneValueLo->getOperand(0).getReg());
+      BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
+          .add(Accumhi)
+          .addReg(LaneValueHi->getOperand(0).getReg());
+      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                               TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+                           .addReg(DestLo)
+                           .addImm(AMDGPU::sub0)
+                           .addReg(DestHi)
+                           .addImm(AMDGPU::sub1);
+      break;
+    }
     }
   }
   // Manipulate the iterator to get the next active lane
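
In the lane-by-lane loop the 64-bit accumulation is split across the carry flag: the low halves go through S_ADD_U32 (or S_SUB_U32), whose carry-out lands in SCC, and the high halves through S_ADDC_U32 (or S_SUBB_U32), which consumes it. A scalar model of that pairing (illustrative only, not from the patch):

    #include <cstdint>

    uint64_t accumulate64(uint64_t Accum, uint64_t LaneValue, bool IsSub) {
      uint32_t ALo = uint32_t(Accum), AHi = uint32_t(Accum >> 32);
      uint32_t BLo = uint32_t(LaneValue), BHi = uint32_t(LaneValue >> 32);
      uint32_t DestLo, DestHi, SCC;
      if (!IsSub) {
        DestLo = ALo + BLo;       // S_ADD_U32: carry-out -> SCC
        SCC = DestLo < ALo;
        DestHi = AHi + BHi + SCC; // S_ADDC_U32: carry-in from SCC
      } else {
        DestLo = ALo - BLo;       // S_SUB_U32: borrow-out -> SCC
        SCC = ALo < BLo;
        DestHi = AHi - BHi - SCC; // S_SUBB_U32: borrow-in from SCC
      }
      return (uint64_t(DestHi) << 32) | DestLo; // recombined via REG_SEQUENCE
    }
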
@@ -5530,8 +5631,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32: