@@ -5203,7 +5203,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   case AMDGPU::S_MAX_I32:
     return std::numeric_limits<int32_t>::min();
   case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_SUB_U64_PSEUDO:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_XOR_B32:
     return std::numeric_limits<uint32_t>::min();
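Both new 64-bit pseudos share the 32-bit identity entry: 0 is the identity element for add and sub (as it is for or and xor), so a single 32-bit zero suffices here and is presumably widened wherever the 64-bit accumulator is seeded.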
@@ -5277,51 +5279,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     }
     case AMDGPU::S_XOR_B32:
     case AMDGPU::S_ADD_I32:
-    case AMDGPU::S_SUB_I32: {
+    case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::S_SUB_I32:
+    case AMDGPU::S_SUB_U64_PSEUDO: {
       const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
       const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
       Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
-      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+      Register ActiveLanes =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
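+      // The lane count is always a 32-bit value, even for the 64-bit
+      // reductions, hence SReg_32 rather than DstRegClass.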
 
       bool IsWave32 = ST.isWave32();
       unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
       MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
       unsigned CountReg =
           IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
 
-      auto Exec =
       BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
 
-      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
-                                .addReg(Exec->getOperand(0).getReg());
+      auto NewAccumulator =
+          BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+              .addReg(ExecMask);
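+      // ActiveLanes = popcount(exec), i.e. the number of active lanes.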
+
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // Performing an XOR operation on a uniform value
+        // depends on the parity of the number of active lanes.
+        // For even parity, the result will be 0, for odd
+        // parity the result will be the same as the input value.
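+        // E.g. 5 active lanes: 5 & 1 == 1, so Dst = 1 * Src = Src;
+        //      4 active lanes: 4 & 1 == 0, so Dst = 0.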
+        Register ParityRegister =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
-      switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
-        // Performing an XOR operation on a uniform value
-        // depends on the parity of the number of active lanes.
-        // For even parity, the result will be 0, for odd
-        // parity the result will be the same as the input value.
-        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
-        auto ParityReg =
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
             .addReg(NewAccumulator->getOperand(0).getReg())
-            .addImm(1);
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityReg->getOperand(0).getReg());
-        break;
-      }
+            .addImm(1)
+            .setOperandDead(3); // Dead scc
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityRegister);
+        break;
+      }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
 
         // Take the negation of the source operand.
-        auto InvertedValReg =
-            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
-                .addImm(-1)
-                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+            .addImm(0)
+            .addReg(SrcReg);
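+        // For n active lanes this computes Dst = (0 - Src) * n = -n * Src.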
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NegatedVal)
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
@@ -5331,6 +5336,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1H_Op0L_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1L_Op0H_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValLo =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValHi =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+        const TargetRegisterClass *Src1SubRC =
+            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+        MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+        MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
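+        // The 64-bit product Src * c is assembled from 32-bit partial
+        // products, where c is the lane count for ADD and its negation
+        // (split into cLo plus sign-extension cHi) for SUB:
+        //   DestSub0 = lo32(Op1L * cLo)
+        //   DestSub1 = hi32(Op1L * cLo) + Op1H * cLo [+ Op1L * cHi for SUB]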
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+              .addImm(0)
+              .addReg(NewAccumulator->getOperand(0).getReg());
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+              .addReg(NegatedValLo)
+              .addImm(31)
+              .setOperandDead(3); // Dead scc
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+              .add(Op1L)
+              .addReg(NegatedValHi);
+        }
+        Register LowVal = Opc == AMDGPU::S_SUB_U64_PSEUDO
+                              ? NegatedValLo
+                              : NewAccumulator->getOperand(0).getReg();
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+            .add(Op1L)
+            .addReg(LowVal);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+            .add(Op1L)
+            .addReg(LowVal);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+            .add(Op1H)
+            .addReg(LowVal);
+
+        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+            .addReg(CarryReg)
+            .addReg(Op1H_Op0L_Reg)
+            .setOperandDead(3); // Dead scc
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(HiVal)
+              .addReg(Op1L_Op0H_Reg)
+              .setOperandDead(3); // Dead scc
+        }
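+        // Reassemble the two 32-bit halves into the 64-bit destination.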
+        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+            .addReg(DestSub0)
+            .addImm(AMDGPU::sub0)
+            .addReg(DestSub1)
+            .addImm(AMDGPU::sub1);
+        break;
+      }
       }
       RetBB = &BB;
     }
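As a sanity check on the partial-product scheme above, here is a minimal standalone sketch (plain C++; the helper name and test values are mine, not part of the patch) that mirrors the S_MUL_I32/S_MUL_HI_U32 decomposition on host integers and exercises both the ADD and SUB variants:

    #include <cassert>
    #include <cstdint>

    // Low 64 bits of a 64-bit x 32-bit product, assembled the way the
    // lowering does it: c is passed as its low word CLo plus the
    // sign-extension CHi (CHi == 0 for the ADD path).
    static uint64_t mul64x32Lo(uint64_t Op1, uint32_t CLo, uint32_t CHi) {
      uint32_t Op1L = uint32_t(Op1), Op1H = uint32_t(Op1 >> 32);
      uint64_t LoProd = uint64_t(Op1L) * CLo;   // S_MUL_I32 + S_MUL_HI_U32
      uint32_t DestSub0 = uint32_t(LoProd);
      uint32_t Carry = uint32_t(LoProd >> 32);  // CarryReg
      uint32_t DestSub1 = Carry + Op1H * CLo + Op1L * CHi;
      return (uint64_t(DestSub1) << 32) | DestSub0;
    }

    int main() {
      uint64_t Src = 0x123456789abcdef0ull;
      uint32_t N = 37; // active lane count
      // ADD path: c = N.
      assert(mul64x32Lo(Src, N, 0) == Src * N);
      // SUB path: c = -N; NegLo/NegHi match NegatedValLo/NegatedValHi
      // (0 - N, then an arithmetic shift right by 31).
      uint32_t NegLo = uint32_t(0) - N;
      uint32_t NegHi = uint32_t(int32_t(NegLo) >> 31);
      assert(mul64x32Lo(Src, NegLo, NegHi) == Src * uint64_t(-int64_t(N)));
      return 0;
    }

Both asserts hold because lo64(Op1 * c) = lo32(Op1L * cLo) + 2^32 * (hi32(Op1L * cLo) + Op1H * cLo + Op1L * cHi) is exact modulo 2^64.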
@@ -5479,6 +5552,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
           .addReg(Accumulator->getOperand(0).getReg());
       break;
     }
+    case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::S_SUB_U64_PSEUDO: {
+      unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
+                                                         : AMDGPU::S_SUB_U32;
+      unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
+                                                         : AMDGPU::S_SUBB_U32;
+      Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
+          MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
+          &AMDGPU::SReg_32RegClass);
+      MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
+          MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
+          &AMDGPU::SReg_32RegClass);
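+      // The lo halves combine with a carry-out (S_ADD_U32/S_SUB_U32 set SCC),
+      // which the hi-half op then consumes (S_ADDC_U32/S_SUBB_U32).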
+      BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
+          .add(Accumlo)
+          .addReg(LaneValueLo->getOperand(0).getReg());
+      BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
+          .add(Accumhi)
+          .addReg(LaneValueHi->getOperand(0).getReg());
+      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                               TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+                           .addReg(DestLo)
+                           .addImm(AMDGPU::sub0)
+                           .addReg(DestHi)
+                           .addImm(AMDGPU::sub1);
+      break;
+    }
     }
   }
   // Manipulate the iterator to get the next active lane
@@ -5534,8 +5635,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32: