@@ -5281,7 +5281,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   case AMDGPU::S_MAX_I32:
     return std::numeric_limits<int32_t>::min();
   case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_SUB_U64_PSEUDO:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_XOR_B32:
     return std::numeric_limits<uint32_t>::min();
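[Note] getIdentityValueFor32BitWaveReduction supplies the accumulator's initial
value, and std::numeric_limits<uint32_t>::min() is just 0, the shared identity
for add, sub, or, and xor alike; that is why the new 64-bit pseudos can join
the existing 32-bit entry. A minimal standalone check (my sketch, not part of
the patch):

    // 0 is the identity element for every opcode in this case group.
    #include <cassert>
    #include <cstdint>
    int main() {
      uint64_t X = 0xDEADBEEFCAFEF00Dull;
      assert(X + 0 == X && X - 0 == X);     // S_ADD_* / S_SUB_* accumulators
      assert((X | 0) == X && (X ^ 0) == X); // S_OR_B32 / S_XOR_B32 likewise
    }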
@@ -5364,51 +5366,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     }
     case AMDGPU::S_XOR_B32:
     case AMDGPU::S_ADD_I32:
-    case AMDGPU::S_SUB_I32: {
+    case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::S_SUB_I32:
+    case AMDGPU::S_SUB_U64_PSEUDO: {
       const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
       const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
       Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
-      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+      Register ActiveLanes =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
       bool IsWave32 = ST.isWave32();
       unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
       MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
       unsigned CountReg =
           IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
 
-      auto Exec =
       BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
 
-      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
-                                .addReg(Exec->getOperand(0).getReg());
+      auto NewAccumulator =
+          BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+              .addReg(ExecMask);
+
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // Performing an XOR operation on a uniform value
+        // depends on the parity of the number of active lanes.
+        // For even parity, the result will be 0, for odd
+        // parity the result will be the same as the input value.
+        Register ParityRegister =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
-      switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
-        // Performing an XOR operation on a uniform value
-        // depends on the parity of the number of active lanes.
-        // For even parity, the result will be 0, for odd
-        // parity the result will be the same as the input value.
-        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
-        auto ParityReg =
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
             .addReg(NewAccumulator->getOperand(0).getReg())
-            .addImm(1);
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityReg->getOperand(0).getReg());
-        break;
-      }
+            .addImm(1)
+            .setOperandDead(3); // Dead scc
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityRegister);
+        break;
+      }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
 
         // Take the negation of the source operand.
-        auto InvertedValReg =
-            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
-                .addImm(-1)
-                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+            .addImm(0)
+            .addReg(SrcReg);
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NegatedVal)
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
@@ -5418,6 +5423,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1H_Op0L_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1L_Op0H_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValLo =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValHi =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+        const TargetRegisterClass *Src1SubRC =
+            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+        MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+        MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+              .addImm(0)
+              .addReg(NewAccumulator->getOperand(0).getReg());
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+              .addReg(NegatedValLo)
+              .addImm(31)
+              .setOperandDead(3); // Dead scc
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+              .add(Op1L)
+              .addReg(NegatedValHi);
+        }
+        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
+                                 ? NegatedValLo
+                                 : NewAccumulator->getOperand(0).getReg();
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+            .add(Op1H)
+            .addReg(LowOpcode);
+
+        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+            .addReg(CarryReg)
+            .addReg(Op1H_Op0L_Reg)
+            .setOperandDead(3); // Dead scc
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(HiVal)
+              .addReg(Op1L_Op0H_Reg)
+              .setOperandDead(3); // Dead scc
+        }
+        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+            .addReg(DestSub0)
+            .addImm(AMDGPU::sub0)
+            .addReg(DestSub1)
+            .addImm(AMDGPU::sub1);
+        break;
+      }
       }
       RetBB = &BB;
     }
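[Note] On this uniform-input fast path, every active lane contributes the same
value, so the reduction collapses into arithmetic on the active-lane count
n = S_BCNT1(exec): add becomes src * n, sub becomes src * (0 - n) with the
negated count sign-extended via the S_ASHR_I32-by-31, and xor becomes
src * (n & 1). For the 64-bit pseudos the product is assembled from 32-bit
scalar multiplies. A standalone sketch of that decomposition (my paraphrase,
not the patch itself; names are mine):

    #include <cstdint>

    // 64 x 64 -> 64 low multiply from 32-bit pieces, mirroring the
    // S_MUL_I32 / S_MUL_HI_U32 / S_ADD_U32 sequence emitted above.
    uint64_t mulViaScalarOps(uint64_t Src, uint64_t N) {
      uint32_t Op1L = uint32_t(Src), Op1H = uint32_t(Src >> 32);
      uint32_t NLo = uint32_t(N), NHi = uint32_t(N >> 32);
      uint32_t Lo = Op1L * NLo;                            // S_MUL_I32
      uint32_t Hi = uint32_t((uint64_t(Op1L) * NLo) >> 32) // S_MUL_HI_U32
                    + Op1H * NLo + Op1L * NHi;             // S_ADD_U32 chain
      return (uint64_t(Hi) << 32) | Lo;
    }

For S_ADD_U64_PSEUDO the lane count is a plain 32-bit quantity, so NHi is 0
and the Op1L * NHi term vanishes; that is why Op1H_Op0L_Reg is always
computed but Op1L_Op0H_Reg only exists on the S_SUB_U64_PSEUDO path, where N
is the sign-extended negated count.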
@@ -5564,6 +5637,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(Accumulator->getOperand(0).getReg());
         break;
       }
+      case ::AMDGPU::S_ADD_U64_PSEUDO:
+      case ::AMDGPU::S_SUB_U64_PSEUDO: {
+        unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
+                                                           : AMDGPU::S_SUB_U32;
+        unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
+                                                           : AMDGPU::S_SUBB_U32;
+        Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
+            MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
+            &AMDGPU::SReg_32RegClass);
+        MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
+            MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
+            &AMDGPU::SReg_32RegClass);
+        BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
+            .add(Accumlo)
+            .addReg(LaneValueLo->getOperand(0).getReg());
+        BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
+            .add(Accumhi)
+            .addReg(LaneValueHi->getOperand(0).getReg());
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+                             .addReg(DestLo)
+                             .addImm(AMDGPU::sub0)
+                             .addReg(DestHi)
+                             .addImm(AMDGPU::sub1);
+        break;
+      }
       }
     }
     // Manipulate the iterator to get the next active lane
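[Note] This hunk is the divergent-input path: the loop walks the active lanes,
reads each lane's two halves into LaneValueLo / LaneValueHi, and folds them
into the accumulator. The 64-bit accumulate is the scalar carry chain:
S_ADD_U32 / S_SUB_U32 produce the low half and define SCC, and S_ADDC_U32 /
S_SUBB_U32 consume SCC implicitly for the high half. One add step, written
out in plain C++ (my sketch, not the patch):

    #include <cstdint>

    // One loop iteration of the 64-bit wave-reduce accumulate.
    uint64_t accumulateLane(uint64_t Accum, uint64_t LaneValue) {
      uint32_t DestLo = uint32_t(Accum) + uint32_t(LaneValue); // S_ADD_U32
      uint32_t SCC = DestLo < uint32_t(Accum);                 // carry-out
      uint32_t DestHi = uint32_t(Accum >> 32)
                        + uint32_t(LaneValue >> 32) + SCC;     // S_ADDC_U32
      return (uint64_t(DestHi) << 32) | DestLo;                // REG_SEQUENCE
    }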
@@ -5619,8 +5720,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32: