@@ -5281,7 +5281,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   case AMDGPU::S_MAX_I32:
     return std::numeric_limits<int32_t>::min();
   case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_SUB_U64_PSEUDO:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_XOR_B32:
     return std::numeric_limits<uint32_t>::min();
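The identity value is what the iterative lowering seeds its accumulator with, so `x op identity` must leave `x` unchanged; add, sub, or, and xor all share 0 (spelled `uint32_t::min()` above), which is why the two new 64-bit pseudos can simply join the existing case. A minimal standalone sketch of that property (illustrative only, not code from this patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t X = 0xDEADBEEFCAFEF00Dull;
      assert(X + 0 == X);   // S_ADD_{I32,U64_PSEUDO}: 0 is the additive identity.
      assert(X - 0 == X);   // S_SUB_{I32,U64_PSEUDO}: subtracting 0 is a no-op.
      assert((X | 0) == X); // S_OR_B32: OR with 0 is a no-op.
      assert((X ^ 0) == X); // S_XOR_B32: XOR with 0 is a no-op.
      return 0;
    }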
@@ -5355,51 +5357,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     }
     case AMDGPU::S_XOR_B32:
     case AMDGPU::S_ADD_I32:
-    case AMDGPU::S_SUB_I32: {
+    case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::S_SUB_I32:
+    case AMDGPU::S_SUB_U64_PSEUDO: {
       const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
       const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
       Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
-      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+      Register ActiveLanes =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

       bool IsWave32 = ST.isWave32();
       unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
       MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
       unsigned CountReg =
           IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

-      auto Exec =
       BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);

-      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
-                                .addReg(Exec->getOperand(0).getReg());
+      auto NewAccumulator =
+          BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+              .addReg(ExecMask);
+
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // Performing an XOR operation on a uniform value
+        // depends on the parity of the number of active lanes.
+        // For even parity, the result will be 0, for odd
+        // parity the result will be the same as the input value.
+        Register ParityRegister =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

-      switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
-        // Performing an XOR operation on a uniform value
-        // depends on the parity of the number of active lanes.
-        // For even parity, the result will be 0, for odd
-        // parity the result will be the same as the input value.
-        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
-        auto ParityReg =
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
             .addReg(NewAccumulator->getOperand(0).getReg())
-            .addImm(1);
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityReg->getOperand(0).getReg());
-        break;
-      }
+            .addImm(1)
+            .setOperandDead(3); // Dead scc
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityRegister);
+        break;
+      }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);

         // Take the negation of the source operand.
-        auto InvertedValReg =
-            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
-                .addImm(-1)
-                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+            .addImm(0)
+            .addReg(SrcReg);
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NegatedVal)
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
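When the reduction input is uniform (held in an SGPR), the block above avoids a loop entirely: S_BCNT1 of the saved EXEC mask yields the active-lane count N, and the inner switch folds the whole reduction into scalar arithmetic on N. A plain C++ reference model of that arithmetic (the function name and signature are illustrative assumptions, not part of the patch):

    #include <cstdint>

    // Reduction of a value `Val` that is identical across `N` active lanes.
    uint32_t reduceUniform32(char Op, uint32_t Val, uint32_t N) {
      switch (Op) {
      case '+': // S_ADD_I32: each active lane contributes Val once.
        return Val * N;
      case '-': // S_SUB_I32: negate first (the "S_SUB_I32 0, Src" above), then scale.
        return (0u - Val) * N;
      case '^': // S_XOR_B32: pairs cancel, so only the parity of N survives.
        return Val * (N & 1u);
      default:  // min/max/and/or of a uniform value are just the value itself.
        return Val;
      }
    }

Note the patch also switches the negation from `-1 * Src` to `0 - Src`, expressing the same value with a subtract instead of a multiply.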
@@ -5409,6 +5414,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1H_Op0L_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1L_Op0H_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValLo =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValHi =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+        const TargetRegisterClass *Src1SubRC =
+            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+        MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+        MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+              .addImm(0)
+              .addReg(NewAccumulator->getOperand(0).getReg());
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+              .addReg(NegatedValLo)
+              .addImm(31)
+              .setOperandDead(3); // Dead scc
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+              .add(Op1L)
+              .addReg(NegatedValHi);
+        }
+        Register LowVal = Opc == AMDGPU::S_SUB_U64_PSEUDO
+                              ? NegatedValLo
+                              : NewAccumulator->getOperand(0).getReg();
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+            .add(Op1L)
+            .addReg(LowVal);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+            .add(Op1L)
+            .addReg(LowVal);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+            .add(Op1H)
+            .addReg(LowVal);
+
+        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+            .addReg(CarryReg)
+            .addReg(Op1H_Op0L_Reg)
+            .setOperandDead(3); // Dead scc
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(HiVal)
+              .addReg(Op1L_Op0H_Reg)
+              .setOperandDead(3); // Dead scc
+        }
+        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+            .addReg(DestSub0)
+            .addImm(AMDGPU::sub0)
+            .addReg(DestSub1)
+            .addImm(AMDGPU::sub1);
+        break;
+      }
       }
       RetBB = &BB;
     }
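The new 64-bit uniform path is the same count-scaling idea, with the 64 x 32 product spelled out in 32-bit pieces: DestSub0 takes the low product, S_MUL_HI_U32 supplies the carry into the high half, and the sub flavor first negates the lane count and sign-extends it (NegatedValLo/NegatedValHi), which adds the cross term Op1L * NegatedValHi. A C++ reference model of that decomposition (a sketch under the assumption that `Cnt` is the S_BCNT1 result; the function name is made up):

    #include <cstdint>

    uint64_t scaleByLaneCount(uint64_t Val, uint32_t Cnt, bool IsSub) {
      // For sub, scale by the negated count sign-extended to 64 bits, mirroring
      // NegatedValLo (S_SUB_I32 0, Cnt) and NegatedValHi (S_ASHR_I32 ..., 31).
      uint32_t Lo = IsSub ? 0u - Cnt : Cnt;
      uint32_t Hi = IsSub ? uint32_t(int32_t(Lo) >> 31) : 0u;

      uint32_t Op1L = uint32_t(Val);       // sub0 of the source
      uint32_t Op1H = uint32_t(Val >> 32); // sub1 of the source

      uint32_t DestSub0 = Op1L * Lo;                          // S_MUL_I32
      uint32_t Carry = uint32_t((uint64_t(Op1L) * Lo) >> 32); // S_MUL_HI_U32
      uint32_t DestSub1 = Carry + Op1H * Lo;                  // S_ADD_U32
      if (IsSub)
        DestSub1 += Op1L * Hi; // the Op1L_Op0H_Reg cross term
      return (uint64_t(DestSub1) << 32) | DestSub0;           // REG_SEQUENCE
    }

The add flavor needs no cross term because the count's high half is zero, so everything stays within a handful of SALU instructions.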
@@ -5555,6 +5628,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(Accumulator->getOperand(0).getReg());
         break;
       }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
+                                                           : AMDGPU::S_SUB_U32;
+        unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
+                                                           : AMDGPU::S_SUBB_U32;
+        Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        MachineOperand AccumLo = TII->buildExtractSubRegOrImm(
+            MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
+            &AMDGPU::SReg_32RegClass);
+        MachineOperand AccumHi = TII->buildExtractSubRegOrImm(
+            MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
+            &AMDGPU::SReg_32RegClass);
+        BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
+            .add(AccumLo)
+            .addReg(LaneValueLo->getOperand(0).getReg());
+        BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
+            .add(AccumHi)
+            .addReg(LaneValueHi->getOperand(0).getReg());
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+                             .addReg(DestLo)
+                             .addImm(AMDGPU::sub0)
+                             .addReg(DestHi)
+                             .addImm(AMDGPU::sub1);
+        break;
+      }
       }
     }
     // Manipulate the iterator to get the next active lane
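On the divergent path the reduction stays a loop over active lanes, and the 64-bit accumulate is a classic carry chain: S_ADD_U32 defines scc, which the following S_ADDC_U32 consumes (likewise S_SUB_U32 / S_SUBB_U32 for the borrow). A scalar model of one loop iteration (assuming LaneValueLo/LaneValueHi hold the current lane's halves; the helper name is illustrative, not from the patch):

    #include <cstdint>

    uint64_t accumulateLane64(uint64_t Accum, uint64_t Lane, bool IsSub) {
      uint32_t AccumLo = uint32_t(Accum), AccumHi = uint32_t(Accum >> 32);
      uint32_t LaneLo = uint32_t(Lane), LaneHi = uint32_t(Lane >> 32);
      uint32_t DestLo, DestHi;
      if (!IsSub) {
        DestLo = AccumLo + LaneLo; // S_ADD_U32: sets scc on carry-out.
        DestHi = AccumHi + LaneHi + (DestLo < AccumLo); // S_ADDC_U32: adds scc.
      } else {
        DestLo = AccumLo - LaneLo; // S_SUB_U32: sets scc on borrow.
        DestHi = AccumHi - LaneHi - (AccumLo < LaneLo); // S_SUBB_U32: subtracts scc.
      }
      return (uint64_t(DestHi) << 32) | DestLo; // REG_SEQUENCE sub0/sub1.
    }

The adjacency matters in the emitted MIR: nothing may clobber scc between the low-half and high-half instructions, which is why the pair is built back to back.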
@@ -5610,8 +5711,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32: