@@ -5192,6 +5192,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
51925192 return LoopBB;
51935193}
51945194
5195+ static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5196+ MachineBasicBlock *BB) {
5197+ // For targets older than GFX12, we emit a sequence of 32-bit operations.
5198+ // For GFX12, we emit s_add_u64 and s_sub_u64.
5199+ MachineFunction *MF = BB->getParent();
5200+ const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5201+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5202+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5203+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5204+ const DebugLoc &DL = MI.getDebugLoc();
5205+ MachineOperand &Dest = MI.getOperand(0);
5206+ MachineOperand &Src0 = MI.getOperand(1);
5207+ MachineOperand &Src1 = MI.getOperand(2);
5208+ bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5209+ if (ST.hasScalarAddSub64()) {
5210+ unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5211+ // clang-format off
5212+ BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5213+ .add(Src0)
5214+ .add(Src1);
5215+ // clang-format on
5216+ } else {
5217+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
5218+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5219+
5220+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5221+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5222+
5223+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5224+ MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5225+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5226+ MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5227+
5228+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5229+ MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5230+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5231+ MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5232+
5233+ unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5234+ unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5235+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5236+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5237+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5238+ .addReg(DestSub0)
5239+ .addImm(AMDGPU::sub0)
5240+ .addReg(DestSub1)
5241+ .addImm(AMDGPU::sub1);
5242+ }
5243+ MI.eraseFromParent();
5244+ return BB;
5245+ }
5246+
51955247static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
51965248 switch (Opc) {
51975249 case AMDGPU::S_MIN_U32:
@@ -5552,43 +5604,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
55525604 }
55535605 case AMDGPU::S_ADD_U64_PSEUDO:
55545606 case AMDGPU::S_SUB_U64_PSEUDO: {
5555- if (ST.hasScalarAddSub64()) {
5556- NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5557- TII->get(Opc == AMDGPU::S_ADD_U64_PSEUDO
5558- ? AMDGPU::S_ADD_U64
5559- : AMDGPU::S_SUB_U64),
5560- DstReg)
5561- .addReg(Accumulator->getOperand(0).getReg())
5562- .addReg(LaneValue->getOperand(0).getReg());
5563- } else {
5564- unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5565- ? AMDGPU::S_ADD_U32
5566- : AMDGPU::S_SUB_U32;
5567- unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5568- ? AMDGPU::S_ADDC_U32
5569- : AMDGPU::S_SUBB_U32;
5570- Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5571- Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5572- MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5573- MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5574- &AMDGPU::SReg_32RegClass);
5575- MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5576- MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5577- &AMDGPU::SReg_32RegClass);
5578- BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
5579- .add(Accumlo)
5580- .addReg(LaneValueLo->getOperand(0).getReg());
5581- BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
5582- .add(Accumhi)
5583- .addReg(LaneValueHi->getOperand(0).getReg())
5584- .setOperandDead(3); // Dead scc
5585- NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5586- TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5587- .addReg(DestLo)
5588- .addImm(AMDGPU::sub0)
5589- .addReg(DestHi)
5590- .addImm(AMDGPU::sub1);
5591- }
5607+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5608+ .addReg(Accumulator->getOperand(0).getReg())
5609+ .addReg(LaneValue->getOperand(0).getReg());
5610+ ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
55925611 break;
55935612 }
55945613 }
@@ -5601,8 +5620,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56015620 .addReg(ActiveBitsReg);
56025621
56035622 // Add phi nodes
5604- Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5605- .addMBB(ComputeLoop);
5623+ Accumulator.addReg(DstReg).addMBB(ComputeLoop);
56065624 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
56075625
56085626 // Creating branching
@@ -5684,55 +5702,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
56845702 }
56855703 case AMDGPU::S_ADD_U64_PSEUDO:
56865704 case AMDGPU::S_SUB_U64_PSEUDO: {
5687- // For targets older than GFX12, we emit a sequence of 32-bit operations.
5688- // For GFX12, we emit s_add_u64 and s_sub_u64.
5689- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5690- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5691- const DebugLoc &DL = MI.getDebugLoc();
5692- MachineOperand &Dest = MI.getOperand(0);
5693- MachineOperand &Src0 = MI.getOperand(1);
5694- MachineOperand &Src1 = MI.getOperand(2);
5695- bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5696- if (Subtarget->hasScalarAddSub64()) {
5697- unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5698- // clang-format off
5699- BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5700- .add(Src0)
5701- .add(Src1);
5702- // clang-format on
5703- } else {
5704- const SIRegisterInfo *TRI = ST.getRegisterInfo();
5705- const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5706-
5707- Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5708- Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5709-
5710- MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5711- MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5712- MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5713- MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5714-
5715- MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5716- MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5717- MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5718- MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5719-
5720- unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5721- unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5722- BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5723- .add(Src0Sub0)
5724- .add(Src1Sub0);
5725- BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5726- .add(Src0Sub1)
5727- .add(Src1Sub1);
5728- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5729- .addReg(DestSub0)
5730- .addImm(AMDGPU::sub0)
5731- .addReg(DestSub1)
5732- .addImm(AMDGPU::sub1);
5733- }
5734- MI.eraseFromParent();
5735- return BB;
5705+ return Expand64BitScalarArithmetic(MI, BB);
57365706 }
57375707 case AMDGPU::V_ADD_U64_PSEUDO:
57385708 case AMDGPU::V_SUB_U64_PSEUDO: {
0 commit comments