@@ -5192,6 +5192,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
51925192 return LoopBB;
51935193}
51945194
5195+ static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5196+ MachineBasicBlock *BB) {
5197+ // For targets older than GFX12, we emit a sequence of 32-bit operations.
5198+ // For GFX12, we emit s_add_u64 and s_sub_u64.
5199+ MachineFunction *MF = BB->getParent();
5200+ const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5201+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5202+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5203+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5204+ const DebugLoc &DL = MI.getDebugLoc();
5205+ MachineOperand &Dest = MI.getOperand(0);
5206+ MachineOperand &Src0 = MI.getOperand(1);
5207+ MachineOperand &Src1 = MI.getOperand(2);
5208+ bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5209+ if (ST.hasScalarAddSub64()) {
5210+ unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5211+ // clang-format off
5212+ BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5213+ .add(Src0)
5214+ .add(Src1);
5215+ // clang-format on
5216+ } else {
5217+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
5218+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5219+
5220+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5221+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5222+
5223+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5224+ MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5225+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5226+ MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5227+
5228+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5229+ MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5230+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5231+ MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5232+
5233+ unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5234+ unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5235+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5236+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5237+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5238+ .addReg(DestSub0)
5239+ .addImm(AMDGPU::sub0)
5240+ .addReg(DestSub1)
5241+ .addImm(AMDGPU::sub1);
5242+ }
5243+ MI.eraseFromParent();
5244+ return BB;
5245+ }
5246+
51955247static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
51965248 switch (Opc) {
51975249 case AMDGPU::S_MIN_U32:
@@ -5556,43 +5608,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
55565608 }
55575609 case AMDGPU::S_ADD_U64_PSEUDO:
55585610 case AMDGPU::S_SUB_U64_PSEUDO: {
5559- if (ST.hasScalarAddSub64()) {
5560- NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5561- TII->get(Opc == AMDGPU::S_ADD_U64_PSEUDO
5562- ? AMDGPU::S_ADD_U64
5563- : AMDGPU::S_SUB_U64),
5564- DstReg)
5565- .addReg(Accumulator->getOperand(0).getReg())
5566- .addReg(LaneValue->getOperand(0).getReg());
5567- } else {
5568- unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5569- ? AMDGPU::S_ADD_U32
5570- : AMDGPU::S_SUB_U32;
5571- unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5572- ? AMDGPU::S_ADDC_U32
5573- : AMDGPU::S_SUBB_U32;
5574- Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5575- Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5576- MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5577- MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5578- &AMDGPU::SReg_32RegClass);
5579- MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5580- MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5581- &AMDGPU::SReg_32RegClass);
5582- BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
5583- .add(Accumlo)
5584- .addReg(LaneValueLo->getOperand(0).getReg());
5585- BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
5586- .add(Accumhi)
5587- .addReg(LaneValueHi->getOperand(0).getReg())
5588- .setOperandDead(3); // Dead scc
5589- NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5590- TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5591- .addReg(DestLo)
5592- .addImm(AMDGPU::sub0)
5593- .addReg(DestHi)
5594- .addImm(AMDGPU::sub1);
5595- }
5611+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5612+ .addReg(Accumulator->getOperand(0).getReg())
5613+ .addReg(LaneValue->getOperand(0).getReg());
5614+ ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
55965615 break;
55975616 }
55985617 }
@@ -5605,8 +5624,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56055624 .addReg(ActiveBitsReg);
56065625
56075626 // Add phi nodes
5608- Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5609- .addMBB(ComputeLoop);
5627+ Accumulator.addReg(DstReg).addMBB(ComputeLoop);
56105628 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
56115629
56125630 // Creating branching
@@ -5688,55 +5706,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
56885706 }
56895707 case AMDGPU::S_ADD_U64_PSEUDO:
56905708 case AMDGPU::S_SUB_U64_PSEUDO: {
5691- // For targets older than GFX12, we emit a sequence of 32-bit operations.
5692- // For GFX12, we emit s_add_u64 and s_sub_u64.
5693- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5694- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5695- const DebugLoc &DL = MI.getDebugLoc();
5696- MachineOperand &Dest = MI.getOperand(0);
5697- MachineOperand &Src0 = MI.getOperand(1);
5698- MachineOperand &Src1 = MI.getOperand(2);
5699- bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5700- if (Subtarget->hasScalarAddSub64()) {
5701- unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5702- // clang-format off
5703- BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5704- .add(Src0)
5705- .add(Src1);
5706- // clang-format on
5707- } else {
5708- const SIRegisterInfo *TRI = ST.getRegisterInfo();
5709- const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5710-
5711- Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5712- Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5713-
5714- MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5715- MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5716- MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5717- MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5718-
5719- MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5720- MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5721- MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5722- MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5723-
5724- unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5725- unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5726- BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5727- .add(Src0Sub0)
5728- .add(Src1Sub0);
5729- BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5730- .add(Src0Sub1)
5731- .add(Src1Sub1);
5732- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5733- .addReg(DestSub0)
5734- .addImm(AMDGPU::sub0)
5735- .addReg(DestSub1)
5736- .addImm(AMDGPU::sub1);
5737- }
5738- MI.eraseFromParent();
5739- return BB;
5709+ return Expand64BitScalarArithmetic(MI, BB);
57405710 }
57415711 case AMDGPU::V_ADD_U64_PSEUDO:
57425712 case AMDGPU::V_SUB_U64_PSEUDO: {