@@ -5107,7 +5107,9 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
   case AMDGPU::V_CMP_GT_I64_e64: // max.i64
     return std::numeric_limits<int32_t>::min();
   case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_SUB_U64_PSEUDO:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_XOR_B32:
     return std::numeric_limits<uint32_t>::min();
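The hunk above extends the identity table: the new 64-bit add/sub pseudos reuse the additive identity 0 (`std::numeric_limits<uint32_t>::min()`), which is what seeds the reduction accumulator. A minimal standalone sketch (not from the patch) of why 0 is the right seed for all four of these operations:

```cpp
// Sketch: 0 is the identity for add, or, and xor, and the natural starting
// point for the sub reduction (0 - v1 - v2 - ... yields the negated sum),
// so an accumulator seeded with it leaves the first lane value intact.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t v = 0xDEADBEEFCAFEF00DULL; // arbitrary lane value
  assert((v + 0u) == v);
  assert((v - 0u) == v);
  assert((v | 0u) == v);
  assert((v ^ 0u) == v);
}
```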
@@ -5158,51 +5160,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     }
     case AMDGPU::S_XOR_B32:
     case AMDGPU::S_ADD_I32:
-    case AMDGPU::S_SUB_I32: {
+    case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::S_SUB_I32:
+    case AMDGPU::S_SUB_U64_PSEUDO: {
       const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
       const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
       Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
-      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+      Register ActiveLanes =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
       bool IsWave32 = ST.isWave32();
       unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
       MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
       unsigned CountReg =
           IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
 
-      auto Exec =
       BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
 
-      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
-                                .addReg(Exec->getOperand(0).getReg());
+      auto NewAccumulator =
+          BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+              .addReg(ExecMask);
+
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // Performing an XOR operation on a uniform value
+        // depends on the parity of the number of active lanes.
+        // For even parity, the result will be 0, for odd
+        // parity the result will be the same as the input value.
+        Register ParityRegister =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
-      switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
-        // Performing an XOR operation on a uniform value
-        // depends on the parity of the number of active lanes.
-        // For even parity, the result will be 0, for odd
-        // parity the result will be the same as the input value.
-        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
-        auto ParityReg =
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
             .addReg(NewAccumulator->getOperand(0).getReg())
-            .addImm(1);
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityReg->getOperand(0).getReg());
-        break;
-      }
+            .addImm(1)
+            .setOperandDead(3); // Dead scc
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityRegister);
+        break;
+      }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
 
         // Take the negation of the source operand.
-        auto InvertedValReg =
-            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
-                .addImm(-1)
-                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+            .addImm(0)
+            .addReg(SrcReg);
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NegatedVal)
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
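For a uniform (SGPR) input the reduction needs no lane loop: the hunk above counts the active lanes with `S_BCNT1` on a copy of EXEC and folds the whole wavefront into a single multiply. A scalar reference model of that fast path, a sketch only (function names are illustrative, and a 64-wide EXEC mask is assumed; wave32 would use a 32-bit mask):

```cpp
// Reference model of the uniform-value fast path: with value v uniform
// across N = popcount(exec) active lanes,
//   add -> v * N,  sub -> (0 - v) * N,  xor -> v * (N & 1).
#include <bit>
#include <cassert>
#include <cstdint>

uint32_t reduce_add(uint32_t v, uint64_t exec) {
  return v * (uint32_t)std::popcount(exec); // S_BCNT1 + S_MUL_I32
}
uint32_t reduce_sub(uint32_t v, uint64_t exec) {
  return (0u - v) * (uint32_t)std::popcount(exec); // S_SUB_I32 0, v; then mul
}
uint32_t reduce_xor(uint32_t v, uint64_t exec) {
  return v * (uint32_t)(std::popcount(exec) & 1); // parity via S_AND_B32 ..., 1
}

int main() {
  uint64_t exec = 0x00000000FF00FF0Full; // 20 active lanes
  assert(reduce_add(3, exec) == 60);
  assert(reduce_sub(3, exec) == (uint32_t)-60);
  assert(reduce_xor(3, exec) == 0);              // even lane count -> 0
  assert(reduce_xor(3, exec | 1ull << 63) == 3); // odd lane count -> v
}
```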
@@ -5212,6 +5217,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1H_Op0L_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1L_Op0H_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValLo =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValHi =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+        const TargetRegisterClass *Src1SubRC =
+            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+        MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+        MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+              .addImm(0)
+              .addReg(NewAccumulator->getOperand(0).getReg());
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+              .addReg(NegatedValLo)
+              .addImm(31)
+              .setOperandDead(3); // Dead scc
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+              .add(Op1L)
+              .addReg(NegatedValHi);
+        }
+        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
+                                 ? NegatedValLo
+                                 : NewAccumulator->getOperand(0).getReg();
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+            .add(Op1H)
+            .addReg(LowOpcode);
+
+        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+            .addReg(CarryReg)
+            .addReg(Op1H_Op0L_Reg)
+            .setOperandDead(3); // Dead scc
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(HiVal)
+              .addReg(Op1L_Op0H_Reg)
+              .setOperandDead(3); // Dead scc
+        }
+        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+            .addReg(DestSub0)
+            .addImm(AMDGPU::sub0)
+            .addReg(DestSub1)
+            .addImm(AMDGPU::sub1);
+        break;
+      }
       }
       RetBB = &BB;
     }
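The 64-bit fast path above is the same uniform-value idea, but the product value × lane-count has to be assembled from 32-bit scalar multiplies, and the sub flavor first sign-extends the negated count to 64 bits and adds the extra cross term. A standalone model of that decomposition (helper names like `mul64_by_count` are mine, not the patch's; the arithmetic mirrors the emitted S_MUL_I32 / S_MUL_HI_U32 / S_ADD_U32 sequence):

```cpp
#include <cassert>
#include <cstdint>

// 64-bit value times 32-bit lane count, built from 32-bit pieces.
uint64_t mul64_by_count(uint64_t v, uint32_t count) {
  uint32_t lo = (uint32_t)v, hi = (uint32_t)(v >> 32);
  uint32_t d0 = lo * count;                                  // S_MUL_I32 (DestSub0)
  uint32_t carry = (uint32_t)(((uint64_t)lo * count) >> 32); // S_MUL_HI_U32 (CarryReg)
  uint32_t d1 = carry + hi * count;                          // S_MUL_I32 + S_ADD_U32
  return ((uint64_t)d1 << 32) | d0;
}

uint64_t reduce_add64(uint64_t v, uint32_t n) { return mul64_by_count(v, n); }

uint64_t reduce_sub64(uint64_t v, uint32_t n) {
  // Negate the count to 64 bits: NegLo = 0 - n (S_SUB_I32), NegHi is the
  // arithmetic shift of NegLo by 31 (S_ASHR_I32); then add the cross term
  // lo32(v) * NegHi (Op1L_Op0H_Reg) into the high half.
  uint32_t neg_lo = 0u - n;
  uint32_t neg_hi = (uint32_t)((int32_t)neg_lo >> 31);
  uint64_t d = mul64_by_count(v, neg_lo);
  uint32_t d1 = (uint32_t)(d >> 32) + (uint32_t)v * neg_hi;
  return ((uint64_t)d1 << 32) | (uint32_t)d;
}

int main() {
  uint64_t v = 0x100000005ULL;
  assert(reduce_add64(v, 7) == v * 7);
  assert(reduce_sub64(v, 7) == 0 - v * 7); // modulo 2^64
}
```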
@@ -5377,6 +5450,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
               .addReg(Accumulator->getOperand(0).getReg());
           break;
         }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
+                                                           : AMDGPU::S_SUB_U32;
+        unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
+                                                           : AMDGPU::S_SUBB_U32;
+        Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
+            MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
+            &AMDGPU::SReg_32RegClass);
+        MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
+            MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
+            &AMDGPU::SReg_32RegClass);
+        BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
+            .add(Accumlo)
+            .addReg(LaneValueLo->getOperand(0).getReg());
+        BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
+            .add(Accumhi)
+            .addReg(LaneValueHi->getOperand(0).getReg());
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+                             .addReg(DestLo)
+                             .addImm(AMDGPU::sub0)
+                             .addReg(DestHi)
+                             .addImm(AMDGPU::sub1);
+        break;
+      }
       }
     }
     // Manipulate the iterator to get the next active lane
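In the divergent path, the hunk above accumulates one lane per ComputeLoop iteration, pairing `S_ADD_U32`/`S_ADDC_U32` (or `S_SUB_U32`/`S_SUBB_U32`) so the carry produced by the low halves flows into the high halves through SCC. A small model of that carry chain (a sketch; helper names are illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Per-iteration 64-bit accumulate, split into 32-bit halves: the low half
// sets the carry/borrow (SCC), which the high half consumes.
uint64_t add64_chain(uint64_t a, uint64_t b) {
  uint32_t alo = (uint32_t)a, ahi = (uint32_t)(a >> 32);
  uint32_t blo = (uint32_t)b, bhi = (uint32_t)(b >> 32);
  uint32_t lo = alo + blo;       // S_ADD_U32, sets SCC on carry-out
  uint32_t scc = lo < alo;       // carry out of the low half
  uint32_t hi = ahi + bhi + scc; // S_ADDC_U32, consumes SCC
  return ((uint64_t)hi << 32) | lo;
}

uint64_t sub64_chain(uint64_t a, uint64_t b) {
  uint32_t alo = (uint32_t)a, ahi = (uint32_t)(a >> 32);
  uint32_t blo = (uint32_t)b, bhi = (uint32_t)(b >> 32);
  uint32_t lo = alo - blo;       // S_SUB_U32, sets SCC on borrow-out
  uint32_t scc = alo < blo;      // borrow out of the low half
  uint32_t hi = ahi - bhi - scc; // S_SUBB_U32, consumes SCC
  return ((uint64_t)hi << 32) | lo;
}

int main() {
  assert(add64_chain(0xFFFFFFFFULL, 1) == 0x100000000ULL);
  assert(sub64_chain(0x100000000ULL, 1) == 0xFFFFFFFFULL);
}
```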
@@ -5432,8 +5533,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32: