@@ -5107,7 +5107,9 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
51075107 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
51085108 return std::numeric_limits<int32_t>::min();
51095109 case AMDGPU::S_ADD_I32:
5110+ case AMDGPU::S_ADD_U64_PSEUDO:
51105111 case AMDGPU::S_SUB_I32:
5112+ case AMDGPU::S_SUB_U64_PSEUDO:
51115113 case AMDGPU::S_OR_B32:
51125114 case AMDGPU::S_XOR_B32:
51135115 return std::numeric_limits<uint32_t>::min();
@@ -5153,51 +5155,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51535155 }
51545156 case AMDGPU::S_XOR_B32:
51555157 case AMDGPU::S_ADD_I32:
5156- case AMDGPU::S_SUB_I32: {
5158+ case AMDGPU::S_ADD_U64_PSEUDO:
5159+ case AMDGPU::S_SUB_I32:
5160+ case AMDGPU::S_SUB_U64_PSEUDO: {
51575161 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
51585162 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
51595163 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5160- Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5164+ Register ActiveLanes =
5165+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
51615166
51625167 bool IsWave32 = ST.isWave32();
51635168 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
51645169 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
51655170 unsigned CountReg =
51665171 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
51675172
5168- auto Exec =
51695173 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
51705174
5171- auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5172- .addReg(Exec->getOperand(0).getReg());
5175+ auto NewAccumulator =
5176+ BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5177+ .addReg(ExecMask);
5178+
5179+ switch (Opc) {
5180+ case AMDGPU::S_XOR_B32: {
5181+ // Performing an XOR operation on a uniform value
5182+ // depends on the parity of the number of active lanes.
5183+ // For even parity, the result will be 0, for odd
5184+ // parity the result will be the same as the input value.
5185+ Register ParityRegister =
5186+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
51735187
5174- switch (Opc) {
5175- case AMDGPU::S_XOR_B32: {
5176- // Performing an XOR operation on a uniform value
5177- // depends on the parity of the number of active lanes.
5178- // For even parity, the result will be 0, for odd
5179- // parity the result will be the same as the input value.
5180- Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5181-
5182- auto ParityReg =
51835188 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
51845189 .addReg(NewAccumulator->getOperand(0).getReg())
5185- .addImm(1);
5186- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5187- .addReg(SrcReg)
5188- .addReg(ParityReg->getOperand(0).getReg());
5189- break;
5190- }
5190+ .addImm(1)
5191+ .setOperandDead(3); // Dead scc
5192+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5193+ .addReg(SrcReg)
5194+ .addReg(ParityRegister);
5195+ break;
5196+ }
51915197 case AMDGPU::S_SUB_I32: {
51925198 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
51935199
51945200 // Take the negation of the source operand.
5195- auto InvertedValReg =
5196- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5197- .addImm(-1)
5198- .addReg(SrcReg);
5201+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5202+ .addImm(0)
5203+ .addReg(SrcReg);
51995204 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5200- .addReg(InvertedValReg->getOperand(0).getReg() )
5205+ .addReg(NegatedVal )
52015206 .addReg(NewAccumulator->getOperand(0).getReg());
52025207 break;
52035208 }
@@ -5207,6 +5212,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
52075212 .addReg(NewAccumulator->getOperand(0).getReg());
52085213 break;
52095214 }
5215+ case AMDGPU::S_ADD_U64_PSEUDO:
5216+ case AMDGPU::S_SUB_U64_PSEUDO: {
5217+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5218+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5219+ Register Op1H_Op0L_Reg =
5220+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5221+ Register Op1L_Op0H_Reg =
5222+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5223+ Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5224+ Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5225+ Register NegatedValLo =
5226+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5227+ Register NegatedValHi =
5228+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5229+
5230+ const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5231+ const TargetRegisterClass *Src1SubRC =
5232+ TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5233+
5234+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5235+ MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5236+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5237+ MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5238+
5239+ if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5240+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5241+ .addImm(0)
5242+ .addReg(NewAccumulator->getOperand(0).getReg());
5243+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5244+ .addReg(NegatedValLo)
5245+ .addImm(31)
5246+ .setOperandDead(3); // Dead scc
5247+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5248+ .add(Op1L)
5249+ .addReg(NegatedValHi);
5250+ }
5251+ Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5252+ ? NegatedValLo
5253+ : NewAccumulator->getOperand(0).getReg();
5254+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5255+ .add(Op1L)
5256+ .addReg(LowOpcode);
5257+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5258+ .add(Op1L)
5259+ .addReg(LowOpcode);
5260+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5261+ .add(Op1H)
5262+ .addReg(LowOpcode);
5263+
5264+ Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5265+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5266+ .addReg(CarryReg)
5267+ .addReg(Op1H_Op0L_Reg)
5268+ .setOperandDead(3); // Dead scc
5269+
5270+ if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5271+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5272+ .addReg(HiVal)
5273+ .addReg(Op1L_Op0H_Reg)
5274+ .setOperandDead(3); // Dead scc
5275+ }
5276+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5277+ .addReg(DestSub0)
5278+ .addImm(AMDGPU::sub0)
5279+ .addReg(DestSub1)
5280+ .addImm(AMDGPU::sub1);
5281+ break;
5282+ }
52105283 }
52115284 RetBB = &BB;
52125285 }
@@ -5374,6 +5447,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53745447 .addReg(Accumulator->getOperand(0).getReg());
53755448 break;
53765449 }
5450+ case ::AMDGPU::S_ADD_U64_PSEUDO:
5451+ case ::AMDGPU::S_SUB_U64_PSEUDO: {
5452+ unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
5453+ : AMDGPU::S_SUB_U32;
5454+ unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
5455+ : AMDGPU::S_SUBB_U32;
5456+ Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5457+ Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5458+ MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5459+ MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5460+ &AMDGPU::SReg_32RegClass);
5461+ MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5462+ MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5463+ &AMDGPU::SReg_32RegClass);
5464+ BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
5465+ .add(Accumlo)
5466+ .addReg(LaneValueLo->getOperand(0).getReg());
5467+ BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
5468+ .add(Accumhi)
5469+ .addReg(LaneValueHi->getOperand(0).getReg());
5470+ NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5471+ TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5472+ .addReg(DestLo)
5473+ .addImm(AMDGPU::sub0)
5474+ .addReg(DestHi)
5475+ .addImm(AMDGPU::sub1);
5476+ break;
5477+ }
53775478 }
53785479 }
53795480 // Manipulate the iterator to get the next active lane
@@ -5429,8 +5530,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
54295530 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
54305531 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
54315532 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5533+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5534+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
54325535 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
54335536 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5537+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5538+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
54345539 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
54355540 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
54365541 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
0 commit comments