@@ -5372,39 +5372,39 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53725372 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
53735373 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
53745374 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5375- Register ActiveLanes =
5375+ Register NumActiveLanes =
53765376 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
53775377
53785378 bool IsWave32 = ST.isWave32();
53795379 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
53805380 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5381- unsigned CountReg =
5381+ unsigned BitCountOpc =
53825382 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
53835383
5384- BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5385-
5386- auto NewAccumulator =
5387- BuildMI(BB, MI, DL, TII->get(CountReg ), ActiveLanes )
5388- .addReg(ExecMask);
5389-
5390- switch (Opc) {
5391- case AMDGPU::S_XOR_B32: {
5392- // Performing an XOR operation on a uniform value
5393- // depends on the parity of the number of active lanes.
5394- // For even parity, the result will be 0, for odd
5395- // parity the result will be the same as the input value.
5396- Register ParityRegister =
5397- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5398-
5399- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5400- .addReg(NewAccumulator->getOperand(0).getReg())
5401- .addImm(1)
5402- .setOperandDead(3); // Dead scc
5403- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5404- .addReg(SrcReg)
5405- .addReg(ParityRegister);
5406- break;
5407- }
5384+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5385+
5386+ auto NewAccumulator =
5387+ BuildMI(BB, MI, DL, TII->get(BitCountOpc ), NumActiveLanes )
5388+ .addReg(ExecMask);
5389+
5390+ switch (Opc) {
5391+ case AMDGPU::S_XOR_B32: {
5392+ // Performing an XOR operation on a uniform value
5393+ // depends on the parity of the number of active lanes.
5394+ // For even parity, the result will be 0, for odd
5395+ // parity the result will be the same as the input value.
5396+ Register ParityRegister =
5397+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5398+
5399+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5400+ .addReg(NewAccumulator->getOperand(0).getReg())
5401+ .addImm(1)
5402+ .setOperandDead(3); // Dead scc
5403+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5404+ .addReg(SrcReg)
5405+ .addReg(ParityRegister);
5406+ break;
5407+ }
54085408 case AMDGPU::S_SUB_I32: {
54095409 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
54105410
@@ -5637,8 +5637,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56375637 .addReg(Accumulator->getOperand(0).getReg());
56385638 break;
56395639 }
5640- case :: AMDGPU::S_ADD_U64_PSEUDO:
5641- case :: AMDGPU::S_SUB_U64_PSEUDO: {
5640+ case AMDGPU::S_ADD_U64_PSEUDO:
5641+ case AMDGPU::S_SUB_U64_PSEUDO: {
56425642 unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
56435643 : AMDGPU::S_SUB_U32;
56445644 unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
0 commit comments