@@ -5363,39 +5363,39 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53635363 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
53645364 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
53655365 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5366- Register ActiveLanes =
5366+ Register NumActiveLanes =
53675367 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
53685368
53695369 bool IsWave32 = ST.isWave32();
53705370 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
53715371 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5372- unsigned CountReg =
5372+ unsigned BitCountOpc =
53735373 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
53745374
5375- BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5376-
5377- auto NewAccumulator =
5378- BuildMI(BB, MI, DL, TII->get(CountReg ), ActiveLanes )
5379- .addReg(ExecMask);
5380-
5381- switch (Opc) {
5382- case AMDGPU::S_XOR_B32: {
5383- // Performing an XOR operation on a uniform value
5384- // depends on the parity of the number of active lanes.
5385- // For even parity, the result will be 0, for odd
5386- // parity the result will be the same as the input value.
5387- Register ParityRegister =
5388- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5389-
5390- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5391- .addReg(NewAccumulator->getOperand(0).getReg())
5392- .addImm(1)
5393- .setOperandDead(3); // Dead scc
5394- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5395- .addReg(SrcReg)
5396- .addReg(ParityRegister);
5397- break;
5398- }
5375+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5376+
5377+ auto NewAccumulator =
5378+ BuildMI(BB, MI, DL, TII->get(BitCountOpc ), NumActiveLanes )
5379+ .addReg(ExecMask);
5380+
5381+ switch (Opc) {
5382+ case AMDGPU::S_XOR_B32: {
5383+ // Performing an XOR operation on a uniform value
5384+ // depends on the parity of the number of active lanes.
5385+ // For even parity, the result will be 0, for odd
5386+ // parity the result will be the same as the input value.
5387+ Register ParityRegister =
5388+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5389+
5390+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5391+ .addReg(NewAccumulator->getOperand(0).getReg())
5392+ .addImm(1)
5393+ .setOperandDead(3); // Dead scc
5394+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5395+ .addReg(SrcReg)
5396+ .addReg(ParityRegister);
5397+ break;
5398+ }
53995399 case AMDGPU::S_SUB_I32: {
54005400 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
54015401
@@ -5628,8 +5628,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56285628 .addReg(Accumulator->getOperand(0).getReg());
56295629 break;
56305630 }
5631- case :: AMDGPU::S_ADD_U64_PSEUDO:
5632- case :: AMDGPU::S_SUB_U64_PSEUDO: {
5631+ case AMDGPU::S_ADD_U64_PSEUDO:
5632+ case AMDGPU::S_SUB_U64_PSEUDO: {
56335633 unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
56345634 : AMDGPU::S_SUB_U32;
56355635 unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
0 commit comments