@@ -5285,39 +5285,39 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
52855285 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
52865286 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
52875287 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5288- Register ActiveLanes =
5288+ Register NumActiveLanes =
52895289 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
52905290
52915291 bool IsWave32 = ST.isWave32();
52925292 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
52935293 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5294- unsigned CountReg =
5294+ unsigned BitCountOpc =
52955295 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
52965296
5297- BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5298-
5299- auto NewAccumulator =
5300- BuildMI(BB, MI, DL, TII->get(CountReg ), ActiveLanes )
5301- .addReg(ExecMask);
5302-
5303- switch (Opc) {
5304- case AMDGPU::S_XOR_B32: {
5305- // Performing an XOR operation on a uniform value
5306- // depends on the parity of the number of active lanes.
5307- // For even parity, the result will be 0, for odd
5308- // parity the result will be the same as the input value.
5309- Register ParityRegister =
5310- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5311-
5312- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5313- .addReg(NewAccumulator->getOperand(0).getReg())
5314- .addImm(1)
5315- .setOperandDead(3); // Dead scc
5316- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5317- .addReg(SrcReg)
5318- .addReg(ParityRegister);
5319- break;
5320- }
5297+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5298+
5299+ auto NewAccumulator =
5300+ BuildMI(BB, MI, DL, TII->get(BitCountOpc ), NumActiveLanes )
5301+ .addReg(ExecMask);
5302+
5303+ switch (Opc) {
5304+ case AMDGPU::S_XOR_B32: {
5305+ // Performing an XOR operation on a uniform value
5306+ // depends on the parity of the number of active lanes.
5307+ // For even parity, the result will be 0, for odd
5308+ // parity the result will be the same as the input value.
5309+ Register ParityRegister =
5310+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5311+
5312+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5313+ .addReg(NewAccumulator->getOperand(0).getReg())
5314+ .addImm(1)
5315+ .setOperandDead(3); // Dead scc
5316+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5317+ .addReg(SrcReg)
5318+ .addReg(ParityRegister);
5319+ break;
5320+ }
53215321 case AMDGPU::S_SUB_I32: {
53225322 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
53235323
@@ -5552,8 +5552,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
55525552 .addReg(Accumulator->getOperand(0).getReg());
55535553 break;
55545554 }
5555- case :: AMDGPU::S_ADD_U64_PSEUDO:
5556- case :: AMDGPU::S_SUB_U64_PSEUDO: {
5555+ case AMDGPU::S_ADD_U64_PSEUDO:
5556+ case AMDGPU::S_SUB_U64_PSEUDO: {
55575557 unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
55585558 : AMDGPU::S_SUB_U32;
55595559 unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
0 commit comments