@@ -5166,39 +5166,39 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51665166 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
51675167 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
51685168 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5169- Register ActiveLanes =
5169+ Register NumActiveLanes =
51705170 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
51715171
51725172 bool IsWave32 = ST.isWave32();
51735173 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
51745174 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5175- unsigned CountReg =
5175+ unsigned BitCountOpc =
51765176 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
51775177
5178- BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5179-
5180- auto NewAccumulator =
5181- BuildMI(BB, MI, DL, TII->get(CountReg ), ActiveLanes )
5182- .addReg(ExecMask);
5183-
5184- switch (Opc) {
5185- case AMDGPU::S_XOR_B32: {
5186- // Performing an XOR operation on a uniform value
5187- // depends on the parity of the number of active lanes.
5188- // For even parity, the result will be 0, for odd
5189- // parity the result will be the same as the input value.
5190- Register ParityRegister =
5191- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5192-
5193- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5194- .addReg(NewAccumulator->getOperand(0).getReg())
5195- .addImm(1)
5196- .setOperandDead(3); // Dead scc
5197- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5198- .addReg(SrcReg)
5199- .addReg(ParityRegister);
5200- break;
5201- }
5178+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5179+
5180+ auto NewAccumulator =
5181+ BuildMI(BB, MI, DL, TII->get(BitCountOpc ), NumActiveLanes )
5182+ .addReg(ExecMask);
5183+
5184+ switch (Opc) {
5185+ case AMDGPU::S_XOR_B32: {
5186+ // Performing an XOR operation on a uniform value
5187+ // depends on the parity of the number of active lanes.
5188+ // For even parity, the result will be 0, for odd
5189+ // parity the result will be the same as the input value.
5190+ Register ParityRegister =
5191+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5192+
5193+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5194+ .addReg(NewAccumulator->getOperand(0).getReg())
5195+ .addImm(1)
5196+ .setOperandDead(3); // Dead scc
5197+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5198+ .addReg(SrcReg)
5199+ .addReg(ParityRegister);
5200+ break;
5201+ }
52025202 case AMDGPU::S_SUB_I32: {
52035203 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
52045204
@@ -5450,8 +5450,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54505450 .addReg(Accumulator->getOperand(0).getReg());
54515451 break;
54525452 }
5453- case :: AMDGPU::S_ADD_U64_PSEUDO:
5454- case :: AMDGPU::S_SUB_U64_PSEUDO: {
5453+ case AMDGPU::S_ADD_U64_PSEUDO:
5454+ case AMDGPU::S_SUB_U64_PSEUDO: {
54555455 unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
54565456 : AMDGPU::S_SUB_U32;
54575457 unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
0 commit comments