@@ -5161,39 +5161,39 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51615161 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
51625162 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
51635163 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5164- Register ActiveLanes =
5164+ Register NumActiveLanes =
51655165 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
51665166
51675167 bool IsWave32 = ST.isWave32();
51685168 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
51695169 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5170- unsigned CountReg =
5170+ unsigned BitCountOpc =
51715171 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
51725172
5173- BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5174-
5175- auto NewAccumulator =
5176- BuildMI(BB, MI, DL, TII->get(CountReg ), ActiveLanes )
5177- .addReg(ExecMask);
5178-
5179- switch (Opc) {
5180- case AMDGPU::S_XOR_B32: {
5181- // Performing an XOR operation on a uniform value
5182- // depends on the parity of the number of active lanes.
5183- // For even parity, the result will be 0, for odd
5184- // parity the result will be the same as the input value.
5185- Register ParityRegister =
5186- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5187-
5188- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5189- .addReg(NewAccumulator->getOperand(0).getReg())
5190- .addImm(1)
5191- .setOperandDead(3); // Dead scc
5192- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5193- .addReg(SrcReg)
5194- .addReg(ParityRegister);
5195- break;
5196- }
5173+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5174+
5175+ auto NewAccumulator =
5176+ BuildMI(BB, MI, DL, TII->get(BitCountOpc ), NumActiveLanes )
5177+ .addReg(ExecMask);
5178+
5179+ switch (Opc) {
5180+ case AMDGPU::S_XOR_B32: {
5181+ // Performing an XOR operation on a uniform value
5182+ // depends on the parity of the number of active lanes.
5183+ // For even parity, the result will be 0, for odd
5184+ // parity the result will be the same as the input value.
5185+ Register ParityRegister =
5186+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5187+
5188+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5189+ .addReg(NewAccumulator->getOperand(0).getReg())
5190+ .addImm(1)
5191+ .setOperandDead(3); // Dead scc
5192+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5193+ .addReg(SrcReg)
5194+ .addReg(ParityRegister);
5195+ break;
5196+ }
51975197 case AMDGPU::S_SUB_I32: {
51985198 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
51995199
@@ -5447,8 +5447,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54475447 .addReg(Accumulator->getOperand(0).getReg());
54485448 break;
54495449 }
5450- case :: AMDGPU::S_ADD_U64_PSEUDO:
5451- case :: AMDGPU::S_SUB_U64_PSEUDO: {
5450+ case AMDGPU::S_ADD_U64_PSEUDO:
5451+ case AMDGPU::S_SUB_U64_PSEUDO: {
54525452 unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
54535453 : AMDGPU::S_SUB_U32;
54545454 unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
0 commit comments