@@ -5277,39 +5277,39 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
52775277 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
52785278 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
52795279 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5280- Register ActiveLanes =
5280+ Register NumActiveLanes =
52815281 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
52825282
52835283 bool IsWave32 = ST.isWave32();
52845284 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
52855285 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5286- unsigned CountReg =
5286+ unsigned BitCountOpc =
52875287 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
52885288
5289- BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5290-
5291- auto NewAccumulator =
5292- BuildMI(BB, MI, DL, TII->get(CountReg ), ActiveLanes )
5293- .addReg(ExecMask);
5294-
5295- switch (Opc) {
5296- case AMDGPU::S_XOR_B32: {
5297- // Performing an XOR operation on a uniform value
5298- // depends on the parity of the number of active lanes.
5299- // For even parity, the result will be 0, for odd
5300- // parity the result will be the same as the input value.
5301- Register ParityRegister =
5302- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5303-
5304- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5305- .addReg(NewAccumulator->getOperand(0).getReg())
5306- .addImm(1)
5307- .setOperandDead(3); // Dead scc
5308- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5309- .addReg(SrcReg)
5310- .addReg(ParityRegister);
5311- break;
5312- }
5289+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5290+
5291+ auto NewAccumulator =
5292+ BuildMI(BB, MI, DL, TII->get(BitCountOpc ), NumActiveLanes )
5293+ .addReg(ExecMask);
5294+
5295+ switch (Opc) {
5296+ case AMDGPU::S_XOR_B32: {
5297+ // Performing an XOR operation on a uniform value
5298+ // depends on the parity of the number of active lanes.
5299+ // For even parity, the result will be 0, for odd
5300+ // parity the result will be the same as the input value.
5301+ Register ParityRegister =
5302+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5303+
5304+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5305+ .addReg(NewAccumulator->getOperand(0).getReg())
5306+ .addImm(1)
5307+ .setOperandDead(3); // Dead scc
5308+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5309+ .addReg(SrcReg)
5310+ .addReg(ParityRegister);
5311+ break;
5312+ }
53135313 case AMDGPU::S_SUB_I32: {
53145314 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
53155315
@@ -5548,8 +5548,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
55485548 .addReg(Accumulator->getOperand(0).getReg());
55495549 break;
55505550 }
5551- case :: AMDGPU::S_ADD_U64_PSEUDO:
5552- case :: AMDGPU::S_SUB_U64_PSEUDO: {
5551+ case AMDGPU::S_ADD_U64_PSEUDO:
5552+ case AMDGPU::S_SUB_U64_PSEUDO: {
55535553 unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
55545554 : AMDGPU::S_SUB_U32;
55555555 unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
0 commit comments