@@ -5351,55 +5351,55 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53515351 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
53525352 .addReg(ExecMask);
53535353
5354- switch (Opc) {
5355- case AMDGPU::S_XOR_B32:
5356- case AMDGPU::S_XOR_B64: {
5357- // Performing an XOR operation on a uniform value
5358- // depends on the parity of the number of active lanes.
5359- // For even parity, the result will be 0, for odd
5360- // parity the result will be the same as the input value.
5361- Register ParityRegister =
5362- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5363-
5364- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5365- .addReg(NewAccumulator->getOperand(0).getReg())
5366- .addImm(1)
5367- .setOperandDead(3); // Dead scc
5368- if (is32BitOpc) {
5369- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5370- .addReg(SrcReg)
5371- .addReg(ParityRegister);
5372- } else {
5373- Register DestSub0 =
5374- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5375- Register DestSub1 =
5376- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5377-
5378- const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5379- const TargetRegisterClass *SrcSubRC =
5380- TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5381-
5382- MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5383- MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5384- MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5385- MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5386-
5387- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5388- .add(Op1L)
5389- .addReg(ParityRegister);
5390-
5391- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5392- .add(Op1H)
5393- .addReg(ParityRegister);
5394-
5395- BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5396- .addReg(DestSub0)
5397- .addImm(AMDGPU::sub0)
5398- .addReg(DestSub1)
5399- .addImm(AMDGPU::sub1);
5400- }
5401- break;
5402- }
5354+ switch (Opc) {
5355+ case AMDGPU::S_XOR_B32:
5356+ case AMDGPU::S_XOR_B64: {
5357+ // Performing an XOR operation on a uniform value
5358+ // depends on the parity of the number of active lanes.
5359+ // For even parity, the result will be 0, for odd
5360+ // parity the result will be the same as the input value.
5361+ Register ParityRegister =
5362+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5363+
5364+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5365+ .addReg(NewAccumulator->getOperand(0).getReg())
5366+ .addImm(1)
5367+ .setOperandDead(3); // Dead scc
5368+ if (is32BitOpc) {
5369+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5370+ .addReg(SrcReg)
5371+ .addReg(ParityRegister);
5372+ } else {
5373+ Register DestSub0 =
5374+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5375+ Register DestSub1 =
5376+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5377+
5378+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5379+ const TargetRegisterClass *SrcSubRC =
5380+ TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5381+
5382+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5383+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5384+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5385+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5386+
5387+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5388+ .add(Op1L)
5389+ .addReg(ParityRegister);
5390+
5391+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5392+ .add(Op1H)
5393+ .addReg(ParityRegister);
5394+
5395+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5396+ .addReg(DestSub0)
5397+ .addImm(AMDGPU::sub0)
5398+ .addReg(DestSub1)
5399+ .addImm(AMDGPU::sub1);
5400+ }
5401+ break;
5402+ }
54035403 case AMDGPU::S_SUB_I32: {
54045404 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
54055405
0 commit comments