@@ -5182,55 +5182,55 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51825182 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
51835183 .addReg(ExecMask);
51845184
5185- switch (Opc) {
5186- case AMDGPU::S_XOR_B32:
5187- case AMDGPU::S_XOR_B64: {
5188- // Performing an XOR operation on a uniform value
5189- // depends on the parity of the number of active lanes.
5190- // For even parity, the result will be 0, for odd
5191- // parity the result will be the same as the input value.
5192- Register ParityRegister =
5193- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5194-
5195- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5196- .addReg(NewAccumulator->getOperand(0).getReg())
5197- .addImm(1)
5198- .setOperandDead(3); // Dead scc
5199- if (is32BitOpc) {
5200- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5201- .addReg(SrcReg)
5202- .addReg(ParityRegister);
5203- } else {
5204- Register DestSub0 =
5205- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5206- Register DestSub1 =
5207- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5208-
5209- const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5210- const TargetRegisterClass *SrcSubRC =
5211- TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5212-
5213- MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5214- MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5215- MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5216- MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5217-
5218- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5219- .add(Op1L)
5220- .addReg(ParityRegister);
5221-
5222- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5223- .add(Op1H)
5224- .addReg(ParityRegister);
5225-
5226- BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5227- .addReg(DestSub0)
5228- .addImm(AMDGPU::sub0)
5229- .addReg(DestSub1)
5230- .addImm(AMDGPU::sub1);
5231- }
5232- break;
5233- }
5185+ switch (Opc) {
5186+ case AMDGPU::S_XOR_B32:
5187+ case AMDGPU::S_XOR_B64: {
5188+ // Performing an XOR operation on a uniform value
5189+ // depends on the parity of the number of active lanes.
5190+ // For even parity, the result will be 0, for odd
5191+ // parity the result will be the same as the input value.
5192+ Register ParityRegister =
5193+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5194+
5195+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5196+ .addReg(NewAccumulator->getOperand(0).getReg())
5197+ .addImm(1)
5198+ .setOperandDead(3); // Dead scc
5199+ if (is32BitOpc) {
5200+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5201+ .addReg(SrcReg)
5202+ .addReg(ParityRegister);
5203+ } else {
5204+ Register DestSub0 =
5205+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5206+ Register DestSub1 =
5207+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5208+
5209+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5210+ const TargetRegisterClass *SrcSubRC =
5211+ TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5212+
5213+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5214+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5215+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5216+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5217+
5218+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5219+ .add(Op1L)
5220+ .addReg(ParityRegister);
5221+
5222+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5223+ .add(Op1H)
5224+ .addReg(ParityRegister);
5225+
5226+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5227+ .addReg(DestSub0)
5228+ .addImm(AMDGPU::sub0)
5229+ .addReg(DestSub1)
5230+ .addImm(AMDGPU::sub1);
5231+ }
5232+ break;
5233+ }
52345234 case AMDGPU::S_SUB_I32: {
52355235 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
52365236
0 commit comments