@@ -5111,9 +5111,12 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
   case AMDGPU::S_SUB_I32:
   case AMDGPU::S_SUB_U64_PSEUDO:
   case AMDGPU::S_OR_B32:
+  case AMDGPU::S_OR_B64:
   case AMDGPU::S_XOR_B32:
+  case AMDGPU::S_XOR_B64:
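+    // 0 is the identity for sub, or, and xor: x op 0 == x.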
     return std::numeric_limits<uint32_t>::min();
   case AMDGPU::S_AND_B32:
+  case AMDGPU::S_AND_B64:
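+    // All-ones is the identity for and: x & ~0u == x.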
     return std::numeric_limits<uint32_t>::max();
   default:
     llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
@@ -5146,14 +5149,17 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     case AMDGPU::S_MAX_I32:
     case AMDGPU::V_CMP_GT_I64_e64: /*max*/
     case AMDGPU::S_AND_B32:
-    case AMDGPU::S_OR_B32: {
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B32:
+    case AMDGPU::S_OR_B64: {
       // Idempotent operations.
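+      // Reducing a uniform value with an idempotent op yields the value
+      // itself, so the reduction collapses to a single copy.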
       unsigned movOpc = is32BitOpc ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
       BuildMI(BB, MI, DL, TII->get(movOpc), DstReg).addReg(SrcReg);
       RetBB = &BB;
       break;
     }
     case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_XOR_B64:
     case AMDGPU::S_ADD_I32:
     case AMDGPU::S_ADD_U64_PSEUDO:
     case AMDGPU::S_SUB_I32:
@@ -5177,7 +5183,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
               .addReg(ExecMask);
 
       switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
+      case AMDGPU::S_XOR_B32:
+      case AMDGPU::S_XOR_B64: {
         // Performing an XOR operation on a uniform value
         // depends on the parity of the number of active lanes.
         // For even parity, the result will be 0, for odd
@@ -5189,10 +5196,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                 .addReg(NewAccumulator->getOperand(0).getReg())
                 .addImm(1)
                 .setOperandDead(3); // Dead scc
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityRegister);
-        break;
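+        // ParityRegister = ActiveLanes & 1; the result of the reduction is
+        // Src * Parity: 0 for an even number of active lanes, Src for odd.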
+        if (is32BitOpc) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(SrcReg)
+              .addReg(ParityRegister);
+          break;
+        } else {
+          Register DestSub0 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register DestSub1 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register Op1H_Op0L_Reg =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register CarryReg =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+          const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+          const TargetRegisterClass *SrcSubRC =
+              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+          MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+          MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+
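+          // Compute the 64-bit product Src * Parity in 32-bit halves
+          // (Parity is 0 or 1): lo = Op1L * P,
+          // hi = mulhi_u32(Op1L, P) + Op1H * P.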
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+              .add(Op1L)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+              .add(Op1H)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+              .add(Op1L)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(CarryReg)
+              .addReg(Op1H_Op0L_Reg)
+              .setOperandDead(3); // Dead scc
+
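+          // Reassemble the low and high halves into the 64-bit destination.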
+          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+              .addReg(DestSub0)
+              .addImm(AMDGPU::sub0)
+              .addReg(DestSub1)
+              .addImm(AMDGPU::sub1);
+          break;
+        }
       }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
@@ -5407,6 +5458,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
           .addReg(LaneValueHiReg)
           .addImm(AMDGPU::sub1);
       switch (Opc) {
+      case AMDGPU::S_OR_B64:
+      case AMDGPU::S_AND_B64:
+      case AMDGPU::S_XOR_B64: {
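+        // The 64-bit bitwise ops are single SALU instructions, so each
+        // lane's full 64-bit value folds straight into the accumulator.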
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                             .addReg(Accumulator->getOperand(0).getReg())
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .setOperandDead(3); // Dead scc
+        break;
+      }
       case AMDGPU::V_CMP_GT_I64_e64:
       case AMDGPU::V_CMP_GT_U64_e64:
       case AMDGPU::V_CMP_LT_I64_e64:
@@ -5538,10 +5598,16 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
   case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();