@@ -5356,7 +5356,11 @@ static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
     return std::numeric_limits<int64_t>::min();
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO:
+  case AMDGPU::S_OR_B64:
+  case AMDGPU::S_XOR_B64:
     return std::numeric_limits<uint64_t>::min();
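+  // AND is the one bitwise reduction whose identity is all-ones:
+  // x & ~0ull == x, whereas x | 0 == x and x ^ 0 == x.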
+  case AMDGPU::S_AND_B64:
+    return std::numeric_limits<uint64_t>::max();
   default:
     llvm_unreachable(
         "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
@@ -5398,16 +5402,19 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
       RetBB = &BB;
       break;
     }
-    case AMDGPU::V_CMP_LT_U64_e64:   // umin
-    case AMDGPU::V_CMP_LT_I64_e64:   // min
-    case AMDGPU::V_CMP_GT_U64_e64:   // umax
-    case AMDGPU::V_CMP_GT_I64_e64: { // max
+    case AMDGPU::V_CMP_LT_U64_e64: // umin
+    case AMDGPU::V_CMP_LT_I64_e64: // min
+    case AMDGPU::V_CMP_GT_U64_e64: // umax
+    case AMDGPU::V_CMP_GT_I64_e64: // max
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B64: {
       // Idempotent operations: reducing a uniform value with min/max/and/or
       // yields the value itself, so a single copy suffices.
       BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
       RetBB = &BB;
       break;
     }
     case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_XOR_B64:
     case AMDGPU::S_ADD_I32:
     case AMDGPU::S_ADD_U64_PSEUDO:
     case AMDGPU::S_SUB_I32:
@@ -5431,7 +5438,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                    .addReg(ExecMask);

       switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
+      case AMDGPU::S_XOR_B32:
+      case AMDGPU::S_XOR_B64: {
         // Performing an XOR operation on a uniform value
         // depends on the parity of the number of active lanes.
         // For even parity, the result will be 0, for odd
@@ -5443,9 +5451,39 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                 .addReg(NewAccumulator->getOperand(0).getReg())
                 .addImm(1)
                 .setOperandDead(3); // Dead scc
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityRegister);
+        if (Opc == AMDGPU::S_XOR_B32) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(SrcReg)
+              .addReg(ParityRegister);
+        } else {
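+          // 64-bit XOR: multiply each 32-bit half of the uniform source by
+          // the parity bit. ParityRegister is 0 or 1, so each multiply acts
+          // as a select and no carry can cross between the halves.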
+          Register DestSub0 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register DestSub1 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+          const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+          const TargetRegisterClass *SrcSubRC =
+              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
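+          // Split the 64-bit source operand into its two 32-bit halves.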
+          MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+          MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+              .add(Op1L)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
+              .add(Op1H)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+              .addReg(DestSub0)
+              .addImm(AMDGPU::sub0)
+              .addReg(DestSub1)
+              .addImm(AMDGPU::sub1);
+        }
         break;
       }
       case AMDGPU::S_SUB_I32: {
@@ -5643,6 +5681,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                            .addReg(LaneValueHiReg)
                            .addImm(AMDGPU::sub1);
       switch (Opc) {
+      case AMDGPU::S_OR_B64:
+      case AMDGPU::S_AND_B64:
+      case AMDGPU::S_XOR_B64: {
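+        // The scalar ALU has native 64-bit AND/OR/XOR, so each active
+        // lane's value folds directly into the 64-bit accumulator.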
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                             .addReg(Accumulator->getOperand(0).getReg())
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .setOperandDead(3); // Dead scc
+        break;
+      }
       case AMDGPU::V_CMP_GT_I64_e64:
       case AMDGPU::V_CMP_GT_U64_e64:
       case AMDGPU::V_CMP_LT_I64_e64:
@@ -5751,10 +5798,16 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
   case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();
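
A minimal host-side sketch of the uniform-input shortcuts above (the names
here are illustrative, not LLVM API): AND/OR of a value that is uniform
across the active lanes is idempotent and reduces to the value itself, while
XOR reduces to the value when the active-lane count is odd and to 0 when it
is even, which the lowering realizes as a per-half multiply by the parity
bit.

#include <cassert>
#include <cstdint>

// Reduce a value that is uniform across ActiveLanes active lanes.
static uint64_t reduceUniform(char Op, uint64_t Val, unsigned ActiveLanes) {
  switch (Op) {
  case '&': // idempotent: v & v & ... & v == v
  case '|': // idempotent: v | v | ... | v == v
    return Val;
  case '^': { // v ^ v ^ ... ^ v == (ActiveLanes odd) ? v : 0
    uint32_t Parity = ActiveLanes & 1; // mirrors ParityRegister (0 or 1)
    // Mirror of the per-half S_MUL_I32 lowering: a 0/1 multiplier acts as
    // a select on each 32-bit half and can never carry across halves.
    uint32_t Lo = static_cast<uint32_t>(Val) * Parity;
    uint32_t Hi = static_cast<uint32_t>(Val >> 32) * Parity;
    return (static_cast<uint64_t>(Hi) << 32) | Lo;
  }
  default:
    return 0;
  }
}

int main() {
  const uint64_t V = 0xDEADBEEFCAFEF00DULL;
  assert(reduceUniform('&', V, 32) == V);
  assert(reduceUniform('|', V, 32) == V);
  assert(reduceUniform('^', V, 3) == V);  // odd lane count keeps the value
  assert(reduceUniform('^', V, 32) == 0); // even lane count cancels out
  return 0;
}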