@@ -5273,12 +5273,16 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
 static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
   switch (Opc) {
   case AMDGPU::S_MIN_U32:
+  case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
     return std::numeric_limits<uint32_t>::max();
   case AMDGPU::S_MIN_I32:
+  case AMDGPU::V_CMP_LT_I64_e64: // min.i64
     return std::numeric_limits<int32_t>::max();
   case AMDGPU::S_MAX_U32:
+  case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
     return std::numeric_limits<uint32_t>::min();
   case AMDGPU::S_MAX_I32:
+  case AMDGPU::V_CMP_GT_I64_e64: // max.i64
     return std::numeric_limits<int32_t>::min();
   case AMDGPU::S_ADD_I32:
   case AMDGPU::S_SUB_I32:
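
For reference (not part of the patch), here is a standalone C++ check of the 64-bit identities that `lowerWaveReduce` later assembles from these 32-bit halves, the low word being all-ones for the min flavors and zero for the max flavors:

```cpp
// Standalone sanity check, not LLVM code: packing the 32-bit high half
// (this function's return value) with the matching low half yields the
// expected 64-bit identity for each reduction.
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  auto Pack = [](uint32_t Hi, uint32_t Lo) {
    return (uint64_t(Hi) << 32) | Lo;
  };
  assert(Pack(UINT32_MAX, UINT32_MAX) ==
         std::numeric_limits<uint64_t>::max());            // umin.u64
  assert(Pack(INT32_MAX, UINT32_MAX) ==
         uint64_t(std::numeric_limits<int64_t>::max()));   // min.i64
  assert(Pack(0, 0) == 0);                                 // umax.u64
  assert(Pack(uint32_t(INT32_MIN), 0) ==
         uint64_t(std::numeric_limits<int64_t>::min()));   // max.i64
  return 0;
}
```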
@@ -5306,16 +5310,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
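+  // Distinguish the 32-bit reductions from the new 64-bit ones by the width
+  // of the destination register.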
+  bool is32BitOpc = TRI->getRegSizeInBits(*MRI.getRegClass(DstReg)) == 32;
   if (isSGPR) {
     switch (Opc) {
     case AMDGPU::S_MIN_U32:
+    case AMDGPU::V_CMP_LT_U64_e64: /*umin*/
     case AMDGPU::S_MIN_I32:
+    case AMDGPU::V_CMP_LT_I64_e64: /*min*/
     case AMDGPU::S_MAX_U32:
+    case AMDGPU::V_CMP_GT_U64_e64: /*umax*/
     case AMDGPU::S_MAX_I32:
+    case AMDGPU::V_CMP_GT_I64_e64: /*max*/
     case AMDGPU::S_AND_B32:
     case AMDGPU::S_OR_B32: {
       // Idempotent operations.
-      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      unsigned movOpc = is32BitOpc ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      BuildMI(BB, MI, DL, TII->get(movOpc), DstReg).addReg(SrcReg);
       RetBB = &BB;
       break;
     }
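
A brief aside on why this fast path is sound (illustrative host code, not LLVM): every operator in this group is idempotent, so a wave-uniform input is already its own reduction and a single width-matched `s_mov` suffices:

```cpp
// Illustration only: each of these reductions satisfies op(x, x) == x,
// so reducing a value that is identical across all active lanes just
// returns that value.
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0xDEADBEEFCAFEF00Dull;
  assert(std::min(X, X) == X); // umin/min
  assert(std::max(X, X) == X); // umax/max
  assert((X & X) == X);        // and
  assert((X | X) == X);        // or
  return 0;
}
```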
@@ -5400,73 +5410,166 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
   const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
   Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-  Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
+  Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
   Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
   Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
   Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
-  Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
-  Register LaneValueReg =
-      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
 
   bool IsWave32 = ST.isWave32();
-  unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+  unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
   // Create the initial values of the induction variable from Exec and the
   // accumulator, then branch to the newly created ComputeLoop block.
-  uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
-  auto TmpSReg =
-      BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
-  BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
-      .addImm(InitalValue);
+  uint32_t IdentityValue = getIdentityValueForWaveReduction(Opc);
+  BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
+  if (is32BitOpc) {
+    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+        .addImm(IdentityValue);
+  } else {
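+    // The 64-bit identity is assembled from two 32-bit halves: the value
+    // computed above becomes the high word, and the switch below supplies
+    // the matching low word (all-ones for min, zero for max).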
+    Register Identitylo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register Identityhi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
+        .addImm(IdentityValue);
+    switch (Opc) {
+    case AMDGPU::V_CMP_LT_U64_e64:
+    case AMDGPU::V_CMP_LT_I64_e64:
+      IdentityValue = int32_t(-1); // u|min
+      break;
+    case AMDGPU::V_CMP_GT_U64_e64:
+    case AMDGPU::V_CMP_GT_I64_e64:
+      IdentityValue = int32_t(0); // u|max
+      break;
+    }
+    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
+        .addImm(IdentityValue);
+    BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), IdentityValReg)
+        .addReg(Identitylo)
+        .addImm(AMDGPU::sub0)
+        .addReg(Identityhi)
+        .addImm(AMDGPU::sub1);
+  }
   // clang-format off
   BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
       .addMBB(ComputeLoop);
   // clang-format on
 
   // Start constructing ComputeLoop
-  I = ComputeLoop->end();
+  I = ComputeLoop->begin();
   auto Accumulator =
       BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
-          .addReg(InitalValReg)
+          .addReg(IdentityValReg)
           .addMBB(&BB);
   auto ActiveBits =
       BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
-          .addReg(TmpSReg->getOperand(0).getReg())
+          .addReg(LoopIterator)
           .addMBB(&BB);
 
+  I = ComputeLoop->end();
+  MachineInstr *NewAccumulator;
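+  // Assigned on both the 32-bit and 64-bit paths below and fed back into
+  // the Accumulator PHI at the bottom of the loop.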
   // Perform the computations
   unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
-  auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
-                 .addReg(ActiveBits->getOperand(0).getReg());
-  auto LaneValue = BuildMI(*ComputeLoop, I, DL,
-                           TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
-                       .addReg(SrcReg)
-                       .addReg(FF1->getOperand(0).getReg());
-  auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
-                            .addReg(Accumulator->getOperand(0).getReg())
-                            .addReg(LaneValue->getOperand(0).getReg());
-
+  BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+      .addReg(ActiveBitsReg);
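+  // FF1 yields the index of the lowest set bit in the remaining-lanes mask,
+  // i.e. the next active lane to visit.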
+  if (is32BitOpc) {
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+            LaneValueReg)
+        .addReg(SrcReg)
+        .addReg(FF1Reg);
+    NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                         .addReg(Accumulator->getOperand(0).getReg())
+                         .addReg(LaneValueReg);
+  } else {
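+    // V_READLANE_B32 moves only 32 bits at a time, so the 64-bit lane value
+    // is read as two halves and reassembled with a REG_SEQUENCE.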
+    Register LaneValueLoReg =
+        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    Register LaneValueHiReg =
+        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+    const TargetRegisterClass *SrcSubRC =
+        TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+    MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+    MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+    // Lane value input should be in an SGPR.
+    MachineInstr *LaneValueLo =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                LaneValueLoReg)
+            .add(Op1L)
+            .addReg(FF1Reg);
+    MachineInstr *LaneValueHi =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                LaneValueHiReg)
+            .add(Op1H)
+            .addReg(FF1Reg);
+    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
+                             TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
+                         .addReg(LaneValueLoReg)
+                         .addImm(AMDGPU::sub0)
+                         .addReg(LaneValueHiReg)
+                         .addImm(AMDGPU::sub1);
+    switch (Opc) {
+    case AMDGPU::V_CMP_GT_I64_e64:
+    case AMDGPU::V_CMP_GT_U64_e64:
+    case AMDGPU::V_CMP_LT_I64_e64:
+    case AMDGPU::V_CMP_LT_U64_e64: {
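+      // There is no 64-bit scalar min/max instruction, so the reduction is
+      // a VALU compare against the accumulator followed by S_CSELECT_B64.
+      // VALU compares can read only a limited number of scalar operands,
+      // so the scalar accumulator is first copied into a 64-bit VGPR.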
+      Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+      Register ComparisonResultReg =
+          MRI.createVirtualRegister(WaveMaskRegClass);
+      const TargetRegisterClass *VregClass =
+          ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
+                                 : &AMDGPU::VReg_64RegClass;
+      const TargetRegisterClass *VSubRegClass =
+          TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
+      Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
+      MachineOperand SrcReg0Sub0 =
+          TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                       VregClass, AMDGPU::sub0, VSubRegClass);
+      MachineOperand SrcReg0Sub1 =
+          TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                       VregClass, AMDGPU::sub1, VSubRegClass);
+      BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+              AccumulatorVReg)
+          .add(SrcReg0Sub0)
+          .addImm(AMDGPU::sub0)
+          .add(SrcReg0Sub1)
+          .addImm(AMDGPU::sub1);
+      BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
+          .addReg(LaneValue->getOperand(0).getReg())
+          .addReg(AccumulatorVReg);
+
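+      // ANDing the (wave-uniform) compare result with the remaining-lanes
+      // mask sets SCC exactly when the current lane's value beats the
+      // accumulator; S_CSELECT_B64 then picks the winner.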
+      unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+      BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
+          .addReg(LaneMaskReg)
+          .addReg(ActiveBitsReg);
+
+      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                               TII->get(AMDGPU::S_CSELECT_B64), DstReg)
+                           .addReg(LaneValue->getOperand(0).getReg())
+                           .addReg(Accumulator->getOperand(0).getReg());
+      break;
+    }
+    }
+  }
   // Manipulate the iterator to get the next active lane
   unsigned BITSETOpc =
       IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
-  auto NewActiveBits =
-      BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
-          .addReg(FF1->getOperand(0).getReg())
-          .addReg(ActiveBits->getOperand(0).getReg());
+  BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+      .addReg(FF1Reg)
+      .addReg(ActiveBitsReg);
 
   // Add phi nodes
   Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
       .addMBB(ComputeLoop);
-  ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
-      .addMBB(ComputeLoop);
+  ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
 
   // Create the branch
   unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
   BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
-      .addReg(NewActiveBits->getOperand(0).getReg())
+      .addReg(NewActiveBitsReg)
       .addImm(0);
   BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
       .addMBB(ComputeLoop);
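
For intuition (again not part of the patch, and requiring C++20 for `<bit>`), ComputeLoop is the machine-level version of this host-side sketch, shown for the unsigned 64-bit minimum; the array and function names are made up for illustration:

```cpp
// Host-side model of ComputeLoop for umin.u64 (illustrative only).
#include <bit>
#include <cstdint>

uint64_t waveReduceUMin64(const uint64_t LaneVals[64], uint64_t Exec) {
  uint64_t Acc = ~0ull; // identity for umin.u64
  while (Exec) {
    unsigned Lane = std::countr_zero(Exec); // S_FF1_I32_B64
    uint64_t V = LaneVals[Lane];            // V_READLANE_B32, two halves
    Acc = V < Acc ? V : Acc;                // V_CMP_LT_U64 + S_CSELECT_B64
    Exec &= ~(1ull << Lane);                // S_BITSET0_B64
  }
  return Acc; // loop exits via S_CMP_LG_U64 + S_CBRANCH_SCC1
}
```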
@@ -5488,12 +5591,20 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
   case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
   case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32: