@@ -5095,12 +5095,16 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
 static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
   switch (Opc) {
   case AMDGPU::S_MIN_U32:
+  case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
     return std::numeric_limits<uint32_t>::max();
   case AMDGPU::S_MIN_I32:
+  case AMDGPU::V_CMP_LT_I64_e64: // min.i64
     return std::numeric_limits<int32_t>::max();
   case AMDGPU::S_MAX_U32:
+  case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
     return std::numeric_limits<uint32_t>::min();
   case AMDGPU::S_MAX_I32:
+  case AMDGPU::V_CMP_GT_I64_e64: // max.i64
     return std::numeric_limits<int32_t>::min();
   case AMDGPU::S_ADD_I32:
   case AMDGPU::S_SUB_I32:
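Note that the 64-bit cases reuse the 32-bit limits from this table as the *high* half of the identity and materialize the low half separately (see the REG_SEQUENCE path in lowerWaveReduce below). A quick standalone check of that splitting, as an editor's sketch rather than anything in the patch:

```cpp
#include <cassert>
#include <cstdint>
#include <limits>

// Sketch: joining the 32-bit table value (high half) with an all-ones low
// half for min, or an all-zeros low half for max, reproduces the 64-bit
// reduction identities. join() is a hypothetical helper for illustration.
int main() {
  auto join = [](uint32_t hi, uint32_t lo) {
    return (uint64_t(hi) << 32) | lo;
  };
  // umin.u64 identity: all ones.
  assert(join(std::numeric_limits<uint32_t>::max(), ~0u) ==
         std::numeric_limits<uint64_t>::max());
  // min.i64 identity: INT64_MAX == 0x7FFFFFFF'FFFFFFFF.
  assert(join(uint32_t(std::numeric_limits<int32_t>::max()), ~0u) ==
         uint64_t(std::numeric_limits<int64_t>::max()));
  // umax.u64 identity: zero.
  assert(join(0, 0) == 0);
  // max.i64 identity: INT64_MIN == 0x80000000'00000000.
  assert(join(uint32_t(std::numeric_limits<int32_t>::min()), 0) ==
         uint64_t(std::numeric_limits<int64_t>::min()));
  return 0;
}
```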
@@ -5128,16 +5132,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
+  bool is32BitOpc = TRI->getRegSizeInBits(*MRI.getRegClass(DstReg)) == 32;
   if (isSGPR) {
     switch (Opc) {
     case AMDGPU::S_MIN_U32:
+    case AMDGPU::V_CMP_LT_U64_e64: /*umin*/
     case AMDGPU::S_MIN_I32:
+    case AMDGPU::V_CMP_LT_I64_e64: /*min*/
     case AMDGPU::S_MAX_U32:
+    case AMDGPU::V_CMP_GT_U64_e64: /*umax*/
     case AMDGPU::S_MAX_I32:
+    case AMDGPU::V_CMP_GT_I64_e64: /*max*/
     case AMDGPU::S_AND_B32:
     case AMDGPU::S_OR_B32: {
       // Idempotent operations.
-      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      unsigned movOpc = is32BitOpc ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      BuildMI(BB, MI, DL, TII->get(movOpc), DstReg).addReg(SrcReg);
       RetBB = &BB;
       break;
     }
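Worth spelling out why a uniform (SGPR) input lets all of these collapse to a single S_MOV: every active lane carries the same value, and for idempotent operations op(v, v) == v, so reducing any number of copies of v yields v itself. A minimal sketch of that argument (illustrative names, not patch code):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

// Sketch: reducing a uniform value v over any number of active lanes with an
// idempotent op (min here; max/and/or behave the same) just returns v, which
// is why the SGPR path emits a plain move instead of a loop.
uint64_t reduceUniformUMin(uint64_t v, unsigned numActiveLanes) {
  uint64_t acc = v;
  for (unsigned lane = 1; lane < numActiveLanes; ++lane)
    acc = std::min(acc, v); // op(v, v) == v on every step
  return acc;
}

int main() {
  assert(reduceUniformUMin(42, 64) == 42); // whole-wave reduce == plain copy
  return 0;
}
```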
@@ -5222,73 +5232,166 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
     Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
+    Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
     Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
     Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
     Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
-    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
-    Register LaneValueReg =
-        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

     bool IsWave32 = ST.isWave32();
-    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

     // Create initial values of induction variable from Exec, Accumulator and
     // insert branch instr to newly created ComputeBlock
-    uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
-    auto TmpSReg =
-        BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
-    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
-        .addImm(InitalValue);
+    uint32_t IdentityValue = getIdentityValueForWaveReduction(Opc);
+    BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
+    if (is32BitOpc) {
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+          .addImm(IdentityValue);
+    } else {
+      Register Identitylo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      Register Identityhi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
+          .addImm(IdentityValue);
+      switch (Opc) {
+      case AMDGPU::V_CMP_LT_U64_e64:
+      case AMDGPU::V_CMP_LT_I64_e64:
+        IdentityValue = int32_t(-1); // u|min
+        break;
+      case AMDGPU::V_CMP_GT_U64_e64:
+      case AMDGPU::V_CMP_GT_I64_e64:
+        IdentityValue = int32_t(0); // u|max
+        break;
+      }
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
+          .addImm(IdentityValue);
+      BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), IdentityValReg)
+          .addReg(Identitylo)
+          .addImm(AMDGPU::sub0)
+          .addReg(Identityhi)
+          .addImm(AMDGPU::sub1);
+    }
     // clang-format off
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
         .addMBB(ComputeLoop);
     // clang-format on

     // Start constructing ComputeLoop
-    I = ComputeLoop->end();
+    I = ComputeLoop->begin();
     auto Accumulator =
         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
-            .addReg(InitalValReg)
+            .addReg(IdentityValReg)
             .addMBB(&BB);
     auto ActiveBits =
         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
-            .addReg(TmpSReg->getOperand(0).getReg())
+            .addReg(LoopIterator)
             .addMBB(&BB);

+    I = ComputeLoop->end();
+    MachineInstr *NewAccumulator;
     // Perform the computations
     unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
-    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
-                   .addReg(ActiveBits->getOperand(0).getReg());
-    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
-                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
-                         .addReg(SrcReg)
-                         .addReg(FF1->getOperand(0).getReg());
-    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
-                              .addReg(Accumulator->getOperand(0).getReg())
-                              .addReg(LaneValue->getOperand(0).getReg());
-
+    BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+        .addReg(ActiveBitsReg);
+    if (is32BitOpc) {
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+              LaneValueReg)
+          .addReg(SrcReg)
+          .addReg(FF1Reg);
+      NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                           .addReg(Accumulator->getOperand(0).getReg())
+                           .addReg(LaneValueReg);
+    } else {
+      Register LaneValueLoReg =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register LaneValueHiReg =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+      const TargetRegisterClass *SrcSubRC =
+          TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+      MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+      MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+      // Lane value input should be in an SGPR.
+      MachineInstr *LaneValueLo =
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                  LaneValueLoReg)
+              .add(Op1L)
+              .addReg(FF1Reg);
+      MachineInstr *LaneValueHi =
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                  LaneValueHiReg)
+              .add(Op1H)
+              .addReg(FF1Reg);
+      auto LaneValue = BuildMI(*ComputeLoop, I, DL,
+                               TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
+                           .addReg(LaneValueLoReg)
+                           .addImm(AMDGPU::sub0)
+                           .addReg(LaneValueHiReg)
+                           .addImm(AMDGPU::sub1);
+      switch (Opc) {
+      case AMDGPU::V_CMP_GT_I64_e64:
+      case AMDGPU::V_CMP_GT_U64_e64:
+      case AMDGPU::V_CMP_LT_I64_e64:
+      case AMDGPU::V_CMP_LT_U64_e64: {
+        Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+        Register ComparisonResultReg =
+            MRI.createVirtualRegister(WaveMaskRegClass);
+        const TargetRegisterClass *VregClass =
+            ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
+                                   : &AMDGPU::VReg_64RegClass;
+        const TargetRegisterClass *VSubRegClass =
+            TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
+        Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
+        MachineOperand SrcReg0Sub0 =
+            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                         VregClass, AMDGPU::sub0, VSubRegClass);
+        MachineOperand SrcReg0Sub1 =
+            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                         VregClass, AMDGPU::sub1, VSubRegClass);
+        BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+                AccumulatorVReg)
+            .add(SrcReg0Sub0)
+            .addImm(AMDGPU::sub0)
+            .add(SrcReg0Sub1)
+            .addImm(AMDGPU::sub1);
+        BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
+            .addReg(LaneValue->getOperand(0).getReg())
+            .addReg(AccumulatorVReg);
+
+        unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+        BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
+            .addReg(LaneMaskReg)
+            .addReg(ActiveBitsReg);
+
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .addReg(Accumulator->getOperand(0).getReg());
+        break;
+      }
+      }
+    }
     // Manipulate the iterator to get the next active lane
     unsigned BITSETOpc =
         IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
-    auto NewActiveBits =
-        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
-            .addReg(FF1->getOperand(0).getReg())
-            .addReg(ActiveBits->getOperand(0).getReg());
+    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+        .addReg(FF1Reg)
+        .addReg(ActiveBitsReg);

     // Add phi nodes
     Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
         .addMBB(ComputeLoop);
-    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
-        .addMBB(ComputeLoop);
+    ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

     // Creating branching
     unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
     BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
-        .addReg(NewActiveBits->getOperand(0).getReg())
+        .addReg(NewActiveBitsReg)
         .addImm(0);
     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
         .addMBB(ComputeLoop);
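Taken together, the ComputeLoop for the 64-bit case walks the active lanes one at a time: S_FF1 finds the next set bit of the exec-mask iterator, a pair of V_READLANE_B32s assembles that lane's 64-bit value, the V_CMP/S_CSELECT_B64 pair keeps the winner against the accumulator, and S_BITSET0 retires the lane. A rough C++ model of that control flow (an editor's sketch of one case, umax, using the GCC/Clang __builtin_ctzll as a stand-in for S_FF1; not the emitted ISA):

```cpp
#include <cstdint>

// Sketch of the loop structure above for a 64-bit unsigned-max reduction.
uint64_t waveReduceUMax(const uint64_t laneValues[], uint64_t execMask) {
  uint64_t acc = 0; // identity for umax
  while (execMask != 0) {                      // S_CMP_LG_U64 + S_CBRANCH_SCC1
    unsigned lane = __builtin_ctzll(execMask); // S_FF1_I32_B64
    uint64_t v = laneValues[lane];             // V_READLANE_B32 lo/hi pair
    acc = (v > acc) ? v : acc;                 // V_CMP_GT_U64 + S_CSELECT_B64
    execMask &= execMask - 1;                  // S_BITSET0_B64 on the FF1 bit
  }
  return acc; // final accumulator ends up in DstReg
}
```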
@@ -5310,12 +5413,20 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
   case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
   case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
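Since AMDGPU has no 64-bit scalar min/max instruction, the new 64-bit pseudos cannot pass a single ALU opcode the way the 32-bit ones do; instead the VALU comparison opcode carries the predicate (signed/unsigned, less/greater), and lowerWaveReduce pairs it with S_CSELECT_B64 in the loop body. The dispatch added above, summarized as a table (an editor's sketch; opcode names as in the patch):

```cpp
// Sketch: the pseudo -> comparison-opcode pairs dispatched above. The
// predicate of the compare encodes which reduction is being performed.
struct WaveReduce64Case {
  const char *Pseudo; // opcode seen by EmitInstrWithCustomInserter
  const char *CmpOpc; // opcode handed to lowerWaveReduce
};
static const WaveReduce64Case Cases[] = {
    {"WAVE_REDUCE_UMIN_PSEUDO_U64", "V_CMP_LT_U64_e64"},
    {"WAVE_REDUCE_MIN_PSEUDO_I64", "V_CMP_LT_I64_e64"},
    {"WAVE_REDUCE_UMAX_PSEUDO_U64", "V_CMP_GT_U64_e64"},
    {"WAVE_REDUCE_MAX_PSEUDO_I64", "V_CMP_GT_I64_e64"},
};
```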