@@ -5195,12 +5195,16 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
 static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
   switch (Opc) {
   case AMDGPU::S_MIN_U32:
+  case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
     return std::numeric_limits<uint32_t>::max();
   case AMDGPU::S_MIN_I32:
+  case AMDGPU::V_CMP_LT_I64_e64: // min.i64
     return std::numeric_limits<int32_t>::max();
   case AMDGPU::S_MAX_U32:
+  case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
     return std::numeric_limits<uint32_t>::min();
   case AMDGPU::S_MAX_I32:
+  case AMDGPU::V_CMP_GT_I64_e64: // max.i64
     return std::numeric_limits<int32_t>::min();
   case AMDGPU::S_ADD_I32:
   case AMDGPU::S_SUB_I32:
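Note on the identity values: for the new 64-bit comparison opcodes, the 32-bit constant returned here becomes only the high word of the identity; lowerWaveReduce patches in the low word before assembling the full 64-bit value (see the hunks below). A minimal standalone sketch of the resulting identities, with a hypothetical makeIdentity64 helper used purely for illustration:

    // Sketch only (not part of the patch): how the 32-bit value returned by
    // getIdentityValueForWaveReduction (high word) combines with the low word
    // patched in by lowerWaveReduce. makeIdentity64 is a hypothetical helper.
    #include <cstdint>
    #include <cstdio>

    static uint64_t makeIdentity64(uint32_t Hi, uint32_t Lo) {
      return (uint64_t(Hi) << 32) | Lo;
    }

    int main() {
      // umin.u64: 0xFFFFFFFFFFFFFFFF (UINT64_MAX)
      printf("%016llx\n", (unsigned long long)makeIdentity64(0xFFFFFFFFu, 0xFFFFFFFFu));
      // min.i64:  0x7FFFFFFFFFFFFFFF (INT64_MAX)
      printf("%016llx\n", (unsigned long long)makeIdentity64(0x7FFFFFFFu, 0xFFFFFFFFu));
      // umax.u64: 0x0000000000000000
      printf("%016llx\n", (unsigned long long)makeIdentity64(0x00000000u, 0x00000000u));
      // max.i64:  0x8000000000000000 (INT64_MIN)
      printf("%016llx\n", (unsigned long long)makeIdentity64(0x80000000u, 0x00000000u));
      return 0;
    }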
@@ -5228,16 +5232,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
+  bool is32BitOpc = TRI->getRegSizeInBits(*MRI.getRegClass(DstReg)) == 32;
   if (isSGPR) {
     switch (Opc) {
     case AMDGPU::S_MIN_U32:
+    case AMDGPU::V_CMP_LT_U64_e64: /*umin*/
     case AMDGPU::S_MIN_I32:
+    case AMDGPU::V_CMP_LT_I64_e64: /*min*/
     case AMDGPU::S_MAX_U32:
+    case AMDGPU::V_CMP_GT_U64_e64: /*umax*/
     case AMDGPU::S_MAX_I32:
+    case AMDGPU::V_CMP_GT_I64_e64: /*max*/
     case AMDGPU::S_AND_B32:
     case AMDGPU::S_OR_B32: {
       // Idempotent operations.
-      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      unsigned movOpc = is32BitOpc ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      BuildMI(BB, MI, DL, TII->get(movOpc), DstReg).addReg(SrcReg);
       RetBB = &BB;
       break;
     }
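The fast path above relies on idempotence: when the source is uniform (held in an SGPR), every active lane contributes the same value, so min/max/and/or reduce to the value itself and a single move suffices; the patch only widens that move to S_MOV_B64 for the 64-bit opcodes. A small model of the property, assuming nothing beyond the C++ standard library:

    // Illustration (assumed model, not codegen): reducing any number of
    // copies of one uniform value with min/max/and/or returns that value.
    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t Uniform = 0x123456789ABCDEF0ull;
      uint64_t Acc = Uniform;
      for (int Lane = 0; Lane < 64; ++Lane) // one step per active lane
        Acc = std::min(Acc, Uniform);       // same outcome for max, &, |
      assert(Acc == Uniform);
      return 0;
    }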
@@ -5322,73 +5332,166 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
     Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
+    Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
     Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
     Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
     Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
-    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
-    Register LaneValueReg =
-        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

     bool IsWave32 = ST.isWave32();
-    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

     // Create initial values of induction variable from Exec, Accumulator and
     // insert branch instr to newly created ComputeBlock
-    uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
-    auto TmpSReg =
-        BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
-    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
-        .addImm(InitalValue);
+    uint32_t IdentityValue = getIdentityValueForWaveReduction(Opc);
+    BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
+    if (is32BitOpc) {
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+          .addImm(IdentityValue);
+    } else {
+      Register Identitylo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      Register Identityhi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
+          .addImm(IdentityValue);
+      switch (Opc) {
+      case AMDGPU::V_CMP_LT_U64_e64:
+      case AMDGPU::V_CMP_LT_I64_e64:
+        IdentityValue = int32_t(-1); // u|min
+        break;
+      case AMDGPU::V_CMP_GT_U64_e64:
+      case AMDGPU::V_CMP_GT_I64_e64:
+        IdentityValue = int32_t(0); // u|max
+        break;
+      }
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
+          .addImm(IdentityValue);
+      BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), IdentityValReg)
+          .addReg(Identitylo)
+          .addImm(AMDGPU::sub0)
+          .addReg(Identityhi)
+          .addImm(AMDGPU::sub1);
+    }
     // clang-format off
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
         .addMBB(ComputeLoop);
     // clang-format on

     // Start constructing ComputeLoop
-    I = ComputeLoop->end();
+    I = ComputeLoop->begin();
     auto Accumulator =
         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
-            .addReg(InitalValReg)
+            .addReg(IdentityValReg)
             .addMBB(&BB);
     auto ActiveBits =
         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
-            .addReg(TmpSReg->getOperand(0).getReg())
+            .addReg(LoopIterator)
             .addMBB(&BB);

+    I = ComputeLoop->end();
+    MachineInstr *NewAccumulator;
     // Perform the computations
     unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
-    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
-                   .addReg(ActiveBits->getOperand(0).getReg());
-    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
-                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
-                         .addReg(SrcReg)
-                         .addReg(FF1->getOperand(0).getReg());
-    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
-                              .addReg(Accumulator->getOperand(0).getReg())
-                              .addReg(LaneValue->getOperand(0).getReg());
-
+    BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+        .addReg(ActiveBitsReg);
+    if (is32BitOpc) {
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+              LaneValueReg)
+          .addReg(SrcReg)
+          .addReg(FF1Reg);
+      NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                           .addReg(Accumulator->getOperand(0).getReg())
+                           .addReg(LaneValueReg);
+    } else {
+      Register LaneValueLoReg =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register LaneValueHiReg =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+      const TargetRegisterClass *SrcSubRC =
+          TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+      MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+      MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+      // lane value input should be in an sgpr
+      MachineInstr *LaneValueLo =
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                  LaneValueLoReg)
+              .add(Op1L)
+              .addReg(FF1Reg);
+      MachineInstr *LaneValueHi =
+          BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                  LaneValueHiReg)
+              .add(Op1H)
+              .addReg(FF1Reg);
+      auto LaneValue = BuildMI(*ComputeLoop, I, DL,
+                               TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
+                           .addReg(LaneValueLoReg)
+                           .addImm(AMDGPU::sub0)
+                           .addReg(LaneValueHiReg)
+                           .addImm(AMDGPU::sub1);
+      switch (Opc) {
+      case AMDGPU::V_CMP_GT_I64_e64:
+      case AMDGPU::V_CMP_GT_U64_e64:
+      case AMDGPU::V_CMP_LT_I64_e64:
+      case AMDGPU::V_CMP_LT_U64_e64: {
+        Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+        Register ComparisonResultReg =
+            MRI.createVirtualRegister(WaveMaskRegClass);
+        const TargetRegisterClass *VregClass =
+            ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
+                                   : &AMDGPU::VReg_64RegClass;
+        const TargetRegisterClass *VSubRegClass =
+            TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
+        Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
+        MachineOperand SrcReg0Sub0 =
+            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                         VregClass, AMDGPU::sub0, VSubRegClass);
+        MachineOperand SrcReg0Sub1 =
+            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                         VregClass, AMDGPU::sub1, VSubRegClass);
+        BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+                AccumulatorVReg)
+            .add(SrcReg0Sub0)
+            .addImm(AMDGPU::sub0)
+            .add(SrcReg0Sub1)
+            .addImm(AMDGPU::sub1);
+        BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
+            .addReg(LaneValue->getOperand(0).getReg())
+            .addReg(AccumulatorVReg);
+
+        unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+        BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
+            .addReg(LaneMaskReg)
+            .addReg(ActiveBitsReg);
+
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .addReg(Accumulator->getOperand(0).getReg());
+        break;
+      }
+      }
+    }
     // Manipulate the iterator to get the next active lane
     unsigned BITSETOpc =
         IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
-    auto NewActiveBits =
-        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
-            .addReg(FF1->getOperand(0).getReg())
-            .addReg(ActiveBits->getOperand(0).getReg());
+    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+        .addReg(FF1Reg)
+        .addReg(ActiveBitsReg);

     // Add phi nodes
     Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
         .addMBB(ComputeLoop);
-    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
-        .addMBB(ComputeLoop);
+    ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

     // Creating branching
     unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
     BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
-        .addReg(NewActiveBits->getOperand(0).getReg())
+        .addReg(NewActiveBitsReg)
         .addImm(0);
     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
         .addMBB(ComputeLoop);
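The ComputeLoop built above has a simple scalar reading: find the first set bit of the exec mask, read that lane's value (two V_READLANE_B32 for the 64-bit halves), fold it into the accumulator with a compare plus select, clear the bit, and loop while any bit remains. A rough model for umin.u64, assuming a wave64 mask and the GCC/Clang __builtin_ctzll intrinsic:

    // Scalar model of ComputeLoop for umin.u64 (illustration only, not the
    // generated machine code).
    #include <algorithm>
    #include <cstdint>

    static uint64_t waveReduceUMin64(const uint64_t Lanes[64], uint64_t Exec) {
      uint64_t Acc = ~0ull;                // 64-bit identity for umin
      while (Exec) {                       // S_CMP_LG_U64 + S_CBRANCH_SCC1
        int FF1 = __builtin_ctzll(Exec);   // S_FF1_I32_B64
        uint64_t LaneVal = Lanes[FF1];     // V_READLANE_B32 x2 + REG_SEQUENCE
        Acc = std::min(Acc, LaneVal);      // V_CMP_LT_U64_e64 + S_CSELECT_B64
        Exec &= ~(1ull << FF1);            // S_BITSET0_B64
      }
      return Acc;
    }

    int main() {
      uint64_t Lanes[64];
      for (int I = 0; I < 64; ++I)
        Lanes[I] = 100 + I;
      // Only lanes 3 and 7 active: expect min(103, 107) == 103.
      return waveReduceUMin64(Lanes, (1ull << 3) | (1ull << 7)) == 103 ? 0 : 1;
    }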
@@ -5410,12 +5513,20 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
   case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
   case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
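The dispatch maps the new 64-bit pseudos to VALU compare opcodes rather than scalar ALU ops because there is no 64-bit scalar min/max instruction: min pairs with V_CMP_LT and max with V_CMP_GT, and the loop then selects the lane value when the masked compare result is nonzero. A one-step sketch of the intended min/max semantics (assumed model):

    // Single reduction step as a compare + select (illustration only).
    #include <cstdint>

    static uint64_t step(uint64_t Acc, uint64_t LaneVal, bool IsMin) {
      bool Take = IsMin ? (LaneVal < Acc)  // V_CMP_LT_U64_e64
                        : (LaneVal > Acc); // V_CMP_GT_U64_e64
      return Take ? LaneVal : Acc;         // S_CSELECT_B64
    }

    int main() {
      return (step(5, 3, true) == 3 && step(5, 9, false) == 9) ? 0 : 1;
    }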