@@ -5487,6 +5487,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
54875487 return std::numeric_limits<uint32_t>::min();
54885488 case AMDGPU::S_AND_B32:
54895489 return std::numeric_limits<uint32_t>::max();
5490+ case AMDGPU::V_MIN_F32_e64:
5491+ case AMDGPU::V_MAX_F32_e64:
5492+ return 0x7fc00000; // qNAN
54905493 default:
54915494 llvm_unreachable(
54925495 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
@@ -5521,7 +5524,12 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
55215524 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
55225525 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
55235526 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5524- Opc == AMDGPU::S_XOR_B32;
5527+ Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5528+ Opc == AMDGPU::V_MAX_F32_e64;
5529+ }
5530+
5531+ static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
5532+ return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64;
55255533}
55265534
55275535static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5542,8 +5550,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
55425550 switch (Opc) {
55435551 case AMDGPU::S_MIN_U32:
55445552 case AMDGPU::S_MIN_I32:
5553+ case AMDGPU::V_MIN_F32_e64:
55455554 case AMDGPU::S_MAX_U32:
55465555 case AMDGPU::S_MAX_I32:
5556+ case AMDGPU::V_MAX_F32_e64:
55475557 case AMDGPU::S_AND_B32:
55485558 case AMDGPU::S_OR_B32: {
55495559 // Idempotent operations.
@@ -5739,6 +5749,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
57395749 MachineBasicBlock::iterator I = BB.end();
57405750 Register SrcReg = MI.getOperand(1).getReg();
57415751 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5752+ bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
57425753
57435754 // Create Control flow for loop
57445755 // Split MI's Machine Basic block into For loop
@@ -5798,9 +5809,29 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
57985809 LaneValueReg)
57995810 .addReg(SrcReg)
58005811 .addReg(FF1Reg);
5801- NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5802- .addReg(Accumulator->getOperand(0).getReg())
5803- .addReg(LaneValueReg);
5812+ if (isFPOp) {
5813+ Register LaneValVreg =
5814+ MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5815+ Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5816+ // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
5817+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5818+ LaneValVreg)
5819+ .addReg(LaneValueReg);
5820+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
5821+ .addImm(0) // src0 modifier
5822+ .addReg(Accumulator->getOperand(0).getReg())
5823+ .addImm(0) // src1 modifier
5824+ .addReg(LaneValVreg)
5825+ .addImm(0) // clamp
5826+ .addImm(0); // omod
5827+ NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5828+ TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5829+ .addReg(DstVreg);
5830+ } else {
5831+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5832+ .addReg(Accumulator->getOperand(0).getReg())
5833+ .addReg(LaneValueReg);
5834+ }
58045835 } else {
58055836 Register LaneValueLoReg =
58065837 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -5932,6 +5963,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
59325963 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
59335964 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
59345965 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5966+ case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
5967+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
59355968 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
59365969 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
59375970 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
@@ -5940,6 +5973,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
59405973 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
59415974 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
59425975 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5976+ case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
5977+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
59435978 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
59445979 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
59455980 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
0 commit comments