@@ -5480,6 +5480,10 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
     return std::numeric_limits<uint32_t>::min();
   case AMDGPU::S_MAX_I32:
     return std::numeric_limits<int32_t>::min();
+  case AMDGPU::V_ADD_F32_e64: // -0.0
+    return 0x80000000;
+  case AMDGPU::V_SUB_F32_e64: // +0.0
+    return 0x0;
   case AMDGPU::S_ADD_I32:
   case AMDGPU::S_SUB_I32:
   case AMDGPU::S_OR_B32:
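
Why -0.0 (bit pattern 0x80000000) is used as the fadd identity: under IEEE-754 addition, -0.0 + x == x for every x, including x == -0.0, whereas +0.0 + (-0.0) rounds to +0.0 and would lose the sign of a -0.0 result. A minimal standalone check of that property, not part of the patch:

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  const float NegZero = std::bit_cast<float>(std::uint32_t{0x80000000}); // the fadd identity
  assert(std::bit_cast<std::uint32_t>(NegZero + NegZero) == 0x80000000u); // sign preserved
  assert(std::bit_cast<std::uint32_t>(0.0f + NegZero) == 0u); // +0.0 as identity would drop the sign
  assert(NegZero + 1.5f == 1.5f); // neutral for ordinary values
}
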
@@ -5525,11 +5529,13 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
          Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
          Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
          Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
-         Opc == AMDGPU::V_MAX_F32_e64;
+         Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
+         Opc == AMDGPU::V_SUB_F32_e64;
 }
 
 static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
-  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64;
+  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
+         Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
 }
 
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5576,8 +5582,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     case AMDGPU::S_XOR_B64:
     case AMDGPU::S_ADD_I32:
     case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::V_ADD_F32_e64:
     case AMDGPU::S_SUB_I32:
-    case AMDGPU::S_SUB_U64_PSEUDO: {
+    case AMDGPU::S_SUB_U64_PSEUDO:
+    case AMDGPU::V_SUB_F32_e64: {
       const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
       const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
       Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5732,6 +5740,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addImm(AMDGPU::sub1);
         break;
       }
+      case AMDGPU::V_ADD_F32_e64:
+      case AMDGPU::V_SUB_F32_e64: {
+        Register ActiveLanesVreg =
+            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        // Get the number of active lanes as a float value.
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
+                ActiveLanesVreg)
+            .addReg(NewAccumulator->getOperand(0).getReg())
+            .addImm(0)  // clamp
+            .addImm(0); // output modifier
+
+        // Negate the input for the SUB reduction via the src0 modifier.
+        unsigned SrcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
+            .addImm(SrcMod) // src0 modifier
+            .addReg(SrcReg)
+            .addImm(0)      // src1 modifier
+            .addReg(ActiveLanesVreg)
+            .addImm(0)  // clamp
+            .addImm(0); // output modifier
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+            .addReg(DstVreg);
+      }
       }
       RetBB = &BB;
     }
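
In this SGPR (uniform-input) path the float reduction collapses to a single multiply: the active-lane count already held by NewAccumulator (computed earlier in this case block, outside the hunk) is converted to float with V_CVT_F32_I32, multiplied by the uniform source with V_MUL_F32 (the src0 neg modifier handles the SUB flavour), and V_READFIRSTLANE moves the result to the scalar destination. A scalar model of what that sequence computes, for illustration only and not part of the patch (the helper name is made up):

#include <bit>
#include <cstdint>

// IsSub selects the V_SUB_F32 flavour, i.e. negation via the src0 modifier.
float waveReduceUniformF32(float UniformVal, std::uint64_t ExecMask, bool IsSub) {
  float ActiveLanes = static_cast<float>(std::popcount(ExecMask)); // V_CVT_F32_I32 of the lane count
  float Src = IsSub ? -UniformVal : UniformVal;                    // src0 neg modifier on V_MUL_F32
  return Src * ActiveLanes;                                        // V_MUL_F32, then V_READFIRSTLANE
}
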
@@ -5979,10 +6011,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
+  case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
+  case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64: