Skip to content

Commit e888cf8

Browse files
authored
[AMDGPU] Add wave reduce intrinsics for float types - 2 (#168859)
Supported Ops: `fadd`, `fsub`
1 parent c15a6cc commit e888cf8

File tree

6 files changed

+2046
-4
lines changed

6 files changed

+2046
-4
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2482,7 +2482,7 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
24822482

24832483
// Defines one AMDGPUWaveReduce intrinsic record per operation name in the
// list below. The "-" line is the previous list; the "+" line is the new list,
// which additionally contains "fadd" and "fsub".
multiclass AMDGPUWaveReduceOps {
24842484
foreach Op =
2485-
["umin", "fmin", "min", "umax", "fmax", "max", "add", "sub", "and", "or", "xor"] in {
2485+
["umin", "fmin", "min", "umax", "fmax", "max", "add", "fadd", "sub", "fsub", "and", "or", "xor"] in {
24862486
def Op : AMDGPUWaveReduce;
24872487
}
24882488
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5214,7 +5214,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
52145214
break;
52155215
}
52165216
case Intrinsic::amdgcn_wave_reduce_add:
5217+
case Intrinsic::amdgcn_wave_reduce_fadd:
52175218
case Intrinsic::amdgcn_wave_reduce_sub:
5219+
case Intrinsic::amdgcn_wave_reduce_fsub:
52185220
case Intrinsic::amdgcn_wave_reduce_min:
52195221
case Intrinsic::amdgcn_wave_reduce_umin:
52205222
case Intrinsic::amdgcn_wave_reduce_fmin:

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 39 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5480,6 +5480,10 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
54805480
return std::numeric_limits<uint32_t>::min();
54815481
case AMDGPU::S_MAX_I32:
54825482
return std::numeric_limits<int32_t>::min();
5483+
case AMDGPU::V_ADD_F32_e64: // -0.0
5484+
return 0x80000000;
5485+
case AMDGPU::V_SUB_F32_e64: // +0.0
5486+
return 0x0;
54835487
case AMDGPU::S_ADD_I32:
54845488
case AMDGPU::S_SUB_I32:
54855489
case AMDGPU::S_OR_B32:
@@ -5525,11 +5529,13 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
55255529
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
55265530
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
55275531
Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5528-
Opc == AMDGPU::V_MAX_F32_e64;
5532+
Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5533+
Opc == AMDGPU::V_SUB_F32_e64;
55295534
}
55305535

55315536
/// Returns true when \p Opc equals one of the V_*_F32_e64 opcodes compared
/// below. The "-" line is the previous check (V_MIN_F32_e64 / V_MAX_F32_e64
/// only); the "+" lines additionally accept V_ADD_F32_e64 and V_SUB_F32_e64.
static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
5532-
return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64;
5537+
return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5538+
Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
55335539
}
55345540

55355541
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5576,8 +5582,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
55765582
case AMDGPU::S_XOR_B64:
55775583
case AMDGPU::S_ADD_I32:
55785584
case AMDGPU::S_ADD_U64_PSEUDO:
5585+
case AMDGPU::V_ADD_F32_e64:
55795586
case AMDGPU::S_SUB_I32:
5580-
case AMDGPU::S_SUB_U64_PSEUDO: {
5587+
case AMDGPU::S_SUB_U64_PSEUDO:
5588+
case AMDGPU::V_SUB_F32_e64: {
55815589
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
55825590
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
55835591
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5732,6 +5740,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
57325740
.addImm(AMDGPU::sub1);
57335741
break;
57345742
}
5743+
case AMDGPU::V_ADD_F32_e64:
5744+
case AMDGPU::V_SUB_F32_e64: {
5745+
Register ActiveLanesVreg =
5746+
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5747+
Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5748+
// Get number of active lanes as a float val.
5749+
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
5750+
ActiveLanesVreg)
5751+
.addReg(NewAccumulator->getOperand(0).getReg())
5752+
.addImm(0) // clamp
5753+
.addImm(0); // output-modifier
5754+
5755+
// Take negation of input for SUB reduction
5756+
unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
5757+
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
5758+
.addImm(srcMod) // src0 modifier
5759+
.addReg(SrcReg)
5760+
.addImm(0) // src1 modifier
5761+
.addReg(ActiveLanesVreg)
5762+
.addImm(0) // clamp
5763+
.addImm(0); // output-mod
5764+
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5765+
.addReg(DstVreg);
5766+
}
57355767
}
57365768
RetBB = &BB;
57375769
}
@@ -5979,10 +6011,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
59796011
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
59806012
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
59816013
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6014+
case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6015+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
59826016
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
59836017
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
59846018
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
59856019
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6020+
case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6021+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
59866022
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
59876023
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
59886024
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -374,6 +374,8 @@ defvar Operations = [
374374

375375
WaveReduceOp<"fmin", "F32", f32, SGPR_32, VSrc_b32>,
376376
WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>,
377+
WaveReduceOp<"fadd", "F32", f32, SGPR_32, VSrc_b32>,
378+
WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>,
377379
];
378380

379381
foreach Op = Operations in {

0 commit comments

Comments
 (0)