Skip to content

Commit e6fa6a1

Browse files
easyonaadit and github-actions[bot]
authored and committed
Automerge: [AMDGPU] Extending wave reduction intrinsics for i64 types - 3 (#151310)
Supporting Arithmetic Operations: `and`, `or`, `xor`
2 parents 083cc98 + 1e6a63e commit e6fa6a1

File tree

5 files changed

+3028
-8
lines changed

5 files changed

+3028
-8
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 61 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5356,7 +5356,11 @@ static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
53565356
return std::numeric_limits<int64_t>::min();
53575357
case AMDGPU::S_ADD_U64_PSEUDO:
53585358
case AMDGPU::S_SUB_U64_PSEUDO:
5359+
case AMDGPU::S_OR_B64:
5360+
case AMDGPU::S_XOR_B64:
53595361
return std::numeric_limits<uint64_t>::min();
5362+
case AMDGPU::S_AND_B64:
5363+
return std::numeric_limits<uint64_t>::max();
53605364
default:
53615365
llvm_unreachable(
53625366
"Unexpected opcode in getIdentityValueFor64BitWaveReduction");
@@ -5398,16 +5402,19 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53985402
RetBB = &BB;
53995403
break;
54005404
}
5401-
case AMDGPU::V_CMP_LT_U64_e64: // umin
5402-
case AMDGPU::V_CMP_LT_I64_e64: // min
5403-
case AMDGPU::V_CMP_GT_U64_e64: // umax
5404-
case AMDGPU::V_CMP_GT_I64_e64: { // max
5405+
case AMDGPU::V_CMP_LT_U64_e64: // umin
5406+
case AMDGPU::V_CMP_LT_I64_e64: // min
5407+
case AMDGPU::V_CMP_GT_U64_e64: // umax
5408+
case AMDGPU::V_CMP_GT_I64_e64: // max
5409+
case AMDGPU::S_AND_B64:
5410+
case AMDGPU::S_OR_B64: {
54055411
// Idempotent operations.
54065412
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
54075413
RetBB = &BB;
54085414
break;
54095415
}
54105416
case AMDGPU::S_XOR_B32:
5417+
case AMDGPU::S_XOR_B64:
54115418
case AMDGPU::S_ADD_I32:
54125419
case AMDGPU::S_ADD_U64_PSEUDO:
54135420
case AMDGPU::S_SUB_I32:
@@ -5431,7 +5438,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54315438
.addReg(ExecMask);
54325439

54335440
switch (Opc) {
5434-
case AMDGPU::S_XOR_B32: {
5441+
case AMDGPU::S_XOR_B32:
5442+
case AMDGPU::S_XOR_B64: {
54355443
// Performing an XOR operation on a uniform value
54365444
// depends on the parity of the number of active lanes.
54375445
// For even parity, the result will be 0, for odd
@@ -5443,9 +5451,39 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54435451
.addReg(NewAccumulator->getOperand(0).getReg())
54445452
.addImm(1)
54455453
.setOperandDead(3); // Dead scc
5446-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5447-
.addReg(SrcReg)
5448-
.addReg(ParityRegister);
5454+
if (Opc == AMDGPU::S_XOR_B32) {
5455+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5456+
.addReg(SrcReg)
5457+
.addReg(ParityRegister);
5458+
} else {
5459+
Register DestSub0 =
5460+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5461+
Register DestSub1 =
5462+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5463+
5464+
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5465+
const TargetRegisterClass *SrcSubRC =
5466+
TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5467+
5468+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5469+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5470+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5471+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5472+
5473+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5474+
.add(Op1L)
5475+
.addReg(ParityRegister);
5476+
5477+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5478+
.add(Op1H)
5479+
.addReg(ParityRegister);
5480+
5481+
BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5482+
.addReg(DestSub0)
5483+
.addImm(AMDGPU::sub0)
5484+
.addReg(DestSub1)
5485+
.addImm(AMDGPU::sub1);
5486+
}
54495487
break;
54505488
}
54515489
case AMDGPU::S_SUB_I32: {
@@ -5643,6 +5681,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56435681
.addReg(LaneValueHiReg)
56445682
.addImm(AMDGPU::sub1);
56455683
switch (Opc) {
5684+
case AMDGPU::S_OR_B64:
5685+
case AMDGPU::S_AND_B64:
5686+
case AMDGPU::S_XOR_B64: {
5687+
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5688+
.addReg(Accumulator->getOperand(0).getReg())
5689+
.addReg(LaneValue->getOperand(0).getReg())
5690+
.setOperandDead(3); // Dead scc
5691+
break;
5692+
}
56465693
case AMDGPU::V_CMP_GT_I64_e64:
56475694
case AMDGPU::V_CMP_GT_U64_e64:
56485695
case AMDGPU::V_CMP_LT_I64_e64:
@@ -5751,10 +5798,16 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
57515798
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
57525799
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
57535800
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5801+
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5802+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
57545803
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
57555804
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5805+
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5806+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
57565807
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
57575808
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5809+
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5810+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
57585811
case AMDGPU::S_UADDO_PSEUDO:
57595812
case AMDGPU::S_USUBO_PSEUDO: {
57605813
const DebugLoc &DL = MI.getDebugLoc();

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -369,6 +369,9 @@ defvar Operations = [
369369
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
370370
WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
371371
WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
372+
WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>,
373+
WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>,
374+
WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,
372375
];
373376

374377
foreach Op = Operations in {

0 commit comments

Comments
 (0)