[AMDGPU] Extending wave reduction intrinsics for i64 types - 2
#151309
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Aaditya (easyonaadit)

Changes: Supporting Arithmetic Operations: `add`, `sub`

Patch is 168.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151309.diff

4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0f529ef362199..56d8e739b6493 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5107,7 +5107,9 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
return std::numeric_limits<int32_t>::min();
case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_SUB_U64_PSEUDO:
case AMDGPU::S_OR_B32:
case AMDGPU::S_XOR_B32:
return std::numeric_limits<uint32_t>::min();
@@ -5153,11 +5155,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
}
case AMDGPU::S_XOR_B32:
case AMDGPU::S_ADD_I32:
- case AMDGPU::S_SUB_I32: {
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
- Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+ Register ActiveLanes =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -5165,39 +5170,39 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
unsigned CountReg =
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
- auto Exec =
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
- auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
- .addReg(Exec->getOperand(0).getReg());
+ auto NewAccumulator =
+ BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+ .addReg(ExecMask);
+
+ switch (Opc) {
+ case AMDGPU::S_XOR_B32: {
+ // Performing an XOR operation on a uniform value
+ // depends on the parity of the number of active lanes.
+ // For even parity, the result will be 0, for odd
+ // parity the result will be the same as the input value.
+ Register ParityRegister =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- switch (Opc) {
- case AMDGPU::S_XOR_B32: {
- // Performing an XOR operation on a uniform value
- // depends on the parity of the number of active lanes.
- // For even parity, the result will be 0, for odd
- // parity the result will be the same as the input value.
- Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
- auto ParityReg =
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
- .addImm(1);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
- .addReg(SrcReg)
- .addReg(ParityReg->getOperand(0).getReg());
- break;
- }
+ .addImm(1)
+ .setOperandDead(3); // Dead scc
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(SrcReg)
+ .addReg(ParityRegister);
+ break;
+ }
case AMDGPU::S_SUB_I32: {
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
// Take the negation of the source operand.
- auto InvertedValReg =
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
- .addImm(-1)
- .addReg(SrcReg);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+ .addImm(0)
+ .addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
- .addReg(InvertedValReg->getOperand(0).getReg())
+ .addReg(NegatedVal)
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
@@ -5207,6 +5212,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Op1H_Op0L_Reg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Op1L_Op0H_Reg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register NegatedValLo =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register NegatedValHi =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *Src1SubRC =
+ TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+ if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+ .addImm(0)
+ .addReg(NewAccumulator->getOperand(0).getReg());
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+ .addReg(NegatedValLo)
+ .addImm(31)
+ .setOperandDead(3); // Dead scc
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+ .add(Op1L)
+ .addReg(NegatedValHi);
+ }
+ Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
+ ? NegatedValLo
+ : NewAccumulator->getOperand(0).getReg();
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+ .add(Op1L)
+ .addReg(LowOpcode);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+ .add(Op1L)
+ .addReg(LowOpcode);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+ .add(Op1H)
+ .addReg(LowOpcode);
+
+ Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+ .addReg(CarryReg)
+ .addReg(Op1H_Op0L_Reg)
+ .setOperandDead(3); // Dead scc
+
+ if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+ .addReg(HiVal)
+ .addReg(Op1L_Op0H_Reg)
+ .setOperandDead(3); // Dead scc
+ }
+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ break;
+ }
}
RetBB = &BB;
}
@@ -5374,6 +5447,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(Accumulator->getOperand(0).getReg());
break;
}
+ case ::AMDGPU::S_ADD_U64_PSEUDO:
+ case ::AMDGPU::S_SUB_U64_PSEUDO: {
+ unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
+ : AMDGPU::S_SUB_U32;
+ unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
+ : AMDGPU::S_SUBB_U32;
+ Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
+ MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
+ &AMDGPU::SReg_32RegClass);
+ MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
+ MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
+ &AMDGPU::SReg_32RegClass);
+ BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
+ .add(Accumlo)
+ .addReg(LaneValueLo->getOperand(0).getReg());
+ BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
+ .add(Accumhi)
+ .addReg(LaneValueHi->getOperand(0).getReg());
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+ TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+ .addReg(DestLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestHi)
+ .addImm(AMDGPU::sub1);
+ break;
+ }
}
}
// Manipulate the iterator to get the next active lane
@@ -5429,8 +5530,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 25a491ab87a06..64697673fa1b1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -345,6 +345,8 @@ defvar Operations = [
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
];
foreach Op = Operations in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
index d2ca1d8136043..b6af8b4bb798d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -1226,6 +1226,1362 @@ endif:
store i32 %combine, ptr addrspace(1) %out
ret void
}
+
+define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: s_mul_i32 s0, s2, s4
+; GFX8DAGISEL-NEXT: s_mul_hi_u32 s1, s2, s4
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s4
+; GFX8DAGISEL-NEXT: s_add_u32 s1, s1, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s5, s[4:5]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s4, s2, s5
+; GFX8GISEL-NEXT: s_mul_hi_u32 s2, s2, s5
+; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s5
+; GFX8GISEL-NEXT: s_add_u32 s5, s2, s3
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s5, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s4, s2, s5
+; GFX9DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s5
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5
+; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s3
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s5, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s4, s2, s5
+; GFX9GISEL-NEXT: s_mul_hi_u32 s2, s2, s5
+; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s5
+; GFX9GISEL-NEXT: s_add_u32 s5, s2, s3
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1064DAGISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1064GISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1032DAGISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1032GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1032GISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1164DAGISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1164GISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1132DAGISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1132GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1132GISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %in, i32 1)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[2:3]
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, 0x7b
+; GFX8DAGISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s3
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s3, 0
+; GFX8DAGISEL-NEXT: ...
[truncated]
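For orientation, the uniform-value fast path in the diff above turns a wave `add` over k active lanes into `value * k` (the `sub` case uses the negated, sign-extended lane count), and it assembles that 64 x 32-bit product out of 32-bit scalar ops: S_MUL_I32, S_MUL_HI_U32, and one S_ADD_U32 for the carry. The following is a small host-side sketch of that arithmetic only; the function name is illustrative and this is not the backend code itself.

```cpp
// Host-side model of the uniform-value fast path: a wave-uniform i64 add
// reduced over NumActiveLanes lanes is just Value * NumActiveLanes, built
// from 32-bit pieces the way the lowering does. Illustrative sketch only.
#include <cassert>
#include <cstdint>

static uint64_t uniformAddReduction(uint64_t Value, uint32_t NumActiveLanes) {
  uint32_t Lo = static_cast<uint32_t>(Value);       // Op1L
  uint32_t Hi = static_cast<uint32_t>(Value >> 32); // Op1H
  uint32_t DestSub0 = Lo * NumActiveLanes;          // S_MUL_I32 (low 32 bits)
  uint32_t Carry = static_cast<uint32_t>(
      (uint64_t(Lo) * NumActiveLanes) >> 32);       // S_MUL_HI_U32
  uint32_t DestSub1 = Carry + Hi * NumActiveLanes;  // S_MUL_I32 + S_ADD_U32
  return (uint64_t(DestSub1) << 32) | DestSub0;     // REG_SEQUENCE sub1:sub0
}

int main() {
  // 17 active lanes all contributing the same 64-bit value.
  uint64_t Value = 0x123456789ABCDEF0ull;
  uint32_t Lanes = 17;
  assert(uniformAddReduction(Value, Lanes) == Value * Lanes);
  return 0;
}
```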
Comment on:
        Register ActiveLanes =
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

Suggested change: `Register ActiveLanes =` -> `Register NumActiveLanes =`

Maybe rename this? ActiveLanes makes it sound like a lane mask, which it isn't.
Comment on:
        auto NewAccumulator =
            BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)

Similarly, CountReg is a confusing name because this isn't a register.
Comment on:
      case ::AMDGPU::S_ADD_U64_PSEUDO:
      case ::AMDGPU::S_SUB_U64_PSEUDO: {

Formatting.
Comment on:
        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
              .addImm(0)
              .addReg(NewAccumulator->getOperand(0).getReg());

Should this also have dead scc?
Reply: Yupp, modified.
Comment on:
              .addReg(LaneValueLo->getOperand(0).getReg());
          BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
              .add(Accumhi)
              .addReg(LaneValueHi->getOperand(0).getReg());

Dead scc?
Reply: Yuppp, modified.
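For context, the code this comment attaches to is the divergent-value loop: each active lane's i64 value is folded into the accumulator with a 32-bit add whose carry feeds an add-with-carry for the high half (the S_ADD_U32 / S_ADDC_U32 pair, or sub/subb for the subtraction case). Below is a host-side model of that accumulation; the function name and the lane-iteration details are simplified for illustration and are not the backend code.

```cpp
// Host-side model of the ComputeLoop accumulation: walk the active lanes in
// EXEC, read each lane's i64 value, and add it into a lo/hi accumulator with
// an explicit carry, mirroring S_ADD_U32 + S_ADDC_U32. Illustrative only.
#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t waveReduceAdd(const std::vector<uint64_t> &LaneValues,
                              uint64_t Exec) {
  uint32_t AccumLo = 0, AccumHi = 0;
  while (Exec) {
    unsigned Lane = __builtin_ctzll(Exec); // next active lane (ffs-style scan)
    Exec &= Exec - 1;                      // clear that lane from the mask
    uint32_t LaneLo = static_cast<uint32_t>(LaneValues[Lane]);
    uint32_t LaneHi = static_cast<uint32_t>(LaneValues[Lane] >> 32);
    uint64_t Sum = uint64_t(AccumLo) + LaneLo;     // S_ADD_U32, carry -> SCC
    AccumLo = static_cast<uint32_t>(Sum);
    AccumHi = AccumHi + LaneHi +
              static_cast<uint32_t>(Sum >> 32);    // S_ADDC_U32 consumes SCC
  }
  return (uint64_t(AccumHi) << 32) | AccumLo;
}

int main() {
  std::vector<uint64_t> Values(64, 0);
  Values[0] = 0xFFFFFFFFull;  // forces a carry into the high half
  Values[3] = 0x100000001ull;
  Values[63] = 42;
  uint64_t Exec = (1ull << 0) | (1ull << 3) | (1ull << 63);
  assert(waveReduceAdd(Values, Exec) == Values[0] + Values[3] + Values[63]);
  return 0;
}
```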
Comment on:
        case AMDGPU::S_ADD_U64_PSEUDO:
        case AMDGPU::S_SUB_U64_PSEUDO: {
          unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32

Capitalize.
Reply: Donee.
Comment on:
        case AMDGPU::S_ADD_U64_PSEUDO:
        case AMDGPU::S_SUB_U64_PSEUDO: {
          Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

This is duplicating the base handling for these pseudos, and it also ignores targets that do have a 64-bit scalar add.

Reply: I tried a WAVE_RED_ADD_PSEUDO -> S_ADD_U64_PSEUDO pipeline, but I couldn't find a way to expand the S_ADD_U64_PSEUDO beyond that; I'm not sure one pseudo can be replaced with another in the ExpandPseudo pass. I have removed some operations from the base handling which would be redundant for my use, and I've used 32-bit opcodes, so this works for all targets. I could add a check to use the 64-bit scalar add on targets which do have it.

Follow-up: My point was don't duplicate code. Call a common helper function to expand the 64-bit add.
Comment on:
        case AMDGPU::S_ADD_U64_PSEUDO:

This is nearly identical to the existing expansion; it needs to not duplicate so much.
Reply: Yup, got it.
Ping.
Supporting Min/Max Operations: `min`, `max`, `umin`, `umax`
Supporting Arithmetic Operations: `add`, `sub`
Supporting Arithmetic Operations: `add`, `sub`