[AMDGPU] Add builtins for wave reduction intrinsics #127013
Conversation
Currently, wave reduction intrinsics are supported only for the `umin` and `umax` operations. This patch extends support to the following operations for the `i32` type: `uadd`, `add`, `usub`, `sub`, `min`, `max`, `and`, `or`, `xor`.
Currently there are no plans to push these into a public header. Initial use is for testing purposes.
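A minimal usage sketch (not part of the patch; the kernel and buffer names are illustrative, only the builtin name comes from this change): each active lane passes its private value and receives the wave-wide sum in return.

// Hedged example: reduce one i32 per lane across the wave.
kernel void wave_sum(global int *out, global const int *in)
{
    int gid = get_global_id(0);
    // Every active lane receives the same reduced (uniform) result.
    out[gid] = __builtin_amdgcn_wave_reduce_add_i32(in[gid]);
}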
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-clang

Author: Aaditya (easyonaadit)

Changes: Currently there are no plans to push these into a public header. Initial use is for testing purposes.

Patch is 523.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127013.diff

27 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 39e295aced96b..2018f41f1007f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -346,6 +346,24 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr")
BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n")
BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")
+//===----------------------------------------------------------------------===//
+
+// Wave Reduction builtins.
+
+//===----------------------------------------------------------------------===//
+
+BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_uadd_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_usub_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_umin_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_umax_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_and_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_or_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_xor_i32, "ii", "nc")
+
//===----------------------------------------------------------------------===//
// R600-NI only builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 361e4c4bf2e2e..1cd4fe6a974a0 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20212,6 +20212,59 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Value *Env = EmitScalarExpr(E->getArg(0));
return Builder.CreateCall(F, {Env});
}
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_uadd_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_usub_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_umin_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_umax_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_i32: {
+ Intrinsic::ID IID;
+ switch (BuiltinID) {
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32:
+ IID = Intrinsic::amdgcn_wave_reduce_add;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_uadd_i32:
+ IID = Intrinsic::amdgcn_wave_reduce_uadd;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32:
+ IID = Intrinsic::amdgcn_wave_reduce_sub;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_usub_i32:
+ IID = Intrinsic::amdgcn_wave_reduce_usub;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
+ IID = Intrinsic::amdgcn_wave_reduce_min;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_umin_i32:
+ IID = Intrinsic::amdgcn_wave_reduce_umin;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
+ IID = Intrinsic::amdgcn_wave_reduce_max;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_umax_i32:
+ IID = Intrinsic::amdgcn_wave_reduce_umax;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_i32:
+ IID = Intrinsic::amdgcn_wave_reduce_and;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_i32:
+ IID = Intrinsic::amdgcn_wave_reduce_or;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_i32:
+ IID = Intrinsic::amdgcn_wave_reduce_xor;
+ break;
+ }
+ llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
+ llvm::Function *F = CGM.getIntrinsic(IID, {Src0->getType()});
+ llvm::Value *Strategy =
+ llvm::ConstantInt::get(llvm::Type::getInt32Ty(getLLVMContext()), 0);
+ return Builder.CreateCall(F, {Src0, Strategy});
+ }
case AMDGPU::BI__builtin_amdgcn_read_exec:
return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index ded5f6b5ac4fd..94a7b5ff9bbb2 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -398,6 +398,83 @@ void test_s_sendmsghalt_var(int in)
__builtin_amdgcn_s_sendmsghalt(1, in);
}
+// CHECK-LABEL: @test_wave_reduce_add_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
+void test_wave_reduce_add_i32(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_uadd_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.uadd.i32(
+void test_wave_reduce_uadd_i32(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_uadd_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_sub_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
+void test_wave_reduce_sub_i32(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_usub_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.usub.i32(
+void test_wave_reduce_usub_i32(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_usub_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
+void test_wave_reduce_min_i32(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_umin_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
+void test_wave_reduce_umin_i32(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_umin_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
+void test_wave_reduce_max_i32(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_umax_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
+void test_wave_reduce_umax_i32(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_umax_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_and_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
+void test_wave_reduce_and_i32(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_and_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_or_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32(
+void test_wave_reduce_or_i32(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_or_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_xor_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32(
+void test_wave_reduce_xor_i32(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_xor_i32(in);
+}
+
// CHECK-LABEL: @test_s_barrier
// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier(
void test_s_barrier()
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index eb7bde6999491..fac4228d3bc1f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2327,8 +2327,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
-def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+multiclass AMDGPUWaveReduceOps<list<string> Operations> {
+ foreach Op = Operations in { def Op : AMDGPUWaveReduce; }
+}
+
+defvar Operations = [
+ "umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor"
+];
+defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps<Operations>;
def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 2e5f42c3bdc40..7d8fb718a88ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4981,8 +4981,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
break;
}
+ case Intrinsic::amdgcn_wave_reduce_add:
+ case Intrinsic::amdgcn_wave_reduce_uadd:
+ case Intrinsic::amdgcn_wave_reduce_sub:
+ case Intrinsic::amdgcn_wave_reduce_usub:
+ case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
- case Intrinsic::amdgcn_wave_reduce_umax: {
+ case Intrinsic::amdgcn_wave_reduce_max:
+ case Intrinsic::amdgcn_wave_reduce_umax:
+ case Intrinsic::amdgcn_wave_reduce_and:
+ case Intrinsic::amdgcn_wave_reduce_or:
+ case Intrinsic::amdgcn_wave_reduce_xor: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b632c50dae0e3..6711809bf0d31 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4940,6 +4940,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}
+static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::S_MIN_U32:
+ return std::numeric_limits<uint32_t>::max();
+ case AMDGPU::S_MIN_I32:
+ return std::numeric_limits<int32_t>::max();
+ case AMDGPU::S_MAX_U32:
+ return std::numeric_limits<uint32_t>::min();
+ case AMDGPU::S_MAX_I32:
+ return std::numeric_limits<int32_t>::min();
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_XOR_B32:
+ return std::numeric_limits<uint32_t>::min();
+ case AMDGPU::S_AND_B32:
+ return std::numeric_limits<uint32_t>::max();
+ default:
+ llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
+ }
+}
+
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
MachineBasicBlock &BB,
const GCNSubtarget &ST,
@@ -4955,13 +4977,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
- // These operations with a uniform value i.e. SGPR are idempotent.
- // Reduced value will be same as given sgpr.
- // clang-format off
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
- .addReg(SrcReg);
- // clang-format on
- RetBB = &BB;
+ switch (Opc) {
+ case AMDGPU::S_MIN_U32:
+ case AMDGPU::S_MIN_I32:
+ case AMDGPU::S_MAX_U32:
+ case AMDGPU::S_MAX_I32:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_OR_B32: {
+ // Idempotent operations.
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+ RetBB = &BB;
+ break;
+ }
+ case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_SUB_I32: {
+ const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+ const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+ Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+
+ bool IsWave32 = ST.isWave32();
+ unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned CountReg =
+ IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+
+ auto Exec =
+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+ auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+ .addReg(Exec->getOperand(0).getReg());
+
+ switch (Opc) {
+ case AMDGPU::S_XOR_B32: {
+ // Performing an XOR operation on a uniform value
+ // depends on the parity of the number of active lanes.
+ // For even parity, the result will be 0, for odd
+ // parity the result will be the same as the input value.
+ Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+
+ auto ParityReg =
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+ .addReg(NewAccumulator->getOperand(0).getReg())
+ .addImm(1);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(SrcReg)
+ .addReg(ParityReg->getOperand(0).getReg());
+ break;
+ }
+ case AMDGPU::S_SUB_I32: {
+ Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
+ // Take the negation of the source operand.
+ auto InvertedValReg =
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
+ .addImm(-1)
+ .addReg(SrcReg);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(InvertedValReg->getOperand(0).getReg())
+ .addReg(NewAccumulator->getOperand(0).getReg());
+ break;
+ }
+ case AMDGPU::S_ADD_I32: {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(SrcReg)
+ .addReg(NewAccumulator->getOperand(0).getReg());
+ break;
+ }
+ }
+ RetBB = &BB;
+ }
+ }
} else {
// TODO: Implement DPP Strategy and switch based on immediate strategy
// operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4997,9 +5084,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Create initail values of induction variable from Exec, Accumulator and
- // insert branch instr to newly created ComputeBlockk
- uint32_t InitalValue =
- (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+ // insert branch instr to newly created ComputeBlock
+ uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
auto TmpSReg =
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5071,8 +5157,26 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
switch (MI.getOpcode()) {
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+ case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+ case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+ case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+ case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+ case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+ case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+ case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6e08aff24ec23..8a3e93a4faa95 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -277,16 +277,31 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
-let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
- def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
+// clang-format off
+defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
+multiclass
+ AMDGPUWaveReducePseudoGenerator<string Op, string DataType, string Size> {
+ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+ def !toupper(Op) #"_PSEUDO_" #DataType #Size
+ : VPseudoInstSI<(outs SGPR_32 : $sdst),
+ (ins VSrc_b32 : $src, VSrc_b32 : $strategy),
+ [(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
}
+}
+// clang-format on
- def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
- }
+// Input list : [Operation_name,
+// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
+// Size_in_bits]
+defvar Operations = [
+ ["umin", "U", "32"], ["min", "I", "32"], ["umax", "U", "32"],
+ ["max", "I", "32"], ["uadd", "U", "32"], ["add", "I", "32"],
+ ["usub", "U", "32"], ["sub", "I", "32"], ["and", "B", "32"],
+ ["or", "B", "32"], ["xor", "B", "32"]
+];
+
+foreach Op = Operations in {
+ defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1], Op[2]>;
}
let usesCustomInserter = 1, Defs = [VCC] in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
new file mode 100644
index 0000000000000..e93e8d4108ba0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -0,0 +1,1237 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0...
[truncated]
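For the uniform (SGPR) input case handled in the SIISelLowering change above, the new fast paths boil down to simple arithmetic on the active-lane count. A scalar model of that logic, as a hedged sketch in plain C rather than the emitted machine code (here n stands in for the S_BCNT1 popcount of EXEC):

#include <stdint.h>

/* Uniform-input shortcuts: every active lane holds the same value v,
   and n is the number of active lanes. */
static uint32_t wave_reduce_uniform(uint32_t v, uint32_t n, char op) {
  switch (op) {
  case '+': return v * n;                       /* add/uadd: v summed n times */
  case '-': return (uint32_t)(-(int32_t)v) * n; /* sub/usub: negate, then scale */
  case '^': return v * (n & 1);                 /* xor: cancels for an even lane count */
  default:  return v;                           /* min/max/and/or are idempotent */
  }
}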
Stacked on: #126469
Review comment on llvm/lib/Target/AMDGPU/SIInstructions.td:

        [(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
  }
}
// clang-format on
why do we need clang-format control here?
TableGen has started getting formatted by clang-format. This was getting tedious to read, so I put this in.