Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,24 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr")
BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n")
BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")

//===----------------------------------------------------------------------===//

// Wave Reduction builtins.

//===----------------------------------------------------------------------===//

BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "ii", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_uadd_i32, "ii", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "ii", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_usub_i32, "ii", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "ii", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_umin_i32, "ii", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "ii", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_umax_i32, "ii", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_and_i32, "ii", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_or_i32, "ii", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_xor_i32, "ii", "nc")

//===----------------------------------------------------------------------===//
// R600-NI only builtins.
//===----------------------------------------------------------------------===//
Expand Down
53 changes: 53 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20212,6 +20212,59 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Value *Env = EmitScalarExpr(E->getArg(0));
return Builder.CreateCall(F, {Env});
}
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_uadd_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_usub_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_umin_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_umax_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_i32: {
Intrinsic::ID IID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32:
IID = Intrinsic::amdgcn_wave_reduce_add;
break;
case AMDGPU::BI__builtin_amdgcn_wave_reduce_uadd_i32:
IID = Intrinsic::amdgcn_wave_reduce_uadd;
break;
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32:
IID = Intrinsic::amdgcn_wave_reduce_sub;
break;
case AMDGPU::BI__builtin_amdgcn_wave_reduce_usub_i32:
IID = Intrinsic::amdgcn_wave_reduce_usub;
break;
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
IID = Intrinsic::amdgcn_wave_reduce_min;
break;
case AMDGPU::BI__builtin_amdgcn_wave_reduce_umin_i32:
IID = Intrinsic::amdgcn_wave_reduce_umin;
break;
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
IID = Intrinsic::amdgcn_wave_reduce_max;
break;
case AMDGPU::BI__builtin_amdgcn_wave_reduce_umax_i32:
IID = Intrinsic::amdgcn_wave_reduce_umax;
break;
case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_i32:
IID = Intrinsic::amdgcn_wave_reduce_and;
break;
case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_i32:
IID = Intrinsic::amdgcn_wave_reduce_or;
break;
case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_i32:
IID = Intrinsic::amdgcn_wave_reduce_xor;
break;
}
llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
llvm::Function *F = CGM.getIntrinsic(IID, {Src0->getType()});
llvm::Value *Strategy =
llvm::ConstantInt::get(llvm::Type::getInt32Ty(getLLVMContext()), 0);
return Builder.CreateCall(F, {Src0, Strategy});
}
case AMDGPU::BI__builtin_amdgcn_read_exec:
return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
Expand Down
77 changes: 77 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,83 @@ void test_s_sendmsghalt_var(int in)
__builtin_amdgcn_s_sendmsghalt(1, in);
}

// CHECK-LABEL: @test_wave_reduce_add_i32
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
void test_wave_reduce_add_i32(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_add_i32(in);
}

// CHECK-LABEL: @test_wave_reduce_uadd_i32
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.uadd.i32(
void test_wave_reduce_uadd_i32(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_uadd_i32(in);
}

// CHECK-LABEL: @test_wave_reduce_sub_i32
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
void test_wave_reduce_sub_i32(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_sub_i32(in);
}

// CHECK-LABEL: @test_wave_reduce_usub_i32
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.usub.i32(
void test_wave_reduce_usub_i32(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_usub_i32(in);
}

// CHECK-LABEL: @test_wave_reduce_min_i32
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
void test_wave_reduce_min_i32(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_min_i32(in);
}

// CHECK-LABEL: @test_wave_reduce_umin_i32
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
void test_wave_reduce_umin_i32(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_umin_i32(in);
}

// CHECK-LABEL: @test_wave_reduce_max_i32
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
void test_wave_reduce_max_i32(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_max_i32(in);
}

// CHECK-LABEL: @test_wave_reduce_umax_i32
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
void test_wave_reduce_umax_i32(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_umax_i32(in);
}

// CHECK-LABEL: @test_wave_reduce_and_i32
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
void test_wave_reduce_and_i32(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_and_i32(in);
}

// CHECK-LABEL: @test_wave_reduce_or_i32
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32(
void test_wave_reduce_or_i32(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_or_i32(in);
}

// CHECK-LABEL: @test_wave_reduce_xor_i32
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32(
void test_wave_reduce_xor_i32(global int* out, int in)
{
*out = __builtin_amdgcn_wave_reduce_xor_i32(in);
}

// CHECK-LABEL: @test_s_barrier
// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier(
void test_s_barrier()
Expand Down
10 changes: 8 additions & 2 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2327,8 +2327,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
multiclass AMDGPUWaveReduceOps<list<string> Operations> {
foreach Op = Operations in { def Op : AMDGPUWaveReduce; }
}

defvar Operations = [
"umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor"
];
defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps<Operations>;

def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
Expand Down
11 changes: 10 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4981,8 +4981,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
break;
}
case Intrinsic::amdgcn_wave_reduce_add:
case Intrinsic::amdgcn_wave_reduce_uadd:
case Intrinsic::amdgcn_wave_reduce_sub:
case Intrinsic::amdgcn_wave_reduce_usub:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_umax: {
case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_umax:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
case Intrinsic::amdgcn_wave_reduce_xor: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
Expand Down
124 changes: 114 additions & 10 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4940,6 +4940,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}

static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
switch (Opc) {
case AMDGPU::S_MIN_U32:
return std::numeric_limits<uint32_t>::max();
case AMDGPU::S_MIN_I32:
return std::numeric_limits<int32_t>::max();
case AMDGPU::S_MAX_U32:
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_MAX_I32:
return std::numeric_limits<int32_t>::min();
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_OR_B32:
case AMDGPU::S_XOR_B32:
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_AND_B32:
return std::numeric_limits<uint32_t>::max();
default:
llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
}
}

static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
MachineBasicBlock &BB,
const GCNSubtarget &ST,
Expand All @@ -4955,13 +4977,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
// clang-format off
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
.addReg(SrcReg);
// clang-format on
RetBB = &BB;
switch (Opc) {
case AMDGPU::S_MIN_U32:
case AMDGPU::S_MIN_I32:
case AMDGPU::S_MAX_U32:
case AMDGPU::S_MAX_I32:
case AMDGPU::S_AND_B32:
case AMDGPU::S_OR_B32: {
// Idempotent operations.
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
RetBB = &BB;
break;
}
case AMDGPU::S_XOR_B32:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);

bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned CountReg =
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

auto Exec =
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);

auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
.addReg(Exec->getOperand(0).getReg());

switch (Opc) {
case AMDGPU::S_XOR_B32: {
// Performing an XOR operation on a uniform value
// depends on the parity of the number of active lanes.
// For even parity, the result will be 0, for odd
// parity the result will be the same as the input value.
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);

auto ParityReg =
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(ParityReg->getOperand(0).getReg());
break;
}
case AMDGPU::S_SUB_I32: {
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);

// Take the negation of the source operand.
auto InvertedValReg =
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
.addImm(-1)
.addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(InvertedValReg->getOperand(0).getReg())
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
case AMDGPU::S_ADD_I32: {
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
}
RetBB = &BB;
}
}
} else {
// TODO: Implement DPP Strategy and switch based on immediate strategy
// operand. For now, for all the cases (default, Iterative and DPP we use
Expand Down Expand Up @@ -4997,9 +5084,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

// Create initail values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlockk
uint32_t InitalValue =
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
// insert branch instr to newly created ComputeBlock
uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
auto TmpSReg =
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
Expand Down Expand Up @@ -5071,8 +5157,26 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
switch (MI.getOpcode()) {
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
Expand Down
31 changes: 23 additions & 8 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -277,16 +277,31 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;

let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
// clang-format off
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
multiclass
AMDGPUWaveReducePseudoGenerator<string Op, string DataType, string Size> {
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def !toupper(Op) #"_PSEUDO_" #DataType #Size
: VPseudoInstSI<(outs SGPR_32 : $sdst),
(ins VSrc_b32 : $src, VSrc_b32 : $strategy),
[(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
}
}
// clang-format on
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we need clang-format control here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tablegen has started getting formatted by clang-format.
This was getting tedious to read, so I put this in.


def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
}
// Input list : [Operation_name,
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
// Size_in_bits]
defvar Operations = [
["umin", "U", "32"], ["min", "I", "32"], ["umax", "U", "32"],
["max", "I", "32"], ["uadd", "U", "32"], ["add", "I", "32"],
["usub", "U", "32"], ["sub", "I", "32"], ["and", "B", "32"],
["or", "B", "32"], ["xor", "B", "32"]
];

foreach Op = Operations in {
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1], Op[2]>;
}

let usesCustomInserter = 1, Defs = [VCC] in {
Expand Down
Loading