Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,24 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr")
BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n")
BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")

//===----------------------------------------------------------------------===//
// Wave Reduction builtins.
//
// Each builtin takes the value to reduce followed by a reduction-strategy
// selector. The selector is declared with the 'I' modifier (must fold to an
// integer constant expression) because it is forwarded to an intrinsic operand
// that only accepts an immediate.
//===----------------------------------------------------------------------===//

BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "iiIi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_add_u32, "UiUiIi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "iiIi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_sub_u32, "UiUiIi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "iiIi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_min_u32, "UiUiIi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "iiIi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_max_u32, "UiUiIi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_and_b32, "iiIi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_or_b32, "iiIi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_xor_b32, "iiIi", "nc")

//===----------------------------------------------------------------------===//
// R600-NI only builtins.
//===----------------------------------------------------------------------===//
Expand Down
46 changes: 46 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19911,6 +19911,35 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}

/// Translate an AMDGPU wave-reduction builtin ID into the ID of the LLVM
/// intrinsic it lowers to.
///
/// \param BuiltinID one of the BI__builtin_amdgcn_wave_reduce_* builtin IDs.
/// \returns the matching Intrinsic::amdgcn_wave_reduce_* ID. Any other
/// builtin ID is treated as unreachable.
static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
  switch (BuiltinID) {
  // Addition: signed and unsigned builtins map to distinct intrinsics.
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32:
    return Intrinsic::amdgcn_wave_reduce_add;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
    return Intrinsic::amdgcn_wave_reduce_uadd;

  // Subtraction.
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32:
    return Intrinsic::amdgcn_wave_reduce_sub;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
    return Intrinsic::amdgcn_wave_reduce_usub;

  // Minimum.
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
    return Intrinsic::amdgcn_wave_reduce_min;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
    return Intrinsic::amdgcn_wave_reduce_umin;

  // Maximum.
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
    return Intrinsic::amdgcn_wave_reduce_max;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
    return Intrinsic::amdgcn_wave_reduce_umax;

  // Bitwise operations have a single (sign-agnostic) form each.
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
    return Intrinsic::amdgcn_wave_reduce_and;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
    return Intrinsic::amdgcn_wave_reduce_or;
  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
    return Intrinsic::amdgcn_wave_reduce_xor;

  default:
    llvm_unreachable("Unknown BuiltinID for wave reduction");
  }
}

Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
Expand Down Expand Up @@ -20212,6 +20241,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Value *Env = EmitScalarExpr(E->getArg(0));
return Builder.CreateCall(F, {Env});
}
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: {
  // Every wave-reduction builtin lowers the same way: emit both call
  // arguments in source order, then call the intrinsic selected by the
  // builtin ID, overloaded on the type of the reduced value.
  llvm::Value *Src = EmitScalarExpr(E->getArg(0));
  llvm::Value *StrategyArg = EmitScalarExpr(E->getArg(1));
  llvm::Function *ReduceFn = CGM.getIntrinsic(
      getIntrinsicIDforWaveReduction(BuiltinID), {Src->getType()});
  return Builder.CreateCall(ReduceFn, {Src, StrategyArg});
}
case AMDGPU::BI__builtin_amdgcn_read_exec:
return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
Expand Down
231 changes: 231 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,237 @@ void test_s_sendmsghalt_var(int in)
__builtin_amdgcn_s_sendmsghalt(1, in);
}

// Signed add reduction. Each test pins the strategy immediate (0, 1 or 2) as
// the second intrinsic operand so the three variants are told apart.

// CHECK-LABEL: @test_wave_reduce_add_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(i32 {{.*}}, i32 0)
void test_wave_reduce_add_i32_default(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_add_i32(in, 0);
}

// CHECK-LABEL: @test_wave_reduce_add_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(i32 {{.*}}, i32 1)
void test_wave_reduce_add_i32_iterative(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_add_i32(in, 1);
}

// CHECK-LABEL: @test_wave_reduce_add_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(i32 {{.*}}, i32 2)
void test_wave_reduce_add_i32_dpp(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_add_i32(in, 2);
}

// Unsigned add reduction. Each test pins the strategy immediate (0, 1 or 2)
// as the second intrinsic operand so the three variants are told apart.

// CHECK-LABEL: @test_wave_reduce_uadd_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 {{.*}}, i32 0)
void test_wave_reduce_uadd_i32_default(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_add_u32(in, 0);
}

// CHECK-LABEL: @test_wave_reduce_uadd_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 {{.*}}, i32 1)
void test_wave_reduce_uadd_i32_iterative(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_add_u32(in, 1);
}

// CHECK-LABEL: @test_wave_reduce_uadd_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 {{.*}}, i32 2)
void test_wave_reduce_uadd_i32_dpp(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_add_u32(in, 2);
}

// Signed sub reduction. Each test pins the strategy immediate (0, 1 or 2) as
// the second intrinsic operand so the three variants are told apart.

// CHECK-LABEL: @test_wave_reduce_sub_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 {{.*}}, i32 0)
void test_wave_reduce_sub_i32_default(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 0);
}

// CHECK-LABEL: @test_wave_reduce_sub_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 {{.*}}, i32 1)
void test_wave_reduce_sub_i32_iterative(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 1);
}

// CHECK-LABEL: @test_wave_reduce_sub_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 {{.*}}, i32 2)
void test_wave_reduce_sub_i32_dpp(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 2);
}

// Unsigned sub reduction. Each test pins the strategy immediate (0, 1 or 2)
// as the second intrinsic operand so the three variants are told apart.

// CHECK-LABEL: @test_wave_reduce_usub_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 {{.*}}, i32 0)
void test_wave_reduce_usub_i32_default(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_sub_u32(in, 0);
}

// CHECK-LABEL: @test_wave_reduce_usub_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 {{.*}}, i32 1)
void test_wave_reduce_usub_i32_iterative(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_sub_u32(in, 1);
}

// CHECK-LABEL: @test_wave_reduce_usub_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 {{.*}}, i32 2)
void test_wave_reduce_usub_i32_dpp(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_sub_u32(in, 2);
}

// Signed min reduction. Each test pins the strategy immediate (0, 1 or 2) as
// the second intrinsic operand so the three variants are told apart.

// CHECK-LABEL: @test_wave_reduce_min_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(i32 {{.*}}, i32 0)
void test_wave_reduce_min_i32_default(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_min_i32(in, 0);
}

// CHECK-LABEL: @test_wave_reduce_min_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(i32 {{.*}}, i32 1)
void test_wave_reduce_min_i32_iterative(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_min_i32(in, 1);
}

// CHECK-LABEL: @test_wave_reduce_min_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(i32 {{.*}}, i32 2)
void test_wave_reduce_min_i32_dpp(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_min_i32(in, 2);
}

// Unsigned min reduction. Each test pins the strategy immediate (0, 1 or 2)
// as the second intrinsic operand so the three variants are told apart.

// CHECK-LABEL: @test_wave_reduce_umin_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 {{.*}}, i32 0)
void test_wave_reduce_umin_i32_default(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_min_u32(in, 0);
}

// CHECK-LABEL: @test_wave_reduce_umin_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 {{.*}}, i32 1)
void test_wave_reduce_umin_i32_iterative(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_min_u32(in, 1);
}

// CHECK-LABEL: @test_wave_reduce_umin_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 {{.*}}, i32 2)
void test_wave_reduce_umin_i32_dpp(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_min_u32(in, 2);
}

// Signed max reduction. Each test pins the strategy immediate (0, 1 or 2) as
// the second intrinsic operand so the three variants are told apart.

// CHECK-LABEL: @test_wave_reduce_max_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(i32 {{.*}}, i32 0)
void test_wave_reduce_max_i32_default(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_max_i32(in, 0);
}

// CHECK-LABEL: @test_wave_reduce_max_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(i32 {{.*}}, i32 1)
void test_wave_reduce_max_i32_iterative(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_max_i32(in, 1);
}

// CHECK-LABEL: @test_wave_reduce_max_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(i32 {{.*}}, i32 2)
void test_wave_reduce_max_i32_dpp(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_max_i32(in, 2);
}

// Unsigned max reduction. Each test pins the strategy immediate (0, 1 or 2)
// as the second intrinsic operand so the three variants are told apart.

// CHECK-LABEL: @test_wave_reduce_umax_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 {{.*}}, i32 0)
void test_wave_reduce_umax_i32_default(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_max_u32(in, 0);
}

// CHECK-LABEL: @test_wave_reduce_umax_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 {{.*}}, i32 1)
void test_wave_reduce_umax_i32_iterative(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_max_u32(in, 1);
}

// CHECK-LABEL: @test_wave_reduce_umax_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 {{.*}}, i32 2)
void test_wave_reduce_umax_i32_dpp(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_max_u32(in, 2);
}

// Bitwise and reduction. Each test pins the strategy immediate (0, 1 or 2) as
// the second intrinsic operand so the three variants are told apart.

// CHECK-LABEL: @test_wave_reduce_and_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(i32 {{.*}}, i32 0)
void test_wave_reduce_and_i32_default(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_and_b32(in, 0);
}

// CHECK-LABEL: @test_wave_reduce_and_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(i32 {{.*}}, i32 1)
void test_wave_reduce_and_i32_iterative(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_and_b32(in, 1);
}

// CHECK-LABEL: @test_wave_reduce_and_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(i32 {{.*}}, i32 2)
void test_wave_reduce_and_i32_dpp(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_and_b32(in, 2);
}

// Bitwise or reduction. Each test pins the strategy immediate (0, 1 or 2) as
// the second intrinsic operand so the three variants are told apart.

// CHECK-LABEL: @test_wave_reduce_or_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32(i32 {{.*}}, i32 0)
void test_wave_reduce_or_i32_default(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_or_b32(in, 0);
}

// CHECK-LABEL: @test_wave_reduce_or_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32(i32 {{.*}}, i32 1)
void test_wave_reduce_or_i32_iterative(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_or_b32(in, 1);
}

// CHECK-LABEL: @test_wave_reduce_or_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32(i32 {{.*}}, i32 2)
void test_wave_reduce_or_i32_dpp(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_or_b32(in, 2);
}

// Bitwise xor reduction. Each test pins the strategy immediate (0, 1 or 2) as
// the second intrinsic operand so the three variants are told apart.

// CHECK-LABEL: @test_wave_reduce_xor_i32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 {{.*}}, i32 0)
void test_wave_reduce_xor_i32_default(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 0);
}

// CHECK-LABEL: @test_wave_reduce_xor_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 {{.*}}, i32 1)
void test_wave_reduce_xor_i32_iterative(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 1);
}

// CHECK-LABEL: @test_wave_reduce_xor_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 {{.*}}, i32 2)
void test_wave_reduce_xor_i32_dpp(global int* out, int in)
{
  *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 2);
}

// CHECK-LABEL: @test_s_barrier
// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier(
void test_s_barrier()
Expand Down
10 changes: 8 additions & 2 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2327,8 +2327,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;

// Defines one AMDGPUWaveReduce intrinsic record per entry of Operations. The
// record name is the defm prefix followed by the entry, e.g.
// int_amdgcn_wave_reduce_umin.
multiclass AMDGPUWaveReduceOps<list<string> Operations> {
  foreach Op = Operations in { def Op : AMDGPUWaveReduce; }
}

// Complete list of wave-reduction operations. umin and umax are generated
// from this list like every other entry, so they must not also be declared
// with standalone defs (that would define the same record twice).
defvar Operations = [
  "umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor"
];
defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps<Operations>;

def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
Expand Down
11 changes: 10 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4981,8 +4981,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
break;
}
case Intrinsic::amdgcn_wave_reduce_add:
case Intrinsic::amdgcn_wave_reduce_uadd:
case Intrinsic::amdgcn_wave_reduce_sub:
case Intrinsic::amdgcn_wave_reduce_usub:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_umax: {
case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_umax:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
case Intrinsic::amdgcn_wave_reduce_xor: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
Expand Down
Loading