Skip to content

Commit cbebbcc

Browse files
committed
clang/AMDGPU: Add __builtin_amdgcn_inverse_ballot_w{32,64}
Add builtins that expose the underlying llvm.amdgcn.inverse.ballot intrinsic that we've had for a while. This allows more explicitly writing code that selects or branches in terms of lane masks, which can lead to better code quality.
1 parent 658a931 commit cbebbcc

File tree

5 files changed

+41
-0
lines changed

5 files changed

+41
-0
lines changed

clang/docs/LanguageExtensions.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5162,6 +5162,23 @@ If no address spaces names are provided, all address spaces are fenced.
51625162
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local")
51635163
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global")
51645164
5165+
__builtin_amdgcn_ballot_w{32,64}
5166+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
5167+
5168+
``__builtin_amdgcn_ballot_w{32,64}`` returns a bitmask that contains its
5169+
boolean argument as a bit for every lane of the current wave that is currently
5170+
active (i.e., that is converged with the executing thread), and a 0 bit for
5171+
every lane that is not active.
5172+
5173+
The result is uniform, i.e. it is the same in every active thread of the wave.
5174+
5175+
__builtin_amdgcn_inverse_ballot_w{32,64}
5176+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
5177+
5178+
Given a wave-uniform bitmask, ``__builtin_amdgcn_inverse_ballot_w{32,64}(mask)``
5179+
returns the bit at the position of the current lane. It is almost equivalent to
5180+
``(mask & (1ull << lane_id)) != 0``, except that its behavior is only defined if
5181+
the given mask has the same value for all active lanes of the current wave.
51655182
51665183
ARM/AArch64 Language Extensions
51675184
-------------------------------

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,9 @@ TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi",
183183
TARGET_BUILTIN(__builtin_amdgcn_ballot_w32, "ZUib", "nc", "wavefrontsize32")
184184
BUILTIN(__builtin_amdgcn_ballot_w64, "WUib", "nc")
185185

186+
TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w32, "bZUi", "nc", "wavefrontsize32")
187+
TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w64, "bWUi", "nc", "wavefrontsize64")
188+
186189
// Deprecated intrinsics in favor of __builtin_amdgcn_ballot_{w32|w64}
187190
BUILTIN(__builtin_amdgcn_uicmp, "WUiUiUiIi", "nc")
188191
BUILTIN(__builtin_amdgcn_uicmpl, "WUiWUiWUiIi", "nc")

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
504504
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
505505
return Builder.CreateCall(F, { Src });
506506
}
507+
case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
508+
case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
509+
llvm::Value *Src = EmitScalarExpr(E->getArg(0));
510+
Function *F =
511+
CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, {Src->getType()});
512+
return Builder.CreateCall(F, {Src});
513+
}
507514
case AMDGPU::BI__builtin_amdgcn_tanhf:
508515
case AMDGPU::BI__builtin_amdgcn_tanhh:
509516
case AMDGPU::BI__builtin_amdgcn_tanh_bf16:

clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,13 @@ void test_ballot_wave32_target_attr(global uint* out, int a, int b)
2424
*out = __builtin_amdgcn_ballot_w32(a == b);
2525
}
2626

27+
// CHECK-LABEL: @test_inverse_ballot_wave32(
28+
// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %{{.+}})
29+
void test_inverse_ballot_wave32(global bool* out, int a)
30+
{
31+
*out = __builtin_amdgcn_inverse_ballot_w32(a);
32+
}
33+
2734
// CHECK-LABEL: @test_read_exec(
2835
// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true)
2936
void test_read_exec(global uint* out) {

clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@ void test_ballot_wave64_target_attr(global ulong* out, int a, int b)
2323
*out = __builtin_amdgcn_ballot_w64(a == b);
2424
}
2525

26+
// CHECK-LABEL: @test_inverse_ballot_wave64(
27+
// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %{{.+}})
28+
void test_inverse_ballot_wave64(global bool* out, ulong a)
29+
{
30+
*out = __builtin_amdgcn_inverse_ballot_w64(a);
31+
}
32+
2633
// CHECK-LABEL: @test_read_exec(
2734
// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true)
2835
void test_read_exec(global ulong* out) {

0 commit comments

Comments
 (0)