Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions cpp/include/cudf/null_mask.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,5 +283,21 @@ std::vector<size_type> batch_null_count(host_span<bitmask_type const* const> bit
size_type stop,
rmm::cuda_stream_view stream = cudf::get_default_stream());

/**
* @brief Given a validity bitmask, returns the index of the first set bit
* in the range `[start, stop)`.
*
* @param bitmask Validity bitmask residing in device memory
* @param start Index of the first bit to check (inclusive)
* @param stop Index of the last bit to check (exclusive)
* @param stream CUDA stream used for device memory operations and kernel launches
* @return The index of the first set bit in the specified range,
* or `stop-start` if no set bit is found (all nulls)
*/
size_type index_of_first_set_bit(bitmask_type const* bitmask,
size_type start,
size_type stop,
rmm::cuda_stream_view stream = cudf::get_default_stream());

/** @} */ // end of group
} // namespace CUDF_EXPORT cudf
79 changes: 77 additions & 2 deletions cpp/src/bitmask/null_mask.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <rmm/mr/device_memory_resource.hpp>

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cub/cub.cuh>
#include <cuda/atomic>
#include <thrust/execution_policy.h>
Expand All @@ -32,8 +33,6 @@
#include <limits>
#include <numeric>

namespace cg = cooperative_groups;

namespace cudf {
size_type state_null_count(mask_state state, size_type size)
{
Expand Down Expand Up @@ -156,6 +155,7 @@ CUDF_KERNEL void set_null_masks_kernel(cudf::device_span<bitmask_type*> destinat
cudf::device_span<bool const> valids,
cudf::device_span<size_type const> numbers_of_mask_words)
{
namespace cg = cooperative_groups;
auto const bitmask_idx = cg::this_grid().block_rank();
// Return early if nothing to do
if (begin_bits[bitmask_idx] == end_bits[bitmask_idx]) { return; }
Expand All @@ -173,6 +173,7 @@ CUDF_KERNEL void set_null_mask_kernel(bitmask_type* destination,
bool valid,
size_type number_of_mask_words)
{
namespace cg = cooperative_groups;
set_null_mask_impl<cg::grid_group, mask_set_mode::UNSAFE>(
destination, begin_bit, end_bit, valid, number_of_mask_words, cg::this_grid());
}
Expand Down Expand Up @@ -727,6 +728,71 @@ void set_all_valid_null_masks(column_view const& input,
}
}

namespace {

template <size_type block_size>
CUDF_KERNEL void find_first_set_bit_kernel(bitmask_type const* __restrict__ bitmask,
size_type start,
size_type stop,
size_type max,
size_type* index)
{
constexpr auto word_size = detail::size_in_bits<bitmask_type>();

namespace cg = cooperative_groups;
auto const block = cg::tiled_partition<block_size>(cg::this_thread_block());
auto const tid = cudf::detail::grid_1d::global_thread_id<block_size>();

cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{*(index)};
if (ref.load(cuda::std::memory_order_relaxed) != max) {
return; // early exit if bit has already been found
}

auto const end_word_index = word_index(stop);

auto const thread_word_index = tid + word_index(start);
auto bit_index = max;
if (thread_word_index <= end_word_index) {
auto const mask = detail::get_mask_offset_word(bitmask, tid, start, stop);
// returned index is 1-based; 0 means no bits were set
auto mask_bit_index = __ffs(mask);
if (mask_bit_index != 0) {
bit_index = static_cast<size_type>(tid * word_size) + mask_bit_index - 1;
}
}
size_type out_index = cg::reduce(block, bit_index, cg::less<size_type>());
block.sync();

if (block.thread_rank() == 0 && out_index != max) {
ref.fetch_min(out_index, cuda::std::memory_order_relaxed);
}
}
} // namespace

size_type index_of_first_set_bit(bitmask_type const* bitmask,
size_type start,
size_type stop,
rmm::cuda_stream_view stream)
{
CUDF_EXPECTS(
start >= 0 and start <= stop and start != stop, "Invalid bit range.", std::invalid_argument);
if (bitmask == nullptr) { return 0; }

auto const bit_count = stop - start;
auto const mask_words = num_bitmask_words(bit_count);

auto d_index =
cudf::detail::device_scalar<size_type>(stream, cudf::get_current_device_resource_ref());
d_index.set_value_async(bit_count, stream); // init to no set bits found

constexpr size_type block_size = 256;
auto const grid = grid_1d{mask_words + 1, block_size};
find_first_set_bit_kernel<block_size>
<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
bitmask, start, stop, bit_count, d_index.data());
return d_index.value(stream);
}

} // namespace detail

// Create a bitmask from a specific range
Expand Down Expand Up @@ -800,4 +866,13 @@ std::vector<size_type> batch_null_count(host_span<bitmask_type const* const> bit
return counts;
}

size_type index_of_first_set_bit(bitmask_type const* bitmask,
size_type start,
size_type stop,
rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();
return detail::index_of_first_set_bit(bitmask, start, stop, stream);
}

} // namespace cudf
44 changes: 44 additions & 0 deletions cpp/tests/bitmask/bitmask_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ TEST_F(CountBitmaskTest, NegativeStart)
std::invalid_argument);
EXPECT_THROW(cudf::detail::valid_count(mask.data(), -1, 32, cudf::get_default_stream()),
std::invalid_argument);
EXPECT_THROW(cudf::index_of_first_set_bit(mask.data(), -1, 32), std::invalid_argument);

std::vector<cudf::size_type> indices = {0, 16, -1, 32};
EXPECT_THROW(
Expand All @@ -115,6 +116,7 @@ TEST_F(CountBitmaskTest, StartLargerThanStop)
std::invalid_argument);
EXPECT_THROW(cudf::detail::valid_count(mask.data(), 32, 31, cudf::get_default_stream()),
std::invalid_argument);
EXPECT_THROW(cudf::index_of_first_set_bit(mask.data(), 32, 31), std::invalid_argument);

std::vector<cudf::size_type> indices = {0, 16, 31, 30};
EXPECT_THROW(
Expand All @@ -130,6 +132,7 @@ TEST_F(CountBitmaskTest, EmptyRange)
auto mask = make_mask(1);
EXPECT_EQ(0, cudf::detail::count_set_bits(mask.data(), 17, 17, cudf::get_default_stream()));
EXPECT_EQ(0, cudf::detail::valid_count(mask.data(), 17, 17, cudf::get_default_stream()));
EXPECT_THROW(cudf::index_of_first_set_bit(mask.data(), 17, 17), std::invalid_argument);

std::vector<cudf::size_type> indices = {0, 0, 17, 17};
auto set_counts =
Expand Down Expand Up @@ -360,6 +363,47 @@ TEST_F(CountBitmaskTest, BatchNullCount)
::testing::ElementsAreArray(std::vector<cudf::size_type>{3, 3, 2, 1, 6, 0}));
}

struct iofub_test_parameter {
cudf::size_type size;
cudf::size_type set_index;
cudf::size_type start_index;
cudf::size_type result;
};

TEST_F(CountBitmaskTest, IndexOfFirstUnsetBit)
{
auto parameters = std::vector<iofub_test_parameter>({
// clang-format off
// rows set start result
{ 28, 0, 0, 0}, // less than sizeof bitmask_type
{ 28, 10, 10, 0},
{ 32, 0, 0, 0}, // equal to sizeof bitmask_type
{ 32, 10, 10, 0},
{ 32, 10, 11, 21}, // set bit is not in range
{ 64, 33, 0, 33}, // exactly 2x sizeof bitmask_type
{ 260, 258, 0, 258}, // greater than 256 bits
{ 260, 258, 32, 226},
{ 320, 260, 60, 200},
{ 320, 260, 256, 4},
{9000, 2, 0, 2}, // more than one CUDA block
{9000, 260, 256, 4},
{9000, 8193, 0, 8193},
{9000, 8193, 8192, 1},
{9000, 8, 80, 8920}, // set bit is not in range
// clang-format on
});
for (auto parm : parameters) {
auto data = std::vector<bool>(parm.size, false);
data[parm.set_index] = true;
auto input =
cudf::test::fixed_width_column_wrapper<int, bool>(data.begin(), data.end(), data.begin())
.release();
auto result = cudf::index_of_first_set_bit(
input->view().null_mask(), parm.start_index, input->view().size());
EXPECT_EQ(result, parm.result);
}
}

using CountUnsetBitsTest = CountBitmaskTest;

TEST_F(CountUnsetBitsTest, SingleBitAllSet)
Expand Down
7 changes: 7 additions & 0 deletions python/pylibcudf/pylibcudf/libcudf/null_mask.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,10 @@ cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil:
size_type stop,
cuda_stream_view stream
)

cdef size_type index_of_first_set_bit(
const bitmask_type * bitmask,
size_type start,
size_type stop,
cuda_stream_view stream
)
7 changes: 7 additions & 0 deletions python/pylibcudf/pylibcudf/null_mask.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,10 @@ cpdef size_type null_count(
size_type stop,
Stream stream=*
)

cpdef size_type index_of_first_set_bit(
object bitmask,
size_type start,
size_type stop,
Stream stream=*
)
3 changes: 3 additions & 0 deletions python/pylibcudf/pylibcudf/null_mask.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,6 @@ def bitmask_or(
def null_count(
bitmask: Span, start: int, stop: int, stream: Stream | None = None
) -> int: ...
def index_of_first_set_bit(
bitmask: Span, start: int, stop: int, stream: Stream | None = None
) -> int: ...
42 changes: 42 additions & 0 deletions python/pylibcudf/pylibcudf/null_mask.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ __all__ = [
"copy_bitmask",
"create_null_mask",
"null_count",
"index_of_first_set_bit",
]

cdef DeviceBuffer buffer_to_python(
Expand Down Expand Up @@ -284,3 +285,44 @@ cpdef size_type null_count(
stop,
stream.view()
)

cpdef size_type index_of_first_set_bit(
object bitmask,
size_type start,
size_type stop,
Stream stream=None
):
"""Given a validity bitmask, returns the index of the first valid element.

For details, see :cpp:func:`index_of_first_set_bit`.

Parameters
----------
bitmask : Span-like object
Object with ptr and size attributes (e.g., gpumemoryview, Buffer, DeviceBuffer).
start : int
Index of the first bit to check (inclusive).
stop : int
Index of the last bit to check (exclusive).
stream : Stream | None
CUDA stream on which to perform the operation.

Returns
-------
int
The number of null elements in the specified range.
"""
if not py_is_span(bitmask):
raise TypeError(
f"bitmask must satisfy Span protocol (have .ptr and .size), "
f"got {type(bitmask).__name__}"
)
cdef uintptr_t ptr = bitmask.ptr
stream = _get_stream(stream)
with nogil:
return cpp_null_mask.index_of_first_set_bit(
<bitmask_type*>ptr,
start,
stop,
stream.view()
)
Loading