Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ int main() {
// Compute the sum reduction of `data` using a custom kernel
constexpr int block_size = 256;
int const num_blocks = cuda::ceil_div(N, block_size);
reduce<block_size><<<num_blocks, block_size>>>(cuda::std::span<int const>(thrust::raw_pointer_cast(data.data()), data.size()),
cuda::std::span<int>(thrust::raw_pointer_cast(kernel_result.data()), 1));
reduce<block_size><<<num_blocks, block_size>>>(cuda::std::span<int const>(cuda::std::to_address(data.data()), data.size()),
cuda::std::span<int>(cuda::std::to_address(kernel_result.data()), 1));

auto const err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
Expand Down
6 changes: 3 additions & 3 deletions c2h/generators.cu
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,14 @@ public:

generate();

return thrust::raw_pointer_cast(m_distribution.data());
return cuda::std::to_address(m_distribution.data());
}

// re-fills the currently held distribution vector with new random values
void generate()
{
#if C2H_HAS_CURAND
curandGenerateUniform(m_gen, thrust::raw_pointer_cast(m_distribution.data()), m_distribution.size());
curandGenerateUniform(m_gen, cuda::std::to_address(m_distribution.data()), m_distribution.size());
#else
thrust::tabulate(device_policy, m_distribution.begin(), m_distribution.end(), i_to_rnd_t{m_gen});
m_gen.discard(m_distribution.size());
Expand Down Expand Up @@ -246,7 +246,7 @@ void init_key_segments(::cuda::std::span<const OffsetT> segment_offsets, KeyT* d
device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
# endif // THRUST_VERSION >= 300100

d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
d_temp_storage = cuda::std::to_address(temp_storage.data());

// TODO(bgruber): replace by a non-CUB implementation
cub::DeviceCopy::Batched(
Expand Down
18 changes: 8 additions & 10 deletions c2h/include/c2h/generators.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ void gen(seed_t seed,
{
detail::gen_custom_type_state(
seed,
reinterpret_cast<char*>(THRUST_NS_QUALIFIER::raw_pointer_cast(data.data())),
reinterpret_cast<char*>(cuda::std::to_address(data.data())),
min,
max,
data.size(),
Expand All @@ -110,13 +110,13 @@ void gen(seed_t seed,
T min = ::cuda::std::numeric_limits<T>::lowest(),
T max = ::cuda::std::numeric_limits<T>::max())
{
detail::gen_values_between(seed, {THRUST_NS_QUALIFIER::raw_pointer_cast(data.data()), data.size()}, min, max);
detail::gen_values_between(seed, {cuda::std::to_address(data.data()), data.size()}, min, max);
}

template <typename T>
void gen(modulo_t mod, device_vector<T>& data)
{
detail::gen_values_cyclic(mod, ::cuda::std::span<T>{THRUST_NS_QUALIFIER::raw_pointer_cast(data.data()), data.size()});
detail::gen_values_cyclic(mod, ::cuda::std::span<T>{cuda::std::to_address(data.data()), data.size()});
}

/**
Expand All @@ -132,7 +132,7 @@ device_vector<T> gen_uniform_offsets(seed_t seed, T total_elements, T min_segmen
device_vector<T> segment_offsets(total_elements + 2);
const auto new_size = detail::gen_uniform_offsets(
seed,
{THRUST_NS_QUALIFIER::raw_pointer_cast(segment_offsets.data()), segment_offsets.size()},
{cuda::std::to_address(segment_offsets.data()), segment_offsets.size()},
total_elements,
min_segment_size,
max_segment_size);
Expand All @@ -148,19 +148,17 @@ template <typename OffsetT, typename KeyT>
void init_key_segments(const device_vector<OffsetT>& segment_offsets, device_vector<KeyT>& keys_out)
{
detail::init_key_segments(
::cuda::std::span<const OffsetT>{
THRUST_NS_QUALIFIER::raw_pointer_cast(segment_offsets.data()), segment_offsets.size()},
THRUST_NS_QUALIFIER::raw_pointer_cast(keys_out.data()),
::cuda::std::span<const OffsetT>{cuda::std::to_address(segment_offsets.data()), segment_offsets.size()},
cuda::std::to_address(keys_out.data()),
sizeof(KeyT));
}

template <typename OffsetT, template <typename> class... Ps>
void init_key_segments(const device_vector<OffsetT>& segment_offsets, device_vector<custom_type_t<Ps...>>& keys_out)
{
detail::init_key_segments(
::cuda::std::span<const OffsetT>{
THRUST_NS_QUALIFIER::raw_pointer_cast(segment_offsets.data()), segment_offsets.size()},
static_cast<custom_type_state_t*>(THRUST_NS_QUALIFIER::raw_pointer_cast(keys_out.data())),
::cuda::std::span<const OffsetT>{cuda::std::to_address(segment_offsets.data()), segment_offsets.size()},
static_cast<custom_type_state_t*>(cuda::std::to_address(keys_out.data())),
sizeof(custom_type_t<Ps...>));
}
} // namespace c2h
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/adjacent_difference/subtract_left.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<T> in = generate(elements);
thrust::device_vector<T> out(elements);

input_it_t d_in = thrust::raw_pointer_cast(in.data());
output_it_t d_out = thrust::raw_pointer_cast(out.data());
input_it_t d_in = cuda::std::to_address(in.data());
output_it_t d_out = cuda::std::to_address(out.data());

state.add_element_count(elements);
state.add_global_memory_reads<T>(elements);
Expand All @@ -60,7 +60,7 @@ void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)
);

thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes, thrust::no_init);
std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
std::uint8_t* d_temp_storage = cuda::std::to_address(temp_storage.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
cub::detail::adjacent_difference::
Expand Down
18 changes: 9 additions & 9 deletions cub/benchmarks/bench/copy/memcpy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ void gen_it(T* d_buffer,
bool randomize,
thrust::default_random_engine& rne)
{
OffsetT* d_offsets = thrust::raw_pointer_cast(offsets.data());
OffsetT* d_offsets = cuda::std::to_address(offsets.data());

if (randomize)
{
Expand All @@ -134,7 +134,7 @@ void gen_it(T* d_buffer,
thrust::tabulate(sizes.begin(), sizes.end(), offset_to_size_t<T, OffsetT>{d_offsets});
thrust::scatter(sizes.begin(), sizes.end(), map.begin(), offsets.begin());
thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin());
OffsetT* d_map = thrust::raw_pointer_cast(map.data());
OffsetT* d_map = cuda::std::to_address(map.data());
thrust::tabulate(output.begin(), output.end(), reordered_offset_to_ptr_t<T, OffsetT>{d_buffer, d_map, d_offsets});
}
else
Expand Down Expand Up @@ -177,9 +177,9 @@ void copy(nvbench::state& state,
thrust::device_vector<offset_t> offsets =
generate.uniform.segment_offsets(elements, min_buffer_size, max_buffer_size);

T* d_input_buffer = thrust::raw_pointer_cast(input_buffer.data());
T* d_output_buffer = thrust::raw_pointer_cast(output_buffer.data());
offset_t* d_offsets = thrust::raw_pointer_cast(offsets.data());
T* d_input_buffer = cuda::std::to_address(input_buffer.data());
T* d_output_buffer = cuda::std::to_address(output_buffer.data());
offset_t* d_offsets = cuda::std::to_address(offsets.data());

const auto buffers = offsets.size() - 1;

Expand All @@ -197,9 +197,9 @@ void copy(nvbench::state& state,
offsets.shrink_to_fit();
d_offsets = nullptr;

input_buffer_it_t d_input_buffers = thrust::raw_pointer_cast(input_buffers.data());
output_buffer_it_t d_output_buffers = thrust::raw_pointer_cast(output_buffers.data());
buffer_size_it_t d_buffer_sizes = thrust::raw_pointer_cast(buffer_sizes.data());
input_buffer_it_t d_input_buffers = cuda::std::to_address(input_buffers.data());
output_buffer_it_t d_output_buffers = cuda::std::to_address(output_buffers.data());
buffer_size_it_t d_buffer_sizes = cuda::std::to_address(buffer_sizes.data());

state.add_element_count(elements);
state.add_global_memory_writes<T>(elements);
Expand All @@ -214,7 +214,7 @@ void copy(nvbench::state& state,
d_temp_storage, temp_storage_bytes, d_input_buffers, d_output_buffers, d_buffer_sizes, buffers, 0);

thrust::device_vector<nvbench::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
d_temp_storage = cuda::std::to_address(temp_storage.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch& launch) {
Expand Down
10 changes: 5 additions & 5 deletions cub/benchmarks/bench/find_if/base.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,21 @@ void find_if(nvbench::state& state, nvbench::type_list<T, OffsetT>)
cub::DeviceFind::FindIf(
d_temp_storage,
temp_storage_bytes,
thrust::raw_pointer_cast(dinput.data()),
thrust::raw_pointer_cast(d_result.data()),
cuda::std::to_address(dinput.data()),
cuda::std::to_address(d_result.data()),
cuda::equal_to_value<T>(val),
static_cast<OffsetT>(dinput.size()),
0);

thrust::device_vector<uint8_t> temp_storage(temp_storage_bytes, thrust::no_init);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
d_temp_storage = cuda::std::to_address(temp_storage.data());

state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
cub::DeviceFind::FindIf(
d_temp_storage,
temp_storage_bytes,
thrust::raw_pointer_cast(dinput.data()),
thrust::raw_pointer_cast(d_result.data()),
cuda::std::to_address(dinput.data()),
cuda::std::to_address(d_result.data()),
cuda::equal_to_value<T>(val),
static_cast<OffsetT>(dinput.size()),
launch.get_stream());
Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/for_each/base.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ void for_each(nvbench::state& state, nvbench::type_list<T, OffsetT>)

thrust::device_vector<T> in(elements, T{42});

input_it_t d_in = thrust::raw_pointer_cast(in.data());
input_it_t d_in = cuda::std::to_address(in.data());
// `d_out` exists for visibility
// All inputs are equal to `42`, while the operator is searching for `0`.
// If the operator finds `0` in the input sequence, it's an issue leading to a segfault.
Expand All @@ -45,7 +45,7 @@ void for_each(nvbench::state& state, nvbench::type_list<T, OffsetT>)
cub::DeviceFor::ForEachN(nullptr, temp_size, d_in, elements, op);

thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());
auto* temp_storage = cuda::std::to_address(temp.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
cub::DeviceFor::ForEachN(temp_storage, temp_size, d_in, elements, op, launch.get_stream());
Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/for_each/copy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ void for_each(nvbench::state& state, nvbench::type_list<T, OffsetT>)

thrust::device_vector<T> in(elements, T{42});

input_it_t d_in = thrust::raw_pointer_cast(in.data());
input_it_t d_in = cuda::std::to_address(in.data());
output_it_t d_out = nullptr;

state.add_element_count(elements);
Expand All @@ -42,7 +42,7 @@ void for_each(nvbench::state& state, nvbench::type_list<T, OffsetT>)
cub::DeviceFor::ForEachCopyN(nullptr, temp_size, d_in, elements, op);

thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());
auto* temp_storage = cuda::std::to_address(temp.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
cub::DeviceFor::ForEachCopyN(temp_storage, temp_size, d_in, elements, op, launch.get_stream());
Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/for_each/extents.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ void for_each_in_extents(nvbench::state& state, nvbench::type_list<T, OffsetT>)

thrust::device_vector<T> in(elements, T{42});
thrust::device_vector<T> out(elements);
it_t d_in = thrust::raw_pointer_cast(in.data());
it_t d_out = thrust::raw_pointer_cast(out.data());
it_t d_in = cuda::std::to_address(in.data());
it_t d_out = cuda::std::to_address(out.data());
state.add_element_count(elements);
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);
Expand Down
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/histogram/even.cu
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
thrust::device_vector<SampleT> input = generate(elements, entropy, lower_level, upper_level);
thrust::device_vector<CounterT> hist(num_bins);

SampleT* d_input = thrust::raw_pointer_cast(input.data());
CounterT* d_histogram = thrust::raw_pointer_cast(hist.data());
SampleT* d_input = cuda::std::to_address(input.data());
CounterT* d_histogram = cuda::std::to_address(hist.data());

std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes{};
Expand Down Expand Up @@ -93,7 +93,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
is_byte_sample);

thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());
d_temp_storage = cuda::std::to_address(tmp.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::DispatchEven(
Expand Down
10 changes: 5 additions & 5 deletions cub/benchmarks/bench/histogram/multi/even.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,10 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
thrust::device_vector<CounterT> hist_b(num_bins);
thrust::device_vector<SampleT> input = generate(elements * num_channels, entropy, lower_level_r, upper_level_r);

SampleT* d_input = thrust::raw_pointer_cast(input.data());
CounterT* d_histogram_r = thrust::raw_pointer_cast(hist_r.data());
CounterT* d_histogram_g = thrust::raw_pointer_cast(hist_g.data());
CounterT* d_histogram_b = thrust::raw_pointer_cast(hist_b.data());
SampleT* d_input = cuda::std::to_address(input.data());
CounterT* d_histogram_r = cuda::std::to_address(hist_r.data());
CounterT* d_histogram_g = cuda::std::to_address(hist_g.data());
CounterT* d_histogram_b = cuda::std::to_address(hist_b.data());

std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes{};
Expand Down Expand Up @@ -103,7 +103,7 @@ static void even(nvbench::state& state, nvbench::type_list<SampleT, CounterT, Of
is_byte_sample);

thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());
d_temp_storage = cuda::std::to_address(tmp.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::DispatchEven(
Expand Down
16 changes: 8 additions & 8 deletions cub/benchmarks/bench/histogram/multi/range.cu
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,19 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
thrust::device_vector<SampleT> levels_g = levels_r;
thrust::device_vector<SampleT> levels_b = levels_g;

SampleT* d_levels_r = thrust::raw_pointer_cast(levels_r.data());
SampleT* d_levels_g = thrust::raw_pointer_cast(levels_g.data());
SampleT* d_levels_b = thrust::raw_pointer_cast(levels_b.data());
SampleT* d_levels_r = cuda::std::to_address(levels_r.data());
SampleT* d_levels_g = cuda::std::to_address(levels_g.data());
SampleT* d_levels_b = cuda::std::to_address(levels_b.data());

thrust::device_vector<CounterT> hist_r(num_bins);
thrust::device_vector<CounterT> hist_g(num_bins);
thrust::device_vector<CounterT> hist_b(num_bins);
thrust::device_vector<SampleT> input = generate(elements * num_channels, entropy, lower_level, upper_level);

SampleT* d_input = thrust::raw_pointer_cast(input.data());
CounterT* d_histogram_r = thrust::raw_pointer_cast(hist_r.data());
CounterT* d_histogram_g = thrust::raw_pointer_cast(hist_g.data());
CounterT* d_histogram_b = thrust::raw_pointer_cast(hist_b.data());
SampleT* d_input = cuda::std::to_address(input.data());
CounterT* d_histogram_r = cuda::std::to_address(hist_r.data());
CounterT* d_histogram_g = cuda::std::to_address(hist_g.data());
CounterT* d_histogram_b = cuda::std::to_address(hist_b.data());

std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes{};
Expand Down Expand Up @@ -102,7 +102,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
is_byte_sample);

thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());
d_temp_storage = cuda::std::to_address(tmp.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::DispatchRange(
Expand Down
8 changes: 4 additions & 4 deletions cub/benchmarks/bench/histogram/range.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,13 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O

// TODO Extract sequence to the helper TU
thrust::sequence(levels.begin(), levels.end(), lower_level, step);
SampleT* d_levels = thrust::raw_pointer_cast(levels.data());
SampleT* d_levels = cuda::std::to_address(levels.data());

thrust::device_vector<SampleT> input = generate(elements, entropy, lower_level, upper_level);
thrust::device_vector<CounterT> hist(num_bins);

SampleT* d_input = thrust::raw_pointer_cast(input.data());
CounterT* d_histogram = thrust::raw_pointer_cast(hist.data());
SampleT* d_input = cuda::std::to_address(input.data());
CounterT* d_histogram = cuda::std::to_address(hist.data());

std::uint8_t* d_temp_storage = nullptr;
std::size_t temp_storage_bytes{};
Expand Down Expand Up @@ -91,7 +91,7 @@ static void range(nvbench::state& state, nvbench::type_list<SampleT, CounterT, O
is_byte_sample);

thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());
d_temp_storage = cuda::std::to_address(tmp.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::DispatchRange(
Expand Down
11 changes: 5 additions & 6 deletions cub/benchmarks/bench/merge/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@

#include <cub/device/device_merge.cuh>

#include <thrust/detail/raw_pointer_cast.h>

#include <cuda/std/memory>
#include <cuda/std/utility>

#include <cstdint>
Expand Down Expand Up @@ -47,9 +46,9 @@ void keys(nvbench::state& state, nvbench::type_list<KeyT, OffsetT>)
auto [keys_lhs, keys_rhs] = generate_lhs_rhs<KeyT>(num_items_lhs, num_items_rhs, entropy);

thrust::device_vector<KeyT> keys_out(elements);
KeyT* d_keys_lhs = thrust::raw_pointer_cast(keys_lhs.data());
KeyT* d_keys_rhs = thrust::raw_pointer_cast(keys_rhs.data());
KeyT* d_keys_out = thrust::raw_pointer_cast(keys_out.data());
KeyT* d_keys_lhs = cuda::std::to_address(keys_lhs.data());
KeyT* d_keys_rhs = cuda::std::to_address(keys_rhs.data());
KeyT* d_keys_out = cuda::std::to_address(keys_out.data());

// Enable throughput calculations and add "Size" column to results.
state.add_element_count(elements);
Expand Down Expand Up @@ -80,7 +79,7 @@ void keys(nvbench::state& state, nvbench::type_list<KeyT, OffsetT>)
);

thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());
auto* temp_storage = cuda::std::to_address(temp.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
cub::detail::merge::dispatch(
Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/merge/merge_common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ generate_lhs_rhs(std::size_t num_items_lhs, std::size_t num_items_rhs, bit_entro
counting_it,
counting_it + elements,
rnd_selector_val.begin(),
cuda::make_tabulate_output_iterator(write_pivot_point_t<offset_t>{
static_cast<offset_t>(num_items_lhs), thrust::raw_pointer_cast(pivot_point.data())}),
cuda::make_tabulate_output_iterator(
write_pivot_point_t<offset_t>{static_cast<offset_t>(num_items_lhs), cuda::std::to_address(pivot_point.data())}),
select_lhs_op);

thrust::device_vector<KeyT> keys_lhs(num_items_lhs);
Expand Down
Loading
Loading