-
Notifications
You must be signed in to change notification settings - Fork 28
Adding chunked_pack to pack table chunks when device memory is insufficient #726
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 8 commits
1d28b4e
f6224a8
eb8f742
3f67fce
241d697
d12c5eb
2a08a1d
88fa3e4
bf5088d
027875e
856f515
5f984f6
78ed513
34c96cc
41760de
a897597
d35feb5
8aa081d
e1fcf23
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -267,4 +267,54 @@ std::vector<PackedData> unspill_partitions( | |
| statistics->add_bytes_stat("spill-bytes-host-to-device", non_device_size); | ||
| return ret; | ||
| } | ||
|
|
||
| PackedData chunked_pack( | ||
| cudf::table_view const& table, Buffer& bounce_buf, MemoryReservation& data_res | ||
| ) { | ||
| RAPIDSMPF_EXPECTS( | ||
| bounce_buf.mem_type() == MemoryType::DEVICE, | ||
| "bounce buffer must be in device memory", | ||
| std::invalid_argument | ||
| ); | ||
| // all copies will be done on the bounce buffer's stream | ||
| auto const& stream = bounce_buf.stream(); | ||
nirandaperera marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| auto* br = data_res.br(); | ||
| size_t chunk_size = bounce_buf.size; | ||
|
|
||
| cudf::chunked_pack packer(table, chunk_size, stream, br->device_mr()); | ||
| auto const packed_size = packer.get_total_contiguous_size(); | ||
|
|
||
| // if the packed size > data reservation, and it is within the wiggle room, pad the | ||
| // data reservation to the packed size from the same memory type. | ||
| if (packed_size > data_res.size()) { | ||
| if (packed_size <= data_res.size() + total_packing_wiggle_room(table)) { | ||
| data_res = | ||
| data_res.br()->reserve(data_res.mem_type(), packed_size, true).first; | ||
| } | ||
| } | ||
|
|
||
| // allocate the data buffer | ||
| auto data_buf = br->allocate(packed_size, stream, data_res); | ||
nirandaperera marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| bounce_buf.write_access([&](std::byte* bounce_buf_ptr, rmm::cuda_stream_view) { | ||
| // all copies are done on the same stream, so we can omit the stream parameter | ||
| cudf::device_span<uint8_t> buf_span( | ||
| reinterpret_cast<uint8_t*>(bounce_buf_ptr), chunk_size | ||
|
Comment on lines
+320
to
+322
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let us use [inline code lost in extraction]. Also, the comment is meaningless,
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The comment refers to the |
||
| ); | ||
|
|
||
| data_buf->write_access([&](std::byte* data_ptr, rmm::cuda_stream_view) { | ||
| size_t offset = 0; | ||
| while (packer.has_next()) { | ||
| size_t n_bytes = packer.next(buf_span); | ||
| RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( | ||
| data_ptr + offset, buf_span.data(), n_bytes, cudaMemcpyDefault, stream | ||
| )); | ||
| offset += n_bytes; | ||
| } | ||
| }); | ||
| }); | ||
|
|
||
| return {packer.build_metadata(), std::move(data_buf)}; | ||
| } | ||
|
|
||
| } // namespace rapidsmpf | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,7 @@ | |
|
|
||
| #include <memory> | ||
|
|
||
| #include <rapidsmpf/integrations/cudf/partition.hpp> | ||
| #include <rapidsmpf/integrations/cudf/utils.hpp> | ||
| #include <rapidsmpf/memory/buffer.hpp> | ||
| #include <rapidsmpf/streaming/cudf/table_chunk.hpp> | ||
|
|
@@ -169,32 +170,57 @@ TableChunk TableChunk::copy(MemoryReservation& reservation) const { | |
| // serialize `table_view()` into a packed_columns and then we move | ||
| // the packed_columns' gpu_data to a new host buffer. | ||
|
|
||
| // TODO: use `cudf::chunked_pack()` with a bounce buffer. Currently, | ||
| // `cudf::pack()` allocates device memory we haven't reserved. | ||
| auto packed_columns = | ||
| cudf::pack(table_view(), stream(), br->device_mr()); | ||
| packed_data = std::make_unique<PackedData>( | ||
| std::move(packed_columns.metadata), | ||
| br->move(std::move(packed_columns.gpu_data), stream()) | ||
| // make a reservation for packing | ||
| auto [pack_res, overbooking] = br->reserve( | ||
| MemoryType::DEVICE, | ||
| estimated_memory_usage(table_view(), stream()), | ||
| true | ||
| ); | ||
|
|
||
| // Handle the case where `cudf::pack` allocates slightly more than the | ||
| // input size. This can occur because cudf uses aligned allocations, | ||
| // which may exceed the requested size. To accommodate this, we | ||
| // allow some wiggle room. | ||
| if (packed_data->data->size > reservation.size()) { | ||
| auto const wiggle_room = | ||
| 1024 * static_cast<std::size_t>(table_view().num_columns()); | ||
| if (packed_data->data->size <= reservation.size() + wiggle_room) { | ||
| reservation = | ||
| br->reserve( | ||
| MemoryType::HOST, packed_data->data->size, true | ||
| ) | ||
| .first; | ||
| if (overbooking > 0) { | ||
| // there is not enough memory to pack the table. | ||
| size_t avail_dev_mem = pack_res.size() - overbooking; | ||
|
||
| RAPIDSMPF_EXPECTS( | ||
| avail_dev_mem > 1 << 20, | ||
| "not enough device memory for the bounce buffer", | ||
| std::runtime_error | ||
| ); | ||
nirandaperera marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| auto bounce_buf = br->allocate(avail_dev_mem, stream(), pack_res); | ||
|
|
||
| packed_data = std::make_unique<PackedData>( | ||
| chunked_pack(table_view(), *bounce_buf, reservation) | ||
| ); | ||
| } else { | ||
| // if there is enough memory to pack the table, use `cudf::pack` | ||
| auto packed_columns = | ||
| cudf::pack(table_view(), stream(), br->device_mr()); | ||
| // clear the reservation as we are done with it. | ||
| pack_res.clear(); | ||
| packed_data = std::make_unique<PackedData>( | ||
| std::move(packed_columns.metadata), | ||
| br->move(std::move(packed_columns.gpu_data), stream()) | ||
| ); | ||
|
|
||
| // Handle the case where `cudf::pack` allocates slightly more than | ||
| // the input size. This can occur because cudf uses aligned | ||
| // allocations, which may exceed the requested size. To | ||
| // accommodate this, we allow some wiggle room. | ||
| if (packed_data->data->size > reservation.size()) { | ||
| if (packed_data->data->size | ||
| <= reservation.size() | ||
| + total_packing_wiggle_room(table_view())) | ||
| { | ||
| reservation = | ||
| br->reserve( | ||
| MemoryType::HOST, packed_data->data->size, true | ||
| ) | ||
| .first; | ||
| } | ||
|
||
| } | ||
| // finally copy the packed data device buffer to HOST memory | ||
| packed_data->data = | ||
| br->move(std::move(packed_data->data), reservation); | ||
| } | ||
| packed_data->data = | ||
| br->move(std::move(packed_data->data), reservation); | ||
| } | ||
| return TableChunk(std::move(packed_data)); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
note: This is likely not enough if the table has many nested columns.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure. I was just mimicking the current impl we have here
https://github.com/rapidsai/rapidsmpf/blob/main/cpp/src/streaming/cudf/table_chunk.cpp#L187-L189