Matt711
diff --git a/cpp/benchmarks/io/parquet/experimental/parquet_dictionary_page_filter.cpp b/cpp/benchmarks/io/parquet/experimental/parquet_dictionary_page_filter.cpp
Lines changed: 37 additions & 17 deletions
diff --git a/cpp/examples/hybrid_scan_io/common_utils.cpp b/cpp/examples/hybrid_scan_io/common_utils.cpp
Lines changed: 11 additions & 11 deletions
diff --git a/cpp/examples/hybrid_scan_io/common_utils.hpp b/cpp/examples/hybrid_scan_io/common_utils.hpp
Lines changed: 22 additions & 3 deletions
diff --git a/cpp/examples/hybrid_scan_io/hybrid_scan_io.cpp b/cpp/examples/hybrid_scan_io/hybrid_scan_io.cpp
Lines changed: 10 additions & 6 deletions
diff --git a/cpp/examples/hybrid_scan_io/hybrid_scan_pipeline.cpp b/cpp/examples/hybrid_scan_io/hybrid_scan_pipeline.cpp
Lines changed: 11 additions & 7 deletions
@@ -65,32 +65,51 @@ cudf::host_span<uint8_t const> fetch_page_index_bytes(
 }
 
 /**
- * @brief Fetches a list of byte ranges from a host buffer into a vector of device buffers
+ * @brief Converts a span of device buffers into a vector of corresponding device spans
+ *
+ * @tparam T Element type of the output device spans; must be a single-byte type
+ * @param buffers Host span of device buffers
+ * @return Device spans corresponding to the input device buffers
+ */
+template <typename T>
+std::vector<cudf::device_span<T const>> make_device_spans(
+  cudf::host_span<rmm::device_buffer const> buffers)
+  requires(sizeof(T) == 1)
+{
+  // Each span refers to its buffer's memory, so keep `buffers` alive while the spans are used
+  std::vector<cudf::device_span<T const>> spans;
+  spans.reserve(buffers.size());
+  for (auto const& b : buffers) { spans.emplace_back(static_cast<T const*>(b.data()), b.size()); }
+  return spans;
+}
+
+/**
+ * @brief Fetches a list of byte ranges from a host buffer into device buffers
  *
  * @param host_buffer Host buffer span
  * @param byte_ranges Byte ranges to fetch
  * @param stream CUDA stream
+ * @param mr Device memory resource used to allocate the device buffers
  *
- * @return Vector of device buffers
+ * @return Device buffers
  */
 std::vector<rmm::device_buffer> fetch_byte_ranges(
   cudf::host_span<uint8_t const> host_buffer,
   cudf::host_span<cudf::io::text::byte_range_info const> byte_ranges,
-  rmm::cuda_stream_view stream)
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
 {
-  std::vector<rmm::device_buffer> buffers{};
-  buffers.reserve(byte_ranges.size());
+  std::vector<rmm::device_buffer> buffers(byte_ranges.size());
 
   std::transform(
-    byte_ranges.begin(),
-    byte_ranges.end(),
-    std::back_inserter(buffers),
-    [&](auto const& byte_range) {
+    byte_ranges.begin(), byte_ranges.end(), buffers.begin(), [&](auto const& byte_range) {
       auto const chunk_offset = host_buffer.data() + byte_range.offset();
-      auto const chunk_size   = byte_range.size();
-      auto buffer             = rmm::device_buffer(chunk_size, stream);
-      CUDF_CUDA_TRY(cudaMemcpyAsync(
-        buffer.data(), chunk_offset, chunk_size, cudaMemcpyHostToDevice, stream.value()));
+      auto const chunk_size   = static_cast<size_t>(byte_range.size());
+      auto buffer             = rmm::device_buffer(chunk_size, stream, mr);
+      cudf::detail::cuda_memcpy_async(
+        cudf::device_span<uint8_t>{static_cast<uint8_t*>(buffer.data()), chunk_size},
+        cudf::host_span<uint8_t const>{chunk_offset, chunk_size},
+        stream);
       return buffer;
     });
 
@@ -157,9 +176,10 @@ void BM_parquet_filter_string_row_groups_with_dicts_common(nvbench::state& state
   // If we have dictionary page byte ranges, filter row groups with dictionary pages
   CUDF_EXPECTS(dict_page_byte_ranges.size() > 0, "No dictionary page byte ranges found");
 
-  // Fetch dictionary page buffers from the input file buffer
-  std::vector<rmm::device_buffer> dictionary_page_buffers =
-    fetch_byte_ranges(file_buffer_span, dict_page_byte_ranges, stream);
+  // Fetch dictionary page buffers and corresponding device spans from the input file buffer
+  auto dictionary_page_buffers = fetch_byte_ranges(
+    file_buffer_span, dict_page_byte_ranges, stream, cudf::get_current_device_resource_ref());
+  auto dictionary_page_data = make_device_spans<uint8_t>(dictionary_page_buffers);
 
   auto mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
@@ -168,7 +188,7 @@ void BM_parquet_filter_string_row_groups_with_dicts_common(nvbench::state& state
                try_drop_l3_cache();
                timer.start();
                std::ignore = reader->filter_row_groups_with_dictionary_pages(
-                 dictionary_page_buffers, input_row_group_indices, read_opts, stream);
+                 dictionary_page_data, input_row_group_indices, read_opts, stream);
                timer.stop();
              });
 
 
@@ -136,20 +136,20 @@ std::vector<rmm::device_buffer> fetch_byte_ranges(
   static std::mutex mutex;
 
   std::vector<rmm::device_buffer> buffers(byte_ranges.size());
-
   {
     std::lock_guard<std::mutex> lock(mutex);
 
-    std::for_each(thrust::counting_iterator<size_t>(0),
-                  thrust::counting_iterator(byte_ranges.size()),
-                  [&](auto const idx) {
-                    auto const chunk_offset = host_buffer.data() + byte_ranges[idx].offset();
-                    auto const chunk_size   = byte_ranges[idx].size();
-                    auto buffer             = rmm::device_buffer(chunk_size, stream, mr);
-                    CUDF_CUDA_TRY(cudaMemcpyAsync(
-                      buffer.data(), chunk_offset, chunk_size, cudaMemcpyDefault, stream.value()));
-                    buffers[idx] = std::move(buffer);
-                  });
+    std::transform(
+      byte_ranges.begin(), byte_ranges.end(), buffers.begin(), [&](auto const& byte_range) {
+        auto const chunk_offset = host_buffer.data() + byte_range.offset();
+        auto const chunk_size   = static_cast<size_t>(byte_range.size());
+        auto buffer             = rmm::device_buffer(chunk_size, stream, mr);
+        cudf::detail::cuda_memcpy_async(
+          cudf::device_span<uint8_t>{static_cast<uint8_t*>(buffer.data()), chunk_size},
+          cudf::host_span<uint8_t const>{chunk_offset, chunk_size},
+          stream);
+        return buffer;
+      });
   }
 
   return buffers;
 
@@ -77,14 +77,33 @@ cudf::host_span<uint8_t const> fetch_page_index_bytes(
   cudf::host_span<uint8_t const> buffer, cudf::io::text::byte_range_info const page_index_bytes);
 
 /**
- * @brief Fetches a list of byte ranges from a host buffer into a vector of device buffers
+ * @brief Converts a span of device buffers into a vector of corresponding device spans
+ *
+ * @tparam T Element type of the output device spans; must be a single-byte type
+ * @param buffers Host span of device buffers
+ * @return Device spans corresponding to the input device buffers
+ */
+template <typename T>
+std::vector<cudf::device_span<T const>> make_device_spans(
+  cudf::host_span<rmm::device_buffer const> buffers)
+  requires(sizeof(T) == 1)
+{
+  // Each span refers to its buffer's memory, so keep `buffers` alive while the spans are used
+  std::vector<cudf::device_span<T const>> spans;
+  spans.reserve(buffers.size());
+  for (auto const& b : buffers) { spans.emplace_back(static_cast<T const*>(b.data()), b.size()); }
+  return spans;
+}
+
+/**
+ * @brief Fetches a list of byte ranges from a host buffer into device buffers
  *
  * @param host_buffer Host buffer span
  * @param byte_ranges Byte ranges to fetch
  * @param stream CUDA stream
- * @param mr Device memory resource to create device buffers with
+ * @param mr Device memory resource used to allocate the device buffers
  *
- * @return Vector of device buffers
+ * @return Device buffers
  */
 std::vector<rmm::device_buffer> fetch_byte_ranges(
   cudf::host_span<uint8_t const> host_buffer,
 
@@ -141,11 +141,12 @@ auto hybrid_scan(io_source const& io_source,
       dict_page_byte_ranges.size()) {
     std::cout << "READER: Filter row groups with dictionary pages...\n";
     timer.reset();
-    // Fetch dictionary page buffers from the input file buffer
-    std::vector<rmm::device_buffer> dictionary_page_buffers =
+    // Fetch dictionary page buffers and corresponding device spans from the input file buffer
+    auto dictionary_page_buffers =
       fetch_byte_ranges(file_buffer_span, dict_page_byte_ranges, stream, mr);
+    auto dictionary_page_data = make_device_spans<uint8_t>(dictionary_page_buffers);
     dictionary_page_filtered_row_group_indices = reader->filter_row_groups_with_dictionary_pages(
-      dictionary_page_buffers, current_row_group_indices, options, stream);
+      dictionary_page_data, current_row_group_indices, options, stream);
 
     // Update current row group indices
     current_row_group_indices = dictionary_page_filtered_row_group_indices;
@@ -166,8 +167,9 @@ auto hybrid_scan(io_source const& io_source,
       mr, bloom_filter_alignment);
     std::cout << "READER: Filter row groups with bloom filters...\n";
     timer.reset();
-    std::vector<rmm::device_buffer> bloom_filter_data =
+    auto bloom_filter_buffers =
       fetch_byte_ranges(file_buffer_span, bloom_filter_byte_ranges, stream, aligned_mr);
+    auto bloom_filter_data = make_device_spans<uint8_t>(bloom_filter_buffers);
     // Filter row groups with bloom filters
     bloom_filtered_row_group_indices = reader->filter_row_groups_with_bloom_filters(
       bloom_filter_data, current_row_group_indices, options, stream);
@@ -207,14 +209,15 @@ auto hybrid_scan(io_source const& io_source,
     reader->filter_column_chunks_byte_ranges(current_row_group_indices, options);
   auto filter_column_chunk_buffers =
     fetch_byte_ranges(file_buffer_span, filter_column_chunk_byte_ranges, stream, mr);
+  auto filter_column_chunk_data = make_device_spans<uint8_t>(filter_column_chunk_buffers);
 
   // Materialize the table with only the filter columns
   auto row_mask_mutable_view = row_mask->mutable_view();
   auto filter_table =
     reader
       ->materialize_filter_columns(
         current_row_group_indices,
-        std::move(filter_column_chunk_buffers),
+        filter_column_chunk_data,
         row_mask_mutable_view,
         prune_filter_data_pages ? use_data_page_mask::YES : use_data_page_mask::NO,
         options,
@@ -239,13 +242,14 @@ auto hybrid_scan(io_source const& io_source,
     reader->payload_column_chunks_byte_ranges(current_row_group_indices, options);
   auto payload_column_chunk_buffers =
     fetch_byte_ranges(file_buffer_span, payload_column_chunk_byte_ranges, stream, mr);
+  auto payload_column_chunk_data = make_device_spans<uint8_t>(payload_column_chunk_buffers);
 
   // Materialize the table with only the payload columns
   auto payload_table =
     reader
       ->materialize_payload_columns(
         current_row_group_indices,
-        std::move(payload_column_chunk_buffers),
+        payload_column_chunk_data,
         row_mask->view(),
         prune_payload_data_pages ? use_data_page_mask::YES : use_data_page_mask::NO,
         options,
 
@@ -96,11 +96,10 @@ struct hybrid_scan_fn {
       reader->all_column_chunks_byte_ranges(row_groups_indices, options);
     auto all_column_chunk_buffers =
       fetch_byte_ranges(file_buffer_span, all_column_chunk_byte_ranges, stream, mr);
-    table.get() =
-      std::move(reader
-                  ->materialize_all_columns(
-                    row_groups_indices, std::move(all_column_chunk_buffers), options, stream)
-                  .tbl);
+    auto all_column_chunk_data = make_device_spans<uint8_t>(all_column_chunk_buffers);
+    table.get()                = std::move(
+      reader->materialize_all_columns(row_groups_indices, all_column_chunk_data, options, stream)
+        .tbl);
     stream.synchronize_no_throw();
   }
 };
@@ -163,10 +162,14 @@ auto hybrid_scan_pipelined(io_source const& io_source,
 
   timer.print_elapsed_millis();
 
-  std::cout << "Creating row group partitions... \n";
-  timer.reset();
+  if (num_partitions > 1) {
+    std::cout << "Creating row group partitions... \n";
+    timer.reset();
+  }
 
   if (num_partitions == 1) {
+    std::cout << "Reading as single partition... \n";
+    timer.reset();
     hybrid_scan_fn{.table              = std::ref(tables.front()),
                    .reader             = std::move(readers.front()),
                    .file_buffer_span   = file_buffer_span,
@@ -175,6 +178,7 @@ auto hybrid_scan_pipelined(io_source const& io_source,
                    .options            = options,
                    .stream             = stream_pool.get_stream(),
                    .mr                 = mr}();
+    timer.print_elapsed_millis();
     return std::move(tables.front());
   }