From 558386b377ce550184c3c1da55a90b90913f47b2 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <mhaseeb@nvidia.com>
Date: Tue, 25 Nov 2025 19:02:48 +0000
Subject: [PATCH 1/8] Filter row groups with byte range in hybrid scan reader

---
 .../cudf/io/experimental/hybrid_scan.hpp      | 35 +++++++++--
 .../io/parquet/experimental/hybrid_scan.cpp   | 16 +++++
 .../experimental/hybrid_scan_helpers.cpp      |  9 +++
 .../experimental/hybrid_scan_helpers.hpp      | 18 ++++++
 .../parquet/experimental/hybrid_scan_impl.cpp | 17 ++++++
 .../parquet/experimental/hybrid_scan_impl.hpp |  8 +++
 cpp/src/io/parquet/reader_impl_helpers.cpp    |  5 +-
 cpp/src/io/parquet/reader_impl_helpers.hpp    |  3 +-
 .../experimental/hybrid_scan_filters_test.cpp | 61 +++++++++++++++++++
 9 files changed, 163 insertions(+), 9 deletions(-)

diff --git a/cpp/include/cudf/io/experimental/hybrid_scan.hpp b/cpp/include/cudf/io/experimental/hybrid_scan.hpp
index db65a0dd06f..78634116fd0 100644
--- a/cpp/include/cudf/io/experimental/hybrid_scan.hpp
+++ b/cpp/include/cudf/io/experimental/hybrid_scan.hpp
@@ -113,11 +113,11 @@ enum class use_data_page_mask : bool {
  * @endcode
  *
  * Row group pruning (OPTIONAL): Start with either a list of custom or all row group indices in the
- * parquet file and optionally filter it subject to filter expression using column chunk statistics,
- * dictionaries and bloom filters. Byte ranges for column chunk dictionary pages and bloom filters
- * within parquet file may be obtained via `secondary_filters_byte_ranges()` function. The byte
- * ranges may be read into a corresponding vector of device buffers and passed to the corresponding
- * row group filtration function.
+ * parquet file and optionally filter it using a byte range and/or the filter expression using
+ * column chunk statistics, dictionaries and bloom filters. Byte ranges for column chunk dictionary
+ * pages and bloom filters within parquet file may be obtained via `secondary_filters_byte_ranges()`
+ * function. The byte ranges may be read into a corresponding vector of device buffers and passed to
+ * the corresponding row group filtration function.
  * @code{.cpp}
  * // Start with a list of all parquet row group indices from the file footer
  * auto all_row_group_indices = reader->all_row_groups(options);
@@ -125,6 +125,13 @@ enum class use_data_page_mask : bool {
  * // Span to track the indices of row groups currently at hand
  * auto current_row_group_indices = cudf::host_span<size_type>(all_row_group_indices);
  *
+ * // Optional: Prune row group indices to the ones that start within the byte range
+ * auto byte_range_filtered_row_group_indices = reader->filter_row_groups_with_byte_range(
+ *   current_row_group_indices, bytes_to_skip, bytes_to_read);
+ *
+ * // Update current row group indices to byte range filtered row group indices
+ * current_row_group_indices = byte_range_filtered_row_group_indices;
+ *
  * // Optional: Prune row group indices subject to filter expression using row group statistics
  * auto stats_filtered_row_group_indices =
  *   reader->filter_row_groups_with_stats(current_row_group_indices, options, stream);
@@ -335,6 +342,24 @@ class hybrid_scan_reader {
   [[nodiscard]] size_type total_rows_in_row_groups(
     cudf::host_span<size_type const> row_group_indices) const;
 
+  /**
+   * @brief Filter the row groups using the byte range specified by [`bytes_to_skip`,
+   * `bytes_to_skip + bytes_to_read`)
+   *
+   * Filters the row groups such that only the row groups that start within the byte range are
+   * selected. Note that the last selected row group may end beyond the byte range.
+   *
+   * @param row_group_indices Input row groups indices
+   * @param bytes_to_skip Bytes to skip before selecting row groups
+   * @param bytes_to_read Optional bytes to select row groups from after skipping. All row groups
+   * until the end of the file are selected if not provided
+   * @return Filtered row group indices
+   */
+  [[nodiscard]] std::vector<size_type> filter_row_groups_with_byte_range(
+    cudf::host_span<size_type const> row_group_indices,
+    size_t bytes_to_skip,
+    std::optional<size_t> bytes_to_read) const;
+
   /**
    * @brief Filter the input row groups using column chunk statistics
    *
diff --git a/cpp/src/io/parquet/experimental/hybrid_scan.cpp b/cpp/src/io/parquet/experimental/hybrid_scan.cpp
index 49a1e6d1f30..41bbfe7a0d8 100644
--- a/cpp/src/io/parquet/experimental/hybrid_scan.cpp
+++ b/cpp/src/io/parquet/experimental/hybrid_scan.cpp
@@ -74,6 +74,22 @@ size_type hybrid_scan_reader::total_rows_in_row_groups(
   return _impl->total_rows_in_row_groups(input_row_group_indices);
 }
 
+std::vector<cudf::size_type> hybrid_scan_reader::filter_row_groups_with_byte_range(
+  cudf::host_span<size_type const> row_group_indices,
+  size_t bytes_to_skip,
+  std::optional<size_t> bytes_to_read) const
+{
+  CUDF_FUNC_RANGE();
+
+  // Temporary vector with row group indices from the first source
+  auto const input_row_group_indices =
+    std::vector<std::vector<size_type>>{{row_group_indices.begin(), row_group_indices.end()}};
+
+  return _impl
+    ->filter_row_groups_with_byte_range(input_row_group_indices, bytes_to_skip, bytes_to_read)
+    .front();
+}
+
 std::vector<cudf::size_type> hybrid_scan_reader::filter_row_groups_with_stats(
   cudf::host_span<size_type const> row_group_indices,
   parquet_reader_options const& options,
diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp
index 3818e5ed36f..5370c2a36ff 100644
--- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp
+++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp
@@ -279,6 +279,15 @@ aggregate_reader_metadata::select_payload_columns(
                         timestamp_type_id);
 }
 
+std::vector<std::vector<cudf::size_type>>
+aggregate_reader_metadata::filter_row_groups_with_byte_range(
+  cudf::host_span<std::vector<size_type> const> row_group_indices,
+  size_t bytes_to_skip,
+  std::optional<size_t> const& bytes_to_read) const
+{
+  return apply_byte_bounds_filter(row_group_indices, bytes_to_skip, bytes_to_read);
+}
+
 std::vector<std::vector<cudf::size_type>> aggregate_reader_metadata::filter_row_groups_with_stats(
   host_span<std::vector<cudf::size_type> const> row_group_indices,
   host_span<data_type const> output_dtypes,
diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp
index bce18919003..7113d0a0be4 100644
--- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp
+++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp
@@ -146,6 +146,24 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base {
                            bool ignore_missing_columns,
                            type_id timestamp_type_id);
 
+  /**
+   * @brief Filters row groups such that only the row groups that start within the byte range
+   * specified by [`bytes_to_skip`, `bytes_to_skip + bytes_to_read`) are selected
+   *
+   * @note The last selected row group may end beyond the byte range.
+   *
+   * @param row_group_indices Input row groups indices
+   * @param bytes_to_skip Bytes to skip before selecting row groups
+   * @param bytes_to_read Optional bytes to select row groups from after skipping. All row groups
+   * until the end of the file are selected if not provided
+   *
+   * @return Filtered row group indices
+   */
+  [[nodiscard]] std::vector<std::vector<cudf::size_type>> filter_row_groups_with_byte_range(
+    cudf::host_span<std::vector<size_type> const> row_group_indices,
+    size_t bytes_to_skip,
+    std::optional<size_t> const& bytes_to_read) const;
+
   /**
    * @brief Filter the row groups with statistics based on predicate filter
    *
diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp
index 82f7d619a2b..823763a8777 100644
--- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp
+++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp
@@ -168,6 +168,23 @@ size_type hybrid_scan_reader_impl::total_rows_in_row_groups(
   return _extended_metadata->total_rows_in_row_groups(row_group_indices);
 }
 
+std::vector<std::vector<cudf::size_type>>
+hybrid_scan_reader_impl::filter_row_groups_with_byte_range(
+  cudf::host_span<std::vector<size_type> const> row_group_indices,
+  size_t bytes_to_skip,
+  std::optional<size_t> const& bytes_to_read) const
+{
+  CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered");
+
+  if (bytes_to_skip == 0 and not bytes_to_read.has_value()) {
+    return std::vector<std::vector<cudf::size_type>>{row_group_indices.begin(),
+                                                     row_group_indices.end()};
+  }
+
+  return _extended_metadata->filter_row_groups_with_byte_range(
+    row_group_indices, bytes_to_skip, bytes_to_read);
+}
+
 std::vector<std::vector<size_type>> hybrid_scan_reader_impl::filter_row_groups_with_stats(
   cudf::host_span<std::vector<size_type> const> row_group_indices,
   parquet_reader_options const& options,
diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp
index 219428df37a..9c4662629ca 100644
--- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp
+++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp
@@ -85,6 +85,14 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl {
   [[nodiscard]] size_type total_rows_in_row_groups(
     cudf::host_span<std::vector<size_type> const> row_group_indices) const;
 
+  /**
+   * @copydoc cudf::io::experimental::hybrid_scan::filter_row_groups_with_byte_range
+   */
+  [[nodiscard]] std::vector<std::vector<cudf::size_type>> filter_row_groups_with_byte_range(
+    cudf::host_span<std::vector<size_type> const> row_group_indices,
+    size_t bytes_to_skip,
+    std::optional<size_t> const& bytes_to_read) const;
+
   /**
    * @copydoc cudf::io::experimental::hybrid_scan::filter_row_groups_with_stats
    */
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index fe9d33678c3..6b34b33c509 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -1431,9 +1431,8 @@ aggregate_reader_metadata::select_row_groups(
   }
 
   // Flag to check if the row groups will be filtered using byte bounds
-  bool const is_byte_bounded_row_groups = row_group_indices.empty() and
-
-                                          (skip_bytes_opt > 0 or byte_count_opt.has_value());
+  bool const is_byte_bounded_row_groups =
+    row_group_indices.empty() and (skip_bytes_opt > 0 or byte_count_opt.has_value());
 
   // We can't filter with both row bounds and byte bounds
   CUDF_EXPECTS(not(is_row_bounded_row_groups and is_byte_bounded_row_groups),
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index 4b431ce45bd..cbf7069f622 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -267,7 +267,8 @@ class aggregate_reader_metadata {
    *
    * @param input_row_group_indices Lists of input row groups, one per source
    * @param bytes_to_skip Bytes to skip before selecting row groups
-   * @param bytes_to_read Bytes to select row groups after skipping
+   * @param bytes_to_read Optional bytes to select row groups from after skipping. All row groups
+   * until the end of the file are selected if not provided
    *
    * @return A vector of surviving row group indices
    */
diff --git a/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp b/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp
index f2235e96d32..749a917aedd 100644
--- a/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp
+++ b/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp
@@ -18,6 +18,9 @@
 
 #include <src/io/parquet/parquet_gpu.hpp>
 
+#include <filesystem>
+#include <fstream>
+
 namespace {
 
 /**
@@ -225,6 +228,64 @@ TEST_F(HybridScanFiltersTest, TestExternalMetadata)
   EXPECT_EQ(reader->total_rows_in_row_groups(input_row_group_indices), 2 * rows_per_row_group);
 }
 
+TEST_F(HybridScanFiltersTest, FilterRowGroupsWithByteRanges)
+{
+  using T                      = cudf::string_view;
+  auto const [table, filepath] = create_parquet_typed_with_stats<T>("ByteBounds.parquet");
+
+  auto const file_size = std::filesystem::file_size(filepath);
+  std::vector<char> file_buffer(file_size);
+  std::ifstream file{filepath, std::ifstream::binary};
+  file.read(file_buffer.data(), file_size);
+  file.close();
+
+  // Input file buffer span
+  auto const file_buffer_span = cudf::host_span<uint8_t const>(
+    reinterpret_cast<uint8_t const*>(file_buffer.data()), file_buffer.size());
+
+  // Fetch footer bytes from the buffer.
+  auto const footer_buffer = fetch_footer_bytes(file_buffer_span);
+
+  // Create hybrid scan reader with footer bytes
+  auto const options = cudf::io::parquet_reader_options::builder().build();
+  auto const reader =
+    std::make_unique<cudf::io::parquet::experimental::hybrid_scan_reader>(footer_buffer, options);
+
+  auto const input_row_group_indices = reader->all_row_groups(options);
+
+  // @note: In the above parquet file, the row groups start at the following byte offsets: 4, 75224,
+  // 150332, 225561. The `skip_bytes` and `num_bytes` have been chosen to have enough cushion but
+  // may need to be adjusted in the future if this test suddenly starts failing.
+
+  {
+    // Start with all row groups and only read row group 0 as only it will start in [0, 1000) byte
+    // range
+    auto constexpr num_bytes = 1000;
+    auto const filtered_row_group_indices =
+      reader->filter_row_groups_with_byte_range(input_row_group_indices, 0, num_bytes);
+    auto const expected_row_group_indices = std::vector<cudf::size_type>{0};
+    EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices);
+  }
+
+  {
+    // Start with all row groups and skip row group 0 as it won't start in [1000, inf) byte range
+    auto skip_bytes = 1000;
+    auto filtered_row_group_indices =
+      reader->filter_row_groups_with_byte_range(input_row_group_indices, skip_bytes, {});
+    auto expected_row_group_indices = std::vector<cudf::size_type>{1, 2, 3};
+    EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices);
+
+    // Now start with filtered row groups and only read row group 1 as only it starts in [50000,
+    // 100000) byte range
+    skip_bytes               = 50000;
+    auto constexpr num_bytes = 50000;
+    filtered_row_group_indices =
+      reader->filter_row_groups_with_byte_range(filtered_row_group_indices, skip_bytes, num_bytes);
+    expected_row_group_indices = std::vector<cudf::size_type>{1};
+    EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices);
+  }
+}
+
 TEST_F(HybridScanFiltersTest, FilterRowGroupsWithStats)
 {
   srand(0xc001);

From c8b73a2d91d2f0d4022ead9269f9d58a68592036 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <mhaseeb@nvidia.com>
Date: Tue, 25 Nov 2025 19:16:03 +0000
Subject: [PATCH 2/8] Update API

---
 cpp/include/cudf/io/experimental/hybrid_scan.hpp    |  7 ++-----
 cpp/src/io/functions.cpp                            |  4 ++--
 cpp/src/io/parquet/experimental/hybrid_scan.cpp     |  8 ++------
 .../io/parquet/experimental/hybrid_scan_impl.cpp    |  7 +++----
 .../io/parquet/experimental/hybrid_scan_impl.hpp    |  3 +--
 .../io/experimental/hybrid_scan_filters_test.cpp    | 13 +++++++++----
 6 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/cpp/include/cudf/io/experimental/hybrid_scan.hpp b/cpp/include/cudf/io/experimental/hybrid_scan.hpp
index 78634116fd0..877d4a97f7d 100644
--- a/cpp/include/cudf/io/experimental/hybrid_scan.hpp
+++ b/cpp/include/cudf/io/experimental/hybrid_scan.hpp
@@ -350,15 +350,12 @@ class hybrid_scan_reader {
    * selected. Note that the last selected row group may end beyond the byte range.
    *
    * @param row_group_indices Input row groups indices
-   * @param bytes_to_skip Bytes to skip before selecting row groups
-   * @param bytes_to_read Optional bytes to select row groups from after skipping. All row groups
-   * until the end of the file are selected if not provided
+   * @param options Parquet reader options
    * @return Filtered row group indices
    */
   [[nodiscard]] std::vector<size_type> filter_row_groups_with_byte_range(
     cudf::host_span<size_type const> row_group_indices,
-    size_t bytes_to_skip,
-    std::optional<size_t> bytes_to_read) const;
+    parquet_reader_options const& options) const;
 
   /**
    * @brief Filter the input row groups using column chunk statistics
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 55aa7a9ba24..c15a9f7bc58 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -821,7 +821,7 @@ void parquet_reader_options::set_num_rows(int64_t val)
 
 void parquet_reader_options::set_skip_bytes(size_t val)
 {
-  CUDF_EXPECTS(val == 0 or std::cmp_equal(_source.num_sources(), 1),
+  CUDF_EXPECTS(val == 0 or std::cmp_less_equal(_source.num_sources(), 1),
                "skip_bytes can only be set for single parquet source case");
   CUDF_EXPECTS(val == 0 or (not _num_rows.has_value() and _skip_rows == 0),
                "skip_bytes cannot be set along with skip_rows and num_rows");
@@ -833,7 +833,7 @@ void parquet_reader_options::set_skip_bytes(size_t val)
 
 void parquet_reader_options::set_num_bytes(size_t val)
 {
-  CUDF_EXPECTS(std::cmp_equal(_source.num_sources(), 1),
+  CUDF_EXPECTS(std::cmp_less_equal(_source.num_sources(), 1),
                "num_bytes can only be set for single parquet source case");
   CUDF_EXPECTS(not _num_rows.has_value() and _skip_rows == 0,
                "num_bytes cannot be set along with skip_rows and num_rows");
diff --git a/cpp/src/io/parquet/experimental/hybrid_scan.cpp b/cpp/src/io/parquet/experimental/hybrid_scan.cpp
index 41bbfe7a0d8..4e0f73f62ad 100644
--- a/cpp/src/io/parquet/experimental/hybrid_scan.cpp
+++ b/cpp/src/io/parquet/experimental/hybrid_scan.cpp
@@ -75,9 +75,7 @@ size_type hybrid_scan_reader::total_rows_in_row_groups(
 }
 
 std::vector<cudf::size_type> hybrid_scan_reader::filter_row_groups_with_byte_range(
-  cudf::host_span<size_type const> row_group_indices,
-  size_t bytes_to_skip,
-  std::optional<size_t> bytes_to_read) const
+  cudf::host_span<size_type const> row_group_indices, parquet_reader_options const& options) const
 {
   CUDF_FUNC_RANGE();
 
@@ -85,9 +83,7 @@ std::vector<cudf::size_type> hybrid_scan_reader::filter_row_groups_with_byte_ran
   auto const input_row_group_indices =
     std::vector<std::vector<size_type>>{{row_group_indices.begin(), row_group_indices.end()}};
 
-  return _impl
-    ->filter_row_groups_with_byte_range(input_row_group_indices, bytes_to_skip, bytes_to_read)
-    .front();
+  return _impl->filter_row_groups_with_byte_range(input_row_group_indices, options).front();
 }
 
 std::vector<cudf::size_type> hybrid_scan_reader::filter_row_groups_with_stats(
diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp
index 823763a8777..93e60cdb9e7 100644
--- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp
+++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp
@@ -171,18 +171,17 @@ size_type hybrid_scan_reader_impl::total_rows_in_row_groups(
 std::vector<std::vector<cudf::size_type>>
 hybrid_scan_reader_impl::filter_row_groups_with_byte_range(
   cudf::host_span<std::vector<size_type> const> row_group_indices,
-  size_t bytes_to_skip,
-  std::optional<size_t> const& bytes_to_read) const
+  parquet_reader_options const& options) const
 {
   CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered");
 
-  if (bytes_to_skip == 0 and not bytes_to_read.has_value()) {
+  if (options.get_skip_bytes() == 0 and not options.get_num_bytes().has_value()) {
     return std::vector<std::vector<cudf::size_type>>{row_group_indices.begin(),
                                                      row_group_indices.end()};
   }
 
   return _extended_metadata->filter_row_groups_with_byte_range(
-    row_group_indices, bytes_to_skip, bytes_to_read);
+    row_group_indices, options.get_skip_bytes(), options.get_num_bytes());
 }
 
 std::vector<std::vector<size_type>> hybrid_scan_reader_impl::filter_row_groups_with_stats(
diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp
index 9c4662629ca..85ebb5c89fb 100644
--- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp
+++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp
@@ -90,8 +90,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl {
    */
   [[nodiscard]] std::vector<std::vector<cudf::size_type>> filter_row_groups_with_byte_range(
     cudf::host_span<std::vector<size_type> const> row_group_indices,
-    size_t bytes_to_skip,
-    std::optional<size_t> const& bytes_to_read) const;
+    parquet_reader_options const& options) const;
 
   /**
    * @copydoc cudf::io::experimental::hybrid_scan::filter_row_groups_with_stats
diff --git a/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp b/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp
index 749a917aedd..0fb886efa8f 100644
--- a/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp
+++ b/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp
@@ -247,7 +247,7 @@ TEST_F(HybridScanFiltersTest, FilterRowGroupsWithByteRanges)
   auto const footer_buffer = fetch_footer_bytes(file_buffer_span);
 
   // Create hybrid scan reader with footer bytes
-  auto const options = cudf::io::parquet_reader_options::builder().build();
+  auto options = cudf::io::parquet_reader_options::builder().build();
   auto const reader =
     std::make_unique<cudf::io::parquet::experimental::hybrid_scan_reader>(footer_buffer, options);
 
@@ -261,8 +261,9 @@ TEST_F(HybridScanFiltersTest, FilterRowGroupsWithByteRanges)
     // Start with all row groups and only read row group 0 as only it will start in [0, 1000) byte
     // range
     auto constexpr num_bytes = 1000;
+    options.set_num_bytes(num_bytes);
     auto const filtered_row_group_indices =
-      reader->filter_row_groups_with_byte_range(input_row_group_indices, 0, num_bytes);
+      reader->filter_row_groups_with_byte_range(input_row_group_indices, options);
     auto const expected_row_group_indices = std::vector<cudf::size_type>{0};
     EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices);
   }
@@ -270,8 +271,10 @@ TEST_F(HybridScanFiltersTest, FilterRowGroupsWithByteRanges)
   {
     // Start with all row groups and skip row group 0 as it won't start in [1000, inf) byte range
     auto skip_bytes = 1000;
+    options.set_skip_bytes(skip_bytes);
+    options.set_num_bytes(std::numeric_limits<size_t>::max());
     auto filtered_row_group_indices =
-      reader->filter_row_groups_with_byte_range(input_row_group_indices, skip_bytes, {});
+      reader->filter_row_groups_with_byte_range(input_row_group_indices, options);
     auto expected_row_group_indices = std::vector<cudf::size_type>{1, 2, 3};
     EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices);
 
@@ -279,8 +282,10 @@ TEST_F(HybridScanFiltersTest, FilterRowGroupsWithByteRanges)
     // 100000) byte range
     skip_bytes               = 50000;
     auto constexpr num_bytes = 50000;
+    options.set_skip_bytes(skip_bytes);
+    options.set_num_bytes(num_bytes);
     filtered_row_group_indices =
-      reader->filter_row_groups_with_byte_range(filtered_row_group_indices, skip_bytes, num_bytes);
+      reader->filter_row_groups_with_byte_range(filtered_row_group_indices, options);
     expected_row_group_indices = std::vector<cudf::size_type>{1};
     EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices);
   }

From bb58fc47d0c4f968ae5911b11a576a9eb70094e9 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <mhaseeb@nvidia.com>
Date: Tue, 25 Nov 2025 19:16:39 +0000
Subject: [PATCH 3/8] Update doc

---
 cpp/include/cudf/io/experimental/hybrid_scan.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/io/experimental/hybrid_scan.hpp b/cpp/include/cudf/io/experimental/hybrid_scan.hpp
index 877d4a97f7d..96d913bfaea 100644
--- a/cpp/include/cudf/io/experimental/hybrid_scan.hpp
+++ b/cpp/include/cudf/io/experimental/hybrid_scan.hpp
@@ -127,7 +127,7 @@ enum class use_data_page_mask : bool {
  *
  * // Optional: Prune row group indices to the ones that start within the byte range
  * auto byte_range_filtered_row_group_indices = reader->filter_row_groups_with_byte_range(
- *   current_row_group_indices, bytes_to_skip, bytes_to_read);
+ *   current_row_group_indices, options);
  *
  * // Update current row group indices to byte range filtered row group indices
  * current_row_group_indices = byte_range_filtered_row_group_indices;

From cafde09ea16927a6cad3d38c0966d9dbe7b5cdab Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 25 Nov 2025 13:24:54 -0800
Subject: [PATCH 4/8] Apply suggestions from code review

Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com>
---
 cpp/src/io/parquet/experimental/hybrid_scan.cpp         | 2 +-
 cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp | 4 ++--
 cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/parquet/experimental/hybrid_scan.cpp b/cpp/src/io/parquet/experimental/hybrid_scan.cpp
index 4e0f73f62ad..4676ed1f43d 100644
--- a/cpp/src/io/parquet/experimental/hybrid_scan.cpp
+++ b/cpp/src/io/parquet/experimental/hybrid_scan.cpp
@@ -74,7 +74,7 @@ size_type hybrid_scan_reader::total_rows_in_row_groups(
   return _impl->total_rows_in_row_groups(input_row_group_indices);
 }
 
-std::vector<cudf::size_type> hybrid_scan_reader::filter_row_groups_with_byte_range(
+std::vector<size_type> hybrid_scan_reader::filter_row_groups_with_byte_range(
   cudf::host_span<size_type const> row_group_indices, parquet_reader_options const& options) const
 {
   CUDF_FUNC_RANGE();
diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp
index 5370c2a36ff..5642852267b 100644
--- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp
+++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp
@@ -282,8 +282,8 @@ aggregate_reader_metadata::select_payload_columns(
 std::vector<std::vector<cudf::size_type>>
 aggregate_reader_metadata::filter_row_groups_with_byte_range(
   cudf::host_span<std::vector<size_type> const> row_group_indices,
-  size_t bytes_to_skip,
-  std::optional<size_t> const& bytes_to_read) const
+  std::size_t bytes_to_skip,
+  std::optional<std::size_t> const& bytes_to_read) const
 {
   return apply_byte_bounds_filter(row_group_indices, bytes_to_skip, bytes_to_read);
 }
diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp
index 7113d0a0be4..df4b1175594 100644
--- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp
+++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp
@@ -161,8 +161,8 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base {
    */
   [[nodiscard]] std::vector<std::vector<cudf::size_type>> filter_row_groups_with_byte_range(
     cudf::host_span<std::vector<size_type> const> row_group_indices,
-    size_t bytes_to_skip,
-    std::optional<size_t> const& bytes_to_read) const;
+    std::size_t bytes_to_skip,
+    std::optional<std::size_t> const& bytes_to_read) const;
 
   /**
    * @brief Filter the row groups with statistics based on predicate filter

From 103d53e7d7e8bd1b5705a2d95b244f42ea7cc25c Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 25 Nov 2025 13:25:42 -0800
Subject: [PATCH 5/8] Apply suggestions from code review

Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com>
---
 cpp/src/io/parquet/reader_impl_helpers.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index cbf7069f622..2357836a6db 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -268,7 +268,7 @@ class aggregate_reader_metadata {
    * @param input_row_group_indices Lists of input row groups, one per source
    * @param bytes_to_skip Bytes to skip before selecting row groups
    * @param bytes_to_read Optional bytes to select row groups from after skipping. All row groups
-   * until the end of the file are selected if not provided
+   * until the end of the file are selected if not provided.
    *
    * @return A vector of surviving row group indices
    */

From 8ba39e5f83841ac8e8ee6ae1078d62d5d5b4b231 Mon Sep 17 00:00:00 2001
From: anon <users.noreply.github.com>
Date: Tue, 2 Dec 2025 17:11:46 +0000
Subject: [PATCH 6/8] Add comments

---
 cpp/src/io/functions.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index c15a9f7bc58..dacacbd5286 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -821,7 +821,8 @@ void parquet_reader_options::set_num_rows(int64_t val)
 
 void parquet_reader_options::set_skip_bytes(size_t val)
 {
-  CUDF_EXPECTS(val == 0 or std::cmp_less_equal(_source.num_sources(), 1),
+  // Hybrid scan reader does not contain a source so relaxing this check to zero or one source
+  CUDF_EXPECTS(val == 0 or _source.num_sources() == 1 or _source.num_sources() == 0),
                "skip_bytes can only be set for single parquet source case");
   CUDF_EXPECTS(val == 0 or (not _num_rows.has_value() and _skip_rows == 0),
                "skip_bytes cannot be set along with skip_rows and num_rows");
@@ -833,7 +834,8 @@ void parquet_reader_options::set_skip_bytes(size_t val)
 
 void parquet_reader_options::set_num_bytes(size_t val)
 {
-  CUDF_EXPECTS(std::cmp_less_equal(_source.num_sources(), 1),
+  // Hybrid scan reader does not contain a source so relaxing this check to zero or one source
+  CUDF_EXPECTS(val == 0 or _source.num_sources() == 1 or _source.num_sources() == 0),
                "num_bytes can only be set for single parquet source case");
   CUDF_EXPECTS(not _num_rows.has_value() and _skip_rows == 0,
                "num_bytes cannot be set along with skip_rows and num_rows");

From da158246efd24999ec996a2b46e5c57ca617f6af Mon Sep 17 00:00:00 2001
From: anon <users.noreply.github.com>
Date: Tue, 2 Dec 2025 18:47:25 +0000
Subject: [PATCH 7/8] minor stuff

---
 cpp/src/io/functions.cpp                               |  4 ++--
 cpp/tests/io/experimental/hybrid_scan_filters_test.cpp | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index dacacbd5286..db3f41b082f 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -822,7 +822,7 @@ void parquet_reader_options::set_num_rows(int64_t val)
 void parquet_reader_options::set_skip_bytes(size_t val)
 {
   // Hybrid scan reader does not contain a source so relaxing this check to zero or one source
-  CUDF_EXPECTS(val == 0 or _source.num_sources() == 1 or _source.num_sources() == 0),
+  CUDF_EXPECTS(val == 0 or _source.num_sources() == 1 or _source.num_sources() == 0,
                "skip_bytes can only be set for single parquet source case");
   CUDF_EXPECTS(val == 0 or (not _num_rows.has_value() and _skip_rows == 0),
                "skip_bytes cannot be set along with skip_rows and num_rows");
@@ -835,7 +835,7 @@ void parquet_reader_options::set_skip_bytes(size_t val)
 void parquet_reader_options::set_num_bytes(size_t val)
 {
   // Hybrid scan reader does not contain a source so relaxing this check to zero or one source
-  CUDF_EXPECTS(val == 0 or _source.num_sources() == 1 or _source.num_sources() == 0),
+  CUDF_EXPECTS(val == 0 or _source.num_sources() == 1 or _source.num_sources() == 0,
                "num_bytes can only be set for single parquet source case");
   CUDF_EXPECTS(not _num_rows.has_value() and _skip_rows == 0,
                "num_bytes cannot be set along with skip_rows and num_rows");
diff --git a/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp b/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp
index 0fb886efa8f..d3e8c69dd3e 100644
--- a/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp
+++ b/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp
@@ -289,6 +289,16 @@ TEST_F(HybridScanFiltersTest, FilterRowGroupsWithByteRanges)
     expected_row_group_indices = std::vector<cudf::size_type>{1};
     EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices);
   }
+  
+  {
+    // Start with all row groups and skip all row groups as [500000, inf) byte range is beyond the file size
+    auto constexpr skip_bytes               = 500'000;
+    options.set_skip_bytes(skip_bytes);
+    auto const filtered_row_group_indices =
+      reader->filter_row_groups_with_byte_range(input_row_group_indices, options);
+    auto const expected_row_group_indices = std::vector<cudf::size_type>{};
+    EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices);
+  }
 }
 
 TEST_F(HybridScanFiltersTest, FilterRowGroupsWithStats)

From 2d29027cb0cca16aba03d597ff968e69c5a6ab7f Mon Sep 17 00:00:00 2001
From: anon <users.noreply.github.com>
Date: Tue, 2 Dec 2025 19:04:33 +0000
Subject: [PATCH 8/8] style fix

---
 cpp/src/io/functions.cpp                               | 3 ++-
 cpp/tests/io/experimental/hybrid_scan_filters_test.cpp | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index db3f41b082f..a8e7d1a2494 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -835,7 +835,8 @@ void parquet_reader_options::set_skip_bytes(size_t val)
 void parquet_reader_options::set_num_bytes(size_t val)
 {
   // Hybrid scan reader does not contain a source so relaxing this check to zero or one source
-  CUDF_EXPECTS(val == 0 or _source.num_sources() == 1 or _source.num_sources() == 0,
+  CUDF_EXPECTS(val == std::numeric_limits<size_t>::max() or _source.num_sources() == 1 or
+                 _source.num_sources() == 0,
                "num_bytes can only be set for single parquet source case");
   CUDF_EXPECTS(not _num_rows.has_value() and _skip_rows == 0,
                "num_bytes cannot be set along with skip_rows and num_rows");
diff --git a/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp b/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp
index d3e8c69dd3e..cde685fde32 100644
--- a/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp
+++ b/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp
@@ -289,10 +289,11 @@ TEST_F(HybridScanFiltersTest, FilterRowGroupsWithByteRanges)
     expected_row_group_indices = std::vector<cudf::size_type>{1};
     EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices);
   }
-  
+
   {
-    // Start with all row groups and skip all row groups as [500000, inf) byte range is beyond the file size
-    auto constexpr skip_bytes               = 500'000;
+    // Start with all row groups and skip all of them as the [500000, 550000) byte range (the
+    // previously set `num_bytes` of 50000 still applies) is beyond the file size
+    auto constexpr skip_bytes = 500'000;
     options.set_skip_bytes(skip_bytes);
     auto const filtered_row_group_indices =
       reader->filter_row_groups_with_byte_range(input_row_group_indices, options);