|
18 | 18 |
|
19 | 19 | #include <src/io/parquet/parquet_gpu.hpp> |
20 | 20 |
|
| 21 | +#include <filesystem> |
| 22 | +#include <fstream> |
| 23 | + |
21 | 24 | namespace { |
22 | 25 |
|
23 | 26 | /** |
@@ -225,6 +228,80 @@ TEST_F(HybridScanFiltersTest, TestExternalMetadata) |
225 | 228 | EXPECT_EQ(reader->total_rows_in_row_groups(input_row_group_indices), 2 * rows_per_row_group); |
226 | 229 | } |
227 | 230 |
|
| 231 | +TEST_F(HybridScanFiltersTest, FilterRowGroupsWithByteRanges) |
| 232 | +{ |
| 233 | + using T = cudf::string_view; |
| 234 | + auto const [table, filepath] = create_parquet_typed_with_stats<T>("ByteBounds.parquet"); |
| 235 | + |
| 236 | + auto const file_size = std::filesystem::file_size(filepath); |
| 237 | + std::vector<char> file_buffer(file_size); |
| 238 | + std::ifstream file{filepath, std::ifstream::binary}; |
| 239 | + file.read(file_buffer.data(), file_size); |
| 240 | + file.close(); |
| 241 | + |
| 242 | + // Input file buffer span |
| 243 | + auto const file_buffer_span = cudf::host_span<uint8_t const>( |
| 244 | + reinterpret_cast<uint8_t const*>(file_buffer.data()), file_buffer.size()); |
| 245 | + |
| 246 | + // Fetch footer and page index bytes from the buffer. |
| 247 | + auto const footer_buffer = fetch_footer_bytes(file_buffer_span); |
| 248 | + |
| 249 | + // Create hybrid scan reader with footer bytes |
| 250 | + auto options = cudf::io::parquet_reader_options::builder().build(); |
| 251 | + auto const reader = |
| 252 | + std::make_unique<cudf::io::parquet::experimental::hybrid_scan_reader>(footer_buffer, options); |
| 253 | + |
| 254 | + auto const input_row_group_indices = reader->all_row_groups(options); |
| 255 | + |
| 256 | + // @note: In the above parquet file, the row groups start at the following byte offsets: 4, 75224, |
| 257 | + // 150332, 225561. The `skip_bytes` and `num_bytes` have been chosen to have enough cushion but |
| 258 | + // may need to be adjusted in the future if this test suddenly starts failing. |
| 259 | + |
| 260 | + { |
| 261 | + // Start with all row groups and only read row group 0 as only it will start in [0, 1000) byte |
| 262 | + // range |
| 263 | + auto constexpr num_bytes = 1000; |
| 264 | + options.set_num_bytes(num_bytes); |
| 265 | + auto const filtered_row_group_indices = |
| 266 | + reader->filter_row_groups_with_byte_range(input_row_group_indices, options); |
| 267 | + auto const expected_row_group_indices = std::vector<cudf::size_type>{0}; |
| 268 | + EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices); |
| 269 | + } |
| 270 | + |
| 271 | + { |
| 272 | + // Start with all row groups and skip row group 0 as it won't start in [1000, inf) byte range |
| 273 | + auto skip_bytes = 1000; |
| 274 | + options.set_skip_bytes(skip_bytes); |
| 275 | + options.set_num_bytes(std::numeric_limits<size_t>::max()); |
| 276 | + auto filtered_row_group_indices = |
| 277 | + reader->filter_row_groups_with_byte_range(input_row_group_indices, options); |
| 278 | + auto expected_row_group_indices = std::vector<cudf::size_type>{1, 2, 3}; |
| 279 | + EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices); |
| 280 | + |
| 281 | + // Now start with filtered row groups and only read row group 1 as only it starts in [50000, |
| 282 | + // 100000) byte range |
| 283 | + skip_bytes = 50000; |
| 284 | + auto constexpr num_bytes = 50000; |
| 285 | + options.set_skip_bytes(skip_bytes); |
| 286 | + options.set_num_bytes(num_bytes); |
| 287 | + filtered_row_group_indices = |
| 288 | + reader->filter_row_groups_with_byte_range(filtered_row_group_indices, options); |
| 289 | + expected_row_group_indices = std::vector<cudf::size_type>{1}; |
| 290 | + EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices); |
| 291 | + } |
| 292 | + |
| 293 | + { |
| 294 | + // Start with all row groups and skip all row groups as [500000, inf) byte range is beyond the |
| 295 | + // file size |
| 296 | + auto constexpr skip_bytes = 500'000; |
| 297 | + options.set_skip_bytes(skip_bytes); |
| 298 | + auto const filtered_row_group_indices = |
| 299 | + reader->filter_row_groups_with_byte_range(input_row_group_indices, options); |
| 300 | + auto const expected_row_group_indices = std::vector<cudf::size_type>{}; |
| 301 | + EXPECT_EQ(filtered_row_group_indices, expected_row_group_indices); |
| 302 | + } |
| 303 | +} |
| 304 | + |
228 | 305 | TEST_F(HybridScanFiltersTest, FilterRowGroupsWithStats) |
229 | 306 | { |
230 | 307 | srand(0xc001); |
|
0 commit comments