-
Notifications
You must be signed in to change notification settings - Fork 990
Filter row groups using byte range in the new experimental parquet reader #20733
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 6 commits
558386b
c8b73a2
bb58fc4
3159064
cafde09
103d53e
abcc2b6
8ba39e5
da15824
2d29027
4c0b6a4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -821,7 +821,7 @@ void parquet_reader_options::set_num_rows(int64_t val) | |
|
|
||
| void parquet_reader_options::set_skip_bytes(size_t val) | ||
| { | ||
| CUDF_EXPECTS(val == 0 or std::cmp_equal(_source.num_sources(), 1), | ||
| CUDF_EXPECTS(val == 0 or std::cmp_less_equal(_source.num_sources(), 1), | ||
|
||
| "skip_bytes can only be set for single parquet source case"); | ||
| CUDF_EXPECTS(val == 0 or (not _num_rows.has_value() and _skip_rows == 0), | ||
| "skip_bytes cannot be set along with skip_rows and num_rows"); | ||
|
|
@@ -833,7 +833,7 @@ void parquet_reader_options::set_skip_bytes(size_t val) | |
|
|
||
| void parquet_reader_options::set_num_bytes(size_t val) | ||
| { | ||
| CUDF_EXPECTS(std::cmp_equal(_source.num_sources(), 1), | ||
| CUDF_EXPECTS(std::cmp_less_equal(_source.num_sources(), 1), | ||
| "num_bytes can only be set for single parquet source case"); | ||
| CUDF_EXPECTS(not _num_rows.has_value() and _skip_rows == 0, | ||
| "num_bytes cannot be set along with skip_rows and num_rows"); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -74,6 +74,18 @@ size_type hybrid_scan_reader::total_rows_in_row_groups( | |
| return _impl->total_rows_in_row_groups(input_row_group_indices); | ||
| } | ||
|
|
||
| std::vector<size_type> hybrid_scan_reader::filter_row_groups_with_byte_range( | ||
| cudf::host_span<size_type const> row_group_indices, parquet_reader_options const& options) const | ||
| { | ||
| CUDF_FUNC_RANGE(); | ||
|
|
||
| // Temporary vector with row group indices from the first source | ||
| auto const input_row_group_indices = | ||
| std::vector<std::vector<size_type>>{{row_group_indices.begin(), row_group_indices.end()}}; | ||
|
|
||
| return _impl->filter_row_groups_with_byte_range(input_row_group_indices, options).front(); | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Call the impl API |
||
| } | ||
|
|
||
| std::vector<cudf::size_type> hybrid_scan_reader::filter_row_groups_with_stats( | ||
| cudf::host_span<size_type const> row_group_indices, | ||
| parquet_reader_options const& options, | ||
|
|
||
mhaseeb123 marked this conversation as resolved.
Show resolved
Hide resolved
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
New public API