1- //! Arrow -rs based parquet reader .
1+ //! Parquet reader built on the arrow-rs `parquet` crate.
22//!
3- //! This module provides a parquet reader built on the arrow-rs `parquet` crate,
4- //! replacing the parquet2/arrow2 decode pipeline. It uses [`DaftAsyncFileReader`]
5- //! as the IO bridge for remote reads, and the sync `ParquetRecordBatchReaderBuilder`
6- //! with `std::fs::File` for local reads (avoiding IOClient overhead).
3+ //! Uses [`DaftAsyncFileReader`] as the IO bridge for remote reads, and the sync
4+ //! `ParquetRecordBatchReaderBuilder` with `std::fs::File` for local reads.
75
86use std:: {
97 borrow:: Borrow ,
@@ -37,10 +35,12 @@ use tokio_stream::wrappers::ReceiverStream;
3735
3836use crate :: {
3937 async_reader:: DaftAsyncFileReader ,
40- metadata:: apply_field_ids_to_arrowrs_parquet_metadata,
41- read:: ParquetSchemaInferenceOptions ,
38+ metadata:: {
39+ apply_field_ids_to_arrowrs_parquet_metadata, strip_string_types_from_parquet_metadata,
40+ } ,
41+ read:: { ParquetSchemaInferenceOptions , StringEncoding } ,
4242 schema_inference:: { arrow_schema_to_daft_schema, infer_schema_from_parquet_metadata_arrowrs} ,
43- statistics:: arrowrs_row_group_metadata_to_table_stats ,
43+ statistics:: row_group_metadata_to_table_stats ,
4444} ;
4545
4646/// Default batch size for the arrow-rs reader (number of rows per batch).
@@ -89,8 +89,7 @@ fn infer_schemas(
8989 let arrow_schema = infer_schema_from_parquet_metadata_arrowrs (
9090 parquet_metadata,
9191 Some ( schema_infer_options. coerce_int96_timestamp_unit ) ,
92- schema_infer_options. string_encoding
93- == daft_arrow:: io:: parquet:: read:: schema:: StringEncoding :: Raw ,
92+ schema_infer_options. string_encoding == StringEncoding :: Raw ,
9493 )
9594 . map_err ( parquet_err) ?;
9695 let daft_schema = arrow_schema_to_daft_schema ( & arrow_schema) ?;
@@ -325,9 +324,8 @@ fn deletes_to_row_selection(local_deletes: &[usize], total_rows: usize) -> RowSe
325324 selectors. into ( )
326325}
327326
328- /// Read a single parquet file into a Daft [`RecordBatch`] using the arrow-rs reader .
327+ /// Read a single parquet file into a Daft [`RecordBatch`].
329328///
330- /// This is the arrow-rs equivalent of the parquet2-based `read_parquet_single`.
331329/// When `predicate` and/or `delete_rows` are provided, the reader handles them
332330/// internally using arrow-rs `RowFilter` and `RowSelection` for late materialization.
333331///
@@ -339,10 +337,9 @@ fn deletes_to_row_selection(local_deletes: &[usize], total_rows: usize) -> RowSe
339337/// offset (skip file rows) → predicate filter → limit
340338///
341339/// Note: `start_offset > 0` is rejected by the micropartition reader and never used
342- /// in production (the streaming scan path doesn't even accept the parameter). The
343- /// parquet2 reader has latent bugs for this case — both its local and remote paths
344- /// produce RecordBatch size mismatches when `start_offset > 0`. Our implementation
345- /// follows the intended semantics based on the code structure and the `apply_delete_rows`
340+ /// in production (the streaming scan path doesn't even accept the parameter).
341+ /// Our implementation follows the intended semantics based on the code
342+ /// structure and the `apply_delete_rows`
346343/// docstring in `read.rs`, but there is no working reference implementation to compare
347344/// against.
348345#[ allow( clippy:: too_many_arguments) ]
@@ -370,6 +367,13 @@ pub async fn read_parquet_single_arrowrs(
370367 parquet_metadata = apply_field_ids_to_arrowrs_parquet_metadata ( parquet_metadata, mapping) ?;
371368 }
372369
370+ // 1c. For StringEncoding::Raw, strip STRING/UTF8 logical types from the parquet
371+ // metadata so arrow-rs infers Binary instead of Utf8. This avoids UTF-8
372+ // validation during decode, allowing files with invalid UTF-8 to be read.
373+ if schema_infer_options. string_encoding == StringEncoding :: Raw {
374+ parquet_metadata = strip_string_types_from_parquet_metadata ( parquet_metadata) ?;
375+ }
376+
373377 // 2. Infer schema with Daft options (INT96 coercion, string encoding).
374378 let ( arrow_schema, daft_schema) = infer_schemas ( & parquet_metadata, & schema_infer_options) ?;
375379
@@ -672,6 +676,12 @@ pub(crate) fn local_parquet_setup(
672676 parquet_metadata = apply_field_ids_to_arrowrs_parquet_metadata ( parquet_metadata, mapping) ?;
673677 }
674678
679+ // 1c. For StringEncoding::Raw, strip STRING/UTF8 logical types so arrow-rs
680+ // reads BYTE_ARRAY as Binary (no UTF-8 validation).
681+ if schema_infer_options. string_encoding == StringEncoding :: Raw {
682+ parquet_metadata = strip_string_types_from_parquet_metadata ( parquet_metadata) ?;
683+ }
684+
675685 // 2. Infer schema with Daft options.
676686 let ( arrow_schema, daft_schema) = infer_schemas ( & parquet_metadata, & schema_infer_options) ?;
677687
@@ -876,8 +886,7 @@ pub(crate) fn decode_single_rg(
876886///
877887/// This avoids the overhead of `DaftAsyncFileReader` + `IOClient` for local files
878888/// by using `std::fs::File` directly with `ParquetRecordBatchReaderBuilder`.
879- /// Row groups are decoded in parallel using rayon, matching the parquet2 reader's
880- /// parallelism strategy. Supports late materialization via `RowFilter` and
889+ /// Row groups are decoded in parallel using rayon. Supports late materialization via `RowFilter` and
881890/// positional delete skipping via `RowSelection`.
882891///
883892/// See [`read_parquet_single_arrowrs`] for `start_offset` semantics.
@@ -964,8 +973,8 @@ pub fn local_parquet_read_arrowrs(
964973/// Stream a local parquet file as Daft [`RecordBatch`]es using the sync arrow-rs reader,
965974/// dispatching per-row-group decode as async tasks on the compute runtime.
966975///
967- /// Matches parquet2's `local_parquet_stream` pattern: sync metadata read, then
968- /// per-RG tasks on the DAFTCPU pool with semaphore-gated parallelism.
976+ /// Performs a sync metadata read, then dispatches per-RG decode tasks on the
977+ /// DAFTCPU pool with semaphore-gated parallelism.
969978#[ allow( clippy:: too_many_arguments) ]
970979pub async fn local_parquet_stream_arrowrs (
971980 path : & str ,
@@ -1009,8 +1018,7 @@ pub async fn local_parquet_stream_arrowrs(
10091018 }
10101019
10111020 // 2. Semaphore: limit concurrent RG decodes.
1012- // Unlike parquet2 (which spawns per-column tasks and divides by num_columns),
1013- // arrowrs decodes all columns in a single block_in_place call per RG,
1021+ // All columns are decoded in a single block_in_place call per RG,
10141022 // so concurrency is limited only by available CPUs.
10151023 let num_cpus = std:: thread:: available_parallelism ( )
10161024 . map ( |n| n. get ( ) )
@@ -1073,7 +1081,6 @@ pub async fn local_parquet_stream_arrowrs(
10731081
10741082/// Stream a single parquet file as Daft [`RecordBatch`]es using the arrow-rs reader.
10751083///
1076- /// This is the arrow-rs equivalent of the parquet2-based `stream_parquet_single`.
10771084/// Supports late materialization via `RowFilter` and positional delete skipping
10781085/// via `RowSelection`.
10791086#[ allow( clippy:: too_many_arguments) ]
@@ -1101,6 +1108,12 @@ pub async fn stream_parquet_single_arrowrs(
11011108 parquet_metadata = apply_field_ids_to_arrowrs_parquet_metadata ( parquet_metadata, mapping) ?;
11021109 }
11031110
1111+ // 1c. For StringEncoding::Raw, strip STRING/UTF8 logical types so arrow-rs
1112+ // reads BYTE_ARRAY as Binary (no UTF-8 validation).
1113+ if schema_infer_options. string_encoding == StringEncoding :: Raw {
1114+ parquet_metadata = strip_string_types_from_parquet_metadata ( parquet_metadata) ?;
1115+ }
1116+
11041117 // 2. Infer schema with Daft options.
11051118 let ( arrow_schema, daft_schema) = infer_schemas ( & parquet_metadata, & schema_infer_options) ?;
11061119
@@ -1296,7 +1309,7 @@ fn prune_row_groups(
12961309 let mut result = Vec :: with_capacity ( candidates. len ( ) ) ;
12971310 for rg_idx in candidates {
12981311 let rg_meta = metadata. row_group ( rg_idx) ;
1299- match arrowrs_row_group_metadata_to_table_stats ( rg_meta, schema) {
1312+ match row_group_metadata_to_table_stats ( rg_meta, schema) {
13001313 Ok ( stats) => {
13011314 let evaled = stats. eval_expression ( & bound_pred) ?;
13021315 if evaled. to_truth_value ( ) != TruthValue :: False {
0 commit comments