remove schema adapter #205
Changes from all commits: 98ff52e, f400367, 88eb80b, 06e6493, a3f8407
```diff
@@ -11,10 +11,12 @@ use crate::protobuf::{
 };
 use arrow_flight::FlightData;
 use arrow_flight::Ticket;
-use arrow_flight::encode::FlightDataEncoderBuilder;
+use arrow_flight::encode::{DictionaryHandling, FlightDataEncoderBuilder};
 use arrow_flight::error::FlightError;
 use arrow_flight::flight_service_server::FlightService;
+use arrow_select::dictionary::garbage_collect_any_dictionary;
 use bytes::Bytes;
+use datafusion::arrow::array::{Array, AsArray, RecordBatch};
 use datafusion::common::exec_datafusion_err;
+use datafusion::error::DataFusionError;
```
```diff
@@ -134,8 +136,22 @@ impl ArrowFlightEndpoint {
         .execute(doget.target_partition as usize, session_state.task_ctx())
         .map_err(|err| Status::internal(format!("Error executing stage plan: {err:#?}")))?;
 
+    let schema = stream.schema().clone();
+
+    // Apply garbage collection of dictionary and view arrays before sending over the network
+    let stream = stream.and_then(|rb| std::future::ready(garbage_collect_arrays(rb)));
+
     let stream = FlightDataEncoderBuilder::new()
-        .with_schema(stream.schema().clone())
+        .with_schema(schema)
+        // This tells the encoder to send dictionaries across the wire as-is.
+        // The alternative (`DictionaryHandling::Hydrate`) would expand the dictionaries
+        // into their value types, which can potentially blow up the size of the data transfer.
+        // The main reason to use `DictionaryHandling::Hydrate` is for compatibility with clients
+        // that do not support dictionaries, but since we are using the same server/client on both
+        // sides, we can safely use `DictionaryHandling::Resend`.
+        // Note that we do garbage collection of unused dictionary values above, so we are not sending
+        // unused dictionary values over the wire.
+        .with_dictionary_handling(DictionaryHandling::Resend)
         .build(stream.map_err(|err| {
             FlightError::Tonic(Box::new(datafusion_error_to_tonic_status(&err)))
         }));
```

Review comments on the `.with_dictionary_handling(DictionaryHandling::Resend)` line:

Collaborator: So, with this line, we are telling the flight data encoder to just send the dictionaries over the wire instead of hydrating them and re-encoding them. Is that right? If you could include a very brief comment above this line, that would be awesome 🙏

Collaborator: This is a great optimization. Thanks @adriangb
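Answering the question above in code: yes, `DictionaryHandling::Resend` ships dictionary batches over the wire as-is, while `Hydrate` (the encoder's default) expands keys into values before encoding. Below is a minimal, hypothetical round-trip sketch — not part of this PR — using only public `arrow-flight` APIs; the `round_trip` helper and its setup are illustrative:

```rust
use arrow::record_batch::RecordBatch;
use arrow_flight::decode::FlightRecordBatchStream;
use arrow_flight::encode::{DictionaryHandling, FlightDataEncoderBuilder};
use arrow_flight::error::FlightError;
use futures::{stream, TryStreamExt};

/// Encode a stream of batches with dictionaries resent as-is, then decode
/// them back, mirroring what the paired server/client in this PR does.
async fn round_trip(batches: Vec<RecordBatch>) -> Result<Vec<RecordBatch>, FlightError> {
    let input = stream::iter(batches.into_iter().map(Ok));

    let encoded = FlightDataEncoderBuilder::new()
        // Resend keeps the compact dictionary form on the wire; Hydrate
        // (the default) would expand keys into their values first.
        .with_dictionary_handling(DictionaryHandling::Resend)
        .build(input);

    // The receiving side reassembles record batches, dictionary batches included.
    FlightRecordBatchStream::new_from_flight_data(encoded)
        .try_collect()
        .await
}
```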
```diff
@@ -210,6 +226,34 @@ fn collect_and_create_metrics_flight_data(
     Ok(incoming.with_app_metadata(buf))
 }
 
+/// Garbage collects values sub-arrays.
+///
+/// We apply this before sending RecordBatches over the network to avoid sending
+/// values that are not referenced by any dictionary keys or buffers that are not used.
+///
+/// Unused values can arise from operations such as filtering, where
+/// some keys may no longer be referenced in the filtered result.
+fn garbage_collect_arrays(batch: RecordBatch) -> Result<RecordBatch, DataFusionError> {
+    let (schema, arrays, _row_count) = batch.into_parts();
+
+    let arrays = arrays
+        .into_iter()
+        .map(|array| {
+            if let Some(array) = array.as_any_dictionary_opt() {
+                garbage_collect_any_dictionary(array)
+            } else if let Some(array) = array.as_string_view_opt() {
+                Ok(Arc::new(array.gc()) as Arc<dyn Array>)
+            } else if let Some(array) = array.as_binary_view_opt() {
+                Ok(Arc::new(array.gc()) as Arc<dyn Array>)
+            } else {
+                Ok(array)
+            }
+        })
+        .collect::<Result<Vec<_>, _>>()?;
+
+    Ok(RecordBatch::try_new(schema, arrays)?)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
```
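To make the doc comment's filtering scenario concrete, here is a small sketch, assuming the same `arrow`/`arrow-select` crates imported above and that `garbage_collect_any_dictionary` behaves as this PR relies on; the data and assertions are illustrative, not from this repo. Filtering a dictionary array rewrites only the keys, so the values array keeps entries that no key references until a GC pass trims them:

```rust
use std::sync::Arc;

use arrow::array::{
    Array, ArrayRef, AsArray, BooleanArray, DictionaryArray, Int32Array, StringArray,
    StringViewArray,
};
use arrow::compute::filter;
use arrow_select::dictionary::garbage_collect_any_dictionary;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Five rows referencing three distinct dictionary values.
    let keys = Int32Array::from(vec![0, 1, 2, 0, 1]);
    let values: ArrayRef = Arc::new(StringArray::from(vec!["apple", "banana", "cherry"]));
    let dict = DictionaryArray::new(keys, values);

    // Keep only the rows that reference "apple". The filter kernel rewrites
    // the keys but leaves the values array untouched.
    let mask = BooleanArray::from(vec![true, false, false, true, false]);
    let filtered = filter(&dict, &mask)?;
    let filtered_dict = filtered.as_any_dictionary_opt().expect("still a dictionary");
    assert_eq!(filtered_dict.values().len(), 3); // "banana"/"cherry" are dead weight

    // garbage_collect_arrays applies this per column before encoding:
    // unreferenced values are dropped and the keys are remapped.
    let gced = garbage_collect_any_dictionary(filtered_dict)?;
    let gced_dict = gced.as_any_dictionary_opt().expect("still a dictionary");
    assert_eq!(gced_dict.values().len(), 1); // only "apple" crosses the wire

    // View arrays get the same treatment via gc(): after a slice, only the
    // referenced bytes of the data buffers survive compaction.
    let sv = StringViewArray::from(vec!["a string long enough to live in a data buffer"; 4]);
    let sliced = sv.slice(0, 1);
    let compacted = sliced.gc();
    assert!(compacted.get_buffer_memory_size() <= sliced.get_buffer_memory_size());

    Ok(())
}
```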