-
Notifications
You must be signed in to change notification settings - Fork 14
Fix Dictionary Encoded Values #174
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
576db69
624228b
5e670b2
5510bd8
8f858a7
a141a3b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,7 +3,6 @@ use crate::config_extension_ext::ContextGrpcMetadata; | |
| use crate::execution_plans::{DistributedTaskContext, StageExec}; | ||
| use crate::flight_service::service::ArrowFlightEndpoint; | ||
| use crate::flight_service::session_builder::DistributedSessionBuilderContext; | ||
| use crate::flight_service::trailing_flight_data_stream::TrailingFlightDataStream; | ||
| use crate::metrics::TaskMetricsCollector; | ||
| use crate::metrics::proto::df_metrics_set_to_proto; | ||
| use crate::protobuf::{ | ||
|
|
@@ -16,18 +15,17 @@ use arrow_flight::encode::FlightDataEncoderBuilder; | |
| use arrow_flight::error::FlightError; | ||
| use arrow_flight::flight_service_server::FlightService; | ||
| use bytes::Bytes; | ||
| use datafusion::arrow::array::RecordBatch; | ||
| use datafusion::arrow::datatypes::SchemaRef; | ||
| use datafusion::arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions}; | ||
|
|
||
| use datafusion::common::exec_datafusion_err; | ||
| use datafusion::execution::SendableRecordBatchStream; | ||
| use datafusion::physical_plan::stream::RecordBatchStreamAdapter; | ||
| use datafusion::prelude::SessionContext; | ||
| use futures::TryStreamExt; | ||
| use futures::{Stream, stream}; | ||
| use futures::stream; | ||
| use futures::{StreamExt, TryStreamExt}; | ||
| use prost::Message; | ||
| use std::sync::Arc; | ||
| use std::sync::atomic::{AtomicUsize, Ordering}; | ||
| use std::task::Poll; | ||
| use tonic::{Request, Response, Status}; | ||
|
|
||
| #[derive(Clone, PartialEq, ::prost::Message)] | ||
|
|
@@ -173,51 +171,82 @@ fn flight_stream_from_record_batch_stream( | |
| evict_stage: impl FnOnce() + Send + 'static, | ||
| stream: SendableRecordBatchStream, | ||
| ) -> Response<<ArrowFlightEndpoint as FlightService>::DoGetStream> { | ||
| let flight_data_stream = | ||
| let mut flight_data_stream = | ||
| FlightDataEncoderBuilder::new() | ||
| .with_schema(stream.schema().clone()) | ||
| .build(stream.map_err(|err| { | ||
| FlightError::Tonic(Box::new(datafusion_error_to_tonic_status(&err))) | ||
| })); | ||
|
|
||
| let trailing_metrics_stream = TrailingFlightDataStream::new( | ||
| move || { | ||
| if stage_data | ||
| .num_partitions_remaining | ||
| .fetch_sub(1, Ordering::SeqCst) | ||
| == 1 | ||
| { | ||
| evict_stage(); | ||
|
|
||
| let metrics_stream = | ||
| collect_and_create_metrics_flight_data(stage_key, stage_data.stage).map_err( | ||
| |err| { | ||
| Status::internal(format!( | ||
| "error collecting metrics in arrow flight endpoint: {err}" | ||
| )) | ||
| }, | ||
| )?; | ||
|
|
||
| return Ok(Some(metrics_stream)); | ||
| } | ||
|
|
||
| Ok(None) | ||
| }, | ||
| flight_data_stream, | ||
| ); | ||
| // executed once when the stream ends | ||
| // decorates the last flight data with our metrics | ||
| let mut final_closure = Some(move |last_flight_data| { | ||
| if stage_data | ||
| .num_partitions_remaining | ||
| .fetch_sub(1, Ordering::SeqCst) | ||
| == 1 | ||
| { | ||
| evict_stage(); | ||
|
|
||
| collect_and_create_metrics_flight_data(stage_key, stage_data.stage, last_flight_data) | ||
| } else { | ||
| Ok(last_flight_data) | ||
| } | ||
| }); | ||
|
|
||
| // this is used to peek the new value | ||
| // so that we can add our metrics to the last flight data | ||
| let mut current_value = None; | ||
|
|
||
| let stream = | ||
| stream::poll_fn( | ||
|
||
| move |cx| match futures::ready!(flight_data_stream.poll_next_unpin(cx)) { | ||
| Some(Ok(new_val)) => { | ||
| match current_value.take() { | ||
| // This is the first value, so we store it and repoll to get the next value | ||
| None => { | ||
| current_value = Some(new_val); | ||
| cx.waker().wake_by_ref(); | ||
| Poll::Pending | ||
| } | ||
|
|
||
| Some(existing) => { | ||
| current_value = Some(new_val); | ||
|
|
||
| Poll::Ready(Some(Ok(existing))) | ||
| } | ||
| } | ||
| } | ||
| // this is our last value, so we add our metrics to this flight data | ||
| None => match current_value.take() { | ||
| Some(existing) => { | ||
| // make sure we wake ourselves to finish the stream | ||
| cx.waker().wake_by_ref(); | ||
|
|
||
| if let Some(closure) = final_closure.take() { | ||
| Poll::Ready(Some(closure(existing))) | ||
| } else { | ||
| unreachable!("the closure is only executed once") | ||
| } | ||
| } | ||
| None => Poll::Ready(None), | ||
| }, | ||
| err => Poll::Ready(err), | ||
| }, | ||
| ); | ||
|
|
||
| Response::new(Box::pin(trailing_metrics_stream.map_err(|err| match err { | ||
| Response::new(Box::pin(stream.map_err(|err| match err { | ||
| FlightError::Tonic(status) => *status, | ||
| _ => Status::internal(format!("Error during flight stream: {err}")), | ||
| }))) | ||
| } | ||
|
|
||
| // Collects metrics from the provided stage and encodes it into a stream of flight data using | ||
| // the schema of the stage. | ||
| /// Collects metrics from the provided stage and includes it in the flight data | ||
| fn collect_and_create_metrics_flight_data( | ||
| stage_key: StageKey, | ||
| stage: Arc<StageExec>, | ||
| ) -> Result<impl Stream<Item = Result<FlightData, FlightError>> + Send + 'static, FlightError> { | ||
| incoming: FlightData, | ||
| ) -> Result<FlightData, FlightError> { | ||
| // Get the metrics for the task executed on this worker. Separately, collect metrics for child tasks. | ||
| let mut result = TaskMetricsCollector::new() | ||
| .collect(stage.plan.clone()) | ||
|
|
@@ -252,35 +281,12 @@ fn collect_and_create_metrics_flight_data( | |
| })), | ||
| }; | ||
|
|
||
| let metrics_flight_data = | ||
| empty_flight_data_with_app_metadata(flight_app_metadata, stage.plan.schema())?; | ||
| Ok(Box::pin(stream::once( | ||
| async move { Ok(metrics_flight_data) }, | ||
| ))) | ||
| } | ||
|
|
||
| /// Creates a FlightData with the given app_metadata and empty RecordBatch using the provided schema. | ||
| /// We don't use [arrow_flight::encode::FlightDataEncoder] (and by extension, the [arrow_flight::encode::FlightDataEncoderBuilder]) | ||
| /// since they skip messages with empty RecordBatch data. | ||
| pub fn empty_flight_data_with_app_metadata( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is overall looking really good! I have one question purely for my understanding: Before, we were appending the Arrow Flight stream with one extra message containing the metrics app metadata. Now, we are intercepting the last message in the Arrow Flight stream, and we are enriching it with the metrics app metadata. I see that without these changes, your new test fails with the following error. But I don't fully understand why, as I'd have expected the "Before" and "Now" approaches to be equivalent.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, based on my understanding, when encoding arrow flight data, there are 3 main types:
When the stream starts, the schema is encoded and sent. At that point (when we encode & send the schema) the dictionary tracker is loaded and populated based upon what dictionaries are present within fields. Then we get to the record batches. If any of them have fields with dictionary types, we use the dictionary tracker to determine if we have sent the dictionary already, and then send it if not, along with the record batch itself. In the old code, we weren't encoding the schema with the empty record batch, so the dictionary tracker was never populated with the dictionaries. This meant that when it came time to encode the record batch (& possibly dictionary), it checks the schema of the record batch and determines a field has a dictionary, so it consults the tracker as to what to do. But because we weren't encoding the schema, the dictionary tracker is in a bad state, and so you get the error. So you could fix it by asking the flight data encoder for the schema it has and using that to encode the empty batch, but then if you ever wanted to change whether to hydrate dictionaries in the future, it'd be broken. So I figured a simpler fix is what is proposed: don't bother writing out an empty record batch; just append the data to the last value.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👋🏽 Thanks for the contribution! Sending the metadata with the last message is more optimal for sure. I'm reviewing this PR now. |
||
| metadata: FlightAppMetadata, | ||
| schema: SchemaRef, | ||
| ) -> Result<FlightData, FlightError> { | ||
| let mut buf = vec![]; | ||
| metadata | ||
| flight_app_metadata | ||
| .encode(&mut buf) | ||
| .map_err(|err| FlightError::ProtocolError(err.to_string()))?; | ||
|
|
||
| let empty_batch = RecordBatch::new_empty(schema); | ||
| let options = IpcWriteOptions::default(); | ||
| let data_gen = IpcDataGenerator::default(); | ||
| let mut dictionary_tracker = DictionaryTracker::new(true); | ||
| let (_, encoded_data) = data_gen | ||
| .encoded_batch(&empty_batch, &mut dictionary_tracker, &options) | ||
| .map_err(|e| { | ||
| FlightError::ProtocolError(format!("Failed to create empty batch FlightData: {e}")) | ||
| })?; | ||
| Ok(FlightData::from(encoded_data).with_app_metadata(buf)) | ||
| Ok(incoming.with_app_metadata(buf)) | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like 1:1 schema mapping. What does it do? Is this just a way to assert that the schema hasn't changed? I think adding a test which shows why this is necessary would be good.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The schema does change. The Arrow Flight data hydrates dictionary values as real values, and so the schema of the incoming recordbatch is different. We use the mapper here to map back to what the execution plan expects
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I noticed that tests still pass without this line.
IIUC, the root problem was on the server - we were sending an empty flight data to the client without sending the schema / dictionary message first. You've fixed this problem.
I don't see an issue on the client that this solves. The flight decoder in the client should be able to handle any message sent by the encoder on the server.
The metrics collector on the client passes through flight data unchanged, minus clearing the app_metadata.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would prefer to either have a test which shows why this is needed or remove the lines. Lmk if you think otherwise though!
Once again, I appreciate the contribution 🙏🏽 - the old empty flight data code was sketchy for sure.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
test_metrics_collection_e2e_4 fails with this removed from both the network plans
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah sorry I commented one but not the other. This LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added an `assert` here to make sure the schema matches: a141a3b