@@ -11,10 +11,12 @@ use crate::protobuf::{
 };
 use arrow_flight::FlightData;
 use arrow_flight::Ticket;
-use arrow_flight::encode::FlightDataEncoderBuilder;
+use arrow_flight::encode::{DictionaryHandling, FlightDataEncoderBuilder};
 use arrow_flight::error::FlightError;
 use arrow_flight::flight_service_server::FlightService;
+use arrow_select::dictionary::garbage_collect_any_dictionary;
 use bytes::Bytes;
+use datafusion::arrow::array::{Array, AsArray, RecordBatch};
 
 use datafusion::common::exec_datafusion_err;
 use datafusion::error::DataFusionError;
@@ -134,8 +136,22 @@ impl ArrowFlightEndpoint {
             .execute(doget.target_partition as usize, session_state.task_ctx())
             .map_err(|err| Status::internal(format!("Error executing stage plan: {err:#?}")))?;
 
+        let schema = stream.schema().clone();
+
+        // Apply garbage collection of dictionary and view arrays before sending over the network
+        let stream = stream.and_then(|rb| std::future::ready(garbage_collect_arrays(rb)));
+
         let stream = FlightDataEncoderBuilder::new()
-            .with_schema(stream.schema().clone())
+            .with_schema(schema)
+            // This tells the encoder to send dictionaries across the wire as-is.
+            // The alternative (`DictionaryHandling::Hydrate`) would expand the dictionaries
+            // into their value types, which can potentially blow up the size of the data transfer.
+            // The main reason to use `DictionaryHandling::Hydrate` is for compatibility with clients
+            // that do not support dictionaries, but since we are using the same server/client on both
+            // sides, we can safely use `DictionaryHandling::Resend`.
+            // Note that we do garbage collection of unused dictionary values above, so we are not
+            // sending unused dictionary values over the wire.
+            .with_dictionary_handling(DictionaryHandling::Resend)
             .build(stream.map_err(|err| {
                 FlightError::Tonic(Box::new(datafusion_error_to_tonic_status(&err)))
             }));
@@ -210,6 +226,34 @@ fn collect_and_create_metrics_flight_data(
     Ok(incoming.with_app_metadata(buf))
 }
 
+/// Garbage collects unused values in dictionary and view arrays.
+///
+/// We apply this before sending RecordBatches over the network to avoid sending
+/// dictionary values that are not referenced by any keys, or view buffers that are no longer used.
+///
+/// Unused values can arise from operations such as filtering, where
+/// some keys may no longer be referenced in the filtered result.
+fn garbage_collect_arrays(batch: RecordBatch) -> Result<RecordBatch, DataFusionError> {
+    let (schema, arrays, _row_count) = batch.into_parts();
+
+    let arrays = arrays
+        .into_iter()
+        .map(|array| {
+            if let Some(array) = array.as_any_dictionary_opt() {
+                garbage_collect_any_dictionary(array)
+            } else if let Some(array) = array.as_string_view_opt() {
+                Ok(Arc::new(array.gc()) as Arc<dyn Array>)
+            } else if let Some(array) = array.as_binary_view_opt() {
+                Ok(Arc::new(array.gc()) as Arc<dyn Array>)
+            } else {
+                Ok(array)
+            }
+        })
+        .collect::<Result<Vec<_>, _>>()?;
+
+    Ok(RecordBatch::try_new(schema, arrays)?)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
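
To illustrate the situation the new `garbage_collect_arrays` helper addresses, here is a minimal standalone sketch (not part of the patch): filtering a `DictionaryArray` only removes keys, while the full values array stays attached, so without garbage collection the unreferenced values would be serialized and sent over the wire. It uses only `arrow-array`/`arrow-select` APIs; crate versions are assumed to match the ones this repository depends on.

```rust
use std::sync::Arc;

use arrow_array::cast::AsArray;
use arrow_array::types::Int32Type;
use arrow_array::{Array, BooleanArray, DictionaryArray, Int32Array, StringArray};
use arrow_select::filter::filter;

fn main() -> Result<(), arrow_schema::ArrowError> {
    // 4 distinct dictionary values, referenced by 6 rows.
    let values = StringArray::from(vec!["alpha", "beta", "gamma", "delta"]);
    let keys = Int32Array::from(vec![0, 1, 2, 3, 0, 1]);
    let dict = DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values))?;

    // Filter out the rows that reference "gamma" and "delta".
    let predicate = BooleanArray::from(vec![true, true, false, false, true, true]);
    let filtered = filter(&dict, &predicate)?;
    let filtered = filtered.as_dictionary::<Int32Type>();

    // Only "alpha" and "beta" are still referenced, but the values array still
    // holds all 4 entries and would be shipped over the network as-is.
    assert_eq!(filtered.len(), 4);
    assert_eq!(filtered.values().len(), 4);
    Ok(())
}
```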
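And as a hedged sketch of why `DictionaryHandling::Resend` is safe here: both ends of the connection use the same `arrow-flight` encode/decode machinery, so dictionary batches sent as-is can be reassembled by the client-side decoder. The `round_trip` function and the use of `futures::stream::iter` below are illustrative assumptions, not code from this repository.

```rust
use arrow_array::RecordBatch;
use arrow_flight::decode::FlightRecordBatchStream;
use arrow_flight::encode::{DictionaryHandling, FlightDataEncoderBuilder};
use arrow_flight::error::FlightError;
use futures::{stream, TryStreamExt};

/// Encode a batch the same way the endpoint does, then decode it back,
/// as an arrow-flight client on the other side of the wire would.
async fn round_trip(batch: RecordBatch) -> Result<Vec<RecordBatch>, FlightError> {
    // "Server" side: dictionaries are resent as dictionary batches instead of
    // being hydrated into their value types.
    let flight_data = FlightDataEncoderBuilder::new()
        .with_schema(batch.schema())
        .with_dictionary_handling(DictionaryHandling::Resend)
        .build(stream::iter([Ok(batch)]));

    // "Client" side: the decoder consumes the dictionary batches and
    // reconstructs dictionary-encoded RecordBatches.
    FlightRecordBatchStream::new_from_flight_data(flight_data)
        .try_collect()
        .await
}
```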