DataDog
diff --git a/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎datafusion/common/src/config.rs‎
Lines changed: 30 additions & 0 deletions b/‎datafusion/common/src/config.rs‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs‎
Lines changed: 448 additions & 8 deletions b/‎datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs‎
Lines changed: 448 additions & 8 deletions
diff --git a/‎datafusion/physical-expr/src/expressions/in_list.rs‎
Lines changed: 8 additions & 0 deletions b/‎datafusion/physical-expr/src/expressions/in_list.rs‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎datafusion/physical-plan/Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎datafusion/physical-plan/Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎datafusion/physical-plan/src/joins/hash_join/exec.rs‎
Lines changed: 59 additions & 4 deletions b/‎datafusion/physical-plan/src/joins/hash_join/exec.rs‎
Lines changed: 59 additions & 4 deletions
diff --git a/‎datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs‎
Lines changed: 133 additions & 0 deletions b/‎datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs‎
Lines changed: 133 additions & 0 deletions
diff --git a/‎datafusion/physical-plan/src/joins/hash_join/mod.rs‎
Lines changed: 1 addition & 0 deletions b/‎datafusion/physical-plan/src/joins/hash_join/mod.rs‎
Lines changed: 1 addition & 0 deletions
@@ -971,6 +971,36 @@ config_namespace! {
         /// will be collected into a single partition
         pub hash_join_single_partition_threshold_rows: usize, default = 1024 * 128
 
+        /// Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering.
+        /// Build sides larger than this will use hash table lookups instead.
+        /// Set to 0 to always use hash table lookups.
+        ///
+        /// InList pushdown can be more efficient for small build sides because it can result in better
+        /// statistics pruning as well as use any bloom filters present on the scan side.
+        /// InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion.
+        /// On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory.
+        ///
+        /// This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` * `target_partitions` memory.
+        ///
+        /// The default is 128 KiB (`128 * 1024` bytes) per partition.
+        /// This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases
+        /// but avoids excessive memory usage or overhead for larger joins.
+        pub hash_join_inlist_pushdown_max_size: usize, default = 128 * 1024
+
+        /// Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering.
+        /// Build sides with more rows than this will use hash table lookups instead.
+        /// Set to 0 to always use hash table lookups.
+        ///
+        /// This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent
+        /// very large IN lists that might not provide much benefit over hash table lookups.
+        ///
+        /// This uses the deduplicated row count once the build side has been evaluated.
+        ///
+        /// The default is 150 values per partition.
+        /// This is inspired by Trino's `max-filter-keys-per-column` setting.
+        /// See: <https://trino.io/docs/current/admin/dynamic-filtering.html#dynamic-filter-collection-thresholds>
+        pub hash_join_inlist_pushdown_max_distinct_values: usize, default = 150
+
         /// The default filter selectivity used by Filter Statistics
         /// when an exact selectivity cannot be determined. Valid values are
         /// between 0 (no selectivity) and 100 (all rows are selected).
 
@@ -320,6 +320,14 @@ impl InListExpr {
         &self.list
     }
 
+    /// Returns `true` if the list of this IN LIST expression is empty.
+    pub fn is_empty(&self) -> bool {
+        self.list.is_empty()
+    }
+
+    /// Returns the number of entries in the list of this IN LIST expression.
+    pub fn len(&self) -> usize {
+        self.list.len()
+    }
+
     /// Is this negated e.g. NOT IN LIST
     pub fn negated(&self) -> bool {
         self.negated
 
@@ -56,6 +56,7 @@ datafusion-common = { workspace = true }
 datafusion-common-runtime = { workspace = true, default-features = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
+datafusion-functions = { workspace = true }
 datafusion-functions-aggregate-common = { workspace = true }
 datafusion-functions-window-common = { workspace = true }
 datafusion-physical-expr = { workspace = true, default-features = true }
 
@@ -26,8 +26,9 @@ use crate::filter_pushdown::{
     ChildPushdownResult, FilterDescription, FilterPushdownPhase,
     FilterPushdownPropagation,
 };
+use crate::joins::hash_join::inlist_builder::build_struct_inlist_values;
 use crate::joins::hash_join::shared_bounds::{
-    ColumnBounds, PartitionBounds, SharedBuildAccumulator,
+    ColumnBounds, PartitionBounds, PushdownStrategy, SharedBuildAccumulator,
 };
 use crate::joins::hash_join::stream::{
     BuildSide, BuildSideInitialState, HashJoinStream, HashJoinStreamState,
@@ -85,7 +86,7 @@ use futures::TryStreamExt;
 use parking_lot::Mutex;
 
 /// Hard-coded seed to ensure hash values from the hash join differ from `RepartitionExec`, avoiding collisions.
-const HASH_JOIN_SEED: RandomState =
+pub(crate) const HASH_JOIN_SEED: RandomState =
     RandomState::with_seeds('J' as u64, 'O' as u64, 'I' as u64, 'N' as u64);
 
 /// HashTable and input data for the left (build side) of a join
@@ -111,6 +112,9 @@ pub(super) struct JoinLeftData {
     /// If the partition is empty (no rows) this will be None.
     /// If the partition has some rows this will be Some with the bounds for each join key column.
     pub(super) bounds: Option<PartitionBounds>,
+    /// Membership testing strategy for filter pushdown
+    /// Contains either InList values for small build sides or hash table reference for large build sides
+    pub(super) membership: PushdownStrategy,
 }
 
 impl JoinLeftData {
@@ -134,6 +138,11 @@ impl JoinLeftData {
         &self.visited_indices_bitmap
     }
 
+    /// Returns a reference to the membership-testing strategy used for filter pushdown.
+    ///
+    /// This is the whole [`PushdownStrategy`], which is not necessarily a set of InList
+    /// values: it may also be the empty or the hash-table variant.
+    pub(super) fn membership(&self) -> &PushdownStrategy {
+        &self.membership
+    }
+
     /// Decrements the counter of running threads, and returns `true`
     /// if caller is the last running thread
     pub(super) fn report_probe_completed(&self) -> bool {
@@ -931,6 +940,16 @@ impl ExecutionPlan for HashJoinExec {
                     need_produce_result_in_final(self.join_type),
                     self.right().output_partitioning().partition_count(),
                     enable_dynamic_filter_pushdown,
+                    context
+                        .session_config()
+                        .options()
+                        .optimizer
+                        .hash_join_inlist_pushdown_max_size,
+                    context
+                        .session_config()
+                        .options()
+                        .optimizer
+                        .hash_join_inlist_pushdown_max_distinct_values,
                 ))
             })?,
             PartitionMode::Partitioned => {
@@ -949,6 +968,16 @@ impl ExecutionPlan for HashJoinExec {
                     need_produce_result_in_final(self.join_type),
                     1,
                     enable_dynamic_filter_pushdown,
+                    context
+                        .session_config()
+                        .options()
+                        .optimizer
+                        .hash_join_inlist_pushdown_max_size,
+                    context
+                        .session_config()
+                        .options()
+                        .optimizer
+                        .hash_join_inlist_pushdown_max_distinct_values,
                 ))
             }
             PartitionMode::Auto => {
@@ -1349,6 +1378,8 @@ async fn collect_left_input(
     with_visited_indices_bitmap: bool,
     probe_threads_count: usize,
     should_compute_dynamic_filters: bool,
+    max_inlist_size: usize,
+    max_inlist_distinct_values: usize,
 ) -> Result<JoinLeftData> {
     let schema = left_stream.schema();
 
@@ -1472,6 +1503,29 @@ async fn collect_left_input(
     // Convert Box to Arc for sharing with SharedBuildAccumulator
     let hash_map: Arc<dyn JoinHashMapType> = hashmap.into();
 
+    let membership = if num_rows == 0 {
+        PushdownStrategy::Empty
+    } else {
+        // If the build side is small enough we can use IN list pushdown.
+        // If it's too big we fall back to pushing down a reference to the hash table.
+        // See `PushdownStrategy` for more details.
+        let estimated_size = left_values
+            .iter()
+            .map(|arr| arr.get_array_memory_size())
+            .sum::<usize>();
+        if left_values.is_empty()
+            || left_values[0].is_empty()
+            || estimated_size > max_inlist_size
+            || hash_map.len() > max_inlist_distinct_values
+        {
+            PushdownStrategy::HashTable(Arc::clone(&hash_map))
+        } else if let Some(in_list_values) = build_struct_inlist_values(&left_values)? {
+            PushdownStrategy::InList(in_list_values)
+        } else {
+            PushdownStrategy::HashTable(Arc::clone(&hash_map))
+        }
+    };
+
     let data = JoinLeftData {
         hash_map,
         batch,
@@ -1480,6 +1534,7 @@ async fn collect_left_input(
         probe_threads_counter: AtomicUsize::new(probe_threads_count),
         _reservation: reservation,
         bounds,
+        membership,
     };
 
     Ok(data)
@@ -4525,7 +4580,7 @@ mod tests {
         )?;
         join.dynamic_filter = Some(HashJoinExecDynamicFilter {
             filter: dynamic_filter,
-            bounds_accumulator: OnceLock::new(),
+            build_accumulator: OnceLock::new(),
         });
 
         // Execute the join
@@ -4573,7 +4628,7 @@ mod tests {
         )?;
         join.dynamic_filter = Some(HashJoinExecDynamicFilter {
             filter: dynamic_filter,
-            bounds_accumulator: OnceLock::new(),
+            build_accumulator: OnceLock::new(),
         });
 
         // Execute the join
 
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities for building InList expressions from hash join build side data
+
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, StructArray};
+use arrow::datatypes::{Field, FieldRef, Fields};
+use arrow::downcast_dictionary_array;
+use arrow_schema::DataType;
+use datafusion_common::Result;
+
+/// Builds the struct fields describing a multi-column join key.
+///
+/// One field is created per entry of `data_types`, in input order. Fields are
+/// named `c0`, `c1`, ... after their position and are always marked nullable.
+///
+/// The return type is a `Result`, but every element is mapped to `Ok`, so this
+/// function does not itself produce an error.
+pub(super) fn build_struct_fields(data_types: &[DataType]) -> Result<Fields> {
+    data_types
+        .iter()
+        .enumerate()
+        // Field name is derived from the column position; `true` marks the field nullable.
+        .map(|(i, dt)| Ok(Field::new(format!("c{i}"), dt.clone(), true)))
+        .collect()
+}
+
+/// Flattens dictionary-encoded arrays to their underlying value arrays.
+/// Non-dictionary arrays are returned as-is (a cheap `Arc` clone, no data copy).
+///
+/// NOTE(review): for a dictionary input this returns `values()`, i.e. the
+/// dictionary's value array, not a per-row decoded array. Presumably its length
+/// can differ from the number of rows in `array`, and it may contain values no
+/// key refers to — confirm this is acceptable where several flattened columns
+/// are combined into one struct array.
+fn flatten_dictionary_array(array: &ArrayRef) -> ArrayRef {
+    downcast_dictionary_array! {
+        array => {
+            // Recursively flatten in case of nested dictionaries
+            flatten_dictionary_array(array.values())
+        }
+        _ => Arc::clone(array)
+    }
+}
+
+/// Builds InList values from join key column arrays.
+///
+/// If `join_key_arrays` is:
+/// 1. A single array, let's say Int32, this will produce a flat
+///    InList expression where the lookup is expected to be scalar Int32 values,
+///    that is: this will produce `IN LIST (1, 2, 3)` expected to be used as `2 IN LIST (1, 2, 3)`.
+/// 2. An Int32 array and a Utf8 array, this will produce a Struct InList expression
+///    where the lookup is expected to be Struct values with two fields (Int32, Utf8),
+///    that is: this will produce `IN LIST ((1, "a"), (2, "b"))` expected to be used as `(2, "b") IN LIST ((1, "a"), (2, "b"))`.
+///    The field names of the struct are auto-generated as "c0", "c1", ... and should match the struct expression used in the join keys.
+///
+/// Dictionary-encoded inputs are first replaced by their underlying value arrays.
+///
+/// Note that this function does not deduplicate values - deduplication will happen later
+/// when building an InList expression from this array via `InListExpr::try_new_from_array`.
+///
+/// This function takes no size or distinct-value limits and does not enforce any:
+/// as written it always returns `Ok(Some(..))`. Any such limits have to be checked
+/// by the caller before calling it.
+///
+/// NOTE(review): an empty `join_key_arrays` slice falls into the multi-column branch
+/// and builds a struct array from zero columns; callers are presumably expected to
+/// rule that case out beforehand — confirm.
+pub(super) fn build_struct_inlist_values(
+    join_key_arrays: &[ArrayRef],
+) -> Result<Option<ArrayRef>> {
+    // Flatten any dictionary-encoded arrays
+    let flattened_arrays: Vec<ArrayRef> = join_key_arrays
+        .iter()
+        .map(flatten_dictionary_array)
+        .collect();
+
+    // Build the source array/struct
+    let source_array: ArrayRef = if flattened_arrays.len() == 1 {
+        // Single column: use directly
+        Arc::clone(&flattened_arrays[0])
+    } else {
+        // Multi-column: build StructArray once from all columns
+        let fields = build_struct_fields(
+            &flattened_arrays
+                .iter()
+                .map(|arr| arr.data_type().clone())
+                .collect::<Vec<_>>(),
+        )?;
+
+        // Build field references with proper Arc wrapping
+        let arrays_with_fields: Vec<(FieldRef, ArrayRef)> = fields
+            .iter()
+            .cloned()
+            .zip(flattened_arrays.iter().cloned())
+            .collect();
+
+        Arc::new(StructArray::from(arrays_with_fields))
+    };
+
+    Ok(Some(source_array))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{Int32Array, StringArray};
+    use arrow_schema::DataType;
+    use std::sync::Arc;
+
+    /// A single key column is passed through unchanged, duplicates included.
+    #[test]
+    fn test_build_single_column_inlist_array() {
+        let array = Arc::new(Int32Array::from(vec![1, 2, 3, 2, 1])) as ArrayRef;
+        let result = build_struct_inlist_values(std::slice::from_ref(&array))
+            .unwrap()
+            .unwrap();
+
+        assert!(array.eq(&result));
+    }
+
+    /// Two key columns produce a struct array whose type matches
+    /// `build_struct_fields` for the same column types.
+    #[test]
+    fn test_build_multi_column_inlist() {
+        let array1 = Arc::new(Int32Array::from(vec![1, 2, 3, 2, 1])) as ArrayRef;
+        let array2 =
+            Arc::new(StringArray::from(vec!["a", "b", "c", "b", "a"])) as ArrayRef;
+
+        let result = build_struct_inlist_values(&[array1, array2])
+            .unwrap()
+            .unwrap();
+
+        // Only the resulting data type is checked here, not the row contents.
+        assert_eq!(
+            *result.data_type(),
+            DataType::Struct(
+                build_struct_fields(&[DataType::Int32, DataType::Utf8]).unwrap()
+            )
+        );
+    }
+}
@@ -20,6 +20,7 @@
 pub use exec::HashJoinExec;
 
 mod exec;
+mod inlist_builder;
 mod partitioned_hash_eval;
 mod shared_bounds;
 mod stream;