Skip to content

Commit 31e4e15

Browse files
committed
[datafusion]: keep PR 20858 to code fix only
1 parent c5eced1 commit 31e4e15

File tree

7 files changed

+11
-189
lines changed

7 files changed

+11
-189
lines changed

.github/workflows/extended.yml

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -154,16 +154,11 @@ jobs:
154154
uses: ./.github/actions/setup-builder
155155
with:
156156
rust-version: stable
157-
- name: Run tests (force_hash_collisions)
157+
- name: Run tests
158158
run: |
159159
cd datafusion
160160
cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --exclude datafusion-cli --workspace --lib --tests --features=force_hash_collisions,avro
161161
cargo clean
162-
- name: Run tests (force_hash_partial_collisions, #20724)
163-
run: |
164-
cd datafusion
165-
cargo test --profile ci -p datafusion --test core_integration --features=force_hash_partial_collisions -- memory_limit::test_no_duplicate_groups_after_spill --exact
166-
cargo clean
167162
168163
sqllogictest-sqlite:
169164
name: "Run sqllogictests with the sqlite test suite"

.github/workflows/rust.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,8 +210,6 @@ jobs:
210210
run: cargo check --profile ci --no-default-features -p datafusion --features=encoding_expressions
211211
- name: Check datafusion (force_hash_collisions)
212212
run: cargo check --profile ci --no-default-features -p datafusion --features=force_hash_collisions
213-
- name: Check datafusion (force_hash_partial_collisions)
214-
run: cargo check --profile ci --no-default-features -p datafusion --features=force_hash_partial_collisions
215213
- name: Check datafusion (math_expressions)
216214
run: cargo check --profile ci --no-default-features -p datafusion --features=math_expressions
217215
- name: Check datafusion (parquet)

datafusion/common/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ parquet_encryption = [
4949
"dep:hex",
5050
]
5151
force_hash_collisions = []
52-
force_hash_partial_collisions = []
5352
recursive_protection = ["dep:recursive"]
5453
parquet = ["dep:parquet"]
5554
sql = ["sqlparser"]

datafusion/common/src/hash_utils.rs

Lines changed: 9 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,12 @@ use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
2222
use arrow::array::*;
2323
use arrow::compute::take;
2424
use arrow::datatypes::*;
25-
#[cfg(not(all(
26-
feature = "force_hash_collisions",
27-
not(feature = "force_hash_partial_collisions")
28-
)))]
25+
#[cfg(not(feature = "force_hash_collisions"))]
2926
use arrow::{downcast_dictionary_array, downcast_primitive_array};
3027
use itertools::Itertools;
3128
use std::collections::HashMap;
3229

33-
#[cfg(not(all(
34-
feature = "force_hash_collisions",
35-
not(feature = "force_hash_partial_collisions")
36-
)))]
30+
#[cfg(not(feature = "force_hash_collisions"))]
3731
use crate::cast::{
3832
as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
3933
as_generic_binary_array, as_large_list_array, as_large_list_view_array,
@@ -941,11 +935,8 @@ fn hash_run_array<R: RunEndIndexType>(
941935

942936
/// Internal helper function that hashes a single array and either initializes or combines
943937
/// the hash values in the buffer.
944-
#[cfg(not(all(
945-
feature = "force_hash_collisions",
946-
not(feature = "force_hash_partial_collisions")
947-
)))]
948-
fn hash_single_array_impl(
938+
#[cfg(not(feature = "force_hash_collisions"))]
939+
fn hash_single_array(
949940
array: &dyn Array,
950941
random_state: &RandomState,
951942
hashes_buffer: &mut [u64],
@@ -1016,47 +1007,17 @@ fn hash_single_array_impl(
10161007
Ok(())
10171008
}
10181009

1019-
/// Dispatches to the appropriate `hash_single_array` implementation based on
1020-
/// the enabled feature flags.
1021-
#[cfg(not(any(
1022-
feature = "force_hash_collisions",
1023-
feature = "force_hash_partial_collisions"
1024-
)))]
1025-
fn hash_single_array(
1026-
array: &dyn Array,
1027-
random_state: &RandomState,
1028-
hashes_buffer: &mut [u64],
1029-
rehash: bool,
1030-
) -> Result<()> {
1031-
hash_single_array_impl(array, random_state, hashes_buffer, rehash)
1032-
}
1033-
1034-
/// Test version: forces full hash collisions by setting all hashes to 0.
1035-
#[cfg(all(
1036-
feature = "force_hash_collisions",
1037-
not(feature = "force_hash_partial_collisions")
1038-
))]
1010+
/// Test version of `hash_single_array` that forces all hashes to collide to zero.
1011+
#[cfg(feature = "force_hash_collisions")]
10391012
fn hash_single_array(
10401013
_array: &dyn Array,
10411014
_random_state: &RandomState,
10421015
hashes_buffer: &mut [u64],
10431016
_rehash: bool,
10441017
) -> Result<()> {
1045-
hashes_buffer.iter_mut().for_each(|x| *x = 0);
1046-
Ok(())
1047-
}
1048-
1049-
/// Test version: truncates real hashes to 5 bits (32 distinct values) to create
1050-
/// partial collisions that expose non-monotonic group index bugs (#20724).
1051-
#[cfg(feature = "force_hash_partial_collisions")]
1052-
fn hash_single_array(
1053-
array: &dyn Array,
1054-
random_state: &RandomState,
1055-
hashes_buffer: &mut [u64],
1056-
rehash: bool,
1057-
) -> Result<()> {
1058-
hash_single_array_impl(array, random_state, hashes_buffer, rehash)?;
1059-
hashes_buffer.iter_mut().for_each(|h| *h &= 0x1F);
1018+
for hash in hashes_buffer.iter_mut() {
1019+
*hash = 0
1020+
}
10601021
Ok(())
10611022
}
10621023

datafusion/core/Cargo.toml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,6 @@ default = [
7171
encoding_expressions = ["datafusion-functions/encoding_expressions"]
7272
# Used for testing ONLY: causes all values to hash to the same value (test for collisions)
7373
force_hash_collisions = ["datafusion-physical-plan/force_hash_collisions", "datafusion-common/force_hash_collisions"]
74-
# Used for testing ONLY: truncates hashes to 5 bits (32 distinct values) to create partial collisions.
75-
# Unlike force_hash_collisions (all hashes = 0), this creates a mix of colliding and non-colliding keys,
76-
# which triggers non-monotonic group indices in vectorized_intern (#20724).
77-
force_hash_partial_collisions = [
78-
"datafusion-physical-plan/force_hash_partial_collisions",
79-
"datafusion-common/force_hash_partial_collisions",
80-
]
8174
math_expressions = ["datafusion-functions/math_expressions"]
8275
parquet = ["datafusion-common/parquet", "dep:parquet", "datafusion-datasource-parquet"]
8376
parquet_encryption = [

datafusion/core/tests/memory_limit/mod.rs

Lines changed: 1 addition & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,7 @@ use std::sync::{Arc, LazyLock};
2424
#[cfg(feature = "extended_tests")]
2525
mod memory_limit_validation;
2626
mod repartition_mem_limit;
27-
use arrow::array::{
28-
ArrayRef, DictionaryArray, Int32Array, Int64Array, RecordBatch, StringViewArray,
29-
};
27+
use arrow::array::{ArrayRef, DictionaryArray, Int32Array, RecordBatch, StringViewArray};
3028
use arrow::compute::SortOptions;
3129
use arrow::datatypes::{Int32Type, SchemaRef};
3230
use arrow_schema::{DataType, Field, Schema};
@@ -58,7 +56,6 @@ use datafusion_physical_plan::collect as collect_batches;
5856
use datafusion_physical_plan::common::collect;
5957
use datafusion_physical_plan::spill::get_record_batch_memory_size;
6058
use rand::Rng;
61-
use std::collections::HashSet;
6259
use test_utils::AccessLogGenerator;
6360

6461
use async_trait::async_trait;
@@ -1175,123 +1172,3 @@ impl TableProvider for SortedTableProvider {
11751172
Ok(DataSourceExec::from_data_source(mem_conf))
11761173
}
11771174
}
1178-
1179-
// ============================================================================
1180-
// Regression tests for https://github.com/apache/datafusion/issues/20724
1181-
//
1182-
// When hash aggregation spills and switches to streaming merge,
1183-
// `group_values` must be recreated with the streaming variant.
1184-
// Otherwise `vectorized_intern` can produce non-monotonic group indices
1185-
// under hash collisions, causing `GroupOrderingFull` to prematurely
1186-
// emit groups → duplicate keys in output.
1187-
// ============================================================================
1188-
1189-
/// Helper: set up a session that forces spilling during aggregation.
1190-
async fn setup_spill_agg_context(
1191-
memory_limit: usize,
1192-
batch_size: usize,
1193-
) -> Result<SessionContext> {
1194-
let runtime = RuntimeEnvBuilder::new()
1195-
.with_memory_pool(Arc::new(FairSpillPool::new(memory_limit)))
1196-
.with_disk_manager_builder(
1197-
DiskManagerBuilder::default().with_mode(DiskManagerMode::OsTmpDirectory),
1198-
)
1199-
.build_arc()
1200-
.unwrap();
1201-
1202-
let config = SessionConfig::new()
1203-
.with_sort_spill_reservation_bytes(64 * 1024)
1204-
.with_sort_in_place_threshold_bytes(0)
1205-
.with_spill_compression(SpillCompression::Uncompressed)
1206-
.with_batch_size(batch_size)
1207-
.with_target_partitions(1);
1208-
1209-
Ok(SessionContext::new_with_config_rt(config, runtime))
1210-
}
1211-
1212-
/// Regression test for https://github.com/apache/datafusion/issues/20724
1213-
///
1214-
/// When hash aggregation spills and switches to streaming merge,
1215-
/// `group_values` (GroupValuesColumn<false>) is not recreated with the
1216-
/// streaming variant (<true>). This means `vectorized_intern` is used
1217-
/// post-spill, which can produce non-monotonic group indices under hash
1218-
/// collisions, causing `GroupOrderingFull` to prematurely emit groups
1219-
/// and create duplicate keys in the output.
1220-
///
1221-
/// Requirements to trigger:
1222-
/// - Two-column GROUP BY → forces `GroupValuesColumn` (not `GroupValuesPrimitive`)
1223-
/// - `force_hash_partial_collisions` feature → truncated hashes create the mix
1224-
/// of colliding/non-colliding keys needed for non-monotonic indices
1225-
/// - `batch_size=50` → not a multiple of rows-per-group in the merged stream,
1226-
/// so groups span batch boundaries and premature emission causes duplicates
1227-
#[tokio::test]
1228-
async fn test_no_duplicate_groups_after_spill() -> Result<()> {
1229-
let num_keys: i64 = 5000;
1230-
let rows_per_key: i64 = 4;
1231-
let total_rows = (num_keys * rows_per_key) as usize;
1232-
1233-
let schema = Arc::new(Schema::new(vec![
1234-
Field::new("key_a", DataType::Int64, false),
1235-
Field::new("key_b", DataType::Int64, false),
1236-
Field::new("value", DataType::Int64, false),
1237-
]));
1238-
1239-
let mut keys_a = Vec::with_capacity(total_rows);
1240-
let mut keys_b = Vec::with_capacity(total_rows);
1241-
let mut vals = Vec::with_capacity(total_rows);
1242-
for r in 0..rows_per_key {
1243-
for k in 0..num_keys {
1244-
keys_a.push(k / 100);
1245-
keys_b.push(k % 100);
1246-
vals.push(r * num_keys + k);
1247-
}
1248-
}
1249-
1250-
let mut batches = Vec::new();
1251-
for start in (0..total_rows).step_by(500) {
1252-
let end = (start + 500).min(total_rows);
1253-
batches.push(RecordBatch::try_new(
1254-
Arc::clone(&schema),
1255-
vec![
1256-
Arc::new(Int64Array::from(keys_a[start..end].to_vec())),
1257-
Arc::new(Int64Array::from(keys_b[start..end].to_vec())),
1258-
Arc::new(Int64Array::from(vals[start..end].to_vec())),
1259-
],
1260-
)?);
1261-
}
1262-
1263-
let ctx = setup_spill_agg_context(128 * 1024, 50).await?;
1264-
let table = MemTable::try_new(schema, vec![batches])?;
1265-
ctx.register_table("t", Arc::new(table))?;
1266-
1267-
let df = ctx
1268-
.sql("SELECT key_a, key_b, COUNT(*) as cnt FROM t GROUP BY key_a, key_b")
1269-
.await?;
1270-
let results =
1271-
collect_batches(df.create_physical_plan().await?, ctx.task_ctx()).await?;
1272-
1273-
let mut seen = HashSet::new();
1274-
for batch in &results {
1275-
let ka = batch
1276-
.column(0)
1277-
.as_any()
1278-
.downcast_ref::<Int64Array>()
1279-
.unwrap();
1280-
let kb = batch
1281-
.column(1)
1282-
.as_any()
1283-
.downcast_ref::<Int64Array>()
1284-
.unwrap();
1285-
for i in 0..batch.num_rows() {
1286-
assert!(
1287-
seen.insert((ka.value(i), kb.value(i))),
1288-
"DUPLICATE group key ({}, {}). \
1289-
Bug #20724: group_values not recreated for streaming merge.",
1290-
ka.value(i),
1291-
kb.value(i),
1292-
);
1293-
}
1294-
}
1295-
assert_eq!(seen.len(), num_keys as usize);
1296-
Ok(())
1297-
}

datafusion/physical-plan/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ workspace = true
3939

4040
[features]
4141
force_hash_collisions = []
42-
force_hash_partial_collisions = ["datafusion-common/force_hash_partial_collisions"]
4342
test_utils = ["arrow/test_utils"]
4443
tokio_coop = []
4544
tokio_coop_fallback = []

0 commit comments

Comments (0)