Commit 2e08d5c

Improve dictionary null handling in hashing and expand aggregate test coverage for nulls (#16466)
- **Hashing Logic Enhancements:**
  - Introduced a new helper function `update_hash_for_dict_key` to cleanly and correctly apply dictionary value hashes based on validity.
  - Reworked the logic in `hash_dictionary` to use this helper, improving clarity and maintainability.
- **Test Coverage Improvements:**
  - Renamed `aggregates.rs` to `aggregates/basic.rs` and introduced a new `aggregates/dict_nulls.rs` module.
  - Added new test utilities and structured test data generators in `mod.rs` to support dictionary null testing.
  - Comprehensive new test cases for:
    - `COUNT`, `SUM`, `MIN`, `MAX`, `MEDIAN`, `FIRST_VALUE`, `LAST_VALUE` on dictionary columns with nulls.
    - `RESPECT NULLS` / `IGNORE NULLS` options in window functions.
    - Grouping by dictionary columns with null keys or values.
    - Partitioned execution consistency using fuzzed data across multiple types (numeric, binary, decimal, timestamp).
- **Minor Fixes:**
  - Corrected test message grammar (e.g., "should success to..." → "should succeed to...").
  - Enabled `null_pct` support in dictionary value generation to better simulate real-world datasets.
1 parent 4dd7825 · commit 2e08d5c

6 files changed: 1521 additions & 23 deletions
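To make the commit's central distinction concrete before the diffs: a dictionary row can be NULL because its key is null, or because a valid key points at a null entry in the dictionary's values. A minimal sketch, assuming only the `arrow` crate (illustrative code, not part of this commit):

```rust
use std::sync::Arc;

use arrow::array::{Array, DictionaryArray, Int32Array, StringArray};

fn main() {
    // Dictionary values: ["a", null]
    let values = StringArray::from(vec![Some("a"), None]);
    // Keys: row 0 -> "a", row 1 -> the null value, row 2 -> null key
    let keys = Int32Array::from(vec![Some(0), Some(1), None]);
    let dict = DictionaryArray::new(keys, Arc::new(values));

    // Row 2 is null because its key is null ...
    assert!(dict.keys().is_null(2));
    // ... while row 1 has a valid key that points at a null value.
    let idx = dict.keys().value(1) as usize;
    assert!(dict.values().is_null(idx));
    // Both rows must behave as NULL in hashing and aggregation,
    // which is what this commit's hashing fix and tests verify.
}
```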

datafusion/common/src/hash_utils.rs

Lines changed: 35 additions & 15 deletions
@@ -184,6 +184,26 @@ fn hash_array<T>(
     }
 }
 
+/// Helper function to update hash for a dictionary key if the value is valid
+#[cfg(not(feature = "force_hash_collisions"))]
+#[inline]
+fn update_hash_for_dict_key(
+    hash: &mut u64,
+    dict_hashes: &[u64],
+    dict_values: &dyn Array,
+    idx: usize,
+    multi_col: bool,
+) {
+    if dict_values.is_valid(idx) {
+        if multi_col {
+            *hash = combine_hashes(dict_hashes[idx], *hash);
+        } else {
+            *hash = dict_hashes[idx];
+        }
+    }
+    // no update for invalid dictionary value
+}
+
 /// Hash the values in a dictionary array
 #[cfg(not(feature = "force_hash_collisions"))]
 fn hash_dictionary<K: ArrowDictionaryKeyType>(
@@ -195,23 +215,23 @@ fn hash_dictionary<K: ArrowDictionaryKeyType>(
     // Hash each dictionary value once, and then use that computed
     // hash for each key value to avoid a potentially expensive
     // redundant hashing for large dictionary elements (e.g. strings)
-    let values = Arc::clone(array.values());
-    let mut dict_hashes = vec![0; values.len()];
-    create_hashes(&[values], random_state, &mut dict_hashes)?;
+    let dict_values = Arc::clone(array.values());
+    let mut dict_hashes = vec![0; dict_values.len()];
+    create_hashes(&[dict_values], random_state, &mut dict_hashes)?;
 
     // combine hash for each index in values
-    if multi_col {
-        for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
-            if let Some(key) = key {
-                *hash = combine_hashes(dict_hashes[key.as_usize()], *hash)
-            } // no update for Null, consistent with other hashes
-        }
-    } else {
-        for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
-            if let Some(key) = key {
-                *hash = dict_hashes[key.as_usize()]
-            } // no update for Null, consistent with other hashes
-        }
+    let dict_values = array.values();
+    for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
+        if let Some(key) = key {
+            let idx = key.as_usize();
+            update_hash_for_dict_key(
+                hash,
+                &dict_hashes,
+                dict_values.as_ref(),
+                idx,
+                multi_col,
+            );
+        } // no update for Null key
     }
     Ok(())
 }
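A hedged sketch of the behavior this change guarantees: the hash-buffer slot keeps its initial value both for a null key and for a key referencing a null dictionary value, so the two kinds of NULL hash consistently. Assumes the `datafusion-common`, `arrow`, and `ahash` crates; this is not a test from the commit:

```rust
use std::sync::Arc;

use ahash::RandomState;
use arrow::array::{Array, ArrayRef, DictionaryArray, Int32Array, StringArray};
use datafusion_common::hash_utils::create_hashes;

fn main() -> datafusion_common::Result<()> {
    // Values contain a null at index 1; keys hit "a", the null value, and null.
    let values = StringArray::from(vec![Some("a"), None]);
    let keys = Int32Array::from(vec![Some(0), Some(1), None]);
    let dict: ArrayRef = Arc::new(DictionaryArray::new(keys, Arc::new(values)));

    let random_state = RandomState::with_seeds(0, 0, 0, 0);
    // Hash buffers start zeroed; null slots are simply never updated.
    let mut hashes = vec![0u64; dict.len()];
    create_hashes(&[dict], &random_state, &mut hashes)?;

    assert_ne!(hashes[0], 0); // row 0 got the hash of "a"
    assert_eq!(hashes[1], 0); // valid key -> null value: left untouched
    assert_eq!(hashes[2], 0); // null key: left untouched
    Ok(())
}
```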

datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs

Lines changed: 4 additions & 4 deletions
@@ -171,7 +171,7 @@ impl AggregationFuzzer {
         let datasets = self
             .dataset_generator
             .generate()
-            .expect("should success to generate dataset");
+            .expect("should succeed to generate dataset");
 
         // Then for each of them, we random select a test sql for it
         let query_groups = datasets
@@ -216,16 +216,16 @@ impl AggregationFuzzer {
         // Generate the baseline context, and get the baseline result firstly
         let baseline_ctx_with_params = ctx_generator
             .generate_baseline()
-            .expect("should success to generate baseline session context");
+            .expect("should succeed to generate baseline session context");
         let baseline_result = run_sql(&sql, &baseline_ctx_with_params.ctx)
             .await
-            .expect("should success to run baseline sql");
+            .expect("should succeed to run baseline sql");
         let baseline_result = Arc::new(baseline_result);
         // Generate test tasks
         for _ in 0..CTX_GEN_ROUNDS {
             let ctx_with_params = ctx_generator
                 .generate()
-                .expect("should success to generate session context");
+                .expect("should succeed to generate session context");
             let task = AggregationFuzzTestTask {
                 dataset_ref: dataset_ref.clone(),
                 expected_result: baseline_result.clone(),
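For context, the harness pattern these messages belong to — run the same SQL under a baseline context and a perturbed one, then demand identical results — can be sketched roughly as below. This assumes the `datafusion` and `tokio` crates; the configuration knobs are illustrative, not the fuzzer's actual parameter set:

```rust
use datafusion::arrow::util::pretty::pretty_format_batches;
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ddl = "CREATE TABLE t AS VALUES (1), (1), (NULL)";
    let sql = "SELECT column1, count(*) FROM t GROUP BY column1 ORDER BY column1";

    // Baseline: default configuration.
    let baseline = SessionContext::new();
    baseline.sql(ddl).await?;
    let expected = baseline.sql(sql).await?.collect().await?;

    // Candidate: same data, perturbed execution configuration.
    let config = SessionConfig::new()
        .with_batch_size(2)
        .with_target_partitions(4);
    let candidate = SessionContext::new_with_config(config);
    candidate.sql(ddl).await?;
    let actual = candidate.sql(sql).await?.collect().await?;

    // The fuzzer's core assertion: results must match across configurations.
    assert_eq!(
        pretty_format_batches(&expected)?.to_string(),
        pretty_format_batches(&actual)?.to_string(),
    );
    Ok(())
}
```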

datafusion/core/tests/fuzz_cases/record_batch_generator.rs

Lines changed: 2 additions & 4 deletions
@@ -724,15 +724,13 @@ impl RecordBatchGenerator {
     {
         // We generate just num_distinct values because they will be reused by different keys
         let mut array_gen_rng = array_gen_rng;
-
+        debug_assert!((0.0..=1.0).contains(&null_pct));
         let values = Self::generate_array_of_type_inner(
             &ColumnDescr::new("values", *value_type.clone()),
             num_distinct,
             batch_gen_rng,
             array_gen_rng.clone(),
-            // Once https://github.com/apache/datafusion/issues/16228 is fixed
-            // we can also generate nulls in values
-            0.0, // null values are generated on the key level
+            null_pct, // generate some null values
         );
 
         match key_type.as_ref() {
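A hypothetical sketch of what passing `null_pct` through to value generation means; the helper name and shape are invented for illustration (assumes the `arrow` crate and `rand` 0.8):

```rust
use arrow::array::{Array, StringArray};
use rand::Rng;

/// Build `num_distinct` dictionary values, each null with probability `null_pct`.
fn generate_values(num_distinct: usize, null_pct: f64, rng: &mut impl Rng) -> StringArray {
    debug_assert!((0.0..=1.0).contains(&null_pct));
    (0..num_distinct)
        .map(|i| {
            // A null here is a null dictionary *value*, reachable through
            // valid keys -- exactly the case the new tests exercise.
            if rng.gen_bool(null_pct) {
                None
            } else {
                Some(format!("value_{i}"))
            }
        })
        .collect()
}

fn main() {
    let mut rng = rand::thread_rng();
    let values = generate_values(100, 0.2, &mut rng);
    println!("{} of 100 distinct values are null", values.null_count());
}
```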
