Skip to content

Commit 418f62a

Browse files
authored
fix: NULL handling in array_intersect and array_union (#19415)
## Which issue does this PR close? - Closes #9706 ## Rationale for this change Mentioned in issue. ## What changes are included in this PR? The `RowConverter` API seems to have changed since the initial implementation. The columns passed to it need to match the schema it was created with exactly, so when we handled the case of `l_values` or `r_values` being `None`, passing empty columns caused an assert to fail in `RowConverter`. ## Are these changes tested? `cargo test --test sqllogictests -- array` passes. For the query mentioned in the original issue, the current result is: ``` > select array_intersect(column1, column2) from array_intersect_table; +------------------------------------------------------------------------------+ | array_intersect(array_intersect_table.column1,array_intersect_table.column2) | +------------------------------------------------------------------------------+ | [2, 3] | | [3] | | [3] | | [] | | [] | | [] | +------------------------------------------------------------------------------+ 6 row(s) fetched. Elapsed 0.012 seconds. ``` ## Are there any user-facing changes? No
1 parent 45d4948 commit 418f62a

File tree

2 files changed

+47
-4
lines changed

2 files changed

+47
-4
lines changed

datafusion/functions-nested/src/set_ops.rs

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use arrow::array::{
2222
Array, ArrayRef, GenericListArray, LargeListArray, ListArray, OffsetSizeTrait,
2323
new_null_array,
2424
};
25-
use arrow::buffer::OffsetBuffer;
25+
use arrow::buffer::{NullBuffer, OffsetBuffer};
2626
use arrow::compute;
2727
use arrow::datatypes::DataType::{LargeList, List, Null};
2828
use arrow::datatypes::{DataType, Field, FieldRef};
@@ -363,18 +363,23 @@ fn generic_set_lists<OffsetSize: OffsetSizeTrait>(
363363

364364
let mut offsets = vec![OffsetSize::usize_as(0)];
365365
let mut new_arrays = vec![];
366+
let mut new_null_buf = vec![];
366367
let converter = RowConverter::new(vec![SortField::new(l.value_type())])?;
367368
for (first_arr, second_arr) in l.iter().zip(r.iter()) {
369+
let mut ele_should_be_null = false;
370+
368371
let l_values = if let Some(first_arr) = first_arr {
369372
converter.convert_columns(&[first_arr])?
370373
} else {
371-
converter.convert_columns(&[])?
374+
ele_should_be_null = true;
375+
converter.empty_rows(0, 0)
372376
};
373377

374378
let r_values = if let Some(second_arr) = second_arr {
375379
converter.convert_columns(&[second_arr])?
376380
} else {
377-
converter.convert_columns(&[])?
381+
ele_should_be_null = true;
382+
converter.empty_rows(0, 0)
378383
};
379384

380385
let l_iter = l_values.iter().sorted().dedup();
@@ -414,13 +419,19 @@ fn generic_set_lists<OffsetSize: OffsetSizeTrait>(
414419
}
415420
};
416421

422+
new_null_buf.push(!ele_should_be_null);
417423
new_arrays.push(array);
418424
}
419425

420426
let offsets = OffsetBuffer::new(offsets.into());
421427
let new_arrays_ref: Vec<_> = new_arrays.iter().map(|v| v.as_ref()).collect();
422428
let values = compute::concat(&new_arrays_ref)?;
423-
let arr = GenericListArray::<OffsetSize>::try_new(field, offsets, values, None)?;
429+
let arr = GenericListArray::<OffsetSize>::try_new(
430+
field,
431+
offsets,
432+
values,
433+
Some(NullBuffer::new(new_null_buf.into())),
434+
)?;
424435
Ok(Arc::new(arr))
425436
}
426437

datafusion/sqllogictest/test_files/array.slt

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,17 @@ AS
503503
FROM array_intersect_table_1D_UTF8
504504
;
505505

506+
statement ok
507+
CREATE TABLE array_intersect_table_1D_NULL
508+
AS VALUES
509+
([1, 2, 2, 3], [2, 3, 4]),
510+
([2, 3, 3], [3]),
511+
([3], [3, 3, 4]),
512+
(null, [3, 4]),
513+
([1, 2], null),
514+
(null, null)
515+
;
516+
506517
statement ok
507518
CREATE TABLE array_intersect_table_2D
508519
AS VALUES
@@ -4816,6 +4827,16 @@ select array_union(arrow_cast(['hello'], 'LargeList(Utf8)'), arrow_cast(['hello'
48164827
----
48174828
[hello, datafusion]
48184829

4830+
query ?
4831+
select array_union(column1, column2)
4832+
from array_intersect_table_1D_NULL;
4833+
----
4834+
[1, 2, 3, 4]
4835+
[2, 3]
4836+
[3, 4]
4837+
NULL
4838+
NULL
4839+
NULL
48194840

48204841
# list_to_string scalar function #4 (function alias `array_to_string`)
48214842
query TTT
@@ -6765,6 +6786,17 @@ from large_array_intersect_table_1D_UTF8;
67656786
[bc] [arrow, rust] []
67666787
[] [arrow, datafusion, rust] [arrow, rust]
67676788

6789+
query ?
6790+
select array_intersect(column1, column2)
6791+
from array_intersect_table_1D_NULL;
6792+
----
6793+
[2, 3]
6794+
[3]
6795+
[3]
6796+
NULL
6797+
NULL
6798+
NULL
6799+
67686800
query ??
67696801
select array_intersect(column1, column2),
67706802
array_intersect(column3, column4)

0 commit comments

Comments
 (0)