Skip to content

Commit 6529540

Browse files
committed
Fix array-to-string cast omitting null values
1 parent 8ab5b5a commit 6529540

File tree

2 files changed

+237
-0
lines changed

2 files changed

+237
-0
lines changed

native-engine/datafusion-ext-commons/src/arrow/cast.rs

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,49 @@ pub fn cast_impl(
268268

269269
Arc::new(builder.finish())
270270
}
271+
// array to string (spark compatible)
// Each non-null list row is rendered as "[e1, e2, ...]": elements are joined
// with ", ", a null element is written as the literal text `null`, an empty
// list becomes "[]", and a null list row produces a null output string.
272+
(&DataType::List(_), &DataType::Utf8) => {
273+
let list_array = as_list_array(array);
274+
let values = list_array.values();
275+
// Stringify the flattened child values by recursing into cast_impl, so
// the element type (including nested lists) goes through this same function.
let casted_values = cast_impl(values, &DataType::Utf8, match_struct_fields)?;
276+
let string_values = as_string_array(&casted_values);
277+
278+
let mut builder = StringBuilder::new();
279+
280+
for row_idx in 0..list_array.len() {
281+
if list_array.is_null(row_idx) {
282+
builder.append_null();
283+
} else {
284+
let mut row_str = String::from("[");
285+
// Offsets index into the flattened child array: this row's
// elements are values[start..end].
let start = list_array.value_offsets()[row_idx] as usize;
286+
let end = list_array.value_offsets()[row_idx + 1] as usize;
287+
let num_elements = end - start;
288+
289+
if num_elements > 0 {
290+
// First element is written without a leading separator.
// Nullness is read from the original child array, not from
// the casted strings.
if values.is_null(start) {
291+
row_str.push_str("null");
292+
} else {
293+
row_str.push_str(string_values.value(start));
294+
}
295+
296+
// Remaining elements are each prefixed with "," and a space,
// which yields the ", " separator.
for i in 1..num_elements {
297+
row_str.push(',');
298+
if values.is_null(start + i) {
299+
row_str.push_str(" null");
300+
} else {
301+
row_str.push(' ');
302+
row_str.push_str(string_values.value(start + i));
303+
}
304+
}
305+
}
306+
307+
row_str.push(']');
308+
builder.append_value(&row_str);
309+
}
310+
}
311+
312+
Arc::new(builder.finish())
313+
}
271314
_ => {
272315
// default cast
273316
arrow::compute::kernels::cast::cast(array, cast_type)?
@@ -474,6 +517,7 @@ fn to_date(s: &str) -> Option<i32> {
474517

475518
#[cfg(test)]
476519
mod test {
520+
use arrow::buffer::OffsetBuffer;
477521
use datafusion::common::cast::{as_decimal128_array, as_float64_array, as_int32_array};
478522

479523
use super::*;
@@ -795,4 +839,125 @@ mod test {
795839
&StringArray::from_iter(vec![Some("{100, {x, true}}"), Some("{200, {y, null}}"),])
796840
);
797841
}
842+
843+
// Casts a list<int32> array containing null elements to Utf8 and checks the
// bracketed, ", "-separated rendering with `null` for missing elements.
#[test]
844+
fn test_array_to_string() {
845+
// Create a list array with int32 elements
// (eight flattened values, three of them null)
846+
let values = Int32Array::from(vec![
847+
Some(1),
848+
Some(2),
849+
Some(3),
850+
None,
851+
Some(5),
852+
Some(6),
853+
None,
854+
None,
855+
]);
856+
// Offsets split the values into rows [1, 2, 3], [null, 5], [6, null, null].
let offsets = OffsetBuffer::new(vec![0, 3, 5, 8].into());
857+
let list_array: ArrayRef = Arc::new(ListArray::new(
858+
Arc::new(Field::new("item", DataType::Int32, true)),
859+
offsets,
860+
Arc::new(values),
861+
// No row-level null buffer is supplied here.
None,
862+
));
863+
864+
let casted = cast(&list_array, &DataType::Utf8).unwrap();
865+
assert_eq!(
866+
as_string_array(&casted),
867+
&StringArray::from_iter(vec![
868+
Some("[1, 2, 3]"),
869+
Some("[null, 5]"),
870+
Some("[6, null, null]"),
871+
])
872+
);
873+
}
874+
875+
// Casts a list<int32> array whose middle row is null to Utf8 and checks that
// the null row stays null while the other rows are rendered as strings.
#[test]
876+
fn test_array_to_string_with_null_array() {
877+
// Create a list array where some rows are entirely null
878+
let values = Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4)]);
879+
// Row 1 spans offsets 2..2, i.e. it owns no child values.
let offsets = OffsetBuffer::new(vec![0, 2, 2, 4].into());
880+
// Row-level null buffer built from [true, false, true]; the expected
// output below has None for the middle row.
let nulls = arrow::buffer::NullBuffer::from(vec![true, false, true]);
881+
let list_array: ArrayRef = Arc::new(ListArray::new(
882+
Arc::new(Field::new("item", DataType::Int32, true)),
883+
offsets,
884+
Arc::new(values),
885+
Some(nulls),
886+
));
887+
888+
let casted = cast(&list_array, &DataType::Utf8).unwrap();
889+
assert_eq!(
890+
as_string_array(&casted),
891+
&StringArray::from_iter(vec![Some("[1, 2]"), None, Some("[3, 4]"),])
892+
);
893+
}
894+
895+
// Casts two empty (but non-null) list rows to Utf8 and checks that each one
// is rendered as "[]".
#[test]
896+
fn test_empty_array_to_string() {
897+
// Create a list array with empty arrays
898+
let values = Int32Array::from(vec![] as Vec<Option<i32>>);
899+
// Three equal offsets describe two rows, each with zero elements.
let offsets = OffsetBuffer::new(vec![0, 0, 0].into());
900+
let list_array: ArrayRef = Arc::new(ListArray::new(
901+
Arc::new(Field::new("item", DataType::Int32, true)),
902+
offsets,
903+
Arc::new(values),
904+
None,
905+
));
906+
907+
let casted = cast(&list_array, &DataType::Utf8).unwrap();
908+
assert_eq!(
909+
as_string_array(&casted),
910+
&StringArray::from_iter(vec![Some("[]"), Some("[]")])
911+
);
912+
}
913+
914+
// Casts a list<list<int32>> array to Utf8 and checks that inner lists are
// rendered with their own brackets inside the outer brackets.
#[test]
915+
fn test_nested_array_to_string() {
916+
// Create a nested array: array<array<int>>
917+
let inner_values = Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4)]);
918+
// Inner rows: [1, 2] and [3, 4].
let inner_offsets = OffsetBuffer::new(vec![0, 2, 4].into());
919+
let inner_list = ListArray::new(
920+
Arc::new(Field::new("item", DataType::Int32, true)),
921+
inner_offsets,
922+
Arc::new(inner_values),
923+
None,
924+
);
925+
926+
// Each outer row wraps exactly one inner list.
let outer_offsets = OffsetBuffer::new(vec![0, 1, 2].into());
927+
let outer_list: ArrayRef = Arc::new(ListArray::new(
928+
Arc::new(Field::new(
929+
"item",
930+
DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
931+
true,
932+
)),
933+
outer_offsets,
934+
Arc::new(inner_list),
935+
None,
936+
));
937+
938+
let casted = cast(&outer_list, &DataType::Utf8).unwrap();
939+
assert_eq!(
940+
as_string_array(&casted),
941+
&StringArray::from_iter(vec![Some("[[1, 2]]"), Some("[[3, 4]]"),])
942+
);
943+
}
944+
945+
// Casts a list<utf8> array containing a null element to Utf8 and checks that
// string elements are written unquoted and the null element as `null`.
#[test]
946+
fn test_array_of_strings_to_string() {
947+
// Create a list array with string elements
948+
let values = StringArray::from(vec![Some("a"), Some("b"), None, Some("d")]);
949+
// Rows: ["a", "b"] and [null, "d"].
let offsets = OffsetBuffer::new(vec![0, 2, 4].into());
950+
let list_array: ArrayRef = Arc::new(ListArray::new(
951+
Arc::new(Field::new("item", DataType::Utf8, true)),
952+
offsets,
953+
Arc::new(values),
954+
None,
955+
));
956+
957+
let casted = cast(&list_array, &DataType::Utf8).unwrap();
958+
assert_eq!(
959+
as_string_array(&casted),
960+
&StringArray::from_iter(vec![Some("[a, b]"), Some("[null, d]"),])
961+
);
962+
}
798963
}

spark-extension-shims-spark/src/test/scala/org.apache.auron/AuronQuerySuite.scala

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,4 +508,76 @@ class AuronQuerySuite extends AuronQueryTest with BaseAuronSQLSuite with AuronSQ
508508
}
509509
}
510510
}
511+
512+
// Checks `cast(arr as string)` on an int array column whose rows include one
// array with null elements. The body only runs when
// AuronTestUtils.isSparkV31OrGreater is true.
test("cast array to string") {
513+
if (AuronTestUtils.isSparkV31OrGreater) {
514+
withTable("t_array") {
515+
sql("""
516+
|create table t_array using parquet as
517+
|select array(1, 2, 3) as arr
518+
|union all select array(4, 5)
519+
|union all select array(null, 7, null)
520+
|""".stripMargin)
521+
522+
// NOTE(review): presumably compares the result (and the executed operators)
// against vanilla Spark; confirm against the helper's definition.
checkSparkAnswerAndOperator("select cast(arr as string) from t_array")
523+
}
524+
}
525+
}
526+
527+
// Checks `cast(arr as string)` on an array<array<int>> column with inner
// arrays of differing lengths. The body only runs when
// AuronTestUtils.isSparkV31OrGreater is true.
test("cast nested array to string") {
528+
if (AuronTestUtils.isSparkV31OrGreater) {
529+
withTable("t_nested_array") {
530+
sql("""
531+
|create table t_nested_array using parquet as
532+
|select array(array(1, 2), array(3, 4, 5)) as arr
533+
|union all select array(array(6), array(7, 8))
534+
|""".stripMargin)
535+
536+
// NOTE(review): presumably compares the result (and the executed operators)
// against vanilla Spark; confirm against the helper's definition.
checkSparkAnswerAndOperator("select cast(arr as string) from t_nested_array")
537+
}
538+
}
539+
}
540+
541+
// Checks `cast(arr as string)` on int arrays with null elements in leading,
// middle and trailing positions, including one array that holds only nulls.
// The body only runs when AuronTestUtils.isSparkV31OrGreater is true.
test("cast array with null elements to string") {
542+
if (AuronTestUtils.isSparkV31OrGreater) {
543+
withTable("t_array_nulls") {
544+
sql("""
545+
|create table t_array_nulls using parquet as
546+
|select array(cast(null as int), cast(null as int)) as arr
547+
|union all select array(1, null, 3)
548+
|union all select array(null, 2, null, 4)
549+
|""".stripMargin)
550+
551+
// NOTE(review): presumably compares the result (and the executed operators)
// against vanilla Spark; confirm against the helper's definition.
checkSparkAnswerAndOperator("select cast(arr as string) from t_array_nulls")
552+
}
553+
}
554+
}
555+
556+
// Checks `cast(arr as string)` on a string array column, including a row with
// a null element between two strings. The body only runs when
// AuronTestUtils.isSparkV31OrGreater is true.
test("cast array of strings to string") {
557+
if (AuronTestUtils.isSparkV31OrGreater) {
558+
withTable("t_array_strings") {
559+
sql("""
560+
|create table t_array_strings using parquet as
561+
|select array('hello', 'world') as arr
562+
|union all select array('foo', null, 'bar')
563+
|""".stripMargin)
564+
565+
// NOTE(review): presumably compares the result (and the executed operators)
// against vanilla Spark; confirm against the helper's definition.
checkSparkAnswerAndOperator("select cast(arr as string) from t_array_strings")
566+
}
567+
}
568+
}
569+
570+
// Checks `cast(arr as string)` on a column holding one empty array and one
// non-empty int array. The body only runs when
// AuronTestUtils.isSparkV31OrGreater is true.
test("cast empty array to string") {
571+
if (AuronTestUtils.isSparkV31OrGreater) {
572+
withTable("t_empty_array") {
573+
sql("""
574+
|create table t_empty_array using parquet as
575+
|select array() as arr
576+
|union all select array(1, 2)
577+
|""".stripMargin)
578+
579+
// NOTE(review): presumably compares the result (and the executed operators)
// against vanilla Spark; confirm against the helper's definition.
checkSparkAnswerAndOperator("select cast(arr as string) from t_empty_array")
580+
}
581+
}
582+
}
511583
}

0 commit comments

Comments
 (0)