Skip to content

Commit 6713439

Browse files
neilconwayclaude
andauthored
perf: Optimize array_to_string(), support more types (#20553)
## Which issue does this PR close? - Closes #20551. - Closes #20552. ## Rationale for this change `array_to_string` did a lot of unnecessary allocations. Rewriting the function to avoid those allocations yields around a 50-75% performance improvement. ## What changes are included in this PR? * Add benchmark for `array_to_string` * Move nested functions to top-level * Get rid of some unnecessary macros * Borrow instead of cloning on recursion * Reuse a single `String` buffer across rows via `buf.clear()`, instead of allocating fresh * Write directly to the output buffer, instead of allocating via `x.to_string()` + `push_str` * Specialized logic for writing values of different types; this adds a dependency on the `itoa` crate, but it yields significant speedups over a generic `write!` approach * Add support for arrays of `decimal` and all the datetime types * Improve docs ## Are these changes tested? Yes, and benchmarked. ## Are there any user-facing changes? No. --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8482e2e commit 6713439

File tree

7 files changed

+510
-207
lines changed

7 files changed

+510
-207
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ hex = { version = "0.4.3" }
161161
indexmap = "2.13.0"
162162
insta = { version = "1.46.3", features = ["glob", "filters"] }
163163
itertools = "0.14"
164+
itoa = "1.0"
164165
liblzma = { version = "0.4.6", features = ["static"] }
165166
log = "^0.4"
166167
memchr = "2.8.0"

datafusion/functions-nested/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ datafusion-macros = { workspace = true }
5959
datafusion-physical-expr-common = { workspace = true }
6060
hashbrown = { workspace = true }
6161
itertools = { workspace = true, features = ["use_std"] }
62+
itoa = { workspace = true }
6263
log = { workspace = true }
6364
paste = { workspace = true }
6465

@@ -98,6 +99,10 @@ name = "array_repeat"
9899
harness = false
99100
name = "array_set_ops"
100101

102+
[[bench]]
103+
harness = false
104+
name = "array_to_string"
105+
101106
[[bench]]
102107
harness = false
103108
name = "array_position"
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::{ArrayRef, Float64Array, Int64Array, ListArray, StringArray};
19+
use arrow::buffer::OffsetBuffer;
20+
use arrow::datatypes::{DataType, Field};
21+
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
22+
use datafusion_common::ScalarValue;
23+
use datafusion_common::config::ConfigOptions;
24+
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
25+
use datafusion_functions_nested::string::ArrayToString;
26+
use rand::rngs::StdRng;
27+
use rand::{Rng, SeedableRng};
28+
use std::hint::black_box;
29+
use std::sync::Arc;
30+
31+
const NUM_ROWS: usize = 1000;
32+
const ARRAY_SIZES: &[usize] = &[5, 20, 100];
33+
const NESTED_ARRAY_SIZE: usize = 3;
34+
const SEED: u64 = 42;
35+
const NULL_DENSITY: f64 = 0.1;
36+
37+
fn criterion_benchmark(c: &mut Criterion) {
38+
bench_array_to_string(c, "array_to_string_int64", create_int64_list_array);
39+
bench_array_to_string(c, "array_to_string_float64", create_float64_list_array);
40+
bench_array_to_string(c, "array_to_string_string", create_string_list_array);
41+
bench_array_to_string(
42+
c,
43+
"array_to_string_nested_int64",
44+
create_nested_int64_list_array,
45+
);
46+
}
47+
48+
fn bench_array_to_string(
49+
c: &mut Criterion,
50+
group_name: &str,
51+
make_array: impl Fn(usize) -> ArrayRef,
52+
) {
53+
let mut group = c.benchmark_group(group_name);
54+
55+
for &array_size in ARRAY_SIZES {
56+
let list_array = make_array(array_size);
57+
let args = vec![
58+
ColumnarValue::Array(list_array.clone()),
59+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))),
60+
];
61+
let arg_fields = vec![
62+
Field::new("array", list_array.data_type().clone(), true).into(),
63+
Field::new("delimiter", DataType::Utf8, false).into(),
64+
];
65+
66+
group.bench_with_input(
67+
BenchmarkId::from_parameter(array_size),
68+
&array_size,
69+
|b, _| {
70+
let udf = ArrayToString::new();
71+
b.iter(|| {
72+
black_box(
73+
udf.invoke_with_args(ScalarFunctionArgs {
74+
args: args.clone(),
75+
arg_fields: arg_fields.clone(),
76+
number_rows: NUM_ROWS,
77+
return_field: Field::new("result", DataType::Utf8, true)
78+
.into(),
79+
config_options: Arc::new(ConfigOptions::default()),
80+
})
81+
.unwrap(),
82+
)
83+
})
84+
},
85+
);
86+
}
87+
88+
group.finish();
89+
}
90+
91+
fn create_int64_list_array(array_size: usize) -> ArrayRef {
92+
let mut rng = StdRng::seed_from_u64(SEED);
93+
let values = (0..NUM_ROWS * array_size)
94+
.map(|_| {
95+
if rng.random::<f64>() < NULL_DENSITY {
96+
None
97+
} else {
98+
Some(rng.random_range(0..1000))
99+
}
100+
})
101+
.collect::<Int64Array>();
102+
let offsets = (0..=NUM_ROWS)
103+
.map(|i| (i * array_size) as i32)
104+
.collect::<Vec<i32>>();
105+
106+
Arc::new(
107+
ListArray::try_new(
108+
Arc::new(Field::new("item", DataType::Int64, true)),
109+
OffsetBuffer::new(offsets.into()),
110+
Arc::new(values),
111+
None,
112+
)
113+
.unwrap(),
114+
)
115+
}
116+
117+
fn create_nested_int64_list_array(array_size: usize) -> ArrayRef {
118+
let inner = create_int64_list_array(array_size);
119+
let inner_rows = NUM_ROWS;
120+
let outer_rows = inner_rows / NESTED_ARRAY_SIZE;
121+
let offsets = (0..=outer_rows)
122+
.map(|i| (i * NESTED_ARRAY_SIZE) as i32)
123+
.collect::<Vec<i32>>();
124+
Arc::new(
125+
ListArray::try_new(
126+
Arc::new(Field::new("item", inner.data_type().clone(), true)),
127+
OffsetBuffer::new(offsets.into()),
128+
inner,
129+
None,
130+
)
131+
.unwrap(),
132+
)
133+
}
134+
135+
fn create_float64_list_array(array_size: usize) -> ArrayRef {
136+
let mut rng = StdRng::seed_from_u64(SEED);
137+
let values = (0..NUM_ROWS * array_size)
138+
.map(|_| {
139+
if rng.random::<f64>() < NULL_DENSITY {
140+
None
141+
} else {
142+
Some(rng.random_range(-1000.0..1000.0))
143+
}
144+
})
145+
.collect::<Float64Array>();
146+
let offsets = (0..=NUM_ROWS)
147+
.map(|i| (i * array_size) as i32)
148+
.collect::<Vec<i32>>();
149+
150+
Arc::new(
151+
ListArray::try_new(
152+
Arc::new(Field::new("item", DataType::Float64, true)),
153+
OffsetBuffer::new(offsets.into()),
154+
Arc::new(values),
155+
None,
156+
)
157+
.unwrap(),
158+
)
159+
}
160+
161+
fn create_string_list_array(array_size: usize) -> ArrayRef {
162+
let mut rng = StdRng::seed_from_u64(SEED);
163+
let values = (0..NUM_ROWS * array_size)
164+
.map(|_| {
165+
if rng.random::<f64>() < NULL_DENSITY {
166+
None
167+
} else {
168+
Some(format!("value_{}", rng.random_range(0..100)))
169+
}
170+
})
171+
.collect::<StringArray>();
172+
let offsets = (0..=NUM_ROWS)
173+
.map(|i| (i * array_size) as i32)
174+
.collect::<Vec<i32>>();
175+
176+
Arc::new(
177+
ListArray::try_new(
178+
Arc::new(Field::new("item", DataType::Utf8, true)),
179+
OffsetBuffer::new(offsets.into()),
180+
Arc::new(values),
181+
None,
182+
)
183+
.unwrap(),
184+
)
185+
}
186+
187+
criterion_group!(benches, criterion_benchmark);
188+
criterion_main!(benches);

0 commit comments

Comments
 (0)