Skip to content

Commit 75fdd3c

Browse files
authored
Merge branch 'main' into alamb/default_arc
2 parents cc0178c + 878b879 commit 75fdd3c

File tree

11 files changed

+688
-104
lines changed

11 files changed

+688
-104
lines changed

.github/workflows/extended.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,10 @@ jobs:
173173
ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push
174174
submodules: true
175175
fetch-depth: 1
176-
- name: Setup Rust toolchain
177-
uses: ./.github/actions/setup-builder
178-
with:
179-
rust-version: stable
176+
# Don't use setup-builder to avoid configuring RUST_BACKTRACE which is expensive
177+
- name: Install protobuf compiler
178+
run: |
179+
apt-get update && apt-get install -y protobuf-compiler
180180
- name: Run sqllogictest
181181
run: |
182182
cargo test --features backtrace,parquet_encryption --profile ci-optimized --test sqllogictests -- --include-sqlite

AGENTS.md

Lines changed: 10 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,33 +2,22 @@
22

33
## Developer Documentation
44

5+
- [Quick Start Setup](docs/source/contributor-guide/development_environment.md#quick-start)
6+
- [Testing Quick Start](docs/source/contributor-guide/testing.md#testing-quick-start)
7+
- [Before Submitting a PR](docs/source/contributor-guide/index.md#before-submitting-a-pr)
58
- [Contributor Guide](docs/source/contributor-guide/index.md)
69
- [Architecture Guide](docs/source/contributor-guide/architecture.md)
710

811
## Before Committing
912

10-
Before committing any changes, you **must** run the following checks and fix any issues:
13+
Before committing any changes, you MUST follow the instructions in
14+
[Before Submitting a PR](docs/source/contributor-guide/index.md#before-submitting-a-pr)
15+
and ensure the required checks listed there pass. Do not commit code that
16+
fails any of those checks.
1117

12-
```bash
13-
cargo fmt --all
14-
cargo clippy --all-targets --all-features -- -D warnings
15-
```
16-
17-
- `cargo fmt` ensures consistent code formatting across the project.
18-
- `cargo clippy` catches common mistakes and enforces idiomatic Rust patterns. All warnings must be resolved (treated as errors via `-D warnings`).
19-
20-
Do not commit code that fails either of these checks.
18+
When creating a PR, you MUST follow the [PR template](.github/pull_request_template.md).
2119

2220
## Testing
2321

24-
Run relevant tests before submitting changes:
25-
26-
```bash
27-
cargo test --all-features
28-
```
29-
30-
For SQL logic tests:
31-
32-
```bash
33-
cargo test -p datafusion-sqllogictest
34-
```
22+
See the [Testing Quick Start](docs/source/contributor-guide/testing.md#testing-quick-start)
23+
for the recommended pre-PR test commands.

datafusion/common/src/utils/mod.rs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ use std::cmp::{Ordering, min};
4040
use std::collections::HashSet;
4141
use std::num::NonZero;
4242
use std::ops::Range;
43-
use std::sync::Arc;
43+
use std::sync::{Arc, LazyLock};
4444
use std::thread::available_parallelism;
4545

4646
/// Applies an optional projection to a [`SchemaRef`], returning the
@@ -923,10 +923,15 @@ pub fn combine_limit(
923923
///
924924
/// This is a wrapper around `std::thread::available_parallelism`, providing a default value
925925
/// of `1` if the system's parallelism cannot be determined.
926+
///
927+
/// The result is cached after the first call.
926928
pub fn get_available_parallelism() -> usize {
927-
available_parallelism()
928-
.unwrap_or(NonZero::new(1).expect("literal value `1` shouldn't be zero"))
929-
.get()
929+
static PARALLELISM: LazyLock<usize> = LazyLock::new(|| {
930+
available_parallelism()
931+
.unwrap_or(NonZero::new(1).expect("literal value `1` shouldn't be zero"))
932+
.get()
933+
});
934+
*PARALLELISM
930935
}
931936

932937
/// Converts a collection of function arguments into a fixed-size array of length N

datafusion/functions-nested/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,3 +109,7 @@ name = "array_to_string"
109109
[[bench]]
110110
harness = false
111111
name = "array_position"
112+
113+
[[bench]]
114+
harness = false
115+
name = "array_sort"
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::hint::black_box;
19+
use std::sync::Arc;
20+
21+
use arrow::array::{ArrayRef, BooleanBufferBuilder, Int32Array, ListArray, StringArray};
22+
use arrow::buffer::{NullBuffer, OffsetBuffer};
23+
use arrow::datatypes::{DataType, Field};
24+
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
25+
use datafusion_common::config::ConfigOptions;
26+
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
27+
use datafusion_functions_nested::sort::ArraySort;
28+
use rand::SeedableRng;
29+
use rand::rngs::StdRng;
30+
use rand::seq::SliceRandom;
31+
32+
const SEED: u64 = 42;
33+
const NUM_ROWS: usize = 8192;
34+
35+
fn create_int32_list_array(
36+
num_rows: usize,
37+
elements_per_row: usize,
38+
with_nulls: bool,
39+
) -> ArrayRef {
40+
let mut rng = StdRng::seed_from_u64(SEED);
41+
let total_values = num_rows * elements_per_row;
42+
43+
let mut values: Vec<i32> = (0..total_values as i32).collect();
44+
values.shuffle(&mut rng);
45+
46+
let values = Arc::new(Int32Array::from(values));
47+
let offsets: Vec<i32> = (0..=num_rows)
48+
.map(|i| (i * elements_per_row) as i32)
49+
.collect();
50+
51+
let nulls = if with_nulls {
52+
// Every 10th row is null
53+
Some(NullBuffer::from(
54+
(0..num_rows).map(|i| i % 10 != 0).collect::<Vec<bool>>(),
55+
))
56+
} else {
57+
None
58+
};
59+
60+
Arc::new(ListArray::new(
61+
Arc::new(Field::new("item", DataType::Int32, true)),
62+
OffsetBuffer::new(offsets.into()),
63+
values,
64+
nulls,
65+
))
66+
}
67+
68+
/// Creates a ListArray where ~10% of elements within each row are null.
69+
fn create_int32_list_array_with_null_elements(
70+
num_rows: usize,
71+
elements_per_row: usize,
72+
) -> ArrayRef {
73+
let mut rng = StdRng::seed_from_u64(SEED);
74+
let total_values = num_rows * elements_per_row;
75+
76+
let mut values: Vec<i32> = (0..total_values as i32).collect();
77+
values.shuffle(&mut rng);
78+
79+
// ~10% of elements are null
80+
let mut validity = BooleanBufferBuilder::new(total_values);
81+
for i in 0..total_values {
82+
validity.append(i % 10 != 0);
83+
}
84+
let null_buffer = NullBuffer::from(validity.finish());
85+
86+
let values = Arc::new(Int32Array::new(values.into(), Some(null_buffer)));
87+
let offsets: Vec<i32> = (0..=num_rows)
88+
.map(|i| (i * elements_per_row) as i32)
89+
.collect();
90+
91+
Arc::new(ListArray::new(
92+
Arc::new(Field::new("item", DataType::Int32, true)),
93+
OffsetBuffer::new(offsets.into()),
94+
values,
95+
None,
96+
))
97+
}
98+
99+
fn create_string_list_array(num_rows: usize, elements_per_row: usize) -> ArrayRef {
100+
let mut rng = StdRng::seed_from_u64(SEED);
101+
let total_values = num_rows * elements_per_row;
102+
103+
let mut indices: Vec<usize> = (0..total_values).collect();
104+
indices.shuffle(&mut rng);
105+
let string_values: Vec<String> =
106+
indices.iter().map(|i| format!("value_{i:06}")).collect();
107+
let values = Arc::new(StringArray::from(string_values));
108+
109+
let offsets: Vec<i32> = (0..=num_rows)
110+
.map(|i| (i * elements_per_row) as i32)
111+
.collect();
112+
113+
Arc::new(ListArray::new(
114+
Arc::new(Field::new("item", DataType::Utf8, true)),
115+
OffsetBuffer::new(offsets.into()),
116+
values,
117+
None,
118+
))
119+
}
120+
121+
fn invoke_array_sort(udf: &ArraySort, array: &ArrayRef) -> ColumnarValue {
122+
udf.invoke_with_args(ScalarFunctionArgs {
123+
args: vec![ColumnarValue::Array(Arc::clone(array))],
124+
arg_fields: vec![Field::new("arr", array.data_type().clone(), true).into()],
125+
number_rows: array.len(),
126+
return_field: Field::new("result", array.data_type().clone(), true).into(),
127+
config_options: Arc::new(ConfigOptions::default()),
128+
})
129+
.unwrap()
130+
}
131+
132+
/// Vary elements_per_row over [5, 20, 100, 1000]: for small arrays, per-row
133+
/// overhead dominates, whereas for larger arrays the sort kernel dominates.
134+
fn bench_array_sort(c: &mut Criterion) {
135+
let mut group = c.benchmark_group("array_sort");
136+
let udf = ArraySort::new();
137+
138+
// Int32 arrays
139+
for &elements_per_row in &[5, 20, 100, 1000] {
140+
let array = create_int32_list_array(NUM_ROWS, elements_per_row, false);
141+
group.bench_with_input(
142+
BenchmarkId::new("int32", elements_per_row),
143+
&elements_per_row,
144+
|b, _| {
145+
b.iter(|| {
146+
black_box(invoke_array_sort(&udf, &array));
147+
});
148+
},
149+
);
150+
}
151+
152+
// Int32 with nulls in the outer list (10% null rows), single size
153+
{
154+
let array = create_int32_list_array(NUM_ROWS, 50, true);
155+
group.bench_function("int32_with_nulls", |b| {
156+
b.iter(|| {
157+
black_box(invoke_array_sort(&udf, &array));
158+
});
159+
});
160+
}
161+
162+
// Int32 with null elements (~10% of elements within rows are null)
163+
for &elements_per_row in &[5, 20, 100, 1000] {
164+
let array =
165+
create_int32_list_array_with_null_elements(NUM_ROWS, elements_per_row);
166+
group.bench_with_input(
167+
BenchmarkId::new("int32_null_elements", elements_per_row),
168+
&elements_per_row,
169+
|b, _| {
170+
b.iter(|| {
171+
black_box(invoke_array_sort(&udf, &array));
172+
});
173+
},
174+
);
175+
}
176+
177+
// String arrays
178+
for &elements_per_row in &[5, 20, 100, 1000] {
179+
let array = create_string_list_array(NUM_ROWS, elements_per_row);
180+
group.bench_with_input(
181+
BenchmarkId::new("string", elements_per_row),
182+
&elements_per_row,
183+
|b, _| {
184+
b.iter(|| {
185+
black_box(invoke_array_sort(&udf, &array));
186+
});
187+
},
188+
);
189+
}
190+
191+
group.finish();
192+
}
193+
194+
criterion_group!(benches, bench_array_sort);
195+
criterion_main!(benches);

0 commit comments

Comments
 (0)