From b1ddc241e9b985791e7c9e33fbb2f0f3c8ea2e2d Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Sat, 27 Dec 2025 02:52:16 +0200 Subject: [PATCH 01/33] fix: `Rows` `size` should use `capacity` and not `len` (#9044) # Which issue does this PR close? N/A # Rationale for this change because `Rows` own the data and offsets vector, it should use the capacity for tracking the size that it uses # What changes are included in this PR? replace `len` with `capacity` and added test # Are these changes tested? yes (and of course they are failing on main and passing with this fix) # Are there any user-facing changes? more accurate size --- arrow-row/src/lib.rs | 47 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 72a295627ed2..aa6543485fe3 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1131,8 +1131,8 @@ impl Rows { pub fn size(&self) -> usize { // Size of fields is accounted for as part of RowConverter std::mem::size_of::() - + self.buffer.len() - + self.offsets.len() * std::mem::size_of::() + + self.buffer.capacity() + + self.offsets.capacity() * std::mem::size_of::() } /// Create a [BinaryArray] from the [Rows] data without reallocating the @@ -4050,4 +4050,47 @@ mod tests { // "a" < "z" assert!(rows.row(3) < rows.row(1)); } + + #[test] + fn rows_size_should_count_for_capacity() { + let row_converter = RowConverter::new(vec![SortField::new(DataType::UInt8)]).unwrap(); + + let empty_rows_size_with_preallocate_rows_and_data = { + let rows = row_converter.empty_rows(1000, 1000); + + rows.size() + }; + let empty_rows_size_with_preallocate_rows = { + let rows = row_converter.empty_rows(1000, 0); + + rows.size() + }; + let empty_rows_size_with_preallocate_data = { + let rows = row_converter.empty_rows(0, 1000); + + rows.size() + }; + let empty_rows_size_without_preallocate = { + let rows = 
row_converter.empty_rows(0, 0); + + rows.size() + }; + + assert!( + empty_rows_size_with_preallocate_rows_and_data > empty_rows_size_with_preallocate_rows, + "{empty_rows_size_with_preallocate_rows_and_data} should be larger than {empty_rows_size_with_preallocate_rows}" + ); + assert!( + empty_rows_size_with_preallocate_rows_and_data > empty_rows_size_with_preallocate_data, + "{empty_rows_size_with_preallocate_rows_and_data} should be larger than {empty_rows_size_with_preallocate_data}" + ); + assert!( + empty_rows_size_with_preallocate_rows > empty_rows_size_without_preallocate, + "{empty_rows_size_with_preallocate_rows} should be larger than {empty_rows_size_without_preallocate}" + ); + assert!( + empty_rows_size_with_preallocate_data > empty_rows_size_without_preallocate, + "{empty_rows_size_with_preallocate_data} should be larger than {empty_rows_size_without_preallocate}" + ); + } } From 8ed2b5246d5de68909695f5953aa2811f3f8ea0d Mon Sep 17 00:00:00 2001 From: Lanqing Yang Date: Sat, 27 Dec 2025 09:53:54 +0900 Subject: [PATCH 02/33] fix: integration / Archery test With other arrows container ran out of space (#9043) # Which issue does this PR close? - Closes #9024. # Rationale for this change the ci container starts with 63gb / 72gb used, the 9GB remaining disk space is barely enough for a cross build in 7 languages that leads to ci being stuck. this is what a debug step after initialize container shows === CONTAINER DISK USAGE === Filesystem Size Used Avail Use% Mounted on overlay 72G 63G 9.5G 87% / # What changes are included in this PR? 
- add resource monitoring to build process - add a clean up step to remove unnecessary software (cuts 6GB of space) === Cleaning up host disk space === Disk space before cleanup: Filesystem Size Used Avail Use% Mounted on overlay 72G 63G 9.5G 87% / Disk space after cleanup: Filesystem Size Used Avail Use% Mounted on overlay 72G 57G 16G 79% / - add a small optimization to shallow clone (only clone most recent commit not full history) for github repos optimization results we have 6.1 GB left after build === After Build === Filesystem Size Used Avail Use% Mounted on overlay 72G 66G 6.1G 92% / # Are these changes tested? tested by github ci # Are there any user-facing changes? no --------- Signed-off-by: lyang24 --- .github/workflows/integration.yml | 66 ++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 32c5e78d4f04..cc74650812e9 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -78,58 +78,112 @@ jobs: run: shell: bash steps: + - name: Monitor disk usage - Initial + run: | + echo "=== Initial Disk Usage ===" + df -h / + echo "" + + - name: Remove unnecessary preinstalled software + run: | + echo "=== Cleaning up host disk space ===" + echo "Disk space before cleanup:" + df -h / + + # Clean apt cache + apt-get clean || true + + # Remove GitHub Actions tool cache + rm -rf /__t/* || true + + # Remove large packages from host filesystem (mounted at /host/) + rm -rf /host/usr/share/dotnet || true + rm -rf /host/usr/local/lib/android || true + rm -rf /host/usr/local/.ghcup || true + rm -rf /host/opt/hostedtoolcache/CodeQL || true + + echo "" + echo "Disk space after cleanup:" + df -h / + echo "" + # This is necessary so that actions/checkout can find git - name: Export conda path run: echo "/opt/conda/envs/arrow/bin" >> $GITHUB_PATH # This is necessary so that Rust can find cargo - name: Export cargo path run: echo 
"/root/.cargo/bin" >> $GITHUB_PATH - - name: Check rustup - run: which rustup - - name: Check cmake - run: which cmake + + # Checkout repos (using shallow clones with fetch-depth: 1) - name: Checkout Arrow uses: actions/checkout@v6 with: repository: apache/arrow submodules: true - fetch-depth: 0 + fetch-depth: 1 - name: Checkout Arrow Rust uses: actions/checkout@v6 with: path: rust submodules: true - fetch-depth: 0 + fetch-depth: 1 - name: Checkout Arrow .NET uses: actions/checkout@v6 with: repository: apache/arrow-dotnet path: dotnet + fetch-depth: 1 - name: Checkout Arrow Go uses: actions/checkout@v6 with: repository: apache/arrow-go path: go + fetch-depth: 1 - name: Checkout Arrow Java uses: actions/checkout@v6 with: repository: apache/arrow-java path: java + fetch-depth: 1 - name: Checkout Arrow JavaScript uses: actions/checkout@v6 with: repository: apache/arrow-js path: js + fetch-depth: 1 - name: Checkout Arrow nanoarrow uses: actions/checkout@v6 with: repository: apache/arrow-nanoarrow path: nanoarrow + fetch-depth: 1 + + - name: Monitor disk usage - After checkouts + run: | + echo "=== After Checkouts ===" + df -h / + echo "" + - name: Build run: conda run --no-capture-output ci/scripts/integration_arrow_build.sh $PWD /build + + - name: Monitor disk usage - After build + if: always() + run: | + echo "=== After Build ===" + df -h / + echo "" + - name: Run run: conda run --no-capture-output ci/scripts/integration_arrow.sh $PWD /build + - name: Monitor disk usage - After tests + if: always() + run: | + echo "=== After Tests ===" + df -h / + echo "" + # test FFI against the C-Data interface exposed by pyarrow pyarrow-integration-test: name: Pyarrow C Data Interface From de1686ac79738793e5dfa067c7c844edbe79864a Mon Sep 17 00:00:00 2001 From: Dhanush Date: Sat, 27 Dec 2025 17:30:39 +0530 Subject: [PATCH 03/33] feat: support array indices in VariantPath dot notation (#9012) # Which issue does this PR close? - Closes #8946 # What changes are included in this PR? 
The PR adds support for parsing array index (eg. `foo.bar[3]`) with the help of parse_path fn. Currently the parser silently parses invalid segments as Field (eg., `foo[0`, `[0]`(parsed as index), `foo0]`, `foo[0][`) #### Feedback requested Whether to add stricter validation (throw an error) and reject the segment ? Or to keep the current behavior ? # Are these changes tested? yes, only for valid inputs # Are there any user-facing changes? no --- parquet-variant/src/path.rs | 39 +++++++++++++++++++++++++++++++----- parquet-variant/src/utils.rs | 33 ++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 5 deletions(-) diff --git a/parquet-variant/src/path.rs b/parquet-variant/src/path.rs index e222c3ac9ccb..2aeb9df97d82 100644 --- a/parquet-variant/src/path.rs +++ b/parquet-variant/src/path.rs @@ -16,6 +16,8 @@ // under the License. use std::{borrow::Cow, ops::Deref}; +use crate::utils::parse_path; + /// Represents a qualified path to a potential subfield or index of a variant /// value. 
/// @@ -112,11 +114,7 @@ impl<'a> From>> for VariantPath<'a> { /// Create from &str with support for dot notation impl<'a> From<&'a str> for VariantPath<'a> { fn from(path: &'a str) -> Self { - if path.is_empty() { - VariantPath::new(vec![]) - } else { - VariantPath::new(path.split('.').map(Into::into).collect()) - } + VariantPath::new(path.split(".").flat_map(parse_path).collect()) } } @@ -223,4 +221,35 @@ mod tests { let path = VariantPath::from_iter([p]); assert!(!path.is_empty()); } + + #[test] + fn test_variant_path_dot_notation_with_array_index() { + let path = VariantPath::from("city.store.books[3].title"); + + let expected = VariantPath::from("city") + .join("store") + .join("books") + .join(3) + .join("title"); + + assert_eq!(path, expected); + } + + #[test] + fn test_variant_path_dot_notation_with_only_array_index() { + let path = VariantPath::from("[3]"); + + let expected = VariantPath::from(3); + + assert_eq!(path, expected); + } + + #[test] + fn test_variant_path_dot_notation_with_starting_array_index() { + let path = VariantPath::from("[3].title"); + + let expected = VariantPath::from(3).join("title"); + + assert_eq!(path, expected); + } } diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index d28b8685baa2..6accbcb36649 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -16,6 +16,7 @@ // under the License. 
use std::{array::TryFromSliceError, ops::Range, str}; +use crate::VariantPathElement; use arrow_schema::ArrowError; use std::cmp::Ordering; @@ -149,6 +150,38 @@ pub(crate) fn fits_precision(n: impl Into) -> bool { n.into().unsigned_abs().leading_zeros() >= (i64::BITS - N) } +// Helper fn to parse input segments like foo[0] or foo[0][0] +#[inline] +pub(crate) fn parse_path<'a>(segment: &'a str) -> Vec> { + if segment.is_empty() { + return Vec::new(); + } + + let mut path_elements = Vec::new(); + let mut base = segment; + + while let Some(stripped) = base.strip_suffix(']') { + let Some(open_pos) = stripped.rfind('[') else { + return vec![VariantPathElement::field(segment)]; + }; + + let index_str = &stripped[open_pos + 1..]; + let Ok(index) = index_str.parse::() else { + return vec![VariantPathElement::field(segment)]; + }; + + path_elements.push(VariantPathElement::index(index)); + base = &stripped[..open_pos]; + } + + if !base.is_empty() { + path_elements.push(VariantPathElement::field(base)); + } + + path_elements.reverse(); + path_elements +} + #[cfg(test)] mod test { use super::*; From 7f656ff814c93451d946c43d9dd63fde2a5d1792 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 27 Dec 2025 07:27:27 -0500 Subject: [PATCH 04/33] Minor: avoid some clones when reading parquet (#9048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Which issue does this PR close? - RElated to https://github.com/apache/datafusion/pull/19477 # Rationale for this change While profiling https://github.com/apache/datafusion/pull/19477 I noticed some additional clones we could avoid Screenshot 2025-12-26 at 12 03
00 PM I doubt this will be a huge deal but it does remove some allocations int he parquet read path # What changes are included in this PR? Use `into_data` rather than `to_data` # Are these changes tested? # Are there any user-facing changes? --- parquet/src/arrow/array_reader/struct_array.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index 8df6a25c9102..b4a6a375334f 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -129,8 +129,8 @@ impl ArrayReader for StructArrayReader { .len(children_array_len) .child_data( children_array - .iter() - .map(|x| x.to_data()) + .into_iter() + .map(|x| x.into_data()) .collect::>(), ); From 814ee4227c01fce478bdd3594dd156250286b46e Mon Sep 17 00:00:00 2001 From: Michael Kleen Date: Sun, 28 Dec 2025 13:49:15 +0100 Subject: [PATCH 05/33] Add benchmarks for Utf8View scalars for zip (#8988) # Which issue does this PR close? N/A # Rationale for this change I have a PR to improve zip perf for Utf8View/BinaryView scalars and I need benchmarks for that. - https://github.com/apache/arrow-rs/pull/8963 # What changes are included in this PR? This extends the zip benchmarks by one new Input Generator for StringViews and two more functions to test scalar combinations of different StringViews combinations. # Are these changes tested? N/A # Are there any user-facing changes? 
No --- arrow/benches/zip_kernels.rs | 48 ++++++++++++++++++++++++++++++++++++ arrow/src/util/bench_util.rs | 27 ++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/arrow/benches/zip_kernels.rs b/arrow/benches/zip_kernels.rs index 31cbca639717..65f6bb280f00 100644 --- a/arrow/benches/zip_kernels.rs +++ b/arrow/benches/zip_kernels.rs @@ -21,6 +21,7 @@ use rand::distr::{Distribution, StandardUniform}; use rand::prelude::StdRng; use rand::{Rng, SeedableRng}; use std::hint; +use std::ops::Range; use std::sync::Arc; use arrow::array::*; @@ -133,6 +134,35 @@ where } } +struct GenerateStringView { + range: Range, + description: String, + _marker: std::marker::PhantomData, +} + +impl InputGenerator for GenerateStringView { + fn name(&self) -> &str { + self.description.as_str() + } + fn generate_scalar_with_null_value(&self) -> ArrayRef { + new_null_array(&DataType::Utf8View, 1) + } + + fn generate_non_null_scalars(&self, seed: u64, number_of_scalars: usize) -> Vec { + let array = self.generate_array(seed, number_of_scalars, 0.0); + (0..number_of_scalars).map(|i| array.slice(i, 1)).collect() + } + + fn generate_array(&self, seed: u64, array_length: usize, null_percentage: f32) -> ArrayRef { + Arc::new(create_string_view_array_with_len_range_and_seed( + array_length, + null_percentage, + self.range.clone(), + seed, + )) + } +} + fn mask_cases(len: usize) -> Vec<(&'static str, BooleanArray)> { vec![ ("all_true", create_boolean_array(len, 0.0, 1.0)), @@ -273,6 +303,24 @@ fn add_benchmark(c: &mut Criterion) { _marker: std::marker::PhantomData, }, ); + + bench_zip_on_input_generator( + c, + &GenerateStringView { + description: "string_views size (3..10)".to_string(), + range: 3..10, + _marker: std::marker::PhantomData, + }, + ); + + bench_zip_on_input_generator( + c, + &GenerateStringView { + description: "string_views size (10..100)".to_string(), + range: 10..100, + _marker: std::marker::PhantomData, + }, + ); } criterion_group!(benches, add_benchmark); diff 
--git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 9f83a50f4f8f..1f1dcff9b62a 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -208,6 +208,33 @@ pub fn create_string_array_with_len_range_and_prefix_and_seed, + seed: u64, +) -> StringViewArray { + let rng = &mut StdRng::seed_from_u64(seed); + (0..size) + .map(|_| { + if rng.random::() < null_density { + None + } else { + let str_len = rng.random_range(range.clone()); + let value = rng.sample_iter(&Alphanumeric).take(str_len).collect(); + let value = String::from_utf8(value).unwrap(); + Some(value) + } + }) + .collect() +} fn create_string_view_array_with_len_range_and_prefix( size: usize, From 2d6fc518ed41d74de41ebd52a0165bda4b272772 Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Tue, 30 Dec 2025 07:36:20 +0000 Subject: [PATCH 06/33] Add examples for min and max functions (#9062) # Which issue does this PR close? - Closes https://github.com/apache/arrow-rs/issues/9055. # What changes are included in this PR? Changes to docstrings for `min` and `max` functions. # Are these changes tested? Yes # Are there any user-facing changes? Yes. These doc pages will be updated: https://docs.rs/arrow/latest/arrow/compute/fn.min.html https://docs.rs/arrow/latest/arrow/compute/fn.max.html --- arrow-arith/src/aggregate.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 91623bc22b92..0fbddbc6e6df 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -809,6 +809,15 @@ where /// Returns the minimum value in the array, according to the natural order. 
/// For floating point arrays any NaN values are considered to be greater than any other non-null value +/// +/// # Example +/// ```rust +/// # use arrow_array::Int32Array; +/// # use arrow_arith::aggregate::min; +/// let array = Int32Array::from(vec![8, 2, 4]); +/// let result = min(&array); +/// assert_eq!(result, Some(2)); +/// ``` pub fn min(array: &PrimitiveArray) -> Option where T::Native: PartialOrd, @@ -818,6 +827,15 @@ where /// Returns the maximum value in the array, according to the natural order. /// For floating point arrays any NaN values are considered to be greater than any other non-null value +/// +/// # Example +/// ```rust +/// # use arrow_array::Int32Array; +/// # use arrow_arith::aggregate::max; +/// let array = Int32Array::from(vec![4, 8, 2]); +/// let result = max(&array); +/// assert_eq!(result, Some(8)); +/// ``` pub fn max(array: &PrimitiveArray) -> Option where T::Native: PartialOrd, From 0991c76899209c1910f029b46c9af4223044b351 Mon Sep 17 00:00:00 2001 From: Congxian Qiu Date: Tue, 30 Dec 2025 20:31:00 +0800 Subject: [PATCH 07/33] [Variant] Unify the CastOptions usage in parquet-variant-compute (#8984) # Which issue does this PR close? - Closes #8873 . # What changes are included in this PR? Unify the `CastOptions` usage in `parquet-variant-compute` Currently, there is only `arrow::compute::CastOptions` in `parquet-variant-compute` now, the existing `parquet-variant-compute/CastOptions` has replaced by the `arrow::compute::CastOptions` with the equal behavior image # Are these changes tested? The existing tests covered the logic # Are there any user-facing changes? This will break some public API in `parquet-variant-compute`, but this crate is `experiment` now, so maybe we don't need to wait for a major release. 
--- .../src/arrow_to_variant.rs | 79 ++++++++++++++----- .../src/cast_to_variant.rs | 20 +++-- parquet-variant-compute/src/lib.rs | 1 - .../src/type_conversion.rs | 13 --- 4 files changed, 74 insertions(+), 39 deletions(-) diff --git a/parquet-variant-compute/src/arrow_to_variant.rs b/parquet-variant-compute/src/arrow_to_variant.rs index 3009b602cb80..be241a9a4e00 100644 --- a/parquet-variant-compute/src/arrow_to_variant.rs +++ b/parquet-variant-compute/src/arrow_to_variant.rs @@ -15,12 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::type_conversion::CastOptions; use arrow::array::{ Array, ArrayRef, AsArray, FixedSizeListArray, GenericBinaryArray, GenericListArray, GenericListViewArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, }; -use arrow::compute::kernels::cast; +use arrow::compute::{CastOptions, kernels::cast}; use arrow::datatypes::{ self as datatypes, ArrowNativeType, ArrowPrimitiveType, ArrowTemporalType, ArrowTimestampType, DecimalType, RunEndIndexType, @@ -367,7 +366,7 @@ macro_rules! define_row_builder { $( // NOTE: The `?` macro expansion fails without the type annotation. 
let Some(value): Option<$option_ty> = value else { - if self.options.strict { + if !self.options.safe { return Err(ArrowError::ComputeError(format!( "Failed to convert value at index {index}: conversion failed", ))); @@ -404,7 +403,7 @@ define_row_builder!( where V: VariantDecimalType, { - options: &'a CastOptions, + options: &'a CastOptions<'a>, scale: i8, }, |array| -> PrimitiveArray { array.as_primitive() }, @@ -414,7 +413,7 @@ define_row_builder!( // Decimal256 needs a two-stage conversion via i128 define_row_builder!( struct Decimal256ArrowToVariantBuilder<'a> { - options: &'a CastOptions, + options: &'a CastOptions<'a>, scale: i8, }, |array| -> arrow::array::Decimal256Array { array.as_primitive() }, @@ -426,7 +425,7 @@ define_row_builder!( define_row_builder!( struct TimestampArrowToVariantBuilder<'a, T: ArrowTimestampType> { - options: &'a CastOptions, + options: &'a CastOptions<'a>, has_time_zone: bool, }, |array| -> PrimitiveArray { array.as_primitive() }, @@ -450,7 +449,7 @@ define_row_builder!( where i64: From, { - options: &'a CastOptions, + options: &'a CastOptions<'a>, }, |array| -> PrimitiveArray { array.as_primitive() }, |value| -> Option<_> { @@ -464,7 +463,7 @@ define_row_builder!( where i64: From, { - options: &'a CastOptions, + options: &'a CastOptions<'a>, }, |array| -> PrimitiveArray { array.as_primitive() }, |value| -> Option<_> { @@ -899,7 +898,13 @@ mod tests { /// Builds a VariantArray from an Arrow array using the row builder. fn execute_row_builder_test(array: &dyn Array) -> VariantArray { - execute_row_builder_test_with_options(array, CastOptions::default()) + execute_row_builder_test_with_options( + array, + CastOptions { + safe: false, + ..Default::default() + }, + ) } /// Variant of `execute_row_builder_test` that allows specifying options @@ -925,7 +930,14 @@ mod tests { /// Generic helper function to test row builders with basic assertion patterns. /// Uses execute_row_builder_test and adds simple value comparison assertions. 
fn test_row_builder_basic(array: &dyn Array, expected_values: Vec>) { - test_row_builder_basic_with_options(array, expected_values, CastOptions::default()); + test_row_builder_basic_with_options( + array, + expected_values, + CastOptions { + safe: false, + ..Default::default() + }, + ); } /// Variant of `test_row_builder_basic` that allows specifying options @@ -1058,7 +1070,10 @@ mod tests { let run_ends = Int32Array::from(vec![2, 5, 6]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(run_array.data_type(), &run_array, &options).unwrap(); @@ -1084,7 +1099,10 @@ mod tests { let run_ends = Int32Array::from(vec![2, 4, 5]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(run_array.data_type(), &run_array, &options).unwrap(); let mut array_builder = VariantArrayBuilder::new(5); @@ -1135,7 +1153,10 @@ mod tests { let keys = Int32Array::from(vec![Some(0), None, Some(1), None, Some(2)]); let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array, &options) .unwrap(); @@ -1167,7 +1188,10 @@ mod tests { let keys = Int32Array::from(vec![0, 1, 2, 0, 1, 2]); let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array, &options) .unwrap(); @@ -1207,7 +1231,10 @@ mod tests 
{ let dict_array = DictionaryArray::::try_new(keys, Arc::new(struct_array)).unwrap(); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array, &options) .unwrap(); @@ -1302,7 +1329,10 @@ mod tests { // Slice to get just the middle element: [[3, 4, 5]] let sliced_array = list_array.slice(1, 1); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(sliced_array.data_type(), &sliced_array, &options) .unwrap(); @@ -1346,7 +1376,10 @@ mod tests { Some(arrow::buffer::NullBuffer::from(vec![true, false])), ); - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(outer_list.data_type(), &outer_list, &options) .unwrap(); @@ -1533,7 +1566,10 @@ mod tests { .unwrap(); // Test the row builder - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(union_array.data_type(), &union_array, &options) .unwrap(); @@ -1585,7 +1621,10 @@ mod tests { .unwrap(); // Test the row builder - let options = CastOptions::default(); + let options = CastOptions { + safe: false, + ..Default::default() + }; let mut row_builder = make_arrow_to_variant_row_builder(union_array.data_type(), &union_array, &options) .unwrap(); @@ -1663,7 +1702,7 @@ mod tests { Some(Variant::Null), // Overflow value becomes Variant::Null Some(Variant::from(VariantDecimal16::try_new(123, 3).unwrap())), ], - CastOptions { strict: false }, + CastOptions::default(), ); } diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index c3ffc7a42cc7..b6c968b0678d 100644 --- 
a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -16,8 +16,9 @@ // under the License. use crate::arrow_to_variant::make_arrow_to_variant_row_builder; -use crate::{CastOptions, VariantArray, VariantArrayBuilder}; +use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::Array; +use arrow::compute::CastOptions; use arrow_schema::ArrowError; /// Casts a typed arrow [`Array`] to a [`VariantArray`]. This is useful when you @@ -75,9 +76,15 @@ pub fn cast_to_variant_with_options( /// failures). /// /// This function provides backward compatibility. For non-strict behavior, -/// use [`cast_to_variant_with_options`] with `CastOptions { strict: false }`. +/// use [`cast_to_variant_with_options`] with `CastOptions { safe: true, ..Default::default() }`. pub fn cast_to_variant(input: &dyn Array) -> Result { - cast_to_variant_with_options(input, &CastOptions::default()) + cast_to_variant_with_options( + input, + &CastOptions { + safe: false, + ..Default::default() + }, + ) } #[cfg(test)] @@ -2255,14 +2262,17 @@ mod tests { } fn run_test(values: ArrayRef, expected: Vec>) { - run_test_with_options(values, expected, CastOptions { strict: false }); + run_test_with_options(values, expected, CastOptions::default()); } fn run_test_in_strict_mode( values: ArrayRef, expected: Result>, ArrowError>, ) { - let options = CastOptions { strict: true }; + let options = CastOptions { + safe: false, + ..Default::default() + }; match expected { Ok(expected) => run_test_with_options(values, expected, options), Err(_) => { diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 9b8008f58422..b05d0e023653 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -58,6 +58,5 @@ pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options}; pub use from_json::json_to_variant; pub use shred_variant::{IntoShreddingField, ShreddedSchemaBuilder, shred_variant}; 
pub use to_json::variant_to_json; -pub use type_conversion::CastOptions; pub use unshred_variant::unshred_variant; pub use variant_get::{GetOptions, variant_get}; diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 01065175653f..6a0a743c9029 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -25,19 +25,6 @@ use arrow::datatypes::{ use chrono::Timelike; use parquet_variant::{Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16}; -/// Options for controlling the behavior of `cast_to_variant_with_options`. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct CastOptions { - /// If true, return error on conversion failure. If false, insert null for failed conversions. - pub strict: bool, -} - -impl Default for CastOptions { - fn default() -> Self { - Self { strict: true } - } -} - /// Extension trait for Arrow primitive types that can extract their native value from a Variant pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType { fn from_variant(variant: &Variant<'_, '_>) -> Option; From 9b16fb3a7e1001e70e2e4195857eda6d74589ac4 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 30 Dec 2025 14:33:51 +0200 Subject: [PATCH 08/33] fix: don't generate nulls for `Decimal128` and `Decimal256` when field is non-nullable and have non-zero `null_density` (#9046) # Which issue does this PR close? N/A # Rationale for this change if decimal field is non nullable and have null density we should not have nulls in the genrated array # What changes are included in this PR? Override `null_density` for non nested and non dictionary to avoid future problems like this added assertion and test # Are these changes tested? yes # Are there any user-facing changes? 
working generate for `Decimal128` and `Decimal256` Co-authored-by: Andrew Lamb --- arrow/src/util/data_gen.rs | 160 ++++++++++++++++--------------------- 1 file changed, 71 insertions(+), 89 deletions(-) diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 89bbe4b1fbcb..023436e0a7f7 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -66,110 +66,72 @@ pub fn create_random_batch( pub fn create_random_array( field: &Field, size: usize, - null_density: f32, + mut null_density: f32, true_density: f32, ) -> Result { - // Override null density with 0.0 if the array is non-nullable - // and a primitive type in case a nested field is nullable - let primitive_null_density = match field.is_nullable() { - true => null_density, - false => 0.0, - }; + // Override nullability in case of not nested and not dictionary + // For nested we don't want to override as we want to keep the nullability for the children + // For dictionary it handle the nullability internally + if !field.data_type().is_nested() && !matches!(field.data_type(), Dictionary(_, _)) { + // Override null density with 0.0 if the array is non-nullable + null_density = match field.is_nullable() { + true => null_density, + false => 0.0, + }; + } + use DataType::*; - Ok(match field.data_type() { + let array = match field.data_type() { Null => Arc::new(NullArray::new(size)) as ArrayRef, - Boolean => Arc::new(create_boolean_array( - size, - primitive_null_density, - true_density, - )), - Int8 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - Int16 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - Int32 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - Int64 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - UInt8 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - UInt16 => Arc::new(create_primitive_array::( - size, - 
primitive_null_density, - )), - UInt32 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - UInt64 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), + Boolean => Arc::new(create_boolean_array(size, null_density, true_density)), + Int8 => Arc::new(create_primitive_array::(size, null_density)), + Int16 => Arc::new(create_primitive_array::(size, null_density)), + Int32 => Arc::new(create_primitive_array::(size, null_density)), + Int64 => Arc::new(create_primitive_array::(size, null_density)), + UInt8 => Arc::new(create_primitive_array::(size, null_density)), + UInt16 => Arc::new(create_primitive_array::(size, null_density)), + UInt32 => Arc::new(create_primitive_array::(size, null_density)), + UInt64 => Arc::new(create_primitive_array::(size, null_density)), Float16 => { return Err(ArrowError::NotYetImplemented( "Float16 is not implemented".to_string(), )); } - Float32 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), - Float64 => Arc::new(create_primitive_array::( - size, - primitive_null_density, - )), + Float32 => Arc::new(create_primitive_array::(size, null_density)), + Float64 => Arc::new(create_primitive_array::(size, null_density)), Timestamp(unit, tz) => match unit { TimeUnit::Second => Arc::new( - create_random_temporal_array::(size, primitive_null_density) + create_random_temporal_array::(size, null_density) .with_timezone_opt(tz.clone()), - ), + ) as ArrayRef, TimeUnit::Millisecond => Arc::new( - create_random_temporal_array::( - size, - primitive_null_density, - ) - .with_timezone_opt(tz.clone()), + create_random_temporal_array::(size, null_density) + .with_timezone_opt(tz.clone()), ), TimeUnit::Microsecond => Arc::new( - create_random_temporal_array::( - size, - primitive_null_density, - ) - .with_timezone_opt(tz.clone()), + create_random_temporal_array::(size, null_density) + .with_timezone_opt(tz.clone()), ), TimeUnit::Nanosecond => Arc::new( - 
create_random_temporal_array::( - size, - primitive_null_density, - ) - .with_timezone_opt(tz.clone()), + create_random_temporal_array::(size, null_density) + .with_timezone_opt(tz.clone()), ), }, Date32 => Arc::new(create_random_temporal_array::( size, - primitive_null_density, + null_density, )), Date64 => Arc::new(create_random_temporal_array::( size, - primitive_null_density, + null_density, )), Time32(unit) => match unit { TimeUnit::Second => Arc::new(create_random_temporal_array::( size, - primitive_null_density, + null_density, )) as ArrayRef, TimeUnit::Millisecond => Arc::new( - create_random_temporal_array::(size, primitive_null_density), + create_random_temporal_array::(size, null_density), ), _ => { return Err(ArrowError::InvalidArgumentError(format!( @@ -179,11 +141,11 @@ pub fn create_random_array( }, Time64(unit) => match unit { TimeUnit::Microsecond => Arc::new( - create_random_temporal_array::(size, primitive_null_density), + create_random_temporal_array::(size, null_density), ) as ArrayRef, TimeUnit::Nanosecond => Arc::new(create_random_temporal_array::( size, - primitive_null_density, + null_density, )), _ => { return Err(ArrowError::InvalidArgumentError(format!( @@ -191,24 +153,19 @@ pub fn create_random_array( ))); } }, - Utf8 => Arc::new(create_string_array::(size, primitive_null_density)), - LargeUtf8 => Arc::new(create_string_array::(size, primitive_null_density)), + Utf8 => Arc::new(create_string_array::(size, null_density)), + LargeUtf8 => Arc::new(create_string_array::(size, null_density)), Utf8View => Arc::new(create_string_view_array_with_len( size, - primitive_null_density, + null_density, 4, false, )), - Binary => Arc::new(create_binary_array::(size, primitive_null_density)), - LargeBinary => Arc::new(create_binary_array::(size, primitive_null_density)), - FixedSizeBinary(len) => Arc::new(create_fsb_array( - size, - primitive_null_density, - *len as usize, - )), + Binary => Arc::new(create_binary_array::(size, null_density)), + 
LargeBinary => Arc::new(create_binary_array::(size, null_density)), + FixedSizeBinary(len) => Arc::new(create_fsb_array(size, null_density, *len as usize)), BinaryView => Arc::new( - create_string_view_array_with_len(size, primitive_null_density, 4, false) - .to_binary_view(), + create_string_view_array_with_len(size, null_density, 4, false).to_binary_view(), ), List(_) => create_random_list_array(field, size, null_density, true_density)?, LargeList(_) => create_random_list_array(field, size, null_density, true_density)?, @@ -230,7 +187,13 @@ pub fn create_random_array( "Generating random arrays not yet implemented for {other:?}" ))); } - }) + }; + + if !field.is_nullable() { + assert_eq!(array.null_count(), 0); + } + + Ok(array) } #[inline] @@ -812,4 +775,23 @@ mod tests { assert_eq!(array.len(), size); } } + + #[test] + fn create_non_nullable_decimal_array_with_null_density() { + let size = 10; + let fields = vec![ + Field::new("a", DataType::Decimal128(10, -2), false), + Field::new("b", DataType::Decimal256(10, -2), false), + ]; + let schema = Schema::new(fields); + let schema_ref = Arc::new(schema); + let batch = create_random_batch(schema_ref.clone(), size, 0.35, 0.7).unwrap(); + + assert_eq!(batch.schema(), schema_ref); + assert_eq!(batch.num_columns(), schema_ref.fields().len()); + for array in batch.columns() { + assert_eq!(array.len(), size); + assert_eq!(array.null_count(), 0); + } + } } From 5ddddbdd7cc3ad84371d4438fa286d0e2e3401ce Mon Sep 17 00:00:00 2001 From: Lanqing Yang Date: Tue, 30 Dec 2025 21:34:19 +0900 Subject: [PATCH 09/33] Minor: avoid clone in RunArray row decoding via buffer stealing (#9052) # Which issue does this PR close? its a nitpick to replace "allocation + memcpy" with "allocation only". # Rationale for this change remove the value clone in decode path `decoded_values.push(decoded_data.clone())` and taking from decoded_data directly # What changes are included in this PR? # Are these changes tested? 
i think the current testing suite will do # Are there any user-facing changes? no Signed-off-by: lyang24 --- arrow-row/src/run.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arrow-row/src/run.rs b/arrow-row/src/run.rs index 3d962f43ada8..24eaaa18e018 100644 --- a/arrow-row/src/run.rs +++ b/arrow-row/src/run.rs @@ -134,7 +134,11 @@ pub unsafe fn decode( run_ends.push(R::Native::usize_as(idx)); } unique_row_indices.push(decoded_values.len()); - decoded_values.push(decoded_data.clone()); + let capacity = decoded_data.capacity(); + decoded_values.push(std::mem::replace( + &mut decoded_data, + Vec::with_capacity(capacity), + )); } } // Add the final run end From 9213ffd035f32b657965096bf5781d1ca1d5cf67 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 30 Dec 2025 14:36:17 +0200 Subject: [PATCH 10/33] perf: improve performance of encoding `GenericByteArray` by 8% (#9054) # Which issue does this PR close? N/A # Rationale for this change Make row conversion faster # What changes are included in this PR? created "manual" iterator over the byte array and offsets with optimizations for no nulls # Are these changes tested? Existing tests # Are there any user-facing changes? 
No --- arrow-row/src/lib.rs | 14 +++++------- arrow-row/src/variable.rs | 46 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index aa6543485fe3..3ffa71e98c30 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1644,24 +1644,22 @@ fn encode_column( } } DataType::Binary => { - variable::encode(data, offsets, as_generic_binary_array::(column).iter(), opts) + variable::encode_generic_byte_array(data, offsets, as_generic_binary_array::(column), opts) } DataType::BinaryView => { variable::encode(data, offsets, column.as_binary_view().iter(), opts) } DataType::LargeBinary => { - variable::encode(data, offsets, as_generic_binary_array::(column).iter(), opts) + variable::encode_generic_byte_array(data, offsets, as_generic_binary_array::(column), opts) } - DataType::Utf8 => variable::encode( + DataType::Utf8 => variable::encode_generic_byte_array( data, offsets, - column.as_string::().iter().map(|x| x.map(|x| x.as_bytes())), + column.as_string::(), opts, ), - DataType::LargeUtf8 => variable::encode( + DataType::LargeUtf8 => variable::encode_generic_byte_array( data, offsets, - column.as_string::() - .iter() - .map(|x| x.map(|x| x.as_bytes())), + column.as_string::(), opts, ), DataType::Utf8View => variable::encode( diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index ac2c4cb97c20..73e19b197f92 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -17,9 +17,10 @@ use crate::null_sentinel; use arrow_array::builder::BufferBuilder; +use arrow_array::types::ByteArrayType; use arrow_array::*; -use arrow_buffer::MutableBuffer; use arrow_buffer::bit_util::ceil; +use arrow_buffer::{ArrowNativeType, MutableBuffer}; use arrow_data::{ArrayDataBuilder, MAX_INLINE_VIEW_LEN}; use arrow_schema::{DataType, SortOptions}; use builder::make_view; @@ -84,6 +85,48 @@ pub fn encode<'a, I: Iterator>>( } } +/// Calls [`encode`] with optimized 
iterator for generic byte arrays +pub(crate) fn encode_generic_byte_array( + data: &mut [u8], + offsets: &mut [usize], + input_array: &GenericByteArray, + opts: SortOptions, +) { + let input_offsets = input_array.value_offsets(); + let bytes = input_array.values().as_slice(); + + if let Some(null_buffer) = input_array.nulls().filter(|x| x.null_count() > 0) { + let input_iter = + input_offsets + .windows(2) + .zip(null_buffer.iter()) + .map(|(start_end, is_valid)| { + if is_valid { + let item_range = start_end[0].as_usize()..start_end[1].as_usize(); + // SAFETY: the offsets of the input are valid by construction + // so it is ok to use unsafe here + let item = unsafe { bytes.get_unchecked(item_range) }; + Some(item) + } else { + None + } + }); + + encode(data, offsets, input_iter, opts); + } else { + // Skip null checks + let input_iter = input_offsets.windows(2).map(|start_end| { + let item_range = start_end[0].as_usize()..start_end[1].as_usize(); + // SAFETY: the offsets of the input are valid by construction + // so it is ok to use unsafe here + let item = unsafe { bytes.get_unchecked(item_range) }; + Some(item) + }); + + encode(data, offsets, input_iter, opts); + } +} + pub fn encode_null(out: &mut [u8], opts: SortOptions) -> usize { out[0] = null_sentinel(opts); 1 @@ -97,6 +140,7 @@ pub fn encode_empty(out: &mut [u8], opts: SortOptions) -> usize { 1 } +#[inline] pub fn encode_one(out: &mut [u8], val: Option<&[u8]>, opts: SortOptions) -> usize { match val { None => encode_null(out, opts), From 843bee2c21dbe40a228c37dae7ecd876d73ee5de Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Wed, 31 Dec 2025 01:13:19 +0000 Subject: [PATCH 11/33] Fix headers and empty lines in code examples (#9064) # Which issue does this PR close? - Closes https://github.com/apache/arrow-rs/issues/9063. # What changes are included in this PR? Fixed exaxmple headers and removes empty first lines # Are these changes tested? Yes # Are there any user-facing changes? 
Yes, the following doc pages will be updated: https://docs.rs/arrow/latest/arrow/compute/fn.min_boolean.html https://docs.rs/arrow/latest/arrow/compute/fn.max_boolean.html https://docs.rs/arrow/latest/arrow/compute/fn.lexsort.html https://docs.rs/arrow/latest/arrow/compute/fn.take_record_batch.html https://docs.rs/arrow/latest/arrow/compute/fn.shift.html --- arrow-arith/src/aggregate.rs | 4 ++-- arrow-ord/src/sort.rs | 3 +-- arrow-select/src/take.rs | 1 - arrow-select/src/window.rs | 1 - 4 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 0fbddbc6e6df..a043259694c1 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -332,10 +332,10 @@ fn aggregate, A: Numeric /// Returns the minimum value in the boolean array. /// +/// # Example /// ``` /// # use arrow_array::BooleanArray; /// # use arrow_arith::aggregate::min_boolean; -/// /// let a = BooleanArray::from(vec![Some(true), None, Some(false)]); /// assert_eq!(min_boolean(&a), Some(false)) /// ``` @@ -390,10 +390,10 @@ pub fn min_boolean(array: &BooleanArray) -> Option { /// Returns the maximum value in the boolean array /// +/// # Example /// ``` /// # use arrow_array::BooleanArray; /// # use arrow_arith::aggregate::max_boolean; -/// /// let a = BooleanArray::from(vec![Some(true), None, Some(false)]); /// assert_eq!(max_boolean(&a), Some(true)) /// ``` diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index fdedbbcae930..39d56f8fe9b2 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -846,7 +846,7 @@ pub struct SortColumn { /// Returns an `ArrowError::ComputeError(String)` if any of the array type is either unsupported by /// `lexsort_to_indices` or `take`. 
/// -/// Example: +/// # Example: /// /// ``` /// # use std::convert::From; @@ -855,7 +855,6 @@ pub struct SortColumn { /// # use arrow_array::types::Int64Type; /// # use arrow_array::cast::AsArray; /// # use arrow_ord::sort::{SortColumn, SortOptions, lexsort}; -/// /// let sorted_columns = lexsort(&vec![ /// SortColumn { /// values: Arc::new(PrimitiveArray::::from(vec![ diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 7f7791a07af0..1961a604d928 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -1011,7 +1011,6 @@ to_indices_reinterpret!(Int64Type, UInt64Type); /// # use arrow_array::{StringArray, Int32Array, UInt32Array, RecordBatch}; /// # use arrow_schema::{DataType, Field, Schema}; /// # use arrow_select::take::take_record_batch; -/// /// let schema = Arc::new(Schema::new(vec![ /// Field::new("a", DataType::Int32, true), /// Field::new("b", DataType::Utf8, true), diff --git a/arrow-select/src/window.rs b/arrow-select/src/window.rs index fbd145d08d9d..74f7f4a79191 100644 --- a/arrow-select/src/window.rs +++ b/arrow-select/src/window.rs @@ -29,7 +29,6 @@ use num_traits::abs; /// ``` /// # use arrow_array::Int32Array; /// # use arrow_select::window::shift; -/// /// let a: Int32Array = vec![Some(1), None, Some(4)].into(); /// /// // shift array 1 element to the right From 6afdfbbe28718e0d186318e0c8d12d29ede9875f Mon Sep 17 00:00:00 2001 From: WaterWhisperer Date: Thu, 1 Jan 2026 23:18:23 +0800 Subject: [PATCH 12/33] docs: fix misleading reserve documentation (#9076) # Which issue does this PR close? - Closes #7236 # Rationale for this change The previous documentation for `BooleanBufferBuilder::reserve` mentioned that reading new bytes is undefined behavior. However, as noted in the issue, the safe APIs only expose initialized slices, making this warning misleading. # What changes are included in this PR? - Remove misleading UB warning from `BooleanBufferBuilder::reserve`. 
- Unify documentation for the `reserve` methods to follow a consistent format. # Are these changes tested? No, because it's a doc change # Are there any user-facing changes? Yes --- arrow-buffer/src/builder/boolean.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/arrow-buffer/src/builder/boolean.rs b/arrow-buffer/src/builder/boolean.rs index 512f729fda3e..41a75ef3e2c1 100644 --- a/arrow-buffer/src/builder/boolean.rs +++ b/arrow-buffer/src/builder/boolean.rs @@ -140,7 +140,6 @@ impl BooleanBufferBuilder { /// Reserve space to at least `additional` new bits. /// Capacity will be `>= self.len() + additional`. - /// New bytes are uninitialized and reading them is undefined behavior. #[inline] pub fn reserve(&mut self, additional: usize) { let capacity = self.len + additional; From 44d4c906c6775d2809d0640e74a0d34ea328c533 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 1 Jan 2026 17:21:13 +0200 Subject: [PATCH 13/33] chore: run validation when debug assertion enabled and not only for test (#9073) # Which issue does this PR close? N/A # Rationale for this change More coverage, I might not be compiling for tests and still want this validation # What changes are included in this PR? replace test with debug assertion # Are these changes tested? No # Are there any user-facing changes? 
not API change --- arrow-row/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 3ffa71e98c30..3c63f3bd6bf2 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -892,7 +892,7 @@ impl RowConverter { // and therefore must be valid let result = unsafe { self.convert_raw(&mut rows, validate_utf8) }?; - if cfg!(test) { + if cfg!(debug_assertions) { for (i, row) in rows.iter().enumerate() { if !row.is_empty() { return Err(ArrowError::InvalidArgumentError(format!( From 49c27d67a52e696a694e27631ffec14d01fe9018 Mon Sep 17 00:00:00 2001 From: Michael Kleen Date: Sat, 3 Jan 2026 03:07:37 +0100 Subject: [PATCH 14/33] Add special implementation for zip for Utf8View/BinaryView scalars (#8963) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Which issue does this PR close? - Closes https://github.com/apache/arrow-rs/issues/8724 # Rationale for this change It's explained in the issue. # What changes are included in this PR? This adds a special implementation for Utf8View/BinaryView scalars for zip based on the design from https://github.com/apache/arrow-rs/pull/8653. It also includes tests. Benchmarks are available here: - https://github.com/apache/arrow-rs/pull/8988 # Are these changes tested? Yes. # Are there any user-facing changes? There is a new struct `ByteViewScalarImpl`.
Benchmarks System: Apple M1 Max with 10 cores on macOS 26.1 ``` group branch main ----- ------ ---- zip_8192_from_string_views size 10 and string_views size 10/non_null_scalar_vs_null_scalar/10pct_true 1.00 3.5±0.04µs ? ?/sec 37.06 128.9±1.36µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_null_scalar_vs_null_scalar/1pct_true 1.00 3.5±0.07µs ? ?/sec 35.76 125.1±1.76µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_null_scalar_vs_null_scalar/50pct_nulls 1.00 3.7±0.12µs ? ?/sec 36.91 136.8±2.17µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_null_scalar_vs_null_scalar/50pct_true 1.00 3.5±0.06µs ? ?/sec 40.30 139.9±2.11µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_null_scalar_vs_null_scalar/90pct_true 1.00 3.6±0.10µs ? ?/sec 30.57 108.5±2.62µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_null_scalar_vs_null_scalar/99pct_true 1.00 3.5±0.05µs ? ?/sec 28.40 99.8±2.12µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_null_scalar_vs_null_scalar/all_false 1.00 3.5±0.02µs ? ?/sec 36.04 127.4±3.14µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_null_scalar_vs_null_scalar/all_true 1.00 3.5±0.08µs ? ?/sec 27.39 97.1±1.11µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_nulls_scalars/10pct_true 1.00 28.2±0.37µs ? ?/sec 2.70 75.9±0.61µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_nulls_scalars/1pct_true 1.00 7.2±0.24µs ? ?/sec 9.89 71.4±12.56µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_nulls_scalars/50pct_nulls 1.00 51.0±2.97µs ? ?/sec 1.75 89.4±2.50µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_nulls_scalars/50pct_true 1.00 62.1±1.00µs ? ?/sec 1.61 99.7±4.68µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_nulls_scalars/90pct_true 1.00 28.8±0.64µs ? 
?/sec 2.63 75.7±1.22µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_nulls_scalars/99pct_true 1.00 7.7±0.11µs ? ?/sec 8.98 69.0±0.74µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_nulls_scalars/all_false 1.00 3.7±0.13µs ? ?/sec 19.06 69.8±1.55µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/non_nulls_scalars/all_true 1.00 3.6±0.10µs ? ?/sec 18.90 68.0±1.12µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/null_vs_non_null_scalar/10pct_true 1.00 3.8±0.07µs ? ?/sec 28.85 108.4±3.09µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/null_vs_non_null_scalar/1pct_true 1.00 3.8±0.09µs ? ?/sec 25.83 98.7±2.71µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/null_vs_non_null_scalar/50pct_nulls 1.00 3.9±0.06µs ? ?/sec 32.25 127.3±7.41µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/null_vs_non_null_scalar/50pct_true 1.00 3.7±0.06µs ? ?/sec 37.66 139.5±3.00µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/null_vs_non_null_scalar/90pct_true 1.00 3.8±0.16µs ? ?/sec 34.52 129.5±1.53µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/null_vs_non_null_scalar/99pct_true 1.00 3.7±0.05µs ? ?/sec 33.83 124.8±1.28µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/null_vs_non_null_scalar/all_false 1.00 3.8±0.09µs ? ?/sec 26.08 98.8±2.02µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 10/null_vs_non_null_scalar/all_true 1.00 3.8±0.08µs ? ?/sec 32.56 123.9±1.48µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_null_scalar_vs_null_scalar/10pct_true 1.00 3.6±0.06µs ? ?/sec 36.09 129.8±6.06µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_null_scalar_vs_null_scalar/1pct_true 1.00 3.6±0.35µs ? ?/sec 34.05 122.9±5.06µs ? 
?/sec zip_8192_from_string_views size 10 and string_views size 100/non_null_scalar_vs_null_scalar/50pct_nulls 1.00 3.7±0.12µs ? ?/sec 36.77 137.9±5.49µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_null_scalar_vs_null_scalar/50pct_true 1.00 3.6±0.09µs ? ?/sec 38.23 137.4±3.35µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_null_scalar_vs_null_scalar/90pct_true 1.00 3.6±0.06µs ? ?/sec 29.20 104.8±1.64µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_null_scalar_vs_null_scalar/99pct_true 1.00 3.6±0.15µs ? ?/sec 26.94 96.9±2.73µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_null_scalar_vs_null_scalar/all_false 1.00 3.6±0.05µs ? ?/sec 34.97 127.5±5.81µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_null_scalar_vs_null_scalar/all_true 1.00 3.8±1.05µs ? ?/sec 24.98 95.0±2.14µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_nulls_scalars/10pct_true 1.00 28.9±0.46µs ? ?/sec 2.69 77.7±1.57µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_nulls_scalars/1pct_true 1.00 7.3±0.09µs ? ?/sec 9.81 71.6±1.96µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_nulls_scalars/50pct_nulls 1.00 50.3±1.16µs ? ?/sec 1.74 87.7±1.14µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_nulls_scalars/50pct_true 1.00 63.5±1.44µs ? ?/sec 1.59 100.7±1.97µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_nulls_scalars/90pct_true 1.00 29.8±0.48µs ? ?/sec 2.64 78.6±2.85µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_nulls_scalars/99pct_true 1.00 8.2±0.12µs ? ?/sec 8.54 69.7±0.91µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/non_nulls_scalars/all_false 1.00 3.8±0.07µs ? ?/sec 18.77 71.6±1.51µs ? 
?/sec zip_8192_from_string_views size 10 and string_views size 100/non_nulls_scalars/all_true 1.00 3.8±0.11µs ? ?/sec 18.31 68.8±1.10µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/null_vs_non_null_scalar/10pct_true 1.00 3.8±0.07µs ? ?/sec 27.36 104.3±1.35µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/null_vs_non_null_scalar/1pct_true 1.00 3.8±0.07µs ? ?/sec 24.86 94.8±1.12µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/null_vs_non_null_scalar/50pct_nulls 1.00 4.0±0.04µs ? ?/sec 29.84 117.9±1.34µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/null_vs_non_null_scalar/50pct_true 1.00 3.9±0.21µs ? ?/sec 35.19 137.1±3.87µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/null_vs_non_null_scalar/90pct_true 1.00 3.8±0.06µs ? ?/sec 32.78 125.8±1.73µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/null_vs_non_null_scalar/99pct_true 1.00 3.8±0.11µs ? ?/sec 31.87 121.5±1.47µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/null_vs_non_null_scalar/all_false 1.00 3.8±0.07µs ? ?/sec 25.36 95.5±1.89µs ? ?/sec zip_8192_from_string_views size 10 and string_views size 100/null_vs_non_null_scalar/all_true 1.00 3.9±0.20µs ? ?/sec 30.83 121.7±3.36µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_null_scalar_vs_null_scalar/10pct_true 1.00 3.7±0.73µs ? ?/sec 35.72 132.2±6.77µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_null_scalar_vs_null_scalar/1pct_true 1.00 3.6±0.04µs ? ?/sec 35.35 125.8±2.79µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_null_scalar_vs_null_scalar/50pct_nulls 1.00 3.8±0.11µs ? ?/sec 36.05 136.0±2.59µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_null_scalar_vs_null_scalar/50pct_true 1.00 3.6±0.13µs ? ?/sec 39.36 142.5±6.32µs ? 
?/sec zip_8192_from_string_views size 100 and string_views size 100/non_null_scalar_vs_null_scalar/90pct_true 1.00 3.6±0.11µs ? ?/sec 29.63 107.5±2.03µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_null_scalar_vs_null_scalar/99pct_true 1.00 3.6±0.08µs ? ?/sec 28.40 102.2±6.74µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_null_scalar_vs_null_scalar/all_false 1.00 3.6±0.05µs ? ?/sec 34.83 126.0±2.12µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_null_scalar_vs_null_scalar/all_true 1.00 3.6±0.05µs ? ?/sec 27.38 98.6±1.62µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_nulls_scalars/10pct_true 1.00 29.9±2.79µs ? ?/sec 2.51 75.1±0.98µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_nulls_scalars/1pct_true 1.00 7.2±0.16µs ? ?/sec 9.48 68.3±1.01µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_nulls_scalars/50pct_nulls 1.00 50.5±1.90µs ? ?/sec 1.68 84.6±1.27µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_nulls_scalars/50pct_true 1.00 64.4±0.60µs ? ?/sec 1.53 98.6±1.71µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_nulls_scalars/90pct_true 1.00 29.7±0.61µs ? ?/sec 2.57 76.1±1.15µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_nulls_scalars/99pct_true 1.00 7.9±0.09µs ? ?/sec 8.89 70.5±2.13µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_nulls_scalars/all_false 1.00 3.7±0.06µs ? ?/sec 18.31 67.8±0.86µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/non_nulls_scalars/all_true 1.00 3.7±0.06µs ? ?/sec 18.35 67.9±1.16µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/null_vs_non_null_scalar/10pct_true 1.00 3.8±0.12µs ? ?/sec 28.20 107.5±2.55µs ? 
?/sec zip_8192_from_string_views size 100 and string_views size 100/null_vs_non_null_scalar/1pct_true 1.00 3.9±0.16µs ? ?/sec 25.73 99.5±2.19µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/null_vs_non_null_scalar/50pct_nulls 1.00 4.1±0.14µs ? ?/sec 29.98 122.2±2.27µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/null_vs_non_null_scalar/50pct_true 1.00 3.8±0.08µs ? ?/sec 37.05 140.1±2.01µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/null_vs_non_null_scalar/90pct_true 1.00 3.9±0.20µs ? ?/sec 33.52 131.8±3.10µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/null_vs_non_null_scalar/99pct_true 1.00 3.8±0.09µs ? ?/sec 33.55 127.6±3.56µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/null_vs_non_null_scalar/all_false 1.00 3.8±0.08µs ? ?/sec 26.47 100.8±5.55µs ? ?/sec zip_8192_from_string_views size 100 and string_views size 100/null_vs_non_null_scalar/all_true 1.00 3.9±0.06µs ? ?/sec 32.05 124.6±2.16µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_null_scalar_vs_null_scalar/10pct_true 1.00 3.6±0.40µs ? ?/sec 35.16 126.4±1.92µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_null_scalar_vs_null_scalar/1pct_true 1.00 3.5±0.07µs ? ?/sec 35.43 123.6±4.98µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_null_scalar_vs_null_scalar/50pct_nulls 1.00 3.7±0.06µs ? ?/sec 36.06 132.4±1.80µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_null_scalar_vs_null_scalar/50pct_true 1.00 3.6±0.06µs ? ?/sec 38.44 136.9±2.82µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_null_scalar_vs_null_scalar/90pct_true 1.00 3.5±0.04µs ? ?/sec 29.82 105.2±2.25µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_null_scalar_vs_null_scalar/99pct_true 1.00 3.5±0.08µs ? ?/sec 27.48 96.9±1.69µs ? 
?/sec zip_8192_from_string_views size 3 and string_views size 10/non_null_scalar_vs_null_scalar/all_false 1.00 3.6±0.12µs ? ?/sec 33.80 123.0±2.52µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_null_scalar_vs_null_scalar/all_true 1.00 3.6±0.14µs ? ?/sec 26.74 95.0±1.74µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_nulls_scalars/10pct_true 1.00 27.9±0.32µs ? ?/sec 2.65 73.9±1.31µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_nulls_scalars/1pct_true 1.00 6.9±0.09µs ? ?/sec 9.64 67.0±0.92µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_nulls_scalars/50pct_nulls 1.00 49.0±0.60µs ? ?/sec 1.73 84.7±2.45µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_nulls_scalars/50pct_true 1.00 62.4±2.22µs ? ?/sec 1.56 97.1±2.37µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_nulls_scalars/90pct_true 1.00 28.7±0.37µs ? ?/sec 2.59 74.1±1.17µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_nulls_scalars/99pct_true 1.00 7.8±0.20µs ? ?/sec 8.69 67.7±1.34µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_nulls_scalars/all_false 1.00 3.6±0.09µs ? ?/sec 18.78 68.2±2.16µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/non_nulls_scalars/all_true 1.00 3.6±0.05µs ? ?/sec 19.10 68.4±11.77µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/null_vs_non_null_scalar/10pct_true 1.00 3.8±0.21µs ? ?/sec 27.30 104.1±1.34µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/null_vs_non_null_scalar/1pct_true 1.00 3.7±0.04µs ? ?/sec 25.76 95.8±2.00µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/null_vs_non_null_scalar/50pct_nulls 1.00 4.2±0.96µs ? ?/sec 28.05 118.0±1.17µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/null_vs_non_null_scalar/50pct_true 1.00 3.9±0.13µs ? ?/sec 35.42 136.6±3.78µs ? 
?/sec zip_8192_from_string_views size 3 and string_views size 10/null_vs_non_null_scalar/90pct_true 1.00 3.8±0.10µs ? ?/sec 33.31 125.5±1.89µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/null_vs_non_null_scalar/99pct_true 1.00 3.8±0.04µs ? ?/sec 32.36 121.6±1.80µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/null_vs_non_null_scalar/all_false 1.00 3.7±0.04µs ? ?/sec 25.64 95.1±0.98µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 10/null_vs_non_null_scalar/all_true 1.00 3.9±0.07µs ? ?/sec 31.19 121.2±2.69µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_null_scalar_vs_null_scalar/10pct_true 1.00 3.5±0.04µs ? ?/sec 35.69 126.5±2.89µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_null_scalar_vs_null_scalar/1pct_true 1.00 3.6±0.05µs ? ?/sec 33.84 120.9±1.68µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_null_scalar_vs_null_scalar/50pct_nulls 1.00 3.7±0.10µs ? ?/sec 35.72 133.2±3.49µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_null_scalar_vs_null_scalar/50pct_true 1.00 3.6±0.12µs ? ?/sec 38.28 136.0±2.11µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_null_scalar_vs_null_scalar/90pct_true 1.00 3.5±0.06µs ? ?/sec 29.81 104.4±1.56µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_null_scalar_vs_null_scalar/99pct_true 1.00 3.5±0.08µs ? ?/sec 27.69 98.1±2.86µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_null_scalar_vs_null_scalar/all_false 1.00 3.6±0.10µs ? ?/sec 33.58 122.3±1.77µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_null_scalar_vs_null_scalar/all_true 1.00 3.5±0.08µs ? ?/sec 26.79 94.7±1.02µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_nulls_scalars/10pct_true 1.00 29.0±0.51µs ? ?/sec 2.59 75.1±1.08µs ? 
?/sec zip_8192_from_string_views size 3 and string_views size 100/non_nulls_scalars/1pct_true 1.00 7.4±0.10µs ? ?/sec 9.41 69.2±1.76µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_nulls_scalars/50pct_nulls 1.00 50.2±0.54µs ? ?/sec 1.70 85.2±1.17µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_nulls_scalars/50pct_true 1.00 64.1±1.59µs ? ?/sec 1.51 96.9±1.22µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_nulls_scalars/90pct_true 1.00 29.8±0.36µs ? ?/sec 2.55 75.9±2.47µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_nulls_scalars/99pct_true 1.00 8.2±0.17µs ? ?/sec 8.24 67.8±1.11µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_nulls_scalars/all_false 1.00 3.8±0.07µs ? ?/sec 17.96 68.8±1.15µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/non_nulls_scalars/all_true 1.00 3.8±0.12µs ? ?/sec 17.37 66.1±0.97µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/null_vs_non_null_scalar/10pct_true 1.00 3.8±0.27µs ? ?/sec 27.57 105.2±3.06µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/null_vs_non_null_scalar/1pct_true 1.00 3.7±0.08µs ? ?/sec 25.44 94.8±0.94µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/null_vs_non_null_scalar/50pct_nulls 1.00 3.9±0.07µs ? ?/sec 30.10 118.6±2.83µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/null_vs_non_null_scalar/50pct_true 1.00 3.9±0.30µs ? ?/sec 35.20 135.6±1.67µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/null_vs_non_null_scalar/90pct_true 1.00 3.9±0.55µs ? ?/sec 32.58 125.9±2.14µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/null_vs_non_null_scalar/99pct_true 1.00 3.8±0.36µs ? ?/sec 32.47 122.9±4.15µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 100/null_vs_non_null_scalar/all_false 1.00 3.8±0.10µs ? ?/sec 25.24 94.9±0.97µs ? 
?/sec zip_8192_from_string_views size 3 and string_views size 100/null_vs_non_null_scalar/all_true 1.00 3.8±0.09µs ? ?/sec 31.58 120.3±1.65µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_null_scalar_vs_null_scalar/10pct_true 1.00 3.5±0.04µs ? ?/sec 37.39 131.4±4.74µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_null_scalar_vs_null_scalar/1pct_true 1.00 3.5±0.09µs ? ?/sec 35.84 126.8±3.56µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_null_scalar_vs_null_scalar/50pct_nulls 1.00 3.7±0.06µs ? ?/sec 37.15 137.8±3.16µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_null_scalar_vs_null_scalar/50pct_true 1.00 3.5±0.06µs ? ?/sec 39.19 138.9±4.82µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_null_scalar_vs_null_scalar/90pct_true 1.00 3.6±0.04µs ? ?/sec 30.30 107.9±5.71µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_null_scalar_vs_null_scalar/99pct_true 1.00 3.6±0.05µs ? ?/sec 27.33 97.7±2.10µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_null_scalar_vs_null_scalar/all_false 1.00 3.6±0.06µs ? ?/sec 34.64 124.7±2.24µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_null_scalar_vs_null_scalar/all_true 1.00 3.7±0.19µs ? ?/sec 26.17 96.9±1.75µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_nulls_scalars/10pct_true 1.00 28.7±0.55µs ? ?/sec 2.66 76.2±1.45µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_nulls_scalars/1pct_true 1.00 7.2±0.12µs ? ?/sec 9.58 69.0±0.80µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_nulls_scalars/50pct_nulls 1.00 49.5±1.15µs ? ?/sec 1.75 86.8±2.09µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_nulls_scalars/50pct_true 1.00 62.6±0.88µs ? ?/sec 1.65 103.4±16.82µs ? 
?/sec zip_8192_from_string_views size 3 and string_views size 3/non_nulls_scalars/90pct_true 1.00 29.1±0.49µs ? ?/sec 2.69 78.3±2.51µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_nulls_scalars/99pct_true 1.00 7.8±0.09µs ? ?/sec 9.01 70.2±1.72µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_nulls_scalars/all_false 1.00 3.7±0.06µs ? ?/sec 18.77 68.7±0.73µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/non_nulls_scalars/all_true 1.00 3.6±0.10µs ? ?/sec 18.73 68.2±1.44µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/null_vs_non_null_scalar/10pct_true 1.00 3.9±0.11µs ? ?/sec 27.68 106.9±2.29µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/null_vs_non_null_scalar/1pct_true 1.00 3.9±0.19µs ? ?/sec 26.12 101.9±8.79µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/null_vs_non_null_scalar/50pct_nulls 1.00 4.1±0.07µs ? ?/sec 29.91 122.7±3.28µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/null_vs_non_null_scalar/50pct_true 1.00 3.8±0.14µs ? ?/sec 36.82 141.4±3.69µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/null_vs_non_null_scalar/90pct_true 1.00 3.8±0.10µs ? ?/sec 34.15 131.4±2.99µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/null_vs_non_null_scalar/99pct_true 1.00 3.8±0.06µs ? ?/sec 32.89 125.2±3.21µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/null_vs_non_null_scalar/all_false 1.00 3.8±0.06µs ? ?/sec 26.05 99.2±2.30µs ? ?/sec zip_8192_from_string_views size 3 and string_views size 3/null_vs_non_null_scalar/all_true 1.00 4.0±0.33µs ? ?/sec 32.00 126.7±25.05µs ? ?/sec ```
--- arrow-select/src/zip.rs | 341 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 337 insertions(+), 4 deletions(-) diff --git a/arrow-select/src/zip.rs b/arrow-select/src/zip.rs index e45b817dc6e8..6be034fca23d 100644 --- a/arrow-select/src/zip.rs +++ b/arrow-select/src/zip.rs @@ -19,14 +19,17 @@ use crate::filter::{SlicesIterator, prep_null_mask_filter}; use arrow_array::cast::AsArray; -use arrow_array::types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, Utf8Type}; +use arrow_array::types::{ + BinaryType, BinaryViewType, ByteArrayType, ByteViewType, LargeBinaryType, LargeUtf8Type, + StringViewType, Utf8Type, +}; use arrow_array::*; use arrow_buffer::{ BooleanBuffer, Buffer, MutableBuffer, NullBuffer, OffsetBuffer, OffsetBufferBuilder, - ScalarBuffer, + ScalarBuffer, ToByteSlice, }; -use arrow_data::ArrayData; use arrow_data::transform::MutableArrayData; +use arrow_data::{ArrayData, ByteView}; use arrow_schema::{ArrowError, DataType}; use std::fmt::{Debug, Formatter}; use std::hash::Hash; @@ -284,7 +287,12 @@ impl ScalarZipper { DataType::LargeBinary => { Arc::new(BytesScalarImpl::::new(truthy, falsy)) as Arc }, - // TODO: Handle Utf8View https://github.com/apache/arrow-rs/issues/8724 + DataType::Utf8View => { + Arc::new(ByteViewScalarImpl::::new(truthy, falsy)) as Arc + }, + DataType::BinaryView => { + Arc::new(ByteViewScalarImpl::::new(truthy, falsy)) as Arc + }, _ => { Arc::new(FallbackImpl::new(truthy, falsy)) as Arc }, @@ -657,6 +665,177 @@ fn maybe_prep_null_mask_filter(predicate: &BooleanArray) -> BooleanBuffer { } } +struct ByteViewScalarImpl { + truthy_view: Option, + truthy_buffers: Vec, + falsy_view: Option, + falsy_buffers: Vec, + phantom: PhantomData, +} + +impl ByteViewScalarImpl { + fn new(truthy: &dyn Array, falsy: &dyn Array) -> Self { + let (truthy_view, truthy_buffers) = Self::get_value_from_scalar(truthy); + let (falsy_view, falsy_buffers) = Self::get_value_from_scalar(falsy); + Self { + truthy_view, + truthy_buffers, + 
falsy_view, + falsy_buffers, + phantom: PhantomData, + } + } + + fn get_value_from_scalar(scalar: &dyn Array) -> (Option, Vec) { + if scalar.is_null(0) { + (None, vec![]) + } else { + let (views, buffers, _) = scalar.as_byte_view::().clone().into_parts(); + (views.first().copied(), buffers) + } + } + + fn get_views_for_single_non_nullable( + predicate: BooleanBuffer, + value: u128, + buffers: Vec, + ) -> (ScalarBuffer, Vec, Option) { + let number_of_true = predicate.count_set_bits(); + let number_of_values = predicate.len(); + + // Fast path for all nulls + if number_of_true == 0 { + // All values are null + return ( + vec![0; number_of_values].into(), + vec![], + Some(NullBuffer::new_null(number_of_values)), + ); + } + let bytes = vec![value; number_of_values]; + + // If value is true and we want to handle the TRUTHY case, the null buffer will have 1 (meaning not null) + // If value is false and we want to handle the FALSY case, the null buffer will have 0 (meaning null) + let nulls = NullBuffer::new(predicate); + (bytes.into(), buffers, Some(nulls)) + } + + fn get_views_for_non_nullable( + predicate: BooleanBuffer, + result_len: usize, + truthy_view: u128, + truthy_buffers: Vec, + falsy_view: u128, + falsy_buffers: Vec, + ) -> (ScalarBuffer, Vec, Option) { + let true_count = predicate.count_set_bits(); + match true_count { + 0 => { + // all values are falsy + (vec![falsy_view; result_len].into(), falsy_buffers, None) + } + n if n == predicate.len() => { + // all values are truthy + (vec![truthy_view; result_len].into(), truthy_buffers, None) + } + _ => { + let true_count = predicate.count_set_bits(); + let mut buffers: Vec = truthy_buffers.to_vec(); + + // If the falsy buffers are empty, we can use the falsy view as it is, because the value + // is completely inlined. 
Otherwise, we have non-inlined values in the buffer, and we need + // to recalculate the falsy view + let view_falsy = if falsy_buffers.is_empty() { + falsy_view + } else { + let byte_view_falsy = ByteView::from(falsy_view); + let new_index_falsy_buffers = + buffers.len() as u32 + byte_view_falsy.buffer_index; + buffers.extend(falsy_buffers); + let byte_view_falsy = + byte_view_falsy.with_buffer_index(new_index_falsy_buffers); + byte_view_falsy.as_u128() + }; + + let total_number_of_bytes = true_count * 16 + (predicate.len() - true_count) * 16; + let mut mutable = MutableBuffer::new(total_number_of_bytes); + let mut filled = 0; + + SlicesIterator::from(&predicate).for_each(|(start, end)| { + if start > filled { + let false_repeat_count = start - filled; + mutable + .repeat_slice_n_times(view_falsy.to_byte_slice(), false_repeat_count); + } + let true_repeat_count = end - start; + mutable.repeat_slice_n_times(truthy_view.to_byte_slice(), true_repeat_count); + filled = end; + }); + + if filled < predicate.len() { + let false_repeat_count = predicate.len() - filled; + mutable.repeat_slice_n_times(view_falsy.to_byte_slice(), false_repeat_count); + } + + let bytes = Buffer::from(mutable); + (bytes.into(), buffers, None) + } + } + } +} + +impl Debug for ByteViewScalarImpl { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ByteViewScalarImpl") + .field("truthy", &self.truthy_view) + .field("falsy", &self.falsy_view) + .finish() + } +} + +impl ZipImpl for ByteViewScalarImpl { + fn create_output(&self, predicate: &BooleanArray) -> Result { + let result_len = predicate.len(); + // Nulls are treated as false + let predicate = maybe_prep_null_mask_filter(predicate); + + let (views, buffers, nulls) = match (self.truthy_view, self.falsy_view) { + (Some(truthy), Some(falsy)) => Self::get_views_for_non_nullable( + predicate, + result_len, + truthy, + self.truthy_buffers.clone(), + falsy, + self.falsy_buffers.clone(), + ), + (Some(truthy), None) => 
Self::get_views_for_single_non_nullable( + predicate, + truthy, + self.truthy_buffers.clone(), + ), + (None, Some(falsy)) => { + let predicate = predicate.not(); + Self::get_views_for_single_non_nullable( + predicate, + falsy, + self.falsy_buffers.clone(), + ) + } + (None, None) => { + // All values are null + ( + vec![0; result_len].into(), + vec![], + Some(NullBuffer::new_null(result_len)), + ) + } + }; + + let result = unsafe { GenericByteViewArray::::new_unchecked(views, buffers, nulls) }; + Ok(Arc::new(result)) + } +} + #[cfg(test)] mod test { use super::*; @@ -1222,4 +1401,158 @@ mod test { ]); assert_eq!(actual, &expected); } + + #[test] + fn test_zip_kernel_scalar_strings_array_view() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["hello"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["world"])); + + let mask = BooleanArray::from(vec![true, false, true, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![ + Some("hello"), + Some("world"), + Some("hello"), + Some("world"), + ]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_binary_array_view() { + let scalar_truthy = Scalar::new(BinaryViewArray::from_iter_values(vec![b"hello"])); + let scalar_falsy = Scalar::new(BinaryViewArray::from_iter_values(vec![b"world"])); + + let mask = BooleanArray::from(vec![true, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_byte_view(); + let expected = BinaryViewArray::from_iter_values(vec![b"hello", b"world"]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_strings_array_view_with_nulls() { + let scalar_truthy = Scalar::new(StringViewArray::from_iter_values(["hello"])); + let scalar_falsy = Scalar::new(StringViewArray::new_null(1)); + + let mask = BooleanArray::from(vec![true, true, false, false, true]); + let out = zip(&mask, 
&scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = StringViewArray::from_iter(vec![ + Some("hello"), + Some("hello"), + None, + None, + Some("hello"), + ]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_strings_array_view_all_true_null() { + let scalar_truthy = Scalar::new(StringViewArray::new_null(1)); + let scalar_falsy = Scalar::new(StringViewArray::new_null(1)); + let mask = BooleanArray::from(vec![true, true]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = StringViewArray::from_iter(vec![None::, None]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_strings_array_view_all_false_null() { + let scalar_truthy = Scalar::new(StringViewArray::new_null(1)); + let scalar_falsy = Scalar::new(StringViewArray::new_null(1)); + let mask = BooleanArray::from(vec![false, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = StringViewArray::from_iter(vec![None::, None]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_string_array_view_all_true() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["hello"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["world"])); + + let mask = BooleanArray::from(vec![true, true]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![Some("hello"), Some("hello")]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_string_array_view_all_false() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["hello"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["world"])); + + let mask = BooleanArray::from(vec![false, false]); + let out = zip(&mask, 
&scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![Some("world"), Some("world")]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_strings_large_strings() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["longer than 12 bytes"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["another longer than 12 bytes"])); + + let mask = BooleanArray::from(vec![true, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![ + Some("longer than 12 bytes"), + Some("another longer than 12 bytes"), + ]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_strings_array_view_large_short_strings() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["hello"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["longer than 12 bytes"])); + + let mask = BooleanArray::from(vec![true, false, true, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![ + Some("hello"), + Some("longer than 12 bytes"), + Some("hello"), + Some("longer than 12 bytes"), + ]); + assert_eq!(actual, &expected); + } + #[test] + fn test_zip_kernel_scalar_strings_array_view_large_all_true() { + let scalar_truthy = Scalar::new(StringViewArray::from(vec!["longer than 12 bytes"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["another longer than 12 bytes"])); + + let mask = BooleanArray::from(vec![true, true]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![ + Some("longer than 12 bytes"), + Some("longer than 12 bytes"), + ]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_strings_array_view_large_all_false() { + 
let scalar_truthy = Scalar::new(StringViewArray::from(vec!["longer than 12 bytes"])); + let scalar_falsy = Scalar::new(StringViewArray::from(vec!["another longer than 12 bytes"])); + + let mask = BooleanArray::from(vec![false, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_string_view(); + let expected = StringViewArray::from(vec![ + Some("another longer than 12 bytes"), + Some("another longer than 12 bytes"), + ]); + assert_eq!(actual, &expected); + } } From 1ba902ef0fb88d06dbb1e90c59d5641bd24d15bd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 5 Jan 2026 07:02:11 -0500 Subject: [PATCH 15/33] Fix `nullif` kernel (#9087) # Which issue does this PR close? - Closes https://github.com/apache/arrow-rs/issues/9085 # Rationale for this change Fix a regression introduced in https://github.com/apache/arrow-rs/pull/8996 # What changes are included in this PR? 1. Add test coverage for nullif kernel 1. Undeprecate `bitwise_unary_op_helper` 2. Document subtle differences 3. Restore nullif kernel from https://github.com/apache/arrow-rs/pull/8996 # Are these changes tested Yes # Are there any user-facing changes? Fix (not yet released) bug --- arrow-buffer/src/buffer/boolean.rs | 1 + arrow-buffer/src/buffer/ops.rs | 49 ++++++++++--- arrow-select/src/nullif.rs | 114 ++++++++++++++++++++++++----- 3 files changed, 136 insertions(+), 28 deletions(-) diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 548401ed4201..ecd7de38c031 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -165,6 +165,7 @@ impl BooleanBuffer { /// * `op` must only apply bitwise operations /// on the relevant bits; the input `u64` may contain irrelevant bits /// and may be processed differently on different endian architectures. 
+ /// * `op` may be called with input bits outside the requested range /// * The output always has zero offset /// /// # See Also diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs index 05593504b1cf..cb0925bb2cd1 100644 --- a/arrow-buffer/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -20,7 +20,12 @@ use crate::BooleanBuffer; use crate::util::bit_util::ceil; /// Apply a bitwise operation `op` to four inputs and return the result as a Buffer. -/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +/// +/// The inputs are treated as bitmaps, meaning that offsets and length are +/// specified in number of bits. +/// +/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits +/// outside the offsets and len are set to zero out before calling `op`. pub fn bitwise_quaternary_op_helper( buffers: [&Buffer; 4], offsets: [usize; 4], @@ -60,7 +65,12 @@ where } /// Apply a bitwise operation `op` to two inputs and return the result as a Buffer. -/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +/// +/// The inputs are treated as bitmaps, meaning that offsets and length are +/// specified in number of bits. +/// +/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits +/// outside the offsets and len are set to zero out before calling `op`. pub fn bitwise_bin_op_helper( left: &Buffer, left_offset_in_bits: usize, @@ -93,21 +103,42 @@ where } /// Apply a bitwise operation `op` to one input and return the result as a Buffer. -/// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. -#[deprecated( - since = "57.2.0", - note = "use BooleanBuffer::from_bitwise_unary_op instead" -)] +/// +/// The input is treated as a bitmap, meaning that offset and length are +/// specified in number of bits. 
+/// +/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits +/// outside the offsets and len are set to zero out before calling `op`. pub fn bitwise_unary_op_helper( left: &Buffer, offset_in_bits: usize, len_in_bits: usize, - op: F, + mut op: F, ) -> Buffer where F: FnMut(u64) -> u64, { - BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, op).into_inner() + // reserve capacity and set length so we can get a typed view of u64 chunks + let mut result = + MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false); + + let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits); + + let result_chunks = result.typed_data_mut::().iter_mut(); + + result_chunks + .zip(left_chunks.iter()) + .for_each(|(res, left)| { + *res = op(left); + }); + + let remainder_bytes = ceil(left_chunks.remainder_len(), 8); + let rem = op(left_chunks.remainder_bits()); + // we are counting its starting from the least significant bit, to to_le_bytes should be correct + let rem = &rem.to_le_bytes()[0..remainder_bytes]; + result.extend_from_slice(rem); + + result.into() } /// Apply a bitwise and to two inputs and return the result as a Buffer. 
diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 211cabf7afc0..e51016f9bad3 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -19,7 +19,7 @@ use arrow_array::{Array, ArrayRef, BooleanArray, make_array}; use arrow_buffer::buffer::bitwise_bin_op_helper; -use arrow_buffer::{BooleanBuffer, NullBuffer}; +use arrow_buffer::{BooleanBuffer, NullBuffer, bitwise_unary_op_helper}; use arrow_schema::{ArrowError, DataType}; /// Returns a new array with the same values and the validity bit to false where @@ -91,13 +91,11 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { let mut null_count = 0; - let buffer = - BooleanBuffer::from_bitwise_unary_op(right.inner(), right.offset(), len, |b| { - let t = !b; - null_count += t.count_zeros() as usize; - t - }) - .into_inner(); + let buffer = bitwise_unary_op_helper(right.inner(), right.offset(), len, |b| { + let t = !b; + null_count += t.count_zeros() as usize; + t + }); (buffer, null_count) } }; @@ -122,7 +120,8 @@ mod tests { use arrow_array::{Int32Array, NullArray, StringArray, StructArray}; use arrow_data::ArrayData; use arrow_schema::{Field, Fields}; - use rand::{Rng, rng}; + use rand::prelude::StdRng; + use rand::{Rng, SeedableRng}; #[test] fn test_nullif_int_array() { @@ -494,23 +493,60 @@ mod tests { let r_data = r.to_data(); r_data.validate().unwrap(); - assert_eq!(r.as_ref(), &expected); + assert_eq!( + r.as_ref(), + &expected, + "expected nulls: {:#?}\n\n\ + result nulls: {:#?}\n\n\\ + expected values: {:#?}\n\n\ + result values: {:#?}", + expected.nulls(), + r.nulls(), + expected.values(), + r.as_primitive::().values() + ); + validate_nulls(expected.nulls()); + validate_nulls(r.nulls()); + } + + /// Ensures that the null count matches the actual number of nulls. 
+ fn validate_nulls(nulls: Option<&NullBuffer>) { + let Some(nulls) = nulls else { + return; + }; + let mut actual_null_count = 0; + for i in 0..nulls.len() { + if nulls.is_null(i) { + actual_null_count += 1; + } + } + assert_eq!(actual_null_count, nulls.null_count()); } #[test] fn nullif_fuzz() { - let mut rng = rng(); + let mut rng = StdRng::seed_from_u64(7337); let arrays = [ - Int32Array::from(vec![0; 128]), - (0..128) - .map(|_| rng.random_bool(0.5).then_some(0)) + Int32Array::from(vec![0; 1024]), // no nulls + (0..1024) // 50% nulls + .map(|_| rng.random_bool(0.5).then_some(1)) .collect(), ]; for a in arrays { - let a_slices = [(0, 128), (64, 64), (0, 64), (32, 32), (0, 0), (32, 0)]; - + let a_slices = [ + (0, 128), + (0, 129), + (64, 64), + (0, 64), + (32, 32), + (0, 0), + (32, 0), + (5, 800), + (33, 53), + (77, 101), + ]; for (a_offset, a_length) in a_slices { let a = a.slice(a_offset, a_length); @@ -518,14 +554,54 @@ mod tests { let b_start_offset = rng.random_range(0..i); let b_end_offset = rng.random_range(0..i); + // b with 50% nulls let b: BooleanArray = (0..a_length + b_start_offset + b_end_offset) .map(|_| rng.random_bool(0.5).then(|| rng.random_bool(0.5))) .collect(); - let b = b.slice(b_start_offset, a_length); - - test_nullif(&a, &b); + let b_sliced = b.slice(b_start_offset, a_length); + test_nullif(&a, &b_sliced); + + // b with no nulls (and no null buffer) + let b = remove_null_buffer(&b); + let b_sliced = b.slice(b_start_offset, a_length); + test_nullif(&a, &b_sliced); + + // b with no nulls (but with a null buffer) + let b = remove_null_values(&b); + let b_sliced = b.slice(b_start_offset, a_length); + test_nullif(&a, &b_sliced); } } } } + + /// Returns a new BooleanArray with no null buffer + fn remove_null_buffer(array: &BooleanArray) -> BooleanArray { + make_array( + array + .into_data() + .into_builder() + .nulls(None) + .build() + .unwrap(), + ) + .as_boolean() + .clone() + } + + /// Returns a new BooleanArray with a null buffer where all 
values are valid + fn remove_null_values(array: &BooleanArray) -> BooleanArray { + let len = array.len(); + let new_nulls = NullBuffer::from_iter(std::iter::repeat_n(true, len)); + make_array( + array + .into_data() + .into_builder() + .nulls(Some(new_nulls)) + .build() + .unwrap(), + ) + .as_boolean() + .clone() + } } From b8a2c1ad9ea7a1b59350735ef3c52e6397406768 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 5 Jan 2026 13:35:31 -0500 Subject: [PATCH 16/33] [parquet] Avoid a clone while resolving the read strategy (#9056) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Which issue does this PR close? - related to https://github.com/apache/datafusion/pull/19477 # Rationale for this change While working on https://github.com/apache/datafusion/pull/19477, and profiling ClickBench q7, I noticed that the RowSelectors was being cloned to resolve the strategy -- for a large number of selections this is expensive and shows up in the traces Screenshot 2025-12-28 at 4 49
49 PM ```shell samply record -- ./datafusion-cli-alamb_enable_pushdown -f q.sql > /dev/null 2>& ``` We should change the code to avoid cloning the RowSelectors when resolving the strategy. # Changes Don't clone / allocate while resolving the strategy. I don't expect this to have a massive impact, but it did show up in the profile FYI @hhhizzz -- perhaps you could review this PR # Are these changes tested? Yes by CI # Are there any user-facing changes? small performance improvement --- parquet/src/arrow/arrow_reader/read_plan.rs | 23 ++++++++++++--------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/read_plan.rs b/parquet/src/arrow/arrow_reader/read_plan.rs index 3c17a358f084..7c9eb36befe3 100644 --- a/parquet/src/arrow/arrow_reader/read_plan.rs +++ b/parquet/src/arrow/arrow_reader/read_plan.rs @@ -110,19 +110,22 @@ impl ReadPlanBuilder { None => return RowSelectionStrategy::Selectors, }; - let trimmed = selection.clone().trim(); - let selectors: Vec = trimmed.into(); - if selectors.is_empty() { - return RowSelectionStrategy::Mask; - } - - let total_rows: usize = selectors.iter().map(|s| s.row_count).sum(); - let selector_count = selectors.len(); - if selector_count == 0 { + // total_rows: total number of rows selected / skipped + // effective_count: number of non-empty selectors + let (total_rows, effective_count) = + selection.iter().fold((0usize, 0usize), |(rows, count), s| { + if s.row_count > 0 { + (rows + s.row_count, count + 1) + } else { + (rows, count) + } + }); + + if effective_count == 0 { return RowSelectionStrategy::Mask; } - if total_rows < selector_count.saturating_mul(threshold) { + if total_rows < effective_count.saturating_mul(threshold) { RowSelectionStrategy::Mask } else { RowSelectionStrategy::Selectors From a9d6e92664996a875bab2b5df60704781a396e6c Mon Sep 17 00:00:00 2001 From: Liam Bao Date: Tue, 6 Jan 2026 04:21:08 +0800 Subject: [PATCH 17/33] [Variant] Move `ArrayVariantToArrowRowBuilder` to 
`variant_to_arrow` (#9094) # Which issue does this PR close? - Part of #8082. # Rationale for this change To support `Lists/Array` in `variant_get`, it's better to move `ArrayVariantToArrowRowBuilder` from `shred_variant` to `variant_to_arrow` and be shared with `variant_get`. In the meantime, some code movement in `variant_to_arrow` would help to get a better overview of the overall implementation # What changes are included in this PR? This PR can be reviewed commit by commit: - Move `VariantToArrowRowBuilder` and related impl to the top of `variant_to_arrow` - Push the `FixedSizeList` check from `make_variant_to_shredded_variant_arrow_row_builder` down to `ArrayVariantToArrowRowBuilder` - Move `ArrayVariantToArrowRowBuilder` to `variant_to_arrow` # Are these changes tested? Covered by existing tests # Are there any user-facing changes? No --- parquet-variant-compute/src/shred_variant.rs | 203 ++-------- .../src/variant_to_arrow.rs | 360 +++++++++++++----- 2 files changed, 291 insertions(+), 272 deletions(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 45e7fc95c9f9..7f253d249dfb 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -19,19 +19,17 @@ use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; use crate::variant_to_arrow::{ - PrimitiveVariantToArrowRowBuilder, make_primitive_variant_to_arrow_row_builder, + ArrayVariantToArrowRowBuilder, PrimitiveVariantToArrowRowBuilder, + make_primitive_variant_to_arrow_row_builder, }; use crate::{VariantArray, VariantValueArrayBuilder}; -use arrow::array::{ - ArrayRef, BinaryViewArray, GenericListArray, GenericListViewArray, NullBufferBuilder, - OffsetSizeTrait, -}; -use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder}; +use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; -use 
arrow::datatypes::{ArrowNativeTypeOp, DataType, Field, FieldRef, Fields, TimeUnit}; +use arrow::datatypes::{DataType, Field, FieldRef, Fields, TimeUnit}; use arrow::error::{ArrowError, Result}; use indexmap::IndexMap; -use parquet_variant::{Variant, VariantBuilderExt, VariantList, VariantPath, VariantPathElement}; +use parquet_variant::{Variant, VariantBuilderExt, VariantPath, VariantPathElement}; use std::collections::BTreeMap; use std::sync::Arc; @@ -123,7 +121,8 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>( DataType::List(_) | DataType::LargeList(_) | DataType::ListView(_) - | DataType::LargeListView(_) => { + | DataType::LargeListView(_) + | DataType::FixedSizeList(..) => { let typed_value_builder = VariantToShreddedArrayVariantRowBuilder::try_new( data_type, cast_options, @@ -131,11 +130,6 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>( )?; VariantToShreddedVariantRowBuilder::Array(typed_value_builder) } - DataType::FixedSizeList(..) => { - return Err(ArrowError::NotYetImplemented( - "Shredding variant array values as fixed-size lists".to_string(), - )); - } // Supported shredded primitive types, see Variant shredding spec: // https://github.com/apache/parquet-format/blob/master/VariantShredding.md#shredded-value-types DataType::Boolean @@ -312,171 +306,6 @@ impl<'a> VariantToShreddedArrayVariantRowBuilder<'a> { } } -enum ArrayVariantToArrowRowBuilder<'a> { - List(VariantToListArrowRowBuilder<'a, i32, false>), - LargeList(VariantToListArrowRowBuilder<'a, i64, false>), - ListView(VariantToListArrowRowBuilder<'a, i32, true>), - LargeListView(VariantToListArrowRowBuilder<'a, i64, true>), -} - -impl<'a> ArrayVariantToArrowRowBuilder<'a> { - fn try_new( - data_type: &'a DataType, - cast_options: &'a CastOptions, - capacity: usize, - ) -> Result { - use ArrayVariantToArrowRowBuilder::*; - - // Make List/ListView builders without repeating the constructor boilerplate. - macro_rules! 
make_list_builder { - ($variant:ident, $offset:ty, $is_view:expr, $field:ident) => { - $variant(VariantToListArrowRowBuilder::<$offset, $is_view>::try_new( - $field.clone(), - $field.data_type(), - cast_options, - capacity, - )?) - }; - } - - let builder = match data_type { - DataType::List(field) => make_list_builder!(List, i32, false, field), - DataType::LargeList(field) => make_list_builder!(LargeList, i64, false, field), - DataType::ListView(field) => make_list_builder!(ListView, i32, true, field), - DataType::LargeListView(field) => make_list_builder!(LargeListView, i64, true, field), - other => { - return Err(ArrowError::InvalidArgumentError(format!( - "Casting to {other:?} is not applicable for array Variant types" - ))); - } - }; - Ok(builder) - } - - fn append_null(&mut self) { - match self { - Self::List(builder) => builder.append_null(), - Self::LargeList(builder) => builder.append_null(), - Self::ListView(builder) => builder.append_null(), - Self::LargeListView(builder) => builder.append_null(), - } - } - - fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> { - match self { - Self::List(builder) => builder.append_value(list), - Self::LargeList(builder) => builder.append_value(list), - Self::ListView(builder) => builder.append_value(list), - Self::LargeListView(builder) => builder.append_value(list), - } - } - - fn finish(self) -> Result { - match self { - Self::List(builder) => builder.finish(), - Self::LargeList(builder) => builder.finish(), - Self::ListView(builder) => builder.finish(), - Self::LargeListView(builder) => builder.finish(), - } - } -} - -struct VariantToListArrowRowBuilder<'a, O, const IS_VIEW: bool> -where - O: OffsetSizeTrait + ArrowNativeTypeOp, -{ - field: FieldRef, - offsets: Vec, - element_builder: Box>, - nulls: NullBufferBuilder, - current_offset: O, -} - -impl<'a, O, const IS_VIEW: bool> VariantToListArrowRowBuilder<'a, O, IS_VIEW> -where - O: OffsetSizeTrait + ArrowNativeTypeOp, -{ - fn try_new( - field: 
FieldRef, - element_data_type: &'a DataType, - cast_options: &'a CastOptions, - capacity: usize, - ) -> Result { - if capacity >= isize::MAX as usize { - return Err(ArrowError::ComputeError( - "Capacity exceeds isize::MAX when reserving list offsets".to_string(), - )); - } - let mut offsets = Vec::with_capacity(capacity + 1); - offsets.push(O::ZERO); - let element_builder = make_variant_to_shredded_variant_arrow_row_builder( - element_data_type, - cast_options, - capacity, - false, - )?; - Ok(Self { - field, - offsets, - element_builder: Box::new(element_builder), - nulls: NullBufferBuilder::new(capacity), - current_offset: O::ZERO, - }) - } - - fn append_null(&mut self) { - self.offsets.push(self.current_offset); - self.nulls.append_null(); - } - - fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> { - for element in list.iter() { - self.element_builder.append_value(element)?; - self.current_offset = self.current_offset.add_checked(O::ONE)?; - } - self.offsets.push(self.current_offset); - self.nulls.append_non_null(); - Ok(()) - } - - fn finish(mut self) -> Result { - let (value, typed_value, nulls) = self.element_builder.finish()?; - let element_array = - ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls); - let field = Arc::new( - self.field - .as_ref() - .clone() - .with_data_type(element_array.data_type().clone()), - ); - - if IS_VIEW { - // NOTE: `offsets` is never empty (constructor pushes an entry) - let mut sizes = Vec::with_capacity(self.offsets.len() - 1); - for i in 1..self.offsets.len() { - sizes.push(self.offsets[i] - self.offsets[i - 1]); - } - self.offsets.pop(); - let list_view_array = GenericListViewArray::::new( - field, - ScalarBuffer::from(self.offsets), - ScalarBuffer::from(sizes), - ArrayRef::from(element_array), - self.nulls.finish(), - ); - Ok(Arc::new(list_view_array)) - } else { - let list_array = GenericListArray::::new( - field, - OffsetBuffer::::new(ScalarBuffer::from(self.offsets)), - 
ArrayRef::from(element_array), - self.nulls.finish(), - ); - Ok(Arc::new(list_array)) - } - } -} - pub(crate) struct VariantToShreddedObjectVariantRowBuilder<'a> { value_builder: VariantValueArrayBuilder, typed_value_builders: IndexMap<&'a str, VariantToShreddedVariantRowBuilder<'a>>, @@ -1513,6 +1342,22 @@ mod tests { ); } + #[test] + fn test_array_shredding_as_fixed_size_list() { + let input = build_variant_array(vec![VariantRow::List(vec![ + VariantValue::from(1i64), + VariantValue::from(2i64), + VariantValue::from(3i64), + ])]); + let list_schema = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 2); + let err = shred_variant(&input, &list_schema).unwrap_err(); + assert_eq!( + err.to_string(), + "Not yet implemented: Converting unshredded variant arrays to arrow fixed-size lists" + ); + } + #[test] fn test_array_shredding_with_array_elements() { let input = build_variant_array(vec![ diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 57d9944bb527..172bd4811bc3 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -15,23 +15,117 @@ // specific language governing permissions and limitations // under the License. 
+use crate::shred_variant::{ + VariantToShreddedVariantRowBuilder, make_variant_to_shredded_variant_arrow_row_builder, +}; +use crate::type_conversion::{ + PrimitiveFromVariant, TimestampFromVariant, variant_to_unscaled_decimal, +}; +use crate::variant_array::ShreddedVariantFieldArray; +use crate::{VariantArray, VariantValueArrayBuilder}; use arrow::array::{ - ArrayRef, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewArray, BinaryViewBuilder, - BooleanBuilder, FixedSizeBinaryBuilder, LargeBinaryBuilder, LargeStringBuilder, NullArray, - NullBufferBuilder, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder, + ArrayRef, ArrowNativeTypeOp, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewArray, + BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, GenericListArray, + GenericListViewArray, LargeBinaryBuilder, LargeStringBuilder, NullArray, NullBufferBuilder, + OffsetSizeTrait, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder, }; +use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::compute::{CastOptions, DecimalCast}; use arrow::datatypes::{self, DataType, DecimalType}; use arrow::error::{ArrowError, Result}; -use parquet_variant::{Variant, VariantPath}; +use arrow_schema::{FieldRef, TimeUnit}; +use parquet_variant::{Variant, VariantList, VariantPath}; +use std::sync::Arc; -use crate::type_conversion::{ - PrimitiveFromVariant, TimestampFromVariant, variant_to_unscaled_decimal, -}; -use crate::{VariantArray, VariantValueArrayBuilder}; +/// Builder for converting variant values into strongly typed Arrow arrays. +/// +/// Useful for variant_get kernels that need to extract specific paths from variant values, possibly +/// with casting of leaf values to specific types. 
+pub(crate) enum VariantToArrowRowBuilder<'a> { + Primitive(PrimitiveVariantToArrowRowBuilder<'a>), + BinaryVariant(VariantToBinaryVariantArrowRowBuilder), -use arrow_schema::TimeUnit; -use std::sync::Arc; + // Path extraction wrapper - contains a boxed enum for any of the above + WithPath(VariantPathRowBuilder<'a>), +} + +impl<'a> VariantToArrowRowBuilder<'a> { + pub fn append_null(&mut self) -> Result<()> { + use VariantToArrowRowBuilder::*; + match self { + Primitive(b) => b.append_null(), + BinaryVariant(b) => b.append_null(), + WithPath(path_builder) => path_builder.append_null(), + } + } + + pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result { + use VariantToArrowRowBuilder::*; + match self { + Primitive(b) => b.append_value(&value), + BinaryVariant(b) => b.append_value(value), + WithPath(path_builder) => path_builder.append_value(value), + } + } + + pub fn finish(self) -> Result { + use VariantToArrowRowBuilder::*; + match self { + Primitive(b) => b.finish(), + BinaryVariant(b) => b.finish(), + WithPath(path_builder) => path_builder.finish(), + } + } +} + +pub(crate) fn make_variant_to_arrow_row_builder<'a>( + metadata: &BinaryViewArray, + path: VariantPath<'a>, + data_type: Option<&'a DataType>, + cast_options: &'a CastOptions, + capacity: usize, +) -> Result> { + use VariantToArrowRowBuilder::*; + + let mut builder = match data_type { + // If no data type was requested, build an unshredded VariantArray. 
+ None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new( + metadata.clone(), + capacity, + )), + Some(DataType::Struct(_)) => { + return Err(ArrowError::NotYetImplemented( + "Converting unshredded variant objects to arrow structs".to_string(), + )); + } + Some( + DataType::List(_) + | DataType::LargeList(_) + | DataType::ListView(_) + | DataType::LargeListView(_) + | DataType::FixedSizeList(..), + ) => { + return Err(ArrowError::NotYetImplemented( + "Converting unshredded variant arrays to arrow lists".to_string(), + )); + } + Some(data_type) => { + let builder = + make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?; + Primitive(builder) + } + }; + + // Wrap with path extraction if needed + if !path.is_empty() { + builder = WithPath(VariantPathRowBuilder { + builder: Box::new(builder), + path, + }) + }; + + Ok(builder) +} /// Builder for converting primitive variant values to Arrow arrays. It is used by both /// `VariantToArrowRowBuilder` (below) and `VariantToShreddedPrimitiveVariantRowBuilder` (in @@ -81,18 +175,6 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { BinaryView(VariantToBinaryArrowRowBuilder<'a, BinaryViewBuilder>), } -/// Builder for converting variant values into strongly typed Arrow arrays. -/// -/// Useful for variant_get kernels that need to extract specific paths from variant values, possibly -/// with casting of leaf values to specific types. 
-pub(crate) enum VariantToArrowRowBuilder<'a> { - Primitive(PrimitiveVariantToArrowRowBuilder<'a>), - BinaryVariant(VariantToBinaryVariantArrowRowBuilder), - - // Path extraction wrapper - contains a boxed enum for any of the above - WithPath(VariantPathRowBuilder<'a>), -} - impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { pub fn append_null(&mut self) -> Result<()> { use PrimitiveVariantToArrowRowBuilder::*; @@ -227,35 +309,6 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { } } -impl<'a> VariantToArrowRowBuilder<'a> { - pub fn append_null(&mut self) -> Result<()> { - use VariantToArrowRowBuilder::*; - match self { - Primitive(b) => b.append_null(), - BinaryVariant(b) => b.append_null(), - WithPath(path_builder) => path_builder.append_null(), - } - } - - pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result { - use VariantToArrowRowBuilder::*; - match self { - Primitive(b) => b.append_value(&value), - BinaryVariant(b) => b.append_value(value), - WithPath(path_builder) => path_builder.append_value(value), - } - } - - pub fn finish(self) -> Result { - use VariantToArrowRowBuilder::*; - match self { - Primitive(b) => b.finish(), - BinaryVariant(b) => b.finish(), - WithPath(path_builder) => path_builder.finish(), - } - } -} - /// Creates a row builder that converts primitive `Variant` values into the requested Arrow data type. 
pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( data_type: &'a DataType, @@ -427,53 +480,78 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( Ok(builder) } -pub(crate) fn make_variant_to_arrow_row_builder<'a>( - metadata: &BinaryViewArray, - path: VariantPath<'a>, - data_type: Option<&'a DataType>, - cast_options: &'a CastOptions, - capacity: usize, -) -> Result> { - use VariantToArrowRowBuilder::*; +pub(crate) enum ArrayVariantToArrowRowBuilder<'a> { + List(VariantToListArrowRowBuilder<'a, i32, false>), + LargeList(VariantToListArrowRowBuilder<'a, i64, false>), + ListView(VariantToListArrowRowBuilder<'a, i32, true>), + LargeListView(VariantToListArrowRowBuilder<'a, i64, true>), +} - let mut builder = match data_type { - // If no data type was requested, build an unshredded VariantArray. - None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new( - metadata.clone(), - capacity, - )), - Some(DataType::Struct(_)) => { - return Err(ArrowError::NotYetImplemented( - "Converting unshredded variant objects to arrow structs".to_string(), - )); - } - Some( - DataType::List(_) - | DataType::LargeList(_) - | DataType::ListView(_) - | DataType::LargeListView(_) - | DataType::FixedSizeList(..), - ) => { - return Err(ArrowError::NotYetImplemented( - "Converting unshredded variant arrays to arrow lists".to_string(), - )); +impl<'a> ArrayVariantToArrowRowBuilder<'a> { + pub(crate) fn try_new( + data_type: &'a DataType, + cast_options: &'a CastOptions, + capacity: usize, + ) -> Result { + use ArrayVariantToArrowRowBuilder::*; + + // Make List/ListView builders without repeating the constructor boilerplate. + macro_rules! make_list_builder { + ($variant:ident, $offset:ty, $is_view:expr, $field:ident) => { + $variant(VariantToListArrowRowBuilder::<$offset, $is_view>::try_new( + $field.clone(), + $field.data_type(), + cast_options, + capacity, + )?) 
+ }; } - Some(data_type) => { - let builder = - make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?; - Primitive(builder) + + let builder = match data_type { + DataType::List(field) => make_list_builder!(List, i32, false, field), + DataType::LargeList(field) => make_list_builder!(LargeList, i64, false, field), + DataType::ListView(field) => make_list_builder!(ListView, i32, true, field), + DataType::LargeListView(field) => make_list_builder!(LargeListView, i64, true, field), + DataType::FixedSizeList(..) => { + return Err(ArrowError::NotYetImplemented( + "Converting unshredded variant arrays to arrow fixed-size lists".to_string(), + )); + } + other => { + return Err(ArrowError::InvalidArgumentError(format!( + "Casting to {other:?} is not applicable for array Variant types" + ))); + } + }; + Ok(builder) + } + + pub(crate) fn append_null(&mut self) { + match self { + Self::List(builder) => builder.append_null(), + Self::LargeList(builder) => builder.append_null(), + Self::ListView(builder) => builder.append_null(), + Self::LargeListView(builder) => builder.append_null(), } - }; + } - // Wrap with path extraction if needed - if !path.is_empty() { - builder = WithPath(VariantPathRowBuilder { - builder: Box::new(builder), - path, - }) - }; + pub(crate) fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> { + match self { + Self::List(builder) => builder.append_value(list), + Self::LargeList(builder) => builder.append_value(list), + Self::ListView(builder) => builder.append_value(list), + Self::LargeListView(builder) => builder.append_value(list), + } + } - Ok(builder) + pub(crate) fn finish(self) -> Result { + match self { + Self::List(builder) => builder.finish(), + Self::LargeList(builder) => builder.finish(), + Self::ListView(builder) => builder.finish(), + Self::LargeListView(builder) => builder.finish(), + } + } } /// A thin wrapper whose only job is to extract a specific path from a variant value and pass the @@ -708,6 
+786,102 @@ impl<'a> VariantToUuidArrowRowBuilder<'a> { } } +pub(crate) struct VariantToListArrowRowBuilder<'a, O, const IS_VIEW: bool> +where + O: OffsetSizeTrait + ArrowNativeTypeOp, +{ + field: FieldRef, + offsets: Vec, + element_builder: Box>, + nulls: NullBufferBuilder, + current_offset: O, +} + +impl<'a, O, const IS_VIEW: bool> VariantToListArrowRowBuilder<'a, O, IS_VIEW> +where + O: OffsetSizeTrait + ArrowNativeTypeOp, +{ + fn try_new( + field: FieldRef, + element_data_type: &'a DataType, + cast_options: &'a CastOptions, + capacity: usize, + ) -> Result { + if capacity >= isize::MAX as usize { + return Err(ArrowError::ComputeError( + "Capacity exceeds isize::MAX when reserving list offsets".to_string(), + )); + } + let mut offsets = Vec::with_capacity(capacity + 1); + offsets.push(O::ZERO); + let element_builder = make_variant_to_shredded_variant_arrow_row_builder( + element_data_type, + cast_options, + capacity, + false, + )?; + Ok(Self { + field, + offsets, + element_builder: Box::new(element_builder), + nulls: NullBufferBuilder::new(capacity), + current_offset: O::ZERO, + }) + } + + fn append_null(&mut self) { + self.offsets.push(self.current_offset); + self.nulls.append_null(); + } + + fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> { + for element in list.iter() { + self.element_builder.append_value(element)?; + self.current_offset = self.current_offset.add_checked(O::ONE)?; + } + self.offsets.push(self.current_offset); + self.nulls.append_non_null(); + Ok(()) + } + + fn finish(mut self) -> Result { + let (value, typed_value, nulls) = self.element_builder.finish()?; + let element_array = + ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls); + let field = Arc::new( + self.field + .as_ref() + .clone() + .with_data_type(element_array.data_type().clone()), + ); + + if IS_VIEW { + // NOTE: `offsets` is never empty (constructor pushes an entry) + let mut sizes = Vec::with_capacity(self.offsets.len() - 1); + for i 
in 1..self.offsets.len() { + sizes.push(self.offsets[i] - self.offsets[i - 1]); + } + self.offsets.pop(); + let list_view_array = GenericListViewArray::::new( + field, + ScalarBuffer::from(self.offsets), + ScalarBuffer::from(sizes), + ArrayRef::from(element_array), + self.nulls.finish(), + ); + Ok(Arc::new(list_view_array)) + } else { + let list_array = GenericListArray::::new( + field, + OffsetBuffer::::new(ScalarBuffer::from(self.offsets)), + ArrayRef::from(element_array), + self.nulls.finish(), + ); + Ok(Arc::new(list_array)) + } + } +} + /// Builder for creating VariantArray output (for path extraction without type conversion) pub(crate) struct VariantToBinaryVariantArrowRowBuilder { metadata: BinaryViewArray, From b1dfb697babcb614040ea2ae17e842e9db557f69 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 5 Jan 2026 20:08:08 -0500 Subject: [PATCH 18/33] Fix row slice bug in Union column decoding with many columns (#9000) # Which issue does this PR close? - Closes https://github.com/apache/arrow-rs/issues/8999 # Rationale for this change This PR fixes a bug in the row-to-column conversion for Union types when multiple union columns are present in the same row converter. Previously, the row slice was being consumed entirely when decoding a union column, which prevented subsequent union columns from reading their data correctly.
The fix tracks bytes consumed per row across all union fields, this way it properly advances row slices --- arrow-row/src/lib.rs | 157 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 154 insertions(+), 3 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 3c63f3bd6bf2..307281bf9db1 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1901,12 +1901,9 @@ unsafe fn decode_column( let child_row = &row[1..]; rows_by_field[field_idx].push((idx, child_row)); - - *row = &row[row.len()..]; } let mut child_arrays: Vec = Vec::with_capacity(converters.len()); - let mut offsets = (*mode == UnionMode::Dense).then(|| Vec::with_capacity(len)); for (field_idx, converter) in converters.iter().enumerate() { @@ -1928,6 +1925,14 @@ unsafe fn decode_column( let child_array = unsafe { converter.convert_raw(&mut child_data, validate_utf8) }?; + // advance row slices by the bytes consumed + for ((row_idx, original_bytes), remaining_bytes) in + field_rows.iter().zip(child_data) + { + let consumed_length = 1 + original_bytes.len() - remaining_bytes.len(); + rows[*row_idx] = &rows[*row_idx][consumed_length..]; + } + child_arrays.push(child_array.into_iter().next().unwrap()); } UnionMode::Sparse => { @@ -1949,6 +1954,14 @@ unsafe fn decode_column( let child_array = unsafe { converter.convert_raw(&mut sparse_data, validate_utf8) }?; + + // advance row slices by the bytes consumed for rows that belong to this field + for (row_idx, child_row) in field_rows.iter() { + let remaining_len = sparse_data[*row_idx].len(); + let consumed_length = 1 + child_row.len() - remaining_len; + rows[*row_idx] = &rows[*row_idx][consumed_length..]; + } + child_arrays.push(child_array.into_iter().next().unwrap()); } } @@ -4049,6 +4062,144 @@ mod tests { assert!(rows.row(3) < rows.row(1)); } + #[test] + fn test_row_converter_roundtrip_with_many_union_columns() { + // col 1: Union(Int32, Utf8) [67, "hello"] + let fields1 = UnionFields::try_new( + vec![0, 1], + vec![ + 
Field::new("int", DataType::Int32, true), + Field::new("string", DataType::Utf8, true), + ], + ) + .unwrap(); + + let int_array1 = Int32Array::from(vec![Some(67), None]); + let string_array1 = StringArray::from(vec![None::<&str>, Some("hello")]); + let type_ids1 = vec![0i8, 1].into(); + + let union_array1 = UnionArray::try_new( + fields1.clone(), + type_ids1, + None, + vec![ + Arc::new(int_array1) as ArrayRef, + Arc::new(string_array1) as ArrayRef, + ], + ) + .unwrap(); + + // col 2: Union(Int32, Utf8) [100, "world"] + let fields2 = UnionFields::try_new( + vec![0, 1], + vec![ + Field::new("int", DataType::Int32, true), + Field::new("string", DataType::Utf8, true), + ], + ) + .unwrap(); + + let int_array2 = Int32Array::from(vec![Some(100), None]); + let string_array2 = StringArray::from(vec![None::<&str>, Some("world")]); + let type_ids2 = vec![0i8, 1].into(); + + let union_array2 = UnionArray::try_new( + fields2.clone(), + type_ids2, + None, + vec![ + Arc::new(int_array2) as ArrayRef, + Arc::new(string_array2) as ArrayRef, + ], + ) + .unwrap(); + + // create a row converter with 2 union columns + let field1 = Field::new("col1", DataType::Union(fields1, UnionMode::Sparse), true); + let field2 = Field::new("col2", DataType::Union(fields2, UnionMode::Sparse), true); + + let sort_field1 = SortField::new(field1.data_type().clone()); + let sort_field2 = SortField::new(field2.data_type().clone()); + + let converter = RowConverter::new(vec![sort_field1, sort_field2]).unwrap(); + + let rows = converter + .convert_columns(&[ + Arc::new(union_array1.clone()) as ArrayRef, + Arc::new(union_array2.clone()) as ArrayRef, + ]) + .unwrap(); + + // roundtrip + let out = converter.convert_rows(&rows).unwrap(); + + let [col1, col2] = out.as_slice() else { + panic!("expected 2 columns") + }; + + let col1 = col1.as_any().downcast_ref::().unwrap(); + let col2 = col2.as_any().downcast_ref::().unwrap(); + + for (expected, got) in [union_array1, union_array2].iter().zip([col1, col2]) { + 
assert_eq!(expected.len(), got.len()); + assert_eq!(expected.type_ids(), got.type_ids()); + + for i in 0..expected.len() { + assert_eq!(expected.value(i).as_ref(), got.value(i).as_ref()); + } + } + } + + #[test] + fn test_row_converter_roundtrip_with_one_union_column() { + let fields = UnionFields::try_new( + vec![0, 1], + vec![ + Field::new("int", DataType::Int32, true), + Field::new("string", DataType::Utf8, true), + ], + ) + .unwrap(); + + let int_array = Int32Array::from(vec![Some(67), None]); + let string_array = StringArray::from(vec![None::<&str>, Some("hello")]); + let type_ids = vec![0i8, 1].into(); + + let union_array = UnionArray::try_new( + fields.clone(), + type_ids, + None, + vec![ + Arc::new(int_array) as ArrayRef, + Arc::new(string_array) as ArrayRef, + ], + ) + .unwrap(); + + let field = Field::new("col", DataType::Union(fields, UnionMode::Sparse), true); + let sort_field = SortField::new(field.data_type().clone()); + let converter = RowConverter::new(vec![sort_field]).unwrap(); + + let rows = converter + .convert_columns(&[Arc::new(union_array.clone()) as ArrayRef]) + .unwrap(); + + // roundtrip + let out = converter.convert_rows(&rows).unwrap(); + + let [col1] = out.as_slice() else { + panic!("expected 1 column") + }; + + let col = col1.as_any().downcast_ref::().unwrap(); + assert_eq!(col.len(), union_array.len()); + assert_eq!(col.type_ids(), union_array.type_ids()); + + for i in 0..col.len() { + assert_eq!(col.value(i).as_ref(), union_array.value(i).as_ref()); + } + } + #[test] fn rows_size_should_count_for_capacity() { let row_converter = RowConverter::new(vec![SortField::new(DataType::UInt8)]).unwrap(); From 068a7e44f0881ffe542225c5dd8cb354b74a9615 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 6 Jan 2026 21:53:24 +0000 Subject: [PATCH 19/33] Add `DataType::is_decimal` (#9100) # Which issue does this PR close? 
- Closes #5163 # Rationale for this change I've implemented this function at least twice in other codebases, and `arrow-rs` now has 4 variants. # What changes are included in this PR? New public function to test if a `DataType` is any decimal variant. # Are these changes tested? Yes # Are there any user-facing changes? New function including docs. --- arrow-schema/src/datatype.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index e3f67e6ac06a..40c28649c25b 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -591,6 +591,16 @@ impl DataType { matches!(self, UInt8 | UInt16 | UInt32 | UInt64) } + /// Returns true if this type is decimal: (Decimal*). + #[inline] + pub fn is_decimal(&self) -> bool { + use DataType::*; + matches!( + self, + Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) + ) + } + /// Returns true if this type is valid as a dictionary key #[inline] pub fn is_dictionary_key_type(&self) -> bool { @@ -1168,6 +1178,15 @@ mod tests { assert!(!DataType::is_floating(&DataType::Int32)); } + #[test] + fn test_decimal() { + assert!(DataType::is_decimal(&DataType::Decimal32(4, 2))); + assert!(DataType::is_decimal(&DataType::Decimal64(4, 2))); + assert!(DataType::is_decimal(&DataType::Decimal128(4, 2))); + assert!(DataType::is_decimal(&DataType::Decimal256(4, 2))); + assert!(!DataType::is_decimal(&DataType::Float16)); + } + #[test] fn test_datatype_is_null() { assert!(DataType::is_null(&DataType::Null)); From 2507946be697d511689a8f59b6a7db45ef40854b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=9E=97=E4=BC=9F?= Date: Wed, 7 Jan 2026 06:13:31 +0800 Subject: [PATCH 20/33] Add `FlightInfo::with_endpoints` method (#9075) # Which issue does this PR close? - Closes #NNN. # Rationale for this change Add convenient method to insert all endpoints. # What changes are included in this PR? 
Add `FlightInfo::with_endpoints` method # Are these changes tested? CI. # Are there any user-facing changes? Yes. --- arrow-flight/src/lib.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 99f2a2ed462b..db900341560c 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -603,6 +603,12 @@ impl FlightInfo { self } + /// Add endpoints for fetching all data + pub fn with_endpoints(mut self, endpoints: Vec) -> Self { + self.endpoint = endpoints; + self + } + /// Add a [`FlightDescriptor`] describing what this data is pub fn with_descriptor(mut self, flight_descriptor: FlightDescriptor) -> Self { self.flight_descriptor = Some(flight_descriptor); From 10a976fc03af32d26f9e4bf4dcc420cb1a455ef0 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Wed, 7 Jan 2026 16:43:28 +0200 Subject: [PATCH 21/33] chore: increase row count and batch size for more deterministic tests (#9088) # Which issue does this PR close? - Closes #NNN. # Rationale for this change Previous benchmark is too fast to deterministically measure the performance improvement because they run only in 2-7 microsecond. # What changes are included in this PR? # Are these changes tested? # Are there any user-facing changes? 
--- arrow-json/benches/serde.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arrow-json/benches/serde.rs b/arrow-json/benches/serde.rs index 23f005cc90ab..282f2e7c76d0 100644 --- a/arrow-json/benches/serde.rs +++ b/arrow-json/benches/serde.rs @@ -22,12 +22,14 @@ use rand::{Rng, rng}; use serde::Serialize; use std::sync::Arc; +const ROWS: usize = 1 << 18; + #[allow(deprecated)] fn do_bench(c: &mut Criterion, name: &str, rows: &[R], schema: &Schema) { let schema = Arc::new(schema.clone()); c.bench_function(name, |b| { b.iter(|| { - let builder = ReaderBuilder::new(schema.clone()).with_batch_size(64); + let builder = ReaderBuilder::new(schema.clone()).with_batch_size(8192); let mut decoder = builder.build_decoder().unwrap(); decoder.serialize(rows) }) @@ -37,26 +39,26 @@ fn do_bench(c: &mut Criterion, name: &str, rows: &[R], schema: &Sc fn criterion_benchmark(c: &mut Criterion) { let mut rng = rng(); let schema = Schema::new(vec![Field::new("i32", DataType::Int32, false)]); - let v: Vec = (0..2048).map(|_| rng.random_range(0..10000)).collect(); + let v: Vec = (0..ROWS).map(|_| rng.random_range(0..10000)).collect(); do_bench(c, "small_i32", &v, &schema); - let v: Vec = (0..2048).map(|_| rng.random()).collect(); + let v: Vec = (0..ROWS).map(|_| rng.random()).collect(); do_bench(c, "large_i32", &v, &schema); let schema = Schema::new(vec![Field::new("i64", DataType::Int64, false)]); - let v: Vec = (0..2048).map(|_| rng.random_range(0..10000)).collect(); + let v: Vec = (0..ROWS).map(|_| rng.random_range(0..10000)).collect(); do_bench(c, "small_i64", &v, &schema); - let v: Vec = (0..2048) + let v: Vec = (0..ROWS) .map(|_| rng.random_range(0..i32::MAX as _)) .collect(); do_bench(c, "medium_i64", &v, &schema); - let v: Vec = (0..2048).map(|_| rng.random()).collect(); + let v: Vec = (0..ROWS).map(|_| rng.random()).collect(); do_bench(c, "large_i64", &v, &schema); let schema = Schema::new(vec![Field::new("f32", DataType::Float32, 
false)]); - let v: Vec = (0..2048).map(|_| rng.random_range(0.0..10000.)).collect(); + let v: Vec = (0..ROWS).map(|_| rng.random_range(0.0..10000.)).collect(); do_bench(c, "small_f32", &v, &schema); - let v: Vec = (0..2048).map(|_| rng.random_range(0.0..f32::MAX)).collect(); + let v: Vec = (0..ROWS).map(|_| rng.random_range(0.0..f32::MAX)).collect(); do_bench(c, "large_f32", &v, &schema); } From 2b179b805f62e927771f764f8ae92151249e9edd Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Wed, 7 Jan 2026 16:33:04 +0100 Subject: [PATCH 22/33] feat(parquet): relax type compatibility check in parquet ArrowWriter (#9099) # Which issue does this PR close? - Closes #9098. # Rationale for this change Don't require strict equality for nested fields (including inner field name/metadata), just require that nested data types are logically equivalent. # What changes are included in this PR? Use `a.equals_datatype(b)` instead of `a == b` at the start of `LevelInfoBuilder::types_compatible`. # Are these changes tested? Yes. # Are there any user-facing changes?
--- parquet/src/arrow/arrow_writer/levels.rs | 4 +- parquet/src/arrow/arrow_writer/mod.rs | 51 +++++++++++++++++++++++- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 3c283bcbe3d2..59bf6c602438 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -550,8 +550,8 @@ impl LevelInfoBuilder { /// and the other is a native array, the dictionary values must have the same type as the /// native array fn types_compatible(a: &DataType, b: &DataType) -> bool { - // if the Arrow data types are the same, the types are clearly compatible - if a == b { + // if the Arrow data types are equal, the types are deemed compatible + if a.equals_datatype(b) { return true; } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 3e3c9108d59c..6b1566a681e7 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1522,11 +1522,12 @@ fn get_fsb_array_slice( #[cfg(test)] mod tests { use super::*; + use std::collections::HashMap; use std::fs::File; - use crate::arrow::ARROW_SCHEMA_META_KEY; use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder}; + use crate::arrow::{ARROW_SCHEMA_META_KEY, PARQUET_FIELD_ID_META_KEY}; use crate::column::page::{Page, PageReader}; use crate::file::metadata::thrift::PageHeader; use crate::file::page_index::column_index::ColumnIndexMetaData; @@ -1539,7 +1540,7 @@ mod tests { use arrow::util::data_gen::create_random_array; use arrow::util::pretty::pretty_format_batches; use arrow::{array::*, buffer::Buffer}; - use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer, i256}; + use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer, OffsetBuffer, i256}; use arrow_schema::Fields; use half::f16; use num_traits::{FromPrimitive, ToPrimitive}; @@ -3323,6 +3324,52 @@ mod tests { 
BinaryViewArray::from_iter_values(vec![b"barquet"]), LargeBinaryArray::from_iter_values(vec![b"parquet", b"barquet"]), ); + + // check compatibility for list types + + let list_field_metadata = HashMap::from_iter(vec![( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )]); + let list_field = Field::new_list_field(DataType::Int32, false); + + let values1 = Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4])); + let offsets1 = OffsetBuffer::new(vec![0, 2, 5].into()); + + let values2 = Arc::new(Int32Array::from(vec![5, 6, 7, 8, 9])); + let offsets2 = OffsetBuffer::new(vec![0, 3, 5].into()); + + let values_expected = Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9])); + let offsets_expected = OffsetBuffer::new(vec![0, 2, 5, 8, 10].into()); + + ensure_compatible_write( + // when the initial schema has the metadata ... + ListArray::try_new( + Arc::new( + list_field + .clone() + .with_metadata(list_field_metadata.clone()), + ), + offsets1, + values1, + None, + ) + .unwrap(), + // ... and some intermediate schema doesn't have the metadata + ListArray::try_new(Arc::new(list_field.clone()), offsets2, values2, None).unwrap(), + // ... the write will still go through, and the resulting schema will inherit the initial metadata + ListArray::try_new( + Arc::new( + list_field + .clone() + .with_metadata(list_field_metadata.clone()), + ), + offsets_expected, + values_expected, + None, + ) + .unwrap(), + ); } #[test] From 721f373fba7493c873a64421e32f1f67114ac130 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 7 Jan 2026 16:48:58 +0000 Subject: [PATCH 23/33] Seal Array trait (#9092) # Which issue does this PR close? - Closes https://github.com/apache/arrow-rs/issues/9106 # Rationale for this change This trait is not meant to be overridden, and doing so will break many kernels in sometimes subtle ways. # What changes are included in this PR? 
Seals the Array trait to prevent implementation outside of arrow-array. # Are these changes tested? # Are there any user-facing changes? --------- Co-authored-by: Andrew Lamb --- arrow-array/src/array/boolean_array.rs | 2 ++ arrow-array/src/array/byte_array.rs | 2 ++ arrow-array/src/array/byte_view_array.rs | 2 ++ arrow-array/src/array/dictionary_array.rs | 4 ++++ arrow-array/src/array/fixed_size_binary_array.rs | 2 ++ arrow-array/src/array/fixed_size_list_array.rs | 2 ++ arrow-array/src/array/list_array.rs | 2 ++ arrow-array/src/array/list_view_array.rs | 2 ++ arrow-array/src/array/map_array.rs | 2 ++ arrow-array/src/array/mod.rs | 14 +++++++++++++- arrow-array/src/array/null_array.rs | 2 ++ arrow-array/src/array/primitive_array.rs | 2 ++ arrow-array/src/array/run_array.rs | 4 ++++ arrow-array/src/array/struct_array.rs | 2 ++ arrow-array/src/array/union_array.rs | 2 ++ 15 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 530121ea7853..acea680ae374 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -286,6 +286,8 @@ impl BooleanArray { } } +impl super::private::Sealed for BooleanArray {} + impl Array for BooleanArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index fbd8458846fc..bd85bffcfe44 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -462,6 +462,8 @@ impl std::fmt::Debug for GenericByteArray { } } +impl super::private::Sealed for GenericByteArray {} + impl Array for GenericByteArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index f677c4ae6757..b31c76ab5a27 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -854,6 +854,8 @@ impl Debug for 
GenericByteViewArray { } } +impl super::private::Sealed for GenericByteViewArray {} + impl Array for GenericByteViewArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 5243218392f6..be7703b13c5c 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -697,6 +697,8 @@ impl<'a, T: ArrowDictionaryKeyType> FromIterator<&'a str> for DictionaryArray } } +impl super::private::Sealed for DictionaryArray {} + impl Array for DictionaryArray { fn as_any(&self) -> &dyn Any { self @@ -856,6 +858,8 @@ impl<'a, K: ArrowDictionaryKeyType, V> TypedDictionaryArray<'a, K, V> { } } +impl super::private::Sealed for TypedDictionaryArray<'_, K, V> {} + impl Array for TypedDictionaryArray<'_, K, V> { fn as_any(&self) -> &dyn Any { self.dictionary diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index d13cecb18027..b94e168cfe7c 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -602,6 +602,8 @@ impl std::fmt::Debug for FixedSizeBinaryArray { } } +impl super::private::Sealed for FixedSizeBinaryArray {} + impl Array for FixedSizeBinaryArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index fca92a64812c..3d5e8a0787c2 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -462,6 +462,8 @@ impl From for ArrayData { } } +impl super::private::Sealed for FixedSizeListArray {} + impl Array for FixedSizeListArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 32add1abf557..225be14ae365 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ 
-525,6 +525,8 @@ impl GenericListArray { } } +impl super::private::Sealed for GenericListArray {} + impl Array for GenericListArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 867dcf955be7..52c88d581d20 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -415,6 +415,8 @@ impl ArrayAccessor for &GenericListViewArray super::private::Sealed for GenericListViewArray {} + impl Array for GenericListViewArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index b5e611a92b57..86608d586f34 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -361,6 +361,8 @@ impl MapArray { } } +impl super::private::Sealed for MapArray {} + impl Array for MapArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index bb114be95045..75e32d57e89c 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -78,8 +78,18 @@ pub use list_view_array::*; use crate::iterator::ArrayIter; +mod private { + /// Private marker trait to ensure [`super::Array`] can not be implemented outside this crate + pub trait Sealed {} + + impl Sealed for &T {} +} + /// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) -pub trait Array: std::fmt::Debug + Send + Sync { +/// +/// This trait is sealed as it is not intended for custom array types, rather only +/// those defined in this crate. +pub trait Array: std::fmt::Debug + Send + Sync + private::Sealed { /// Returns the array as [`Any`] so that it can be /// downcasted to a specific implementation. 
/// @@ -341,6 +351,8 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// A reference-counted reference to a generic `Array` pub type ArrayRef = Arc; +impl private::Sealed for ArrayRef {} + /// Ergonomics: Allow use of an ArrayRef as an `&dyn Array` impl Array for ArrayRef { fn as_any(&self) -> &dyn Any { diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index 72556a92a3bc..b682466b6738 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -76,6 +76,8 @@ impl NullArray { } } +impl super::private::Sealed for NullArray {} + impl Array for NullArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index e71f4d47193f..457c2428145e 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1190,6 +1190,8 @@ impl From> for ArrayData { } } +impl super::private::Sealed for PrimitiveArray {} + impl Array for PrimitiveArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index ddc99f8e172d..5254a0ed3cdc 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -260,6 +260,8 @@ impl From> for ArrayData { } } +impl super::private::Sealed for RunArray {} + impl Array for RunArray { fn as_any(&self) -> &dyn Any { self @@ -519,6 +521,8 @@ impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> { } } +impl super::private::Sealed for TypedRunArray<'_, R, V> {} + impl Array for TypedRunArray<'_, R, V> { fn as_any(&self) -> &dyn Any { self.run_array diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 5b18bd35d026..6ad1ead0d250 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -401,6 +401,8 @@ impl TryFrom> for StructArray { } } +impl super::private::Sealed for StructArray {} + 
impl Array for StructArray { fn as_any(&self) -> &dyn Any { self diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 934107d075f7..e08542bc8638 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -738,6 +738,8 @@ impl From for ArrayData { } } +impl super::private::Sealed for UnionArray {} + impl Array for UnionArray { fn as_any(&self) -> &dyn Any { self From a8346be02a240788b1246d847bc8dfec21274306 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 Jan 2026 12:06:24 -0500 Subject: [PATCH 24/33] Minor: make it clear cache array reader is not cloning arrays (#9057) # Which issue does this PR close? - related to https://github.com/apache/datafusion/pull/19477 # Rationale for this change I am tracking down allocations in various parts of the parquet reader (to remove them) and I ran across this in the cached reader. # What changes are included in this PR? Use `Arc::clone` to make it clear there is no deep cloning going on . I don't expect this will have any impact on actual performance # Are these changes tested? By CI # Are there any user-facing changes? 
--- parquet/src/arrow/array_reader/cached_array_reader.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/array_reader/cached_array_reader.rs b/parquet/src/arrow/array_reader/cached_array_reader.rs index a2fa0e903599..b55b1e1d1a65 100644 --- a/parquet/src/arrow/array_reader/cached_array_reader.rs +++ b/parquet/src/arrow/array_reader/cached_array_reader.rs @@ -201,7 +201,7 @@ impl ArrayReader for CachedArrayReader { // Check local cache first let cached = if let Some(array) = self.local_cache.get(&batch_id) { - Some(array.clone()) + Some(Arc::clone(array)) } else { // If not in local cache, i.e., we are consumer, check shared cache let cache_content = self @@ -211,7 +211,7 @@ impl ArrayReader for CachedArrayReader { .get(self.column_idx, batch_id); if let Some(array) = cache_content.as_ref() { // Store in local cache for later use in consume_batch - self.local_cache.insert(batch_id, array.clone()); + self.local_cache.insert(batch_id, Arc::clone(array)); } cache_content }; From 28f66f94e0e7f0e2b27686207a91e40b60e3b957 Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Wed, 7 Jan 2026 17:40:51 +0000 Subject: [PATCH 25/33] Add Union encoding documentation (#9102) # Which issue does this PR close? - Closes https://github.com/apache/arrow-rs/issues/9084. # What changes are included in this PR? Documentation on union types encoding in https://arrow.apache.org/rust/arrow_row/struct.RowConverter.html. # Are these changes tested? Yes. # Are there any user-facing changes? Yes. https://arrow.apache.org/rust/arrow_row/struct.RowConverter.html will get updated. 
--------- Co-authored-by: Andrew Lamb --- arrow-row/src/lib.rs | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 307281bf9db1..4cafbc2748ee 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -415,6 +415,41 @@ mod variable; /// ///``` /// +/// ## Union Encoding +/// +/// A union value is encoded as a single type-id byte followed by the row encoding of the selected child value. +/// The type-id byte is always present; union arrays have no top-level null marker, so nulls are represented by the child encoding. +/// +/// For example, given a union of Int32 (type_id = 0) and Utf8 (type_id = 1): +/// +/// ```text +/// ┌──┬──────────────┐ +/// 3 │00│01│80│00│00│03│ +/// └──┴──────────────┘ +/// │ └─ signed integer encoding (non-null) +/// └──── type_id +/// +/// ┌──┬────────────────────────────────┐ +/// "abc" │01│02│'a'│'b'│'c'│00│00│00│00│00│03│ +/// └──┴────────────────────────────────┘ +/// │ └─ string encoding (non-null) +/// └──── type_id +/// +/// ┌──┬──────────────┐ +/// null Int32 │00│00│00│00│00│00│ +/// └──┴──────────────┘ +/// │ └─ signed integer encoding (null) +/// └──── type_id +/// +/// ┌──┬──┐ +/// null Utf8 │01│00│ +/// └──┴──┘ +/// │ └─ string encoding (null) +/// └──── type_id +/// ``` +/// +/// See [`UnionArray`] for more details on union types. +/// /// # Ordering /// /// ## Float Ordering @@ -431,6 +466,12 @@ mod variable; /// The encoding described above will order nulls first, this can be inverted by representing /// nulls as `0xFF_u8` instead of `0_u8` /// +/// ## Union Ordering +/// +/// Values of the same type are ordered according to the ordering of that type. +/// Values of different types are ordered by their type id. +/// The type_id is negated when descending order is specified. 
+/// /// ## Reverse Column Ordering /// /// The order of a given column can be reversed by negating the encoded bytes of non-null values From 9e822e050016e659028741499388d454a74cf23c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 Jan 2026 13:04:39 -0500 Subject: [PATCH 26/33] Update version to `57.2.0`, add CHANGELOG (#9103) # Which issue does this PR close? - Part of https://github.com/apache/arrow-rs/issues/8465 # Rationale for this change - See https://github.com/apache/arrow-rs/issues/8465 # What changes are included in this PR? 1. Update version to 57.2.0 2. Add CHANGELOG. See rendered version https://github.com/alamb/arrow-rs/blob/alamb/prepare_57.2.0/CHANGELOG.md # Are these changes tested? By CI and I am testing them manually in DataFusion here: - https://github.com/apache/datafusion/pull/19355 # Are there any user-facing changes? Version and changelog --- CHANGELOG-old.md | 167 ++++++++++++++++++ CHANGELOG.md | 292 ++++++++++++++++--------------- Cargo.toml | 42 ++--- dev/release/update_change_log.sh | 4 +- 4 files changed, 337 insertions(+), 168 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 0aa159fa7993..a651a860f893 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -20,6 +20,173 @@ # Historical Changelog +## [57.1.0](https://github.com/apache/arrow-rs/tree/57.1.0) (2025-11-20) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/57.0.0...57.1.0) + +**Implemented enhancements:** + +- Eliminate bound checks in filter kernels [\#8865](https://github.com/apache/arrow-rs/issues/8865) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Respect page index policy option for ParquetObjectReader when it's not skip [\#8856](https://github.com/apache/arrow-rs/issues/8856) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Speed up collect\_bool and remove `unsafe` [\#8848](https://github.com/apache/arrow-rs/issues/8848) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Error reading 
parquet FileMetaData with empty lists encoded as element-type=0 [\#8826](https://github.com/apache/arrow-rs/issues/8826) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- ValueStatistics methods can't be used from generic context in external crate [\#8823](https://github.com/apache/arrow-rs/issues/8823) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Custom Pretty-Printing Implementation for Column when Formatting Record Batches [\#8821](https://github.com/apache/arrow-rs/issues/8821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet-concat: supports bloom filter and page index [\#8804](https://github.com/apache/arrow-rs/issues/8804) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- \[Parquet\] virtual row number support [\#7299](https://github.com/apache/arrow-rs/issues/7299) +- \[Variant\] Enforce shredded-type validation in `shred_variant` [\#8795](https://github.com/apache/arrow-rs/issues/8795) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Simplify decision logic to call `FilterBuilder::optimize` or not [\#8781](https://github.com/apache/arrow-rs/issues/8781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Add variant to arrow for DataType::{Binary, LargeBinary, BinaryView} [\#8767](https://github.com/apache/arrow-rs/issues/8767) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Provide algorithm that allows zipping arrays whose values are not prealigned [\#8752](https://github.com/apache/arrow-rs/issues/8752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Parquet\] ParquetMetadataReader decodes too much metadata under point-get scenerio [\#8751](https://github.com/apache/arrow-rs/issues/8751) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `arrow-json` supports encoding binary arrays, but not decoding [\#8736](https://github.com/apache/arrow-rs/issues/8736) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Allow `FilterPredicate` instances to be reused for RecordBatches [\#8692](https://github.com/apache/arrow-rs/issues/8692) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- ArrowJsonBatch::from\_batch is incomplete [\#8684](https://github.com/apache/arrow-rs/issues/8684) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet-layout: More info about layout including footer size, page index, bloom filter? [\#8682](https://github.com/apache/arrow-rs/issues/8682) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Rewrite `ParquetRecordBatchStream` \(async API\) in terms of the PushDecoder [\#8677](https://github.com/apache/arrow-rs/issues/8677) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- \[JSON\] Add encoding for binary view [\#8674](https://github.com/apache/arrow-rs/issues/8674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Refactor arrow-cast decimal casting to unify the rescale logic used in Parquet variant casts [\#8670](https://github.com/apache/arrow-rs/issues/8670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Support Uuid/`FixedSizeBinary(16)` shredding [\#8665](https://github.com/apache/arrow-rs/issues/8665) +- \[Parquet\]There should be an encoding counter to know how many encodings the repo supports in total [\#8662](https://github.com/apache/arrow-rs/issues/8662) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Improve `parse_data_type` for `List`, `ListView`, `LargeList`, `LargeListView`, `FixedSizeList`, `Union`, `Map`, `RunEndCoded`. 
[\#8648](https://github.com/apache/arrow-rs/issues/8648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Support variant to arrow primitive support null/time/decimal\_\* [\#8637](https://github.com/apache/arrow-rs/issues/8637) +- Return error from `RleDecoder::reset` rather than panic [\#8632](https://github.com/apache/arrow-rs/issues/8632) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add bitwise ops on `BooleanBufferBuilder` and `MutableBuffer` that mutate directly the buffer [\#8618](https://github.com/apache/arrow-rs/issues/8618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Add variant\_to\_arrow Utf-8, LargeUtf8, Utf8View types support [\#8567](https://github.com/apache/arrow-rs/issues/8567) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Regression: Parsing `List(Int64)` results in nullable list in 57.0.0 and a non-nullable list in 57.1.0 [\#8883](https://github.com/apache/arrow-rs/issues/8883) +- Regression: FixedSlizeList data type parsing fails on 57.1.0 [\#8880](https://github.com/apache/arrow-rs/issues/8880) +- \(dyn ArrayFormatterFactory + 'static\) can't be safely shared between threads [\#8875](https://github.com/apache/arrow-rs/issues/8875) +- RowNumber reader has wrong row group ordering [\#8864](https://github.com/apache/arrow-rs/issues/8864) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `ThriftMetadataWriter::write_column_indexes` cannot handle a `ColumnIndexMetaData::NONE` [\#8815](https://github.com/apache/arrow-rs/issues/8815) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- "Archery test With other arrows" Integration test failing on main: [\#8813](https://github.com/apache/arrow-rs/issues/8813) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Parquet\] Writing in 57.0.0 seems 10% slower than 56.0.0 [\#8783](https://github.com/apache/arrow-rs/issues/8783) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet reader cannot handle files with unknown logical types [\#8776](https://github.com/apache/arrow-rs/issues/8776) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- zip now treats nulls as false in provided mask regardless of the underlying bit value [\#8721](https://github.com/apache/arrow-rs/issues/8721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[avro\] Incorrect version in crate.io landing page [\#8691](https://github.com/apache/arrow-rs/issues/8691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Array: ViewType gc\(\) has bug when array sum length exceed i32::MAX [\#8681](https://github.com/apache/arrow-rs/issues/8681) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet 56: encounter `error: item_reader def levels are None` when reading nested field with row filter [\#8657](https://github.com/apache/arrow-rs/issues/8657) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Degnerate and non-nullable `FixedSizeListArray`s are not handled [\#8623](https://github.com/apache/arrow-rs/issues/8623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Parquet\]Performance Degradation with RowFilter on Unsorted Columns due to Fragmented ReadPlan [\#8565](https://github.com/apache/arrow-rs/issues/8565) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Documentation updates:** + +- docs: Add example for creating a `MutableBuffer` from `Buffer` [\#8853](https://github.com/apache/arrow-rs/pull/8853) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- docs: Add examples for creating MutableBuffer from Vec [\#8852](https://github.com/apache/arrow-rs/pull/8852) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve ParquetDecoder docs [\#8802](https://github.com/apache/arrow-rs/pull/8802) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Update docs for zero copy conversion of ScalarBuffer [\#8772](https://github.com/apache/arrow-rs/pull/8772) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add example to convert `PrimitiveArray` to a `Vec` [\#8771](https://github.com/apache/arrow-rs/pull/8771) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- docs: Add links for arrow-avro [\#8770](https://github.com/apache/arrow-rs/pull/8770) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- \[Parquet\] Minor: Update comments in page decompressor [\#8764](https://github.com/apache/arrow-rs/pull/8764) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Document limitations of the `arrow_integration_test` crate [\#8738](https://github.com/apache/arrow-rs/pull/8738) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phil-opp](https://github.com/phil-opp)) +- docs: Add link to the Arrow implementation status page [\#8732](https://github.com/apache/arrow-rs/pull/8732) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- docs: Update Parquet readme implementation status [\#8731](https://github.com/apache/arrow-rs/pull/8731) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) + +**Performance improvements:** + +- `RowConverter::from_binary` should opportunistically take ownership of the buffer [\#8685](https://github.com/apache/arrow-rs/issues/8685) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Speed up filter some more \(up to 2x\) [\#8868](https://github.com/apache/arrow-rs/pull/8868) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Speed up 
`collect_bool` and remove `unsafe`, optimize `take_bits`, `take_native` for null values [\#8849](https://github.com/apache/arrow-rs/pull/8849) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Change `BooleanBuffer::append_packed_range` to use `apply_bitwise_binary_op` [\#8812](https://github.com/apache/arrow-rs/pull/8812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- \[Parquet\] Avoid copying `LogicalType` in `ColumnOrder::get_sort_order`, deprecate `get_logical_type` [\#8789](https://github.com/apache/arrow-rs/pull/8789) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- perf: Speed up Parquet file writing \(10%, back to speed of 56\) [\#8786](https://github.com/apache/arrow-rs/pull/8786) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- perf: override `ArrayIter` default impl for `nth`, `nth_back`, `last` and `count` [\#8785](https://github.com/apache/arrow-rs/pull/8785) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- \[Parquet\] Reduce one copy in `SerializedPageReader` [\#8745](https://github.com/apache/arrow-rs/pull/8745) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao)) +- Small optimization in Parquet varint decoder [\#8742](https://github.com/apache/arrow-rs/pull/8742) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- perf: override `count`, `nth`, `nth_back`, `last` and `max` for BitIterator [\#8696](https://github.com/apache/arrow-rs/pull/8696) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Add `FilterPredicate::filter_record_batch` [\#8693](https://github.com/apache/arrow-rs/pull/8693) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pepijnve](https://github.com/pepijnve)) +- perf: zero-copy path in `RowConverter::from_binary` [\#8686](https://github.com/apache/arrow-rs/pull/8686) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mzabaluev](https://github.com/mzabaluev)) +- perf: add optimized zip implementation for scalars [\#8653](https://github.com/apache/arrow-rs/pull/8653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- feat: add `apply_unary_op` and `apply_binary_op` bitwise operations [\#8619](https://github.com/apache/arrow-rs/pull/8619) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- \[Parquet\]Optimize the performance in record reader [\#8607](https://github.com/apache/arrow-rs/pull/8607) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hhhizzz](https://github.com/hhhizzz)) + +**Closed issues:** + +- Variant to NullType conversion ignores strict casting [\#8810](https://github.com/apache/arrow-rs/issues/8810) +- Unify display representation for `Field` [\#8784](https://github.com/apache/arrow-rs/issues/8784) +- Misleading configuration name: skip\_arrow\_metadata [\#8780](https://github.com/apache/arrow-rs/issues/8780) +- Inconsistent display for types with Metadata [\#8761](https://github.com/apache/arrow-rs/issues/8761) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Internal `arrow-integration-test` crate is linked from `arrow` docs [\#8739](https://github.com/apache/arrow-rs/issues/8739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add benchmark for RunEndEncoded casting [\#8709](https://github.com/apache/arrow-rs/issues/8709) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Varaint\] Support `VariantArray::value` to return a `Result` [\#8672](https://github.com/apache/arrow-rs/issues/8672) + +**Merged pull requests:** + +- Fix regression 
caused by changes in Display for DataType - display \(`List(non-null Int64)` instead of `List(nullable Int64)` [\#8890](https://github.com/apache/arrow-rs/pull/8890) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([etseidl](https://github.com/etseidl)) +- Support parsing for old style FixedSizeList [\#8882](https://github.com/apache/arrow-rs/pull/8882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Make ArrayFormatterFactory Send + Sync and add a test [\#8878](https://github.com/apache/arrow-rs/pull/8878) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) +- Make `ArrowReaderOptions::with_virtual_columns` error rather than panic on invalid input [\#8867](https://github.com/apache/arrow-rs/pull/8867) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Fix errors when reading nested Lists with pushdown predicates. 
[\#8866](https://github.com/apache/arrow-rs/pull/8866) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Fix `RowNumberReader` when not all row groups are selected [\#8863](https://github.com/apache/arrow-rs/pull/8863) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([vustef](https://github.com/vustef)) +- Respect page index policy option for ParquetObjectReader when it's not skip [\#8857](https://github.com/apache/arrow-rs/pull/8857) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zhuqi-lucas](https://github.com/zhuqi-lucas)) +- build\(deps\): update apache-avro requirement from 0.20.0 to 0.21.0 [\#8832](https://github.com/apache/arrow-rs/pull/8832) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Allow Users to Provide Custom `ArrayFormatter`s when Pretty-Printing Record Batches [\#8829](https://github.com/apache/arrow-rs/pull/8829) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) +- Allow reading of improperly constructed empty lists in Parquet metadata [\#8827](https://github.com/apache/arrow-rs/pull/8827) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- \[Variant\] Fix cast logic for Variant to Arrow for DataType::Null [\#8825](https://github.com/apache/arrow-rs/pull/8825) ([klion26](https://github.com/klion26)) +- remove T: ParquetValueType bound on ValueStatistics [\#8824](https://github.com/apache/arrow-rs/pull/8824) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([pmarks](https://github.com/pmarks)) +- build\(deps\): update lz4\_flex requirement from 0.11 to 0.12 [\#8820](https://github.com/apache/arrow-rs/pull/8820) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix bug in handling of empty Parquet page index structures [\#8817](https://github.com/apache/arrow-rs/pull/8817) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Parquet-concat: supports page index and bloom filter [\#8811](https://github.com/apache/arrow-rs/pull/8811) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mapleFU](https://github.com/mapleFU)) +- \[Doc\] Correct `ListArray` documentation [\#8803](https://github.com/apache/arrow-rs/pull/8803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liamzwbao](https://github.com/liamzwbao)) +- \[Parquet\] Add additional docs for `ArrowReaderOptions` and `ArrowReaderMetadata` [\#8798](https://github.com/apache/arrow-rs/pull/8798) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- \[Variant\] Enforce shredded-type validation in `shred_variant` [\#8796](https://github.com/apache/arrow-rs/pull/8796) ([liamzwbao](https://github.com/liamzwbao)) +- Add `VariantPath::is_empty` [\#8791](https://github.com/apache/arrow-rs/pull/8791) ([friendlymatthew](https://github.com/friendlymatthew)) +- Add FilterBuilder::is\_optimize\_beneficial [\#8782](https://github.com/apache/arrow-rs/pull/8782) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pepijnve](https://github.com/pepijnve)) +- \[Parquet\] Allow reading of files with unknown logical types [\#8777](https://github.com/apache/arrow-rs/pull/8777) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- bench: add `ArrayIter` benchmarks [\#8774](https://github.com/apache/arrow-rs/pull/8774) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Update Rust toolchain to 1.91 [\#8769](https://github.com/apache/arrow-rs/pull/8769) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- \[Variant\] Add variant to arrow for `DataType::{Binary/LargeBinary/BinaryView}` [\#8768](https://github.com/apache/arrow-rs/pull/8768) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([klion26](https://github.com/klion26)) +- feat: parse `DataType::Union`, `DataType::Map`, `DataType::RunEndEncoded` [\#8765](https://github.com/apache/arrow-rs/pull/8765) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) +- Add options to control various aspects of Parquet metadata decoding [\#8763](https://github.com/apache/arrow-rs/pull/8763) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- feat: Ensure consistent metadata display for data types [\#8760](https://github.com/apache/arrow-rs/pull/8760) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mhilton](https://github.com/mhilton)) +- Clean up predicate\_cache tests [\#8755](https://github.com/apache/arrow-rs/pull/8755) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- refactor `test_cache_projection_excludes_nested_columns` to use high level APIs [\#8754](https://github.com/apache/arrow-rs/pull/8754) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Add `merge` and `merge_n` kernels [\#8753](https://github.com/apache/arrow-rs/pull/8753) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pepijnve](https://github.com/pepijnve)) +- Fix lint in arrow-flight by updating assert\_cmd after it upgraded [\#8741](https://github.com/apache/arrow-rs/pull/8741) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([vegarsti](https://github.com/vegarsti)) +- Remove 
link to internal `arrow-integration-test` crate from main `arrow` crate [\#8740](https://github.com/apache/arrow-rs/pull/8740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phil-opp](https://github.com/phil-opp)) +- Implement hex decoding of JSON strings to binary arrays [\#8737](https://github.com/apache/arrow-rs/pull/8737) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phil-opp](https://github.com/phil-opp)) +- \[Parquet\] Adaptive Parquet Predicate Pushdown [\#8733](https://github.com/apache/arrow-rs/pull/8733) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hhhizzz](https://github.com/hhhizzz)) +- \[Parquet\] Return error from `RleDecoder::reload` rather than panic [\#8729](https://github.com/apache/arrow-rs/pull/8729) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liamzwbao](https://github.com/liamzwbao)) +- fix: `ArrayIter` does not report size hint correctly after advancing from the iterator back [\#8728](https://github.com/apache/arrow-rs/pull/8728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- perf: Use Vec::with\_capacity in cast\_to\_run\_end\_encoded [\#8726](https://github.com/apache/arrow-rs/pull/8726) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) +- \[Variant\] Fix the index of an item in VariantArray in a unit test [\#8725](https://github.com/apache/arrow-rs/pull/8725) ([martin-g](https://github.com/martin-g)) +- build\(deps\): bump actions/download-artifact from 5 to 6 [\#8720](https://github.com/apache/arrow-rs/pull/8720) ([dependabot[bot]](https://github.com/apps/dependabot)) +- \[Variant\] Add try\_value/value for VariantArray [\#8719](https://github.com/apache/arrow-rs/pull/8719) ([klion26](https://github.com/klion26)) +- General virtual columns support + row numbers as a first use-case [\#8715](https://github.com/apache/arrow-rs/pull/8715) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([vustef](https://github.com/vustef)) +- feat: Parquet-layout add Index and Footer info [\#8712](https://github.com/apache/arrow-rs/pull/8712) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mapleFU](https://github.com/mapleFU)) +- fix: `zip` now treats nulls as false in provided mask regardless of the underlying bit value [\#8711](https://github.com/apache/arrow-rs/pull/8711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Add benchmark for casting to RunEndEncoded \(REE\) [\#8710](https://github.com/apache/arrow-rs/pull/8710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) +- \[Minor\]: Document visibility for enums produced by Thrift macros [\#8706](https://github.com/apache/arrow-rs/pull/8706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Update `arrow-avro` `README.md` version to 57 [\#8695](https://github.com/apache/arrow-rs/pull/8695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jecsand838](https://github.com/jecsand838)) +- Fix: ViewType gc on huge batch would produce bad output [\#8694](https://github.com/apache/arrow-rs/pull/8694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mapleFU](https://github.com/mapleFU)) +- Refactor arrow-cast decimal casting to unify the rescale logic used in Parquet variant casts [\#8689](https://github.com/apache/arrow-rs/pull/8689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liamzwbao](https://github.com/liamzwbao)) +- check bit width to avoid panic in DeltaBitPackDecoder [\#8688](https://github.com/apache/arrow-rs/pull/8688) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rambleraptor](https://github.com/rambleraptor)) +- \[thrift-remodel\] Use `thrift_enum` macro for `ConvertedType` 
[\#8680](https://github.com/apache/arrow-rs/pull/8680) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- \[JSON\] Map key supports utf8 view [\#8679](https://github.com/apache/arrow-rs/pull/8679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mapleFU](https://github.com/mapleFU)) +- \[JSON\] Add encoding for binary view [\#8675](https://github.com/apache/arrow-rs/pull/8675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mapleFU](https://github.com/mapleFU)) +- \[Parquet\] Account for FileDecryptor in ParquetMetaData heap size calculation [\#8671](https://github.com/apache/arrow-rs/pull/8671) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([adamreeve](https://github.com/adamreeve)) +- chore: update `OffsetBuffer::from_lengths(std::iter::repeat_n(, ));` with `OffsetBuffer::from_repeated_length(, );` [\#8669](https://github.com/apache/arrow-rs/pull/8669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- \[Variant\] Support `shred_variant` for Uuids [\#8666](https://github.com/apache/arrow-rs/pull/8666) ([friendlymatthew](https://github.com/friendlymatthew)) +- \[Variant\] Remove `create_test_variant_array` helper method [\#8664](https://github.com/apache/arrow-rs/pull/8664) ([friendlymatthew](https://github.com/friendlymatthew)) +- \[parquet\] Adding counting method in thrift\_enum macro to support ENCODING\_SLOTS [\#8663](https://github.com/apache/arrow-rs/pull/8663) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hhhizzz](https://github.com/hhhizzz)) +- chore: add test case of RowSelection::trim [\#8660](https://github.com/apache/arrow-rs/pull/8660) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([lichuang](https://github.com/lichuang)) +- feat: add `new_repeated` to `ByteArray` [\#8659](https://github.com/apache/arrow-rs/pull/8659) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- perf: add `repeat_slice_n_times` to `MutableBuffer` [\#8658](https://github.com/apache/arrow-rs/pull/8658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- perf: add optimized function to create offset with same length [\#8656](https://github.com/apache/arrow-rs/pull/8656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- \[Variant\] `rescale_decimal` followup [\#8655](https://github.com/apache/arrow-rs/pull/8655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liamzwbao](https://github.com/liamzwbao)) +- feat: parse DataType `List`, `ListView`, `LargeList`, `LargeListView`, `FixedSizeList` [\#8649](https://github.com/apache/arrow-rs/pull/8649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) +- Support more operations on ListView [\#8645](https://github.com/apache/arrow-rs/pull/8645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([a10y](https://github.com/a10y)) +- \[Variant\] Implement primitive type access for null/time/decimal\* [\#8638](https://github.com/apache/arrow-rs/pull/8638) ([klion26](https://github.com/klion26)) +- \[Variant\] refactor: Split builder.rs into several smaller files [\#8635](https://github.com/apache/arrow-rs/pull/8635) ([Weijun-H](https://github.com/Weijun-H)) +- add `try_new_with_length` constructor to `FixedSizeList` [\#8624](https://github.com/apache/arrow-rs/pull/8624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([connortsui20](https://github.com/connortsui20)) +- Change some panics to errors in parquet decoder [\#8602](https://github.com/apache/arrow-rs/pull/8602) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rambleraptor](https://github.com/rambleraptor)) +- Support `variant_to_arrow` for utf8 
[\#8600](https://github.com/apache/arrow-rs/pull/8600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sdf-jkl](https://github.com/sdf-jkl)) +- Cast support for RunEndEncoded arrays [\#8589](https://github.com/apache/arrow-rs/pull/8589) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) + + + ## [57.0.0](https://github.com/apache/arrow-rs/tree/57.0.0) (2025-10-19) [Full Changelog](https://github.com/apache/arrow-rs/compare/56.2.0...57.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 240e9681c2ef..fbbdba7d36ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,170 +19,172 @@ # Changelog -## [57.1.0](https://github.com/apache/arrow-rs/tree/57.1.0) (2025-11-20) +## [57.2.0](https://github.com/apache/arrow-rs/tree/57.2.0) (2026-01-07) -[Full Changelog](https://github.com/apache/arrow-rs/compare/57.0.0...57.1.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/57.1.0...57.2.0) + +**Breaking changes:** + +- Seal Array trait [\#9092](https://github.com/apache/arrow-rs/pull/9092) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- \[Variant\] Unify the CastOptions usage in parquet-variant-compute [\#8984](https://github.com/apache/arrow-rs/pull/8984) ([klion26](https://github.com/klion26)) **Implemented enhancements:** -- Eliminate bound checks in filter kernels [\#8865](https://github.com/apache/arrow-rs/issues/8865) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Respect page index policy option for ParquetObjectReader when it's not skip [\#8856](https://github.com/apache/arrow-rs/issues/8856) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Speed up collect\_bool and remove `unsafe` [\#8848](https://github.com/apache/arrow-rs/issues/8848) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Error reading parquet FileMetaData with empty lists encoded as element-type=0 
[\#8826](https://github.com/apache/arrow-rs/issues/8826) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- ValueStatistics methods can't be used from generic context in external crate [\#8823](https://github.com/apache/arrow-rs/issues/8823) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Custom Pretty-Printing Implementation for Column when Formatting Record Batches [\#8821](https://github.com/apache/arrow-rs/issues/8821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet-concat: supports bloom filter and page index [\#8804](https://github.com/apache/arrow-rs/issues/8804) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- \[Parquet\] virtual row number support [\#7299](https://github.com/apache/arrow-rs/issues/7299) -- \[Variant\] Enforce shredded-type validation in `shred_variant` [\#8795](https://github.com/apache/arrow-rs/issues/8795) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Simplify decision logic to call `FilterBuilder::optimize` or not [\#8781](https://github.com/apache/arrow-rs/issues/8781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Variant\] Add variant to arrow for DataType::{Binary, LargeBinary, BinaryView} [\#8767](https://github.com/apache/arrow-rs/issues/8767) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Provide algorithm that allows zipping arrays whose values are not prealigned [\#8752](https://github.com/apache/arrow-rs/issues/8752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Parquet\] ParquetMetadataReader decodes too much metadata under point-get scenario [\#8751](https://github.com/apache/arrow-rs/issues/8751) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `arrow-json` supports encoding binary arrays, but not decoding [\#8736](https://github.com/apache/arrow-rs/issues/8736) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Allow `FilterPredicate` instances to be reused 
for RecordBatches [\#8692](https://github.com/apache/arrow-rs/issues/8692) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- ArrowJsonBatch::from\_batch is incomplete [\#8684](https://github.com/apache/arrow-rs/issues/8684) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- parquet-layout: More info about layout including footer size, page index, bloom filter? [\#8682](https://github.com/apache/arrow-rs/issues/8682) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Rewrite `ParquetRecordBatchStream` \(async API\) in terms of the PushDecoder [\#8677](https://github.com/apache/arrow-rs/issues/8677) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- \[JSON\] Add encoding for binary view [\#8674](https://github.com/apache/arrow-rs/issues/8674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Refactor arrow-cast decimal casting to unify the rescale logic used in Parquet variant casts [\#8670](https://github.com/apache/arrow-rs/issues/8670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Variant\] Support Uuid/`FixedSizeBinary(16)` shredding [\#8665](https://github.com/apache/arrow-rs/issues/8665) -- \[Parquet\] There should be an encoding counter to know how many encodings the repo supports in total [\#8662](https://github.com/apache/arrow-rs/issues/8662) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Improve `parse_data_type` for `List`, `ListView`, `LargeList`, `LargeListView`, `FixedSizeList`, `Union`, `Map`, `RunEndEncoded`. 
[\#8648](https://github.com/apache/arrow-rs/issues/8648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Variant\] Support variant to arrow primitive support null/time/decimal\_\* [\#8637](https://github.com/apache/arrow-rs/issues/8637) -- Return error from `RleDecoder::reset` rather than panic [\#8632](https://github.com/apache/arrow-rs/issues/8632) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Add bitwise ops on `BooleanBufferBuilder` and `MutableBuffer` that mutate directly the buffer [\#8618](https://github.com/apache/arrow-rs/issues/8618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Variant\] Add variant\_to\_arrow Utf-8, LargeUtf8, Utf8View types support [\#8567](https://github.com/apache/arrow-rs/issues/8567) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[parquet\] further relax `LevelInfoBuilder::types_compatible` for `ArrowWriter` [\#9098](https://github.com/apache/arrow-rs/issues/9098) +- Update arrow-row documentation with Union encoding [\#9084](https://github.com/apache/arrow-rs/issues/9084) +- Add code examples for min and max compute functions [\#9055](https://github.com/apache/arrow-rs/issues/9055) +- Add `append_n` to bytes view builder API [\#9034](https://github.com/apache/arrow-rs/issues/9034) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Move `RunArray::get_physical_indices` to `RunEndBuffer` [\#9025](https://github.com/apache/arrow-rs/issues/9025) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Allow quote style in csv writer [\#9003](https://github.com/apache/arrow-rs/issues/9003) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- IPC support for ListView [\#9002](https://github.com/apache/arrow-rs/issues/9002) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement `BinaryArrayType` for `&FixedSizeBinaryArray`s [\#8992](https://github.com/apache/arrow-rs/issues/8992) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow-buffer: implement num-traits for i256 [\#8976](https://github.com/apache/arrow-rs/issues/8976) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support for `Arc` in `ParquetRecordWriter` derive macro [\#8972](https://github.com/apache/arrow-rs/issues/8972) +- \[arrow-avro\] suggest switching from xz to liblzma [\#8970](https://github.com/apache/arrow-rs/issues/8970) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow-buffer: add i256::trailing\_zeros [\#8968](https://github.com/apache/arrow-rs/issues/8968) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow-buffer: make i256::leading\_zeros public [\#8965](https://github.com/apache/arrow-rs/issues/8965) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add spark like `ignoreLeadingWhiteSpace` and `ignoreTrailingWhiteSpace` options to the csv writer [\#8961](https://github.com/apache/arrow-rs/issues/8961) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add round trip benchmark for Parquet writer/reader [\#8955](https://github.com/apache/arrow-rs/issues/8955) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support performant `interleave` for List/LargeList [\#8952](https://github.com/apache/arrow-rs/issues/8952) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Support array access when parsing `VariantPath` [\#8946](https://github.com/apache/arrow-rs/issues/8946) +- Some panic!s could be represented as unimplemented!s [\#8932](https://github.com/apache/arrow-rs/issues/8932) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] easier way to construct a shredded schema [\#8922](https://github.com/apache/arrow-rs/issues/8922) +- Support `DataType::ListView` and `DataType::LargeListView` in `ArrayData::new_null` [\#8908](https://github.com/apache/arrow-rs/issues/8908) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `GenericListViewArray::from_iter_primitive` [\#8906](https://github.com/apache/arrow-rs/issues/8906) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Unify the cast option usage in ParquetVariant [\#8873](https://github.com/apache/arrow-rs/issues/8873) +- Blog post about efficient filter representation in Parquet filter pushdown [\#8843](https://github.com/apache/arrow-rs/issues/8843) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add comparison support for Union arrays in the `cmp` kernel [\#8837](https://github.com/apache/arrow-rs/issues/8837) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Support array shredding into `List/LargeList/ListView/LargeListView` [\#8830](https://github.com/apache/arrow-rs/issues/8830) +- Support `Union` data types for row format [\#8828](https://github.com/apache/arrow-rs/issues/8828) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- FFI support for ListView [\#8819](https://github.com/apache/arrow-rs/issues/8819) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[Variant\] Support more Arrow Datatypes from Variant primitive types [\#8805](https://github.com/apache/arrow-rs/issues/8805) +- `FixedSizeBinaryBuilder` supports `append_array` [\#8750](https://github.com/apache/arrow-rs/issues/8750) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement special case `zip` with scalar for Utf8View [\#8724](https://github.com/apache/arrow-rs/issues/8724) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[geometry\] Wire up arrow reader/writer for `GEOMETRY` and `GEOGRAPHY` [\#8717](https://github.com/apache/arrow-rs/issues/8717) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Fixed bugs:** -- Regression: Parsing `List(Int64)` results in nullable list in 57.0.0 and a non-nullable list in 57.1.0 
[\#8883](https://github.com/apache/arrow-rs/issues/8883) -- Regression: FixedSizeList data type parsing fails on 57.1.0 [\#8880](https://github.com/apache/arrow-rs/issues/8880) -- \(dyn ArrayFormatterFactory + 'static\) can't be safely shared between threads [\#8875](https://github.com/apache/arrow-rs/issues/8875) -- RowNumber reader has wrong row group ordering [\#8864](https://github.com/apache/arrow-rs/issues/8864) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `ThriftMetadataWriter::write_column_indexes` cannot handle a `ColumnIndexMetaData::NONE` [\#8815](https://github.com/apache/arrow-rs/issues/8815) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- "Archery test With other arrows" Integration test failing on main: [\#8813](https://github.com/apache/arrow-rs/issues/8813) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Parquet\] Writing in 57.0.0 seems 10% slower than 56.0.0 [\#8783](https://github.com/apache/arrow-rs/issues/8783) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Parquet reader cannot handle files with unknown logical types [\#8776](https://github.com/apache/arrow-rs/issues/8776) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- zip now treats nulls as false in provided mask regardless of the underlying bit value [\#8721](https://github.com/apache/arrow-rs/issues/8721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[avro\] Incorrect version in crate.io landing page [\#8691](https://github.com/apache/arrow-rs/issues/8691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Array: ViewType gc\(\) has bug when array sum length exceed i32::MAX [\#8681](https://github.com/apache/arrow-rs/issues/8681) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet 56: encounter `error: item_reader def levels are None` when reading nested field with row filter [\#8657](https://github.com/apache/arrow-rs/issues/8657) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Degenerate and non-nullable `FixedSizeListArray`s are not handled [\#8623](https://github.com/apache/arrow-rs/issues/8623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Parquet\] Performance Degradation with RowFilter on Unsorted Columns due to Fragmented ReadPlan [\#8565](https://github.com/apache/arrow-rs/issues/8565) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Soundness Bug in `try_binary` when `Array` is implemented incorrectly in external crate [\#9106](https://github.com/apache/arrow-rs/issues/9106) +- casting `Dict(_, LargeUtf8)` to `Utf8View` \(`StringViewArray`\) panics [\#9101](https://github.com/apache/arrow-rs/issues/9101) +- wrong results for null count of `nullif` kernel [\#9085](https://github.com/apache/arrow-rs/issues/9085) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Empty first line in some code examples [\#9063](https://github.com/apache/arrow-rs/issues/9063) +- GenericByteViewArray::slice is not zero-copy but ought to be [\#9014](https://github.com/apache/arrow-rs/issues/9014) +- Regression in struct casting in 57.2.0 \(not yet released\) [\#9005](https://github.com/apache/arrow-rs/issues/9005) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix panic when decoding multiple Union columns in RowConverter [\#8999](https://github.com/apache/arrow-rs/issues/8999) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `take_fixed_size_binary` Does Not Consider NULL Indices [\#8947](https://github.com/apache/arrow-rs/issues/8947) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[arrow-avro\] RecordEncoder Bugs [\#8934](https://github.com/apache/arrow-rs/issues/8934) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `FixedSizeBinaryArray::try_new(...)` Panics with Item Length of Zero 
[\#8926](https://github.com/apache/arrow-rs/issues/8926) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `cargo test -p arrow-cast` fails on main [\#8910](https://github.com/apache/arrow-rs/issues/8910) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `GenericListViewArray::new_null` ignores `len` and returns an empty array [\#8904](https://github.com/apache/arrow-rs/issues/8904) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `FixedSizeBinaryArray::new_null` Does Not Properly Set the Length of the Values Buffer [\#8900](https://github.com/apache/arrow-rs/issues/8900) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Struct casting requires same order of fields [\#8870](https://github.com/apache/arrow-rs/issues/8870) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cannot cast string dictionary to binary view [\#8841](https://github.com/apache/arrow-rs/issues/8841) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Documentation updates:** -- docs: Add example for creating a `MutableBuffer` from `Buffer` [\#8853](https://github.com/apache/arrow-rs/pull/8853) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- docs: Add examples for creating MutableBuffer from Vec [\#8852](https://github.com/apache/arrow-rs/pull/8852) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Improve ParquetDecoder docs [\#8802](https://github.com/apache/arrow-rs/pull/8802) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Update docs for zero copy conversion of ScalarBuffer [\#8772](https://github.com/apache/arrow-rs/pull/8772) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add example to convert `PrimitiveArray` to a `Vec` [\#8771](https://github.com/apache/arrow-rs/pull/8771) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- docs: Add links for arrow-avro [\#8770](https://github.com/apache/arrow-rs/pull/8770) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- \[Parquet\] Minor: Update comments in page decompressor [\#8764](https://github.com/apache/arrow-rs/pull/8764) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Document limitations of the `arrow_integration_test` crate [\#8738](https://github.com/apache/arrow-rs/pull/8738) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phil-opp](https://github.com/phil-opp)) -- docs: Add link to the Arrow implementation status page [\#8732](https://github.com/apache/arrow-rs/pull/8732) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- docs: Update Parquet readme implementation status [\#8731](https://github.com/apache/arrow-rs/pull/8731) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Add Union encoding documentation [\#9102](https://github.com/apache/arrow-rs/pull/9102) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([EduardAkhmetshin](https://github.com/EduardAkhmetshin)) +- docs: fix misleading reserve documentation [\#9076](https://github.com/apache/arrow-rs/pull/9076) ([WaterWhisperer](https://github.com/WaterWhisperer)) +- Fix headers and empty lines in code examples [\#9064](https://github.com/apache/arrow-rs/pull/9064) ([EduardAkhmetshin](https://github.com/EduardAkhmetshin)) +- Add examples for min and max functions [\#9062](https://github.com/apache/arrow-rs/pull/9062) ([EduardAkhmetshin](https://github.com/EduardAkhmetshin)) +- Improve arrow-buffer documentation [\#9020](https://github.com/apache/arrow-rs/pull/9020) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Move 
examples in arrow-csv to docstrings, polish up docs [\#9001](https://github.com/apache/arrow-rs/pull/9001) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add example of parsing field names as VariantPath [\#8945](https://github.com/apache/arrow-rs/pull/8945) ([alamb](https://github.com/alamb)) +- Improve documentation for `prep\_null\_mask\_filter` [\#8722](https://github.com/apache/arrow-rs/pull/8722) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) **Performance improvements:** -- `RowConverter::from_binary` should opportunistically take ownership of the buffer [\#8685](https://github.com/apache/arrow-rs/issues/8685) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Speed up filter some more \(up to 2x\) [\#8868](https://github.com/apache/arrow-rs/pull/8868) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Speed up `collect_bool` and remove `unsafe`, optimize `take_bits`, `take_native` for null values [\#8849](https://github.com/apache/arrow-rs/pull/8849) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Change `BooleanBuffer::append_packed_range` to use `apply_bitwise_binary_op` [\#8812](https://github.com/apache/arrow-rs/pull/8812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- \[Parquet\] Avoid copying `LogicalType` in `ColumnOrder::get_sort_order`, deprecate `get_logical_type` [\#8789](https://github.com/apache/arrow-rs/pull/8789) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- perf: Speed up Parquet file writing \(10%, back to speed of 56\) [\#8786](https://github.com/apache/arrow-rs/pull/8786) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- perf: override `ArrayIter` default impl 
for `nth`, `nth_back`, `last` and `count` [\#8785](https://github.com/apache/arrow-rs/pull/8785) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- \[Parquet\] Reduce one copy in `SerializedPageReader` [\#8745](https://github.com/apache/arrow-rs/pull/8745) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao)) -- Small optimization in Parquet varint decoder [\#8742](https://github.com/apache/arrow-rs/pull/8742) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- perf: override `count`, `nth`, `nth_back`, `last` and `max` for BitIterator [\#8696](https://github.com/apache/arrow-rs/pull/8696) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- Add `FilterPredicate::filter_record_batch` [\#8693](https://github.com/apache/arrow-rs/pull/8693) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pepijnve](https://github.com/pepijnve)) -- perf: zero-copy path in `RowConverter::from_binary` [\#8686](https://github.com/apache/arrow-rs/pull/8686) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mzabaluev](https://github.com/mzabaluev)) -- perf: add optimized zip implementation for scalars [\#8653](https://github.com/apache/arrow-rs/pull/8653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- feat: add `apply_unary_op` and `apply_binary_op` bitwise operations [\#8619](https://github.com/apache/arrow-rs/pull/8619) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- \[Parquet\]Optimize the performance in record reader [\#8607](https://github.com/apache/arrow-rs/pull/8607) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hhhizzz](https://github.com/hhhizzz)) +- \[parquet\] Avoid a clone while resolving the read strategy 
[\#9056](https://github.com/apache/arrow-rs/pull/9056) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- perf: improve performance of encoding `GenericByteArray` by 8% [\#9054](https://github.com/apache/arrow-rs/pull/9054) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Speed up unary `not` kernel by 50%, add `BooleanBuffer::from_bitwise_unary` [\#8996](https://github.com/apache/arrow-rs/pull/8996) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- arrow-select: improve dictionary interleave fallback performance [\#8978](https://github.com/apache/arrow-rs/pull/8978) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([asubiotto](https://github.com/asubiotto)) +- Add special implementation for zip for Utf8View/BinaryView scalars [\#8963](https://github.com/apache/arrow-rs/pull/8963) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mkleen](https://github.com/mkleen)) +- arrow-select: implement specialized interleave\_list [\#8953](https://github.com/apache/arrow-rs/pull/8953) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([asubiotto](https://github.com/asubiotto)) **Closed issues:** -- Variant to NullType conversion ignores strict casting [\#8810](https://github.com/apache/arrow-rs/issues/8810) -- Unify display representation for `Field` [\#8784](https://github.com/apache/arrow-rs/issues/8784) -- Misleading configuration name: skip\_arrow\_metadata [\#8780](https://github.com/apache/arrow-rs/issues/8780) -- Inconsistent display for types with Metadata [\#8761](https://github.com/apache/arrow-rs/issues/8761) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Internal `arrow-integration-test` crate is linked from `arrow` docs [\#8739](https://github.com/apache/arrow-rs/issues/8739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add benchmark for RunEndEncoded 
casting [\#8709](https://github.com/apache/arrow-rs/issues/8709) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[Varaint\] Support `VariantArray::value` to return a `Result` [\#8672](https://github.com/apache/arrow-rs/issues/8672) +- impl `Index` for `UnionFields` [\#8958](https://github.com/apache/arrow-rs/issues/8958) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Fix regression caused by changes in Display for DataType - display \(`List(non-null Int64)` instead of `List(nullable Int64)` [\#8890](https://github.com/apache/arrow-rs/pull/8890) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([etseidl](https://github.com/etseidl)) -- Support parsing for old style FixedSizeList [\#8882](https://github.com/apache/arrow-rs/pull/8882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Make ArrayFormatterFactory Send + Sync and add a test [\#8878](https://github.com/apache/arrow-rs/pull/8878) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) -- Make `ArrowReaderOptions::with_virtual_columns` error rather than panic on invalid input [\#8867](https://github.com/apache/arrow-rs/pull/8867) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Fix errors when reading nested Lists with pushdown predicates. 
[\#8866](https://github.com/apache/arrow-rs/pull/8866) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Fix `RowNumberReader` when not all row groups are selected [\#8863](https://github.com/apache/arrow-rs/pull/8863) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([vustef](https://github.com/vustef)) -- Respect page index policy option for ParquetObjectReader when it's not skip [\#8857](https://github.com/apache/arrow-rs/pull/8857) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zhuqi-lucas](https://github.com/zhuqi-lucas)) -- build\(deps\): update apache-avro requirement from 0.20.0 to 0.21.0 [\#8832](https://github.com/apache/arrow-rs/pull/8832) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Allow Users to Provide Custom `ArrayFormatter`s when Pretty-Printing Record Batches [\#8829](https://github.com/apache/arrow-rs/pull/8829) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) -- Allow reading of improperly constructed empty lists in Parquet metadata [\#8827](https://github.com/apache/arrow-rs/pull/8827) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- \[Variant\] Fix cast logic for Variant to Arrow for DataType::Null [\#8825](https://github.com/apache/arrow-rs/pull/8825) ([klion26](https://github.com/klion26)) -- remove T: ParquetValueType bound on ValueStatistics [\#8824](https://github.com/apache/arrow-rs/pull/8824) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([pmarks](https://github.com/pmarks)) -- build\(deps\): update lz4\_flex requirement from 0.11 to 0.12 [\#8820](https://github.com/apache/arrow-rs/pull/8820) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([dependabot[bot]](https://github.com/apps/dependabot)) -- Fix bug in handling of empty Parquet page index structures [\#8817](https://github.com/apache/arrow-rs/pull/8817) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- Parquet-concat: supports page index and bloom filter [\#8811](https://github.com/apache/arrow-rs/pull/8811) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mapleFU](https://github.com/mapleFU)) -- \[Doc\] Correct `ListArray` documentation [\#8803](https://github.com/apache/arrow-rs/pull/8803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liamzwbao](https://github.com/liamzwbao)) -- \[Parquet\] Add additional docs for `ArrowReaderOptions` and `ArrowReaderMetadata` [\#8798](https://github.com/apache/arrow-rs/pull/8798) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- \[Variant\] Enforce shredded-type validation in `shred_variant` [\#8796](https://github.com/apache/arrow-rs/pull/8796) ([liamzwbao](https://github.com/liamzwbao)) -- Add `VariantPath::is_empty` [\#8791](https://github.com/apache/arrow-rs/pull/8791) ([friendlymatthew](https://github.com/friendlymatthew)) -- Add FilterBuilder::is\_optimize\_beneficial [\#8782](https://github.com/apache/arrow-rs/pull/8782) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pepijnve](https://github.com/pepijnve)) -- \[Parquet\] Allow reading of files with unknown logical types [\#8777](https://github.com/apache/arrow-rs/pull/8777) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- bench: add `ArrayIter` benchmarks [\#8774](https://github.com/apache/arrow-rs/pull/8774) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- Update Rust toolchain to 1.91 [\#8769](https://github.com/apache/arrow-rs/pull/8769) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- \[Variant\] Add variant to arrow for `DataType::{Binary/LargeBinary/BinaryView}` [\#8768](https://github.com/apache/arrow-rs/pull/8768) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([klion26](https://github.com/klion26)) -- feat: parse `DataType::Union`, `DataType::Map`, `DataType::RunEndEncoded` [\#8765](https://github.com/apache/arrow-rs/pull/8765) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) -- Add options to control various aspects of Parquet metadata decoding [\#8763](https://github.com/apache/arrow-rs/pull/8763) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- feat: Ensure consistent metadata display for data types [\#8760](https://github.com/apache/arrow-rs/pull/8760) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mhilton](https://github.com/mhilton)) -- Clean up predicate\_cache tests [\#8755](https://github.com/apache/arrow-rs/pull/8755) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- refactor `test_cache_projection_excludes_nested_columns` to use high level APIs [\#8754](https://github.com/apache/arrow-rs/pull/8754) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Add `merge` and `merge_n` kernels [\#8753](https://github.com/apache/arrow-rs/pull/8753) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([pepijnve](https://github.com/pepijnve)) -- Fix lint in arrow-flight by updating assert\_cmd after it upgraded [\#8741](https://github.com/apache/arrow-rs/pull/8741) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([vegarsti](https://github.com/vegarsti)) -- Remove 
link to internal `arrow-integration-test` crate from main `arrow` crate [\#8740](https://github.com/apache/arrow-rs/pull/8740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phil-opp](https://github.com/phil-opp)) -- Implement hex decoding of JSON strings to binary arrays [\#8737](https://github.com/apache/arrow-rs/pull/8737) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phil-opp](https://github.com/phil-opp)) -- \[Parquet\] Adaptive Parquet Predicate Pushdown [\#8733](https://github.com/apache/arrow-rs/pull/8733) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hhhizzz](https://github.com/hhhizzz)) -- \[Parquet\] Return error from `RleDecoder::reload` rather than panic [\#8729](https://github.com/apache/arrow-rs/pull/8729) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liamzwbao](https://github.com/liamzwbao)) -- fix: `ArrayIter` does not report size hint correctly after advancing from the iterator back [\#8728](https://github.com/apache/arrow-rs/pull/8728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- perf: Use Vec::with\_capacity in cast\_to\_run\_end\_encoded [\#8726](https://github.com/apache/arrow-rs/pull/8726) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) -- \[Variant\] Fix the index of an item in VariantArray in a unit test [\#8725](https://github.com/apache/arrow-rs/pull/8725) ([martin-g](https://github.com/martin-g)) -- build\(deps\): bump actions/download-artifact from 5 to 6 [\#8720](https://github.com/apache/arrow-rs/pull/8720) ([dependabot[bot]](https://github.com/apps/dependabot)) -- \[Variant\] Add try\_value/value for VariantArray [\#8719](https://github.com/apache/arrow-rs/pull/8719) ([klion26](https://github.com/klion26)) -- General virtual columns support + row numbers as a first use-case [\#8715](https://github.com/apache/arrow-rs/pull/8715) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([vustef](https://github.com/vustef)) -- feat: Parquet-layout add Index and Footer info [\#8712](https://github.com/apache/arrow-rs/pull/8712) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mapleFU](https://github.com/mapleFU)) -- fix: `zip` now treats nulls as false in provided mask regardless of the underlying bit value [\#8711](https://github.com/apache/arrow-rs/pull/8711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- Add benchmark for casting to RunEndEncoded \(REE\) [\#8710](https://github.com/apache/arrow-rs/pull/8710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) -- \[Minor\]: Document visibility for enums produced by Thrift macros [\#8706](https://github.com/apache/arrow-rs/pull/8706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- Update `arrow-avro` `README.md` version to 57 [\#8695](https://github.com/apache/arrow-rs/pull/8695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jecsand838](https://github.com/jecsand838)) -- Fix: ViewType gc on huge batch would produce bad output [\#8694](https://github.com/apache/arrow-rs/pull/8694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mapleFU](https://github.com/mapleFU)) -- Refactor arrow-cast decimal casting to unify the rescale logic used in Parquet variant casts [\#8689](https://github.com/apache/arrow-rs/pull/8689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liamzwbao](https://github.com/liamzwbao)) -- check bit width to avoid panic in DeltaBitPackDecoder [\#8688](https://github.com/apache/arrow-rs/pull/8688) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rambleraptor](https://github.com/rambleraptor)) -- \[thrift-remodel\] Use `thrift_enum` macro for `ConvertedType` 
[\#8680](https://github.com/apache/arrow-rs/pull/8680) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- \[JSON\] Map key supports utf8 view [\#8679](https://github.com/apache/arrow-rs/pull/8679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mapleFU](https://github.com/mapleFU)) -- \[JSON\] Add encoding for binary view [\#8675](https://github.com/apache/arrow-rs/pull/8675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mapleFU](https://github.com/mapleFU)) -- \[Parquet\] Account for FileDecryptor in ParquetMetaData heap size calculation [\#8671](https://github.com/apache/arrow-rs/pull/8671) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([adamreeve](https://github.com/adamreeve)) -- chore: update `OffsetBuffer::from_lengths(std::iter::repeat_n(, ));` with `OffsetBuffer::from_repeated_length(, );` [\#8669](https://github.com/apache/arrow-rs/pull/8669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- \[Variant\] Support `shred_variant` for Uuids [\#8666](https://github.com/apache/arrow-rs/pull/8666) ([friendlymatthew](https://github.com/friendlymatthew)) -- \[Variant\] Remove `create_test_variant_array` helper method [\#8664](https://github.com/apache/arrow-rs/pull/8664) ([friendlymatthew](https://github.com/friendlymatthew)) -- \[parquet\] Adding counting method in thrift\_enum macro to support ENCODING\_SLOTS [\#8663](https://github.com/apache/arrow-rs/pull/8663) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hhhizzz](https://github.com/hhhizzz)) -- chore: add test case of RowSelection::trim [\#8660](https://github.com/apache/arrow-rs/pull/8660) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([lichuang](https://github.com/lichuang)) -- feat: add `new_repeated` to `ByteArray` [\#8659](https://github.com/apache/arrow-rs/pull/8659) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- perf: add `repeat_slice_n_times` to `MutableBuffer` [\#8658](https://github.com/apache/arrow-rs/pull/8658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- perf: add optimized function to create offset with same length [\#8656](https://github.com/apache/arrow-rs/pull/8656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) -- \[Variant\] `rescale_decimal` followup [\#8655](https://github.com/apache/arrow-rs/pull/8655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liamzwbao](https://github.com/liamzwbao)) -- feat: parse DataType `List`, `ListView`, `LargeList`, `LargeListView`, `FixedSizeList` [\#8649](https://github.com/apache/arrow-rs/pull/8649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) -- Support more operations on ListView [\#8645](https://github.com/apache/arrow-rs/pull/8645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([a10y](https://github.com/a10y)) -- \[Variant\] Implement primitive type access for null/time/decimal\* [\#8638](https://github.com/apache/arrow-rs/pull/8638) ([klion26](https://github.com/klion26)) -- \[Variant\] refactor: Split builder.rs into several smaller files [\#8635](https://github.com/apache/arrow-rs/pull/8635) ([Weijun-H](https://github.com/Weijun-H)) -- add `try_new_with_length` constructor to `FixedSizeList` [\#8624](https://github.com/apache/arrow-rs/pull/8624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([connortsui20](https://github.com/connortsui20)) -- Change some panics to errors in parquet decoder [\#8602](https://github.com/apache/arrow-rs/pull/8602) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rambleraptor](https://github.com/rambleraptor)) -- Support `variant_to_arrow` for utf8 
[\#8600](https://github.com/apache/arrow-rs/pull/8600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sdf-jkl](https://github.com/sdf-jkl)) -- Cast support for RunEndEncoded arrays [\#8589](https://github.com/apache/arrow-rs/pull/8589) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) +- Add `DataType::is_decimal` [\#9100](https://github.com/apache/arrow-rs/pull/9100) ([AdamGS](https://github.com/AdamGS)) +- feat\(parquet\): relax type compatibility check in parquet ArrowWriter [\#9099](https://github.com/apache/arrow-rs/pull/9099) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([gruuya](https://github.com/gruuya)) +- \[Variant\] Move `ArrayVariantToArrowRowBuilder` to `variant_to_arrow` [\#9094](https://github.com/apache/arrow-rs/pull/9094) ([liamzwbao](https://github.com/liamzwbao)) +- chore: increase row count and batch size for more deterministic tests [\#9088](https://github.com/apache/arrow-rs/pull/9088) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Fix `nullif` kernel [\#9087](https://github.com/apache/arrow-rs/pull/9087) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add `FlightInfo::with_endpoints` method [\#9075](https://github.com/apache/arrow-rs/pull/9075) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([lewiszlw](https://github.com/lewiszlw)) +- chore: run validation when debug assertion enabled and not only for test [\#9073](https://github.com/apache/arrow-rs/pull/9073) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Minor: make it clear cache array reader is not cloning arrays [\#9057](https://github.com/apache/arrow-rs/pull/9057) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) 
+- Minor: avoid clone in RunArray row decoding via buffer stealing [\#9052](https://github.com/apache/arrow-rs/pull/9052) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([lyang24](https://github.com/lyang24)) +- Minor: avoid some clones when reading parquet [\#9048](https://github.com/apache/arrow-rs/pull/9048) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- fix: don't generate nulls for `Decimal128` and `Decimal256` when field is non-nullable and have non-zero `null_density` [\#9046](https://github.com/apache/arrow-rs/pull/9046) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- fix: `Rows` `size` should use `capacity` and not `len` [\#9044](https://github.com/apache/arrow-rs/pull/9044) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- fix: integration / Archery test With other arrows container ran out of space [\#9043](https://github.com/apache/arrow-rs/pull/9043) ([lyang24](https://github.com/lyang24)) +- feat: add new `try_append_value_n()` function to `GenericByteViewBuilder` [\#9040](https://github.com/apache/arrow-rs/pull/9040) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([lyang24](https://github.com/lyang24)) +- Rename fields in BooleanBuffer for clarity [\#9039](https://github.com/apache/arrow-rs/pull/9039) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Allocate buffers before work in `boolean_kernels` benchmark [\#9035](https://github.com/apache/arrow-rs/pull/9035) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Move RunArray::get\_physical\_indices to RunEndBuffer [\#9027](https://github.com/apache/arrow-rs/pull/9027) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([lyang24](https://github.com/lyang24)) +- Improve `RunArray` documentation 
[\#9019](https://github.com/apache/arrow-rs/pull/9019) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Add BooleanArray tests for null and slice behavior [\#9013](https://github.com/apache/arrow-rs/pull/9013) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([UtkarshSahay123](https://github.com/UtkarshSahay123)) +- feat: support array indices in VariantPath dot notation [\#9012](https://github.com/apache/arrow-rs/pull/9012) ([foskey51](https://github.com/foskey51)) +- arrow-cast: Bring back in-order field casting for `StructArray` [\#9007](https://github.com/apache/arrow-rs/pull/9007) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- arrow-ipc: Add ListView support [\#9006](https://github.com/apache/arrow-rs/pull/9006) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- Add quote style to csv writer [\#9004](https://github.com/apache/arrow-rs/pull/9004) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xanderbailey](https://github.com/xanderbailey)) +- Fix row slice bug in Union column decoding with many columns [\#9000](https://github.com/apache/arrow-rs/pull/9000) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([friendlymatthew](https://github.com/friendlymatthew)) +- build\(deps\): bump actions/download-artifact from 6 to 7 [\#8995](https://github.com/apache/arrow-rs/pull/8995) ([dependabot[bot]](https://github.com/apps/dependabot)) +- minor: Add comment blocks to PR template [\#8994](https://github.com/apache/arrow-rs/pull/8994) ([Jefffrey](https://github.com/Jefffrey)) +- Implement `BinaryArrayType` for `&FixedSizeBinaryArray`s [\#8993](https://github.com/apache/arrow-rs/pull/8993) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- feat: impl BatchCoalescer::push\_batch\_with\_indices 
[\#8991](https://github.com/apache/arrow-rs/pull/8991) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ClSlaid](https://github.com/ClSlaid)) +- \[Arrow\]Configure max deduplication length for `StringView` [\#8990](https://github.com/apache/arrow-rs/pull/8990) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([lichuang](https://github.com/lichuang)) +- feat: implement append\_array for FixedSizeBinaryBuilder [\#8989](https://github.com/apache/arrow-rs/pull/8989) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ClSlaid](https://github.com/ClSlaid)) +- Add benchmarks for Utf8View scalars for zip [\#8988](https://github.com/apache/arrow-rs/pull/8988) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mkleen](https://github.com/mkleen)) +- build\(deps\): bump actions/cache from 4 to 5 [\#8986](https://github.com/apache/arrow-rs/pull/8986) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Take fsb null indices [\#8981](https://github.com/apache/arrow-rs/pull/8981) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add List to `interleave_kernels` benchmark [\#8980](https://github.com/apache/arrow-rs/pull/8980) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Fix ipc errors for `LargeList` containing sliced `StringViews` [\#8979](https://github.com/apache/arrow-rs/pull/8979) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fabianmurariu](https://github.com/fabianmurariu)) +- arrow-buffer: implement num-traits numeric operations [\#8977](https://github.com/apache/arrow-rs/pull/8977) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([theirix](https://github.com/theirix)) +- Update `xz` crate dependency to use `liblzma` in arrow-avro [\#8975](https://github.com/apache/arrow-rs/pull/8975) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jecsand838](https://github.com/jecsand838)) +- arrow-data: 
avoid allocating in get\_last\_run\_end [\#8974](https://github.com/apache/arrow-rs/pull/8974) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([asubiotto](https://github.com/asubiotto)) +- Support for `Arc` in `ParquetRecordWriter` derive macro [\#8973](https://github.com/apache/arrow-rs/pull/8973) ([heilhead](https://github.com/heilhead)) +- feat: support casting `Time32` to `Int64` [\#8971](https://github.com/apache/arrow-rs/pull/8971) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tshauck](https://github.com/tshauck)) +- arrow-buffer: add i256::trailing\_zeros [\#8969](https://github.com/apache/arrow-rs/pull/8969) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([theirix](https://github.com/theirix)) +- Perf: Vectorize check\_bounds\(2x speedup\) [\#8966](https://github.com/apache/arrow-rs/pull/8966) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gstvg](https://github.com/gstvg)) +- arrow-buffer: make i256::leading\_zeros public and tested [\#8964](https://github.com/apache/arrow-rs/pull/8964) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([theirix](https://github.com/theirix)) +- Add ignore leading and trailing white space to csv parser [\#8960](https://github.com/apache/arrow-rs/pull/8960) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xanderbailey](https://github.com/xanderbailey)) +- Access `UnionFields` elements by index [\#8959](https://github.com/apache/arrow-rs/pull/8959) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([friendlymatthew](https://github.com/friendlymatthew)) +- Add Parquet roundtrip benchmarks [\#8956](https://github.com/apache/arrow-rs/pull/8956) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- \[Variant\] Add variant to arrow for Date64/Timestamp\(Second/Millisecond\)/Time32/Time64 [\#8950](https://github.com/apache/arrow-rs/pull/8950) ([klion26](https://github.com/klion26)) +- Let 
`ArrowArrayStreamReader` handle schema with attached metadata + do schema checking [\#8944](https://github.com/apache/arrow-rs/pull/8944) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonded94](https://github.com/jonded94)) +- Adds ExtensionType for Parquet geospatial WKB arrays [\#8943](https://github.com/apache/arrow-rs/pull/8943) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([BlakeOrth](https://github.com/BlakeOrth)) +- Add builder to help create Schemas for shredding \(`ShreddedSchemaBuilder`\) [\#8940](https://github.com/apache/arrow-rs/pull/8940) ([XiangpengHao](https://github.com/XiangpengHao)) +- build\(deps\): update criterion requirement from 0.7.0 to 0.8.0 [\#8939](https://github.com/apache/arrow-rs/pull/8939) ([dependabot[bot]](https://github.com/apps/dependabot)) +- fix: Resolve Avro RecordEncoder bugs related to nullable Struct fields and Union type ids [\#8935](https://github.com/apache/arrow-rs/pull/8935) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jecsand838](https://github.com/jecsand838)) +- Some panic!s could more semantically be unimplemented! 
[\#8933](https://github.com/apache/arrow-rs/pull/8933) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([abacef](https://github.com/abacef)) +- fix: ipc decode panic with invalid data [\#8931](https://github.com/apache/arrow-rs/pull/8931) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([leiysky](https://github.com/leiysky)) +- Allow creating zero-sized FixedSizeBinary arrays [\#8927](https://github.com/apache/arrow-rs/pull/8927) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) +- Update `test_variant_get_error_when_cast_failure...` tests to use a valid `VariantArray` [\#8921](https://github.com/apache/arrow-rs/pull/8921) ([alamb](https://github.com/alamb)) +- Make flight sql client generic [\#8915](https://github.com/apache/arrow-rs/pull/8915) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([lewiszlw](https://github.com/lewiszlw)) +- \[minor\] Name Magic Number "8" in `FixedSizeBinaryArray::new_null` [\#8914](https://github.com/apache/arrow-rs/pull/8914) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) +- fix: cast Binary/String dictionary to view [\#8912](https://github.com/apache/arrow-rs/pull/8912) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- \[8910\]Fixed doc test with feature prettyprint [\#8911](https://github.com/apache/arrow-rs/pull/8911) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([manishkr](https://github.com/manishkr)) +- feat: `ArrayData::new_null` for `ListView` / `LargeListView` [\#8909](https://github.com/apache/arrow-rs/pull/8909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) +- feat: add `GenericListViewArray::from_iter_primitive` [\#8907](https://github.com/apache/arrow-rs/pull/8907) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) +- fix: `GenericListViewArray::new_null` returns empty array [\#8905](https://github.com/apache/arrow-rs/pull/8905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dqkqd](https://github.com/dqkqd)) +- Allocate a zeroed buffer for FixedSizeBinaryArray::null [\#8901](https://github.com/apache/arrow-rs/pull/8901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tobixdev](https://github.com/tobixdev)) +- build\(deps\): bump actions/checkout from 5 to 6 [\#8899](https://github.com/apache/arrow-rs/pull/8899) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add getters to `UnionFields` [\#8895](https://github.com/apache/arrow-rs/pull/8895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([friendlymatthew](https://github.com/friendlymatthew)) +- Add validated constructors for UnionFields [\#8891](https://github.com/apache/arrow-rs/pull/8891) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([friendlymatthew](https://github.com/friendlymatthew)) +- Add bit width check [\#8888](https://github.com/apache/arrow-rs/pull/8888) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rambleraptor](https://github.com/rambleraptor)) +- \[Variant\] Improve `variant_get` performance on a perfect shredding [\#8887](https://github.com/apache/arrow-rs/pull/8887) ([XiangpengHao](https://github.com/XiangpengHao)) +- Add UnionArray::fields [\#8884](https://github.com/apache/arrow-rs/pull/8884) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([friendlymatthew](https://github.com/friendlymatthew)) +- Struct casting field order [\#8871](https://github.com/apache/arrow-rs/pull/8871) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- Add support for `Union` types in `RowConverter` 
[\#8839](https://github.com/apache/arrow-rs/pull/8839) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([friendlymatthew](https://github.com/friendlymatthew)) +- Add comparison support for Union arrays [\#8838](https://github.com/apache/arrow-rs/pull/8838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([friendlymatthew](https://github.com/friendlymatthew)) +- \[Variant\] Support array shredding into `List/LargeList/ListView/LargeListView` [\#8831](https://github.com/apache/arrow-rs/pull/8831) ([liamzwbao](https://github.com/liamzwbao)) +- Add support for using ListView arrays and types through FFI [\#8822](https://github.com/apache/arrow-rs/pull/8822) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([AdamGS](https://github.com/AdamGS)) +- Add ability to skip or transform page encoding statistics in Parquet metadata [\#8797](https://github.com/apache/arrow-rs/pull/8797) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Implement a `Vec` wrapper for `pyarrow.Table` convenience [\#8790](https://github.com/apache/arrow-rs/pull/8790) ([jonded94](https://github.com/jonded94)) +- Make Parquet SBBF serialize/deserialize helpers public for external reuse [\#8762](https://github.com/apache/arrow-rs/pull/8762) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([RoseZhang123](https://github.com/RoseZhang123)) +- Add cast support for \(Large\)ListView \<-\> \(Large\)List [\#8735](https://github.com/apache/arrow-rs/pull/8735) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vegarsti](https://github.com/vegarsti)) diff --git a/Cargo.toml b/Cargo.toml index a1641d904b67..e4f1780d2914 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,7 +68,7 @@ exclude = [ ] [workspace.package] -version = "57.1.0" +version = "57.2.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -85,26 +85,26 @@ edition = 
"2024" rust-version = "1.85" [workspace.dependencies] -arrow = { version = "57.1.0", path = "./arrow", default-features = false } -arrow-arith = { version = "57.1.0", path = "./arrow-arith" } -arrow-array = { version = "57.1.0", path = "./arrow-array" } -arrow-buffer = { version = "57.1.0", path = "./arrow-buffer" } -arrow-cast = { version = "57.1.0", path = "./arrow-cast" } -arrow-csv = { version = "57.1.0", path = "./arrow-csv" } -arrow-data = { version = "57.1.0", path = "./arrow-data" } -arrow-ipc = { version = "57.1.0", path = "./arrow-ipc" } -arrow-json = { version = "57.1.0", path = "./arrow-json" } -arrow-ord = { version = "57.1.0", path = "./arrow-ord" } -arrow-pyarrow = { version = "57.1.0", path = "./arrow-pyarrow" } -arrow-row = { version = "57.1.0", path = "./arrow-row" } -arrow-schema = { version = "57.1.0", path = "./arrow-schema" } -arrow-select = { version = "57.1.0", path = "./arrow-select" } -arrow-string = { version = "57.1.0", path = "./arrow-string" } -parquet = { version = "57.1.0", path = "./parquet", default-features = false } -parquet-geospatial = { version = "57.1.0", path = "./parquet-geospatial" } -parquet-variant = { version = "57.1.0", path = "./parquet-variant" } -parquet-variant-json = { version = "57.1.0", path = "./parquet-variant-json" } -parquet-variant-compute = { version = "57.1.0", path = "./parquet-variant-compute" } +arrow = { version = "57.2.0", path = "./arrow", default-features = false } +arrow-arith = { version = "57.2.0", path = "./arrow-arith" } +arrow-array = { version = "57.2.0", path = "./arrow-array" } +arrow-buffer = { version = "57.2.0", path = "./arrow-buffer" } +arrow-cast = { version = "57.2.0", path = "./arrow-cast" } +arrow-csv = { version = "57.2.0", path = "./arrow-csv" } +arrow-data = { version = "57.2.0", path = "./arrow-data" } +arrow-ipc = { version = "57.2.0", path = "./arrow-ipc" } +arrow-json = { version = "57.2.0", path = "./arrow-json" } +arrow-ord = { version = "57.2.0", path = "./arrow-ord" } 
+arrow-pyarrow = { version = "57.2.0", path = "./arrow-pyarrow" } +arrow-row = { version = "57.2.0", path = "./arrow-row" } +arrow-schema = { version = "57.2.0", path = "./arrow-schema" } +arrow-select = { version = "57.2.0", path = "./arrow-select" } +arrow-string = { version = "57.2.0", path = "./arrow-string" } +parquet = { version = "57.2.0", path = "./parquet", default-features = false } +parquet-geospatial = { version = "57.2.0", path = "./parquet-geospatial" } +parquet-variant = { version = "57.2.0", path = "./parquet-variant" } +parquet-variant-json = { version = "57.2.0", path = "./parquet-variant-json" } +parquet-variant-compute = { version = "57.2.0", path = "./parquet-variant-compute" } chrono = { version = "0.4.40", default-features = false, features = ["clock"] } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 2812988382d3..7f0195bbd7bb 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="57.0.0" -FUTURE_RELEASE="57.1.0" +SINCE_TAG="57.1.0" +FUTURE_RELEASE="57.2.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 67e04e758f1e62ec3d78d2f678daf433a4c54e30 Mon Sep 17 00:00:00 2001 From: WaterWhisperer Date: Thu, 8 Jan 2026 16:35:14 +0800 Subject: [PATCH 27/33] feat: change default behavior for Parquet `PageEncodingStats` to bitmask (#9051) # Which issue does this PR close? - Closes #8859 # Rationale for this change > Currently the default behavior is to parse the full vector of encoding stats, but given the limited use of this information we should instead default to the more concise and performant bitmask. # What changes are included in this PR? 
- Implement `Default` for `ParquetMetaDataOptions` with `encoding_stats_as_mask: true` - Update Thrift decoding logic to default to bitmask even if options are missing - Update documentation to reflect the new default behavior - Update existing tests to maintain coverage for full statistics - Add new tests to verify default behavior and full stats option # Are these changes tested? Yes # Are there any user-facing changes? Yes --- parquet/benches/metadata.rs | 39 ++++--------- parquet/src/arrow/arrow_writer/mod.rs | 5 +- parquet/src/arrow/mod.rs | 16 ++++- parquet/src/file/metadata/mod.rs | 77 ++++++++++++++++++++++++- parquet/src/file/metadata/options.rs | 20 ++++++- parquet/src/file/metadata/thrift/mod.rs | 14 ++++- parquet/src/file/serialized_reader.rs | 5 +- 7 files changed, 138 insertions(+), 38 deletions(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index c962a4c3fdf8..6f5f56745e90 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use parquet::basic::{Encoding, PageType, Type as PhysicalType}; use parquet::file::metadata::{ ColumnChunkMetaData, FileMetaData, PageEncodingStats, ParquetMetaData, ParquetMetaDataOptions, - ParquetMetaDataReader, ParquetMetaDataWriter, ParquetStatisticsPolicy, RowGroupMetaData, + ParquetMetaDataReader, ParquetMetaDataWriter, RowGroupMetaData, }; use parquet::file::statistics::Statistics; use parquet::file::writer::TrackedWrite; @@ -164,26 +164,17 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - let schema = ParquetMetaDataReader::decode_schema(&meta_data).unwrap(); - let options = ParquetMetaDataOptions::new().with_schema(schema); - c.bench_function("decode metadata with schema", |b| { - b.iter(|| { - ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options)) - .unwrap(); - }) - }); - - let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(true); - c.bench_function("decode metadata with stats 
mask", |b| { + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); + c.bench_function("decode metadata (full stats)", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options)) .unwrap(); }) }); - let options = - ParquetMetaDataOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll); - c.bench_function("decode metadata with skip PES", |b| { + let schema = ParquetMetaDataReader::decode_schema(&meta_data).unwrap(); + let options = ParquetMetaDataOptions::new().with_schema(schema); + c.bench_function("decode metadata with schema", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options)) .unwrap(); @@ -197,24 +188,16 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - let schema = ParquetMetaDataReader::decode_schema(&buf).unwrap(); - let options = ParquetMetaDataOptions::new().with_schema(schema); - c.bench_function("decode metadata (wide) with schema", |b| { + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); + c.bench_function("decode metadata (wide) (full stats)", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); }) }); - let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(true); - c.bench_function("decode metadata (wide) with stats mask", |b| { - b.iter(|| { - ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); - }) - }); - - let options = - ParquetMetaDataOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll); - c.bench_function("decode metadata (wide) with skip PES", |b| { + let schema = ParquetMetaDataReader::decode_schema(&buf).unwrap(); + let options = ParquetMetaDataOptions::new().with_schema(schema); + c.bench_function("decode metadata (wide) with schema", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); }) diff --git 
a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 6b1566a681e7..a6cd2006782f 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -4480,7 +4480,10 @@ mod tests { .unwrap(); // check that the read metadata is also correct - let options = ReadOptionsBuilder::new().with_page_index().build(); + let options = ReadOptionsBuilder::new() + .with_page_index() + .with_encoding_stats_as_mask(false) + .build(); let reader = SerializedFileReader::new_with_options(file, options).unwrap(); let rowgroup = reader.get_row_group(0).expect("row group missing"); diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 672ffb6fc521..52152988166f 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -494,7 +494,9 @@ pub fn parquet_column<'a>( #[cfg(test)] mod test { use crate::arrow::ArrowWriter; - use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetMetaDataWriter}; + use crate::file::metadata::{ + ParquetMetaData, ParquetMetaDataOptions, ParquetMetaDataReader, ParquetMetaDataWriter, + }; use crate::file::properties::{EnabledStatistics, WriterProperties}; use crate::schema::parser::parse_message_type; use crate::schema::types::SchemaDescriptor; @@ -511,13 +513,17 @@ mod test { let parquet_bytes = create_parquet_file(); // read the metadata from the file WITHOUT the page index structures + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); let original_metadata = ParquetMetaDataReader::new() + .with_metadata_options(Some(options)) .parse_and_finish(&parquet_bytes) .unwrap(); // this should error because the page indexes are not present, but have offsets specified let metadata_bytes = metadata_to_bytes(&original_metadata); + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); let err = ParquetMetaDataReader::new() + .with_metadata_options(Some(options)) .with_page_indexes(true) // there are no page 
indexes in the metadata .parse_and_finish(&metadata_bytes) .err() @@ -533,7 +539,9 @@ mod test { let parquet_bytes = create_parquet_file(); // read the metadata from the file + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); let original_metadata = ParquetMetaDataReader::new() + .with_metadata_options(Some(options)) .parse_and_finish(&parquet_bytes) .unwrap(); @@ -545,7 +553,9 @@ mod test { "metadata is subset of parquet" ); + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); let roundtrip_metadata = ParquetMetaDataReader::new() + .with_metadata_options(Some(options)) .parse_and_finish(&metadata_bytes) .unwrap(); @@ -559,14 +569,18 @@ mod test { // read the metadata from the file including the page index structures // (which are stored elsewhere in the footer) + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); let original_metadata = ParquetMetaDataReader::new() + .with_metadata_options(Some(options)) .with_page_indexes(true) .parse_and_finish(&parquet_bytes) .unwrap(); // read metadata back from the serialized bytes and ensure it is the same let metadata_bytes = metadata_to_bytes(&original_metadata); + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); let roundtrip_metadata = ParquetMetaDataReader::new() + .with_metadata_options(Some(options)) .with_page_indexes(true) .parse_and_finish(&metadata_bytes) .unwrap(); diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 6bd426ee677f..ca3a9e10978b 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1062,6 +1062,10 @@ impl ColumnChunkMetaData { /// Returns the page encoding statistics, or `None` if no page encoding statistics /// are available (or they were converted to a mask). + /// + /// Note: By default, this crate converts page encoding statistics to a mask for performance + /// reasons. 
To get the full statistics, you must set [`ParquetMetaDataOptions::with_encoding_stats_as_mask`] + /// to `false`. pub fn page_encoding_stats(&self) -> Option<&Vec> { match self.encoding_stats.as_ref() { Some(ParquetPageEncodingStats::Full(stats)) => Some(stats), @@ -1072,6 +1076,8 @@ impl ColumnChunkMetaData { /// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are /// not available (or they were left in their original form). /// + /// Note: This is the default behavior for this crate. + /// /// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to /// enable fast determination of whether all pages in a column chunk are dictionary encoded /// (see ). @@ -1667,7 +1673,9 @@ impl OffsetIndexBuilder { mod tests { use super::*; use crate::basic::{PageType, SortOrder}; - use crate::file::metadata::thrift::tests::{read_column_chunk, read_row_group}; + use crate::file::metadata::thrift::tests::{ + read_column_chunk, read_column_chunk_with_options, read_row_group, + }; #[test] fn test_row_group_metadata_thrift_conversion() { @@ -1822,7 +1830,72 @@ mod tests { let mut buf = Vec::new(); let mut writer = ThriftCompactOutputProtocol::new(&mut buf); col_metadata.write_thrift(&mut writer).unwrap(); - let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap(); + let col_chunk_res = read_column_chunk(&mut buf, column_descr.clone()).unwrap(); + + let expected_metadata = ColumnChunkMetaData::builder(column_descr) + .set_encodings_mask(EncodingMask::new_from_encodings( + [Encoding::PLAIN, Encoding::RLE].iter(), + )) + .set_file_path("file_path".to_owned()) + .set_num_values(1000) + .set_compression(Compression::SNAPPY) + .set_total_compressed_size(2000) + .set_total_uncompressed_size(3000) + .set_data_page_offset(4000) + .set_dictionary_page_offset(Some(5000)) + .set_page_encoding_stats_mask(EncodingMask::new_from_encodings( + [Encoding::PLAIN, Encoding::RLE].iter(), + )) + 
.set_bloom_filter_offset(Some(6000)) + .set_bloom_filter_length(Some(25)) + .set_offset_index_offset(Some(7000)) + .set_offset_index_length(Some(25)) + .set_column_index_offset(Some(8000)) + .set_column_index_length(Some(25)) + .set_unencoded_byte_array_data_bytes(Some(2000)) + .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100]))) + .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200]))) + .build() + .unwrap(); + + assert_eq!(col_chunk_res, expected_metadata); + } + + #[test] + fn test_column_chunk_metadata_thrift_conversion_full_stats() { + let column_descr = get_test_schema_descr().column(0); + let stats = vec![ + PageEncodingStats { + page_type: PageType::DATA_PAGE, + encoding: Encoding::PLAIN, + count: 3, + }, + PageEncodingStats { + page_type: PageType::DATA_PAGE, + encoding: Encoding::RLE, + count: 5, + }, + ]; + let col_metadata = ColumnChunkMetaData::builder(column_descr.clone()) + .set_encodings_mask(EncodingMask::new_from_encodings( + [Encoding::PLAIN, Encoding::RLE].iter(), + )) + .set_num_values(1000) + .set_compression(Compression::SNAPPY) + .set_total_compressed_size(2000) + .set_total_uncompressed_size(3000) + .set_data_page_offset(4000) + .set_page_encoding_stats(stats) + .build() + .unwrap(); + + let mut buf = Vec::new(); + let mut writer = ThriftCompactOutputProtocol::new(&mut buf); + col_metadata.write_thrift(&mut writer).unwrap(); + + let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); + let col_chunk_res = + read_column_chunk_with_options(&mut buf, column_descr, Some(&options)).unwrap(); assert_eq!(col_chunk_res, col_metadata); } diff --git a/parquet/src/file/metadata/options.rs b/parquet/src/file/metadata/options.rs index c1ee22ff8de9..0bd0dfd9e30a 100644 --- a/parquet/src/file/metadata/options.rs +++ b/parquet/src/file/metadata/options.rs @@ -87,13 +87,23 @@ impl ParquetStatisticsPolicy { /// [`ParquetMetaData`]: crate::file::metadata::ParquetMetaData /// 
[`ParquetMetaDataReader`]: crate::file::metadata::ParquetMetaDataReader /// [`ParquetMetaDataPushDecoder`]: crate::file::metadata::ParquetMetaDataPushDecoder -#[derive(Default, Debug, Clone)] +#[derive(Debug, Clone)] pub struct ParquetMetaDataOptions { schema_descr: Option, encoding_stats_as_mask: bool, encoding_stats_policy: ParquetStatisticsPolicy, } +impl Default for ParquetMetaDataOptions { + fn default() -> Self { + Self { + schema_descr: None, + encoding_stats_as_mask: true, + encoding_stats_policy: ParquetStatisticsPolicy::KeepAll, + } + } +} + impl ParquetMetaDataOptions { /// Return a new default [`ParquetMetaDataOptions`]. pub fn new() -> Self { @@ -118,7 +128,7 @@ impl ParquetMetaDataOptions { } /// Returns whether to present the [`encoding_stats`] field of the Parquet `ColumnMetaData` - /// as a bitmask (defaults to `false`). + /// as a bitmask (defaults to `true`). /// /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this /// might be desirable. 
@@ -193,6 +203,12 @@ mod tests { }; use std::{io::Read, sync::Arc}; + #[test] + fn test_options_default() { + let options = ParquetMetaDataOptions::default(); + assert!(options.encoding_stats_as_mask()); + } + #[test] fn test_provide_schema() { let mut buf: Vec = Vec::new(); diff --git a/parquet/src/file/metadata/thrift/mod.rs b/parquet/src/file/metadata/thrift/mod.rs index 95ad67da6d95..154fde77edb9 100644 --- a/parquet/src/file/metadata/thrift/mod.rs +++ b/parquet/src/file/metadata/thrift/mod.rs @@ -410,7 +410,7 @@ fn read_column_metadata<'a>( let mut seen_mask = 0u16; let mut skip_pes = false; - let mut pes_mask = false; + let mut pes_mask = true; if let Some(opts) = options { skip_pes = opts.skip_encoding_stats(col_index); @@ -1704,7 +1704,7 @@ write_thrift_field!(RustBoundingBox, FieldType::Struct); pub(crate) mod tests { use crate::errors::Result; use crate::file::metadata::thrift::{BoundingBox, SchemaElement, write_schema}; - use crate::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; + use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaDataOptions, RowGroupMetaData}; use crate::parquet_thrift::tests::test_roundtrip; use crate::parquet_thrift::{ ElementType, ThriftCompactOutputProtocol, ThriftSliceInputProtocol, read_thrift_vec, @@ -1726,9 +1726,17 @@ pub(crate) mod tests { pub(crate) fn read_column_chunk( buf: &mut [u8], column_descr: Arc, + ) -> Result { + read_column_chunk_with_options(buf, column_descr, None) + } + + pub(crate) fn read_column_chunk_with_options( + buf: &mut [u8], + column_descr: Arc, + options: Option<&ParquetMetaDataOptions>, ) -> Result { let mut reader = ThriftSliceInputProtocol::new(buf); - crate::file::metadata::thrift::read_column_chunk(&mut reader, &column_descr, 0, None) + crate::file::metadata::thrift::read_column_chunk(&mut reader, &column_descr, 0, options) } pub(crate) fn roundtrip_schema(schema: TypePtr) -> Result { diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs 
index 8ef7b972d7e1..68b44f3cbbde 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1855,7 +1855,10 @@ mod tests { fn test_file_reader_optional_metadata() { // file with optional metadata: bloom filters, encoding stats, column index and offset index. let file = get_test_file("data_index_bloom_encoding_stats.parquet"); - let file_reader = Arc::new(SerializedFileReader::new(file).unwrap()); + let options = ReadOptionsBuilder::new() + .with_encoding_stats_as_mask(false) + .build(); + let file_reader = Arc::new(SerializedFileReader::new_with_options(file, options).unwrap()); let row_group_metadata = file_reader.metadata.row_group(0); let col0_metadata = row_group_metadata.column(0); From 37d501365bfcb9b6b85e22754e1d2a6fff20514f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 8 Jan 2026 11:09:15 -0500 Subject: [PATCH 28/33] docs: Update release schedule in README.md (#9111) # Which issue does this PR close? - releated to https://github.com/apache/arrow-rs/issues/7392 # Rationale for this change Keep website updated with our planned release schedule # What changes are included in this PR? Update README with planned release schedule # Are these changes tested? N/A # Are there any user-facing changes? 
Update readme --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 56921f382860..7726fc4c0703 100644 --- a/README.md +++ b/README.md @@ -65,15 +65,17 @@ Planned Release Schedule | Approximate Date | Version | Notes | | ---------------- | ---------- | --------------------------------------- | -| October 2025 | [`57.0.0`] | Major, potentially breaking API changes | -| November 2025 | [`57.1.0`] | Minor, NO breaking API changes | | December 2025 | [`57.2.0`] | Minor, NO breaking API changes | | January 2026 | [`58.0.0`] | Major, potentially breaking API changes | +| February 2026 | [`58.1.0`] | Minor, NO breaking API changes | +| March 2026 | [`58.2.0`] | Minor, NO breaking API changes | +| April 2026 | [`59.0.0`] | Major, potentially breaking API changes | -[`57.0.0`]: https://github.com/apache/arrow-rs/issues/7835 -[`57.1.0`]: https://github.com/apache/arrow-rs/milestone/3 [`57.2.0`]: https://github.com/apache/arrow-rs/milestone/5 [`58.0.0`]: https://github.com/apache/arrow-rs/milestone/6 +[`58.1.0`]: https://github.com/apache/arrow-rs/issues/9108 +[`58.2.0`]: https://github.com/apache/arrow-rs/issues/9109 +[`59.0.0`]: https://github.com/apache/arrow-rs/issues/9110 [ticket #5368]: https://github.com/apache/arrow-rs/issues/5368 [semantic versioning]: https://semver.org/ From 73bbfeef68ef57969c45bf8bb3b868d12fc7d7ed Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Thu, 8 Jan 2026 19:06:00 +0200 Subject: [PATCH 29/33] feat: add benchmarks for json parser (#9107) # Which issue does this PR close? - Closes #NNN. # Rationale for this change Add targeted JSON reader benchmarks to track performance for wide objects, hex-encoded binary inputs, and projection workloads. # What changes are included in this PR? - Add `arrow-json/benches/wide_object.rs` for wide-object decode/serialize benchmarks. - Add `arrow-json/benches/binary_hex.rs` for hex string decoding into Binary/FixedSizeBinary/BinaryView. 
- Add `arrow-json/benches/wide_projection.rs` for full vs projected schema decoding. # Are these changes tested? No # Are there any user-facing changes? No --- arrow-json/Cargo.toml | 4 + arrow-json/benches/json-reader.rs | 250 ++++++++++++++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 arrow-json/benches/json-reader.rs diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index f2653ec4e46e..5fcde480eb6d 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -65,3 +65,7 @@ rand = { version = "0.9", default-features = false, features = ["std", "std_rng" [[bench]] name = "serde" harness = false + +[[bench]] +name = "json-reader" +harness = false diff --git a/arrow-json/benches/json-reader.rs b/arrow-json/benches/json-reader.rs new file mode 100644 index 000000000000..504839f8ffe2 --- /dev/null +++ b/arrow-json/benches/json-reader.rs @@ -0,0 +1,250 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow_json::ReaderBuilder; +use arrow_json::reader::Decoder; +use arrow_schema::{DataType, Field, Schema}; +use criterion::{ + BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use serde_json::{Map, Number, Value}; +use std::fmt::Write; +use std::hint::black_box; +use std::sync::Arc; + +const ROWS: usize = 1 << 17; // 128K rows +const BATCH_SIZE: usize = 1 << 13; // 8K rows per batch + +const WIDE_FIELDS: usize = 64; +const BINARY_BYTES: usize = 64; +const WIDE_PROJECTION_TOTAL_FIELDS: usize = 100; // 100 fields total, select only 3 + +fn decode_and_flush(decoder: &mut Decoder, data: &[u8]) { + let mut offset = 0; + while offset < data.len() { + let read = decoder.decode(black_box(&data[offset..])).unwrap(); + if read == 0 { + break; + } + offset += read; + while let Some(_batch) = decoder.flush().unwrap() {} + } +} + +fn build_schema(field_count: usize) -> Arc { + // Builds a schema with fields named f0..f{field_count-1}, all Int64 and non-nullable. + let fields: Vec = (0..field_count) + .map(|i| Field::new(format!("f{i}"), DataType::Int64, false)) + .collect(); + Arc::new(Schema::new(fields)) +} + +fn build_projection_schema(indices: &[usize]) -> Arc { + let fields: Vec = indices + .iter() + .map(|i| Field::new(format!("f{i}"), DataType::Int64, false)) + .collect(); + Arc::new(Schema::new(fields)) +} + +fn build_wide_json(rows: usize, fields: usize) -> Vec { + // Builds newline-delimited JSON objects with "wide" schema. 
+ // Example (rows=2, fields=3): + // {"f0":0,"f1":1,"f2":2} + // {"f0":1,"f1":2,"f2":3} + let mut out = String::with_capacity(rows * fields * 12); + for row in 0..rows { + out.push('{'); + for field in 0..fields { + if field > 0 { + out.push(','); + } + let value = row as i64 + field as i64; + write!(&mut out, "\"f{field}\":{value}").unwrap(); + } + out.push('}'); + out.push('\n'); + } + out.into_bytes() +} + +fn build_wide_values(rows: usize, fields: usize) -> Vec { + // Mirrors build_wide_json but returns structured serde_json::Value objects. + let mut out = Vec::with_capacity(rows); + for row in 0..rows { + let mut map = Map::with_capacity(fields); + for field in 0..fields { + let key = format!("f{field}"); + let value = Number::from((row + field) as i64); + map.insert(key, Value::Number(value)); + } + out.push(Value::Object(map)); + } + out +} + +fn bench_decode_wide_object(c: &mut Criterion) { + let data = build_wide_json(ROWS, WIDE_FIELDS); + let schema = build_schema(WIDE_FIELDS); + + c.bench_function("decode_wide_object_i64_json", |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + decode_and_flush(&mut decoder, &data); + }) + }); +} + +fn bench_serialize_wide_object(c: &mut Criterion) { + let values = build_wide_values(ROWS, WIDE_FIELDS); + let schema = build_schema(WIDE_FIELDS); + + c.bench_function("decode_wide_object_i64_serialize", |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + + decoder.serialize(&values).unwrap(); + while let Some(_batch) = decoder.flush().unwrap() {} + }) + }); +} + +fn bench_decode_binary(c: &mut Criterion, name: &str, data: &[u8], field: Arc) { + c.bench_function(name, |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new_with_field(field.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + decode_and_flush(&mut decoder, 
data); + }) + }); +} + +#[inline] +fn append_hex_byte(buf: &mut String, byte: u8) { + const HEX: &[u8; 16] = b"0123456789abcdef"; + buf.push(HEX[(byte >> 4) as usize] as char); + buf.push(HEX[(byte & 0x0f) as usize] as char); +} + +fn build_hex_lines(rows: usize, bytes_per_row: usize) -> Vec { + let mut data = String::with_capacity(rows * (bytes_per_row * 2 + 3)); + for row in 0..rows { + data.push('"'); + for i in 0..bytes_per_row { + let byte = ((row + i) & 0xff) as u8; + append_hex_byte(&mut data, byte); + } + data.push('"'); + data.push('\n'); + } + data.into_bytes() +} + +fn bench_binary_hex(c: &mut Criterion) { + let binary_data = build_hex_lines(ROWS, BINARY_BYTES); + + let binary_field = Arc::new(Field::new("item", DataType::Binary, false)); + bench_decode_binary(c, "decode_binary_hex_json", &binary_data, binary_field); + + let fixed_field = Arc::new(Field::new( + "item", + DataType::FixedSizeBinary(BINARY_BYTES as i32), + false, + )); + bench_decode_binary(c, "decode_fixed_binary_hex_json", &binary_data, fixed_field); + + let view_field = Arc::new(Field::new("item", DataType::BinaryView, false)); + bench_decode_binary(c, "decode_binary_view_hex_json", &binary_data, view_field); +} + +fn bench_decode_schema(c: &mut Criterion, name: &str, data: &[u8], schema: Arc) { + let mut group = c.benchmark_group(name); + group.throughput(Throughput::Bytes(data.len() as u64)); + group.sample_size(50); + group.measurement_time(std::time::Duration::from_secs(5)); + group.warm_up_time(std::time::Duration::from_secs(2)); + group.sampling_mode(SamplingMode::Flat); + group.bench_function(BenchmarkId::from_parameter(ROWS), |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + decode_and_flush(&mut decoder, data); + }) + }); + group.finish(); +} + +fn build_wide_projection_json(rows: usize, total_fields: usize) -> Vec { + // Estimate: each field ~15 bytes ("fXX":VVVVVVV,), total ~15*100 + 
overhead + let per_row_size = total_fields * 15 + 10; + let mut data = String::with_capacity(rows * per_row_size); + + for _row in 0..rows { + data.push('{'); + for i in 0..total_fields { + if i > 0 { + data.push(','); + } + // Use fixed-width values for stable benchmarks: 7 digits + let _ = write!(data, "\"f{}\":{:07}", i, i); + } + data.push('}'); + data.push('\n'); + } + data.into_bytes() +} + +fn bench_wide_projection(c: &mut Criterion) { + // Wide projection workload: tests overhead of parsing unused fields + let wide_projection_data = build_wide_projection_json(ROWS, WIDE_PROJECTION_TOTAL_FIELDS); + + let full_schema = build_schema(WIDE_PROJECTION_TOTAL_FIELDS); + bench_decode_schema( + c, + "decode_wide_projection_full_json", + &wide_projection_data, + full_schema, + ); + + // Projected schema: only 3 fields (f0, f10, f50) out of 100 + let projected_schema = build_projection_schema(&[0, 10, 50]); + bench_decode_schema( + c, + "decode_wide_projection_narrow_json", + &wide_projection_data, + projected_schema, + ); +} + +criterion_group!( + benches, + bench_decode_wide_object, + bench_serialize_wide_object, + bench_binary_hex, + bench_wide_projection +); +criterion_main!(benches); From 964daecce22c08b60288bb4d00028ed950dabd56 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 8 Jan 2026 15:05:54 -0500 Subject: [PATCH 30/33] chore: switch test from `bincode` to maintained `postcard` crate (RUSTSEC-2025-0141 ) (#9104) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Which issue does this PR close? 
- Closes https://github.com/apache/arrow-rs/pull/9009 - Also addresses https://rustsec.org/advisories/RUSTSEC-2025-0141 -- this is only used for testing but still # Rationale for this change https://crates.io/crates/bincode is unmaintained Screenshot 2026-01-06 at 5 57 13 PM There also appears to be some sort of drama related to the maintainer While we only use this code in tests, it would be nice to avoid issues sooner rather than later # What changes are included in this PR? Change to use postcard # Are these changes tested? by ci # Are there any user-facing changes? --- arrow-schema/Cargo.toml | 5 +---- arrow-schema/src/field.rs | 6 ++---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index cd9bf767e16b..fb6461a9e9ae 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -53,12 +53,9 @@ serde = ["dep:serde_core", "dep:serde"] all-features = true [dev-dependencies] -bincode = { version = "2.0.1", default-features = false, features = [ - "std", - "serde", -] } criterion = { workspace = true, default-features = false } insta = "1.43.1" +postcard = { version = "1.0.10", default-features = false, features = ["use-std"] } [[bench]] name = "ffi" diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 3b3372a78eae..1b9a298e5918 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -1448,10 +1448,8 @@ mod test { #[cfg(feature = "serde")] fn assert_binary_serde_round_trip(field: Field) { - let config = bincode::config::legacy(); - let serialized = bincode::serde::encode_to_vec(&field, config).unwrap(); - let (deserialized, _): (Field, _) = - bincode::serde::decode_from_slice(&serialized, config).unwrap(); + let serialized = postcard::to_stdvec(&field).unwrap(); + let deserialized: Field = postcard::from_bytes(&serialized).unwrap(); assert_eq!(field, deserialized) } From 96637fc8b928a94de53bbec3501337c0ecfbf936 Mon Sep 17 00:00:00 2001 From: Andrew Lamb 
Date: Fri, 9 Jan 2026 05:51:52 -0500 Subject: [PATCH 31/33] Speed up binary kernels (30% faster `and` and `or`), add `BooleanBuffer::from_bitwise_binary_op` (#9090) # Which issue does this PR close? - Part of https://github.com/apache/arrow-rs/issues/8806 - Closes https://github.com/apache/arrow-rs/pull/8854 - Closes https://github.com/apache/arrow-rs/pull/8807 This is the next step after - https://github.com/apache/arrow-rs/pull/8996 # Rationale for this change - we can help rust / LLVM generate more optimal code by processing u64 words at a time when the buffer is already u64 aligned (see #8807) Also, it is hard to find the code to create new Buffers by applying bitwise unary operations. # What changes are included in this PR? - Introduce optimized `BooleanBuffer::from_bitwise_binary` - Migrate several kernels that use `bitwise_bin_op_helper` to use the new BooleanBuffer # Are these changes tested? Yes new tests are added Performance results show 30% performance improvement for the `and` and `or` kernels for aligned buffers (common case) # Are there any user-facing changes? A new API --- arrow-arith/src/boolean.rs | 24 ++--- arrow-buffer/src/buffer/boolean.rs | 150 ++++++++++++++++++++++++++++- arrow-buffer/src/buffer/ops.rs | 13 ++- arrow-select/src/nullif.rs | 2 +- 4 files changed, 169 insertions(+), 20 deletions(-) diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index d94df49de256..6bf438e64618 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -23,7 +23,7 @@ //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. 
use arrow_array::*; -use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper}; +use arrow_buffer::buffer::bitwise_quaternary_op_helper; use arrow_buffer::{BooleanBuffer, NullBuffer, buffer_bin_and_not}; use arrow_schema::ArrowError; @@ -74,7 +74,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result { // Same as above - Some(bitwise_bin_op_helper( + Some(BooleanBuffer::from_bitwise_binary_op( right_null_buffer.buffer(), right_null_buffer.offset(), left_values.inner(), @@ -100,7 +100,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result Result Result { // Same as above - Some(bitwise_bin_op_helper( + Some(BooleanBuffer::from_bitwise_binary_op( right_nulls.buffer(), right_nulls.offset(), left_values.inner(), @@ -195,7 +196,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result( + left: impl AsRef<[u8]>, + left_offset_in_bits: usize, + right: impl AsRef<[u8]>, + right_offset_in_bits: usize, + len_in_bits: usize, + mut op: F, + ) -> Self + where + F: FnMut(u64, u64) -> u64, + { + let left = left.as_ref(); + let right = right.as_ref(); + // try fast path for aligned input + // If the underlying buffers are aligned to u64 we can apply the operation directly on the u64 slices + // to improve performance. + if left_offset_in_bits & 0x7 == 0 && right_offset_in_bits & 0x7 == 0 { + // align to byte boundary + let left = &left[left_offset_in_bits / 8..]; + let right = &right[right_offset_in_bits / 8..]; + + unsafe { + let (left_prefix, left_u64s, left_suffix) = left.align_to::(); + let (right_prefix, right_u64s, right_suffix) = right.align_to::(); + // if there is no prefix or suffix, both buffers are aligned and + // we can do the operation directly on u64s. + // TODO: consider `slice::as_chunks` and `u64::from_le_bytes` when MSRV reaches 1.88. 
+ // https://github.com/apache/arrow-rs/pull/9022#discussion_r2639949361 + if left_prefix.is_empty() + && right_prefix.is_empty() + && left_suffix.is_empty() + && right_suffix.is_empty() + { + let result_u64s = left_u64s + .iter() + .zip(right_u64s.iter()) + .map(|(l, r)| op(*l, *r)) + .collect::>(); + return BooleanBuffer { + buffer: Buffer::from(result_u64s), + bit_offset: 0, + bit_len: len_in_bits, + }; + } + } + } + let left_chunks = BitChunks::new(left, left_offset_in_bits, len_in_bits); + let right_chunks = BitChunks::new(right, right_offset_in_bits, len_in_bits); + + let chunks = left_chunks + .iter() + .zip(right_chunks.iter()) + .map(|(left, right)| op(left, right)); + // Soundness: `BitChunks` is a `BitChunks` trusted length iterator which + // correctly reports its upper bound + let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) }; + + let remainder_bytes = bit_util::ceil(left_chunks.remainder_len(), 8); + let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); + // we are counting its starting from the least significant bit, to to_le_bytes should be correct + let rem = &rem.to_le_bytes()[0..remainder_bytes]; + buffer.extend_from_slice(rem); + + BooleanBuffer { + buffer: Buffer::from(buffer), + bit_offset: 0, + bit_len: len_in_bits, + } + } + /// Returns the number of set bits in this buffer pub fn count_set_bits(&self) -> usize { self.buffer @@ -656,4 +762,42 @@ mod tests { assert_eq!(result, expected); } } + + #[test] + fn test_from_bitwise_binary_op() { + // pick random boolean inputs + let input_bools_left = (0..1024) + .map(|_| rand::random::()) + .collect::>(); + let input_bools_right = (0..1024) + .map(|_| rand::random::()) + .collect::>(); + let input_buffer_left = BooleanBuffer::from(&input_bools_left[..]); + let input_buffer_right = BooleanBuffer::from(&input_bools_right[..]); + + for left_offset in 0..200 { + for right_offset in [0, 4, 5, 17, 33, 24, 45, 64, 65, 100, 200] { + for len_offset in [0, 1, 44, 
100, 256, 300, 512] { + let len = 1024 - len_offset - left_offset.max(right_offset); // ensure we don't go out of bounds + // compute with AND + let result = BooleanBuffer::from_bitwise_binary_op( + input_buffer_left.values(), + left_offset, + input_buffer_right.values(), + right_offset, + len, + |a, b| a & b, + ); + // compute directly from bools + let expected = input_bools_left[left_offset..] + .iter() + .zip(&input_bools_right[right_offset..]) + .take(len) + .map(|(a, b)| *a & *b) + .collect::(); + assert_eq!(result, expected); + } + } + } + } } diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs index cb0925bb2cd1..36efe876432d 100644 --- a/arrow-buffer/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -150,7 +150,7 @@ pub fn buffer_bin_and( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -158,6 +158,7 @@ pub fn buffer_bin_and( len_in_bits, |a, b| a & b, ) + .into_inner() } /// Apply a bitwise or to two inputs and return the result as a Buffer. @@ -169,7 +170,7 @@ pub fn buffer_bin_or( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -177,6 +178,7 @@ pub fn buffer_bin_or( len_in_bits, |a, b| a | b, ) + .into_inner() } /// Apply a bitwise xor to two inputs and return the result as a Buffer. @@ -188,7 +190,7 @@ pub fn buffer_bin_xor( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -196,6 +198,7 @@ pub fn buffer_bin_xor( len_in_bits, |a, b| a ^ b, ) + .into_inner() } /// Apply a bitwise and_not to two inputs and return the result as a Buffer. 
@@ -207,7 +210,7 @@ pub fn buffer_bin_and_not( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -215,11 +218,11 @@ pub fn buffer_bin_and_not( len_in_bits, |a, b| a & !b, ) + .into_inner() } /// Apply a bitwise not to one input and return the result as a Buffer. /// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. pub fn buffer_unary_not(left: &Buffer, offset_in_bits: usize, len_in_bits: usize) -> Buffer { - // TODO: should we deprecate this function in favor of the Buffer ! impl ? BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, |a| !a).into_inner() } diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index e51016f9bad3..fa875c20e302 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -23,7 +23,7 @@ use arrow_buffer::{BooleanBuffer, NullBuffer, bitwise_unary_op_helper}; use arrow_schema::{ArrowError, DataType}; /// Returns a new array with the same values and the validity bit to false where -/// the corresponding element of`right` is true. +/// the corresponding element of `right` is true. /// /// This can be used to implement SQL `NULLIF` /// From 13c43c489628a051f2f00ed9ba97d853d34287a8 Mon Sep 17 00:00:00 2001 From: Congxian Qiu Date: Sat, 10 Jan 2026 04:13:21 +0800 Subject: [PATCH 32/33] [Variant] Optimize the object header generation logic in ObjectBuilder::finish (#8031) # Which issue does this PR close? This pr wants to optimize the logic of `ObjectBuilder::finish` - Closes #7978 . # Rationale for this change This pr wants to optimize the logic of `ObjectBuilder::finish` # What changes are included in this PR? This PR wants to optimize `ObjectBuilder::finish` with packedu3 iterator # Are these changes tested? This pr was covered by existing test # Are there any user-facing changes? 
No --- parquet-variant-compute/Cargo.toml | 1 + .../benches/variant_kernels.rs | 147 ++++++++++++++++++ parquet-variant-json/src/from_json.rs | 2 +- parquet-variant-json/src/lib.rs | 2 +- parquet-variant/src/builder.rs | 75 +-------- parquet-variant/src/builder/list.rs | 2 +- parquet-variant/src/builder/metadata.rs | 2 +- parquet-variant/src/builder/object.rs | 116 +++++++++----- 8 files changed, 236 insertions(+), 111 deletions(-) diff --git a/parquet-variant-compute/Cargo.toml b/parquet-variant-compute/Cargo.toml index 74c3dd3fb72f..85d66a9cf706 100644 --- a/parquet-variant-compute/Cargo.toml +++ b/parquet-variant-compute/Cargo.toml @@ -37,6 +37,7 @@ parquet-variant = { workspace = true } parquet-variant-json = { workspace = true } chrono = { workspace = true } uuid = { version = "1.18.0", features = ["v4"]} +serde_json = "1.0" [lib] name = "parquet_variant_compute" diff --git a/parquet-variant-compute/benches/variant_kernels.rs b/parquet-variant-compute/benches/variant_kernels.rs index 13ff77d9fb18..383697ab8cc6 100644 --- a/parquet-variant-compute/benches/variant_kernels.rs +++ b/parquet-variant-compute/benches/variant_kernels.rs @@ -23,12 +23,15 @@ use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, Variant, VariantBuilder}; use parquet_variant_compute::{ GetOptions, VariantArray, VariantArrayBuilder, json_to_variant, variant_get, }; +use parquet_variant_json::append_json; use rand::Rng; use rand::SeedableRng; use rand::distr::Alphanumeric; use rand::rngs::StdRng; +use serde_json::Value; use std::fmt::Write; use std::sync::Arc; + fn benchmark_batch_json_string_to_variant(c: &mut Criterion) { let input_array = StringArray::from_iter_values(json_repeated_struct(8000)); let array_ref: ArrayRef = Arc::new(input_array); @@ -66,6 +69,58 @@ fn benchmark_batch_json_string_to_variant(c: &mut Criterion) { }); }); + let input_array = StringArray::from_iter_values(random_structure(8000, 200)); + let total_input_bytes = input_array + .iter() + .flatten() // filter None + 
.map(|v| v.len()) + .sum::(); + let id = format!( + "batch_json_string_to_variant object - 1 depth(200 fields) random_json({} bytes per document)", + total_input_bytes / input_array.len() + ); + let array_ref: ArrayRef = Arc::new(input_array); + let string_array = array_ref.as_any().downcast_ref::().unwrap(); + let mut json_array: Vec = Vec::with_capacity(string_array.len()); + for i in 0..string_array.len() { + json_array.push(serde_json::from_str(string_array.value(i)).unwrap()); + } + c.bench_function(&id, |b| { + b.iter(|| { + let mut variant_array_builder = VariantArrayBuilder::new(string_array.len()); + for json in &json_array { + append_json(json, &mut variant_array_builder).unwrap(); + } + let _ = variant_array_builder.build(); + }); + }); + + let input_array = StringArray::from_iter_values(random_structure(8000, 100)); + let total_input_bytes = input_array + .iter() + .flatten() // filter None + .map(|v| v.len()) + .sum::(); + let id = format!( + "batch_json_string_to_variant object - 1 depth(100 fields) random_json({} bytes per document)", + total_input_bytes / input_array.len() + ); + let array_ref: ArrayRef = Arc::new(input_array); + let string_array = array_ref.as_any().downcast_ref::().unwrap(); + let mut json_array: Vec = Vec::with_capacity(string_array.len()); + for i in 0..string_array.len() { + json_array.push(serde_json::from_str(string_array.value(i)).unwrap()); + } + c.bench_function(&id, |b| { + b.iter(|| { + let mut variant_array_builder = VariantArrayBuilder::new(string_array.len()); + for json in &json_array { + append_json(json, &mut variant_array_builder).unwrap(); + } + let _ = variant_array_builder.build(); + }); + }); + let input_array = StringArray::from_iter_values(random_json_structure(8000)); let total_input_bytes = input_array .iter() @@ -240,6 +295,22 @@ fn random_json_structure(count: usize) -> impl Iterator { (0..count).map(move |_| generator.next().to_string()) } +fn random_structure(count: usize, max_fields: usize) -> impl 
Iterator { + let mut generator = RandomJsonGenerator { + null_weight: 5, + string_weight: 25, + number_weight: 25, + boolean_weight: 10, + object_weight: 25, + array_weight: 0, + max_fields, + max_array_length: 0, + max_depth: 1, + ..Default::default() + }; + (0..count).map(move |_| generator.next_object().to_string()) +} + /// Creates JSON with random structure and fields. /// /// Each type is created in proportion controlled by the @@ -299,6 +370,82 @@ impl RandomJsonGenerator { &self.output_buffer } + fn next_object(&mut self) -> &str { + self.output_buffer.clear(); + self.append_random_json_for_object(); + &self.output_buffer + } + + fn append_random_json_for_object(&mut self) { + // use destructuring to ensure each field is used + let Self { + rng, + null_weight, + string_weight, + number_weight, + boolean_weight, + max_fields, + output_buffer, + .. + } = self; + + write!(output_buffer, "{{").unwrap(); + for i in 0..*max_fields { + let key_length = rng.random_range(1..=20); + let key: String = (0..key_length) + .map(|_| rng.sample(Alphanumeric) as char) + .collect(); + write!(output_buffer, "\"{key}\":").unwrap(); + + let total_weight = *null_weight + *string_weight + *number_weight + *boolean_weight; + + // Generate a random number to determine the type + let mut random_value: usize = rng.random_range(0..total_weight); + + if random_value <= *null_weight { + write!(output_buffer, "null").unwrap(); + } else { + random_value -= *null_weight; + + if random_value <= *string_weight { + // Generate a random string between 1 and 20 characters + let length = rng.random_range(1..=20); + let random_string: String = (0..length) + .map(|_| rng.sample(Alphanumeric) as char) + .collect(); + write!(output_buffer, "\"{random_string}\"",).unwrap(); + } else { + random_value -= *string_weight; + + if random_value <= *number_weight { + // 50% chance of generating an integer or a float + if rng.random_bool(0.5) { + // Generate a random integer + let random_integer: i64 = 
rng.random_range(-1000..1000); + write!(output_buffer, "{random_integer}",).unwrap(); + } else { + // Generate a random float + let random_float: f64 = rng.random_range(-1000.0..1000.0); + write!(output_buffer, "{random_float}",).unwrap(); + } + } else { + random_value -= *number_weight; + + if random_value <= *boolean_weight { + // Generate a random boolean + let random_boolean: bool = rng.random(); + write!(output_buffer, "{random_boolean}",).unwrap(); + } + } + } + } + if i < *max_fields - 1 { + write!(output_buffer, ",").unwrap(); + } + } + write!(&mut self.output_buffer, "}}").unwrap(); + } + /// Appends a random JSON value to the output buffer. fn append_random_json(&mut self, current_depth: usize) { // use destructuring to ensure each field is used diff --git a/parquet-variant-json/src/from_json.rs b/parquet-variant-json/src/from_json.rs index 33e1b2e6db9b..4c22785ef106 100644 --- a/parquet-variant-json/src/from_json.rs +++ b/parquet-variant-json/src/from_json.rs @@ -102,7 +102,7 @@ fn variant_from_number<'m, 'v>(n: &Number) -> Result, ArrowError } } -fn append_json(json: &Value, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { +pub fn append_json(json: &Value, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { match json { Value::Null => builder.append_value(Variant::Null), Value::Bool(b) => builder.append_value(*b), diff --git a/parquet-variant-json/src/lib.rs b/parquet-variant-json/src/lib.rs index f24c740818be..6b42b15bd480 100644 --- a/parquet-variant-json/src/lib.rs +++ b/parquet-variant-json/src/lib.rs @@ -34,5 +34,5 @@ mod from_json; mod to_json; -pub use from_json::JsonToVariant; +pub use from_json::{JsonToVariant, append_json}; pub use to_json::VariantToJson; diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 7094d935a5eb..e6122f062c38 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -14,7 +14,7 @@ // KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations // under the License. -use crate::decoder::{VariantBasicType, VariantPrimitiveType}; +use crate::decoder::{OffsetSizeBytes, VariantBasicType, VariantPrimitiveType}; use crate::{ ShortString, Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16, VariantList, VariantMetadata, VariantObject, @@ -43,21 +43,15 @@ fn short_string_header(len: usize) -> u8 { (len as u8) << 2 | VariantBasicType::ShortString as u8 } -pub(crate) fn int_size(v: usize) -> u8 { +pub(crate) fn int_size(v: usize) -> OffsetSizeBytes { match v { - 0..=0xFF => 1, - 0x100..=0xFFFF => 2, - 0x10000..=0xFFFFFF => 3, - _ => 4, + 0..=0xFF => OffsetSizeBytes::One, + 0x100..=0xFFFF => OffsetSizeBytes::Two, + 0x10000..=0xFFFFFF => OffsetSizeBytes::Three, + _ => OffsetSizeBytes::Four, } } -/// Write little-endian integer to buffer at a specific position -fn write_offset_at_pos(buf: &mut [u8], start_pos: usize, value: usize, nbytes: u8) { - let bytes = value.to_le_bytes(); - buf[start_pos..start_pos + nbytes as usize].copy_from_slice(&bytes[..nbytes as usize]); -} - /// Wrapper around a `Vec` that provides methods for appending /// primitive values, variant types, and metadata. 
/// @@ -358,63 +352,6 @@ impl ValueBuilder { ); state.finish(); } - - /// Writes out the header byte for a variant object or list, from the starting position - /// of the builder, will return the position after this write - pub(crate) fn append_header_start_from_buf_pos( - &mut self, - start_pos: usize, // the start position where the header will be inserted - header_byte: u8, - is_large: bool, - num_fields: usize, - ) -> usize { - let buffer = self.inner_mut(); - - // Write header at the original start position - let mut header_pos = start_pos; - - // Write header byte - buffer[header_pos] = header_byte; - header_pos += 1; - - // Write number of fields - if is_large { - buffer[header_pos..header_pos + 4].copy_from_slice(&(num_fields as u32).to_le_bytes()); - header_pos += 4; - } else { - buffer[header_pos] = num_fields as u8; - header_pos += 1; - } - - header_pos - } - - /// Writes out the offsets for an array of offsets, including the final offset (data size). - /// from the starting position of the buffer, will return the position after this write - pub(crate) fn append_offset_array_start_from_buf_pos( - &mut self, - start_pos: usize, - offsets: impl IntoIterator, - data_size: Option, - nbytes: u8, - ) -> usize { - let buf = self.inner_mut(); - - let mut current_pos = start_pos; - for relative_offset in offsets { - write_offset_at_pos(buf, current_pos, relative_offset, nbytes); - current_pos += nbytes as usize; - } - - // Write data_size - if let Some(data_size) = data_size { - // Write data_size at the end of the offsets - write_offset_at_pos(buf, current_pos, data_size, nbytes); - current_pos += nbytes as usize; - } - - current_pos - } } /// A trait for managing state specific to different builder types. 
diff --git a/parquet-variant/src/builder/list.rs b/parquet-variant/src/builder/list.rs index 4c2682c50ac4..5064904ca7de 100644 --- a/parquet-variant/src/builder/list.rs +++ b/parquet-variant/src/builder/list.rs @@ -174,7 +174,7 @@ impl<'a, S: BuilderSpecificState> ListBuilder<'a, S> { // Make sure to reserve enough capacity to handle the extra bytes we'll truncate. let mut bytes_to_splice = Vec::with_capacity(header_size + 3); // Write header - let header = array_header(is_large, offset_size); + let header = array_header(is_large, offset_size as _); bytes_to_splice.push(header); append_packed_u32(&mut bytes_to_splice, num_elements as u32, num_elements_size); diff --git a/parquet-variant/src/builder/metadata.rs b/parquet-variant/src/builder/metadata.rs index 10163ba3e0cf..efccc2e4c63e 100644 --- a/parquet-variant/src/builder/metadata.rs +++ b/parquet-variant/src/builder/metadata.rs @@ -206,7 +206,7 @@ impl WritableMetadataBuilder { // Determine appropriate offset size based on the larger of dict size or total string size let max_offset = std::cmp::max(total_dict_size, nkeys); - let offset_size = int_size(max_offset); + let offset_size = int_size(max_offset) as u8; let offset_start = 1 + offset_size as usize; let string_start = offset_start + (nkeys + 1) * offset_size as usize; diff --git a/parquet-variant/src/builder/object.rs b/parquet-variant/src/builder/object.rs index ab04360c16a7..876c2e2d4c7c 100644 --- a/parquet-variant/src/builder/object.rs +++ b/parquet-variant/src/builder/object.rs @@ -24,14 +24,50 @@ use crate::{ use arrow_schema::ArrowError; use indexmap::IndexMap; -fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 { - let large_bit = if large { 1 } else { 0 }; - (large_bit << (BASIC_TYPE_BITS + 4)) - | ((id_size - 1) << (BASIC_TYPE_BITS + 2)) - | ((offset_size - 1) << BASIC_TYPE_BITS) +fn object_header() -> u8 { + (LARGE_BIT << (BASIC_TYPE_BITS + 4)) + | ((ID_SIZE - 1) << (BASIC_TYPE_BITS + 2)) + | ((OFFSET_SIZE - 1) << BASIC_TYPE_BITS) 
| VariantBasicType::Object as u8 } +struct ObjectHeaderWriter(); + +impl ObjectHeaderWriter { + fn write( + dst: &mut Vec, + num_fields: usize, + field_ids: impl Iterator, + offsets: impl Iterator, + data_size: usize, + ) { + let is_large = num_fields > u8::MAX as usize; + // num_fields will consume 4 bytes when it is larger than u8::MAX + if is_large { + dst.push(object_header::<1, { ID_SIZE }, { OFFSET_SIZE }>()); + append_packed_u32::<4>(dst, num_fields); + } else { + dst.push(object_header::<0, { ID_SIZE }, { OFFSET_SIZE }>()); + append_packed_u32::<1>(dst, num_fields); + } + + for id in field_ids { + append_packed_u32::(dst, id as usize); + } + + for off in offsets { + append_packed_u32::(dst, off); + } + + append_packed_u32::(dst, data_size); + } +} + +#[inline(always)] +fn append_packed_u32(dest: &mut Vec, value: usize) { + dest.extend_from_slice(&value.to_le_bytes()[..SIZE as usize]); +} + /// A builder for creating [`Variant::Object`] values. /// /// See the examples on [`VariantBuilder`] for usage. @@ -245,41 +281,45 @@ impl<'a, S: BuilderSpecificState> ObjectBuilder<'a, S> { (num_fields * id_size as usize) + // field IDs ((num_fields + 1) * offset_size as usize); // field offsets + data_size + let mut bytes_to_splice = Vec::with_capacity(header_size); + + macro_rules! 
write_header { + ($offset_size:expr, $id_size:expr) => { + ObjectHeaderWriter::<{ $offset_size as u8 }, { $id_size as u8 }>::write( + &mut bytes_to_splice, + num_fields, + self.fields.keys().copied(), + self.fields.values().copied(), + data_size, + ) + }; + } + + use crate::decoder::OffsetSizeBytes::*; + match (offset_size, id_size) { + (One, One) => write_header!(One, One), + (One, Two) => write_header!(One, Two), + (One, Three) => write_header!(One, Three), + (One, Four) => write_header!(One, Four), + (Two, One) => write_header!(Two, One), + (Two, Two) => write_header!(Two, Two), + (Two, Three) => write_header!(Two, Three), + (Two, Four) => write_header!(Two, Four), + (Three, One) => write_header!(Three, One), + (Three, Two) => write_header!(Three, Two), + (Three, Three) => write_header!(Three, Three), + (Three, Four) => write_header!(Three, Four), + (Four, One) => write_header!(Four, One), + (Four, Two) => write_header!(Four, Two), + (Four, Three) => write_header!(Four, Three), + (Four, Four) => write_header!(Four, Four), + } + // Shift existing data to make room for the header - value_builder.inner_mut().splice( - starting_offset..starting_offset, - std::iter::repeat_n(0u8, header_size), - ); + value_builder + .inner_mut() + .splice(starting_offset..starting_offset, bytes_to_splice); - // Write header at the original start position - let mut header_pos = starting_offset; - - // Write header byte - let header = object_header(is_large, id_size, offset_size); - - header_pos = self - .parent_state - .value_builder() - .append_header_start_from_buf_pos(header_pos, header, is_large, num_fields); - - header_pos = self - .parent_state - .value_builder() - .append_offset_array_start_from_buf_pos( - header_pos, - self.fields.keys().copied().map(|id| id as usize), - None, - id_size, - ); - - self.parent_state - .value_builder() - .append_offset_array_start_from_buf_pos( - header_pos, - self.fields.values().copied(), - Some(data_size), - offset_size, - ); 
self.parent_state.finish(); } } From 34f62ef1ff45c012d8af0714dfde1b3efeedc16c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 9 Jan 2026 15:28:49 -0500 Subject: [PATCH 33/33] Update null_if kernel to use Arc<[Buffer]> --- arrow-select/src/zip.rs | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/arrow-select/src/zip.rs b/arrow-select/src/zip.rs index 6be034fca23d..8702b558d01f 100644 --- a/arrow-select/src/zip.rs +++ b/arrow-select/src/zip.rs @@ -35,7 +35,7 @@ use std::fmt::{Debug, Formatter}; use std::hash::Hash; use std::marker::PhantomData; use std::ops::Not; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; /// Zip two arrays by some boolean mask. /// @@ -667,12 +667,17 @@ fn maybe_prep_null_mask_filter(predicate: &BooleanArray) -> BooleanBuffer { struct ByteViewScalarImpl { truthy_view: Option, - truthy_buffers: Vec, + truthy_buffers: Arc<[Buffer]>, falsy_view: Option, - falsy_buffers: Vec, + falsy_buffers: Arc<[Buffer]>, phantom: PhantomData, } +static EMPTY_ARC: OnceLock> = OnceLock::new(); +fn empty_arc_buffers() -> Arc<[Buffer]> { + Arc::clone(EMPTY_ARC.get_or_init(|| Arc::new([]))) +} + impl ByteViewScalarImpl { fn new(truthy: &dyn Array, falsy: &dyn Array) -> Self { let (truthy_view, truthy_buffers) = Self::get_value_from_scalar(truthy); @@ -686,9 +691,9 @@ impl ByteViewScalarImpl { } } - fn get_value_from_scalar(scalar: &dyn Array) -> (Option, Vec) { + fn get_value_from_scalar(scalar: &dyn Array) -> (Option, Arc<[Buffer]>) { if scalar.is_null(0) { - (None, vec![]) + (None, empty_arc_buffers()) } else { let (views, buffers, _) = scalar.as_byte_view::().clone().into_parts(); (views.first().copied(), buffers) @@ -698,8 +703,8 @@ impl ByteViewScalarImpl { fn get_views_for_single_non_nullable( predicate: BooleanBuffer, value: u128, - buffers: Vec, - ) -> (ScalarBuffer, Vec, Option) { + buffers: Arc<[Buffer]>, + ) -> (ScalarBuffer, Arc<[Buffer]>, Option) { let number_of_true = 
predicate.count_set_bits(); let number_of_values = predicate.len(); @@ -708,7 +713,7 @@ impl ByteViewScalarImpl { // All values are null return ( vec![0; number_of_values].into(), - vec![], + empty_arc_buffers(), Some(NullBuffer::new_null(number_of_values)), ); } @@ -724,10 +729,10 @@ impl ByteViewScalarImpl { predicate: BooleanBuffer, result_len: usize, truthy_view: u128, - truthy_buffers: Vec, + truthy_buffers: Arc<[Buffer]>, falsy_view: u128, - falsy_buffers: Vec, - ) -> (ScalarBuffer, Vec, Option) { + falsy_buffers: Arc<[Buffer]>, + ) -> (ScalarBuffer, Arc<[Buffer]>, Option) { let true_count = predicate.count_set_bits(); match true_count { 0 => { @@ -751,7 +756,7 @@ impl ByteViewScalarImpl { let byte_view_falsy = ByteView::from(falsy_view); let new_index_falsy_buffers = buffers.len() as u32 + byte_view_falsy.buffer_index; - buffers.extend(falsy_buffers); + buffers.extend(falsy_buffers.iter().cloned()); let byte_view_falsy = byte_view_falsy.with_buffer_index(new_index_falsy_buffers); byte_view_falsy.as_u128() @@ -778,7 +783,7 @@ impl ByteViewScalarImpl { } let bytes = Buffer::from(mutable); - (bytes.into(), buffers, None) + (bytes.into(), buffers.into(), None) } } } @@ -804,28 +809,28 @@ impl ZipImpl for ByteViewScalarImpl { predicate, result_len, truthy, - self.truthy_buffers.clone(), + Arc::clone(&self.truthy_buffers), falsy, - self.falsy_buffers.clone(), + Arc::clone(&self.falsy_buffers), ), (Some(truthy), None) => Self::get_views_for_single_non_nullable( predicate, truthy, - self.truthy_buffers.clone(), + Arc::clone(&self.truthy_buffers), ), (None, Some(falsy)) => { let predicate = predicate.not(); Self::get_views_for_single_non_nullable( predicate, falsy, - self.falsy_buffers.clone(), + Arc::clone(&self.falsy_buffers), ) } (None, None) => { // All values are null ( vec![0; result_len].into(), - vec![], + empty_arc_buffers(), Some(NullBuffer::new_null(result_len)), ) }