diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index eda4952cf590b..ce5d1b20b1074 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -2989,13 +2989,8 @@ impl ScalarValue { }, ScalarValue::Utf8View(e) => match e { Some(value) => { - let mut builder = - StringViewBuilder::with_capacity(size).with_deduplicate_strings(); - // Replace with upstream arrow-rs code when available: - // https://github.com/apache/arrow-rs/issues/9034 - for _ in 0..size { - builder.append_value(value); - } + let mut builder = StringViewBuilder::with_capacity(size); + builder.try_append_value_n(value, size)?; let array = builder.finish(); Arc::new(array) } @@ -3013,11 +3008,8 @@ impl ScalarValue { }, ScalarValue::BinaryView(e) => match e { Some(value) => { - let mut builder = - BinaryViewBuilder::with_capacity(size).with_deduplicate_strings(); - for _ in 0..size { - builder.append_value(value); - } + let mut builder = BinaryViewBuilder::with_capacity(size); + builder.try_append_value_n(value, size)?; let array = builder.finish(); Arc::new(array) } diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index 7b72c264e5557..ce7f534506d61 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -19,8 +19,8 @@ use arrow::{ array::{ - Array, ArrayRef, AsArray, BinaryArrayType, FixedSizeBinaryArray, - GenericBinaryArray, GenericStringArray, OffsetSizeTrait, + Array, ArrayRef, AsArray, BinaryArrayType, GenericBinaryArray, + GenericStringArray, OffsetSizeTrait, }, datatypes::DataType, }; @@ -239,7 +239,7 @@ fn encode_array(array: &ArrayRef, encoding: Encoding) -> Result { encoding.encode_array::<_, i64>(&array.as_binary::()) } DataType::FixedSizeBinary(_) => { - encoding.encode_fsb_array(array.as_fixed_size_binary()) + encoding.encode_array::<_, i32>(&array.as_fixed_size_binary()) } dt => { internal_err!("Unexpected data type for encode: {dt}") @@ -307,7 +307,7 @@ fn decode_array(array: &ArrayRef, encoding: Encoding) -> Result { let array = array.as_fixed_size_binary(); // TODO: could we be more conservative by accounting for nulls? let estimate = array.len().saturating_mul(*size as usize); - encoding.decode_fsb_array(array, estimate) + encoding.decode_array::<_, i32>(&array, estimate) } dt => { internal_err!("Unexpected data type for decode: {dt}") @@ -404,24 +404,6 @@ impl Encoding { } } - // TODO: refactor this away once https://github.com/apache/arrow-rs/pull/8993 lands - fn encode_fsb_array(self, array: &FixedSizeBinaryArray) -> Result { - match self { - Self::Base64 => { - let array: GenericStringArray = array - .iter() - .map(|x| x.map(|x| BASE64_ENGINE.encode(x))) - .collect(); - Ok(Arc::new(array)) - } - Self::Hex => { - let array: GenericStringArray = - array.iter().map(|x| x.map(hex::encode)).collect(); - Ok(Arc::new(array)) - } - } - } - // OutputOffset important to ensure Large types output Large arrays fn decode_array<'a, InputBinaryArray, OutputOffset>( self, @@ -461,73 +443,6 @@ impl Encoding { } } } - - // TODO: refactor this away once https://github.com/apache/arrow-rs/pull/8993 lands - fn decode_fsb_array( - self, - value: &FixedSizeBinaryArray, - approx_data_size: usize, - ) -> Result { - fn hex_decode(input: &[u8], buf: &mut [u8]) -> Result { - // only write input / 2 bytes to buf - let out_len = input.len() / 2; - let buf = &mut buf[..out_len]; - hex::decode_to_slice(input, buf) - .map_err(|e| exec_datafusion_err!("Failed to decode from hex: {e}"))?; - Ok(out_len) - } - - fn base64_decode(input: &[u8], buf: &mut [u8]) -> Result { - BASE64_ENGINE - .decode_slice(input, buf) - .map_err(|e| exec_datafusion_err!("Failed to decode from base64: {e}")) - } - - fn delegated_decode( - decode: DecodeFunction, - input: &FixedSizeBinaryArray, - conservative_upper_bound_size: usize, - ) -> Result - where - DecodeFunction: Fn(&[u8], &mut [u8]) -> Result, - { - let mut values = vec![0; conservative_upper_bound_size]; - let mut offsets = OffsetBufferBuilder::new(input.len()); - let mut total_bytes_decoded = 0; - for v in input.iter() { - if let Some(v) = v { - let cursor = &mut values[total_bytes_decoded..]; - let decoded = decode(v, cursor)?; - total_bytes_decoded += decoded; - offsets.push_length(decoded); - } else { - offsets.push_length(0); - } - } - // We reserved an upper bound size for the values buffer, but we only use the actual size - values.truncate(total_bytes_decoded); - let binary_array = GenericBinaryArray::::try_new( - offsets.finish(), - Buffer::from_vec(values), - input.nulls().cloned(), - )?; - Ok(Arc::new(binary_array)) - } - - match self { - Self::Base64 => { - let upper_bound = base64::decoded_len_estimate(approx_data_size); - delegated_decode(base64_decode, value, upper_bound) - } - Self::Hex => { - // Calculate the upper bound for decoded byte size - // For hex encoding, each pair of hex characters (2 bytes) represents 1 byte when decoded - // So the upper bound is half the length of the input values. - let upper_bound = approx_data_size / 2; - delegated_decode(hex_decode, value, upper_bound) - } - } - } } fn delegated_decode<'a, DecodeFunction, InputBinaryArray, OutputOffset>(