Skip to content

Commit 40409e4

Browse files
authored
Split arrow_cast::cast::string into its own submodule (#5563)
* Split cast::string into a submodule of cast * Remove duplicate function * Apply changes * Format change --------- Co-authored-by: Clide Stefani <[email protected]>
1 parent 8884083 commit 40409e4

File tree

2 files changed

+272
-252
lines changed

2 files changed

+272
-252
lines changed

arrow-cast/src/cast/mod.rs

Lines changed: 2 additions & 252 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,11 @@
4040
mod decimal;
4141
mod dictionary;
4242
mod list;
43+
mod string;
4344
use crate::cast::decimal::*;
4445
use crate::cast::dictionary::*;
4546
use crate::cast::list::*;
47+
use crate::cast::string::*;
4648

4749
use chrono::{NaiveTime, Offset, TimeZone, Utc};
4850
use std::cmp::Ordering;
@@ -2001,26 +2003,6 @@ where
20012003
from.unary_opt::<_, R>(num::cast::cast::<T::Native, R::Native>)
20022004
}
20032005

2004-
fn value_to_string<O: OffsetSizeTrait>(
2005-
array: &dyn Array,
2006-
options: &CastOptions,
2007-
) -> Result<ArrayRef, ArrowError> {
2008-
let mut builder = GenericStringBuilder::<O>::new();
2009-
let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
2010-
let nulls = array.nulls();
2011-
for i in 0..array.len() {
2012-
match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
2013-
true => builder.append_null(),
2014-
false => {
2015-
formatter.value(i).write(&mut builder)?;
2016-
// tell the builder the row is finished
2017-
builder.append_value("");
2018-
}
2019-
}
2020-
}
2021-
Ok(Arc::new(builder.finish()))
2022-
}
2023-
20242006
fn cast_numeric_to_binary<FROM: ArrowPrimitiveType, O: OffsetSizeTrait>(
20252007
array: &dyn Array,
20262008
) -> Result<ArrayRef, ArrowError> {
@@ -2034,172 +2016,6 @@ fn cast_numeric_to_binary<FROM: ArrowPrimitiveType, O: OffsetSizeTrait>(
20342016
)))
20352017
}
20362018

2037-
/// Parse UTF-8
2038-
fn parse_string<P: Parser, O: OffsetSizeTrait>(
2039-
array: &dyn Array,
2040-
cast_options: &CastOptions,
2041-
) -> Result<ArrayRef, ArrowError> {
2042-
let string_array = array.as_string::<O>();
2043-
let array = if cast_options.safe {
2044-
let iter = string_array.iter().map(|x| x.and_then(P::parse));
2045-
2046-
// Benefit:
2047-
// 20% performance improvement
2048-
// Soundness:
2049-
// The iterator is trustedLen because it comes from an `StringArray`.
2050-
unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) }
2051-
} else {
2052-
let v = string_array
2053-
.iter()
2054-
.map(|x| match x {
2055-
Some(v) => P::parse(v).ok_or_else(|| {
2056-
ArrowError::CastError(format!(
2057-
"Cannot cast string '{}' to value of {:?} type",
2058-
v,
2059-
P::DATA_TYPE
2060-
))
2061-
}),
2062-
None => Ok(P::Native::default()),
2063-
})
2064-
.collect::<Result<Vec<_>, ArrowError>>()?;
2065-
PrimitiveArray::new(v.into(), string_array.nulls().cloned())
2066-
};
2067-
2068-
Ok(Arc::new(array) as ArrayRef)
2069-
}
2070-
2071-
/// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
2072-
fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType>(
2073-
array: &dyn Array,
2074-
to_tz: &Option<Arc<str>>,
2075-
cast_options: &CastOptions,
2076-
) -> Result<ArrayRef, ArrowError> {
2077-
let array = array.as_string::<O>();
2078-
let out: PrimitiveArray<T> = match to_tz {
2079-
Some(tz) => {
2080-
let tz: Tz = tz.as_ref().parse()?;
2081-
cast_string_to_timestamp_impl(array, &tz, cast_options)?
2082-
}
2083-
None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?,
2084-
};
2085-
Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
2086-
}
2087-
2088-
fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType, Tz: TimeZone>(
2089-
array: &GenericStringArray<O>,
2090-
tz: &Tz,
2091-
cast_options: &CastOptions,
2092-
) -> Result<PrimitiveArray<T>, ArrowError> {
2093-
if cast_options.safe {
2094-
let iter = array.iter().map(|v| {
2095-
v.and_then(|v| {
2096-
let naive = string_to_datetime(tz, v).ok()?.naive_utc();
2097-
T::make_value(naive)
2098-
})
2099-
});
2100-
// Benefit:
2101-
// 20% performance improvement
2102-
// Soundness:
2103-
// The iterator is trustedLen because it comes from an `StringArray`.
2104-
2105-
Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
2106-
} else {
2107-
let vec = array
2108-
.iter()
2109-
.map(|v| {
2110-
v.map(|v| {
2111-
let naive = string_to_datetime(tz, v)?.naive_utc();
2112-
T::make_value(naive).ok_or_else(|| {
2113-
ArrowError::CastError(format!(
2114-
"Overflow converting {naive} to {:?}",
2115-
T::UNIT
2116-
))
2117-
})
2118-
})
2119-
.transpose()
2120-
})
2121-
.collect::<Result<Vec<Option<i64>>, _>>()?;
2122-
2123-
// Benefit:
2124-
// 20% performance improvement
2125-
// Soundness:
2126-
// The iterator is trustedLen because it comes from an `StringArray`.
2127-
Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) })
2128-
}
2129-
}
2130-
2131-
fn cast_string_to_interval<Offset, F, ArrowType>(
2132-
array: &dyn Array,
2133-
cast_options: &CastOptions,
2134-
parse_function: F,
2135-
) -> Result<ArrayRef, ArrowError>
2136-
where
2137-
Offset: OffsetSizeTrait,
2138-
ArrowType: ArrowPrimitiveType,
2139-
F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
2140-
{
2141-
let string_array = array
2142-
.as_any()
2143-
.downcast_ref::<GenericStringArray<Offset>>()
2144-
.unwrap();
2145-
let interval_array = if cast_options.safe {
2146-
let iter = string_array
2147-
.iter()
2148-
.map(|v| v.and_then(|v| parse_function(v).ok()));
2149-
2150-
// Benefit:
2151-
// 20% performance improvement
2152-
// Soundness:
2153-
// The iterator is trustedLen because it comes from an `StringArray`.
2154-
unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
2155-
} else {
2156-
let vec = string_array
2157-
.iter()
2158-
.map(|v| v.map(parse_function).transpose())
2159-
.collect::<Result<Vec<_>, ArrowError>>()?;
2160-
2161-
// Benefit:
2162-
// 20% performance improvement
2163-
// Soundness:
2164-
// The iterator is trustedLen because it comes from an `StringArray`.
2165-
unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
2166-
};
2167-
Ok(Arc::new(interval_array) as ArrayRef)
2168-
}
2169-
2170-
fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
2171-
array: &dyn Array,
2172-
cast_options: &CastOptions,
2173-
) -> Result<ArrayRef, ArrowError> {
2174-
cast_string_to_interval::<Offset, _, IntervalYearMonthType>(
2175-
array,
2176-
cast_options,
2177-
parse_interval_year_month,
2178-
)
2179-
}
2180-
2181-
fn cast_string_to_day_time_interval<Offset: OffsetSizeTrait>(
2182-
array: &dyn Array,
2183-
cast_options: &CastOptions,
2184-
) -> Result<ArrayRef, ArrowError> {
2185-
cast_string_to_interval::<Offset, _, IntervalDayTimeType>(
2186-
array,
2187-
cast_options,
2188-
parse_interval_day_time,
2189-
)
2190-
}
2191-
2192-
fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
2193-
array: &dyn Array,
2194-
cast_options: &CastOptions,
2195-
) -> Result<ArrayRef, ArrowError> {
2196-
cast_string_to_interval::<Offset, _, IntervalMonthDayNanoType>(
2197-
array,
2198-
cast_options,
2199-
parse_interval_month_day_nano,
2200-
)
2201-
}
2202-
22032019
fn adjust_timestamp_to_timezone<T: ArrowTimestampType>(
22042020
array: PrimitiveArray<Int64Type>,
22052021
to_tz: &Tz,
@@ -2222,41 +2038,6 @@ fn adjust_timestamp_to_timezone<T: ArrowTimestampType>(
22222038
Ok(adjusted)
22232039
}
22242040

2225-
/// Casts Utf8 to Boolean
2226-
fn cast_utf8_to_boolean<OffsetSize>(
2227-
from: &dyn Array,
2228-
cast_options: &CastOptions,
2229-
) -> Result<ArrayRef, ArrowError>
2230-
where
2231-
OffsetSize: OffsetSizeTrait,
2232-
{
2233-
let array = from
2234-
.as_any()
2235-
.downcast_ref::<GenericStringArray<OffsetSize>>()
2236-
.unwrap();
2237-
2238-
let output_array = array
2239-
.iter()
2240-
.map(|value| match value {
2241-
Some(value) => match value.to_ascii_lowercase().trim() {
2242-
"t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => Ok(Some(true)),
2243-
"f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" | "0" => {
2244-
Ok(Some(false))
2245-
}
2246-
invalid_value => match cast_options.safe {
2247-
true => Ok(None),
2248-
false => Err(ArrowError::CastError(format!(
2249-
"Cannot cast value '{invalid_value}' to value of Boolean type",
2250-
))),
2251-
},
2252-
},
2253-
None => Ok(None),
2254-
})
2255-
.collect::<Result<BooleanArray, _>>()?;
2256-
2257-
Ok(Arc::new(output_array))
2258-
}
2259-
22602041
/// Cast numeric types to Boolean
22612042
///
22622043
/// Any zero value returns `false` while non-zero returns `true`
@@ -2325,37 +2106,6 @@ where
23252106
unsafe { PrimitiveArray::<T>::from_trusted_len_iter(iter) }
23262107
}
23272108

2328-
/// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same
2329-
/// offset size so re-encoding offset is unnecessary.
2330-
fn cast_binary_to_string<O: OffsetSizeTrait>(
2331-
array: &dyn Array,
2332-
cast_options: &CastOptions,
2333-
) -> Result<ArrayRef, ArrowError> {
2334-
let array = array
2335-
.as_any()
2336-
.downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
2337-
.unwrap();
2338-
2339-
match GenericStringArray::<O>::try_from_binary(array.clone()) {
2340-
Ok(a) => Ok(Arc::new(a)),
2341-
Err(e) => match cast_options.safe {
2342-
true => {
2343-
// Fallback to slow method to convert invalid sequences to nulls
2344-
let mut builder =
2345-
GenericStringBuilder::<O>::with_capacity(array.len(), array.value_data().len());
2346-
2347-
let iter = array
2348-
.iter()
2349-
.map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));
2350-
2351-
builder.extend(iter);
2352-
Ok(Arc::new(builder.finish()))
2353-
}
2354-
false => Err(e),
2355-
},
2356-
}
2357-
}
2358-
23592109
/// Helper function to cast from one `BinaryArray` or 'LargeBinaryArray' to 'FixedSizeBinaryArray'.
23602110
fn cast_binary_to_fixed_size_binary<O: OffsetSizeTrait>(
23612111
array: &dyn Array,

0 commit comments

Comments
 (0)