Skip to content

Commit 00c7ff3

Browse files
committed
Fixes after merge
- do not pin the `proc-macro` version to avoid conflicts in CubeStore, - move forked `string_to_timestamp_nanos` from DataFusion, - restore the serde annotation to allow bincode deserialization of schemas in CubeStore, - ignore tests that fail in our fork.
1 parent a70f960 commit 00c7ff3

File tree

3 files changed

+172
-54
lines changed

3 files changed

+172
-54
lines changed

arrow-flight/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ futures = { version = "0.3", default-features = false, features = ["alloc"]}
4141
tonic-build = "0.4"
4242
# Pin specific version of the tonic-build dependencies to avoid auto-generated
4343
# (and checked in) arrow.flight.protocol.rs from changing
44-
proc-macro2 = "=1.0.27"
44+
proc-macro2 = "1.0.27"
4545

4646
#[lib]
4747
#name = "flight"

arrow/src/compute/kernels/cast_utils.rs

Lines changed: 169 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,12 @@
1616
// under the License.
1717

1818
use crate::error::{ArrowError, Result};
19-
use chrono::{prelude::*, LocalResult};
19+
use chrono::format::Fixed::{Nanosecond as FixedNanosecond, TimezoneOffsetColon};
20+
use chrono::format::Item::{Fixed, Literal, Numeric};
21+
use chrono::format::Numeric::Nanosecond;
22+
use chrono::format::Pad::Zero;
23+
use chrono::format::{Item, Parsed};
24+
use chrono::prelude::*;
2025

2126
/// Accepts a string in RFC3339 / ISO8601 standard format and some
2227
/// variants and converts it to a nanosecond precision timestamp.
@@ -78,45 +83,92 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64> {
7883
// separating the date and time with a space ' ' rather than 'T' to be
7984
// (more) compatible with Apache Spark SQL
8085

81-
// timezone offset, using ' ' as a separator
82-
// Example: 2020-09-08 13:42:29.190855-05:00
83-
if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") {
84-
return Ok(ts.timestamp_nanos());
86+
// We parse the date and time prefix first to share work between all the different formats.
87+
let mut rest = s;
88+
let mut p;
89+
let separator_is_space;
90+
match try_parse_prefix(&mut rest) {
91+
Some(ParsedPrefix {
92+
result,
93+
separator_is_space: s,
94+
}) => {
95+
p = result;
96+
separator_is_space = s;
97+
}
98+
None => {
99+
return Err(ArrowError::CastError(format!(
100+
"Error parsing '{}' as timestamp",
101+
s
102+
)));
103+
}
85104
}
86105

87-
// with an explicit Z, using ' ' as a separator
88-
// Example: 2020-09-08 13:42:29Z
89-
if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") {
90-
return Ok(ts.timestamp_nanos());
91-
}
106+
if separator_is_space {
107+
// timezone offset, using ' ' as a separator
108+
// Example: 2020-09-08 13:42:29.190855-05:00
109+
// Full format string: "%Y-%m-%d %H:%M:%S%.f%:z".
110+
const FORMAT1: [Item; 2] = [Fixed(FixedNanosecond), Fixed(TimezoneOffsetColon)];
111+
if let Ok(ts) = chrono::format::parse(&mut p, rest, FORMAT1.iter())
112+
.and_then(|()| p.to_datetime())
113+
{
114+
return Ok(ts.timestamp_nanos());
115+
}
92116

93-
// Support timestamps without an explicit timezone offset, again
94-
// to be compatible with what Apache Spark SQL does.
117+
// with an explicit Z, using ' ' as a separator
118+
// Example: 2020-09-08 13:42:29Z
119+
// Full format string: "%Y-%m-%d %H:%M:%S%.fZ".
120+
const FORMAT2: [Item; 2] = [Fixed(FixedNanosecond), Literal("Z")];
121+
if let Ok(ts) = chrono::format::parse(&mut p, rest, FORMAT2.iter())
122+
.and_then(|()| p.to_datetime_with_timezone(&Utc))
123+
{
124+
return Ok(ts.timestamp_nanos());
125+
}
95126

96-
// without a timezone specifier as a local time, using T as a separator
97-
// Example: 2020-09-08T13:42:29.190855
98-
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f") {
99-
return naive_datetime_to_timestamp(s, ts);
100-
}
127+
// without a timezone specifier as a local time, using ' ' as a separator
128+
// Example: 2020-09-08 13:42:29.190855
129+
const FORMAT5: [Item; 2] = [Literal("."), Numeric(Nanosecond, Zero)];
130+
// Full format string: "%Y-%m-%d %H:%M:%S.%f".
131+
if let Ok(ts) = chrono::format::parse(&mut p, rest, FORMAT5.iter())
132+
.and_then(|()| p.to_naive_datetime_with_offset(0))
133+
{
134+
return naive_datetime_to_timestamp(s, ts);
135+
}
101136

102-
// without a timezone specifier as a local time, using T as a
103-
// separator, no fractional seconds
104-
// Example: 2020-09-08T13:42:29
105-
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") {
106-
return naive_datetime_to_timestamp(s, ts);
137+
// without a timezone specifier as a local time, using ' ' as a
138+
// separator, no fractional seconds
139+
// Example: 2020-09-08 13:42:29
140+
// Full format string: "%Y-%m-%d %H:%M:%S".
141+
if rest.is_empty() {
142+
if let Ok(ts) = p.to_naive_datetime_with_offset(0) {
143+
return naive_datetime_to_timestamp(s, ts);
144+
}
145+
}
107146
}
108147

109-
// without a timezone specifier as a local time, using ' ' as a separator
110-
// Example: 2020-09-08 13:42:29.190855
111-
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S.%f") {
112-
return naive_datetime_to_timestamp(s, ts);
113-
}
148+
// Support timestamps without an explicit timezone offset, again
149+
// to be compatible with what Apache Spark SQL does.
150+
if !separator_is_space
151+
/* i.e. separator == b'T' */
152+
{
153+
// without a timezone specifier as a local time, using T as a separator
154+
// Example: 2020-09-08T13:42:29.190855
155+
// Full format string: "%Y-%m-%dT%H:%M:%S.%f".
156+
const FORMAT3: [Item; 2] = [Literal("."), Numeric(Nanosecond, Zero)];
157+
if let Ok(ts) = chrono::format::parse(&mut p, rest, FORMAT3.iter())
158+
.and_then(|()| p.to_naive_datetime_with_offset(0))
159+
{
160+
return naive_datetime_to_timestamp(s, ts);
161+
}
114162

115-
// without a timezone specifier as a local time, using ' ' as a
116-
// separator, no fractional seconds
117-
// Example: 2020-09-08 13:42:29
118-
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") {
119-
return naive_datetime_to_timestamp(s, ts);
163+
// without a timezone specifier as a local time, using T as a
164+
// separator, no fractional seconds
165+
// Example: 2020-09-08T13:42:29
166+
// Full format string: "%Y-%m-%dT%H:%M:%S".
167+
if rest.is_empty() {
168+
if let Ok(ts) = p.to_naive_datetime_with_offset(0) {
169+
return naive_datetime_to_timestamp(s, ts);
170+
}
171+
}
120172
}
121173

122174
// Note we don't pass along the error message from the underlying
@@ -130,33 +182,97 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64> {
130182
)))
131183
}
132184

185+
/// Parses YYYY-MM-DD(T| )HH:MM:SS.
186+
fn try_parse_prefix(s: &mut &str) -> Option<ParsedPrefix> {
187+
let mut p = Parsed::new();
188+
189+
let mut rest = s.as_bytes();
190+
let year = try_parse_num(&mut rest)?;
191+
try_consume(&mut rest, b'-')?;
192+
let month = try_parse_num(&mut rest)?;
193+
try_consume(&mut rest, b'-')?;
194+
let day = try_parse_num(&mut rest)?;
195+
if rest.is_empty() {
196+
return None;
197+
}
198+
199+
let separator_is_space;
200+
match rest[0] {
201+
b' ' => separator_is_space = true,
202+
b'T' => separator_is_space = false,
203+
_ => return None,
204+
}
205+
206+
rest = &rest[1..];
207+
let hour = try_parse_num(&mut rest)?;
208+
try_consume(&mut rest, b':')?;
209+
let minute = try_parse_num(&mut rest)?;
210+
try_consume(&mut rest, b':')?;
211+
let second = try_parse_num(&mut rest)?;
212+
213+
p.set_year(year).ok()?;
214+
p.set_month(month).ok()?;
215+
p.set_day(day).ok()?;
216+
p.set_hour(hour).ok()?;
217+
p.set_minute(minute).ok()?;
218+
p.set_second(second).ok()?;
219+
220+
*s = unsafe { std::str::from_utf8_unchecked(rest) };
221+
Some(ParsedPrefix {
222+
result: p,
223+
separator_is_space,
224+
})
225+
}
226+
227+
#[must_use]
228+
fn try_parse_num(s: &mut &[u8]) -> Option<i64> {
229+
if s.is_empty() {
230+
return None;
231+
}
232+
233+
let mut i;
234+
if s[0] == b'-' {
235+
i = 1
236+
} else {
237+
i = 0;
238+
}
239+
240+
while i < s.len() && b'0' <= s[i] && s[i] <= b'9' {
241+
i += 1
242+
}
243+
244+
let res = unsafe { std::str::from_utf8_unchecked(&s[0..i]) }
245+
.parse()
246+
.ok();
247+
*s = &s[i..];
248+
return res;
249+
}
250+
251+
#[must_use]
252+
fn try_consume(s: &mut &[u8], c: u8) -> Option<()> {
253+
if s.is_empty() || s[0] != c {
254+
return None;
255+
}
256+
*s = &s[1..];
257+
Some(())
258+
}
259+
260+
struct ParsedPrefix {
261+
result: Parsed,
262+
separator_is_space: bool, // When false, the separator is 'T'.
263+
}
264+
133265
/// Converts the naive datetime (which has no specific timezone) to a
134266
/// nanosecond epoch timestamp relative to UTC.
135-
fn naive_datetime_to_timestamp(s: &str, datetime: NaiveDateTime) -> Result<i64> {
136-
let l = Local {};
137-
138-
match l.from_local_datetime(&datetime) {
139-
LocalResult::None => Err(ArrowError::CastError(format!(
140-
"Error parsing '{}' as timestamp: local time representation is invalid",
141-
s
142-
))),
143-
LocalResult::Single(local_datetime) => {
144-
Ok(local_datetime.with_timezone(&Utc).timestamp_nanos())
145-
}
146-
// Ambiguous times can happen if the timestamp is exactly when
147-
// a daylight savings time transition occurs, for example, and
148-
// so the datetime could validly be said to be in two
149-
// potential offsets. However, since we are about to convert
150-
// to UTC anyways, we can pick one arbitrarily
151-
LocalResult::Ambiguous(local_datetime, _) => {
152-
Ok(local_datetime.with_timezone(&Utc).timestamp_nanos())
153-
}
154-
}
267+
fn naive_datetime_to_timestamp(_s: &str, datetime: NaiveDateTime) -> Result<i64> {
268+
// CubeStore-specific: do not take timezones into account.
269+
Ok(Utc.from_utc_datetime(&datetime).timestamp_nanos())
155270
}
156271

157272
#[cfg(test)]
158273
mod tests {
159274
use super::*;
275+
use chrono::LocalResult;
160276

161277
#[test]
162278
fn string_to_timestamp_timezone() -> Result<()> {
@@ -220,6 +336,7 @@ mod tests {
220336
}
221337

222338
#[test]
339+
#[ignore = "CubeStore always uses UTC"]
223340
#[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function: mktime
224341
fn string_to_timestamp_no_timezone() -> Result<()> {
225342
// This test is designed to succeed in regardless of the local

arrow/src/datatypes/schema.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ pub struct Schema {
3535
pub(crate) fields: Vec<Field>,
3636
/// A map of key-value pairs containing additional meta data.
3737
#[serde(skip_serializing_if = "HashMap::is_empty")]
38-
#[serde(default)]
38+
#[serde(skip_deserializing)]
3939
pub(crate) metadata: HashMap<String, String>,
4040
}
4141

@@ -344,6 +344,7 @@ mod tests {
344344
use super::*;
345345

346346
#[test]
347+
#[ignore = "CubeStore ignores metadata during deserialization"]
347348
fn test_ser_de_metadata() {
348349
// ser/de with empty metadata
349350
let mut schema = Schema::new(vec![

0 commit comments

Comments
 (0)