Skip to content

Commit 9d03e41

Browse files
authored
Support old syntax for DataType parsing (#8541)
# Which issue does this PR close? - Closes #8539 # Rationale for this change Systems like DataFusion use the string representation of DataType in their public APIs, but the type names changed after #8425. We should retain backwards compatibility with the old type names too. # What changes are included in this PR? 1. Support old style `Timestamp(Nanosecond, None)` and `Timestamp(Nanosecond, Some("Timezone"))` style timestamp specifiers # Are these changes tested? Yes, with new tests # Are there any user-facing changes? See above
1 parent 6948929 commit 9d03e41

File tree

1 file changed

+211
-5
lines changed

1 file changed

+211
-5
lines changed

arrow-schema/src/datatype_parse.rs

Lines changed: 211 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
1919

2020
use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit};
2121

22+
/// Parses a DataType from a string representation
23+
///
24+
/// For example, the string "Int32" would be parsed into [`DataType::Int32`]
2225
pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
2326
Parser::new(val).parse()
2427
}
@@ -36,8 +39,8 @@ fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowErro
3639
make_error(val, &format!("Expected '{expected}', got '{actual}'"))
3740
}
3841

39-
#[derive(Debug)]
4042
/// Implementation of `parse_data_type`, modeled after <https://github.com/sqlparser-rs/sqlparser-rs>
43+
#[derive(Debug)]
4144
struct Parser<'a> {
4245
val: &'a str,
4346
tokenizer: Peekable<Tokenizer<'a>>,
@@ -199,9 +202,31 @@ impl<'a> Parser<'a> {
199202
let timezone;
200203
match self.next_token()? {
201204
Token::Comma => {
202-
timezone = Some(self.parse_double_quoted_string("Timezone")?);
205+
match self.next_token()? {
206+
// Support old style `Timestamp(Nanosecond, None)`
207+
Token::None => {
208+
timezone = None;
209+
}
210+
// Support old style `Timestamp(Nanosecond, Some("Timezone"))`
211+
Token::Some => {
212+
self.expect_token(Token::LParen)?;
213+
timezone = Some(self.parse_double_quoted_string("Timezone")?);
214+
self.expect_token(Token::RParen)?;
215+
}
216+
Token::DoubleQuotedString(tz) => {
217+
// Support new style `Timestamp(Nanosecond, "Timezone")`
218+
timezone = Some(tz);
219+
}
220+
tok => {
221+
return Err(make_error(
222+
self.val,
223+
&format!("Expected None, Some, or a timezone string, got {tok:?}"),
224+
));
225+
}
226+
};
203227
self.expect_token(Token::RParen)?;
204228
}
229+
// No timezone (e.g. `Timestamp(ns)`)
205230
Token::RParen => {
206231
timezone = None;
207232
}
@@ -680,7 +705,7 @@ mod test {
680705
}
681706
}
682707

683-
/// convert data_type to a string, and then parse it as a type
708+
/// Ensure we can convert data_type to a string, and then parse it as a type,
684709
/// verifying it is the same
685710
fn round_trip(data_type: DataType) {
686711
let data_type_string = data_type.to_string();
@@ -831,9 +856,190 @@ mod test {
831856
];
832857

833858
for (data_type_string, expected_data_type) in cases {
834-
println!("Parsing '{data_type_string}', expecting '{expected_data_type}'");
835859
let parsed_data_type = parse_data_type(data_type_string).unwrap();
836-
assert_eq!(parsed_data_type, expected_data_type);
860+
assert_eq!(
861+
parsed_data_type, expected_data_type,
862+
"Parsing '{data_type_string}', expecting '{expected_data_type}'"
863+
);
864+
}
865+
}
866+
867+
/// Ensure that old style types can still be parsed
868+
#[test]
869+
fn test_parse_data_type_backwards_compatibility() {
870+
use DataType::*;
871+
use IntervalUnit::*;
872+
use TimeUnit::*;
873+
// List below created with:
874+
// for t in list_datatypes() {
875+
// println!(r#"("{t}", {t:?}),"#)
876+
// }
877+
// (string to parse, expected DataType)
878+
let cases = [
879+
("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
880+
("Timestamp(Microsecond, None)", Timestamp(Microsecond, None)),
881+
("Timestamp(Millisecond, None)", Timestamp(Millisecond, None)),
882+
("Timestamp(Second, None)", Timestamp(Second, None)),
883+
("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
884+
// Timezones
885+
(
886+
r#"Timestamp(Nanosecond, Some("+00:00"))"#,
887+
Timestamp(Nanosecond, Some("+00:00".into())),
888+
),
889+
(
890+
r#"Timestamp(Microsecond, Some("+00:00"))"#,
891+
Timestamp(Microsecond, Some("+00:00".into())),
892+
),
893+
(
894+
r#"Timestamp(Millisecond, Some("+00:00"))"#,
895+
Timestamp(Millisecond, Some("+00:00".into())),
896+
),
897+
(
898+
r#"Timestamp(Second, Some("+00:00"))"#,
899+
Timestamp(Second, Some("+00:00".into())),
900+
),
901+
("Null", Null),
902+
("Boolean", Boolean),
903+
("Int8", Int8),
904+
("Int16", Int16),
905+
("Int32", Int32),
906+
("Int64", Int64),
907+
("UInt8", UInt8),
908+
("UInt16", UInt16),
909+
("UInt32", UInt32),
910+
("UInt64", UInt64),
911+
("Float16", Float16),
912+
("Float32", Float32),
913+
("Float64", Float64),
914+
("Timestamp(s)", Timestamp(Second, None)),
915+
("Timestamp(ms)", Timestamp(Millisecond, None)),
916+
("Timestamp(µs)", Timestamp(Microsecond, None)),
917+
("Timestamp(ns)", Timestamp(Nanosecond, None)),
918+
(
919+
r#"Timestamp(ns, "+00:00")"#,
920+
Timestamp(Nanosecond, Some("+00:00".into())),
921+
),
922+
(
923+
r#"Timestamp(µs, "+00:00")"#,
924+
Timestamp(Microsecond, Some("+00:00".into())),
925+
),
926+
(
927+
r#"Timestamp(ms, "+00:00")"#,
928+
Timestamp(Millisecond, Some("+00:00".into())),
929+
),
930+
(
931+
r#"Timestamp(s, "+00:00")"#,
932+
Timestamp(Second, Some("+00:00".into())),
933+
),
934+
(
935+
r#"Timestamp(ns, "+08:00")"#,
936+
Timestamp(Nanosecond, Some("+08:00".into())),
937+
),
938+
(
939+
r#"Timestamp(µs, "+08:00")"#,
940+
Timestamp(Microsecond, Some("+08:00".into())),
941+
),
942+
(
943+
r#"Timestamp(ms, "+08:00")"#,
944+
Timestamp(Millisecond, Some("+08:00".into())),
945+
),
946+
(
947+
r#"Timestamp(s, "+08:00")"#,
948+
Timestamp(Second, Some("+08:00".into())),
949+
),
950+
("Date32", Date32),
951+
("Date64", Date64),
952+
("Time32(s)", Time32(Second)),
953+
("Time32(ms)", Time32(Millisecond)),
954+
("Time32(µs)", Time32(Microsecond)),
955+
("Time32(ns)", Time32(Nanosecond)),
956+
("Time64(s)", Time64(Second)),
957+
("Time64(ms)", Time64(Millisecond)),
958+
("Time64(µs)", Time64(Microsecond)),
959+
("Time64(ns)", Time64(Nanosecond)),
960+
("Duration(s)", Duration(Second)),
961+
("Duration(ms)", Duration(Millisecond)),
962+
("Duration(µs)", Duration(Microsecond)),
963+
("Duration(ns)", Duration(Nanosecond)),
964+
("Interval(YearMonth)", Interval(YearMonth)),
965+
("Interval(DayTime)", Interval(DayTime)),
966+
("Interval(MonthDayNano)", Interval(MonthDayNano)),
967+
("Binary", Binary),
968+
("BinaryView", BinaryView),
969+
("FixedSizeBinary(0)", FixedSizeBinary(0)),
970+
("FixedSizeBinary(1234)", FixedSizeBinary(1234)),
971+
("FixedSizeBinary(-432)", FixedSizeBinary(-432)),
972+
("LargeBinary", LargeBinary),
973+
("Utf8", Utf8),
974+
("Utf8View", Utf8View),
975+
("LargeUtf8", LargeUtf8),
976+
("Decimal32(7, 8)", Decimal32(7, 8)),
977+
("Decimal64(6, 9)", Decimal64(6, 9)),
978+
("Decimal128(7, 12)", Decimal128(7, 12)),
979+
("Decimal256(6, 13)", Decimal256(6, 13)),
980+
(
981+
"Dictionary(Int32, Utf8)",
982+
Dictionary(Box::new(Int32), Box::new(Utf8)),
983+
),
984+
(
985+
"Dictionary(Int8, Utf8)",
986+
Dictionary(Box::new(Int8), Box::new(Utf8)),
987+
),
988+
(
989+
"Dictionary(Int8, Timestamp(ns))",
990+
Dictionary(Box::new(Int8), Box::new(Timestamp(Nanosecond, None))),
991+
),
992+
(
993+
"Dictionary(Int8, FixedSizeBinary(23))",
994+
Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
995+
),
996+
(
997+
"Dictionary(Int8, Dictionary(Int8, Utf8))",
998+
Dictionary(
999+
Box::new(Int8),
1000+
Box::new(Dictionary(Box::new(Int8), Box::new(Utf8))),
1001+
),
1002+
),
1003+
(
1004+
r#"Struct("f1": nullable Int64, "f2": nullable Float64, "f3": nullable Timestamp(s, "+08:00"), "f4": nullable Dictionary(Int8, FixedSizeBinary(23)))"#,
1005+
Struct(Fields::from(vec![
1006+
Field::new("f1", Int64, true),
1007+
Field::new("f2", Float64, true),
1008+
Field::new("f3", Timestamp(Second, Some("+08:00".into())), true),
1009+
Field::new(
1010+
"f4",
1011+
Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
1012+
true,
1013+
),
1014+
])),
1015+
),
1016+
(
1017+
r#"Struct("Int64": nullable Int64, "Float64": nullable Float64)"#,
1018+
Struct(Fields::from(vec![
1019+
Field::new("Int64", Int64, true),
1020+
Field::new("Float64", Float64, true),
1021+
])),
1022+
),
1023+
(
1024+
r#"Struct("f1": nullable Int64, "nested_struct": nullable Struct("n1": nullable Int64))"#,
1025+
Struct(Fields::from(vec![
1026+
Field::new("f1", Int64, true),
1027+
Field::new(
1028+
"nested_struct",
1029+
Struct(Fields::from(vec![Field::new("n1", Int64, true)])),
1030+
true,
1031+
),
1032+
])),
1033+
),
1034+
(r#"Struct()"#, Struct(Fields::empty())),
1035+
];
1036+
1037+
for (data_type_string, expected_data_type) in cases {
1038+
let parsed_data_type = parse_data_type(data_type_string).unwrap();
1039+
assert_eq!(
1040+
parsed_data_type, expected_data_type,
1041+
"Parsing '{data_type_string}', expecting '{expected_data_type}'"
1042+
);
8371043
}
8381044
}
8391045

0 commit comments

Comments
 (0)