@@ -19,6 +19,9 @@ use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
19
19
20
20
use crate :: { ArrowError , DataType , Field , Fields , IntervalUnit , TimeUnit } ;
21
21
22
+ /// Parses a DataType from a string representation
23
+ ///
24
+ /// For example, the string "Int32" would be parsed into [`DataType::Int32`]
22
25
pub ( crate ) fn parse_data_type ( val : & str ) -> ArrowResult < DataType > {
23
26
Parser :: new ( val) . parse ( )
24
27
}
@@ -36,8 +39,8 @@ fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowErro
36
39
make_error ( val, & format ! ( "Expected '{expected}', got '{actual}'" ) )
37
40
}
38
41
39
- #[ derive( Debug ) ]
40
42
/// Implementation of `parse_data_type`, modeled after <https://github.com/sqlparser-rs/sqlparser-rs>
43
+ #[ derive( Debug ) ]
41
44
struct Parser < ' a > {
42
45
val : & ' a str ,
43
46
tokenizer : Peekable < Tokenizer < ' a > > ,
@@ -199,9 +202,31 @@ impl<'a> Parser<'a> {
199
202
let timezone;
200
203
match self . next_token ( ) ? {
201
204
Token :: Comma => {
202
- timezone = Some ( self . parse_double_quoted_string ( "Timezone" ) ?) ;
205
+ match self . next_token ( ) ? {
206
+ // Support old style `Timestamp(Nanosecond, None)`
207
+ Token :: None => {
208
+ timezone = None ;
209
+ }
210
+ // Support old style `Timestamp(Nanosecond, Some("Timezone"))`
211
+ Token :: Some => {
212
+ self . expect_token ( Token :: LParen ) ?;
213
+ timezone = Some ( self . parse_double_quoted_string ( "Timezone" ) ?) ;
214
+ self . expect_token ( Token :: RParen ) ?;
215
+ }
216
+ Token :: DoubleQuotedString ( tz) => {
217
+ // Support new style `Timestamp(Nanosecond, "Timezone")`
218
+ timezone = Some ( tz) ;
219
+ }
220
+ tok => {
221
+ return Err ( make_error (
222
+ self . val ,
223
+ & format ! ( "Expected None, Some, or a timezone string, got {tok:?}" ) ,
224
+ ) ) ;
225
+ }
226
+ } ;
203
227
self . expect_token ( Token :: RParen ) ?;
204
228
}
229
+ // No timezone (e.g. `Timestamp(ns)`)
205
230
Token :: RParen => {
206
231
timezone = None ;
207
232
}
@@ -680,7 +705,7 @@ mod test {
680
705
}
681
706
}
682
707
683
- /// convert data_type to a string, and then parse it as a type
708
+ /// Round trip: convert data_type to a string, and then parse it as a type,
684
709
/// verifying it is the same
685
710
fn round_trip ( data_type : DataType ) {
686
711
let data_type_string = data_type. to_string ( ) ;
@@ -831,9 +856,190 @@ mod test {
831
856
] ;
832
857
833
858
for ( data_type_string, expected_data_type) in cases {
834
- println ! ( "Parsing '{data_type_string}', expecting '{expected_data_type}'" ) ;
835
859
let parsed_data_type = parse_data_type ( data_type_string) . unwrap ( ) ;
836
- assert_eq ! ( parsed_data_type, expected_data_type) ;
860
+ assert_eq ! (
861
+ parsed_data_type, expected_data_type,
862
+ "Parsing '{data_type_string}', expecting '{expected_data_type}'"
863
+ ) ;
864
+ }
865
+ }
866
+
867
/// Ensure that old style type strings (such as `Timestamp(Nanosecond, None)`
/// and `Timestamp(Nanosecond, Some("+00:00"))`) can still be parsed, alongside
/// the current style strings listed after them.
#[test]
fn test_parse_data_type_backwards_compatibility() {
    use DataType::*;
    use IntervalUnit::*;
    use TimeUnit::*;
    // List below created with:
    // for t in list_datatypes() {
    //     println!(r#"("{t}", {t:?}),"#)
    // }
    // (string to parse, expected DataType)
    let cases = [
        // Old style timestamps without a timezone: `Timestamp(<Unit>, None)`
        ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
        ("Timestamp(Microsecond, None)", Timestamp(Microsecond, None)),
        ("Timestamp(Millisecond, None)", Timestamp(Millisecond, None)),
        ("Timestamp(Second, None)", Timestamp(Second, None)),
        // Old style timestamps with a timezone: `Timestamp(<Unit>, Some("tz"))`
        (
            r#"Timestamp(Nanosecond, Some("+00:00"))"#,
            Timestamp(Nanosecond, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(Microsecond, Some("+00:00"))"#,
            Timestamp(Microsecond, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(Millisecond, Some("+00:00"))"#,
            Timestamp(Millisecond, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(Second, Some("+00:00"))"#,
            Timestamp(Second, Some("+00:00".into())),
        ),
        ("Null", Null),
        ("Boolean", Boolean),
        ("Int8", Int8),
        ("Int16", Int16),
        ("Int32", Int32),
        ("Int64", Int64),
        ("UInt8", UInt8),
        ("UInt16", UInt16),
        ("UInt32", UInt32),
        ("UInt64", UInt64),
        ("Float16", Float16),
        ("Float32", Float32),
        ("Float64", Float64),
        ("Timestamp(s)", Timestamp(Second, None)),
        ("Timestamp(ms)", Timestamp(Millisecond, None)),
        ("Timestamp(µs)", Timestamp(Microsecond, None)),
        ("Timestamp(ns)", Timestamp(Nanosecond, None)),
        (
            r#"Timestamp(ns, "+00:00")"#,
            Timestamp(Nanosecond, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(µs, "+00:00")"#,
            Timestamp(Microsecond, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(ms, "+00:00")"#,
            Timestamp(Millisecond, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(s, "+00:00")"#,
            Timestamp(Second, Some("+00:00".into())),
        ),
        (
            r#"Timestamp(ns, "+08:00")"#,
            Timestamp(Nanosecond, Some("+08:00".into())),
        ),
        (
            r#"Timestamp(µs, "+08:00")"#,
            Timestamp(Microsecond, Some("+08:00".into())),
        ),
        (
            r#"Timestamp(ms, "+08:00")"#,
            Timestamp(Millisecond, Some("+08:00".into())),
        ),
        (
            r#"Timestamp(s, "+08:00")"#,
            Timestamp(Second, Some("+08:00".into())),
        ),
        ("Date32", Date32),
        ("Date64", Date64),
        ("Time32(s)", Time32(Second)),
        ("Time32(ms)", Time32(Millisecond)),
        ("Time32(µs)", Time32(Microsecond)),
        ("Time32(ns)", Time32(Nanosecond)),
        ("Time64(s)", Time64(Second)),
        ("Time64(ms)", Time64(Millisecond)),
        ("Time64(µs)", Time64(Microsecond)),
        ("Time64(ns)", Time64(Nanosecond)),
        ("Duration(s)", Duration(Second)),
        ("Duration(ms)", Duration(Millisecond)),
        ("Duration(µs)", Duration(Microsecond)),
        ("Duration(ns)", Duration(Nanosecond)),
        ("Interval(YearMonth)", Interval(YearMonth)),
        ("Interval(DayTime)", Interval(DayTime)),
        ("Interval(MonthDayNano)", Interval(MonthDayNano)),
        ("Binary", Binary),
        ("BinaryView", BinaryView),
        ("FixedSizeBinary(0)", FixedSizeBinary(0)),
        ("FixedSizeBinary(1234)", FixedSizeBinary(1234)),
        ("FixedSizeBinary(-432)", FixedSizeBinary(-432)),
        ("LargeBinary", LargeBinary),
        ("Utf8", Utf8),
        ("Utf8View", Utf8View),
        ("LargeUtf8", LargeUtf8),
        ("Decimal32(7, 8)", Decimal32(7, 8)),
        ("Decimal64(6, 9)", Decimal64(6, 9)),
        ("Decimal128(7, 12)", Decimal128(7, 12)),
        ("Decimal256(6, 13)", Decimal256(6, 13)),
        (
            "Dictionary(Int32, Utf8)",
            Dictionary(Box::new(Int32), Box::new(Utf8)),
        ),
        (
            "Dictionary(Int8, Utf8)",
            Dictionary(Box::new(Int8), Box::new(Utf8)),
        ),
        (
            "Dictionary(Int8, Timestamp(ns))",
            Dictionary(Box::new(Int8), Box::new(Timestamp(Nanosecond, None))),
        ),
        (
            "Dictionary(Int8, FixedSizeBinary(23))",
            Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
        ),
        (
            "Dictionary(Int8, Dictionary(Int8, Utf8))",
            Dictionary(
                Box::new(Int8),
                Box::new(Dictionary(Box::new(Int8), Box::new(Utf8))),
            ),
        ),
        (
            r#"Struct("f1": nullable Int64, "f2": nullable Float64, "f3": nullable Timestamp(s, "+08:00"), "f4": nullable Dictionary(Int8, FixedSizeBinary(23)))"#,
            Struct(Fields::from(vec![
                Field::new("f1", Int64, true),
                Field::new("f2", Float64, true),
                Field::new("f3", Timestamp(Second, Some("+08:00".into())), true),
                Field::new(
                    "f4",
                    Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
                    true,
                ),
            ])),
        ),
        (
            r#"Struct("Int64": nullable Int64, "Float64": nullable Float64)"#,
            Struct(Fields::from(vec![
                Field::new("Int64", Int64, true),
                Field::new("Float64", Float64, true),
            ])),
        ),
        (
            r#"Struct("f1": nullable Int64, "nested_struct": nullable Struct("n1": nullable Int64))"#,
            Struct(Fields::from(vec![
                Field::new("f1", Int64, true),
                Field::new(
                    "nested_struct",
                    Struct(Fields::from(vec![Field::new("n1", Int64, true)])),
                    true,
                ),
            ])),
        ),
        (r#"Struct()"#, Struct(Fields::empty())),
    ];

    for (data_type_string, expected_data_type) in cases {
        // Name the offending input on a parse failure; a bare `unwrap` would
        // only surface the error value, not which case triggered it.
        let parsed_data_type = parse_data_type(data_type_string)
            .unwrap_or_else(|e| panic!("Failed to parse '{data_type_string}': {e:?}"));
        assert_eq!(
            parsed_data_type, expected_data_type,
            "Parsing '{data_type_string}', expecting '{expected_data_type}'"
        );
    }
}
839
1045
0 commit comments