1
- use std:: sync:: Arc ;
2
-
3
- use deltalake:: arrow:: datatypes:: { DataType , Field , Schema , TimeUnit } ;
1
+ use deltalake:: kernel:: { ArrayType , DataType , StructField } ;
2
+ use deltalake:: { DeltaResult , Schema } ;
4
3
use etl:: types:: { TableSchema , Type } ;
5
4
use etl_postgres:: types:: is_array_type;
6
5
7
- /// Convert a Postgres scalar type to an equivalent Arrow DataType
8
- fn postgres_scalar_type_to_arrow ( typ : & Type ) -> DataType {
6
+ /// Convert a Postgres scalar type to an equivalent Delta DataType
7
+ fn postgres_scalar_type_to_delta ( typ : & Type ) -> DataType {
9
8
match typ {
10
- & Type :: BOOL => DataType :: Boolean ,
9
+ & Type :: BOOL => DataType :: BOOLEAN ,
11
10
& Type :: CHAR | & Type :: BPCHAR | & Type :: VARCHAR | & Type :: NAME | & Type :: TEXT => {
12
- DataType :: Utf8
11
+ DataType :: STRING
13
12
}
14
- & Type :: INT2 => DataType :: Int16 ,
15
- & Type :: INT4 => DataType :: Int32 ,
16
- & Type :: INT8 => DataType :: Int64 ,
17
- & Type :: FLOAT4 => DataType :: Float32 ,
18
- & Type :: FLOAT8 => DataType :: Float64 ,
19
- // Without precision/scale information, map NUMERIC to Utf8 for now
20
- & Type :: NUMERIC => DataType :: Utf8 ,
21
- & Type :: DATE => DataType :: Date32 ,
22
- & Type :: TIME => DataType :: Time64 ( TimeUnit :: Microsecond ) ,
23
- & Type :: TIMESTAMP => DataType :: Timestamp ( TimeUnit :: Microsecond , None ) ,
24
- & Type :: TIMESTAMPTZ => DataType :: Timestamp ( TimeUnit :: Microsecond , Some ( "UTC" . into ( ) ) ) ,
25
- // Arrow has no native UUID type; represent as string
26
- & Type :: UUID => DataType :: Utf8 ,
13
+ & Type :: INT2 => DataType :: SHORT ,
14
+ & Type :: INT4 => DataType :: INTEGER ,
15
+ & Type :: INT8 => DataType :: LONG ,
16
+ & Type :: FLOAT4 => DataType :: FLOAT ,
17
+ & Type :: FLOAT8 => DataType :: DOUBLE ,
18
+ // Without precision/scale information, map NUMERIC to STRING for now
19
+ & Type :: NUMERIC => DataType :: STRING ,
20
+ & Type :: DATE => DataType :: DATE ,
21
+ // Delta Lake doesn't have a separate TIME type, use TIMESTAMP_NTZ
22
+ & Type :: TIME => DataType :: TIMESTAMP_NTZ ,
23
+ & Type :: TIMESTAMP => DataType :: TIMESTAMP_NTZ ,
24
+ & Type :: TIMESTAMPTZ => DataType :: TIMESTAMP ,
25
+ // Delta Lake has no native UUID type; represent as string
26
+ & Type :: UUID => DataType :: STRING ,
27
27
// Represent JSON as string
28
- & Type :: JSON | & Type :: JSONB => DataType :: Utf8 ,
29
- // OID is 32-bit unsigned in Postgres
30
- & Type :: OID => DataType :: UInt32 ,
31
- & Type :: BYTEA => DataType :: Binary ,
32
- _ => DataType :: Utf8 ,
28
+ & Type :: JSON | & Type :: JSONB => DataType :: STRING ,
29
+ // OID is 32-bit unsigned in Postgres, map to INTEGER
30
+ & Type :: OID => DataType :: INTEGER ,
31
+ & Type :: BYTEA => DataType :: BINARY ,
32
+ // Default fallback for unsupported types
33
+ _ => DataType :: STRING ,
33
34
}
34
35
}
35
36
36
- /// Convert a Postgres array type to an Arrow List type
37
- fn postgres_array_type_to_arrow ( typ : & Type ) -> DataType {
37
+ /// Convert a Postgres array type to a Delta Array type
38
+ fn postgres_array_type_to_delta ( typ : & Type ) -> DataType {
38
39
let element_type = match typ {
39
- & Type :: BOOL_ARRAY => DataType :: Boolean ,
40
- & Type :: CHAR_ARRAY | & Type :: BPCHAR_ARRAY | & Type :: VARCHAR_ARRAY | & Type :: NAME_ARRAY
41
- | & Type :: TEXT_ARRAY => DataType :: Utf8 ,
42
- & Type :: INT2_ARRAY => DataType :: Int16 ,
43
- & Type :: INT4_ARRAY => DataType :: Int32 ,
44
- & Type :: INT8_ARRAY => DataType :: Int64 ,
45
- & Type :: FLOAT4_ARRAY => DataType :: Float32 ,
46
- & Type :: FLOAT8_ARRAY => DataType :: Float64 ,
40
+ & Type :: BOOL_ARRAY => DataType :: BOOLEAN ,
41
+ & Type :: CHAR_ARRAY
42
+ | & Type :: BPCHAR_ARRAY
43
+ | & Type :: VARCHAR_ARRAY
44
+ | & Type :: NAME_ARRAY
45
+ | & Type :: TEXT_ARRAY => DataType :: STRING ,
46
+ & Type :: INT2_ARRAY => DataType :: SHORT ,
47
+ & Type :: INT4_ARRAY => DataType :: INTEGER ,
48
+ & Type :: INT8_ARRAY => DataType :: LONG ,
49
+ & Type :: FLOAT4_ARRAY => DataType :: FLOAT ,
50
+ & Type :: FLOAT8_ARRAY => DataType :: DOUBLE ,
47
51
// Map NUMERIC arrays to string arrays until precision/scale available
48
- & Type :: NUMERIC_ARRAY => DataType :: Utf8 ,
49
- & Type :: DATE_ARRAY => DataType :: Date32 ,
50
- & Type :: TIME_ARRAY => DataType :: Time64 ( TimeUnit :: Microsecond ) ,
51
- & Type :: TIMESTAMP_ARRAY => DataType :: Timestamp ( TimeUnit :: Microsecond , None ) ,
52
- & Type :: TIMESTAMPTZ_ARRAY => {
53
- DataType :: Timestamp ( TimeUnit :: Microsecond , Some ( "UTC" . into ( ) ) )
54
- }
55
- & Type :: UUID_ARRAY => DataType :: Utf8 ,
56
- & Type :: JSON_ARRAY | & Type :: JSONB_ARRAY => DataType :: Utf8 ,
57
- & Type :: OID_ARRAY => DataType :: UInt32 ,
58
- & Type :: BYTEA_ARRAY => DataType :: Binary ,
59
- _ => DataType :: Utf8 ,
52
+ & Type :: NUMERIC_ARRAY => DataType :: STRING ,
53
+ & Type :: DATE_ARRAY => DataType :: DATE ,
54
+ & Type :: TIME_ARRAY => DataType :: TIMESTAMP_NTZ ,
55
+ & Type :: TIMESTAMP_ARRAY => DataType :: TIMESTAMP_NTZ ,
56
+ & Type :: TIMESTAMPTZ_ARRAY => DataType :: TIMESTAMP ,
57
+ & Type :: UUID_ARRAY => DataType :: STRING ,
58
+ & Type :: JSON_ARRAY | & Type :: JSONB_ARRAY => DataType :: STRING ,
59
+ & Type :: OID_ARRAY => DataType :: INTEGER ,
60
+ & Type :: BYTEA_ARRAY => DataType :: BINARY ,
61
+ _ => DataType :: STRING ,
60
62
} ;
61
63
62
- DataType :: List ( Arc :: new ( Field :: new ( "item" , element_type, true ) ) )
64
+ ArrayType :: new ( element_type, true ) . into ( )
63
65
}
64
66
65
- /// Convert a Postgres `TableSchema` to an Arrow `Schema`
66
- pub fn postgres_to_arrow_schema ( schema : & TableSchema ) -> Arc < Schema > {
67
- let fields: Vec < Field > = schema
67
+ /// Convert a Postgres `TableSchema` to a Delta `Schema`
68
+ pub fn postgres_to_delta_schema ( schema : & TableSchema ) -> DeltaResult < Schema > {
69
+ let fields: Vec < StructField > = schema
68
70
. column_schemas
69
71
. iter ( )
70
72
. map ( |col| {
71
73
let data_type = if is_array_type ( & col. typ ) {
72
- postgres_array_type_to_arrow ( & col. typ )
74
+ postgres_array_type_to_delta ( & col. typ )
73
75
} else {
74
- postgres_scalar_type_to_arrow ( & col. typ )
76
+ postgres_scalar_type_to_delta ( & col. typ )
75
77
} ;
76
- Field :: new ( & col. name , data_type, col. nullable )
78
+ StructField :: new ( & col. name , data_type, col. nullable )
77
79
} )
78
80
. collect ( ) ;
79
81
80
- Arc :: new ( Schema :: new ( fields) )
82
+ Ok ( Schema :: new ( fields) )
81
83
}
82
84
83
85
#[ cfg( test) ]
@@ -86,22 +88,80 @@ mod tests {
86
88
/// Checks the Delta type chosen for each basic Postgres scalar type.
#[test]
fn test_scalar_mappings() {
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::BOOL),
        DataType::BOOLEAN
    ));
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::TEXT),
        DataType::STRING
    ));
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::INT2),
        DataType::SHORT
    ));
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::INT4),
        DataType::INTEGER
    ));
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::INT8),
        DataType::LONG
    ));
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::FLOAT4),
        DataType::FLOAT
    ));
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::FLOAT8),
        DataType::DOUBLE
    ));
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::DATE),
        DataType::DATE
    ));
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::BYTEA),
        DataType::BINARY
    ));
}
99
128
/// Checks that an INT4 array maps to a Delta array of nullable INTEGER elements.
#[test]
fn test_array_mappings() {
    let dt = postgres_array_type_to_delta(&Type::INT4_ARRAY);
    if let DataType::Array(array_type) = dt {
        assert!(matches!(array_type.element_type(), &DataType::INTEGER));
        assert!(array_type.contains_null());
    } else {
        panic!("Expected Array type, got: {:?}", dt);
    }
}
105
- }
106
139
/// Checks that TIMESTAMP maps to TIMESTAMP_NTZ and TIMESTAMPTZ maps to TIMESTAMP.
#[test]
fn test_timestamp_mappings() {
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::TIMESTAMP),
        DataType::TIMESTAMP_NTZ
    ));
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::TIMESTAMPTZ),
        DataType::TIMESTAMP
    ));
}
107
151
/// Checks that UUID, JSON and JSONB are all represented as Delta STRING.
#[test]
fn test_string_mappings() {
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::UUID),
        DataType::STRING
    ));
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::JSON),
        DataType::STRING
    ));
    assert!(matches!(
        postgres_scalar_type_to_delta(&Type::JSONB),
        DataType::STRING
    ));
}
167
+ }
0 commit comments