|
| 1 | +use std::sync::Arc; |
| 2 | + |
| 3 | +use deltalake::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; |
| 4 | +use etl::types::{TableSchema, Type}; |
| 5 | +use etl_postgres::types::is_array_type; |
| 6 | + |
| 7 | +/// Convert a Postgres scalar type to an equivalent Arrow DataType |
| 8 | +fn postgres_scalar_type_to_arrow(typ: &Type) -> DataType { |
| 9 | + match typ { |
| 10 | + &Type::BOOL => DataType::Boolean, |
| 11 | + &Type::CHAR | &Type::BPCHAR | &Type::VARCHAR | &Type::NAME | &Type::TEXT => { |
| 12 | + DataType::Utf8 |
| 13 | + } |
| 14 | + &Type::INT2 => DataType::Int16, |
| 15 | + &Type::INT4 => DataType::Int32, |
| 16 | + &Type::INT8 => DataType::Int64, |
| 17 | + &Type::FLOAT4 => DataType::Float32, |
| 18 | + &Type::FLOAT8 => DataType::Float64, |
| 19 | + // Without precision/scale information, map NUMERIC to Utf8 for now |
| 20 | + &Type::NUMERIC => DataType::Utf8, |
| 21 | + &Type::DATE => DataType::Date32, |
| 22 | + &Type::TIME => DataType::Time64(TimeUnit::Microsecond), |
| 23 | + &Type::TIMESTAMP => DataType::Timestamp(TimeUnit::Microsecond, None), |
| 24 | + &Type::TIMESTAMPTZ => DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())), |
| 25 | + // Arrow has no native UUID type; represent as string |
| 26 | + &Type::UUID => DataType::Utf8, |
| 27 | + // Represent JSON as string |
| 28 | + &Type::JSON | &Type::JSONB => DataType::Utf8, |
| 29 | + // OID is 32-bit unsigned in Postgres |
| 30 | + &Type::OID => DataType::UInt32, |
| 31 | + &Type::BYTEA => DataType::Binary, |
| 32 | + _ => DataType::Utf8, |
| 33 | + } |
| 34 | +} |
| 35 | + |
| 36 | +/// Convert a Postgres array type to an Arrow List type |
| 37 | +fn postgres_array_type_to_arrow(typ: &Type) -> DataType { |
| 38 | + let element_type = match typ { |
| 39 | + &Type::BOOL_ARRAY => DataType::Boolean, |
| 40 | + &Type::CHAR_ARRAY | &Type::BPCHAR_ARRAY | &Type::VARCHAR_ARRAY | &Type::NAME_ARRAY |
| 41 | + | &Type::TEXT_ARRAY => DataType::Utf8, |
| 42 | + &Type::INT2_ARRAY => DataType::Int16, |
| 43 | + &Type::INT4_ARRAY => DataType::Int32, |
| 44 | + &Type::INT8_ARRAY => DataType::Int64, |
| 45 | + &Type::FLOAT4_ARRAY => DataType::Float32, |
| 46 | + &Type::FLOAT8_ARRAY => DataType::Float64, |
| 47 | + // Map NUMERIC arrays to string arrays until precision/scale available |
| 48 | + &Type::NUMERIC_ARRAY => DataType::Utf8, |
| 49 | + &Type::DATE_ARRAY => DataType::Date32, |
| 50 | + &Type::TIME_ARRAY => DataType::Time64(TimeUnit::Microsecond), |
| 51 | + &Type::TIMESTAMP_ARRAY => DataType::Timestamp(TimeUnit::Microsecond, None), |
| 52 | + &Type::TIMESTAMPTZ_ARRAY => { |
| 53 | + DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())) |
| 54 | + } |
| 55 | + &Type::UUID_ARRAY => DataType::Utf8, |
| 56 | + &Type::JSON_ARRAY | &Type::JSONB_ARRAY => DataType::Utf8, |
| 57 | + &Type::OID_ARRAY => DataType::UInt32, |
| 58 | + &Type::BYTEA_ARRAY => DataType::Binary, |
| 59 | + _ => DataType::Utf8, |
| 60 | + }; |
| 61 | + |
| 62 | + DataType::List(Arc::new(Field::new("item", element_type, true))) |
| 63 | +} |
| 64 | + |
| 65 | +/// Convert a Postgres `TableSchema` to an Arrow `Schema` |
| 66 | +pub fn postgres_to_arrow_schema(schema: &TableSchema) -> Arc<Schema> { |
| 67 | + let fields: Vec<Field> = schema |
| 68 | + .column_schemas |
| 69 | + .iter() |
| 70 | + .map(|col| { |
| 71 | + let data_type = if is_array_type(&col.typ) { |
| 72 | + postgres_array_type_to_arrow(&col.typ) |
| 73 | + } else { |
| 74 | + postgres_scalar_type_to_arrow(&col.typ) |
| 75 | + }; |
| 76 | + Field::new(&col.name, data_type, col.nullable) |
| 77 | + }) |
| 78 | + .collect(); |
| 79 | + |
| 80 | + Arc::new(Schema::new(fields)) |
| 81 | +} |
| 82 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Scalar Postgres types map to the expected Arrow types, including the
    /// string fallbacks and the UTC-tagged timestamp.
    #[test]
    fn test_scalar_mappings() {
        let cases = [
            (Type::BOOL, DataType::Boolean),
            (Type::TEXT, DataType::Utf8),
            (Type::INT2, DataType::Int16),
            (Type::INT4, DataType::Int32),
            (Type::INT8, DataType::Int64),
            (Type::FLOAT4, DataType::Float32),
            (Type::FLOAT8, DataType::Float64),
            (Type::DATE, DataType::Date32),
            (Type::BYTEA, DataType::Binary),
            (Type::NUMERIC, DataType::Utf8),
            (Type::UUID, DataType::Utf8),
            (Type::TIMESTAMP, DataType::Timestamp(TimeUnit::Microsecond, None)),
            (
                Type::TIMESTAMPTZ,
                DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())),
            ),
        ];

        for (pg_type, expected) in cases {
            // `assert_eq!` prints the actual Arrow type on failure, unlike `assert!(matches!(..))`.
            assert_eq!(
                postgres_scalar_type_to_arrow(&pg_type),
                expected,
                "unexpected Arrow type for {:?}",
                pg_type
            );
        }
    }

    /// Array types map to a nullable `"item"` list field with the right element type.
    #[test]
    fn test_array_mappings() {
        match postgres_array_type_to_arrow(&Type::INT4_ARRAY) {
            DataType::List(item) => {
                assert_eq!(item.name(), "item");
                assert_eq!(item.data_type(), &DataType::Int32);
                assert!(item.is_nullable());
            }
            other => panic!("expected DataType::List, got {:?}", other),
        }
    }
}
| 106 | + |
| 107 | + |
0 commit comments