wip refactor arrow schema

abhi-airspace-intelligence · abhi-airspace-intelligence · commit 13a74ab8f5db · 2025-09-29T15:24:35.000-04:00
Signed-off-by: Abhi Agarwal <abhi@airspace-intelligence.com>
diff --git a/Cargo.toml b/Cargo.toml
@@ -32,7 +32,7 @@ actix-web = { version = "4.11.0", default-features = false }
 actix-web-httpauth = { version = "0.8.2", default-features = false }
 actix-web-metrics = { version = "0.3.0", default-features = false }
 anyhow = { version = "1.0.98", default-features = false }
-arrow = { version = "55.0", default-features = false }
+arrow = { version = "56.2.0", default-features = false }
 async-trait = { version = "0.1.88" }
 aws-lc-rs = { version = "1.13.3", default-features = false }
 base64 = { version = "0.22.1", default-features = false }
@@ -55,7 +55,7 @@ k8s-openapi = { version = "0.25.0", default-features = false }
 kube = { version = "1.1.0", default-features = false }
 metrics = { version = "0.24.2", default-features = false }
 metrics-exporter-prometheus = { version = "0.17.2", default-features = false }
-parquet = { version = "55.0", default-features = false }
+parquet = { version = "56.2.0", default-features = false }
 pg_escape = { version = "0.1.1", default-features = false }
 pin-project-lite = { version = "0.2.16", default-features = false }
 postgres-replication = { git = "https://github.com/MaterializeInc/rust-postgres", default-features = false, rev = "c4b473b478b3adfbf8667d2fbe895d8423f1290b" }
diff --git a/etl-destinations/Cargo.toml b/etl-destinations/Cargo.toml
@@ -29,6 +29,7 @@ deltalake = [
     "dep:futures",
     "dep:tokio",
     "dep:tracing",
+    "arrow",
 ]
 
 [dependencies]
diff --git a/etl-destinations/src/arrow/encoding.rs b/etl-destinations/src/arrow/encoding.rs
@@ -2,17 +2,21 @@ use std::sync::Arc;
 
 use arrow::{
     array::{
-        ArrayRef, ArrowPrimitiveType, BooleanBuilder, FixedSizeBinaryBuilder, LargeBinaryBuilder,
-        ListBuilder, PrimitiveBuilder, RecordBatch, StringBuilder, TimestampMicrosecondBuilder,
+        ArrayRef, ArrowPrimitiveType, BooleanBuilder, Decimal128Array, FixedSizeBinaryBuilder,
+        LargeBinaryBuilder, ListBuilder, PrimitiveBuilder, RecordBatch, StringBuilder,
+        TimestampMicrosecondBuilder,
     },
     datatypes::{
-        DataType, Date32Type, FieldRef, Float32Type, Float64Type, Int32Type, Int64Type, Schema,
-        Time64MicrosecondType, TimeUnit, TimestampMicrosecondType,
+        DataType, Date32Type, Field, FieldRef, Float32Type, Float64Type, Int16Type, Int32Type,
+        Int64Type, Schema, Time64MicrosecondType, TimeUnit, TimestampMicrosecondType, UInt32Type,
     },
     error::ArrowError,
 };
 use chrono::{NaiveDate, NaiveTime};
-use etl::types::{ArrayCell, Cell, DATE_FORMAT, TIME_FORMAT, TIMESTAMP_FORMAT, TableRow};
+use etl::types::{
+    ArrayCell, Cell, DATE_FORMAT, TIME_FORMAT, TIMESTAMP_FORMAT, TableRow,
+    TableSchema as PgTableSchema, Type as PgType,
+};
 
 pub const UNIX_EPOCH: NaiveDate =
     NaiveDate::from_ymd_opt(1970, 1, 1).expect("unix epoch is a valid date");
@@ -21,6 +25,30 @@ const MIDNIGHT: NaiveTime = NaiveTime::from_hms_opt(0, 0, 0).expect("midnight is
 
 const UUID_BYTE_WIDTH: i32 = 16;
 
+/// Extracts the declared precision of a Postgres `numeric` column from its `atttypmod`.
+///
+/// Postgres packs `numeric(p, s)` into the type modifier as `((p << 16) | s) + 4`, so the
+/// precision sits in the upper 16 bits once the 4-byte offset has been removed.
+/// Based on: https://stackoverflow.com/questions/72725508/how-to-calculate-numeric-precision-and-other-vals-from-atttypmod
+///
+/// The result is always in `1..=38`:
+/// - a modifier below the offset (including `-1`, "no limit") yields the maximum, 38;
+/// - a declared precision above 38 is capped at 38;
+/// - a decoded precision of 0 is raised to 1.
+fn extract_numeric_precision(atttypmod: i32) -> u8 {
+    // Offset Postgres adds to every packed `numeric` type modifier.
+    const TYPMOD_OFFSET: i32 = 4;
+    // Arrow's maximum precision for a 128-bit decimal.
+    const MAX_PRECISION: u8 = 38;
+
+    // `-1` means unconstrained. Other values below the offset are not valid packed
+    // modifiers, and returning early also keeps the subtraction below from overflowing.
+    if atttypmod < TYPMOD_OFFSET {
+        return MAX_PRECISION;
+    }
+
+    let declared = ((atttypmod - TYPMOD_OFFSET) >> 16) & 0xffff;
+    // Clamp while the value is still wide: narrowing first would wrap (256 -> 0, 260 -> 4).
+    u8::try_from(declared).map_or(MAX_PRECISION, |p| p.clamp(1, MAX_PRECISION))
+}
+
+/// Extracts the declared scale of a Postgres `numeric` column from its `atttypmod`.
+///
+/// Postgres packs `numeric(p, s)` into the type modifier as `((p << 16) | (s & 0x7ff)) + 4`.
+/// The scale is therefore an 11-bit two's-complement field, which lets it be negative.
+/// Based on: https://stackoverflow.com/questions/72725508/how-to-calculate-numeric-precision-and-other-vals-from-atttypmod
+///
+/// Returns:
+/// - 18 when the modifier is below the offset (including `-1`, "no limit");
+/// - otherwise the decoded scale, capped at 38 and floored at `i8::MIN` so it always
+///   fits the return type without wrapping.
+fn extract_numeric_scale(atttypmod: i32) -> i8 {
+    // Offset Postgres adds to every packed `numeric` type modifier.
+    const TYPMOD_OFFSET: i32 = 4;
+    // Scale used when the column does not declare one.
+    const DEFAULT_SCALE: i8 = 18;
+    // Upper bound applied to declared scales.
+    const MAX_SCALE: i32 = 38;
+
+    // Returning early for unconstrained/invalid modifiers also keeps the subtraction
+    // below from overflowing.
+    if atttypmod < TYPMOD_OFFSET {
+        return DEFAULT_SCALE;
+    }
+
+    // Sign-extend the low 11 bits: flip the sign bit, then subtract its weight.
+    let declared = (((atttypmod - TYPMOD_OFFSET) & 0x7ff) ^ 1024) - 1024;
+    // Cap while the value is still wide; narrowing first would wrap (200 -> -56).
+    // After the cap, the conversion can only fail for values below `i8::MIN`.
+    i8::try_from(declared.min(MAX_SCALE)).unwrap_or(i8::MIN)
+}
+
 /// Converts a slice of [`TableRow`]s to an Arrow [`RecordBatch`].
 ///
 /// This function transforms tabular data from the ETL pipeline's internal format
@@ -56,22 +84,31 @@ pub fn rows_to_record_batch(rows: &[TableRow], schema: Schema) -> Result<RecordB
 fn build_array_for_field(rows: &[TableRow], field_idx: usize, data_type: &DataType) -> ArrayRef {
     match data_type {
         DataType::Boolean => build_boolean_array(rows, field_idx),
+        DataType::Int16 => build_primitive_array::<Int16Type, _>(rows, field_idx, cell_to_i16),
         DataType::Int32 => build_primitive_array::<Int32Type, _>(rows, field_idx, cell_to_i32),
         DataType::Int64 => build_primitive_array::<Int64Type, _>(rows, field_idx, cell_to_i64),
+        DataType::UInt32 => build_primitive_array::<UInt32Type, _>(rows, field_idx, cell_to_u32),
         DataType::Float32 => build_primitive_array::<Float32Type, _>(rows, field_idx, cell_to_f32),
         DataType::Float64 => build_primitive_array::<Float64Type, _>(rows, field_idx, cell_to_f64),
         DataType::Utf8 => build_string_array(rows, field_idx),
+        DataType::Binary => build_binary_array(rows, field_idx),
         DataType::LargeBinary => build_binary_array(rows, field_idx),
         DataType::Date32 => build_primitive_array::<Date32Type, _>(rows, field_idx, cell_to_date32),
         DataType::Time64(TimeUnit::Microsecond) => {
             build_primitive_array::<Time64MicrosecondType, _>(rows, field_idx, cell_to_time64)
         }
+        DataType::Time64(TimeUnit::Nanosecond) => {
+            build_primitive_array::<Time64MicrosecondType, _>(rows, field_idx, cell_to_time64)
+        }
         DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => {
             build_timestamptz_array(rows, field_idx, tz)
         }
         DataType::Timestamp(TimeUnit::Microsecond, None) => {
             build_primitive_array::<TimestampMicrosecondType, _>(rows, field_idx, cell_to_timestamp)
         }
+        DataType::Decimal128(precision, scale) => {
+            build_decimal128_array(rows, field_idx, *precision, *scale)
+        }
         DataType::FixedSizeBinary(UUID_BYTE_WIDTH) => build_uuid_array(rows, field_idx),
         DataType::List(field) => build_list_array(rows, field_idx, field.clone()),
         _ => build_string_array(rows, field_idx),
@@ -123,6 +160,22 @@ impl_array_builder!(build_boolean_array, BooleanBuilder, cell_to_bool);
 impl_array_builder!(build_string_array, StringBuilder, cell_to_string);
 impl_array_builder!(build_binary_array, LargeBinaryBuilder, cell_to_bytes);
 
+/// Assembles the Decimal128 column for one field across all [`TableRow`]s.
+///
+/// Each row's cell at `field_idx` is converted with [`cell_to_decimal128`]; a cell that
+/// does not convert becomes a null entry. The finished array is tagged with
+/// `Decimal128(precision, scale)`.
+fn build_decimal128_array(
+    rows: &[TableRow],
+    field_idx: usize,
+    precision: u8,
+    scale: i8,
+) -> ArrayRef {
+    let mut unscaled: Vec<Option<i128>> = Vec::with_capacity(rows.len());
+    for row in rows {
+        unscaled.push(cell_to_decimal128(&row.values[field_idx], precision, scale));
+    }
+
+    let array = Decimal128Array::from(unscaled);
+    Arc::new(array.with_data_type(DataType::Decimal128(precision, scale)))
+}
+
 /// Builds a timezone-aware timestamp array from [`TableRow`]s.
 ///
 /// This function creates an Arrow timestamp array with microsecond precision
@@ -213,6 +266,22 @@ fn cell_to_i64(cell: &Cell) -> Option<i64> {
     }
 }
 
+/// Reads a 16-bit signed integer out of a [`Cell`].
+///
+/// Yields the wrapped value for [`Cell::I16`] and [`None`] for every other variant.
+fn cell_to_i16(cell: &Cell) -> Option<i16> {
+    if let Cell::I16(value) = cell {
+        Some(*value)
+    } else {
+        None
+    }
+}
+
+/// Reads a 32-bit unsigned integer out of a [`Cell`].
+///
+/// Yields the wrapped value for [`Cell::U32`] and [`None`] for every other variant.
+fn cell_to_u32(cell: &Cell) -> Option<u32> {
+    let Cell::U32(value) = cell else {
+        return None;
+    };
+    Some(*value)
+}
+
 /// Converts a [`Cell`] to a 32-bit floating-point number.
 ///
 /// Extracts 32-bit float values from [`Cell::F32`] variants, returning
@@ -235,6 +304,23 @@ fn cell_to_f64(cell: &Cell) -> Option<f64> {
     }
 }
 
+/// Converts a [`Cell`] to the unscaled `i128` of a decimal128 value with the given `scale`.
+///
+/// Only [`Cell::Numeric`] converts; every other variant yields [`None`]. The numeric is
+/// rendered to text and rescaled by [`decimal_text_to_i128`] using integer arithmetic
+/// only, so no digits are lost to floating-point rounding.
+///
+/// Returns [`None`] when the rendered text is not a decimal literal or when the scaled
+/// value does not fit in an `i128`.
+///
+/// `_precision` is accepted but not checked here.
+fn cell_to_decimal128(cell: &Cell, _precision: u8, scale: i8) -> Option<i128> {
+    let Cell::Numeric(numeric) = cell else {
+        return None;
+    };
+    decimal_text_to_i128(&numeric.to_string(), scale)
+}
+
+/// Parses a decimal literal and returns `value * 10^scale` as an exact `i128`.
+///
+/// Accepted input: optional sign, digits with an optional `.` fraction, and an optional
+/// `e`/`E` exponent (for example `-12.5`, `.5`, `1E+3`). Anything else, such as `NaN` or
+/// `Infinity`, yields [`None`], as does a result that overflows `i128`.
+///
+/// Digits that fall below the requested scale are truncated toward zero.
+fn decimal_text_to_i128(text: &str, scale: i8) -> Option<i128> {
+    let text = text.trim();
+    let (negative, unsigned) = match text.strip_prefix('-') {
+        Some(rest) => (true, rest),
+        None => (false, text.strip_prefix('+').unwrap_or(text)),
+    };
+
+    let (mantissa, exponent) = match unsigned.split_once(['e', 'E']) {
+        Some((mantissa, exponent)) => (mantissa, exponent.parse::<i64>().ok()?),
+        None => (unsigned, 0),
+    };
+    let (int_part, frac_part) = mantissa.split_once('.').unwrap_or((mantissa, ""));
+
+    // All significant digits with the decimal point removed.
+    let digits = [int_part, frac_part].concat();
+    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
+        return None;
+    }
+
+    // `digits` represents value * 10^(frac_len - exponent); `shift` is the power of ten
+    // still needed to reach value * 10^scale.
+    let frac_len = i64::try_from(frac_part.len()).ok()?;
+    let shift = i64::from(scale).checked_add(exponent)?.checked_sub(frac_len)?;
+
+    // A negative shift drops trailing digits, which truncates toward zero.
+    let kept = if shift < 0 {
+        let dropped = usize::try_from(shift.unsigned_abs()).unwrap_or(usize::MAX);
+        &digits[..digits.len().saturating_sub(dropped)]
+    } else {
+        digits.as_str()
+    };
+
+    let mut magnitude = kept.bytes().try_fold(0_i128, |acc, b| {
+        acc.checked_mul(10)?.checked_add(i128::from(b - b'0'))
+    })?;
+    // Skip the multiplication for zero so a large shift cannot overflow needlessly.
+    if shift > 0 && magnitude != 0 {
+        let factor = 10_i128.checked_pow(u32::try_from(shift).ok()?)?;
+        magnitude = magnitude.checked_mul(factor)?;
+    }
+
+    Some(if negative { -magnitude } else { magnitude })
+}
+
 /// Converts a [`Cell`] to a byte vector.
 ///
 /// Extracts binary data from [`Cell::Bytes`] variants by cloning the
@@ -375,20 +461,27 @@ fn cell_to_array_cell(cell: &Cell) -> Option<&ArrayCell> {
 fn build_list_array(rows: &[TableRow], field_idx: usize, field: FieldRef) -> ArrayRef {
     match field.data_type() {
         DataType::Boolean => build_boolean_list_array(rows, field_idx, field),
+        DataType::Int16 => build_int16_list_array(rows, field_idx, field),
         DataType::Int32 => build_int32_list_array(rows, field_idx, field),
         DataType::Int64 => build_int64_list_array(rows, field_idx, field),
+        DataType::UInt32 => build_uint32_list_array(rows, field_idx, field),
         DataType::Float32 => build_float32_list_array(rows, field_idx, field),
         DataType::Float64 => build_float64_list_array(rows, field_idx, field),
         DataType::Utf8 => build_string_list_array(rows, field_idx, field),
+        DataType::Binary => build_binary_list_array(rows, field_idx, field),
         DataType::LargeBinary => build_binary_list_array(rows, field_idx, field),
         DataType::Date32 => build_date32_list_array(rows, field_idx, field),
         DataType::Time64(TimeUnit::Microsecond) => build_time64_list_array(rows, field_idx, field),
+        DataType::Time64(TimeUnit::Nanosecond) => build_time64_list_array(rows, field_idx, field),
         DataType::Timestamp(TimeUnit::Microsecond, None) => {
             build_timestamp_list_array(rows, field_idx, field)
         }
         DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => {
             build_timestamptz_list_array(rows, field_idx, field)
         }
+        DataType::Decimal128(precision, scale) => {
+            build_decimal128_list_array(rows, field_idx, field.clone(), *precision, *scale)
+        }
         DataType::FixedSizeBinary(UUID_BYTE_WIDTH) => build_uuid_list_array(rows, field_idx, field),
         // For unsupported element types, fall back to string representation
         _ => build_list_array_for_strings(rows, field_idx, field),
@@ -421,6 +514,32 @@ fn build_boolean_list_array(rows: &[TableRow], field_idx: usize, field: FieldRef
     Arc::new(list_builder.finish())
 }
 
+/// Builds a list array whose elements are 16-bit signed integers.
+///
+/// A row holding an [`ArrayCell::I16`] contributes one list entry with its (nullable)
+/// items. A row for which [`cell_to_array_cell`] returns `None` contributes a null list.
+/// The first row holding any other array variant abandons this builder and returns the
+/// result of [`build_list_array_for_strings`] for the whole column instead.
+fn build_int16_list_array(rows: &[TableRow], field_idx: usize, field: FieldRef) -> ArrayRef {
+    let values_builder = PrimitiveBuilder::<Int16Type>::new();
+    let mut builder = ListBuilder::new(values_builder).with_field(field.clone());
+
+    for row in rows {
+        match cell_to_array_cell(&row.values[field_idx]) {
+            Some(ArrayCell::I16(items)) => {
+                for &item in items {
+                    builder.values().append_option(item);
+                }
+                builder.append(true);
+            }
+            // Element variant does not match: re-encode the whole column as strings.
+            Some(_) => return build_list_array_for_strings(rows, field_idx, field),
+            None => builder.append_null(),
+        }
+    }
+
+    Arc::new(builder.finish())
+}
+
 /// Builds a list array for 32-bit integer elements.
 fn build_int32_list_array(rows: &[TableRow], field_idx: usize, field: FieldRef) -> ArrayRef {
     let mut list_builder =
@@ -485,6 +604,32 @@ fn build_int64_list_array(rows: &[TableRow], field_idx: usize, field: FieldRef)
     Arc::new(list_builder.finish())
 }
 
+/// Builds a list array whose elements are 32-bit unsigned integers.
+///
+/// A row holding an [`ArrayCell::U32`] contributes one list entry with its (nullable)
+/// items. A row for which [`cell_to_array_cell`] returns `None` contributes a null list.
+/// The first row holding any other array variant abandons this builder and returns the
+/// result of [`build_list_array_for_strings`] for the whole column instead.
+fn build_uint32_list_array(rows: &[TableRow], field_idx: usize, field: FieldRef) -> ArrayRef {
+    let values_builder = PrimitiveBuilder::<UInt32Type>::new();
+    let mut builder = ListBuilder::new(values_builder).with_field(field.clone());
+
+    for row in rows {
+        match cell_to_array_cell(&row.values[field_idx]) {
+            Some(ArrayCell::U32(items)) => {
+                for &item in items {
+                    builder.values().append_option(item);
+                }
+                builder.append(true);
+            }
+            // Element variant does not match: re-encode the whole column as strings.
+            Some(_) => return build_list_array_for_strings(rows, field_idx, field),
+            None => builder.append_null(),
+        }
+    }
+
+    Arc::new(builder.finish())
+}
+
 /// Builds a list array for 32-bit float elements.
 fn build_float32_list_array(rows: &[TableRow], field_idx: usize, field: FieldRef) -> ArrayRef {
     let mut list_builder =
@@ -763,6 +908,19 @@ fn build_uuid_list_array(rows: &[TableRow], field_idx: usize, field: FieldRef) -
     Arc::new(list_builder.finish())
 }
 
+/// Builds a list array for Decimal128 elements.
+///
+/// Currently a pure delegation: the rows, field index and field are passed unchanged to
+/// [`build_list_array_for_strings`], and `_precision` / `_scale` are ignored.
+///
+/// NOTE(review): `field` still describes a Decimal128 element type when it reaches the
+/// string builder — confirm that builder tolerates (or replaces) a non-string field.
+fn build_decimal128_list_array(
+    rows: &[TableRow],
+    field_idx: usize,
+    field: FieldRef,
+    _precision: u8,
+    _scale: i8,
+) -> ArrayRef {
+    // For now, fall back to string representation for decimal arrays
+    // This is a simplified implementation that avoids complex Arrow data type manipulation
+    build_list_array_for_strings(rows, field_idx, field)
+}
+
 /// Builds a list array for string elements.
 ///
 /// This function creates an Arrow list array with string elements by processing
diff --git a/etl-destinations/src/deltalake/expr.rs b/etl-destinations/src/deltalake/expr.rs
@@ -1,6 +1,7 @@
 // Utilities related to constructing DataFusion expressions
 
 use crate::deltalake::schema::TableRowEncoder;
+use crate::deltalake::schema::cell_to_scalar_value_for_arrow;
 use deltalake::datafusion::common::Column;
 use deltalake::datafusion::prelude::{Expr, lit};
 use etl::error::EtlResult;
@@ -16,7 +17,7 @@ pub fn cell_to_scalar_expr(
         &schema.column_schemas[col_idx].typ,
         schema.column_schemas[col_idx].modifier,
     );
-    let sv = TableRowEncoder::cell_to_scalar_value_for_arrow(cell, &arrow_type)?;
+    let sv = cell_to_scalar_value_for_arrow(cell, &arrow_type)?;
     Ok(lit(sv))
 }
 
diff --git a/etl-destinations/src/deltalake/schema.rs b/etl-destinations/src/deltalake/schema.rs

Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,7 @@ deltalake = [`
`29`	`29`	`"dep:futures",`
`30`	`30`	`"dep:tokio",`
`31`	`31`	`"dep:tracing",`
	`32`	`+ "arrow",`
`32`	`33`	`]`
`33`	`34`
`34`	`35`	`[dependencies]`