Skip to content

Commit f141491

Browse files
authored
Merge pull request #15 from monoscope-tech/variant-support2
Add Variant type support and fix SLT tests
2 parents b61b6e6 + 2b72658 commit f141491

16 files changed

+1034
-88
lines changed

Cargo.lock

Lines changed: 170 additions & 32 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ log = "0.4.27"
2020
color-eyre = "0.6.5"
2121
arrow-schema = "57.1.0"
2222
regex = "1.11.1"
23-
# Updated to delta-rs with datafusion 52 Utf8View fixes (includes commits 987e535f, ffb794ba)
24-
deltalake = { git = "https://github.com/delta-io/delta-rs.git", rev = "ffb794ba0745394fc4b747a4ef2e11c2d4ec086a", features = [
23+
# Using fork with VariantType support until upstream merges the feature
24+
deltalake = { git = "https://github.com/tonyalaribe/delta-rs.git", rev = "ba769136c5dd9b84a7335ea67e42b67884bfcce3", features = [
2525
"datafusion",
2626
"s3",
2727
] }
@@ -75,6 +75,12 @@ bincode = { version = "2.0", features = ["serde"] }
7575
walrus-rust = "0.2.0"
7676
thiserror = "2.0"
7777
strum = { version = "0.27", features = ["derive"] }
78+
datafusion-variant = { git = "https://github.com/tonyalaribe/datafusion-variant.git", rev = "8b6b270" }
79+
parquet-variant-compute = "57.2.0"
80+
parquet-variant-json = "57.2.0"
81+
parquet-variant = "57.2.0"
82+
serde_json_path = "0.7"
83+
base64 = "0.22"
7884

7985
[dev-dependencies]
8086
sqllogictest = { git = "https://github.com/risinglightdb/sqllogictest-rs.git" }

schemas/otel_logs_and_spans.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ fields:
6161
data_type: 'Timestamp(Microsecond, Some("UTC"))'
6262
nullable: true
6363
- name: context
64-
data_type: Utf8
64+
data_type: Variant
6565
nullable: true
6666
- name: context___trace_id
6767
data_type: Utf8
@@ -79,13 +79,13 @@ fields:
7979
data_type: Utf8
8080
nullable: true
8181
- name: events
82-
data_type: Utf8
82+
data_type: Variant
8383
nullable: true
8484
- name: links
85-
data_type: Utf8
85+
data_type: Variant
8686
nullable: true
8787
- name: attributes
88-
data_type: Utf8
88+
data_type: Variant
8989
nullable: true
9090
- name: attributes___client___address
9191
data_type: Utf8
@@ -235,7 +235,7 @@ fields:
235235
data_type: Utf8
236236
nullable: true
237237
- name: resource
238-
data_type: Utf8
238+
data_type: Variant
239239
nullable: true
240240
- name: resource___service___name
241241
data_type: Utf8

src/database.rs

Lines changed: 97 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
use crate::config::{self, AppConfig};
22
use crate::object_store_cache::{FoyerCacheConfig, FoyerObjectStoreCache, SharedFoyerCache};
3-
use crate::schema_loader::{get_default_schema, get_schema};
3+
use crate::schema_loader::{get_default_schema, get_schema, is_variant_type};
44
use crate::statistics::DeltaStatisticsExtractor;
55
use anyhow::Result;
6-
use arrow_schema::SchemaRef;
6+
use arrow_schema::{Schema, SchemaRef};
77
use async_trait::async_trait;
88
use chrono::Utc;
99
use datafusion::arrow::array::Array;
@@ -36,10 +36,10 @@ use deltalake::operations::create::CreateBuilder;
3636
use deltalake::{DeltaTable, DeltaTableBuilder};
3737
use futures::StreamExt;
3838
use instrumented_object_store::instrument_object_store;
39-
use std::sync::Mutex;
4039
use serde::{Deserialize, Serialize};
4140
use sqlx::{PgPool, postgres::PgPoolOptions};
4241
use std::fmt;
42+
use std::sync::Mutex;
4343
use std::sync::OnceLock;
4444
use std::{any::Any, collections::HashMap, sync::Arc};
4545
use tokio::sync::RwLock;
@@ -82,6 +82,85 @@ pub fn extract_project_id(batch: &RecordBatch) -> Option<String> {
8282
})
8383
}
8484

85+
/// Convert string columns to Variant binary format where the target schema expects Variant type.
86+
/// This enables automatic JSON string → Variant conversion during INSERT.
87+
pub fn convert_variant_columns(batch: RecordBatch, target_schema: &SchemaRef) -> DFResult<RecordBatch> {
88+
use datafusion::arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray};
89+
use datafusion::arrow::datatypes::{DataType, Field};
90+
91+
let batch_schema = batch.schema();
92+
let mut columns: Vec<ArrayRef> = batch.columns().to_vec();
93+
let mut new_fields: Vec<Arc<Field>> = batch_schema.fields().iter().cloned().collect();
94+
95+
for (idx, target_field) in target_schema.fields().iter().enumerate() {
96+
if !is_variant_type(target_field.data_type()) {
97+
continue;
98+
}
99+
// Skip columns beyond batch length - this is normal for INSERT with fewer columns than table schema
100+
// (e.g., columns with defaults or nullable columns omitted from INSERT)
101+
if idx >= columns.len() {
102+
continue;
103+
}
104+
105+
let col = &columns[idx];
106+
let col_type = col.data_type();
107+
108+
// Only convert if source is a string type and target is Variant
109+
let converted: Option<ArrayRef> =
110+
match col_type {
111+
DataType::Utf8View => {
112+
let arr = col.as_any().downcast_ref::<StringViewArray>().ok_or_else(|| {
113+
DataFusionError::Execution(format!("Expected StringViewArray for field '{}' but downcast failed", target_field.name()))
114+
})?;
115+
Some(Arc::new(json_strings_to_variant(arr.iter())?))
116+
}
117+
DataType::Utf8 => {
118+
let arr = col
119+
.as_any()
120+
.downcast_ref::<StringArray>()
121+
.ok_or_else(|| DataFusionError::Execution(format!("Expected StringArray for field '{}' but downcast failed", target_field.name())))?;
122+
Some(Arc::new(json_strings_to_variant(arr.iter())?))
123+
}
124+
DataType::LargeUtf8 => {
125+
let arr = col.as_any().downcast_ref::<LargeStringArray>().ok_or_else(|| {
126+
DataFusionError::Execution(format!("Expected LargeStringArray for field '{}' but downcast failed", target_field.name()))
127+
})?;
128+
Some(Arc::new(json_strings_to_variant(arr.iter())?))
129+
}
130+
_ => None, // Already Variant or other type, skip
131+
};
132+
133+
if let Some(variant_array) = converted {
134+
columns[idx] = variant_array;
135+
new_fields[idx] = target_field.clone();
136+
}
137+
}
138+
139+
let new_schema = Arc::new(Schema::new(new_fields));
140+
RecordBatch::try_new(new_schema, columns).map_err(|e| DataFusionError::ArrowError(Box::new(e), None))
141+
}
142+
143+
/// Convert an iterator of optional JSON strings to a Variant StructArray.
144+
/// Fails fast on invalid JSON to ensure data integrity.
145+
fn json_strings_to_variant<'a>(iter: impl Iterator<Item = Option<&'a str>>) -> DFResult<datafusion::arrow::array::StructArray> {
146+
use parquet_variant_compute::VariantArrayBuilder;
147+
use parquet_variant_json::JsonToVariant;
148+
149+
let items: Vec<_> = iter.collect();
150+
let mut builder = VariantArrayBuilder::new(items.len());
151+
152+
for (row_idx, item) in items.into_iter().enumerate() {
153+
match item {
154+
Some(json_str) => builder
155+
.append_json(json_str)
156+
.map_err(|e| DataFusionError::Execution(format!("Invalid JSON at row {}: {} (value: '{}')", row_idx, e, json_str)))?,
157+
None => builder.append_null(),
158+
}
159+
}
160+
161+
Ok(builder.build().into())
162+
}
163+
85164
// Compression level for parquet files - kept for WriterProperties fallback
86165
const ZSTD_COMPRESSION_LEVEL: i32 = 3;
87166

@@ -712,11 +791,14 @@ impl Database {
712791

713792
self.register_pg_settings_table(ctx)?;
714793
self.register_set_config_udf(ctx);
715-
self.register_json_functions(ctx);
716794

717-
// Register custom PostgreSQL-compatible functions
795+
// CRITICAL: Register custom functions BEFORE JSON functions to ensure VariantAwareExprPlanner
796+
// intercepts -> and ->> operators on Variant columns before JsonExprPlanner handles them as strings
718797
crate::functions::register_custom_functions(ctx).map_err(|e| DataFusionError::Execution(format!("Failed to register custom functions: {}", e)))?;
719798

799+
// JSON functions (JsonExprPlanner for -> and ->> on string columns - must come after Variant handlers)
800+
self.register_json_functions(ctx);
801+
720802
Ok(())
721803
}
722804

@@ -1205,7 +1287,10 @@ impl Database {
12051287

12061288
// Fallback to legacy batch queue if configured
12071289
let enable_queue = self.config.core.enable_batch_queue;
1208-
if !skip_queue && enable_queue && let Some(ref queue) = self.batch_queue {
1290+
if !skip_queue
1291+
&& enable_queue
1292+
&& let Some(ref queue) = self.batch_queue
1293+
{
12091294
span.record("use_queue", true);
12101295
for batch in batches {
12111296
if let Err(e) = queue.queue(batch) {
@@ -1892,14 +1977,18 @@ impl DataSink for ProjectRoutingTable {
18921977
let span = tracing::Span::current();
18931978
let mut total_row_count = 0;
18941979
let mut project_batches: HashMap<String, Vec<RecordBatch>> = HashMap::new();
1980+
let target_schema = self.schema();
18951981

1896-
// Collect and group batches by project_id
1982+
// Collect and group batches by project_id, converting variant columns
18971983
while let Some(batch) = data.next().await.transpose()? {
18981984
let batch_rows = batch.num_rows();
18991985
debug!("write_all: received batch with {} rows", batch_rows);
19001986
total_row_count += batch_rows;
19011987
let project_id = extract_project_id(&batch).unwrap_or_else(|| self.default_project.clone());
1902-
project_batches.entry(project_id).or_default().push(batch);
1988+
1989+
// Convert string columns to Variant where target schema expects Variant
1990+
let converted_batch = convert_variant_columns(batch, &target_schema)?;
1991+
project_batches.entry(project_id).or_default().push(converted_batch);
19031992
}
19041993

19051994
span.record("rows.count", total_row_count);

0 commit comments

Comments
 (0)