Skip to content

Commit 02823a0

Browse files
committed
DataFusion 44 upgrade
1 parent 54bc304 commit 02823a0

13 files changed

Lines changed: 370 additions & 187 deletions

File tree

Cargo.lock

Lines changed: 221 additions & 131 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,27 +2,27 @@
22
members = ["clade", "object_store_factory"]
33

44
[workspace.dependencies]
5-
arrow = { version = "53.2.0", features = ["test_utils"] }
6-
arrow-buffer = "53.2.0"
7-
arrow-csv = "53.2.0"
8-
arrow-flight = "53.2.0"
5+
arrow = { version = "53.3.0", features = ["test_utils"] }
6+
arrow-buffer = "53.3.0"
7+
arrow-csv = "53.3.0"
8+
arrow-flight = "53.3.0"
99
# For the JSON format support
1010
# https://github.com/apache/arrow-rs/pull/2868
1111
# https://github.com/apache/arrow-rs/pull/2724
12-
arrow-integration-test = "53.2.0"
13-
arrow-row = "53.2.0"
14-
arrow-schema = "53.2.0"
12+
arrow-integration-test = "53.3.0"
13+
arrow-row = "53.3.0"
14+
arrow-schema = "53.3.0"
1515
async-trait = "0.1.83"
1616

17-
datafusion = { version = "43.0.0", features = ["backtrace"] }
18-
datafusion-common = "43.0.0"
19-
datafusion-expr = "43.0.0"
20-
datafusion-functions-nested = "43.0.0"
17+
datafusion = { version = "44.0.0", features = ["backtrace"] }
18+
datafusion-common = "44.0.0"
19+
datafusion-expr = "44.0.0"
20+
datafusion-functions-nested = "44.0.0"
2121

2222
futures = "0.3"
2323

24-
iceberg = { git = "https://github.com/splitgraph/iceberg-rust", rev = "1e01b7b7b2009076941f3ec1f04340e961d4628a" }
25-
iceberg-datafusion = { git = "https://github.com/splitgraph/iceberg-rust", rev = "1e01b7b7b2009076941f3ec1f04340e961d4628a" }
24+
iceberg = { git = "https://github.com/splitgraph/iceberg-rust", rev = "cb5c36565b641d2309d4e96fcf6a1ee21308aa0d" }
25+
iceberg-datafusion = { git = "https://github.com/splitgraph/iceberg-rust", rev = "cb5c36565b641d2309d4e96fcf6a1ee21308aa0d" }
2626

2727
itertools = ">=0.10.0"
2828
object_store = { version = "0.11", features = ["aws", "azure", "gcp"] }
@@ -87,8 +87,8 @@ clap = { version = "4.5.21", features = [ "derive" ] }
8787
config = "0.14.0"
8888

8989
# PG wire protocol support
90-
convergence = { git = "https://github.com/splitgraph/convergence", branch = "datafusion-43-upgrade", optional = true }
91-
convergence-arrow = { git = "https://github.com/splitgraph/convergence", branch = "datafusion-43-upgrade", optional = true }
90+
convergence = { git = "https://github.com/splitgraph/convergence", branch = "datafusion-44-upgrade", optional = true }
91+
convergence-arrow = { git = "https://github.com/splitgraph/convergence", branch = "datafusion-44-upgrade", optional = true }
9292

9393
dashmap = "6.1.0"
9494

@@ -99,7 +99,7 @@ datafusion-functions-nested = { workspace = true }
9999

100100
datafusion-remote-tables = { path = "./datafusion_remote_tables", optional = true }
101101

102-
deltalake = { git = "https://github.com/splitgraph/delta-rs", rev = "eff5735698279c12ae4a3aac2afa268d168242b2", features = ["datafusion", "s3"] }
102+
deltalake = { git = "https://github.com/splitgraph/delta-rs", rev = "a639dea6289839161baae15ff368db74dbac2074", features = ["datafusion", "s3"] }
103103
fastrand = "2.2.0"
104104

105105
futures = "0.3"
@@ -133,7 +133,7 @@ rustyline = "14.0"
133133
serde = { workspace = true }
134134
serde_json = { workspace = true }
135135
sha2 = ">=0.10.1"
136-
sqlparser = { version = "0.51", features = ["visitor"] }
136+
sqlparser = { version = "0.53", features = ["visitor"] }
137137
sqlx = { version = "0.7.1", features = [ "runtime-tokio-rustls", "sqlite", "any", "uuid" ] }
138138
strum = ">=0.24"
139139
strum_macros = ">=0.24"

datafusion_remote_tables/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ arrow-schema = { workspace = true }
1919
async-trait = { workspace = true }
2020

2121
# Remote query execution for a variety of DBs
22-
connectorx = { git = "https://github.com/splitgraph/connector-x", branch = "datafusion-43-upgrade", features = [ "dst_arrow", "src_postgres", "src_mysql", "src_sqlite" ] }
22+
connectorx = { git = "https://github.com/splitgraph/connector-x", branch = "datafusion-44-upgrade", features = [ "dst_arrow", "src_postgres", "src_mysql", "src_sqlite" ] }
2323

2424
datafusion = { workspace = true }
2525
datafusion-common = { workspace = true }

src/config/context.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use datafusion::execution::{
1515
};
1616
use datafusion::{
1717
common::Result,
18-
execution::runtime_env::{RuntimeConfig, RuntimeEnv},
18+
execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder},
1919
prelude::{SessionConfig, SessionContext},
2020
};
2121
use deltalake::delta_datafusion::DeltaTableFactory;
@@ -132,7 +132,7 @@ pub fn setup_metrics(metrics: &schema::Metrics) {
132132
}
133133

134134
pub async fn build_context(cfg: schema::SeafowlConfig) -> Result<SeafowlContext> {
135-
let mut runtime_config = RuntimeConfig::new();
135+
let mut runtime_env_builder = RuntimeEnvBuilder::new();
136136

137137
let memory_pool: Arc<dyn MemoryPool> =
138138
if let Some(max_memory) = cfg.runtime.max_memory {
@@ -143,18 +143,18 @@ pub async fn build_context(cfg: schema::SeafowlConfig) -> Result<SeafowlContext>
143143
Arc::new(UnboundedMemoryPool::default())
144144
};
145145

146-
runtime_config =
147-
runtime_config.with_memory_pool(Arc::new(MemoryPoolMetrics::new(memory_pool)));
146+
runtime_env_builder = runtime_env_builder
147+
.with_memory_pool(Arc::new(MemoryPoolMetrics::new(memory_pool)));
148148

149149
if let Some(temp_dir) = &cfg.runtime.temp_dir {
150-
runtime_config = runtime_config.with_temp_file_path(temp_dir);
150+
runtime_env_builder = runtime_env_builder.with_temp_file_path(temp_dir);
151151
}
152152

153153
let session_config = SessionConfig::from_env()?
154154
.with_information_schema(true)
155155
.with_default_catalog_and_schema(DEFAULT_DB, DEFAULT_SCHEMA);
156156

157-
let runtime_env = RuntimeEnv::try_new(runtime_config)?;
157+
let runtime_env = runtime_env_builder.build()?;
158158
let state = build_state_with_table_factories(session_config, Arc::new(runtime_env));
159159
let context = SessionContext::new_with_state(state);
160160

src/context/iceberg.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ use iceberg::TableCreation;
3131
use opendal;
3232
use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
3333
use parquet::file::properties::WriterProperties;
34-
use tracing::info;
34+
use tracing::{info, warn};
3535
use url::Url;
3636
use uuid::Uuid;
3737

@@ -363,11 +363,13 @@ pub async fn record_batches_to_iceberg(
363363
if let Some(opendal_error) =
364364
iceberg_error_source.downcast_ref::<opendal::Error>()
365365
{
366-
if opendal_error.kind() == opendal::ErrorKind::ConditionNotMatch {
366+
if opendal_error.kind() == opendal::ErrorKind::AlreadyExists {
367+
warn!("Failed writing new metadata file {new_metadata_location} due to a concurrency error");
367368
return Err(DataLoadingError::OptimisticConcurrencyError());
368369
}
369370
}
370371
}
372+
warn!("Failed writing new metadata file {new_metadata_location} due to: {iceberg_error}");
371373
return Err(iceberg_error.into());
372374
};
373375
info!("Wrote new metadata: {:?}", new_metadata_location);

src/context/logical.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ use datafusion_expr::logical_plan::{Extension, LogicalPlan};
2626
use deltalake::DeltaTable;
2727
use itertools::Itertools;
2828
use sqlparser::ast::{
29-
AlterTableOperation, CreateFunctionBody, CreateTable as CreateTableSql,
30-
Expr as SqlExpr, Expr, Insert, ObjectType, Query, Statement, TableFactor,
31-
TableWithJoins, Value, VisitMut,
29+
AlterTableOperation, CreateFunction as CreateFunctionSql, CreateFunctionBody,
30+
CreateTable as CreateTableSql, Expr as SqlExpr, Expr, Insert, ObjectType, Query,
31+
Statement, TableFactor, TableWithJoins, Value, VisitMut,
3232
};
3333
use std::sync::Arc;
3434
use tracing::debug;
@@ -223,13 +223,13 @@ impl SeafowlContext {
223223
state.statement_to_plan(stmt).await
224224
},
225225

226-
Statement::CreateFunction {
226+
Statement::CreateFunction(CreateFunctionSql {
227227
or_replace,
228228
temporary: false,
229229
name,
230230
function_body: Some(CreateFunctionBody::AsBeforeOptions(Expr::Value(Value::SingleQuotedString(details)))),
231231
..
232-
} => {
232+
}) => {
233233
// We abuse the fact that in CREATE FUNCTION AS [class_name], class_name can be an arbitrary string
234234
// and so we can get the user to put some JSON in there
235235
let function_details: CreateFunctionDetails = serde_json::from_str(details)

src/datafusion/parser.rs

Lines changed: 81 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,12 @@ use lazy_static::lazy_static;
3131
use sqlparser::ast::{
3232
CreateFunctionBody, Expr, ObjectName, OrderByExpr, TruncateTableTarget, Value,
3333
};
34-
use sqlparser::tokenizer::{TokenWithLocation, Word};
34+
use sqlparser::tokenizer::{TokenWithSpan, Word};
3535
use sqlparser::{
36-
ast::{ColumnDef, ColumnOptionDef, Statement as SQLStatement, TableConstraint},
36+
ast::{
37+
ColumnDef, ColumnOptionDef, CreateFunction, Statement as SQLStatement,
38+
TableConstraint,
39+
},
3740
dialect::{keywords::Keyword, Dialect, GenericDialect},
3841
parser::{Parser, ParserError},
3942
tokenizer::{Token, Tokenizer},
@@ -135,7 +138,7 @@ impl<'a> DFParser<'a> {
135138
fn expected<T>(
136139
&self,
137140
expected: &str,
138-
found: TokenWithLocation,
141+
found: TokenWithSpan,
139142
) -> Result<T, ParserError> {
140143
parser_err!(format!("Expected {expected}, found: {found}"))
141144
}
@@ -225,6 +228,7 @@ impl<'a> DFParser<'a> {
225228
only: false,
226229
identity: None,
227230
cascade: None,
231+
on_cluster: None,
228232
})))
229233
}
230234

@@ -242,6 +246,7 @@ impl<'a> DFParser<'a> {
242246
only: false,
243247
identity: None,
244248
cascade: None,
249+
on_cluster: None,
245250
})))
246251
}
247252

@@ -258,23 +263,76 @@ impl<'a> DFParser<'a> {
258263
CopyToSource::Relation(table_name)
259264
};
260265

261-
self.parser.expect_keyword(Keyword::TO)?;
266+
#[derive(Default)]
267+
struct Builder {
268+
stored_as: Option<String>,
269+
target: Option<String>,
270+
partitioned_by: Option<Vec<String>>,
271+
options: Option<Vec<(String, Value)>>,
272+
}
262273

263-
let target = self.parser.parse_literal_string()?;
274+
let mut builder = Builder::default();
264275

265-
// check for options in parens
266-
let options = if self.parser.peek_token().token == Token::LParen {
267-
self.parse_value_options()?
268-
} else {
269-
vec![]
276+
loop {
277+
if let Some(keyword) = self.parser.parse_one_of_keywords(&[
278+
Keyword::STORED,
279+
Keyword::TO,
280+
Keyword::PARTITIONED,
281+
Keyword::OPTIONS,
282+
Keyword::WITH,
283+
]) {
284+
match keyword {
285+
Keyword::STORED => {
286+
self.parser.expect_keyword(Keyword::AS)?;
287+
ensure_not_set(&builder.stored_as, "STORED AS")?;
288+
builder.stored_as = Some(self.parse_file_format()?);
289+
}
290+
Keyword::TO => {
291+
ensure_not_set(&builder.target, "TO")?;
292+
builder.target = Some(self.parser.parse_literal_string()?);
293+
}
294+
Keyword::WITH => {
295+
self.parser.expect_keyword(Keyword::HEADER)?;
296+
self.parser.expect_keyword(Keyword::ROW)?;
297+
return parser_err!("WITH HEADER ROW clause is no longer in use. Please use the OPTIONS clause with 'format.has_header' set appropriately, e.g., OPTIONS ('format.has_header' 'true')");
298+
}
299+
Keyword::PARTITIONED => {
300+
self.parser.expect_keyword(Keyword::BY)?;
301+
ensure_not_set(&builder.partitioned_by, "PARTITIONED BY")?;
302+
builder.partitioned_by = Some(self.parse_partitions()?);
303+
}
304+
Keyword::OPTIONS => {
305+
ensure_not_set(&builder.options, "OPTIONS")?;
306+
builder.options = Some(self.parse_value_options()?);
307+
}
308+
_ => {
309+
unreachable!()
310+
}
311+
}
312+
} else {
313+
let token = self.parser.next_token();
314+
if token == Token::EOF || token == Token::SemiColon {
315+
break;
316+
} else {
317+
return Err(ParserError::ParserError(format!(
318+
"Unexpected token {token}"
319+
)));
320+
}
321+
}
322+
}
323+
324+
let Some(target) = builder.target else {
325+
return Err(ParserError::ParserError(
326+
"Missing TO clause in COPY statement".into(),
327+
));
270328
};
271329

272330
Ok(Statement::CopyTo(CopyToStatement {
273331
source,
274332
target,
275-
options,
276-
partitioned_by: vec![],
277-
stored_as: None,
333+
partitioned_by: builder.partitioned_by.unwrap_or(vec![]),
334+
stored_as: builder.stored_as,
335+
options: builder.options.unwrap_or(vec![]),
278336
}))
279337
}
280338

@@ -358,7 +416,7 @@ impl<'a> DFParser<'a> {
358416
self.parser.expect_keyword(Keyword::AS)?;
359417
let body = self.parse_create_function_body_string()?;
360418

361-
let create_function = SQLStatement::CreateFunction {
419+
let create_function = SQLStatement::CreateFunction(CreateFunction {
362420
or_replace,
363421
temporary,
364422
if_not_exists: false,
@@ -374,7 +432,7 @@ impl<'a> DFParser<'a> {
374432
determinism_specifier: None,
375433
options: None,
376434
remote_connection: None,
377-
};
435+
});
378436

379437
Ok(Statement::Statement(Box::from(create_function)))
380438
}
@@ -544,6 +602,10 @@ impl<'a> DFParser<'a> {
544602
&mut self,
545603
unbounded: bool,
546604
) -> Result<Statement, ParserError> {
605+
let temporary = self
606+
.parser
607+
.parse_one_of_keywords(&[Keyword::TEMP, Keyword::TEMPORARY])
608+
.is_some();
547609
self.parser.expect_keyword(Keyword::TABLE)?;
548610
let if_not_exists =
549611
self.parser
@@ -606,10 +668,10 @@ impl<'a> DFParser<'a> {
606668
// Note that mixing both names and definitions is not allowed
607669
let peeked = self.parser.peek_nth_token(2);
608670
if peeked == Token::Comma || peeked == Token::RParen {
609-
// list of column names
671+
// List of column names
610672
builder.table_partition_cols = Some(self.parse_partitions()?)
611673
} else {
612-
// list of column defs
674+
// List of column defs
613675
let (cols, cons) = self.parse_columns()?;
614676
builder.table_partition_cols = Some(
615677
cols.iter().map(|col| col.name.to_string()).collect(),
@@ -665,7 +727,7 @@ impl<'a> DFParser<'a> {
665727
table_partition_cols: builder.table_partition_cols.unwrap_or(vec![]),
666728
order_exprs: builder.order_exprs,
667729
if_not_exists,
668-
temporary: false,
730+
temporary,
669731
unbounded,
670732
options: builder.options.unwrap_or(Vec::new()),
671733
constraints,
@@ -692,7 +754,7 @@ impl<'a> DFParser<'a> {
692754
options.push((key, value));
693755
let comma = self.parser.consume_token(&Token::Comma);
694756
if self.parser.consume_token(&Token::RParen) {
695-
// allow a trailing comma, even though it's not in standard
757+
// Allow a trailing comma, even though it's not in standard
696758
break;
697759
} else if !comma {
698760
return self.expected(

src/datafusion/utils.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ pub(crate) fn convert_simple_data_type(sql_type: &SQLDataType) -> Result<DataTyp
117117
| SQLDataType::Regclass
118118
| SQLDataType::Custom(_, _)
119119
| SQLDataType::Array(_)
120-
| SQLDataType::Enum(_)
120+
| SQLDataType::Enum(_, _)
121121
| SQLDataType::Set(_)
122122
| SQLDataType::MediumInt(_)
123123
| SQLDataType::UnsignedMediumInt(_)
@@ -163,6 +163,14 @@ pub(crate) fn convert_simple_data_type(sql_type: &SQLDataType) -> Result<DataTyp
163163
| SQLDataType::Nullable(_)
164164
| SQLDataType::LowCardinality(_)
165165
| SQLDataType::Trigger
166+
| SQLDataType::TinyBlob
167+
| SQLDataType::MediumBlob
168+
| SQLDataType::LongBlob
169+
| SQLDataType::TinyText
170+
| SQLDataType::MediumText
171+
| SQLDataType::LongText
172+
| SQLDataType::Bit(_)
173+
| SQLDataType::BitVarying(_)
166174
=> not_impl_err!(
167175
"Unsupported SQL type {sql_type:?}"
168176
),

src/sync/planner.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -984,8 +984,7 @@ mod tests {
984984
" RepartitionExec: partitioning=Hash",
985985
" ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, true as __lower_rel]",
986986
" DeltaScan",
987-
" RepartitionExec: partitioning=RoundRobinBatch",
988-
" ParquetExec: file_groups={1 group: [[]]}, projection=[c1, c2]",
987+
" ParquetExec: file_groups={1 group: [[]]}, projection=[c1, c2]",
989988
" CoalesceBatchesExec: target_batch_size=8192",
990989
" RepartitionExec: partitioning=Hash",
991990
" UnnestExec",

0 commit comments

Comments (0)