
Commit d8a3be3

wip
1 parent: d3b46ab

8 files changed: +634 -179 lines


Cargo.lock

Lines changed: 266 additions & 24 deletions
(Generated lockfile; diff not rendered.)

Cargo.toml

Lines changed: 8 additions & 0 deletions
@@ -39,6 +39,10 @@ parquet = { version = "55.2.0", optional = true }
 arrow = { version = "55.2.0", optional = true }
 tokio-stream = { version = "0.1.17", optional = true }
 hyper-util = { version = "0.1.16", optional = true }
+sqllogictest = { version = "0.20", optional = true }
+regex = { version = "1.0", optional = true }
+clap = { version = "4.0", features = ["derive"], optional = true }
+env_logger = { version = "0.10", optional = true }
 pin-project = "1.1.10"

 [features]
@@ -50,6 +54,10 @@ integration = [
     "arrow",
     "tokio-stream",
     "hyper-util",
+    "sqllogictest",
+    "regex",
+    "clap",
+    "env_logger",
 ]

 tpch = ["integration"]
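With these optional dependencies wired into the integration feature, the new binary added below would be invoked roughly as follows; the .slt path is illustrative, not part of this commit:

cargo run --features integration --bin logictest -- tests/sqllogictest/queries.slt --nodes 3
cargo run --features integration --bin logictest -- tests/sqllogictest/queries.slt --override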

src/bin/logictest.rs

Lines changed: 137 additions & 0 deletions
#[cfg(feature = "integration")]
use clap::Parser;
#[cfg(feature = "integration")]
use datafusion_distributed::test_utils::sqllogictest::sqllogictest::DataFusionDistributedDB;
#[cfg(feature = "integration")]
use sqllogictest::Runner;
#[cfg(feature = "integration")]
use std::path::PathBuf;

#[cfg(feature = "integration")]
#[derive(Parser)]
#[command(name = "logictest")]
#[command(about = "A SQLLogicTest runner for DataFusion Distributed")]
struct Args {
    /// Test files or directories to run
    #[arg(required = true)]
    files: Vec<PathBuf>,

    /// Override mode: update test files with actual output
    #[arg(long = "override")]
    override_mode: bool,

    /// Number of distributed nodes to start
    #[arg(long, default_value = "3")]
    nodes: usize,
}

#[cfg(feature = "integration")]
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let args = Args::parse();

    println!("Starting DataFusion Distributed SQLLogicTest Runner");
    println!(
        "Mode: {}",
        if args.override_mode { "Override" } else { "Verify" }
    );
    println!("Nodes: {}", args.nodes);

    // Closure that creates a new database connection per test file.
    let nodes = args.nodes;
    let mut runner =
        Runner::new(move || async move { Ok(DataFusionDistributedDB::new(nodes).await) });

    if args.override_mode {
        // Override mode: rewrite the test files with the actual output.
        for file_path in &args.files {
            if file_path.is_file() {
                println!("Generating completion for: {}", file_path.display());
                let file_path_str = file_path.to_str().expect("Invalid file path");

                // Use the built-in update_test_file with the default validators.
                match runner
                    .update_test_file(
                        file_path_str,
                        file_path_str,
                        sqllogictest::default_validator,
                        sqllogictest::default_column_validator,
                    )
                    .await
                {
                    Ok(_) => println!("✅ {}: Generated", file_path.display()),
                    Err(e) => {
                        eprintln!("❌ {}: Failed to generate", file_path.display());
                        eprintln!("   Error: {}", e);
                    }
                }
            } else {
                eprintln!("Override mode only works with individual files, not directories");
            }
        }
    } else {
        // Verify mode: compare results against the expected output.
        for file_path in &args.files {
            if file_path.is_file() {
                println!("Running test file: {}", file_path.display());
                match runner.run_file_async(file_path).await {
                    Ok(_) => println!("✅ {}: PASSED", file_path.display()),
                    Err(e) => {
                        eprintln!("❌ {}: FAILED", file_path.display());
                        eprintln!("   Error: {}", e);
                    }
                }
            } else if file_path.is_dir() {
                println!("Running tests in directory: {}", file_path.display());
                run_directory(&mut runner, file_path).await?;
            } else {
                eprintln!(
                    "Warning: {} is neither a file nor a directory",
                    file_path.display()
                );
            }
        }
    }

    Ok(())
}

#[cfg(feature = "integration")]
async fn run_directory<D, M>(
    runner: &mut Runner<D, M>,
    dir_path: &PathBuf,
) -> Result<(), Box<dyn std::error::Error>>
where
    D: sqllogictest::AsyncDB,
    M: sqllogictest::MakeConnection<Conn = D>,
{
    // Collect all .slt files in the directory.
    let mut entries: Vec<_> = std::fs::read_dir(dir_path)?
        .filter_map(|entry| entry.ok())
        .filter(|entry| {
            entry
                .path()
                .extension()
                .and_then(|ext| ext.to_str())
                .map(|ext| ext == "slt")
                .unwrap_or(false)
        })
        .collect();

    // Sort entries so files run in a deterministic order.
    entries.sort_by_key(|entry| entry.path());

    for entry in entries {
        let file_path = entry.path();
        println!("Running test file: {}", file_path.display());
        match runner.run_file_async(&file_path).await {
            Ok(_) => println!("✅ {}: PASSED", file_path.display()),
            Err(e) => {
                eprintln!("❌ {}: FAILED", file_path.display());
                eprintln!("   Error: {}", e);
            }
        }
    }

    Ok(())
}

#[cfg(not(feature = "integration"))]
fn main() {
    eprintln!("This binary requires the 'integration' feature to be enabled.");
    eprintln!("Run with: cargo run --features integration --bin logictest");
    std::process::exit(1);
}
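For reference, a minimal sketch of the .slt input this runner consumes; the table and query are illustrative. Since the adapter in src/test_utils/sqllogictest.rs reports every result column as text, queries use the T type marker:

# illustrative .slt contents
statement ok
CREATE TABLE t (a INT)

query T
SELECT 1 + 2
----
3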

src/test_utils/mod.rs

Lines changed: 1 addition & 0 deletions
@@ -5,4 +5,5 @@ pub mod metrics;
 pub mod mock_exec;
 pub mod parquet;
 pub mod session_context;
+pub mod sqllogictest;
 pub mod tpch;

src/test_utils/sqllogictest.rs

Lines changed: 164 additions & 0 deletions
#[cfg(feature = "integration")]
pub mod sqllogictest {
    use crate::test_utils::localhost::start_localhost_context;
    use crate::DefaultSessionBuilder;
    use async_trait::async_trait;
    use datafusion::arrow::array::{ArrayRef, RecordBatch, StringArray};
    use datafusion::arrow::datatypes::{DataType, Field, Schema};
    use datafusion::execution::context::SessionContext;
    use sqllogictest::{AsyncDB, DBOutput, DefaultColumnType};
    use std::sync::Arc;

    pub struct DataFusionDistributedDB {
        ctx: SessionContext,
    }

    impl DataFusionDistributedDB {
        pub async fn new(num_nodes: usize) -> Self {
            // Start a distributed context with the specified number of nodes.
            let (ctx, _guard) = start_localhost_context(num_nodes, DefaultSessionBuilder).await;

            // Use the existing parquet tables from test_utils.
            use crate::test_utils::parquet::register_parquet_tables;
            register_parquet_tables(&ctx).await.unwrap();

            // Keep the guard alive by forgetting it (for CLI purposes).
            std::mem::forget(_guard);

            Self { ctx }
        }

        async fn handle_explain_analyze(
            &mut self,
            sql: &str,
        ) -> Result<DBOutput<DefaultColumnType>, datafusion::error::DataFusionError> {
            // For now, just treat it as a regular EXPLAIN.
            // TODO: Implement proper distributed EXPLAIN ANALYZE with metrics.
            let explain_sql = sql.replace("EXPLAIN ANALYZE", "EXPLAIN");
            let df = self.ctx.sql(&explain_sql).await?;
            let batches = df.collect().await?;
            self.convert_batches_to_output(batches)
        }

        // TODO: Implement proper metrics obfuscation for EXPLAIN ANALYZE.
        #[allow(dead_code)]
        fn obfuscate_metrics(&self, plan_str: &str) -> String {
            use regex::Regex;

            let mut obfuscated = plan_str.to_string();

            // Replace timestamps with <TIMESTAMP>.
            let timestamp_regex = Regex::new(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+").unwrap();
            obfuscated = timestamp_regex
                .replace_all(&obfuscated, "<TIMESTAMP>")
                .to_string();

            // Replace durations (e.g., "123.456ms", "1.234s") with <DURATION>.
            let duration_regex = Regex::new(r"\d+\.\d+[μmn]?s").unwrap();
            obfuscated = duration_regex
                .replace_all(&obfuscated, "<DURATION>")
                .to_string();

            // Replace memory sizes (e.g., "1.2MB", "345KB") with <MEMORY>.
            let memory_regex = Regex::new(r"\d+\.\d+[KMGT]?B").unwrap();
            obfuscated = memory_regex.replace_all(&obfuscated, "<MEMORY>").to_string();

            // Replace row counts with <COUNT>.
            let count_regex = Regex::new(r"rows=\d+").unwrap();
            obfuscated = count_regex
                .replace_all(&obfuscated, "rows=<COUNT>")
                .to_string();

            obfuscated
        }

        fn convert_batches_to_output(
            &self,
            batches: Vec<RecordBatch>,
        ) -> Result<DBOutput<DefaultColumnType>, datafusion::error::DataFusionError> {
            if batches.is_empty() {
                return Ok(DBOutput::Rows {
                    types: vec![],
                    rows: vec![],
                });
            }

            // Report every column as text; sqllogictest compares rendered strings.
            let num_columns = batches[0].num_columns();
            let column_types = vec![DefaultColumnType::Text; num_columns];

            let mut rows = Vec::new();
            for batch in batches {
                for row_idx in 0..batch.num_rows() {
                    let mut row = Vec::new();
                    for col_idx in 0..batch.num_columns() {
                        let column = batch.column(col_idx);
                        let value =
                            datafusion::arrow::util::display::array_value_to_string(column, row_idx)
                                .map_err(|e| {
                                    datafusion::error::DataFusionError::ArrowError(Box::new(e), None)
                                })?;
                        row.push(value);
                    }
                    rows.push(row);
                }
            }

            Ok(DBOutput::Rows {
                types: column_types,
                rows,
            })
        }
    }

    #[async_trait]
    impl AsyncDB for DataFusionDistributedDB {
        type Error = datafusion::error::DataFusionError;
        type ColumnType = DefaultColumnType;

        async fn run(&mut self, sql: &str) -> Result<DBOutput<Self::ColumnType>, Self::Error> {
            let sql = sql.trim();

            // DDL/DML statements complete without producing rows.
            if sql.to_uppercase().starts_with("CREATE")
                || sql.to_uppercase().starts_with("INSERT")
                || sql.to_uppercase().starts_with("DROP")
            {
                return Ok(DBOutput::StatementComplete(0));
            }

            // Handle EXPLAIN ANALYZE.
            if sql.to_uppercase().starts_with("EXPLAIN ANALYZE") {
                return self.handle_explain_analyze(sql).await;
            }

            // Handle regular EXPLAIN: render the distributed plan.
            if sql.to_uppercase().starts_with("EXPLAIN") {
                let query = sql.trim_start_matches("EXPLAIN").trim();
                let df = self.ctx.sql(query).await?;
                let physical_plan = df.create_physical_plan().await?;

                // Apply the distributed optimizer to get the distributed plan.
                use crate::DistributedPhysicalOptimizerRule;
                use datafusion::physical_optimizer::PhysicalOptimizerRule;
                use datafusion::physical_plan::displayable;

                let physical_distributed = DistributedPhysicalOptimizerRule::default()
                    .with_network_shuffle_tasks(2)
                    .with_network_coalesce_tasks(2)
                    .optimize(physical_plan, &Default::default())?;

                let physical_distributed_str = displayable(physical_distributed.as_ref())
                    .indent(true)
                    .to_string();

                // Emit the plan as a single-column batch, one row per plan line.
                let lines: Vec<String> = physical_distributed_str
                    .lines()
                    .map(|s| s.to_string())
                    .collect();
                let schema = Arc::new(Schema::new(vec![Field::new("plan", DataType::Utf8, false)]));
                let batch = RecordBatch::try_new(
                    schema,
                    vec![Arc::new(StringArray::from(lines)) as ArrayRef],
                )?;

                return self.convert_batches_to_output(vec![batch]);
            }

            // Execute the query.
            let df = self.ctx.sql(sql).await?;
            let batches = df.collect().await?;

            self.convert_batches_to_output(batches)
        }

        fn engine_name(&self) -> &str {
            "datafusion-distributed"
        }
    }
}
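A minimal sketch (test name and query are illustrative) of exercising the adapter directly through the AsyncDB trait, without going through the CLI runner:

#[cfg(feature = "integration")]
#[tokio::test]
async fn distributed_explain_returns_rows() {
    use datafusion_distributed::test_utils::sqllogictest::sqllogictest::DataFusionDistributedDB;
    use sqllogictest::{AsyncDB, DBOutput};

    // Spins up a localhost cluster with 3 nodes behind the adapter.
    let mut db = DataFusionDistributedDB::new(3).await;

    // EXPLAIN is intercepted: the plan goes through the distributed optimizer
    // and comes back as one text row per plan line.
    match db.run("EXPLAIN SELECT 1").await.unwrap() {
        DBOutput::Rows { rows, .. } => assert!(!rows.is_empty()),
        _ => panic!("expected rows"),
    }
}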
