From 5290ea81662d62a433c51f48bb3079256b003551 Mon Sep 17 00:00:00 2001 From: aditya singh rathore Date: Fri, 25 Jul 2025 19:50:28 +0530 Subject: [PATCH 1/7] Create 2025-07-27-extending-sql-with-satafusion --- content/blog/2025-07-27-extending-sql-with-satafusion.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 content/blog/2025-07-27-extending-sql-with-satafusion.md diff --git a/content/blog/2025-07-27-extending-sql-with-satafusion.md b/content/blog/2025-07-27-extending-sql-with-satafusion.md new file mode 100644 index 00000000..e69de29b From aee7ebca383e13bead6c1094d94a4c968902cf60 Mon Sep 17 00:00:00 2001 From: aditya singh rathore Date: Sat, 26 Jul 2025 09:28:24 +0530 Subject: [PATCH 2/7] Updating the example --- ...025-07-27-extending-sql-with-satafusion.md | 248 ++++++++++++++++++ 1 file changed, 248 insertions(+) diff --git a/content/blog/2025-07-27-extending-sql-with-satafusion.md b/content/blog/2025-07-27-extending-sql-with-satafusion.md index e69de29b..9c758dd4 100644 --- a/content/blog/2025-07-27-extending-sql-with-satafusion.md +++ b/content/blog/2025-07-27-extending-sql-with-satafusion.md @@ -0,0 +1,248 @@ +--- +layout: post +title: Implementing your own SQL dialect and SQL statements with DataFusion +date: 2025-07-26 +author: Aditya Singh Rathore +categories: [tutorial] +--- + + + +Have you ever wished you could extend SQL with custom statements tailored to your specific use case? Maybe you're working with a parquet-files-on-S3 storage approach and need an `ATTACH DATABASE` statement, or perhaps you want to implement catalog management features similar to [DuckDB] or [SQLite]. With [Apache DataFusion], you can do exactly that – and it's more straightforward than you might think. + +[DuckDB]: https://duckdb.org/ +[SQLite]: https://www.sqlite.org/ +[Apache DataFusion]: https://datafusion.apache.org/ + +## The Challenge: Beyond Standard SQL + +Imagine you're building a data platform that uses a parquet-files-on-S3 storage pattern. Your application needs to dynamically discover and attach databases, similar to how [DuckDB] handles multiple databases with its `ATTACH` statement. While [DataFusion] supports `CREATE EXTERNAL TABLE`, you need something more flexible – perhaps a statement like: + +```sql +CREATE EXTERNAL CATALOG my_catalog +STORED AS PARQUET +LOCATION 's3://my-bucket/data/' +OPTIONS ( + 'aws.region' = 'us-west-2', + 'catalog.type' = 'hive_metastore' +); +``` + +Standard SQL doesn't have this capability, but DataFusion's extensible architecture makes it possible to add custom SQL statements like this. + +## Understanding the SQL Processing Pipeline + +Before diving into custom implementations, let's understand how DataFusion processes SQL queries. The journey from SQL text to execution follows this path: + +```text ++-------+ +--------+ +-----+ +-------------+ +--------------+ +----------+ +| Query | ---> | Parser | ---> | AST | ---> |Logical Plan | ---> |Physical Plan | ---> |Execution | ++-------+ +--------+ +-----+ +-------------+ +--------------+ +----------+ + +``` + +1. **SQL Text:** The raw SQL string you write +2. **Parser:** DataFusion's `DFParser` converts SQL text into an Abstract Syntax Tree (AST) +3. **AST:** A structured representation of the SQL statement using DataFusion's `Statement` enum +4. **Logical Plan:** DataFusion's internal representation that describes what to do +5. **Physical Plan:** The executable plan that describes how to do it +6. 
**Execution:** The actual query execution

The key insight is that DataFusion allows you to wrap and extend the existing `DFParser` to handle custom statements while still leveraging all the built-in SQL parsing capabilities.

## Extending the SQL Parser: Learning from the Example

DataFusion provides an excellent example of custom SQL dialect implementation in their [sql_dialect.rs] example. Let's break down how it works and then apply the pattern to our `ATTACH DATABASE` use case.
[sql_dialect.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/sql_dialect.rs)

## The DataFusion Pattern: Wrapping and Extending

Let's examine the actual DataFusion example to understand the pattern:

### Step 1: Wrap the DataFusion Parser

```rust

use datafusion::sql::{
    parser::{CopyToSource, CopyToStatement, DFParser, DFParserBuilder, Statement},
    sqlparser::{keywords::Keyword, tokenizer::Token},
};

/// Wrap the existing DFParser rather than replacing it
struct MyParser<'a> {
    df_parser: DFParser<'a>,
}

impl<'a> MyParser<'a> {
    fn new(sql: &'a str) -> Result<Self> {
        let df_parser = DFParserBuilder::new(sql).build()?;
        Ok(Self { df_parser })
    }

    /// Check if we're dealing with a COPY statement
    fn is_copy(&self) -> bool {
        matches!(
            self.df_parser.parser.peek_token().token,
            Token::Word(w) if w.keyword == Keyword::COPY
        )
    }
}

```

### Step 2: Delegate Parsing, Add Custom Logic

The crucial part – let DataFusion do the parsing, then add your custom logic:

```rust
impl<'a> MyParser<'a> {
    pub fn parse_statement(&mut self) -> Result<MyStatement> {
        if self.is_copy() {
            // Let DataFusion parse the COPY statement structure
            self.df_parser.parser.next_token(); // consume COPY
            let df_statement = self.df_parser.parse_copy()?;

            // Now add our custom logic
            if let Statement::CopyTo(s) = df_statement {
                Ok(MyStatement::from(s)) // This is where the magic happens
            } else {
                Ok(MyStatement::DFStatement(Box::from(df_statement)))
            }
        } else {
            // For non-COPY statements, delegate completely to DataFusion
            let df_statement = self.df_parser.parse_statement()?;
            Ok(MyStatement::from(df_statement))
        }
    }
}
```

### Step 3: Custom Statement Types with Smart Conversion

Here's the elegant part – the custom logic happens in the type conversion:

```rust
enum MyStatement {
    DFStatement(Box<Statement>), // Standard DataFusion statements
    MyCopyTo(MyCopyToStatement), // Our custom COPY handling
}

// The key insight: custom logic in the From implementation
impl From<CopyToStatement> for MyStatement {
    fn from(s: CopyToStatement) -> Self {
        // Check if this is our special case
        if s.stored_as == Some("FASTA".to_string()) {
            // Handle FASTA format with custom logic
            Self::MyCopyTo(MyCopyToStatement::from(s))
        } else {
            // Fall back to standard DataFusion behavior
            Self::DFStatement(Box::from(Statement::CopyTo(s)))
        }
    }
}

// Our custom COPY statement for FASTA format
struct MyCopyToStatement {
    pub source: CopyToSource,
    pub target: String,
}

impl From<CopyToStatement> for MyCopyToStatement {
    fn from(s: CopyToStatement) -> Self {
        Self {
            source: s.source,
            target: s.target,
        }
    }
}
```

## The Power of This Pattern

What makes this approach so powerful is its simplicity and reusability. Notice how the example:

1. **Reuses all DataFusion parsing logic** - no need to handle SQL syntax errors, keyword parsing, or complex grammar rules
2. **Extends through type conversion** - the custom behavior is cleanly separated in the `From` implementations
3. **Maintains backward compatibility** - standard `COPY` statements work exactly as before
4. **Provides clear extension points** - you can easily add more custom formats

### Running the Example

Here's how the complete example works in practice:

```rust
#[tokio::main]
async fn main() -> Result<()> {
    let mut my_parser =
        MyParser::new("COPY source_table TO 'file.fasta' STORED AS FASTA")?;

    let my_statement = my_parser.parse_statement()?;

    match my_statement {
        MyStatement::DFStatement(s) => {
            println!("Standard DataFusion statement: {s}");
            // Handle with regular DataFusion execution
        }
        MyStatement::MyCopyTo(s) => {
            println!("Custom FASTA export: {s}");
            // Execute your custom FASTA export logic
            execute_fasta_export(s).await?;
        }
    }

    Ok(())
}

async fn execute_fasta_export(stmt: MyCopyToStatement) -> Result<()> {
    println!("Exporting {} to {} in FASTA format", stmt.source, stmt.target);

    // Your custom FASTA export implementation here:
    // 1. Execute the query to get results
    // 2. Format results as FASTA sequences
    // 3. Write to the target file

    Ok(())
}
```

## Key Insights from the DataFusion Example

The DataFusion [sql_dialect.rs] example teaches us several crucial patterns:

1. **Don't reinvent SQL parsing** - Wrap `DFParser` and leverage its robust SQL handling
2. **Extend through type conversion** - Put custom logic in `From` implementations, not parsing code
3. **Maintain compatibility** - Standard statements continue to work unchanged
4. **Keep it simple** - The example focuses on one clear extension rather than trying to do everything
5. **Use enums for clean abstraction** - `MyStatement` provides a unified interface for both standard and custom statements

[sql_dialect.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/sql_dialect.rs

## Conclusion

The DataFusion example demonstrates a remarkably elegant approach to extending SQL: don't fight the existing system, enhance it. By wrapping `DFParser` and adding custom logic through type conversions, you get the best of both worlds – robust SQL parsing and domain-specific functionality.

The pattern's power lies in its simplicity. You write minimal code, reuse [DataFusion]'s battle-tested SQL parsing, and create clean extension points for your custom logic. Whether you need specialized export formats, custom catalog management, or domain-specific data operations, this approach provides a solid foundation.

Start with the DataFusion example, understand how the type conversions work, and then apply the same pattern to extend the SQL statements that matter for your use case. The DataFusion community has created an excellent template – now it's time to build something amazing on top of it.

[DataFusion]: https://datafusion.apache.org/

A heartfelt thank you to [@alamb] and [@andygrove] for their invaluable reviews and thoughtful feedback—they’ve been instrumental in shaping this post.

The Apache Arrow and Apache DataFusion communities are vibrant, welcoming, and full of passionate developers building something truly powerful. If you’re excited about high-performance analytics and want to be part of an open-source journey, I highly encourage you to explore the [official documentation](https://datafusion.apache.org/) and dive into one of the many [open issues](https://github.com/apache/datafusion/issues).
There’s never been a better time to get involved! + +[@andygrove]: https://github.com/andygrove +[@alamb]: https://github.com/alamb From 41a753ed86ed25b3a7128440c90720fdce1b629b Mon Sep 17 00:00:00 2001 From: aditya singh rathore <142787780+Adez017@users.noreply.github.com> Date: Mon, 28 Jul 2025 17:49:04 +0530 Subject: [PATCH 3/7] Update content/blog/2025-07-27-extending-sql-with-satafusion.md Co-authored-by: Jax Liu --- content/blog/2025-07-27-extending-sql-with-satafusion.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/content/blog/2025-07-27-extending-sql-with-satafusion.md b/content/blog/2025-07-27-extending-sql-with-satafusion.md index 9c758dd4..3a0bbdd5 100644 --- a/content/blog/2025-07-27-extending-sql-with-satafusion.md +++ b/content/blog/2025-07-27-extending-sql-with-satafusion.md @@ -52,10 +52,9 @@ Standard SQL doesn't have this capability, but DataFusion's extensible architect Before diving into custom implementations, let's understand how DataFusion processes SQL queries. The journey from SQL text to execution follows this path: ```text -+-------+ +--------+ +-----+ +-------------+ +--------------+ +----------+ -| Query | ---> | Parser | ---> | AST | ---> |Logical Plan | ---> |Physical Plan | ---> |Execution | -+-------+ +--------+ +-----+ +-------------+ +--------------+ +----------+ - ++----------+ +-----+ +--------------+ +---------------+ +-----------+ +| SQL Text |---> | AST | ---> | Logical Plan | ---> | Physical Plan | ---> | Execution | ++----------+ +-----+ +--------------+ +---------------+ +-----------+ ``` 1. **SQL Text:** The raw SQL string you write From c87e2c036c15afdc960c39848c4b2cb90e153cdb Mon Sep 17 00:00:00 2001 From: aditya singh rathore <142787780+Adez017@users.noreply.github.com> Date: Mon, 28 Jul 2025 17:49:18 +0530 Subject: [PATCH 4/7] Update content/blog/2025-07-27-extending-sql-with-satafusion.md Co-authored-by: Jax Liu --- content/blog/2025-07-27-extending-sql-with-satafusion.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/content/blog/2025-07-27-extending-sql-with-satafusion.md b/content/blog/2025-07-27-extending-sql-with-satafusion.md index 3a0bbdd5..af06c99c 100644 --- a/content/blog/2025-07-27-extending-sql-with-satafusion.md +++ b/content/blog/2025-07-27-extending-sql-with-satafusion.md @@ -58,8 +58,7 @@ Before diving into custom implementations, let's understand how DataFusion proce ``` 1. **SQL Text:** The raw SQL string you write -2. **Parser:** DataFusion's `DFParser` converts SQL text into an Abstract Syntax Tree (AST) -3. **AST:** A structured representation of the SQL statement using DataFusion's `Statement` enum +2. **AST:** DataFusion's `DFParser` converts SQL text into an Abstract Syntax Tree (AST), a structured representation of the SQL statement using DataFusion's `Statement` enum 4. **Logical Plan:** DataFusion's internal representation that describes what to do 5. **Physical Plan:** The executable plan that describes how to do it 6. 
**Execution:** The actual query execution From df1b67f41d4cca9988c9557705b5ae6078e7ae01 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 29 Jul 2025 06:21:57 -0400 Subject: [PATCH 5/7] Adjust publish date, and fix a typo in file name --- ...atafusion.md => 2025-08-04-extending-sql-with-datafusion.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename content/blog/{2025-07-27-extending-sql-with-satafusion.md => 2025-08-04-extending-sql-with-datafusion.md} (99%) diff --git a/content/blog/2025-07-27-extending-sql-with-satafusion.md b/content/blog/2025-08-04-extending-sql-with-datafusion.md similarity index 99% rename from content/blog/2025-07-27-extending-sql-with-satafusion.md rename to content/blog/2025-08-04-extending-sql-with-datafusion.md index af06c99c..5d5565e8 100644 --- a/content/blog/2025-07-27-extending-sql-with-satafusion.md +++ b/content/blog/2025-08-04-extending-sql-with-datafusion.md @@ -1,7 +1,7 @@ --- layout: post title: Implementing your own SQL dialect and SQL statements with DataFusion -date: 2025-07-26 +date: 2025-08-04 author: Aditya Singh Rathore categories: [tutorial] --- From 7d1873959928c9553ee22622fea0a77b9444e443 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 29 Jul 2025 06:43:35 -0400 Subject: [PATCH 6/7] Add some more references and tweak the execution flow section --- ...025-08-04-extending-sql-with-datafusion.md | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/content/blog/2025-08-04-extending-sql-with-datafusion.md b/content/blog/2025-08-04-extending-sql-with-datafusion.md index 5d5565e8..cf233982 100644 --- a/content/blog/2025-08-04-extending-sql-with-datafusion.md +++ b/content/blog/2025-08-04-extending-sql-with-datafusion.md @@ -33,7 +33,7 @@ Have you ever wished you could extend SQL with custom statements tailored to you ## The Challenge: Beyond Standard SQL -Imagine you're building a data platform that uses a parquet-files-on-S3 storage pattern. Your application needs to dynamically discover and attach databases, similar to how [DuckDB] handles multiple databases with its `ATTACH` statement. While [DataFusion] supports `CREATE EXTERNAL TABLE`, you need something more flexible – perhaps a statement like: +Imagine you're building a data platform that uses a parquet-files-on-S3 storage pattern. Your application needs to dynamically discover and attach databases, similar to how [DuckDB] handles multiple databases with its `ATTACH` statement. While [DataFusion] supports [CREATE EXTERNAL TABLE], you need something more flexible – perhaps a statement like: ```sql CREATE EXTERNAL CATALOG my_catalog @@ -45,30 +45,40 @@ OPTIONS ( ); ``` -Standard SQL doesn't have this capability, but DataFusion's extensible architecture makes it possible to add custom SQL statements like this. +[CREATE EXTERNAL TABLE]: https://datafusion.apache.org/user-guide/sql/ddl.html#create-external-table + +While the SQL provided in DataFusion doesn't have this capability, its extensible architecture makes it possible to add custom SQL statements by leveraging the existing SQL parsing and execution framework and injecting your custom logic where needed. ## Understanding the SQL Processing Pipeline -Before diving into custom implementations, let's understand how DataFusion processes SQL queries. The journey from SQL text to execution follows this path: +Before diving into custom implementations, let's review how DataFusion processes SQL queries. 
As in most query engines, the journey from SQL text to execution follows this path:

```text
+----------+      +-----+      +--------------+      +---------------+      +-----------+
| SQL Text |--->  | AST | ---> | Logical Plan | ---> | Physical Plan | ---> | Stream    |
+----------+      +-----+      +--------------+      +---------------+      +-----------+
```

1. **SQL Text:** The raw SQL string you write
2. **AST:** DataFusion's [DFParser] converts SQL text into an [Abstract Syntax Tree] (AST), a structured representation of the SQL statement using DataFusion's [Statement] enum
4. **Logical Plan:** A tree of [LogicalPlan] nodes that describes what to do
5. **Physical Plan:** A tree of [ExecutionPlan] nodes that describes how to do it
6. **Stream:** [SendableRecordBatchStream]s form a data flow graph to efficiently execute the query

[DFParser]: https://docs.rs/datafusion/latest/datafusion/sql/parser/struct.DFParser.html
[Abstract Syntax Tree]: https://en.wikipedia.org/wiki/Abstract_syntax_tree
[Statement]: https://docs.rs/datafusion/latest/datafusion/sql/parser/enum.Statement.html
[LogicalPlan]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.LogicalPlan.html
[ExecutionPlan]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html
[SendableRecordBatchStream]: https://docs.rs/datafusion/latest/datafusion/execution/type.SendableRecordBatchStream.html

The key insight is that DataFusion allows you to wrap and extend the existing `DFParser` to handle custom statements while delegating to the built-in SQL parsing capabilities and the rest of the SQL processing pipeline.

## Extending the SQL Parser: Learning from the Example

-DataFusion provides an excellent example of custom SQL dialect implementation in their [sql_dialect.rs] example. Let's break down how it works and then apply the pattern to our `ATTACH DATABASE` use case.
-[sql_dialect.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/sql_dialect.rs)
+DataFusion provides an excellent example of custom SQL dialect implementation in the [sql_dialect.rs] example. Let's break down how it works and then apply the pattern to our `ATTACH DATABASE` use case.

+[sql_dialect.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/sql_dialect.rs

 ## The DataFusion Pattern: Wrapping and Extending

From accf0b5a18219998218f78570460084ab58cf020 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Tue, 29 Jul 2025 06:44:51 -0400
Subject: [PATCH 7/7] tweak section 1

---
 .../blog/2025-08-04-extending-sql-with-datafusion.md | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/content/blog/2025-08-04-extending-sql-with-datafusion.md b/content/blog/2025-08-04-extending-sql-with-datafusion.md
index cf233982..a764777e 100644
--- a/content/blog/2025-08-04-extending-sql-with-datafusion.md
+++ b/content/blog/2025-08-04-extending-sql-with-datafusion.md
@@ -87,13 +87,7 @@ Let's examine the actual DataFusion example to understand the pattern:
 ### Step 1: Wrap the DataFusion Parser
 
 ```rust
-
-use datafusion::sql::{
-    parser::{CopyToSource, CopyToStatement, DFParser, DFParserBuilder, Statement},
-    sqlparser::{keywords::Keyword, tokenizer::Token},
-};
-
-/// Wrap the existing DFParser rather than replacing it
+/// Our custom parser wraps the existing DFParser rather than replacing it
 struct MyParser<'a> {
     df_parser: DFParser<'a>,
 }
@@ -104,7 +98,8 @@ impl<'a> MyParser<'a> {
         Ok(Self { df_parser })
     }
 
-    /// Check if we're dealing with a COPY statement
+    /// Check if we're dealing with a COPY statement by
+    /// inspecting the next token in the parser's stream
     fn is_copy(&self) -> bool {
         matches!(
             self.df_parser.parser.peek_token().token,
@@ -112,7 +107,6 @@ impl<'a> MyParser<'a> {
         )
     }
 }
-
 ```
 
 ### Step 2: Delegate Parsing, Add Custom Logic
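
As a closing illustration of where this pattern could go next, here is a minimal, hypothetical sketch of how the post's motivating `CREATE EXTERNAL CATALOG` statement could be detected with the same token-peeking approach. None of this is part of the DataFusion example: the `MyStatement::CreateExternalCatalog` variant, the `is_create_external_catalog` helper, and the name-only parsing are illustrative assumptions, and a real implementation would still need to parse the `STORED AS`, `LOCATION`, and `OPTIONS` clauses and turn the statement into actual catalog registration.

```rust
use datafusion::error::{DataFusionError, Result};
use datafusion::sql::parser::{DFParser, DFParserBuilder, Statement};
use datafusion::sql::sqlparser::{keywords::Keyword, tokenizer::Token};

/// Hypothetical result of the extended parser: either something DataFusion
/// already understands, or our custom catalog statement.
#[derive(Debug)]
enum MyStatement {
    DFStatement(Box<Statement>),
    /// `CREATE EXTERNAL CATALOG <name> ...` (only the name is parsed in this sketch)
    CreateExternalCatalog { name: String },
}

struct MyParser<'a> {
    df_parser: DFParser<'a>,
}

impl<'a> MyParser<'a> {
    fn new(sql: &'a str) -> Result<Self> {
        Ok(Self {
            df_parser: DFParserBuilder::new(sql).build()?,
        })
    }

    /// Peek for the token sequence `CREATE EXTERNAL CATALOG` without consuming
    /// anything, mirroring the `is_copy` check from the example above.
    fn is_create_external_catalog(&self) -> bool {
        let parser = &self.df_parser.parser;
        matches!(
            parser.peek_nth_token(0).token,
            Token::Word(w) if w.keyword == Keyword::CREATE
        ) && matches!(
            parser.peek_nth_token(1).token,
            Token::Word(w) if w.keyword == Keyword::EXTERNAL
        ) && matches!(
            parser.peek_nth_token(2).token,
            Token::Word(w) if w.value.eq_ignore_ascii_case("CATALOG")
        )
    }

    fn parse_statement(&mut self) -> Result<MyStatement> {
        if self.is_create_external_catalog() {
            // Consume CREATE, EXTERNAL and CATALOG, then read the catalog name.
            // STORED AS / LOCATION / OPTIONS handling is omitted from this sketch.
            for _ in 0..3 {
                self.df_parser.parser.next_token();
            }
            match self.df_parser.parser.next_token().token {
                Token::Word(w) => Ok(MyStatement::CreateExternalCatalog { name: w.value }),
                other => Err(DataFusionError::Plan(format!(
                    "expected a catalog name, found {other}"
                ))),
            }
        } else {
            // Everything else is delegated to DataFusion unchanged.
            Ok(MyStatement::DFStatement(Box::new(
                self.df_parser.parse_statement()?,
            )))
        }
    }
}

fn main() -> Result<()> {
    let mut parser = MyParser::new(
        "CREATE EXTERNAL CATALOG my_catalog STORED AS PARQUET LOCATION 's3://my-bucket/data/'",
    )?;
    println!("{:?}", parser.parse_statement()?);
    Ok(())
}
```

Anything the check does not match still flows through `DFParser::parse_statement`, so standard SQL keeps working exactly as in the FASTA example above.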