Skip to content

Commit 74800aa

Browse files
committed
Add DataFusion dialect
1 parent a8fa037 commit 74800aa

File tree

21 files changed

+2483
-8
lines changed

21 files changed

+2483
-8
lines changed

.github/workflows/ci.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ jobs:
3232
- name: Transpilation tests
3333
run: cargo test --test sqlglot_transpilation test_sqlglot_transpilation_all -p polyglot-sql -- --nocapture
3434

35+
- name: Custom dialect tests
36+
run: cargo test --test custom_dialect_tests -p polyglot-sql -- --nocapture
37+
3538
sdk-build:
3639
needs: rust-test
3740
runs-on: ubuntu-latest

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
# debug information files
4141
*.dwo
4242

43+
**/.DS_Store
44+
4345
# AI
4446
external-projects/
4547
CLAUDE.md

Makefile

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
.PHONY: help extract-fixtures test-rust test-rust-all test-rust-identity test-rust-dialect \
22
test-rust-transpile test-rust-pretty test-rust-roundtrip test-rust-matrix \
3-
test-rust-compat test-rust-errors test-rust-functions test-rust-lib test-rust-verify \
3+
test-rust-compat test-rust-errors test-rust-functions test-rust-custom test-rust-lib test-rust-verify \
44
test-compare build-wasm setup-fixtures clean-fixtures clean generate-bindings copy-bindings \
55
bench-compare bench-rust bench-python \
66
playground-dev playground-build playground-preview playground-deploy \
@@ -20,7 +20,7 @@ help:
2020
@echo " make test-rust - Run all Rust tests"
2121
@echo " make test-rust-all - Run all sqlglot fixture tests"
2222
@echo " make test-rust-lib - Run lib unit tests (704)"
23-
@echo " make test-rust-verify - Run lib + identity + dialect + transpilation"
23+
@echo " make test-rust-verify - Run lib + identity + dialect + transpilation + custom"
2424
@echo ""
2525
@echo " SQLGlot Fixture Tests (8,455 tests):"
2626
@echo " make test-rust-identity - Generic identity tests (955)"
@@ -34,6 +34,7 @@ help:
3434
@echo " make test-rust-compat - SQLGlot compatibility tests"
3535
@echo " make test-rust-errors - Error handling tests"
3636
@echo " make test-rust-functions - Function normalization tests"
37+
@echo " make test-rust-custom - Custom dialect tests (DataFusion, etc.)"
3738
@echo ""
3839
@echo "Full Comparison (slow, ~60s):"
3940
@echo " make test-compare - Run JS comparison tool (requires WASM build)"
@@ -122,7 +123,7 @@ test-rust-all: setup-fixtures
122123
cargo test -p polyglot-sql --test sqlglot_identity --test sqlglot_dialect_identity \
123124
--test sqlglot_transpilation --test sqlglot_pretty -- --nocapture
124125

125-
# Run lib + identity + dialect identity + transpilation (full verification)
126+
# Run lib + identity + dialect identity + transpilation + custom dialects (full verification)
126127
test-rust-verify: setup-fixtures
127128
@echo "=== Lib unit tests ==="
128129
@cargo test --lib -p polyglot-sql
@@ -135,6 +136,9 @@ test-rust-verify: setup-fixtures
135136
@echo ""
136137
@echo "=== Transpilation tests ==="
137138
@cargo test --test sqlglot_transpilation test_sqlglot_transpilation_all -p polyglot-sql -- --nocapture
139+
@echo ""
140+
@echo "=== Custom dialect tests ==="
141+
@cargo test --test custom_dialect_tests -p polyglot-sql -- --nocapture
138142

139143
# -----------------------------------------------------------------------------
140144
# Additional Rust Tests
@@ -160,6 +164,10 @@ test-rust-errors:
160164
test-rust-functions:
161165
cargo test -p polyglot-sql --test test_function_normalizations -- --nocapture
162166

167+
# Run custom dialect tests (auto-discovers all dialects in custom_fixtures/)
168+
test-rust-custom:
169+
cargo test -p polyglot-sql --test custom_dialect_tests -- --nocapture
170+
163171
# Quick check - just compile tests
164172
test-rust-check:
165173
cargo check -p polyglot-sql --tests

crates/polyglot-sql-wasm/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ pub fn get_dialects() -> String {
251251
"drill",
252252
"dremio",
253253
"exasol",
254+
"datafusion",
254255
];
255256
serde_json::to_string(&dialects).unwrap()
256257
}
@@ -1321,6 +1322,7 @@ mod tests {
13211322
"drill",
13221323
"dremio",
13231324
"exasol",
1325+
"datafusion",
13241326
];
13251327

13261328
for dialect in dialects {
crates/polyglot-sql/src/dialects/datafusion.rs

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
//! Apache DataFusion SQL Dialect
2+
//!
3+
//! DataFusion is an Arrow-based query engine with modern SQL extensions.
4+
//! Reference: https://datafusion.apache.org/user-guide/sql/
5+
//!
6+
//! Key characteristics:
7+
//! - Arrow-native type system (Int8, Int16, Int32, Int64, Float32, Float64, Utf8, etc.)
8+
//! - Double-quote identifiers
9+
//! - Lowercase function names by default
10+
//! - QUALIFY clause support
11+
//! - EXCEPT for column exclusion (SELECT * EXCEPT(col))
12+
//! - LEFT SEMI JOIN / LEFT ANTI JOIN syntax
13+
//! - TRY_CAST support
14+
//! - Pipe operator (|>) for query chaining
15+
//! - No UPDATE/DELETE support
16+
//! - arrow_cast() and arrow_typeof() functions
17+
//! - COPY ... TO syntax (no INTO keyword)
18+
//! - Nested comment support
19+
20+
use super::{DialectImpl, DialectType};
21+
use crate::error::Result;
22+
use crate::expressions::{Expression, Function};
23+
use crate::generator::GeneratorConfig;
24+
use crate::tokens::TokenizerConfig;
25+
26+
/// Apache DataFusion dialect.
///
/// A field-less marker type: it carries no state of its own. Its tokenizer
/// settings, generator settings and expression rewrites come from its
/// `DialectImpl` implementation in this file.
27+
pub struct DataFusionDialect;
28+
29+
// Wires DataFusion into the dialect machinery: identifies the dialect,
// configures tokenizing and SQL generation, and rewrites function calls
// that DataFusion spells differently.
impl DialectImpl for DataFusionDialect {
30+
/// Returns the enum tag for this dialect (`DialectType::DataFusion`).
fn dialect_type(&self) -> DialectType {
31+
DialectType::DataFusion
32+
}
33+
34+
/// Builds the tokenizer configuration.
///
/// Starts from `TokenizerConfig::default()` and then adjusts two settings:
/// the double quote is registered in `identifiers`, and `nested_comments`
/// is switched on.
fn tokenizer_config(&self) -> TokenizerConfig {
35+
let mut config = TokenizerConfig::default();
36+
// DataFusion uses double quotes for identifiers
// (presumably the map is opening delimiter -> closing delimiter; confirm
// against TokenizerConfig)
37+
config.identifiers.insert('"', '"');
38+
// DataFusion supports nested comments
39+
config.nested_comments = true;
40+
config
41+
}
42+
43+
/// Builds the SQL generator configuration.
///
/// Only the fields listed below are overridden; every other field takes its
/// value from `GeneratorConfig::default()` via the struct-update syntax at
/// the end of the literal.
fn generator_config(&self) -> GeneratorConfig {
44+
use crate::generator::{IdentifierQuoteStyle, LimitFetchStyle, NormalizeFunctions};
45+
GeneratorConfig {
46+
identifier_quote: '"',
47+
identifier_quote_style: IdentifierQuoteStyle::DOUBLE_QUOTE,
48+
dialect: Some(DialectType::DataFusion),
49+
// DataFusion lowercases function names
50+
normalize_functions: NormalizeFunctions::Lower,
51+
// TRY_CAST is supported
52+
try_supported: true,
53+
// DataFusion uses EXCEPT for column exclusion: SELECT * EXCEPT(col)
54+
star_except: "EXCEPT",
55+
// No multi-arg DISTINCT: COUNT(DISTINCT a, b) not supported
56+
multi_arg_distinct: false,
57+
// Window EXCLUDE not supported
58+
supports_window_exclude: false,
59+
// Interval allows plural form (DAYS, HOURS, etc.)
60+
interval_allows_plural_form: true,
61+
// Normalize date parts in EXTRACT
62+
normalize_extract_date_parts: true,
63+
// LIMIT style (not FETCH)
64+
limit_fetch_style: LimitFetchStyle::Limit,
65+
// No hints
66+
join_hints: false,
67+
table_hints: false,
68+
query_hints: false,
69+
// LEFT SEMI JOIN / LEFT ANTI JOIN syntax
70+
semi_anti_join_with_side: true,
71+
// COPY does not use INTO keyword
72+
copy_has_into_keyword: false,
73+
// NVL2 is marked as supported. Note that NVL2(a, b, c) chooses between
// b and c depending on whether a is NULL; it is not a COALESCE variant.
74+
nvl2_supported: true,
75+
// MEDIAN is supported
76+
supports_median: true,
77+
// Can implement array_any
78+
can_implement_array_any: true,
79+
// LIKE quantifiers not supported
80+
supports_like_quantifiers: false,
81+
// Aggregate FILTER is supported
82+
aggregate_filter_supported: true,
83+
// BETWEEN flags not supported
84+
supports_between_flags: false,
85+
..Default::default()
86+
}
87+
}
88+
89+
/// Applies DataFusion-specific rewrites to a single expression node.
///
/// `Function` and `AggregateFunction` nodes are handed to the matching
/// helper on `DataFusionDialect`; any other node is returned unchanged.
/// This method inspects only the node it is given; it does not itself
/// descend into child expressions.
fn transform_expr(&self, expr: Expression) -> Result<Expression> {
90+
match expr {
91+
// Function transformations
// (`f` is dereferenced because the helper takes the function by value)
92+
Expression::Function(f) => self.transform_function(*f),
93+
94+
// Aggregate function transformations
95+
Expression::AggregateFunction(f) => self.transform_aggregate_function(f),
96+
97+
// Pass through everything else
98+
_ => Ok(expr),
99+
}
100+
}
101+
}
102+
103+
impl DataFusionDialect {
104+
fn transform_function(&self, f: Function) -> Result<Expression> {
105+
let name_upper = f.name.to_uppercase();
106+
match name_upper.as_str() {
107+
// IFNULL → COALESCE (DataFusion uses COALESCE)
108+
"IFNULL" => Ok(Expression::Function(Box::new(Function::new(
109+
"coalesce".to_string(),
110+
f.args,
111+
)))),
112+
113+
// SQUARE(x) → POWER(x, 2)
114+
"SQUARE" => {
115+
let mut args = f.args;
116+
args.push(Expression::Literal(crate::expressions::Literal::Number(
117+
"2".to_string(),
118+
)));
119+
Ok(Expression::Function(Box::new(Function::new(
120+
"power".to_string(),
121+
args,
122+
))))
123+
}
124+
125+
// REGEXP_MATCHES → REGEXP_MATCH
126+
"REGEXP_MATCHES" => Ok(Expression::Function(Box::new(Function::new(
127+
"regexp_match".to_string(),
128+
f.args,
129+
)))),
130+
131+
// DATE_FORMAT / TIME_TO_STR / STRFTIME → TO_CHAR
132+
"DATE_FORMAT" | "TIME_TO_STR" => Ok(Expression::Function(Box::new(Function::new(
133+
"to_char".to_string(),
134+
f.args,
135+
)))),
136+
137+
// Pass through everything else
138+
_ => Ok(Expression::Function(Box::new(f))),
139+
}
140+
}
141+
142+
fn transform_aggregate_function(
143+
&self,
144+
f: Box<crate::expressions::AggregateFunction>,
145+
) -> Result<Expression> {
146+
let name_upper = f.name.to_uppercase();
147+
match name_upper.as_str() {
148+
// GROUP_CONCAT → STRING_AGG
149+
"GROUP_CONCAT" => Ok(Expression::Function(Box::new(Function::new(
150+
"string_agg".to_string(),
151+
f.args,
152+
)))),
153+
154+
// LISTAGG → STRING_AGG
155+
"LISTAGG" => Ok(Expression::Function(Box::new(Function::new(
156+
"string_agg".to_string(),
157+
f.args,
158+
)))),
159+
160+
// Pass through everything else
161+
_ => Ok(Expression::AggregateFunction(f)),
162+
}
163+
}
164+
}

crates/polyglot-sql/src/dialects/mod.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ mod fabric;
5252
mod drill;
5353
mod dremio;
5454
mod exasol;
55+
mod datafusion;
5556

5657
pub use generic::GenericDialect;
5758
pub use postgres::PostgresDialect;
@@ -86,6 +87,7 @@ pub use fabric::FabricDialect;
8687
pub use drill::DrillDialect;
8788
pub use dremio::DremioDialect;
8889
pub use exasol::ExasolDialect;
90+
pub use datafusion::DataFusionDialect;
8991

9092
use crate::error::Result;
9193
use crate::expressions::{Expression, FunctionBody};
@@ -173,6 +175,8 @@ pub enum DialectType {
173175
Dremio,
174176
/// Exasol -- in-memory analytic database.
175177
Exasol,
178+
/// Apache DataFusion -- Arrow-based query engine with modern SQL extensions.
179+
DataFusion,
176180
}
177181

178182
impl Default for DialectType {
@@ -217,6 +221,7 @@ impl std::fmt::Display for DialectType {
217221
DialectType::Drill => write!(f, "drill"),
218222
DialectType::Dremio => write!(f, "dremio"),
219223
DialectType::Exasol => write!(f, "exasol"),
224+
DialectType::DataFusion => write!(f, "datafusion"),
220225
}
221226
}
222227
}
@@ -259,6 +264,7 @@ impl std::str::FromStr for DialectType {
259264
"drill" => Ok(DialectType::Drill),
260265
"dremio" => Ok(DialectType::Dremio),
261266
"exasol" => Ok(DialectType::Exasol),
267+
"datafusion" | "arrow-datafusion" | "arrow_datafusion" => Ok(DialectType::DataFusion),
262268
_ => Err(crate::error::Error::parse(format!("Unknown dialect: {}", s))),
263269
}
264270
}
@@ -1316,6 +1322,11 @@ where
13161322
f.this = transform_recursive(f.this, transform_fn)?;
13171323
Expression::BitwiseXorAgg(f)
13181324
}
1325+
Expression::PipeOperator(mut pipe) => {
1326+
pipe.this = transform_recursive(pipe.this, transform_fn)?;
1327+
pipe.expression = transform_recursive(pipe.expression, transform_fn)?;
1328+
Expression::PipeOperator(pipe)
1329+
}
13191330

13201331
// Pass through leaf nodes unchanged
13211332
other => other,
@@ -1378,6 +1389,7 @@ fn configs_for_dialect_type(
13781389
DialectType::Drill => dialect_configs!(DrillDialect),
13791390
DialectType::Dremio => dialect_configs!(DremioDialect),
13801391
DialectType::Exasol => dialect_configs!(ExasolDialect),
1392+
DialectType::DataFusion => dialect_configs!(DataFusionDialect),
13811393
_ => dialect_configs!(GenericDialect),
13821394
}
13831395
}
@@ -1861,6 +1873,8 @@ impl Dialect {
18611873
let expr = transforms::expand_between_in_delete(expr)?;
18621874
Ok(expr)
18631875
}
1876+
// DataFusion supports QUALIFY and semi/anti joins natively
1877+
DialectType::DataFusion => Ok(expr),
18641878
// Oracle - no special preprocessing needed
18651879
DialectType::Oracle => {
18661880
Ok(expr)
@@ -11597,7 +11611,7 @@ impl Dialect {
1159711611
let is_source_nulls_last = matches!(source,
1159811612
DialectType::DuckDB | DialectType::Presto | DialectType::Trino
1159911613
| DialectType::Dremio | DialectType::Athena | DialectType::ClickHouse
11600-
| DialectType::Drill | DialectType::Exasol
11614+
| DialectType::Drill | DialectType::Exasol | DialectType::DataFusion
1160111615
);
1160211616

1160311617
// Determine target category to check if default matches
@@ -11608,7 +11622,7 @@ impl Dialect {
1160811622
let is_target_nulls_last = matches!(target,
1160911623
DialectType::DuckDB | DialectType::Presto | DialectType::Trino
1161011624
| DialectType::Dremio | DialectType::Athena | DialectType::ClickHouse
11611-
| DialectType::Drill | DialectType::Exasol
11625+
| DialectType::Drill | DialectType::Exasol | DialectType::DataFusion
1161211626
);
1161311627

1161411628
// Compute the implied nulls_first for source

crates/polyglot-sql/src/expressions.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ pub enum Expression {
9696
Intersect(Box<Intersect>),
9797
Except(Box<Except>),
9898
Subquery(Box<Subquery>),
99+
PipeOperator(Box<PipeOperator>),
99100
Pivot(Box<Pivot>),
100101
PivotAlias(Box<PivotAlias>),
101102
Unpivot(Box<Unpivot>),
@@ -1967,6 +1968,19 @@ pub struct Subquery {
19671968
pub trailing_comments: Vec<String>,
19681969
}
19691970

1971+
/// Pipe operator expression: `query |> transform`.
1972+
///
1973+
/// Used in DataFusion and BigQuery pipe syntax, for example:
1974+
/// `FROM t |> WHERE x > 1 |> SELECT x, y |> ORDER BY x |> LIMIT 10`
1975+
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
1976+
#[cfg_attr(feature = "bindings", derive(TS))]
pub struct PipeOperator {
1978+
/// The input query/expression (left side of `|>`).
1979+
pub this: Expression,
1980+
/// The piped operation (right side of `|>`).
1981+
pub expression: Expression,
1982+
}
1983+
19701984
/// VALUES table constructor: VALUES (1, 'a'), (2, 'b')
19711985
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
19721986
#[cfg_attr(feature = "bindings", derive(TS))]

crates/polyglot-sql/src/generator.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2006,6 +2006,7 @@ impl Generator {
20062006
}
20072007
Expression::Array(arr) => self.generate_array(arr),
20082008
Expression::Tuple(tuple) => self.generate_tuple(tuple),
2009+
Expression::PipeOperator(pipe) => self.generate_pipe_operator(pipe),
20092010
Expression::Ordered(ordered) => self.generate_ordered(ordered),
20102011
Expression::DataType(dt) => self.generate_data_type(dt),
20112012
Expression::Raw(raw) => {
@@ -18375,6 +18376,13 @@ impl Generator {
1837518376
Ok(())
1837618377
}
1837718378

18379+
/// Emits a pipe expression as `<this> |> <expression>`.
///
/// Generates the left-hand side, writes the literal separator ` |> `
/// (with one space on each side), then generates the right-hand side.
/// An error from either sub-expression is propagated to the caller.
fn generate_pipe_operator(&mut self, pipe: &PipeOperator) -> Result<()> {
18380+
self.generate_expression(&pipe.this)?;
18381+
self.write(" |> ");
18382+
self.generate_expression(&pipe.expression)?;
18383+
Ok(())
18384+
}
18385+
1837818386
fn generate_ordered(&mut self, ordered: &Ordered) -> Result<()> {
1837918387
self.generate_expression(&ordered.this)?;
1838018388
if ordered.desc {

0 commit comments

Comments
 (0)