Skip to content

Commit ac13687

Browse files
cetra3alamb
andauthored
Allow setting the recursion limit for sql parsing (#14756)
* Allow setting the recursion limit for sql parsing * Add some examples, restore old APIs and deprecate * cleanup --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 5e49094 commit ac13687

File tree

6 files changed

+163
-36
lines changed

6 files changed

+163
-36
lines changed

datafusion-examples/examples/sql_dialect.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use std::fmt::Display;
1919

2020
use datafusion::error::Result;
2121
use datafusion::sql::{
22-
parser::{CopyToSource, CopyToStatement, DFParser, Statement},
22+
parser::{CopyToSource, CopyToStatement, DFParser, DFParserBuilder, Statement},
2323
sqlparser::{keywords::Keyword, parser::ParserError, tokenizer::Token},
2424
};
2525

@@ -46,9 +46,9 @@ struct MyParser<'a> {
4646
df_parser: DFParser<'a>,
4747
}
4848

49-
impl MyParser<'_> {
50-
fn new(sql: &str) -> Result<Self> {
51-
let df_parser = DFParser::new(sql)?;
49+
impl<'a> MyParser<'a> {
50+
fn new(sql: &'a str) -> Result<Self> {
51+
let df_parser = DFParserBuilder::new(sql).build()?;
5252
Ok(Self { df_parser })
5353
}
5454

datafusion/common/src/config.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ config_namespace! {
256256
/// query (i.e. [`Span`](sqlparser::tokenizer::Span)) will be collected
257257
/// and recorded in the logical plan nodes.
258258
pub collect_spans: bool, default = false
259+
260+
/// Specifies the recursion depth limit when parsing complex SQL Queries
261+
pub recursion_limit: usize, default = 50
259262
}
260263
}
261264

datafusion/core/src/execution/session_state.rs

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
6868
use datafusion_physical_optimizer::optimizer::PhysicalOptimizer;
6969
use datafusion_physical_optimizer::PhysicalOptimizerRule;
7070
use datafusion_physical_plan::ExecutionPlan;
71-
use datafusion_sql::parser::{DFParser, Statement};
71+
use datafusion_sql::parser::{DFParserBuilder, Statement};
7272
use datafusion_sql::planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel};
7373

7474
use async_trait::async_trait;
@@ -483,12 +483,21 @@ impl SessionState {
483483
MsSQL, ClickHouse, BigQuery, Ansi, DuckDB, Databricks."
484484
)
485485
})?;
486-
let mut statements = DFParser::parse_sql_with_dialect(sql, dialect.as_ref())?;
486+
487+
let recursion_limit = self.config.options().sql_parser.recursion_limit;
488+
489+
let mut statements = DFParserBuilder::new(sql)
490+
.with_dialect(dialect.as_ref())
491+
.with_recursion_limit(recursion_limit)
492+
.build()?
493+
.parse_statements()?;
494+
487495
if statements.len() > 1 {
488496
return not_impl_err!(
489497
"The context currently only supports a single SQL statement"
490498
);
491499
}
500+
492501
let statement = statements.pop_front().ok_or_else(|| {
493502
plan_datafusion_err!("No SQL statements were provided in the query string")
494503
})?;
@@ -522,7 +531,12 @@ impl SessionState {
522531
)
523532
})?;
524533

525-
let expr = DFParser::parse_sql_into_expr_with_dialect(sql, dialect.as_ref())?;
534+
let recursion_limit = self.config.options().sql_parser.recursion_limit;
535+
let expr = DFParserBuilder::new(sql)
536+
.with_dialect(dialect.as_ref())
537+
.with_recursion_limit(recursion_limit)
538+
.build()?
539+
.parse_expr()?;
526540

527541
Ok(expr)
528542
}

datafusion/sql/src/parser.rs

Lines changed: 136 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -269,33 +269,107 @@ pub struct DFParser<'a> {
269269
pub parser: Parser<'a>,
270270
}
271271

272-
impl<'a> DFParser<'a> {
273-
/// Create a new parser for the specified tokens using the
272+
/// Same as `sqlparser`
273+
const DEFAULT_RECURSION_LIMIT: usize = 50;
274+
const DEFAULT_DIALECT: GenericDialect = GenericDialect {};
275+
276+
/// Builder for [`DFParser`]
277+
///
278+
/// # Example: Create and Parse SQL statements
279+
/// ```
280+
/// # use datafusion_sql::parser::DFParserBuilder;
281+
/// # use datafusion_common::Result;
282+
/// # fn test() -> Result<()> {
283+
/// let mut parser = DFParserBuilder::new("SELECT * FROM foo; SELECT 1 + 2")
284+
/// .build()?;
285+
/// // parse the SQL into DFStatements
286+
/// let statements = parser.parse_statements()?;
287+
/// assert_eq!(statements.len(), 2);
288+
/// # Ok(())
289+
/// # }
290+
/// ```
291+
///
292+
/// # Example: Create and Parse expression with a different dialect
293+
/// ```
294+
/// # use datafusion_sql::parser::DFParserBuilder;
295+
/// # use datafusion_common::Result;
296+
/// # use datafusion_sql::sqlparser::dialect::MySqlDialect;
297+
/// # use datafusion_sql::sqlparser::ast::Expr;
298+
/// # fn test() -> Result<()> {
299+
/// let dialect = MySqlDialect{}; // Parse using MySQL dialect
300+
/// let mut parser = DFParserBuilder::new("1 + 2")
301+
/// .with_dialect(&dialect)
302+
/// .build()?;
303+
/// // parse 1+2 into an sqlparser::ast::Expr
304+
/// let res = parser.parse_expr()?;
305+
/// assert!(matches!(res.expr, Expr::BinaryOp {..}));
306+
/// # Ok(())
307+
/// # }
308+
/// ```
309+
pub struct DFParserBuilder<'a> {
310+
/// The SQL string to parse
311+
sql: &'a str,
312+
/// The Dialect to use (defaults to [`GenericDialect`]
313+
dialect: &'a dyn Dialect,
314+
/// The recursion limit while parsing
315+
recursion_limit: usize,
316+
}
317+
318+
impl<'a> DFParserBuilder<'a> {
319+
/// Create a new parser builder for the specified tokens using the
274320
/// [`GenericDialect`].
275-
pub fn new(sql: &str) -> Result<Self, ParserError> {
276-
let dialect = &GenericDialect {};
277-
DFParser::new_with_dialect(sql, dialect)
321+
pub fn new(sql: &'a str) -> Self {
322+
Self {
323+
sql,
324+
dialect: &DEFAULT_DIALECT,
325+
recursion_limit: DEFAULT_RECURSION_LIMIT,
326+
}
278327
}
279328

280-
/// Create a new parser for the specified tokens with the
281-
/// specified dialect.
282-
pub fn new_with_dialect(
283-
sql: &str,
284-
dialect: &'a dyn Dialect,
285-
) -> Result<Self, ParserError> {
286-
let mut tokenizer = Tokenizer::new(dialect, sql);
329+
/// Adjust the parser builder's dialect. Defaults to [`GenericDialect`]
330+
pub fn with_dialect(mut self, dialect: &'a dyn Dialect) -> Self {
331+
self.dialect = dialect;
332+
self
333+
}
334+
335+
/// Adjust the recursion limit of sql parsing. Defaults to 50
336+
pub fn with_recursion_limit(mut self, recursion_limit: usize) -> Self {
337+
self.recursion_limit = recursion_limit;
338+
self
339+
}
340+
341+
pub fn build(self) -> Result<DFParser<'a>, ParserError> {
342+
let mut tokenizer = Tokenizer::new(self.dialect, self.sql);
287343
let tokens = tokenizer.tokenize_with_location()?;
288344

289345
Ok(DFParser {
290-
parser: Parser::new(dialect).with_tokens_with_locations(tokens),
346+
parser: Parser::new(self.dialect)
347+
.with_tokens_with_locations(tokens)
348+
.with_recursion_limit(self.recursion_limit),
291349
})
292350
}
351+
}
352+
353+
impl<'a> DFParser<'a> {
354+
#[deprecated(since = "46.0.0", note = "DFParserBuilder")]
355+
pub fn new(sql: &'a str) -> Result<Self, ParserError> {
356+
DFParserBuilder::new(sql).build()
357+
}
358+
359+
#[deprecated(since = "46.0.0", note = "DFParserBuilder")]
360+
pub fn new_with_dialect(
361+
sql: &'a str,
362+
dialect: &'a dyn Dialect,
363+
) -> Result<Self, ParserError> {
364+
DFParserBuilder::new(sql).with_dialect(dialect).build()
365+
}
293366

294367
/// Parse a sql string into one or [`Statement`]s using the
295368
/// [`GenericDialect`].
296-
pub fn parse_sql(sql: &str) -> Result<VecDeque<Statement>, ParserError> {
297-
let dialect = &GenericDialect {};
298-
DFParser::parse_sql_with_dialect(sql, dialect)
369+
pub fn parse_sql(sql: &'a str) -> Result<VecDeque<Statement>, ParserError> {
370+
let mut parser = DFParserBuilder::new(sql).build()?;
371+
372+
parser.parse_statements()
299373
}
300374

301375
/// Parse a SQL string and produce one or more [`Statement`]s with
@@ -304,37 +378,43 @@ impl<'a> DFParser<'a> {
304378
sql: &str,
305379
dialect: &dyn Dialect,
306380
) -> Result<VecDeque<Statement>, ParserError> {
307-
let mut parser = DFParser::new_with_dialect(sql, dialect)?;
381+
let mut parser = DFParserBuilder::new(sql).with_dialect(dialect).build()?;
382+
parser.parse_statements()
383+
}
384+
385+
pub fn parse_sql_into_expr_with_dialect(
386+
sql: &str,
387+
dialect: &dyn Dialect,
388+
) -> Result<ExprWithAlias, ParserError> {
389+
let mut parser = DFParserBuilder::new(sql).with_dialect(dialect).build()?;
390+
391+
parser.parse_expr()
392+
}
393+
394+
/// Parse a sql string into one or [`Statement`]s
395+
pub fn parse_statements(&mut self) -> Result<VecDeque<Statement>, ParserError> {
308396
let mut stmts = VecDeque::new();
309397
let mut expecting_statement_delimiter = false;
310398
loop {
311399
// ignore empty statements (between successive statement delimiters)
312-
while parser.parser.consume_token(&Token::SemiColon) {
400+
while self.parser.consume_token(&Token::SemiColon) {
313401
expecting_statement_delimiter = false;
314402
}
315403

316-
if parser.parser.peek_token() == Token::EOF {
404+
if self.parser.peek_token() == Token::EOF {
317405
break;
318406
}
319407
if expecting_statement_delimiter {
320-
return parser.expected("end of statement", parser.parser.peek_token());
408+
return self.expected("end of statement", self.parser.peek_token());
321409
}
322410

323-
let statement = parser.parse_statement()?;
411+
let statement = self.parse_statement()?;
324412
stmts.push_back(statement);
325413
expecting_statement_delimiter = true;
326414
}
327415
Ok(stmts)
328416
}
329417

330-
pub fn parse_sql_into_expr_with_dialect(
331-
sql: &str,
332-
dialect: &dyn Dialect,
333-
) -> Result<ExprWithAlias, ParserError> {
334-
let mut parser = DFParser::new_with_dialect(sql, dialect)?;
335-
parser.parse_expr()
336-
}
337-
338418
/// Report an unexpected token
339419
fn expected<T>(
340420
&self,
@@ -883,6 +963,7 @@ impl<'a> DFParser<'a> {
883963
#[cfg(test)]
884964
mod tests {
885965
use super::*;
966+
use datafusion_common::assert_contains;
886967
use sqlparser::ast::Expr::Identifier;
887968
use sqlparser::ast::{BinaryOperator, DataType, Expr, Ident};
888969
use sqlparser::dialect::SnowflakeDialect;
@@ -1613,4 +1694,30 @@ mod tests {
16131694
fn verified_stmt(sql: &str) -> Statement {
16141695
one_statement_parses_to(sql, sql)
16151696
}
1697+
1698+
#[test]
1699+
/// Checks the recursion limit works for sql queries
1700+
/// Recursion can happen easily with binary exprs (i.e, AND or OR)
1701+
fn test_recursion_limit() {
1702+
let sql = "SELECT 1 OR 2";
1703+
1704+
// Expect parse to succeed
1705+
DFParserBuilder::new(sql)
1706+
.build()
1707+
.unwrap()
1708+
.parse_statements()
1709+
.unwrap();
1710+
1711+
let err = DFParserBuilder::new(sql)
1712+
.with_recursion_limit(1)
1713+
.build()
1714+
.unwrap()
1715+
.parse_statements()
1716+
.unwrap_err();
1717+
1718+
assert_contains!(
1719+
err.to_string(),
1720+
"sql parser error: recursion limit exceeded"
1721+
);
1722+
}
16161723
}

datafusion/sqllogictest/test_files/information_schema.slt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,7 @@ datafusion.sql_parser.dialect generic
263263
datafusion.sql_parser.enable_ident_normalization true
264264
datafusion.sql_parser.enable_options_value_normalization false
265265
datafusion.sql_parser.parse_float_as_decimal false
266+
datafusion.sql_parser.recursion_limit 50
266267
datafusion.sql_parser.support_varchar_with_length true
267268

268269
# show all variables with verbose
@@ -359,6 +360,7 @@ datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusi
359360
datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
360361
datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically.
361362
datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type
363+
datafusion.sql_parser.recursion_limit 50 Specifies the recursion depth limit when parsing complex SQL Queries
362364
datafusion.sql_parser.support_varchar_with_length true If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits.
363365

364366
# show_variable_in_config_options

docs/source/user-guide/configs.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,3 +128,4 @@ Environment variables are read during `SessionConfig` initialisation so they mus
128128
| datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. |
129129
| datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. |
130130
| datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](sqlparser::tokenizer::Span)) will be collected and recorded in the logical plan nodes. |
131+
| datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries |

0 commit comments

Comments
 (0)