
Commit 07d9958

fivetran-MichaelLee (Michael Lee) and georgesittas authored
chore(optimizer)!: replace direct comparison with dialect properties (#6398)
* chore(optimizer): replace direct comparison with dialect properties
* fix type error
* formatting
* remove redundant Databricks property
* update comments
* rename property
* refactor schema dialect property to be Dialect
* formatting
* change property for checking unnest columns
* Update sqlglot/dialects/dialect.py (Co-authored-by: Jo <[email protected]>)
* make dialect an optional property on Schema
* remove unnecessary get_or_raise
* update comments for QUERY_RESULTS_ARE_STRUCTS
* rename functions
* remove redundant comment
* remove unnecessary default function
* add pickling support to Version and Dialect
* formatting
* adding new properties for bigquery support
* remove serialization from Dialect

Co-authored-by: Michael Lee <[email protected]>
Co-authored-by: Jo <[email protected]>
1 parent 38ee187 commit 07d9958

File tree

11 files changed: +193, -52 lines changed


sqlglot/dialects/bigquery.py

Lines changed: 7 additions & 0 deletions
@@ -353,10 +353,17 @@ class BigQuery(Dialect):
     LOG_BASE_FIRST = False
     HEX_LOWERCASE = True
     FORCE_EARLY_ALIAS_REF_EXPANSION = True
+    EXPAND_ONLY_GROUP_ALIAS_REF = True
     PRESERVE_ORIGINAL_NAMES = True
     HEX_STRING_IS_INTEGER_TYPE = True
     BYTE_STRING_IS_BYTES_TYPE = True
     UUID_IS_STRING_TYPE = True
+    ANNOTATE_ALL_SCOPES = True
+    PROJECTION_ALIASES_SHADOW_SOURCE_NAMES = True
+    TABLES_REFERENCEABLE_AS_COLUMNS = True
+    SUPPORTS_STRUCT_STAR_EXPANSION = True
+    EXCLUDES_PSEUDOCOLUMNS_FROM_STAR = True
+    QUERY_RESULTS_ARE_STRUCTS = True
     JSON_EXTRACT_SCALAR_SCALAR_ONLY = True

     # https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#initcap
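The flags above are plain class attributes, so downstream code reads them off a dialect instance rather than comparing dialect names. A minimal sketch, not part of the commit:

    from sqlglot.dialects.dialect import Dialect

    # Capability flags are ordinary class attributes, readable on the
    # instance returned by Dialect.get_or_raise.
    bigquery = Dialect.get_or_raise("bigquery")
    assert bigquery.TABLES_REFERENCEABLE_AS_COLUMNS
    assert bigquery.QUERY_RESULTS_ARE_STRUCTS

    # The base Dialect keeps the conservative defaults.
    assert not Dialect().SUPPORTS_STRUCT_STAR_EXPANSION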

sqlglot/dialects/dialect.py

Lines changed: 126 additions & 1 deletion
@@ -443,14 +443,139 @@ class Dialect(metaclass=_Dialect):
     to "WHERE id = 1 GROUP BY id HAVING id = 1"
     """

-    EXPAND_ALIAS_REFS_EARLY_ONLY_IN_GROUP_BY = False
+    EXPAND_ONLY_GROUP_ALIAS_REF = False
     """Whether alias reference expansion before qualification should only happen for the GROUP BY clause."""

+    ANNOTATE_ALL_SCOPES = False
+    """Whether to annotate all scopes during optimization. Used by BigQuery for UNNEST support."""
+
+    DISABLES_ALIAS_REF_EXPANSION = False
+    """
+    Whether alias reference expansion is disabled for this dialect.
+
+    Some dialects, like Oracle, do NOT support referencing aliases in projections or WHERE clauses;
+    the original expression must be repeated instead.
+
+    For example, in Oracle:
+        SELECT y.foo AS bar, bar * 2 AS baz FROM y   -- INVALID
+        SELECT y.foo AS bar, y.foo * 2 AS baz FROM y -- VALID
+    """
+
+    SUPPORTS_ALIAS_REFS_IN_JOIN_CONDITIONS = False
+    """
+    Whether alias references are allowed in JOIN ... ON clauses.
+
+    Most dialects do not support this, but Snowflake allows alias expansion in the JOIN ... ON
+    clause (and almost everywhere else).
+
+    For example, in Snowflake:
+        SELECT a.id AS user_id FROM a JOIN b ON user_id = b.id -- VALID
+
+    Reference: https://docs.snowflake.com/en/sql-reference/sql/select#usage-notes
+    """
+
     SUPPORTS_ORDER_BY_ALL = False
     """
     Whether ORDER BY ALL is supported (expands to all the selected columns) as in DuckDB, Spark3/Databricks
     """

+    PROJECTION_ALIASES_SHADOW_SOURCE_NAMES = False
+    """
+    Whether projection alias names can shadow table/source names in GROUP BY and HAVING clauses.
+
+    In BigQuery, when a projection alias has the same name as a source table, the alias takes
+    precedence in GROUP BY and HAVING clauses, and the table becomes inaccessible by that name.
+
+    For example, in BigQuery:
+        SELECT id, ARRAY_AGG(col) AS custom_fields
+        FROM custom_fields
+        GROUP BY id
+        HAVING id >= 1
+
+    The "custom_fields" source is shadowed by the projection alias, so we cannot qualify "id"
+    with "custom_fields" in GROUP BY/HAVING.
+    """
+
+    TABLES_REFERENCEABLE_AS_COLUMNS = False
+    """
+    Whether table names can be referenced as columns (treated as structs).
+
+    BigQuery allows tables to be referenced as columns in queries, automatically treating
+    them as struct values containing all the table's columns.
+
+    For example, in BigQuery:
+        SELECT t FROM my_table AS t -- Returns the entire row as a struct
+    """
+
+    SUPPORTS_STRUCT_STAR_EXPANSION = False
+    """
+    Whether the dialect supports expanding struct fields using star notation (e.g., struct_col.*).
+
+    BigQuery allows struct fields to be expanded with the star operator:
+        SELECT t.struct_col.* FROM table t
+
+    RisingWave also allows struct field expansion with the star operator, using parentheses:
+        SELECT (t.struct_col).* FROM table t
+
+    This expands to all fields within the struct.
+    """
+
+    EXCLUDES_PSEUDOCOLUMNS_FROM_STAR = False
+    """
+    Whether pseudocolumns should be excluded from star expansion (SELECT *).
+
+    Pseudocolumns are special dialect-specific columns (e.g., Oracle's ROWNUM, ROWID, LEVEL,
+    or BigQuery's _PARTITIONTIME, _PARTITIONDATE) that are implicitly available but not part
+    of the table schema. When this is True, SELECT * will not include these pseudocolumns;
+    they must be explicitly selected.
+    """
+
+    QUERY_RESULTS_ARE_STRUCTS = False
+    """
+    Whether query results are typed as structs in metadata for type inference.
+
+    In BigQuery, subqueries store their column types as a STRUCT in metadata,
+    enabling special type inference for ARRAY(SELECT ...) expressions:
+        ARRAY(SELECT x, y FROM t) → ARRAY<STRUCT<...>>
+
+    For single-column subqueries, BigQuery unwraps the struct:
+        ARRAY(SELECT x FROM t) → ARRAY<type_of_x>
+
+    This is metadata-only for type inference.
+    """
+
+    REQUIRES_PARENTHESIZED_STRUCT_ACCESS = False
+    """
+    Whether struct field access requires parentheses around the expression.
+
+    RisingWave requires parentheses for struct field access in certain contexts:
+        SELECT (col.field).subfield FROM table -- Parentheses required
+
+    Without parentheses, the parser may not correctly interpret nested struct access.
+
+    Reference: https://docs.risingwave.com/sql/data-types/struct#retrieve-data-in-a-struct
+    """
+
+    SUPPORTS_NULL_TYPE = False
+    """
+    Whether NULL/VOID is supported as a valid data type (not just a value).
+
+    Databricks and Spark v3+ support NULL as an actual type, allowing expressions like:
+        SELECT NULL AS col -- Has type NULL, not just value NULL
+        CAST(x AS VOID)    -- Valid type cast
+    """
+
+    COALESCE_COMPARISON_NON_STANDARD = False
+    """
+    Whether COALESCE in comparisons has non-standard NULL semantics.
+
+    We can't convert `COALESCE(x, 1) = 2` into `NOT x IS NULL AND x = 2` for Redshift,
+    because they are not always equivalent: if `x` is `NULL` and comes from a table,
+    the result is `NULL`, despite `FALSE AND NULL` evaluating to `FALSE`.
+
+    In standard SQL and most dialects these expressions are equivalent, but Redshift treats
+    table NULLs differently in this context.
+    """
+
     HAS_DISTINCT_ARRAY_CONSTRUCTORS = False
     """
     Whether the ARRAY constructor is context-sensitive, i.e. in Redshift ARRAY[1, 2, 3] != ARRAY(1, 2, 3)
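Taken together, these defaults let optimizer rules branch on capabilities instead of dialect names, which is the point of the commit. A sketch of the pattern (the helper function is illustrative, not from the codebase):

    from sqlglot.dialects.dialect import Dialect

    def can_expand_struct_star(dialect_name: str) -> bool:
        # Before: dialect_name in ("bigquery", "risingwave")
        # After: ask the dialect itself, so a new dialect opts in by
        # setting the flag instead of being added to every such list.
        return Dialect.get_or_raise(dialect_name).SUPPORTS_STRUCT_STAR_EXPANSION

    assert can_expand_struct_star("bigquery")
    assert not can_expand_struct_star("duckdb")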

sqlglot/dialects/oracle.py

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ class Oracle(Dialect):
     NULL_ORDERING = "nulls_are_large"
     ON_CONDITION_EMPTY_BEFORE_ERROR = False
     ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN = False
+    DISABLES_ALIAS_REF_EXPANSION = True

     # See section 8: https://docs.oracle.com/cd/A97630_01/server.920/a96540/sql_elements9a.htm
     NORMALIZATION_STRATEGY = NormalizationStrategy.UPPERCASE
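What the flag reports, as a quick illustrative check (not part of the commit):

    from sqlglot.dialects.dialect import Dialect

    # Oracle rejects alias references in projections and WHERE clauses, so
    # the optimizer must repeat the aliased expression instead.
    assert Dialect.get_or_raise("oracle").DISABLES_ALIAS_REF_EXPANSION
    assert not Dialect.get_or_raise("postgres").DISABLES_ALIAS_REF_EXPANSION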

sqlglot/dialects/redshift.py

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ class Redshift(Postgres):
     COPY_PARAMS_ARE_CSV = False
     HEX_LOWERCASE = True
     HAS_DISTINCT_ARRAY_CONSTRUCTORS = True
+    COALESCE_COMPARISON_NON_STANDARD = True

     # ref: https://docs.aws.amazon.com/redshift/latest/dg/r_FORMAT_strings.html
     TIME_FORMAT = "'YYYY-MM-DD HH24:MI:SS'"
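To see the effect end to end, the sketch below assumes the simplify rule consults COALESCE_COMPARISON_NON_STANDARD; exact output can vary by sqlglot version:

    import sqlglot
    from sqlglot.optimizer.simplify import simplify

    # Under Redshift semantics the COALESCE comparison must be kept
    # verbatim; dialects with standard semantics are free to rewrite it.
    expr = sqlglot.parse_one("SELECT * FROM t WHERE COALESCE(x, 1) = 2")
    print(simplify(expr, dialect="redshift").sql(dialect="redshift"))
    # Expected: SELECT * FROM t WHERE COALESCE(x, 1) = 2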

sqlglot/dialects/risingwave.py

Lines changed: 3 additions & 0 deletions
@@ -8,6 +8,9 @@


 class RisingWave(Postgres):
+    REQUIRES_PARENTHESIZED_STRUCT_ACCESS = True
+    SUPPORTS_STRUCT_STAR_EXPANSION = True
+
     class Tokenizer(Postgres.Tokenizer):
         KEYWORDS = {
             **Postgres.Tokenizer.KEYWORDS,
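A hedged round-trip illustrating the parenthesized access these flags describe (formatting may differ across versions):

    import sqlglot

    # RisingWave's parenthesized struct access should survive parse/generate.
    sql = "SELECT (col.field).subfield FROM t"
    print(sqlglot.parse_one(sql, read="risingwave").sql(dialect="risingwave"))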

sqlglot/dialects/snowflake.py

Lines changed: 1 addition & 0 deletions
@@ -553,6 +553,7 @@ class Snowflake(Dialect):
     ARRAY_AGG_INCLUDES_NULLS = None
     ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN = False
     TRY_CAST_REQUIRES_STRING = True
+    SUPPORTS_ALIAS_REFS_IN_JOIN_CONDITIONS = True

     EXPRESSION_METADATA = EXPRESSION_METADATA.copy()
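A sketch of the documented behavior, assuming qualify resolves projection aliases in JOIN conditions for Snowflake:

    import sqlglot
    from sqlglot.optimizer.qualify import qualify

    # Snowflake permits the projection alias user_id inside JOIN ... ON;
    # qualification should expand it back to a.id.
    sql = "SELECT a.id AS user_id FROM a JOIN b ON user_id = b.id"
    expression = sqlglot.parse_one(sql, read="snowflake")
    print(qualify(expression, dialect="snowflake").sql(dialect="snowflake"))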

sqlglot/dialects/spark.py

Lines changed: 1 addition & 0 deletions
@@ -111,6 +111,7 @@ def _groupconcat_sql(self: Spark.Generator, expression: exp.GroupConcat) -> str:

 class Spark(Spark2):
     SUPPORTS_ORDER_BY_ALL = True
+    SUPPORTS_NULL_TYPE = True

     class Tokenizer(Spark2.Tokenizer):
         STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False
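Databricks subclasses Spark in sqlglot, so it inherits this flag; that is why the commit could drop the redundant Databricks property. An illustrative check:

    from sqlglot.dialects.dialect import Dialect

    # Inherited from Spark, so no separate Databricks override is needed.
    assert Dialect.get_or_raise("spark").SUPPORTS_NULL_TYPE
    assert Dialect.get_or_raise("databricks").SUPPORTS_NULL_TYPE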

sqlglot/optimizer/annotate_types.py

Lines changed: 8 additions & 11 deletions
@@ -5,6 +5,7 @@
 import typing as t

 from sqlglot import exp
+from sqlglot.dialects.dialect import Dialect
 from sqlglot.helper import (
     ensure_list,
     is_date_unit,
@@ -14,7 +15,6 @@
 )
 from sqlglot.optimizer.scope import Scope, traverse_scope
 from sqlglot.schema import MappingSchema, Schema, ensure_schema
-from sqlglot.dialects.dialect import Dialect

 if t.TYPE_CHECKING:
     from sqlglot._typing import B, E
@@ -188,13 +188,10 @@ def __init__(
         overwrite_types: bool = True,
     ) -> None:
         self.schema = schema
-        self.dialect = Dialect.get_or_raise(schema.dialect)
-        self.expression_metadata = (
-            expression_metadata or Dialect.get_or_raise(schema.dialect).EXPRESSION_METADATA
-        )
-        self.coerces_to = (
-            coerces_to or Dialect.get_or_raise(schema.dialect).COERCES_TO or self.COERCES_TO
-        )
+        dialect = schema.dialect or Dialect()
+        self.dialect = dialect
+        self.expression_metadata = expression_metadata or dialect.EXPRESSION_METADATA
+        self.coerces_to = coerces_to or dialect.COERCES_TO or self.COERCES_TO
         self.binary_coercions = binary_coercions or self.BINARY_COERCIONS

         # Caches the ids of annotated sub-Expressions, to ensure we only visit them once
@@ -204,7 +201,7 @@ def __init__(
         self._null_expressions: t.Dict[int, exp.Expression] = {}

         # Databricks and Spark ≥v3 actually support NULL (i.e., VOID) as a type
-        self._supports_null_type = schema.dialect in ("databricks", "spark")
+        self._supports_null_type = dialect.SUPPORTS_NULL_TYPE

         # Maps an exp.SetOperation's id (e.g. UNION) to its projection types. This is computed if the
         # exp.SetOperation is the expression of a scope source, as selecting from it multiple times
@@ -368,7 +365,7 @@ def annotate_scope(self, scope: Scope) -> None:
         # Iterate through all the expressions of the current scope in post-order, and annotate
         self._annotate_expression(scope.expression, scope, selects)

-        if self.schema.dialect == "bigquery" and isinstance(scope.expression, exp.Query):
+        if self.dialect.QUERY_RESULTS_ARE_STRUCTS and isinstance(scope.expression, exp.Query):
             struct_type = exp.DataType(
                 this=exp.DataType.Type.STRUCT,
                 expressions=[
@@ -482,7 +479,7 @@ def _maybe_coerce(
     def _annotate_binary(self, expression: B) -> B:
         left, right = expression.left, expression.right
         if not left or not right:
-            expression_sql = expression.sql(self.schema.dialect)
+            expression_sql = expression.sql(self.dialect)
             logger.warning(f"Failed to annotate badly formed binary expression: {expression_sql}")
             self._set_type(expression, None)
             return expression
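To exercise the refactored path, a sketch assuming annotate_types accepts a dialect argument and builds the schema internally (exact type names may vary by version):

    import sqlglot
    from sqlglot.optimizer.annotate_types import annotate_types

    # With Spark's SUPPORTS_NULL_TYPE, a bare NULL projection keeps a NULL
    # type instead of falling back to UNKNOWN.
    expression = sqlglot.parse_one("SELECT NULL AS col", dialect="spark")
    annotated = annotate_types(expression, dialect="spark")
    print(annotated.selects[0].type)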
