Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions sqlglot/dialects/bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,10 +353,15 @@ class BigQuery(Dialect):
LOG_BASE_FIRST = False
HEX_LOWERCASE = True
FORCE_EARLY_ALIAS_REF_EXPANSION = True
EXPAND_ONLY_GROUP_ALIAS_REF = True
PRESERVE_ORIGINAL_NAMES = True
HEX_STRING_IS_INTEGER_TYPE = True
BYTE_STRING_IS_BYTES_TYPE = True
UUID_IS_STRING_TYPE = True
PROJECTION_ALIASES_SHADOW_SOURCE_NAMES = True
TABLES_REFERENCEABLE_AS_COLUMNS = True
SUPPORTS_STRUCT_STAR_EXPANSION = True
QUERY_RESULTS_ARE_STRUCTS = True

# https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#initcap
INITCAP_DEFAULT_DELIMITER_CHARS = ' \t\n\r\f\v\\[\\](){}/|<>!?@"^#$&~_,.:;*%+\\-'
Expand Down
114 changes: 113 additions & 1 deletion sqlglot/dialects/dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,14 +457,126 @@ class Dialect(metaclass=_Dialect):
to "WHERE id = 1 GROUP BY id HAVING id = 1"
"""

EXPAND_ALIAS_REFS_EARLY_ONLY_IN_GROUP_BY = False
EXPAND_ONLY_GROUP_ALIAS_REF = False
"""Whether alias reference expansion before qualification should only happen for the GROUP BY clause."""

DISABLES_ALIAS_REF_EXPANSION = False
"""
Whether alias reference expansion is disabled for this dialect.

Some dialects like Oracle do NOT support referencing aliases in projections or WHERE clauses.
The original expression must be repeated instead.

For example, in Oracle:
SELECT y.foo AS bar, bar * 2 AS baz FROM y -- INVALID
SELECT y.foo AS bar, y.foo * 2 AS baz FROM y -- VALID
"""

SUPPORTS_ALIAS_REFS_IN_JOIN_CONDITIONS = False
"""
Whether alias references are allowed in JOIN ... ON clauses.

Most dialects do not support this, but Snowflake allows alias expansion in the JOIN ... ON
clause (and almost everywhere else).
Reference: https://docs.snowflake.com/en/sql-reference/sql/select#usage-notes

For example, in Snowflake:
SELECT a.id AS user_id FROM a JOIN b ON user_id = b.id -- VALID
"""

SUPPORTS_ORDER_BY_ALL = False
"""
Whether ORDER BY ALL is supported (expands to all the selected columns), as in DuckDB and Spark3/Databricks.
"""

PROJECTION_ALIASES_SHADOW_SOURCE_NAMES = False
"""
Whether projection alias names can shadow table/source names in GROUP BY and HAVING clauses.

In BigQuery, when a projection alias has the same name as a source table, the alias takes
precedence in GROUP BY and HAVING clauses, and the table becomes inaccessible by that name.

For example, in BigQuery:
SELECT id, ARRAY_AGG(col) AS custom_fields
FROM custom_fields
GROUP BY id
HAVING id >= 1

The "custom_fields" source is shadowed by the projection alias, so we cannot qualify "id"
with "custom_fields" in GROUP BY/HAVING.
"""

TABLES_REFERENCEABLE_AS_COLUMNS = False
"""
Whether table names can be referenced as columns (treated as structs).

BigQuery allows tables to be referenced as columns in queries, automatically treating
them as struct values containing all the table's columns.

For example, in BigQuery:
SELECT t FROM my_table AS t -- Returns entire row as a struct
"""

SUPPORTS_STRUCT_STAR_EXPANSION = False
"""
Whether the dialect supports expanding struct fields using star notation (e.g., struct_col.*).

BigQuery allows struct fields to be expanded with the star operator:
SELECT t.struct_col.* FROM table t
RisingWave also allows struct field expansion with the star operator using parentheses:
SELECT (t.struct_col).* FROM table t

This expands to all fields within the struct.
"""

QUERY_RESULTS_ARE_STRUCTS = False
"""
Whether query results are internally represented as struct types for the purposes of type inference.

In BigQuery, subqueries used as data sources are internally represented as
structs, enabling advanced type inference. For example:
- ARRAY(SELECT 'foo') unwraps to ARRAY<STRING>, not ARRAY<STRUCT<STRING>>
- Column types propagate correctly through subqueries

This does NOT mean subquery results can be accessed with dot notation.
For field access, use SELECT AS STRUCT explicitly:
SELECT (SELECT AS STRUCT 1 AS x, 2 AS y).x -- Valid
SELECT (SELECT 1 AS x, 2 AS y).x -- Invalid
"""

REQUIRES_PARENTHESIZED_STRUCT_ACCESS = False
"""
Whether struct field access requires parentheses around the expression.

RisingWave requires parentheses for struct field access in certain contexts:
SELECT (col.field).subfield FROM table -- Parentheses required

Without parentheses, the parser may not correctly interpret nested struct access.

Reference: https://docs.risingwave.com/sql/data-types/struct#retrieve-data-in-a-struct
"""

SUPPORTS_NULL_TYPE = False
"""
Whether NULL/VOID is supported as a valid data type (not just a value).

Databricks and Spark v3+ support NULL as an actual type, allowing expressions like:
SELECT NULL AS col -- Has type NULL, not just value NULL
CAST(x AS VOID) -- Valid type cast
"""

COALESCE_COMPARISON_NON_STANDARD = False
"""
Whether COALESCE in comparisons has non-standard NULL semantics.

We can't convert `COALESCE(x, 1) = 2` into `NOT x IS NULL AND x = 2` for Redshift,
because they are not always equivalent. For example, if `x` is `NULL` and it comes
from a table, then the result is `NULL`, despite `FALSE AND NULL` evaluating to `FALSE`.

In standard SQL and most dialects, these expressions are equivalent, but Redshift treats
table NULLs differently in this context.
"""

HAS_DISTINCT_ARRAY_CONSTRUCTORS = False
"""
Whether the ARRAY constructor is context-sensitive, i.e., in Redshift ARRAY[1, 2, 3] != ARRAY(1, 2, 3)
Expand Down
1 change: 1 addition & 0 deletions sqlglot/dialects/oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class Oracle(Dialect):
NULL_ORDERING = "nulls_are_large"
ON_CONDITION_EMPTY_BEFORE_ERROR = False
ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN = False
DISABLES_ALIAS_REF_EXPANSION = True

# See section 8: https://docs.oracle.com/cd/A97630_01/server.920/a96540/sql_elements9a.htm
NORMALIZATION_STRATEGY = NormalizationStrategy.UPPERCASE
Expand Down
1 change: 1 addition & 0 deletions sqlglot/dialects/redshift.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class Redshift(Postgres):
COPY_PARAMS_ARE_CSV = False
HEX_LOWERCASE = True
HAS_DISTINCT_ARRAY_CONSTRUCTORS = True
COALESCE_COMPARISON_NON_STANDARD = True

# ref: https://docs.aws.amazon.com/redshift/latest/dg/r_FORMAT_strings.html
TIME_FORMAT = "'YYYY-MM-DD HH24:MI:SS'"
Expand Down
3 changes: 3 additions & 0 deletions sqlglot/dialects/risingwave.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@


class RisingWave(Postgres):
REQUIRES_PARENTHESIZED_STRUCT_ACCESS = True
SUPPORTS_STRUCT_STAR_EXPANSION = True

class Tokenizer(Postgres.Tokenizer):
KEYWORDS = {
**Postgres.Tokenizer.KEYWORDS,
Expand Down
1 change: 1 addition & 0 deletions sqlglot/dialects/snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ class Snowflake(Dialect):
ARRAY_AGG_INCLUDES_NULLS = None
ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN = False
TRY_CAST_REQUIRES_STRING = True
SUPPORTS_ALIAS_REFS_IN_JOIN_CONDITIONS = True

EXPRESSION_METADATA = EXPRESSION_METADATA.copy()

Expand Down
1 change: 1 addition & 0 deletions sqlglot/dialects/spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def _groupconcat_sql(self: Spark.Generator, expression: exp.GroupConcat) -> str:

class Spark(Spark2):
SUPPORTS_ORDER_BY_ALL = True
SUPPORTS_NULL_TYPE = True

class Tokenizer(Spark2.Tokenizer):
STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False
Expand Down
17 changes: 7 additions & 10 deletions sqlglot/optimizer/annotate_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
)
from sqlglot.optimizer.scope import Scope, traverse_scope
from sqlglot.schema import MappingSchema, Schema, ensure_schema
from sqlglot.dialects.dialect import Dialect

if t.TYPE_CHECKING:
from sqlglot._typing import B, E
Expand Down Expand Up @@ -188,13 +187,9 @@ def __init__(
overwrite_types: bool = True,
) -> None:
self.schema = schema
self.dialect = Dialect.get_or_raise(schema.dialect)
self.expression_metadata = (
expression_metadata or Dialect.get_or_raise(schema.dialect).EXPRESSION_METADATA
)
self.coerces_to = (
coerces_to or Dialect.get_or_raise(schema.dialect).COERCES_TO or self.COERCES_TO
)
self.dialect = schema.dialect
self.expression_metadata = expression_metadata or schema.dialect.EXPRESSION_METADATA
self.coerces_to = coerces_to or schema.dialect.COERCES_TO or self.COERCES_TO
self.binary_coercions = binary_coercions or self.BINARY_COERCIONS

# Caches the ids of annotated sub-Expressions, to ensure we only visit them once
Expand All @@ -204,7 +199,7 @@ def __init__(
self._null_expressions: t.Dict[int, exp.Expression] = {}

# Databricks and Spark ≥v3 actually support NULL (i.e., VOID) as a type
self._supports_null_type = schema.dialect in ("databricks", "spark")
self._supports_null_type = schema.dialect.SUPPORTS_NULL_TYPE

# Maps an exp.SetOperation's id (e.g. UNION) to its projection types. This is computed if the
# exp.SetOperation is the expression of a scope source, as selecting from it multiple times
Expand Down Expand Up @@ -368,7 +363,9 @@ def annotate_scope(self, scope: Scope) -> None:
# Iterate through all the expressions of the current scope in post-order, and annotate
self._annotate_expression(scope.expression, scope, selects)

if self.schema.dialect == "bigquery" and isinstance(scope.expression, exp.Query):
if self.schema.dialect.QUERY_RESULTS_ARE_STRUCTS and isinstance(
scope.expression, exp.Query
):
struct_type = exp.DataType(
this=exp.DataType.Type.STRUCT,
expressions=[
Expand Down
40 changes: 19 additions & 21 deletions sqlglot/optimizer/qualify_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,8 @@ def qualify_columns(
schema = ensure_schema(schema, dialect=dialect)
annotator = TypeAnnotator(schema)
infer_schema = schema.empty if infer_schema is None else infer_schema
dialect = Dialect.get_or_raise(schema.dialect)
dialect = schema.dialect
pseudocolumns = dialect.PSEUDOCOLUMNS
bigquery = dialect == "bigquery"

for scope in traverse_scope(expression):
if dialect.PREFER_CTE_ALIAS_COLUMN:
Expand All @@ -77,7 +76,7 @@ def qualify_columns(
scope,
resolver,
dialect,
expand_only_groupby=bigquery,
expand_only_groupby=dialect.EXPAND_ONLY_GROUP_ALIAS_REF,
)

_convert_columns_to_dots(scope, resolver)
Expand Down Expand Up @@ -107,7 +106,7 @@ def qualify_columns(
# https://www.postgresql.org/docs/current/sql-select.html#SQL-DISTINCT
_expand_order_by_and_distinct_on(scope, resolver)

if bigquery:
if dialect.SUPPORTS_STRUCT_STAR_EXPANSION:
annotator.annotate_scope(scope)

return expression
Expand Down Expand Up @@ -303,12 +302,11 @@ def _expand_alias_refs(
"""
expression = scope.expression

if not isinstance(expression, exp.Select) or dialect == "oracle":
if not isinstance(expression, exp.Select) or dialect.DISABLES_ALIAS_REF_EXPANSION:
return

alias_to_expression: t.Dict[str, t.Tuple[exp.Expression, int]] = {}
projections = {s.alias_or_name for s in expression.selects}
is_bigquery = dialect == "bigquery"
replaced = False

def replace_columns(
Expand Down Expand Up @@ -346,12 +344,12 @@ def replace_columns(
# SELECT x.a, max(x.b) as x FROM x GROUP BY 1 HAVING x > 1;
# If "HAVING x" is expanded to "HAVING max(x.b)", BQ would blindly replace the "x" reference with the projection MAX(x.b)
# i.e HAVING MAX(MAX(x.b).b), resulting in the error: "Aggregations of aggregations are not allowed"
if is_having and is_bigquery:
if is_having and dialect.PROJECTION_ALIASES_SHADOW_SOURCE_NAMES:
skip_replace = skip_replace or any(
node.parts[0].name in projections
for node in alias_expr.find_all(exp.Column)
)
elif is_bigquery and (is_group_by or is_having):
elif dialect.PROJECTION_ALIASES_SHADOW_SOURCE_NAMES and (is_group_by or is_having):
column_table = table.name if table else column.table
if column_table in projections:
# BigQuery's GROUP BY and HAVING clauses get confused if the column name
Expand Down Expand Up @@ -404,9 +402,7 @@ def replace_columns(
replace_columns(expression.args.get("having"), resolve_table=True)
replace_columns(expression.args.get("qualify"), resolve_table=True)

# Snowflake allows alias expansion in the JOIN ... ON clause (and almost everywhere else)
# https://docs.snowflake.com/en/sql-reference/sql/select#usage-notes
if dialect == "snowflake":
if dialect.SUPPORTS_ALIAS_REFS_IN_JOIN_CONDITIONS:
for join in expression.args.get("joins") or []:
replace_columns(join)

Expand Down Expand Up @@ -476,7 +472,7 @@ def _expand_positional_references(
else:
select = select.this

if dialect == "bigquery":
if Dialect.get_or_raise(dialect).PROJECTION_ALIASES_SHADOW_SOURCE_NAMES:
if ambiguous_projections is None:
# When a projection name is also a source name and it is referenced in the
# GROUP BY clause, BQ can't understand what the identifier corresponds to
Expand Down Expand Up @@ -598,7 +594,7 @@ def _qualify_columns(
if column_table:
column.set("table", column_table)
elif (
resolver.schema.dialect == "bigquery"
resolver.schema.dialect.TABLES_REFERENCEABLE_AS_COLUMNS
and len(column.parts) == 1
and column_name in scope.selected_sources
):
Expand Down Expand Up @@ -767,10 +763,9 @@ def _expand_stars(
if not pivot_output_columns:
pivot_output_columns = [c.alias_or_name for c in pivot.expressions]

is_bigquery = dialect == "bigquery"
is_risingwave = dialect == "risingwave"

if (is_bigquery or is_risingwave) and any(isinstance(col, exp.Dot) for col in scope.stars):
if dialect.SUPPORTS_STRUCT_STAR_EXPANSION and any(
isinstance(col, exp.Dot) for col in scope.stars
):
# Found struct expansion, annotate scope ahead of time
annotator.annotate_scope(scope)

Expand All @@ -787,12 +782,15 @@ def _expand_stars(
_add_except_columns(expression.this, tables, except_columns)
_add_replace_columns(expression.this, tables, replace_columns)
_add_rename_columns(expression.this, tables, rename_columns)
elif is_bigquery:
elif (
dialect.SUPPORTS_STRUCT_STAR_EXPANSION
and not dialect.REQUIRES_PARENTHESIZED_STRUCT_ACCESS
):
struct_fields = _expand_struct_stars_bigquery(expression)
if struct_fields:
new_selections.extend(struct_fields)
continue
elif is_risingwave:
elif dialect.REQUIRES_PARENTHESIZED_STRUCT_ACCESS:
struct_fields = _expand_struct_stars_risingwave(expression)
if struct_fields:
new_selections.extend(struct_fields)
Expand All @@ -809,7 +807,7 @@ def _expand_stars(
columns = resolver.get_source_columns(table, only_visible=True)
columns = columns or scope.outer_columns

if pseudocolumns and is_bigquery:
if pseudocolumns and dialect.SUPPORTS_STRUCT_STAR_EXPANSION:
columns = [name for name in columns if name.upper() not in pseudocolumns]

if not columns or "*" in columns:
Expand Down Expand Up @@ -1094,7 +1092,7 @@ def get_source_columns(self, name: str, only_visible: bool = False) -> t.Sequenc
# in bigquery, unnest structs are automatically scoped as tables, so you can
# directly select a struct field in a query.
# this handles the case where the unnest is statically defined.
if self.schema.dialect == "bigquery":
if self.schema.dialect.UNNEST_COLUMN_ONLY:
if source.expression.is_type(exp.DataType.Type.STRUCT):
for k in source.expression.type.expressions: # type: ignore
columns.append(k.name)
Expand Down
9 changes: 2 additions & 7 deletions sqlglot/optimizer/simplify.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,8 @@ def simplify_parens(expression: exp.Expression, dialect: DialectType) -> exp.Exp
if isinstance(parent, (exp.SubqueryPredicate, exp.Bracket)):
return expression

# Handle risingwave struct columns
# see https://docs.risingwave.com/sql/data-types/struct#retrieve-data-in-a-struct
if (
dialect == "risingwave"
Dialect.get_or_raise(dialect).REQUIRES_PARENTHESIZED_STRUCT_ACCESS
and isinstance(parent, exp.Dot)
and (isinstance(parent.right, (exp.Identifier, exp.Star)))
):
Expand Down Expand Up @@ -1193,10 +1191,7 @@ def simplify_coalesce(self, expression: exp.Expression) -> exp.Expression:
):
return expression.this

# We can't convert `COALESCE(x, 1) = 2` into `NOT x IS NULL AND x = 2` for redshift,
# because they are not always equivalent. For example, if `x` is `NULL` and it comes
# from a table, then the result is `NULL`, despite `FALSE AND NULL` evaluating to `FALSE`
if self.dialect == "redshift":
if self.dialect.COALESCE_COMPARISON_NON_STANDARD:
return expression

if not isinstance(expression, self.COMPARISONS):
Expand Down
Loading