7 changes: 7 additions & 0 deletions sqlglot/dialects/bigquery.py
@@ -353,10 +353,17 @@ class BigQuery(Dialect):
LOG_BASE_FIRST = False
HEX_LOWERCASE = True
FORCE_EARLY_ALIAS_REF_EXPANSION = True
EXPAND_ONLY_GROUP_ALIAS_REF = True
PRESERVE_ORIGINAL_NAMES = True
HEX_STRING_IS_INTEGER_TYPE = True
BYTE_STRING_IS_BYTES_TYPE = True
UUID_IS_STRING_TYPE = True
ANNOTATE_ALL_SCOPES = True
PROJECTION_ALIASES_SHADOW_SOURCE_NAMES = True
TABLES_REFERENCEABLE_AS_COLUMNS = True
SUPPORTS_STRUCT_STAR_EXPANSION = True
EXCLUDES_PSEUDOCOLUMNS_FROM_STAR = True
QUERY_RESULTS_ARE_STRUCTS = True
JSON_EXTRACT_SCALAR_SCALAR_ONLY = True

# https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#initcap
127 changes: 126 additions & 1 deletion sqlglot/dialects/dialect.py
@@ -443,14 +443,139 @@ class Dialect(metaclass=_Dialect):
to "WHERE id = 1 GROUP BY id HAVING id = 1"
"""

EXPAND_ALIAS_REFS_EARLY_ONLY_IN_GROUP_BY = False
EXPAND_ONLY_GROUP_ALIAS_REF = False
"""Whether alias reference expansion before qualification should only happen for the GROUP BY clause."""

ANNOTATE_ALL_SCOPES = False
"""Whether to annotate all scopes during optimization. Used by BigQuery for UNNEST support."""

DISABLES_ALIAS_REF_EXPANSION = False
"""
Whether alias reference expansion is disabled for this dialect.

Some dialects like Oracle do NOT support referencing aliases in projections or WHERE clauses.
The original expression must be repeated instead.

For example, in Oracle:
SELECT y.foo AS bar, bar * 2 AS baz FROM y -- INVALID
SELECT y.foo AS bar, y.foo * 2 AS baz FROM y -- VALID
"""

SUPPORTS_ALIAS_REFS_IN_JOIN_CONDITIONS = False
"""
Whether alias references are allowed in JOIN ... ON clauses.

Most dialects do not support this, but Snowflake allows alias expansion in the JOIN ... ON
clause (and almost everywhere else).

For example, in Snowflake:
SELECT a.id AS user_id FROM a JOIN b ON user_id = b.id -- VALID

Reference: https://docs.snowflake.com/en/sql-reference/sql/select#usage-notes
"""

SUPPORTS_ORDER_BY_ALL = False
"""
Whether ORDER BY ALL is supported (it expands to all the selected columns), as in DuckDB and Spark 3/Databricks.
"""

PROJECTION_ALIASES_SHADOW_SOURCE_NAMES = False
"""
Whether projection alias names can shadow table/source names in GROUP BY and HAVING clauses.

In BigQuery, when a projection alias has the same name as a source table, the alias takes
precedence in GROUP BY and HAVING clauses, and the table becomes inaccessible by that name.

For example, in BigQuery:
SELECT id, ARRAY_AGG(col) AS custom_fields
FROM custom_fields
GROUP BY id
HAVING id >= 1

The "custom_fields" source is shadowed by the projection alias, so we cannot qualify "id"
with "custom_fields" in GROUP BY/HAVING.
"""

TABLES_REFERENCEABLE_AS_COLUMNS = False
"""
Whether table names can be referenced as columns (treated as structs).

BigQuery allows tables to be referenced as columns in queries, automatically treating
them as struct values containing all the table's columns.

For example, in BigQuery:
SELECT t FROM my_table AS t -- Returns entire row as a struct
"""

SUPPORTS_STRUCT_STAR_EXPANSION = False
"""
Whether the dialect supports expanding struct fields using star notation (e.g., struct_col.*).

BigQuery allows struct fields to be expanded with the star operator:
SELECT t.struct_col.* FROM table t
RisingWave also allows struct field expansion with the star operator using parentheses:
SELECT (t.struct_col).* FROM table t

This expands to all fields within the struct.
"""

EXCLUDES_PSEUDOCOLUMNS_FROM_STAR = False
"""
Whether pseudocolumns should be excluded from star expansion (SELECT *).

Pseudocolumns are special dialect-specific columns (e.g., Oracle's ROWNUM, ROWID, LEVEL,
or BigQuery's _PARTITIONTIME, _PARTITIONDATE) that are implicitly available but not part
of the table schema. When this is True, SELECT * will not include these pseudocolumns;
they must be explicitly selected.
"""

QUERY_RESULTS_ARE_STRUCTS = False
"""
Whether query results are typed as structs in metadata for type inference.

In BigQuery, subqueries store their column types as a STRUCT in metadata,
enabling special type inference for ARRAY(SELECT ...) expressions:
ARRAY(SELECT x, y FROM t) → ARRAY<STRUCT<...>>

For single column subqueries, BigQuery unwraps the struct:
ARRAY(SELECT x FROM t) → ARRAY<type_of_x>

This is metadata-only for type inference.
"""

REQUIRES_PARENTHESIZED_STRUCT_ACCESS = False
"""
Whether struct field access requires parentheses around the expression.

RisingWave requires parentheses for struct field access in certain contexts:
SELECT (col.field).subfield FROM table -- Parentheses required

Without parentheses, the parser may not correctly interpret nested struct access.

Reference: https://docs.risingwave.com/sql/data-types/struct#retrieve-data-in-a-struct
"""

SUPPORTS_NULL_TYPE = False
"""
Whether NULL/VOID is supported as a valid data type (not just a value).

Databricks and Spark v3+ support NULL as an actual type, allowing expressions like:
SELECT NULL AS col -- Has type NULL, not just value NULL
CAST(x AS VOID) -- Valid type cast
"""

COALESCE_COMPARISON_NON_STANDARD = False
"""
Whether COALESCE in comparisons has non-standard NULL semantics.

We can't convert `COALESCE(x, 1) = 2` into `NOT x IS NULL AND x = 2` for Redshift,
because the two are not always equivalent there. For example, if `x` is `NULL` and it
comes from a table, the result is `NULL`, despite `FALSE AND NULL` evaluating to `FALSE`.

In standard SQL and most dialects, these expressions are equivalent, but Redshift treats
table NULLs differently in this context.
"""

HAS_DISTINCT_ARRAY_CONSTRUCTORS = False
"""
Whether the ARRAY constructor is context-sensitive, i.e. in Redshift ARRAY[1, 2, 3] != ARRAY(1, 2, 3)
1 change: 1 addition & 0 deletions sqlglot/dialects/oracle.py
@@ -45,6 +45,7 @@ class Oracle(Dialect):
NULL_ORDERING = "nulls_are_large"
ON_CONDITION_EMPTY_BEFORE_ERROR = False
ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN = False
DISABLES_ALIAS_REF_EXPANSION = True

# See section 8: https://docs.oracle.com/cd/A97630_01/server.920/a96540/sql_elements9a.htm
NORMALIZATION_STRATEGY = NormalizationStrategy.UPPERCASE
1 change: 1 addition & 0 deletions sqlglot/dialects/redshift.py
@@ -47,6 +47,7 @@ class Redshift(Postgres):
COPY_PARAMS_ARE_CSV = False
HEX_LOWERCASE = True
HAS_DISTINCT_ARRAY_CONSTRUCTORS = True
COALESCE_COMPARISON_NON_STANDARD = True

# ref: https://docs.aws.amazon.com/redshift/latest/dg/r_FORMAT_strings.html
TIME_FORMAT = "'YYYY-MM-DD HH24:MI:SS'"
3 changes: 3 additions & 0 deletions sqlglot/dialects/risingwave.py
@@ -8,6 +8,9 @@


class RisingWave(Postgres):
REQUIRES_PARENTHESIZED_STRUCT_ACCESS = True
SUPPORTS_STRUCT_STAR_EXPANSION = True

class Tokenizer(Postgres.Tokenizer):
KEYWORDS = {
**Postgres.Tokenizer.KEYWORDS,
1 change: 1 addition & 0 deletions sqlglot/dialects/snowflake.py
@@ -553,6 +553,7 @@ class Snowflake(Dialect):
ARRAY_AGG_INCLUDES_NULLS = None
ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN = False
TRY_CAST_REQUIRES_STRING = True
SUPPORTS_ALIAS_REFS_IN_JOIN_CONDITIONS = True

EXPRESSION_METADATA = EXPRESSION_METADATA.copy()

1 change: 1 addition & 0 deletions sqlglot/dialects/spark.py
@@ -111,6 +111,7 @@ def _groupconcat_sql(self: Spark.Generator, expression: exp.GroupConcat) -> str:

class Spark(Spark2):
SUPPORTS_ORDER_BY_ALL = True
SUPPORTS_NULL_TYPE = True

class Tokenizer(Spark2.Tokenizer):
STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False
19 changes: 8 additions & 11 deletions sqlglot/optimizer/annotate_types.py
@@ -5,6 +5,7 @@
import typing as t

from sqlglot import exp
from sqlglot.dialects.dialect import Dialect
from sqlglot.helper import (
ensure_list,
is_date_unit,
@@ -14,7 +15,6 @@
)
from sqlglot.optimizer.scope import Scope, traverse_scope
from sqlglot.schema import MappingSchema, Schema, ensure_schema
from sqlglot.dialects.dialect import Dialect

if t.TYPE_CHECKING:
from sqlglot._typing import B, E
@@ -188,13 +188,10 @@ def __init__(
overwrite_types: bool = True,
) -> None:
self.schema = schema
self.dialect = Dialect.get_or_raise(schema.dialect)
self.expression_metadata = (
expression_metadata or Dialect.get_or_raise(schema.dialect).EXPRESSION_METADATA
)
self.coerces_to = (
coerces_to or Dialect.get_or_raise(schema.dialect).COERCES_TO or self.COERCES_TO
)
dialect = schema.dialect or Dialect()
self.dialect = dialect
self.expression_metadata = expression_metadata or dialect.EXPRESSION_METADATA
self.coerces_to = coerces_to or dialect.COERCES_TO or self.COERCES_TO
self.binary_coercions = binary_coercions or self.BINARY_COERCIONS

# Caches the ids of annotated sub-Expressions, to ensure we only visit them once
@@ -204,7 +201,7 @@ def __init__(
self._null_expressions: t.Dict[int, exp.Expression] = {}

# Databricks and Spark ≥v3 actually support NULL (i.e., VOID) as a type
self._supports_null_type = schema.dialect in ("databricks", "spark")
self._supports_null_type = dialect.SUPPORTS_NULL_TYPE

# Maps an exp.SetOperation's id (e.g. UNION) to its projection types. This is computed if the
# exp.SetOperation is the expression of a scope source, as selecting from it multiple times
@@ -368,7 +365,7 @@ def annotate_scope(self, scope: Scope) -> None:
# Iterate through all the expressions of the current scope in post-order, and annotate
self._annotate_expression(scope.expression, scope, selects)

if self.schema.dialect == "bigquery" and isinstance(scope.expression, exp.Query):
if self.dialect.QUERY_RESULTS_ARE_STRUCTS and isinstance(scope.expression, exp.Query):
struct_type = exp.DataType(
this=exp.DataType.Type.STRUCT,
expressions=[
@@ -482,7 +479,7 @@ def _maybe_coerce(
def _annotate_binary(self, expression: B) -> B:
left, right = expression.left, expression.right
if not left or not right:
expression_sql = expression.sql(self.schema.dialect)
expression_sql = expression.sql(self.dialect)
logger.warning(f"Failed to annotate badly formed binary expression: {expression_sql}")
self._set_type(expression, None)
return expression