Skip to content

Commit 964a275

Browse files
authored
Fix!: Make UNION column qualification recursive (#5508)
* Fix: Make UNION column qualification recursive * Pre -> post
1 parent 4687798 commit 964a275

File tree

2 files changed

+59
-25
lines changed

2 files changed

+59
-25
lines changed

sqlglot/optimizer/qualify_columns.py

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -972,6 +972,44 @@ def all_columns(self) -> t.Set[str]:
972972
}
973973
return self._all_columns
974974

975+
def get_source_columns_from_set_op(self, expression: exp.Expression) -> t.List[str]:
    """Resolve the output column names of a (possibly nested) set operation.

    Args:
        expression: A `Select`, a `SetOperation`, or a `Subquery` that wraps a
            `SetOperation`.

    Returns:
        The column names produced by `expression`:
          - for a `Select`, its named selects;
          - for a set operation with an ON column list, the names in that list;
          - for a LEFT set operation, the left operand's columns;
          - for a FULL set operation, the left columns followed by any right
            columns not already present;
          - for an INNER set operation, the columns present on both sides, in
            left-operand order;
          - otherwise, the set operation's own named selects.

    Raises:
        OptimizeError: If `expression` is none of the supported node types.
    """
    if isinstance(expression, exp.Select):
        return expression.named_selects
    if isinstance(expression, exp.Subquery) and isinstance(expression.this, exp.SetOperation):
        # Different types of SET modifiers can be chained together if they're explicitly grouped by nesting
        return self.get_source_columns_from_set_op(expression.this)
    if not isinstance(expression, exp.SetOperation):
        raise OptimizeError(f"Unknown set operation: {expression}")

    set_op = expression

    # BigQuery specific set operations modifiers, e.g. INNER UNION ALL BY NAME
    on_column_list = set_op.args.get("on")
    if on_column_list:
        # The resulting columns are the columns in the ON clause:
        # {INNER | LEFT | FULL} UNION ALL BY NAME ON (col1, col2, ...)
        return [col.name for col in on_column_list]

    side = set_op.side
    kind = set_op.kind

    if side or kind:
        # Visit the children UNIONs (if any) in a post-order traversal
        left = self.get_source_columns_from_set_op(set_op.left)
        right = self.get_source_columns_from_set_op(set_op.right)

        # We use dict.fromkeys to deduplicate keys and maintain insertion order
        if side == "LEFT":
            return left
        if side == "FULL":
            return list(dict.fromkeys(left + right))
        if kind == "INNER":
            # Intersecting two dict key views yields an unordered set, so filter
            # the (deduplicated) left columns instead to keep a stable order.
            right_names = set(right)
            return [name for name in dict.fromkeys(left) if name in right_names]

    # No modifier, or a modifier combination not handled above: fall back to the
    # set operation's own projection names rather than leaving the result unset.
    return set_op.named_selects
1012+
9751013
def get_source_columns(self, name: str, only_visible: bool = False) -> t.Sequence[str]:
9761014
"""Resolve the source columns for a given source `name`."""
9771015
cache_key = (name, only_visible)
@@ -996,31 +1034,8 @@ def get_source_columns(self, name: str, only_visible: bool = False) -> t.Sequenc
9961034
for k in source.expression.type.expressions: # type: ignore
9971035
columns.append(k.name)
9981036
elif isinstance(source, Scope) and isinstance(source.expression, exp.SetOperation):
999-
set_op = source.expression
1000-
1001-
# BigQuery specific set operations modifiers, e.g INNER UNION ALL BY NAME
1002-
on_column_list = set_op.args.get("on")
1003-
1004-
if on_column_list:
1005-
# The resulting columns are the columns in the ON clause:
1006-
# {INNER | LEFT | FULL} UNION ALL BY NAME ON (col1, col2, ...)
1007-
columns = [col.name for col in on_column_list]
1008-
elif set_op.side or set_op.kind:
1009-
side = set_op.side
1010-
kind = set_op.kind
1011-
1012-
left = set_op.left.named_selects
1013-
right = set_op.right.named_selects
1014-
1015-
# We use dict.fromkeys to deduplicate keys and maintain insertion order
1016-
if side == "LEFT":
1017-
columns = left
1018-
elif side == "FULL":
1019-
columns = list(dict.fromkeys(left + right))
1020-
elif kind == "INNER":
1021-
columns = list(dict.fromkeys(left).keys() & dict.fromkeys(right).keys())
1022-
else:
1023-
columns = set_op.named_selects
1037+
columns = self.get_source_columns_from_set_op(source.expression)
1038+
10241039
else:
10251040
select = seq_get(source.expression.selects, 0)
10261041

tests/fixtures/optimizer/qualify_columns.sql

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,25 @@ SELECT _q_0.foo AS foo, _q_0.qux AS qux FROM ((SELECT 1 AS foo, 2 AS bar LEFT UN
389389
SELECT * FROM (((SELECT 1 AS foo, 2 AS bar LEFT UNION ALL BY NAME SELECT 3 AS bar, 4 AS baz) FULL UNION ALL BY NAME ON (foo, qux) SELECT 3 AS qux, 4 AS bar) INNER UNION ALL BY NAME ON (foo) SELECT 6 AS foo);
390390
SELECT _q_0.foo AS foo FROM (((SELECT 1 AS foo, 2 AS bar LEFT UNION ALL BY NAME SELECT 3 AS bar, 4 AS baz) FULL UNION ALL BY NAME ON (foo, qux) SELECT 3 AS qux, 4 AS bar) INNER UNION ALL BY NAME ON (foo) SELECT 6 AS foo) AS _q_0;
391391

392+
# Title: Nested set operations with modifiers
393+
# dialect: bigquery
394+
# execute: false
395+
WITH t1 AS (SELECT 1 AS a, 2 AS b), t2 AS (SELECT 2 AS b, 3 AS c), t3 AS (SELECT 2 AS c, 3 AS d), t4 AS (SELECT 2 AS e, 3 AS f) SELECT * FROM ((SELECT * FROM t1 FULL OUTER UNION ALL BY NAME (SELECT * FROM t2 FULL OUTER UNION ALL BY NAME (SELECT * FROM t3 FULL OUTER UNION ALL BY NAME SELECT * FROM t4))));
396+
WITH t1 AS (SELECT 1 AS a, 2 AS b), t2 AS (SELECT 2 AS b, 3 AS c), t3 AS (SELECT 2 AS c, 3 AS d), t4 AS (SELECT 2 AS e, 3 AS f) SELECT _q_0.a AS a, _q_0.b AS b, _q_0.c AS c, _q_0.d AS d, _q_0.e AS e, _q_0.f AS f FROM ((SELECT t1.a AS a, t1.b AS b FROM t1 AS t1 FULL OUTER UNION ALL BY NAME (SELECT t2.b AS b, t2.c AS c FROM t2 AS t2 FULL OUTER UNION ALL BY NAME (SELECT t3.c AS c, t3.d AS d FROM t3 AS t3 FULL OUTER UNION ALL BY NAME SELECT t4.e AS e, t4.f AS f FROM t4 AS t4))) AS _q_0);
397+
398+
399+
# Title: Nested set operations with different modifiers (FULL + INNER)
400+
# dialect: bigquery
401+
# execute: false
402+
WITH t1 AS (SELECT 1 AS a, 2 AS b), t2 AS (SELECT 2 AS b, 3 AS c), t3 AS (SELECT 2 AS c, 3 AS d), t4 AS (SELECT 2 AS e, 3 AS f) SELECT * FROM ((SELECT * FROM t1 FULL OUTER UNION ALL BY NAME (SELECT * FROM t2 INNER UNION ALL BY NAME (SELECT * FROM t3 FULL OUTER UNION ALL BY NAME SELECT * FROM t4))));
403+
WITH t1 AS (SELECT 1 AS a, 2 AS b), t2 AS (SELECT 2 AS b, 3 AS c), t3 AS (SELECT 2 AS c, 3 AS d), t4 AS (SELECT 2 AS e, 3 AS f) SELECT _q_0.a AS a, _q_0.b AS b, _q_0.c AS c FROM ((SELECT t1.a AS a, t1.b AS b FROM t1 AS t1 FULL OUTER UNION ALL BY NAME (SELECT t2.b AS b, t2.c AS c FROM t2 AS t2 INNER UNION ALL BY NAME (SELECT t3.c AS c, t3.d AS d FROM t3 AS t3 FULL OUTER UNION ALL BY NAME SELECT t4.e AS e, t4.f AS f FROM t4 AS t4))) AS _q_0);
404+
405+
# Title: Nested set operations with different modifiers (FULL + LEFT)
406+
# dialect: bigquery
407+
# execute: false
408+
WITH t1 AS (SELECT 1 AS a, 2 AS b, 3 AS c, 4 AS d), t2 AS (SELECT 2 AS b, 3 AS c), t3 AS (SELECT 2 AS c, 3 AS d), t4 AS (SELECT 2 AS d, 3 AS e) SELECT * FROM ((SELECT * FROM t1 FULL OUTER UNION ALL BY NAME (SELECT * FROM t2 FULL UNION ALL BY NAME (SELECT * FROM t3 LEFT UNION ALL BY NAME SELECT * FROM t4))));
409+
WITH t1 AS (SELECT 1 AS a, 2 AS b, 3 AS c, 4 AS d), t2 AS (SELECT 2 AS b, 3 AS c), t3 AS (SELECT 2 AS c, 3 AS d), t4 AS (SELECT 2 AS d, 3 AS e) SELECT _q_0.a AS a, _q_0.b AS b, _q_0.c AS c, _q_0.d AS d FROM ((SELECT t1.a AS a, t1.b AS b, t1.c AS c, t1.d AS d FROM t1 AS t1 FULL OUTER UNION ALL BY NAME (SELECT t2.b AS b, t2.c AS c FROM t2 AS t2 FULL UNION ALL BY NAME (SELECT t3.c AS c, t3.d AS d FROM t3 AS t3 LEFT UNION ALL BY NAME SELECT t4.d AS d, t4.e AS e FROM t4 AS t4))) AS _q_0);
410+
392411
--------------------------------------
393412
-- Subqueries
394413
--------------------------------------

0 commit comments

Comments
 (0)