Commit 84f78aa

Fix(bigquery)!: parse information schema views into a single identifier (#4336)
* Fix(parser)!: always parse INFORMATION_SCHEMA.X table ref into a dot
* PR feedback
* Refactor: produce a single identifier for information schema view
* Fix(parser)!: always parse INFORMATION_SCHEMA.X table ref into a dot
* PR feedback
1 parent 71f4a47 commit 84f78aa
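
In practice this changes how BigQuery INFORMATION_SCHEMA references round-trip through sqlglot: the INFORMATION_SCHEMA.X pair is now kept as a single quoted identifier and the table is aliased by the view name. A minimal sketch of the new behavior, mirroring the identity tests added below (region_or_dataset is just a placeholder name):

import sqlglot

# Round-trip a BigQuery INFORMATION_SCHEMA reference. After this commit the
# INFORMATION_SCHEMA.TABLES pair is emitted as one quoted identifier, and the
# table gets an explicit alias so column qualification keeps working.
sql = "SELECT * FROM region_or_dataset.INFORMATION_SCHEMA.TABLES"
print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])
# SELECT * FROM region_or_dataset.`INFORMATION_SCHEMA.TABLES` AS TABLES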

File tree: 5 files changed (+84, -16 lines)

sqlglot/dialects/bigquery.py

Lines changed: 31 additions & 0 deletions
@@ -576,6 +576,7 @@ def _parse_table_parts(
                         table.set("this", exp.Identifier(this=parts[1]))

             if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
+                alias = table.this
                 catalog, db, this, *rest = (
                     exp.to_identifier(p, quoted=True)
                     for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
@@ -588,6 +589,36 @@ def _parse_table_parts(
                     this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                 )
                 table.meta["quoted_table"] = True
+            else:
+                alias = None
+
+            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
+            # dataset, so if the project identifier is omitted we need to fix the ast so that
+            # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
+            # Otherwise, we wouldn't correctly qualify a `Table` node that references these
+            # views, because it would seem like the "catalog" part is set, when it'd actually
+            # be the region/dataset. Merging the two identifiers into a single one is done to
+            # avoid producing a 4-part Table reference, which would cause issues in the schema
+            # module, when there are 3-part table names mixed with information schema views.
+            #
+            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
+            table_parts = table.parts
+            if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
+                # We need to alias the table here to avoid breaking existing qualified columns.
+                # This is expected to be safe, because if there's an actual alias coming up in
+                # the token stream, it will overwrite this one. If there isn't one, we are only
+                # exposing the name that can be used to reference the view explicitly (a no-op).
+                exp.alias_(
+                    table,
+                    t.cast(exp.Identifier, alias or table_parts[-1]),
+                    table=True,
+                    copy=False,
+                )
+
+                info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
+                table.set("this", exp.Identifier(this=info_schema_view, quoted=True))
+                table.set("db", seq_get(table_parts, -3))
+                table.set("catalog", seq_get(table_parts, -4))

             return table

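The effect on the parsed AST can be seen by inspecting the resulting Table node. A small sketch of the expected shape, assuming my_dataset stands in for a region or dataset qualifier (names are illustrative, not taken from the diff):

from sqlglot import exp, parse_one

# Parse a dataset-qualified INFORMATION_SCHEMA reference with the BigQuery dialect.
table = parse_one(
    "SELECT * FROM my_dataset.INFORMATION_SCHEMA.TABLES", read="bigquery"
).find(exp.Table)

print(table.name)     # INFORMATION_SCHEMA.TABLES -- merged, quoted identifier
print(table.db)       # my_dataset                -- the region/dataset part
print(table.catalog)  # ''                        -- no project part was given
print(table.alias)    # TABLES                    -- alias added by exp.alias_ above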
sqlglot/schema.py

Lines changed: 5 additions & 9 deletions
@@ -151,9 +151,7 @@ def supported_table_args(self) -> t.Tuple[str, ...]:
         return self._supported_table_args

     def table_parts(self, table: exp.Table) -> t.List[str]:
-        if isinstance(table.this, exp.ReadCSV):
-            return [table.this.name]
-        return [table.text(part) for part in exp.TABLE_PARTS if table.text(part)]
+        return [part.name for part in reversed(table.parts)]

     def find(
         self, table: exp.Table, raise_on_missing: bool = True, ensure_data_types: bool = False
@@ -417,12 +415,10 @@ def _normalize_table(
         normalized_table = exp.maybe_parse(table, into=exp.Table, dialect=dialect, copy=normalize)

         if normalize:
-            for arg in exp.TABLE_PARTS:
-                value = normalized_table.args.get(arg)
-                if isinstance(value, exp.Identifier):
-                    normalized_table.set(
-                        arg,
-                        normalize_name(value, dialect=dialect, is_table=True, normalize=normalize),
+            for part in normalized_table.parts:
+                if isinstance(part, exp.Identifier):
+                    part.replace(
+                        normalize_name(part, dialect=dialect, is_table=True, normalize=normalize)
                     )

         return normalized_table

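With the merged identifier produced by the parser change above, table_parts can stay a plain traversal of Table.parts, because even a fully qualified information schema view has at most three parts. A sketch of what the schema now sees (project and dataset names are illustrative):

from sqlglot import exp, parse_one
from sqlglot.schema import MappingSchema

# A fully qualified information schema view yields three parts, so it fits the
# same nested mapping as ordinary project.dataset.table names.
table = parse_one("proj.dataset.INFORMATION_SCHEMA.TABLES", read="bigquery", into=exp.Table)
print(MappingSchema().table_parts(table))
# ['INFORMATION_SCHEMA.TABLES', 'dataset', 'proj']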
tests/dialects/test_bigquery.py

Lines changed: 27 additions & 6 deletions
@@ -9,7 +9,6 @@
     UnsupportedError,
     exp,
     parse,
-    parse_one,
     transpile,
 )
 from sqlglot.helper import logger as helper_logger
@@ -85,12 +84,21 @@ def test_bigquery(self):
             "PARSE_TIMESTAMP('%Y-%m-%dT%H:%M:%E*S%z', x)",
         )

-        table = parse_one("x-0._y.z", dialect="bigquery", into=exp.Table)
+        for prefix in ("c.db.", "db.", ""):
+            with self.subTest(f"Parsing {prefix}INFORMATION_SCHEMA.X into a Table"):
+                table = self.parse_one(f"`{prefix}INFORMATION_SCHEMA.X`", into=exp.Table)
+                this = table.this
+
+                self.assertIsInstance(this, exp.Identifier)
+                self.assertTrue(this.quoted)
+                self.assertEqual(this.name, "INFORMATION_SCHEMA.X")
+
+        table = self.parse_one("x-0._y.z", into=exp.Table)
         self.assertEqual(table.catalog, "x-0")
         self.assertEqual(table.db, "_y")
         self.assertEqual(table.name, "z")

-        table = parse_one("x-0._y", dialect="bigquery", into=exp.Table)
+        table = self.parse_one("x-0._y", into=exp.Table)
         self.assertEqual(table.db, "x-0")
         self.assertEqual(table.name, "_y")

@@ -200,9 +208,6 @@ def test_bigquery(self):
         self.validate_identity(
             "MERGE INTO dataset.NewArrivals USING (SELECT * FROM UNNEST([('microwave', 10, 'warehouse #1'), ('dryer', 30, 'warehouse #1'), ('oven', 20, 'warehouse #2')])) ON FALSE WHEN NOT MATCHED THEN INSERT ROW WHEN NOT MATCHED BY SOURCE THEN DELETE"
         )
-        self.validate_identity(
-            "SELECT * FROM `SOME_PROJECT_ID.SOME_DATASET_ID.INFORMATION_SCHEMA.SOME_VIEW`"
-        )
         self.validate_identity(
             "SELECT * FROM test QUALIFY a IS DISTINCT FROM b WINDOW c AS (PARTITION BY d)"
         )
@@ -233,6 +238,22 @@ def test_bigquery(self):
         self.validate_identity(
             "CREATE OR REPLACE VIEW test (tenant_id OPTIONS (description='Test description on table creation')) AS SELECT 1 AS tenant_id, 1 AS customer_id",
         )
+        self.validate_identity(
+            "SELECT * FROM `proj.dataset.INFORMATION_SCHEMA.SOME_VIEW`",
+            "SELECT * FROM `proj.dataset.INFORMATION_SCHEMA.SOME_VIEW` AS `proj.dataset.INFORMATION_SCHEMA.SOME_VIEW`",
+        )
+        self.validate_identity(
+            "SELECT * FROM region_or_dataset.INFORMATION_SCHEMA.TABLES",
+            "SELECT * FROM region_or_dataset.`INFORMATION_SCHEMA.TABLES` AS TABLES",
+        )
+        self.validate_identity(
+            "SELECT * FROM region_or_dataset.INFORMATION_SCHEMA.TABLES AS some_name",
+            "SELECT * FROM region_or_dataset.`INFORMATION_SCHEMA.TABLES` AS some_name",
+        )
+        self.validate_identity(
+            "SELECT * FROM proj.region_or_dataset.INFORMATION_SCHEMA.TABLES",
+            "SELECT * FROM proj.region_or_dataset.`INFORMATION_SCHEMA.TABLES` AS TABLES",
+        )
         self.validate_identity(
             "CREATE VIEW `d.v` OPTIONS (expiration_timestamp=TIMESTAMP '2020-01-02T04:05:06.007Z') AS SELECT 1 AS c",
             "CREATE VIEW `d.v` OPTIONS (expiration_timestamp=CAST('2020-01-02T04:05:06.007Z' AS TIMESTAMP)) AS SELECT 1 AS c",

tests/fixtures/optimizer/qualify_tables.sql

Lines changed: 20 additions & 0 deletions
@@ -14,6 +14,26 @@ SELECT 1 FROM x.y.z AS z;
 SELECT 1 FROM x.y.z AS z;
 SELECT 1 FROM x.y.z AS z;

+# title: only information schema
+# dialect: bigquery
+SELECT * FROM information_schema.tables;
+SELECT * FROM c.db.`information_schema.tables` AS tables;
+
+# title: information schema with db
+# dialect: bigquery
+SELECT * FROM y.information_schema.tables;
+SELECT * FROM c.y.`information_schema.tables` AS tables;
+
+# title: information schema with db, catalog
+# dialect: bigquery
+SELECT * FROM x.y.information_schema.tables;
+SELECT * FROM x.y.`information_schema.tables` AS tables;
+
+# title: information schema with db, catalog, alias
+# dialect: bigquery
+SELECT * FROM x.y.information_schema.tables AS z;
+SELECT * FROM x.y.`information_schema.tables` AS z;
+
 # title: redshift unnest syntax, z.a should be a column, not a table
 # dialect: redshift
 SELECT 1 FROM y.z AS z, z.a;

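These fixtures can be reproduced directly with the table qualifier. A sketch assuming the same c/db defaults the fixture harness passes to qualify_tables:

from sqlglot import parse_one
from sqlglot.optimizer.qualify_tables import qualify_tables

# Qualify a bare information schema reference with default catalog/db, as in
# the "only information schema" fixture above.
expression = parse_one("SELECT * FROM information_schema.tables", read="bigquery")
qualified = qualify_tables(expression, catalog="c", db="db", dialect="bigquery")
print(qualified.sql(dialect="bigquery"))
# SELECT * FROM c.db.`information_schema.tables` AS tables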
tests/test_optimizer.py

Lines changed: 1 addition & 1 deletion
@@ -315,7 +315,7 @@ def test_qualify_columns(self, logger):
                 ),
                 dialect="bigquery",
             ).sql(),
-            'WITH "x" AS (SELECT "y"."a" AS "a" FROM "DB"."y" AS "y" CROSS JOIN "a"."b"."INFORMATION_SCHEMA"."COLUMNS" AS "COLUMNS") SELECT "x"."a" AS "a" FROM "x" AS "x"',
+            'WITH "x" AS (SELECT "y"."a" AS "a" FROM "DB"."y" AS "y" CROSS JOIN "a"."b"."INFORMATION_SCHEMA.COLUMNS" AS "columns") SELECT "x"."a" AS "a" FROM "x" AS "x"',
         )

         self.assertEqual(
