Commit 84f78aa

Fix(bigquery)!: parse information schema views into a single identifier (#4336)
* Fix(parser)!: always parse INFORMATION_SCHEMA.X table ref into a dot
* PR feedback
* Refactor: produce a single identifier for information schema view
* Fix(parser)!: always parse INFORMATION_SCHEMA.X table ref into a dot
* PR feedback
1 parent 71f4a47 commit 84f78aa
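
In practice this changes how BigQuery INFORMATION_SCHEMA references round-trip through sqlglot: the INFORMATION_SCHEMA.X pair is now kept as a single quoted identifier and the table is aliased by the view name. A minimal sketch of the new behavior, mirroring the identity tests added below (region_or_dataset is just a placeholder name):

import sqlglot

# Round-trip a BigQuery INFORMATION_SCHEMA reference. After this commit the
# INFORMATION_SCHEMA.TABLES pair is emitted as one quoted identifier, and the
# table gets an explicit alias so column qualification keeps working.
sql = "SELECT * FROM region_or_dataset.INFORMATION_SCHEMA.TABLES"
print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])
# SELECT * FROM region_or_dataset.`INFORMATION_SCHEMA.TABLES` AS TABLES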

File tree: 5 files changed (+84, -16 lines)

sqlglot/dialects/bigquery.py

Lines changed: 31 additions & 0 deletions
@@ -576,6 +576,7 @@ def _parse_table_parts(
                         table.set("this", exp.Identifier(this=parts[1]))

             if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
+                alias = table.this
                 catalog, db, this, *rest = (
                     exp.to_identifier(p, quoted=True)
                     for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
@@ -588,6 +589,36 @@ def _parse_table_parts(
                     this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                 )
                 table.meta["quoted_table"] = True
+            else:
+                alias = None
+
+            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
+            # dataset, so if the project identifier is omitted we need to fix the ast so that
+            # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
+            # Otherwise, we wouldn't correctly qualify a `Table` node that references these
+            # views, because it would seem like the "catalog" part is set, when it'd actually
+            # be the region/dataset. Merging the two identifiers into a single one is done to
+            # avoid producing a 4-part Table reference, which would cause issues in the schema
+            # module, when there are 3-part table names mixed with information schema views.
+            #
+            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
+            table_parts = table.parts
+            if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
+                # We need to alias the table here to avoid breaking existing qualified columns.
+                # This is expected to be safe, because if there's an actual alias coming up in
+                # the token stream, it will overwrite this one. If there isn't one, we are only
+                # exposing the name that can be used to reference the view explicitly (a no-op).
+                exp.alias_(
+                    table,
+                    t.cast(exp.Identifier, alias or table_parts[-1]),
+                    table=True,
+                    copy=False,
+                )
+
+                info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
+                table.set("this", exp.Identifier(this=info_schema_view, quoted=True))
+                table.set("db", seq_get(table_parts, -3))
+                table.set("catalog", seq_get(table_parts, -4))

             return table

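The effect on the parsed AST can be seen by inspecting the resulting Table node. A small sketch of the expected shape, assuming my_dataset stands in for a region or dataset qualifier (names are illustrative, not taken from the diff):

from sqlglot import exp, parse_one

# Parse a dataset-qualified INFORMATION_SCHEMA reference with the BigQuery dialect.
table = parse_one(
    "SELECT * FROM my_dataset.INFORMATION_SCHEMA.TABLES", read="bigquery"
).find(exp.Table)

print(table.name)     # INFORMATION_SCHEMA.TABLES -- merged, quoted identifier
print(table.db)       # my_dataset                -- the region/dataset part
print(table.catalog)  # ''                        -- no project part was given
print(table.alias)    # TABLES                    -- alias added by exp.alias_ above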
sqlglot/schema.py

Lines changed: 5 additions & 9 deletions
@@ -151,9 +151,7 @@ def supported_table_args(self) -> t.Tuple[str, ...]:
         return self._supported_table_args

     def table_parts(self, table: exp.Table) -> t.List[str]:
-        if isinstance(table.this, exp.ReadCSV):
-            return [table.this.name]
-        return [table.text(part) for part in exp.TABLE_PARTS if table.text(part)]
+        return [part.name for part in reversed(table.parts)]

     def find(
         self, table: exp.Table, raise_on_missing: bool = True, ensure_data_types: bool = False
@@ -417,12 +415,10 @@ def _normalize_table(
         normalized_table = exp.maybe_parse(table, into=exp.Table, dialect=dialect, copy=normalize)

         if normalize:
-            for arg in exp.TABLE_PARTS:
-                value = normalized_table.args.get(arg)
-                if isinstance(value, exp.Identifier):
-                    normalized_table.set(
-                        arg,
-                        normalize_name(value, dialect=dialect, is_table=True, normalize=normalize),
+            for part in normalized_table.parts:
+                if isinstance(part, exp.Identifier):
+                    part.replace(
+                        normalize_name(part, dialect=dialect, is_table=True, normalize=normalize)
                     )

         return normalized_table

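With the merged identifier produced by the parser change above, table_parts can stay a plain traversal of Table.parts, because even a fully qualified information schema view has at most three parts. A sketch of what the schema now sees (project and dataset names are illustrative):

from sqlglot import exp, parse_one
from sqlglot.schema import MappingSchema

# A fully qualified information schema view yields three parts, so it fits the
# same nested mapping as ordinary project.dataset.table names.
table = parse_one("proj.dataset.INFORMATION_SCHEMA.TABLES", read="bigquery", into=exp.Table)
print(MappingSchema().table_parts(table))
# ['INFORMATION_SCHEMA.TABLES', 'dataset', 'proj']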
tests/dialects/test_bigquery.py

Lines changed: 27 additions & 6 deletions
@@ -9,7 +9,6 @@
     UnsupportedError,
     exp,
     parse,
-    parse_one,
     transpile,
 )
 from sqlglot.helper import logger as helper_logger
@@ -85,12 +84,21 @@ def test_bigquery(self):
             "PARSE_TIMESTAMP('%Y-%m-%dT%H:%M:%E*S%z', x)",
         )

-        table = parse_one("x-0._y.z", dialect="bigquery", into=exp.Table)
+        for prefix in ("c.db.", "db.", ""):
+            with self.subTest(f"Parsing {prefix}INFORMATION_SCHEMA.X into a Table"):
+                table = self.parse_one(f"`{prefix}INFORMATION_SCHEMA.X`", into=exp.Table)
+                this = table.this
+
+                self.assertIsInstance(this, exp.Identifier)
+                self.assertTrue(this.quoted)
+                self.assertEqual(this.name, "INFORMATION_SCHEMA.X")
+
+        table = self.parse_one("x-0._y.z", into=exp.Table)
         self.assertEqual(table.catalog, "x-0")
         self.assertEqual(table.db, "_y")
         self.assertEqual(table.name, "z")

-        table = parse_one("x-0._y", dialect="bigquery", into=exp.Table)
+        table = self.parse_one("x-0._y", into=exp.Table)
         self.assertEqual(table.db, "x-0")
         self.assertEqual(table.name, "_y")

@@ -200,9 +208,6 @@ def test_bigquery(self):
         self.validate_identity(
             "MERGE INTO dataset.NewArrivals USING (SELECT * FROM UNNEST([('microwave', 10, 'warehouse #1'), ('dryer', 30, 'warehouse #1'), ('oven', 20, 'warehouse #2')])) ON FALSE WHEN NOT MATCHED THEN INSERT ROW WHEN NOT MATCHED BY SOURCE THEN DELETE"
         )
-        self.validate_identity(
-            "SELECT * FROM `SOME_PROJECT_ID.SOME_DATASET_ID.INFORMATION_SCHEMA.SOME_VIEW`"
-        )
         self.validate_identity(
             "SELECT * FROM test QUALIFY a IS DISTINCT FROM b WINDOW c AS (PARTITION BY d)"
         )
@@ -233,6 +238,22 @@ def test_bigquery(self):
         self.validate_identity(
             "CREATE OR REPLACE VIEW test (tenant_id OPTIONS (description='Test description on table creation')) AS SELECT 1 AS tenant_id, 1 AS customer_id",
         )
+        self.validate_identity(
+            "SELECT * FROM `proj.dataset.INFORMATION_SCHEMA.SOME_VIEW`",
+            "SELECT * FROM `proj.dataset.INFORMATION_SCHEMA.SOME_VIEW` AS `proj.dataset.INFORMATION_SCHEMA.SOME_VIEW`",
+        )
+        self.validate_identity(
+            "SELECT * FROM region_or_dataset.INFORMATION_SCHEMA.TABLES",
+            "SELECT * FROM region_or_dataset.`INFORMATION_SCHEMA.TABLES` AS TABLES",
+        )
+        self.validate_identity(
+            "SELECT * FROM region_or_dataset.INFORMATION_SCHEMA.TABLES AS some_name",
+            "SELECT * FROM region_or_dataset.`INFORMATION_SCHEMA.TABLES` AS some_name",
+        )
+        self.validate_identity(
+            "SELECT * FROM proj.region_or_dataset.INFORMATION_SCHEMA.TABLES",
+            "SELECT * FROM proj.region_or_dataset.`INFORMATION_SCHEMA.TABLES` AS TABLES",
+        )
         self.validate_identity(
             "CREATE VIEW `d.v` OPTIONS (expiration_timestamp=TIMESTAMP '2020-01-02T04:05:06.007Z') AS SELECT 1 AS c",
             "CREATE VIEW `d.v` OPTIONS (expiration_timestamp=CAST('2020-01-02T04:05:06.007Z' AS TIMESTAMP)) AS SELECT 1 AS c",

tests/fixtures/optimizer/qualify_tables.sql

Lines changed: 20 additions & 0 deletions
@@ -14,6 +14,26 @@ SELECT 1 FROM x.y.z AS z;
 SELECT 1 FROM x.y.z AS z;
 SELECT 1 FROM x.y.z AS z;

+# title: only information schema
+# dialect: bigquery
+SELECT * FROM information_schema.tables;
+SELECT * FROM c.db.`information_schema.tables` AS tables;
+
+# title: information schema with db
+# dialect: bigquery
+SELECT * FROM y.information_schema.tables;
+SELECT * FROM c.y.`information_schema.tables` AS tables;
+
+# title: information schema with db, catalog
+# dialect: bigquery
+SELECT * FROM x.y.information_schema.tables;
+SELECT * FROM x.y.`information_schema.tables` AS tables;
+
+# title: information schema with db, catalog, alias
+# dialect: bigquery
+SELECT * FROM x.y.information_schema.tables AS z;
+SELECT * FROM x.y.`information_schema.tables` AS z;
+
 # title: redshift unnest syntax, z.a should be a column, not a table
 # dialect: redshift
 SELECT 1 FROM y.z AS z, z.a;

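These fixtures can be reproduced directly with the table qualifier. A sketch assuming the same c/db defaults the fixture harness passes to qualify_tables:

from sqlglot import parse_one
from sqlglot.optimizer.qualify_tables import qualify_tables

# Qualify a bare information schema reference with default catalog/db, as in
# the "only information schema" fixture above.
expression = parse_one("SELECT * FROM information_schema.tables", read="bigquery")
qualified = qualify_tables(expression, catalog="c", db="db", dialect="bigquery")
print(qualified.sql(dialect="bigquery"))
# SELECT * FROM c.db.`information_schema.tables` AS tables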
tests/test_optimizer.py

Lines changed: 1 addition & 1 deletion
@@ -315,7 +315,7 @@ def test_qualify_columns(self, logger):
                 ),
                 dialect="bigquery",
             ).sql(),
-            'WITH "x" AS (SELECT "y"."a" AS "a" FROM "DB"."y" AS "y" CROSS JOIN "a"."b"."INFORMATION_SCHEMA"."COLUMNS" AS "COLUMNS") SELECT "x"."a" AS "a" FROM "x" AS "x"',
+            'WITH "x" AS (SELECT "y"."a" AS "a" FROM "DB"."y" AS "y" CROSS JOIN "a"."b"."INFORMATION_SCHEMA.COLUMNS" AS "columns") SELECT "x"."a" AS "a" FROM "x" AS "x"',
         )

         self.assertEqual(
