
Commit 72cf4a4

feat(duckdb)!: Add support for PIVOT multiple IN clauses (#4964)
* feat(duckdb): Add support for PIVOT's multi IN clauses
* PR Feedback 1
* PR Feedback 2
1 parent ad5b595 commit 72cf4a4
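
What the commit enables, in one hedged sketch (the query is adapted from the new test in tests/test_parser.py; exact output formatting may differ slightly):

import sqlglot
from sqlglot import exp

# Adapted from the new parser test: two IN clauses after a single FOR.
sql = """
SELECT * FROM cities PIVOT (
    sum(population) AS total,
    count(population) AS count
    FOR
        year IN (2000, 2010)
        country IN ('NL', 'US')
)
"""

expression = sqlglot.parse_one(sql, read="duckdb")
print(expression.sql(dialect="duckdb"))  # the multi IN-clause PIVOT now round-trips

pivot = expression.find(exp.Pivot)
# The parser infers the output columns as the cross product of the IN values
# plus the aggregation aliases (see the parser.py diff below).
print([col.sql(dialect="duckdb") for col in pivot.args.get("columns", [])])
# ['"2000_NL_total"', '"2000_NL_count"', '"2000_US_total"', '"2000_US_count"',
#  '"2010_NL_total"', '"2010_NL_count"', '"2010_US_total"', '"2010_US_count"']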

9 files changed: +145 −43 lines changed

sqlglot/dialects/snowflake.py

Lines changed: 5 additions & 5 deletions
@@ -193,12 +193,12 @@ def _unqualify_pivot_columns(expression: exp.Expression) -> exp.Expression:
     if expression.unpivot:
         expression = transforms.unqualify_columns(expression)
     else:
-        field = expression.args.get("field")
-        field_expr = seq_get(field.expressions if field else [], 0)
+        for field in expression.fields:
+            field_expr = seq_get(field.expressions if field else [], 0)
 
-        if isinstance(field_expr, exp.PivotAny):
-            unqualified_field_expr = transforms.unqualify_columns(field_expr)
-            t.cast(exp.Expression, field).set("expressions", unqualified_field_expr, 0)
+            if isinstance(field_expr, exp.PivotAny):
+                unqualified_field_expr = transforms.unqualify_columns(field_expr)
+                t.cast(exp.Expression, field).set("expressions", unqualified_field_expr, 0)
 
     return expression

sqlglot/dialects/spark2.py

Lines changed: 4 additions & 2 deletions
@@ -104,7 +104,9 @@ def _unqualify_pivot_columns(expression: exp.Expression) -> exp.Expression:
     SELECT * FROM tbl PIVOT(SUM(tbl.sales) FOR quarter IN ('Q1', 'Q1'))
     """
     if isinstance(expression, exp.Pivot):
-        expression.set("field", transforms.unqualify_columns(expression.args["field"]))
+        expression.set(
+            "fields", [transforms.unqualify_columns(field) for field in expression.fields]
+        )
 
     return expression

@@ -237,7 +239,7 @@ def _parse_drop_column(self) -> t.Optional[exp.Drop | exp.Command]:
 
         def _pivot_column_names(self, aggregations: t.List[exp.Expression]) -> t.List[str]:
             if len(aggregations) == 1:
-                return [""]
+                return []
             return pivot_column_names(aggregations, dialect="spark")
 
     class Generator(Hive.Generator):
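
A hedged sketch of the Spark transform above, with illustrative table and column names (not taken from this commit): a qualified FOR column should be unqualified per IN clause when generating Spark SQL, matching the docstring example in the diff.

import sqlglot

# Illustrative input where the pivoted FOR column is qualified; Spark expects it bare,
# so the preprocess transform above should strip the "tbl." qualifier on generation.
sql = "SELECT * FROM tbl PIVOT(SUM(tbl.sales) FOR tbl.quarter IN ('Q1', 'Q2'))"
print(sqlglot.transpile(sql, read="spark", write="spark")[0])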

sqlglot/expressions.py

Lines changed: 5 additions & 1 deletion
@@ -4322,7 +4322,7 @@ class Pivot(Expression):
         "this": False,
         "alias": False,
         "expressions": False,
-        "field": False,
+        "fields": False,
         "unpivot": False,
         "using": False,
         "group": False,

@@ -4336,6 +4336,10 @@ class Pivot(Expression):
     def unpivot(self) -> bool:
         return bool(self.args.get("unpivot"))
 
+    @property
+    def fields(self) -> t.List[Expression]:
+        return self.args.get("fields", [])
+
 
 # https://duckdb.org/docs/sql/statements/unpivot#simplified-unpivot-syntax
 # UNPIVOT ... INTO [NAME <col_name> VALUE <col_value>][...,]
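
Since the single `field` arg becomes a `fields` list (hence the breaking-change marker in the commit title), downstream code that read `pivot.args.get("field")` should iterate the new property instead. A minimal sketch with illustrative names:

import sqlglot
from sqlglot import exp

sql = "SELECT * FROM t PIVOT(SUM(v) FOR a IN (1, 2) b IN ('x', 'y'))"  # illustrative query
pivot = sqlglot.parse_one(sql, read="duckdb").find(exp.Pivot)

# Previously a single exp.In lived under pivot.args["field"]; now there is one
# node per IN clause, and the property defaults to an empty list when absent.
for field in pivot.fields:
    print(field.sql(dialect="duckdb"))  # a IN (1, 2), then b IN ('x', 'y')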

sqlglot/generator.py

Lines changed: 10 additions & 2 deletions
@@ -2068,7 +2068,15 @@ def pivot_sql(self, expression: exp.Pivot) -> str:
         alias = self.sql(expression, "alias")
         alias = f" AS {alias}" if alias else ""
 
-        field = self.sql(expression, "field")
+        fields = self.expressions(
+            expression,
+            "fields",
+            sep=" ",
+            dynamic=True,
+            new_line=True,
+            skip_first=True,
+            skip_last=True,
+        )
 
         include_nulls = expression.args.get("include_nulls")
         if include_nulls is not None:

@@ -2078,7 +2086,7 @@ def pivot_sql(self, expression: exp.Pivot) -> str:
 
         default_on_null = self.sql(expression, "default_on_null")
         default_on_null = f" DEFAULT ON NULL ({default_on_null})" if default_on_null else ""
-        return f"{self.seg(direction)}{nulls}({expressions} FOR {field}{default_on_null}{group}){alias}"
+        return f"{self.seg(direction)}{nulls}({expressions} FOR {fields}{default_on_null}{group}){alias}"
 
     def version_sql(self, expression: exp.Version) -> str:
         this = f"FOR {expression.name}"
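
A hedged sketch of the generation side (illustrative names; the exact pretty-printed layout may differ): the `dynamic`/`new_line` options above let each IN clause land on its own line when pretty-printing, which is what the updated optimizer fixture further down reflects.

import sqlglot

sql = (
    "SELECT * FROM cities PIVOT("
    "SUM(population) AS total FOR year IN (2000, 2010) country IN ('NL', 'US'))"
)

# Compact and pretty-printed renderings of the same multi IN-clause pivot.
print(sqlglot.transpile(sql, read="duckdb", write="duckdb")[0])
print(sqlglot.transpile(sql, read="duckdb", write="duckdb", pretty=True)[0])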

sqlglot/optimizer/qualify_columns.py

Lines changed: 14 additions & 12 deletions
@@ -140,13 +140,14 @@ def validate_qualify_columns(expression: E) -> E:
 
 
 def _unpivot_columns(unpivot: exp.Pivot) -> t.Iterator[exp.Column]:
-    name_column = []
-    field = unpivot.args.get("field")
-    if isinstance(field, exp.In) and isinstance(field.this, exp.Column):
-        name_column.append(field.this)
-
+    name_columns = [
+        field.this
+        for field in unpivot.fields
+        if isinstance(field, exp.In) and isinstance(field.this, exp.Column)
+    ]
     value_columns = (c for e in unpivot.expressions for c in e.find_all(exp.Column))
-    return itertools.chain(name_column, value_columns)
+
+    return itertools.chain(name_columns, value_columns)
 
 
 def _pop_table_column_aliases(derived_tables: t.List[exp.CTE | exp.Subquery]) -> None:

@@ -608,18 +609,19 @@ def _expand_stars(
     dialect = resolver.schema.dialect
 
     pivot_output_columns = None
-    pivot_exclude_columns = None
+    pivot_exclude_columns: t.Set[str] = set()
 
     pivot = t.cast(t.Optional[exp.Pivot], seq_get(scope.pivots, 0))
     if isinstance(pivot, exp.Pivot) and not pivot.alias_column_names:
         if pivot.unpivot:
             pivot_output_columns = [c.output_name for c in _unpivot_columns(pivot)]
 
-            field = pivot.args.get("field")
-            if isinstance(field, exp.In):
-                pivot_exclude_columns = {
-                    c.output_name for e in field.expressions for c in e.find_all(exp.Column)
-                }
+            for field in pivot.fields:
+                if isinstance(field, exp.In):
+                    pivot_exclude_columns.update(
+                        c.output_name for e in field.expressions for c in e.find_all(exp.Column)
+                    )
+
         else:
             pivot_exclude_columns = set(c.output_name for c in pivot.find_all(exp.Column))
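
For reference, a sketch of how this surfaces through the optimizer, reusing the query from the new qualify_columns fixture below: the columns named in the IN clauses are excluded from the expanded star, and the inferred combination columns are selected instead.

from sqlglot import parse_one
from sqlglot.optimizer.qualify import qualify

# Same query as the new fixture in tests/fixtures/optimizer/qualify_columns.sql.
sql = """
WITH cities AS (
    SELECT * FROM (VALUES ('nl', 'amsterdam', 2000, 1005)) AS t(country, name, year, population)
)
SELECT *
FROM cities
PIVOT(
    SUM(population) AS total,
    COUNT(population) AS count
    FOR country IN ('nl', 'us') year IN (2000, 2010) name IN ('amsterdam', 'seattle')
)
"""

qualified = qualify(parse_one(sql, read="duckdb"), dialect="duckdb")
# The star expands to nl_2000_amsterdam_total, nl_2000_amsterdam_count, ... while
# country/year/name themselves are excluded from the projection.
print(qualified.sql(dialect="duckdb"))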

sqlglot/parser.py

Lines changed: 38 additions & 14 deletions
@@ -2,6 +2,7 @@
 
 import logging
 import typing as t
+import itertools
 from collections import defaultdict
 
 from sqlglot import exp

@@ -4242,7 +4243,13 @@ def _parse_pivot(self) -> t.Optional[exp.Pivot]:
         if not self._match(TokenType.FOR):
             self.raise_error("Expecting FOR")
 
-        field = self._parse_pivot_in()
+        fields = []
+        while True:
+            field = self._try_parse(self._parse_pivot_in)
+            if not field:
+                break
+            fields.append(field)
+
         default_on_null = self._match_text_seq("DEFAULT", "ON", "NULL") and self._parse_wrapped(
             self._parse_bitwise
         )

@@ -4254,7 +4261,7 @@ def _parse_pivot(self) -> t.Optional[exp.Pivot]:
         pivot = self.expression(
             exp.Pivot,
             expressions=expressions,
-            field=field,
+            fields=fields,
             unpivot=unpivot,
             include_nulls=include_nulls,
             default_on_null=default_on_null,

@@ -4268,26 +4275,43 @@ def _parse_pivot(self) -> t.Optional[exp.Pivot]:
             names = self._pivot_column_names(t.cast(t.List[exp.Expression], expressions))
 
             columns: t.List[exp.Expression] = []
-            pivot_field_expressions = pivot.args["field"].expressions
+            all_fields = []
+            for pivot_field in pivot.fields:
+                pivot_field_expressions = pivot_field.expressions
+
+                # The `PivotAny` expression corresponds to `ANY ORDER BY <column>`; we can't infer in this case.
+                if isinstance(seq_get(pivot_field_expressions, 0), exp.PivotAny):
+                    continue
+
+                all_fields.append(
+                    [
+                        fld.sql() if self.IDENTIFY_PIVOT_STRINGS else fld.alias_or_name
+                        for fld in pivot_field_expressions
+                    ]
+                )
+
+            if all_fields:
+                if names:
+                    all_fields.append(names)
+
+                # Generate all possible combinations of the pivot columns
+                # e.g PIVOT(sum(...) as total FOR year IN (2000, 2010) FOR country IN ('NL', 'US'))
+                # generates the product between [[2000, 2010], ['NL', 'US'], ['total']]
+                for fld_parts_tuple in itertools.product(*all_fields):
+                    fld_parts = list(fld_parts_tuple)
 
-            # The `PivotAny` expression corresponds to `ANY ORDER BY <column>`; we can't infer in this case.
-            if not isinstance(seq_get(pivot_field_expressions, 0), exp.PivotAny):
-                for fld in pivot_field_expressions:
-                    field_name = fld.sql() if self.IDENTIFY_PIVOT_STRINGS else fld.alias_or_name
-                    for name in names:
-                        if self.PREFIXED_PIVOT_COLUMNS:
-                            name = f"{name}_{field_name}" if name else field_name
-                        else:
-                            name = f"{field_name}_{name}" if name else field_name
+                    if names and self.PREFIXED_PIVOT_COLUMNS:
+                        # Move the "name" to the front of the list
+                        fld_parts.insert(0, fld_parts.pop(-1))
 
-                        columns.append(exp.to_identifier(name))
+                    columns.append(exp.to_identifier("_".join(fld_parts)))
 
             pivot.set("columns", columns)
 
         return pivot
 
     def _pivot_column_names(self, aggregations: t.List[exp.Expression]) -> t.List[str]:
-        return [agg.alias for agg in aggregations]
+        return [agg.alias for agg in aggregations if agg.alias]
 
     def _parse_prewhere(self, skip_where_token: bool = False) -> t.Optional[exp.PreWhere]:
         if not skip_where_token and not self._match(TokenType.PREWHERE):
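
The naming logic above boils down to a cartesian product of the per-clause values plus the aggregation aliases. A minimal standalone sketch using the values from the new parser test:

import itertools

# One list of candidate name parts per IN clause, in parse order.
all_fields = [["2000", "2010"], ["NL", "US"]]
# Aggregation aliases are appended last, so they become the suffix when
# PREFIXED_PIVOT_COLUMNS is falsy, as the DuckDB test expectations below suggest.
names = ["total", "count"]
all_fields.append(names)

columns = ["_".join(parts) for parts in itertools.product(*all_fields)]
print(columns)
# ['2000_NL_total', '2000_NL_count', '2000_US_total', '2000_US_count',
#  '2010_NL_total', '2010_NL_count', '2010_US_total', '2010_US_count']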

tests/fixtures/optimizer/optimizer.sql

Lines changed: 6 additions & 4 deletions
@@ -760,10 +760,12 @@ SELECT
   `_q_0`.`first_half_sales` AS `first_half_sales`,
   `_q_0`.`second_half_sales` AS `second_half_sales`
 FROM `produce` AS `produce`
-UNPIVOT((`first_half_sales`, `second_half_sales`) FOR `semesters` IN (
-  (`produce`.`q1`, `produce`.`q2`) AS 'semester_1',
-  (`produce`.`q3`, `produce`.`q4`) AS 'semester_2'
-)) AS `_q_0`;
+UNPIVOT((`first_half_sales`, `second_half_sales`) FOR
+  `semesters` IN (
+    (`produce`.`q1`, `produce`.`q2`) AS 'semester_1',
+    (`produce`.`q3`, `produce`.`q4`) AS 'semester_2'
+  )
+) AS `_q_0`;
 
 # title: quoting is preserved
 # dialect: snowflake

tests/fixtures/optimizer/qualify_columns.sql

Lines changed: 4 additions & 0 deletions
@@ -512,6 +512,10 @@ SELECT _q_0.c1 AS c1, _q_0.c2 AS c2 FROM VALUES ((1, 1), (2, 2)) AS _q_0(c1, c2)
 SELECT * FROM VALUES (1, 2, 3);
 SELECT _q_0.c1 AS c1 FROM VALUES ((1), (2), (3)) AS _q_0(c1);
 
+# title: Expand PIVOT column combinations
+# dialect: duckdb
+WITH cities AS (SELECT * FROM (VALUES ('nl', 'amsterdam', 2000, 1005)) AS t(country, name, year, population)) SELECT * FROM cities PIVOT(SUM(population) AS total, COUNT(population) AS count FOR country IN ('nl', 'us') year IN (2000, 2010) name IN ('amsterdam', 'seattle'));
+WITH cities AS (SELECT t.country AS country, t.name AS name, t.year AS year, t.population AS population FROM (VALUES ('nl', 'amsterdam', 2000, 1005)) AS t(country, name, year, population)) SELECT _q_0.nl_2000_amsterdam_total AS nl_2000_amsterdam_total, _q_0.nl_2000_amsterdam_count AS nl_2000_amsterdam_count, _q_0.nl_2000_seattle_total AS nl_2000_seattle_total, _q_0.nl_2000_seattle_count AS nl_2000_seattle_count, _q_0.nl_2010_amsterdam_total AS nl_2010_amsterdam_total, _q_0.nl_2010_amsterdam_count AS nl_2010_amsterdam_count, _q_0.nl_2010_seattle_total AS nl_2010_seattle_total, _q_0.nl_2010_seattle_count AS nl_2010_seattle_count, _q_0.us_2000_amsterdam_total AS us_2000_amsterdam_total, _q_0.us_2000_amsterdam_count AS us_2000_amsterdam_count, _q_0.us_2000_seattle_total AS us_2000_seattle_total, _q_0.us_2000_seattle_count AS us_2000_seattle_count, _q_0.us_2010_amsterdam_total AS us_2010_amsterdam_total, _q_0.us_2010_amsterdam_count AS us_2010_amsterdam_count, _q_0.us_2010_seattle_total AS us_2010_seattle_total, _q_0.us_2010_seattle_count AS us_2010_seattle_count FROM cities AS cities PIVOT(SUM(population) AS total, COUNT(population) AS count FOR country IN ('nl', 'us') year IN (2000, 2010) name IN ('amsterdam', 'seattle')) AS _q_0;
 
 --------------------------------------
 -- CTEs

tests/test_parser.py

Lines changed: 59 additions & 3 deletions
@@ -646,6 +646,27 @@ def test_pivot_columns(self):
         ) PIVOT (AVG("PrIcE"), MAX(quality) FOR partname IN ('prop' AS prop1, 'rudder'))
         """
 
+        two_in_clauses_duckdb = """
+            SELECT * FROM cities PIVOT (
+                sum(population) AS total,
+                count(population) AS count
+                FOR
+                    year IN (2000, 2010)
+                    country IN ('NL', 'US')
+            )
+        """
+
+        three_in_clauses_duckdb = """
+            SELECT * FROM cities PIVOT (
+                sum(population) AS total,
+                count(population) AS count
+                FOR
+                    year IN (2000, 2010)
+                    country IN ('NL', 'US')
+                    name IN ('Amsterdam', 'Seattle')
+            )
+        """
+
         query_to_column_names = {
             nothing_aliased: {
                 "bigquery": ["prop", "rudder"],

@@ -707,13 +728,48 @@ def test_pivot_columns(self):
                     '"rudder_max(quality)"',
                 ],
             },
+            two_in_clauses_duckdb: {
+                "duckdb": [
+                    '"2000_NL_total"',
+                    '"2000_NL_count"',
+                    '"2000_US_total"',
+                    '"2000_US_count"',
+                    '"2010_NL_total"',
+                    '"2010_NL_count"',
+                    '"2010_US_total"',
+                    '"2010_US_count"',
+                ],
+            },
+            three_in_clauses_duckdb: {
+                "duckdb": [
+                    '"2000_NL_Amsterdam_total"',
+                    '"2000_NL_Amsterdam_count"',
+                    '"2000_NL_Seattle_total"',
+                    '"2000_NL_Seattle_count"',
+                    '"2000_US_Amsterdam_total"',
+                    '"2000_US_Amsterdam_count"',
+                    '"2000_US_Seattle_total"',
+                    '"2000_US_Seattle_count"',
+                    '"2010_NL_Amsterdam_total"',
+                    '"2010_NL_Amsterdam_count"',
+                    '"2010_NL_Seattle_total"',
+                    '"2010_NL_Seattle_count"',
+                    '"2010_US_Amsterdam_total"',
+                    '"2010_US_Amsterdam_count"',
+                    '"2010_US_Seattle_total"',
+                    '"2010_US_Seattle_count"',
+                ],
+            },
         }
 
         for query, dialect_columns in query_to_column_names.items():
             for dialect, expected_columns in dialect_columns.items():
-                expr = parse_one(query, read=dialect)
-                columns = expr.args["from"].this.args["pivots"][0].args["columns"]
-                self.assertEqual(expected_columns, [col.sql(dialect=dialect) for col in columns])
+                with self.subTest(f"Testing query '{query}' for dialect {dialect}"):
+                    expr = parse_one(query, read=dialect)
+                    columns = expr.args["from"].this.args["pivots"][0].args["columns"]
+                    self.assertEqual(
+                        expected_columns, [col.sql(dialect=dialect) for col in columns]
+                    )
 
     def test_parse_nested(self):
         def warn_over_threshold(query: str, max_threshold: float = 0.2):
