Skip to content

Commit f1d02bf

Browse files
authored
SNOW-3176017: Fix accidental removal of aliases in certain JOIN statements (#4096)
1 parent 1d0b1fb commit f1d02bf

File tree

10 files changed

+254
-64
lines changed

10 files changed

+254
-64
lines changed

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
# Release History
22

3+
## 1.49.0 (TBD)
4+
5+
### Snowpark Python API Updates
6+
7+
#### New Features
8+
9+
#### Bug Fixes
10+
11+
#### Improvements
12+
13+
- Restored the following query improvements that were reverted in 1.47.0 due to bugs:
14+
- Reduced the size of queries generated by certain `DataFrame.join` operations.
15+
- Removed redundant aliases in generated queries (for example, `SELECT "A" AS "A"` is now always simplified to `SELECT "A"`).
16+
317
## 1.48.0 (TBD)
418

519
### Snowpark Python API Updates

src/snowflake/snowpark/_internal/analyzer/analyzer.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -763,10 +763,18 @@ def unary_expression_extractor(
763763
for k, v in df_alias_dict.items():
764764
if v == expr.child.name:
765765
df_alias_dict[k] = updated_due_to_inheritance # type: ignore
766+
origin = self.analyze(
767+
expr.child, df_aliased_col_name_to_real_col_name, parse_local_name
768+
)
769+
if (
770+
isinstance(expr.child, (Attribute, UnresolvedAttribute))
771+
and origin == quoted_name
772+
):
773+
# If the column name matches the target of the alias (`quoted_name`),
774+
# we can directly emit the column name without an AS clause.
775+
return origin
766776
return alias_expression(
767-
self.analyze(
768-
expr.child, df_aliased_col_name_to_real_col_name, parse_local_name
769-
),
777+
origin,
770778
quoted_name,
771779
)
772780

src/snowflake/snowpark/_internal/analyzer/select_statement.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2244,11 +2244,18 @@ def derive_column_states_from_subquery(
22442244
else Attribute(quoted_c_name, DataType())
22452245
)
22462246
from_c_state = from_.column_states.get(quoted_c_name)
2247+
result_name = analyzer.analyze(
2248+
c, from_.df_aliased_col_name_to_real_col_name, parse_local_name=True
2249+
).strip(" ")
22472250
if from_c_state and from_c_state.change_state != ColumnChangeState.DROPPED:
22482251
# review later. should use parse_column_name
2249-
if c_name != analyzer.analyze(
2250-
c, from_.df_aliased_col_name_to_real_col_name, parse_local_name=True
2251-
).strip(" "):
2252+
# SNOW-2895675: Always treat Aliases as "changed", even if it is an identity.
2253+
# The fact this check is needed may be a bug in column state analysis, and we should revisit it later.
2254+
# The following tests fail without this check:
2255+
# - tests/integ/test_cte.py::test_sql_simplifier
2256+
# - tests/integ/scala/test_dataframe_suite.py::test_rename_join_dataframe
2257+
# - tests/integ/test_dataframe.py::test_dataframe_alias
2258+
if c_name != result_name or isinstance(c, Alias):
22522259
column_states[quoted_c_name] = ColumnState(
22532260
quoted_c_name,
22542261
ColumnChangeState.CHANGED_EXP,

src/snowflake/snowpark/dataframe.py

Lines changed: 94 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -277,29 +277,69 @@ def _get_unaliased(col_name: str) -> List[str]:
277277
return unaliased
278278

279279

280+
def _get_aliased_column_names(
281+
df: "DataFrame",
282+
cs: List[str],
283+
prefix: Optional[str],
284+
suffix: Optional[str],
285+
common_col_names: List[str],
286+
) -> List[str]:
287+
aliases = []
288+
for c in cs:
289+
unquoted_col_name = c.strip('"')
290+
if c in common_col_names:
291+
if suffix:
292+
column_case_insensitive = is_snowflake_quoted_id_case_insensitive(c)
293+
suffix_unqouted_case_insensitive = (
294+
is_snowflake_unquoted_suffix_case_insensitive(suffix)
295+
)
296+
aliases.append(
297+
f'"{unquoted_col_name}{suffix.upper()}"'
298+
if column_case_insensitive and suffix_unqouted_case_insensitive
299+
else f'''"{unquoted_col_name}{escape_quotes(suffix.strip('"'))}"'''
300+
)
301+
else:
302+
aliases.append(f'"{prefix}{unquoted_col_name}"')
303+
else:
304+
# Removal of redundant aliases (like `"A" AS "A"`) is handled at the analyzer level.
305+
aliases.append(f'"{unquoted_col_name}"')
306+
return aliases
307+
308+
309+
def _apply_aliases(
310+
df: "DataFrame",
311+
cs: List[str],
312+
c_aliases: List[str],
313+
) -> List[Column]:
314+
return [
315+
df.col(c, _emit_ast=False).alias(c_alias) for c, c_alias in zip(cs, c_aliases)
316+
]
317+
318+
280319
def _alias_if_needed(
281320
df: "DataFrame",
282-
c: str,
321+
cs: List[str],
283322
prefix: Optional[str],
284323
suffix: Optional[str],
285324
common_col_names: List[str],
286-
):
287-
col = df.col(c, _emit_ast=False)
288-
unquoted_col_name = c.strip('"')
289-
if c in common_col_names:
290-
if suffix:
291-
column_case_insensitive = is_snowflake_quoted_id_case_insensitive(c)
292-
suffix_unqouted_case_insensitive = (
293-
is_snowflake_unquoted_suffix_case_insensitive(suffix)
294-
)
295-
return col.alias(
296-
f'"{unquoted_col_name}{suffix.upper()}"'
297-
if column_case_insensitive and suffix_unqouted_case_insensitive
298-
else f'''"{unquoted_col_name}{escape_quotes(suffix.strip('"'))}"'''
299-
)
300-
return col.alias(f'"{prefix}{unquoted_col_name}"')
301-
else:
302-
return col.alias(f'"{unquoted_col_name}"')
325+
) -> List[Column]:
326+
return _apply_aliases(
327+
df, cs, _get_aliased_column_names(df, cs, prefix, suffix, common_col_names)
328+
)
329+
330+
331+
def _populate_expr_to_alias(df: "DataFrame") -> None:
332+
"""
333+
Populate expr_to_alias mapping for a DataFrame's output columns.
334+
This is needed for column lineage tracking when we skip the select() wrapping
335+
optimization in _disambiguate.
336+
"""
337+
for attr in df._output:
338+
# Map each attribute's expr_id to its quoted column name
339+
# This allows later lookups like df["column_name"] to resolve correctly
340+
# Use quote_name() for consistency with analyzer.py Alias handling (line 743, 756)
341+
if attr.expr_id not in df._plan.expr_to_alias:
342+
df._plan.expr_to_alias[attr.expr_id] = quote_name(attr.name)
303343

304344

305345
def _disambiguate(
@@ -328,11 +368,11 @@ def _disambiguate(
328368
for n in lhs_names
329369
if n in set(rhs_names) and n not in normalized_using_columns
330370
]
371+
331372
all_names = [unquote_if_quoted(n) for n in lhs_names + rhs_names]
332373

333-
if common_col_names:
334-
# We use the session of the LHS DataFrame to report this telemetry
335-
lhs._session._conn._telemetry_client.send_alias_in_join_telemetry()
374+
# We use the session of the LHS DataFrame to report this telemetry
375+
lhs._session._conn._telemetry_client.send_alias_in_join_telemetry()
336376

337377
lsuffix = lsuffix or lhs._alias
338378
rsuffix = rsuffix or rhs._alias
@@ -344,25 +384,37 @@ def _disambiguate(
344384
_generate_deterministic_prefix("r", all_names) if not suffix_provided else ""
345385
)
346386

387+
lhs_aliases = _get_aliased_column_names(
388+
lhs,
389+
lhs_names,
390+
lhs_prefix,
391+
lsuffix,
392+
[] if isinstance(join_type, (LeftSemi, LeftAnti)) else common_col_names,
393+
)
394+
rhs_aliases = _get_aliased_column_names(
395+
rhs, rhs_names, rhs_prefix, rsuffix, common_col_names
396+
)
397+
if all(
398+
l_name == l_aliased for l_name, l_aliased in zip(lhs_names, lhs_aliases)
399+
) and all(r_name == r_aliased for r_name, r_aliased in zip(rhs_names, rhs_aliases)):
400+
# Optimization: No column name conflicts, so we can skip aliasing and the select() wrapping.
401+
# But we still need to populate expr_to_alias for column lineage tracking,
402+
# so that df["column_name"] can resolve correctly after the join.
403+
# This is identified by the test case
404+
# tests/integ/scala/test_dataframe_join_suite.py::test_name_alias_on_multiple_join.
405+
# Note that we must also ensure none of the column names have changed due to internal quote stripping:
406+
# see tests/integ/compiler/test_query_generator.py::test_disambiguate_skips_quoted_alias for details.
407+
_populate_expr_to_alias(lhs)
408+
_populate_expr_to_alias(rhs)
409+
return lhs, rhs
410+
347411
lhs_remapped = lhs.select(
348-
[
349-
_alias_if_needed(
350-
lhs,
351-
name,
352-
lhs_prefix,
353-
lsuffix,
354-
[] if isinstance(join_type, (LeftSemi, LeftAnti)) else common_col_names,
355-
)
356-
for name in lhs_names
357-
],
412+
_apply_aliases(lhs, lhs_names, lhs_aliases),
358413
_emit_ast=False,
359414
)
360415

361416
rhs_remapped = rhs.select(
362-
[
363-
_alias_if_needed(rhs, name, rhs_prefix, rsuffix, common_col_names)
364-
for name in rhs_names
365-
],
417+
_apply_aliases(rhs, rhs_names, rhs_aliases),
366418
_emit_ast=False,
367419
)
368420
return lhs_remapped, rhs_remapped
@@ -5113,16 +5165,13 @@ def _lateral(
51135165
)
51145166
prefix = _generate_prefix("a")
51155167
child = self.select(
5116-
[
5117-
_alias_if_needed(
5118-
self,
5119-
attr.name,
5120-
prefix,
5121-
suffix=None,
5122-
common_col_names=common_col_names,
5123-
)
5124-
for attr in self._output
5125-
],
5168+
_alias_if_needed(
5169+
self,
5170+
[attr.name for attr in self._output],
5171+
prefix,
5172+
suffix=None,
5173+
common_col_names=common_col_names,
5174+
),
51265175
_emit_ast=False,
51275176
)
51285177
return DataFrame(

src/snowflake/snowpark/mock/_analyzer.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -654,12 +654,18 @@ def unary_expression_extractor(
654654
if v == expr.child.name:
655655
df_alias_dict[k] = quoted_name
656656

657-
alias_exp = alias_expression(
658-
self.analyze(
659-
expr.child, df_aliased_col_name_to_real_col_name, parse_local_name
660-
),
661-
quoted_name,
657+
origin = self.analyze(
658+
expr.child, df_aliased_col_name_to_real_col_name, parse_local_name
662659
)
660+
if (
661+
isinstance(expr.child, (Attribute, UnresolvedAttribute))
662+
and origin == quoted_name
663+
):
664+
# If the column name matches the target of the alias (`quoted_name`),
665+
# we can directly emit the column name without an AS clause.
666+
return origin
667+
668+
alias_exp = alias_expression(origin, quoted_name)
663669

664670
expr_str = alias_exp if keep_alias else expr.name or keep_alias
665671
expr_str = expr_str.upper() if parse_local_name else expr_str

src/snowflake/snowpark/mock/_plan.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1495,8 +1495,8 @@ def aggregate_by_groups(cur_group: TableEmulator):
14951495
if isinstance(source_plan, Project):
14961496
return TableEmulator(ColumnEmulator(col) for col in source_plan.project_list)
14971497
if isinstance(source_plan, Join):
1498-
L_expr_to_alias = {}
1499-
R_expr_to_alias = {}
1498+
L_expr_to_alias = dict(getattr(source_plan.left, "expr_to_alias", None) or {})
1499+
R_expr_to_alias = dict(getattr(source_plan.right, "expr_to_alias", None) or {})
15001500
left = execute_mock_plan(source_plan.left, L_expr_to_alias).reset_index(
15011501
drop=True
15021502
)

tests/integ/compiler/test_query_generator.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@
55
import copy
66
from typing import List
77
from unittest.mock import patch
8+
import tempfile
9+
import os
10+
import re
811

12+
import pandas
913
import pytest
1014

1115
import snowflake.snowpark._internal.analyzer.snowflake_plan as snowflake_plan
@@ -551,6 +555,102 @@ def test_select_alias(session):
551555
check_generated_plan_queries(df2._plan)
552556

553557

558+
def test_select_alias_identity(session):
559+
df = session.create_dataframe([[1, 2], [3, 4]], schema=["a", "b"])
560+
df_res = df.select("a", col("b").as_("b"))
561+
if session.sql_simplifier_enabled:
562+
# Because "b" was aliased to itself, the emitted SQL should drop the AS clause.
563+
ref_query = 'SELECT "A", "B" FROM ( SELECT $1 AS "A", $2 AS "B" FROM VALUES (1 :: INT, 2 :: INT), (3 :: INT, 4 :: INT))'
564+
else:
565+
ref_query = 'SELECT "A", "B" FROM (SELECT "A", "B" FROM (SELECT $1 AS "A", $2 AS "B" FROM VALUES (1 :: INT, 2 :: INT), (3 :: INT, 4 :: INT)))'
566+
assert Utils.normalize_sql(df_res.queries["queries"][-1]) == Utils.normalize_sql(
567+
ref_query
568+
)
569+
570+
571+
def test_disambiguate_skips_quoted_alias(session):
572+
# SNOW-3176017: This tests a previous regression in a SnowML pipeline where alias optimization
573+
# incorrectly removed an alias from """col_0""" (triple-quoted in SQL) to "col_0" (single-quoted).
574+
# Due to differences in code paths for generating select statements, this bug is only apparent with
575+
# triple-quoted identifiers created from a file read operation, and not from a direct `.project` call.
576+
session_stage = session.get_session_stage()
577+
data = [[0, 1, 2], [3, 4, 5]]
578+
pandas_df = pandas.DataFrame(data, columns=["ID", '"COL_0"', '"COL_1"'])
579+
stage_filename = f"{session_stage}/disambiguate_test.parquet"
580+
with tempfile.TemporaryDirectory() as temp_dir:
581+
local_path = os.path.join(temp_dir, "disambiguate_test.parquet")
582+
pandas_df.to_parquet(local_path)
583+
Utils.upload_to_stage(session, stage_filename, local_path, compress=False)
584+
df1 = session.read.parquet(stage_filename)
585+
df2 = session.create_dataframe(data, schema=["ID", "A", "B"])
586+
df_res = df1.join(df2, on=["ID"])[['"COL_0"', '"COL_1"']]
587+
# TODO run with sql simplifier disabled
588+
actual_query = re.sub(
589+
r'@"[\d\w\_]+"\."[\d\w\_]+"\.',
590+
'@"DB_SCHEMA_NAME".',
591+
re.sub(
592+
r"SNOWPARK_TEMP_(STAGE|FILE_FORMAT)_[\d\w]+",
593+
"SNOWPARK_TEMP_NAME",
594+
df_res.queries["queries"][-1],
595+
),
596+
)
597+
if session.sql_simplifier_enabled:
598+
rhs_creation_sql = """
599+
SELECT
600+
"ID",
601+
"A",
602+
"B"
603+
FROM (
604+
SELECT $1 AS "ID", $2 AS "A", $3 AS "B" FROM VALUES (0 :: INT, 1 :: INT, 2 :: INT), (3 :: INT, 4 :: INT, 5 :: INT)
605+
)
606+
"""
607+
else:
608+
rhs_creation_sql = """
609+
SELECT
610+
"ID",
611+
"A",
612+
"B"
613+
FROM (
614+
SELECT
615+
"ID",
616+
"A",
617+
"B"
618+
FROM (
619+
SELECT $1 AS "ID", $2 AS "A", $3 AS "B" FROM VALUES (0 :: INT, 1 :: INT, 2 :: INT), (3 :: INT, 4 :: INT, 5 :: INT)
620+
)
621+
)
622+
"""
623+
624+
ref_query = f'''
625+
SELECT
626+
"COL_0",
627+
"COL_1"
628+
FROM (
629+
SELECT *
630+
FROM (
631+
(
632+
SELECT
633+
"ID",
634+
"""COL_0""" AS "COL_0",
635+
"""COL_1""" AS "COL_1"
636+
FROM (
637+
SELECT $1:"ID"::NUMBER(38, 0) AS "ID", $1:"""COL_0"""::NUMBER(38, 0) AS """COL_0""", $1:"""COL_1"""::NUMBER(38, 0) AS """COL_1""" FROM @"DB_SCHEMA_NAME".SNOWPARK_TEMP_NAME/disambiguate_test.parquet( FILE_FORMAT => 'SNOWPARK_TEMP_NAME')
638+
)
639+
) AS SNOWPARK_LEFT
640+
INNER JOIN
641+
(
642+
{rhs_creation_sql}
643+
) AS SNOWPARK_RIGHT
644+
USING (ID)
645+
)
646+
)
647+
'''
648+
assert Utils.normalize_sql(actual_query) == Utils.normalize_sql(ref_query)
649+
# Ensure the DF can be materialized without error
650+
materialized = df_res.to_pandas()
651+
assert list(materialized.columns) == ["COL_0", "COL_1"]
652+
653+
554654
def test_nullable_is_false_dataframe(session):
555655
from snowflake.snowpark._internal.analyzer.analyzer import ARRAY_BIND_THRESHOLD
556656

0 commit comments

Comments (0)