Skip to content

Commit 5be84e0

Browse files
authored
Adjusting relational optimization pipeline (#422)
Series of (mostly cosmetic) changes to the relational/sql plans generated by adjusting the various IRs, especially the order of relational optimizations. Some of the changes are required to simplify the end-result of #423, and to prevent other regressions to the quality of the final SQL. The changes include: 1. Changing the order of columns in an intermediary `SELECT` clause to have grouping keys _before_ aggregation calls. 2. Moving around the order of the relational optimization pipeline so bubbling & projection pullup happen _very early_, before the first round of filter pushdown, to incentivize getting rid of joins where the RHS is just a values clause to generate scalars. 3. Fixing bugs in projection pullup that were not exposed until (2) was done 4. Making minor improvements to the column bubbler exposed by (2) 5. Removing `@cache` decorators in QDAG when it was discovered some of the test suites are faster without them 6. Adding new `simple_cross` tests
1 parent a594547 commit 5be84e0

File tree

598 files changed

+3264
-3069
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

598 files changed

+3264
-3069
lines changed

pydough/conversion/agg_split.py

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
CallExpression,
1414
ColumnReference,
1515
ColumnReferenceFinder,
16+
ColumnReferenceInputNameRemover,
1617
Join,
1718
JoinType,
1819
LiteralExpression,
@@ -128,7 +129,7 @@ def decompose_aggregations(node: Aggregate, config: PyDoughConfigs) -> Relationa
128129
new_aggregate: Aggregate = Aggregate(node.input, node.keys, aggs)
129130
project_columns: dict[str, RelationalExpression] = {}
130131
for name, expr in node.keys.items():
131-
project_columns[name] = expr
132+
project_columns[name] = ColumnReference(name, expr.data_type)
132133
project_columns.update(
133134
{name: final_agg_columns[name] for name in node.aggregations}
134135
)
@@ -171,6 +172,8 @@ def transpose_aggregate_join(
171172
agg_input_name: str | None = join.default_input_aliases[agg_side]
172173
need_projection: bool = False
173174

175+
finder: ColumnReferenceFinder = ColumnReferenceFinder()
176+
alias_remover: ColumnReferenceInputNameRemover = ColumnReferenceInputNameRemover()
174177
transposer: ExpressionTranspositionShuttle = ExpressionTranspositionShuttle(
175178
join, False
176179
)
@@ -234,19 +237,44 @@ def transpose_aggregate_join(
234237
for ref in side_keys:
235238
input_keys[ref.name] = ref.with_input(None)
236239
transposer.toggle_keep_input_names(True)
237-
for agg_key in node.keys.values():
240+
for agg_key_name, agg_key in node.keys.items():
241+
finder.reset()
238242
transposed_agg_key = agg_key.accept_shuttle(transposer)
239-
assert isinstance(transposed_agg_key, ColumnReference)
240-
if transposed_agg_key.input_name == agg_input_name:
241-
input_keys[transposed_agg_key.name] = transposed_agg_key.with_input(None)
243+
transposed_agg_key.accept(finder)
244+
if {col.input_name for col in finder.get_column_references()} == {
245+
agg_input_name
246+
}:
247+
if isinstance(transposed_agg_key, ColumnReference):
248+
input_keys[transposed_agg_key.name] = transposed_agg_key.accept_shuttle(
249+
alias_remover
250+
)
251+
else:
252+
if agg_key_name in join.columns and (
253+
agg_key_name in input_keys or agg_key_name in input_aggs
254+
):
255+
# An edge case that is theoretically possible but never
256+
# encountered so far, and where the behavior is undefined.
257+
raise NotImplementedError("Undefined behavior")
258+
input_keys[agg_key_name] = transposed_agg_key.accept_shuttle(
259+
alias_remover
260+
)
261+
join.columns[agg_key_name] = ColumnReference(
262+
agg_key_name, agg_key.data_type, agg_input_name
263+
)
264+
node.keys[agg_key_name] = ColumnReference(
265+
agg_key_name, agg_key.data_type
266+
)
267+
projection_columns[agg_key_name] = ColumnReference(
268+
agg_key_name, agg_key.data_type
269+
)
242270

243271
# Push the bottom-aggregate beneath the join
244272
join.inputs[agg_side] = Aggregate(agg_input, input_keys, input_aggs)
245273

246274
# Replace the aggregation above the join with the top
247275
# side of the aggregations
248276
node._aggregations = top_aggs
249-
node._columns = {**node.columns, **top_aggs}
277+
node._columns = {**node.keys, **top_aggs}
250278

251279
return need_projection, count_ref
252280

@@ -276,11 +304,19 @@ def attempt_join_aggregate_transpose(
276304
# push the aggregate down.
277305
return node, True
278306

307+
# Verify that all of the aggregation keys strictly come from one side of the
308+
# join.
309+
finder: ColumnReferenceFinder = ColumnReferenceFinder()
310+
for key_expr in node.keys.values():
311+
finder.reset()
312+
key_expr.accept(finder)
313+
if len({ref.input_name for ref in finder.get_column_references()}) > 1:
314+
return node, True
315+
279316
# Break down the aggregation calls by which input they refer to.
280317
lhs_aggs: list[str] = []
281318
rhs_aggs: list[str] = []
282319
count_aggs: list[str] = []
283-
finder: ColumnReferenceFinder = ColumnReferenceFinder()
284320
transposer: ExpressionTranspositionShuttle = ExpressionTranspositionShuttle(
285321
join, True
286322
)
@@ -341,7 +377,10 @@ def attempt_join_aggregate_transpose(
341377
# If we cannot push the aggregate down into either side, we cannot
342378
# perform the transpose.
343379
return node, True
380+
344381
if need_count_aggs and not (can_push_left and can_push_right):
382+
# If we need to push down COUNT(*) aggregates, but cannot push into
383+
# both sides of the join, we cannot perform the transpose.
345384
return node, True
346385

347386
# Parse the join condition to identify the lists of equi-join keys
@@ -365,7 +404,7 @@ def attempt_join_aggregate_transpose(
365404

366405
# Keep a dictionary for the projection columns that will be used to post-process
367406
# the output of the aggregates, if needed.
368-
projection_columns: dict[str, RelationalExpression] = {**node.keys}
407+
projection_columns: dict[str, RelationalExpression] = {}
369408
need_projection: bool = False
370409

371410
# If we need count aggregates, add one to each side of the join.

pydough/conversion/column_bubbler.py

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
__all__ = ["bubble_column_names"]
88

99

10+
import re
11+
1012
from pydough.relational import (
1113
Aggregate,
1214
CallExpression,
@@ -46,36 +48,42 @@ def name_sort_key(name: str) -> tuple[bool, bool, str]:
4648
)
4749

4850

49-
def generate_agg_name(agg_expr: CallExpression) -> str | None:
51+
def generate_cleaner_names(expr: RelationalExpression, current_name: str) -> list[str]:
5052
"""
51-
Generates a more readable name for an aggregation expression based on its
52-
function name and input column, if applicable. The two patterns of name
53-
generation are:
53+
Generates more readable names for an expression, if applicable.
54+
The patterns of name generation are:
5455
55-
- If the aggregation has a single input that is a column reference, the
56+
- If a function has a single input that is a column reference, the
5657
name is generated as `<function_name>_<column_name>`. For example,
5758
`SUM(sales)` would become `sum_sales`, and `AVG(num_cars_owned)`
5859
would become `avg_num_cars_owned`.
59-
- If the aggregation is a `COUNT` with no inputs, the name is simply
60+
- If an aggregation is a `COUNT` with no inputs, the name is simply
6061
`n_rows`, indicating the number of rows counted.
62+
- If the current name is in the form `name_idx`, try suggesting just `name`.
6163
62-
If neither of these conditions are met, the function returns `None`.
64+
If none of these conditions are met, the function returns an empty list.
6365
6466
Args:
65-
`agg_expr`: The function call expression for which to generate a name,
66-
which is presumed to be an aggregation call.
67+
`expr`: The function call expression for which to generate
68+
alternative names.
69+
`current_name`: The current name of the expression.
6770
6871
Returns:
69-
A string representing the generated name, or `None` if no suitable
70-
name can be generated based on the provided conditions.
72+
A list of strings representing the candidate generated names.
7173
"""
72-
if len(agg_expr.inputs) == 1:
73-
input_expr = agg_expr.inputs[0]
74-
if isinstance(input_expr, ColumnReference):
75-
return f"{agg_expr.op.function_name.lower()}_{input_expr.name}"
76-
if len(agg_expr.inputs) == 0 and agg_expr.op.function_name.lower() == "count":
77-
return "n_rows"
78-
return None
74+
result: list[str] = []
75+
if isinstance(expr, CallExpression):
76+
if len(expr.inputs) == 1:
77+
input_expr = expr.inputs[0]
78+
if isinstance(input_expr, ColumnReference):
79+
result.append(f"{expr.op.function_name.lower()}_{input_expr.name}")
80+
if len(expr.inputs) == 0 and expr.op.function_name.lower() == "count":
81+
result.append("n_rows")
82+
83+
if not (current_name.startswith("agg") or current_name.startswith("expr")):
84+
if re.match(r"^(.*)_[0-9]+$", current_name):
85+
result.append(re.findall(r"^(.*)_[0-9]+$", current_name)[0])
86+
return result
7987

8088

8189
def run_column_bubbling(
@@ -160,6 +168,17 @@ def run_column_bubbling(
160168
new_ref = remapping[new_ref]
161169
name = new_expr.name
162170
used_names.add(name)
171+
# Try the same thing with generated alternative names
172+
else:
173+
for alt_name in generate_cleaner_names(new_expr, name):
174+
if alt_name not in used_names:
175+
remapping[new_ref] = ColumnReference(
176+
alt_name, new_expr.data_type
177+
)
178+
new_ref = remapping[new_ref]
179+
name = alt_name
180+
used_names.add(name)
181+
break
163182
aliases[new_expr] = new_ref
164183
output_columns[name] = new_expr
165184
# For limit, also transform the orderings if they exist.
@@ -218,14 +237,14 @@ def run_column_bubbling(
218237
# Special case for aggregations: if the existing name is
219238
# bad, try to replace it with a better name based on the
220239
# function name and input column, if applicable.
221-
if name.startswith("agg") or name.startswith("expr"):
222-
alt_name: str | None = generate_agg_name(new_expr)
223-
if alt_name is not None and alt_name not in used_names:
240+
for alt_name in generate_cleaner_names(new_expr, name):
241+
if alt_name not in used_names:
224242
used_names.add(alt_name)
225243
alt_ref = ColumnReference(alt_name, call_expr.data_type)
226244
remapping[new_ref] = alt_ref
227245
new_ref = alt_ref
228246
name = alt_name
247+
break
229248
aliases[new_expr] = new_ref
230249
new_aggs[name] = new_expr
231250
return Aggregate(new_input, new_keys, new_aggs), remapping

pydough/conversion/projection_pullup.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,10 @@ def widen_columns(
5555
# to the calling site.
5656
substitutions: dict[RelationalExpression, RelationalExpression] = {}
5757

58-
# Mapping of every expression in the node's columns to a reference to the
59-
# column of the node that points to it. This is used to keep track of which
60-
# expressions are already present in the node's columns versus the ones that
61-
# should be added to un-prune the node.
58+
# Mapping of every expression in the input nodes columns to a reference to
59+
# the column of the node that points to it. This is used to keep track of
60+
# which expressions are already present in the node's columns versus the
61+
# ones that should be added to un-prune the node.
6262
existing_vals: dict[RelationalExpression, RelationalExpression] = {
6363
expr: ColumnReference(name, expr.data_type)
6464
for name, expr in node.columns.items()
@@ -71,28 +71,25 @@ def widen_columns(
7171
input_alias: str | None = node.default_input_aliases[input_idx]
7272
input_node: RelationalNode = node.inputs[input_idx]
7373
for name, expr in input_node.columns.items():
74-
# If the current node is a Join, add input names to the expression.
75-
if isinstance(node, Join):
76-
expr = add_input_name(expr, input_alias)
77-
ref_expr: ColumnReference = ColumnReference(
74+
ref_expr: RelationalExpression = ColumnReference(
7875
name, expr.data_type, input_name=input_alias
7976
)
77+
8078
# If the expression is not already in the node's columns, then
8179
# inject it so the node can use it later if a pull-up occurs that
8280
# would need to reference this expression.
83-
if expr not in existing_vals:
81+
if ref_expr not in existing_vals:
8482
new_name: str = name
8583
idx: int = 0
8684
while new_name in node.columns:
8785
idx += 1
8886
new_name = f"{name}_{idx}"
8987
new_ref: ColumnReference = ColumnReference(new_name, expr.data_type)
9088
node.columns[new_name] = ref_expr
91-
existing_vals[expr] = ref_expr
92-
if ref_expr != new_ref:
93-
substitutions[ref_expr] = new_ref
94-
elif ref_expr != existing_vals[expr]:
95-
substitutions[ref_expr] = existing_vals[expr]
89+
existing_vals[ref_expr] = new_ref
90+
substitutions[ref_expr] = new_ref
91+
else:
92+
substitutions[ref_expr] = existing_vals[ref_expr]
9693

9794
# Return the substitution mapping
9895
return substitutions

pydough/conversion/relational_converter.py

Lines changed: 44 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1443,44 +1443,59 @@ def optimize_relational_tree(
14431443
The optimized relational root.
14441444
"""
14451445

1446-
# Step 0: prune unused columns. This is done early to remove as many dead
1446+
# Start by pruning unused columns. This is done early to remove as many dead
14471447
# names as possible so that steps that require generating column names can
14481448
# use nicer names instead of generating nastier ones to avoid collisions.
14491449
# It also speeds up all subsequent steps by reducing the total number of
14501450
# objects inside the plan.
1451-
root = ColumnPruner().prune_unused_columns(root)
1451+
pruner: ColumnPruner = ColumnPruner()
1452+
root = pruner.prune_unused_columns(root)
1453+
1454+
# Bubble up names from the leaf nodes to further encourage simpler naming
1455+
# without aliases, and also to delete duplicate columns where possible.
1456+
# This is done early to maximize the chances that a nicer name will be used
1457+
# for aggregations before projection pullup eliminates many of those names
1458+
# by pulling the aggregated expression inputs into the aggregate call.
1459+
root = bubble_column_names(root)
1460+
1461+
# Run projection pullup to move projections as far up the tree as possible.
1462+
# This is done as soon as possible to make joins redundant if they only
1463+
# exist to compute a scalar projection and then link it with the data.
1464+
# print()
1465+
# print(root.to_tree_string())
1466+
root = confirm_root(pullup_projections(root))
1467+
# print()
1468+
# print(root.to_tree_string())
14521469

1453-
# Step 1: push filters down as far as possible
1470+
# Push filters down as far as possible
14541471
root = confirm_root(push_filters(root, configs))
14551472

1456-
# Step 2: merge adjacent projections, unless it would result in excessive
1457-
# duplicate subexpression computations.
1473+
# Merge adjacent projections, unless it would result in excessive duplicate
1474+
# subexpression computations.
14581475
root = confirm_root(merge_projects(root))
14591476

1460-
# Step 3: split aggregations on top of joins so part of the aggregate
1461-
# happens underneath the join.
1477+
# Split aggregations on top of joins so part of the aggregate happens
1478+
# underneath the join.
14621479
root = confirm_root(split_partial_aggregates(root, configs))
14631480

1464-
# Step 4: delete aggregations that are inferred to be redundant due to
1465-
# operating on already unique data.
1481+
# Delete aggregations that are inferred to be redundant due to operating on
1482+
# already unique data.
14661483
root = remove_redundant_aggs(root)
14671484

1468-
# Step 5: re-run projection merging since the removal of redundant
1469-
# aggregations may have created redundant projections that can be deleted.
1485+
# Re-run projection merging since the removal of redundant aggregations may
1486+
# have created redundant projections that can be deleted.
14701487
root = confirm_root(merge_projects(root))
14711488

1472-
# Step 6: re-run column pruning after the various steps, which may have
1473-
# rendered more columns unused. This is done befre the next step to remove
1474-
# as many column names as possible so the column bubbling step can try to
1475-
# use nicer names without worrying about collisions.
1476-
root = ColumnPruner().prune_unused_columns(root)
1489+
# Re-run column pruning after the various steps, which may have rendered
1490+
# more columns unused. This is done before the next step to remove as many
1491+
# column names as possible so the column bubbling step can try to use nicer
1492+
# names without worrying about collisions.
1493+
root = pruner.prune_unused_columns(root)
14771494

1478-
# Step 7: bubble up names from the leaf nodes to further encourage simpler
1479-
# naming without aliases, and also to delete duplicate columns where
1480-
# possible.
1495+
# Re-run column bubbling now that the columns have been pruned again.
14811496
root = bubble_column_names(root)
14821497

1483-
# Step 8: the following pipeline twice:
1498+
# Run the following pipeline twice:
14841499
# A: projection pullup
14851500
# B: expression simplification
14861501
# C: filter pushdown
@@ -1494,21 +1509,20 @@ def optimize_relational_tree(
14941509
root = confirm_root(pullup_projections(root))
14951510
simplify_expressions(root, configs, additional_shuttles)
14961511
root = confirm_root(push_filters(root, configs))
1497-
root = ColumnPruner().prune_unused_columns(root)
1512+
root = pruner.prune_unused_columns(root)
14981513

1499-
# Step 9: re-run projection merging, without pushing into joins. This
1500-
# will allow some redundant projections created by pullup to be removed
1501-
# entirely.
1514+
# Re-run projection merging, without pushing into joins. This will allow
1515+
# some redundant projections created by pullup to be removed entirely.
15021516
root = confirm_root(merge_projects(root, push_into_joins=False))
15031517

1504-
# Step 10: re-run column bubbling to further simplify the final names of
1505-
# columns in the output now that more columns have been pruned, and delete
1506-
# any new duplicate columns that were created during the pullup step.
1518+
# Re-run column bubbling to further simplify the final names of columns in
1519+
# the output now that more columns have been pruned, and delete any new
1520+
# duplicate columns that were created during the pullup step.
15071521
root = bubble_column_names(root)
15081522

1509-
# Step 11: re-run column pruning one last time to remove any columns that
1510-
# are no longer used after the final round of transformations.
1511-
root = ColumnPruner().prune_unused_columns(root)
1523+
# Re-run column pruning one last time to remove any columns that are no
1524+
# longer used after the final round of transformations.
1525+
root = pruner.prune_unused_columns(root)
15121526

15131527
return root
15141528

0 commit comments

Comments
 (0)