Skip to content
Draft
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pydough/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"get_logger",
"init_pydough_context",
"parse_json_metadata_from_file",
"range_collection",
"to_df",
"to_sql",
]
Expand All @@ -22,6 +23,7 @@
from .logger import get_logger
from .metadata import parse_json_metadata_from_file
from .unqualified import display_raw, from_string, init_pydough_context
from .user_collections.user_collection_apis import range_collection

# Create a default session for the user to interact with.
# In most situations users will just use this session and
Expand Down
3 changes: 2 additions & 1 deletion pydough/conversion/agg_removal.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
CallExpression,
EmptySingleton,
Filter,
GeneratedTable,
Join,
JoinType,
Limit,
Expand Down Expand Up @@ -276,7 +277,7 @@ def aggregation_uniqueness_helper(
)
return node, final_uniqueness
# Empty singletons don't have uniqueness information.
case EmptySingleton():
case EmptySingleton() | GeneratedTable():
return node, set()
case _:
raise NotImplementedError(
Expand Down
3 changes: 2 additions & 1 deletion pydough/conversion/filter_pushdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
ColumnReference,
EmptySingleton,
Filter,
GeneratedTable,
Join,
JoinType,
Limit,
Expand Down Expand Up @@ -143,7 +144,7 @@ def push_filters(
# be transposed beneath a limit without changing its output.
node._input = push_filters(node.input, set())
return build_filter(node, filters)
case EmptySingleton() | Scan():
case EmptySingleton() | Scan() | GeneratedTable():
# For remaining nodes, materialize all of the remaining filters.
return build_filter(node, filters)
case _:
Expand Down
31 changes: 31 additions & 0 deletions pydough/conversion/hybrid_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"HybridPartition",
"HybridPartitionChild",
"HybridRoot",
"HybridUserGeneratedCollection",
]


Expand All @@ -27,6 +28,9 @@
ColumnProperty,
PyDoughExpressionQDAG,
)
from pydough.qdag.collections.user_collection_qdag import (
PyDoughUserGeneratedCollectionQDag,
)

from .hybrid_connection import HybridConnection
from .hybrid_expressions import (
Expand Down Expand Up @@ -483,3 +487,30 @@ def __repr__(self):

def search_term_definition(self, name: str) -> HybridExpr | None:
return self.predecessor.search_term_definition(name)


class HybridUserGeneratedCollection(HybridOperation):
    """
    Class for HybridOperation corresponding to a user-generated collection.
    """

    def __init__(self, user_collection: PyDoughUserGeneratedCollectionQDag):
        """
        Args:
            `user_collection`: the QDAG node for the user-generated collection.
        """
        self._user_collection: PyDoughUserGeneratedCollectionQDag = user_collection
        # Each (name, type) pair of the underlying collection becomes a term
        # that is a reference expression with the same name and type.
        terms: dict[str, HybridExpr] = {
            name: HybridRefExpr(name, typ)
            for name, typ in user_collection.collection.column_names_and_types
        }
        super().__init__(terms, {}, [], [])

    @property
    def user_collection(self) -> PyDoughUserGeneratedCollectionQDag:
        """
        The user-generated collection that this hybrid operation represents.
        """
        return self._user_collection

    def __repr__(self):
        return f"USER_GEN_COLLECTION[{self.user_collection.name}]"
Comment on lines +519 to +520
Copy link
Contributor

@knassre-bodo knassre-bodo Jul 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name doesn't matter here; let's stringify using `self.user_collection.to_string()` instead.

26 changes: 26 additions & 0 deletions pydough/conversion/hybrid_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
Where,
WindowCall,
)
from pydough.qdag.collections.user_collection_qdag import (
PyDoughUserGeneratedCollectionQDag,
)
from pydough.types import BooleanType, NumericType

from .hybrid_connection import ConnectionType, HybridConnection
Expand All @@ -68,6 +71,7 @@
HybridPartition,
HybridPartitionChild,
HybridRoot,
HybridUserGeneratedCollection,
)
from .hybrid_syncretizer import HybridSyncretizer
from .hybrid_tree import HybridTree
Expand Down Expand Up @@ -1339,6 +1343,9 @@ def define_root_link(
case HybridRoot():
# A root does not need to be joined to its parent
join_keys = []
case HybridUserGeneratedCollection():
# A user-generated collection does not need to be joined to its parent
join_keys = []
case _:
raise NotImplementedError(f"{operation.__class__.__name__}")
if join_keys is not None:
Expand Down Expand Up @@ -1624,12 +1631,31 @@ def make_hybrid_tree(
successor_hybrid = HybridTree(
HybridRoot(), node.ancestral_mapping
)
# HA: TODO: handle the case where the child access is a
# user-generated collection.
case HybridUserGeneratedCollection():
raise NotImplementedError(
"User-generated collections are not supported in child access"
)
case _:
raise NotImplementedError(
f"{node.__class__.__name__} (child is {node.child_access.__class__.__name__})"
)
self.define_root_link(parent, successor_hybrid, is_aggregate)
return successor_hybrid
case PyDoughUserGeneratedCollectionQDag():
# A user-generated collection is a special case of a collection
# access that is not a sub-collection, but rather a user-defined
# collection that is defined in the PyDough user collections.
hybrid_collection = HybridUserGeneratedCollection(node)
# Create a new hybrid tree for the user-generated collection.
successor_hybrid = HybridTree(hybrid_collection, node.ancestral_mapping)
hybrid = self.make_hybrid_tree(
node.ancestor_context, parent, is_aggregate
)
hybrid.add_successor(successor_hybrid)
return successor_hybrid

case _:
raise NotImplementedError(f"{node.__class__.__name__}")

Expand Down
9 changes: 9 additions & 0 deletions pydough/conversion/hybrid_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
HybridPartition,
HybridPartitionChild,
HybridRoot,
HybridUserGeneratedCollection,
)


Expand Down Expand Up @@ -676,6 +677,9 @@ def always_exists(self) -> bool:
# Stepping into a partition child always has a matching data
# record for each parent, by definition.
pass
case HybridUserGeneratedCollection():
# User-generated collections are always guaranteed to exist.
pass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, they aren't (what if the range is empty?). This is why we need the always-exists field for the generated collection.

case _:
raise NotImplementedError(
f"Invalid start of pipeline: {start_operation.__class__.__name__}"
Expand Down Expand Up @@ -726,6 +730,11 @@ def is_singular(self) -> bool:
case HybridChildPullUp():
if not self.children[self.pipeline[0].child_idx].subtree.is_singular():
return False
# HA TODO: confirm whether this is right.
case HybridUserGeneratedCollection():
# User-generated collections are always guaranteed to be
# singular.
pass
Copy link
Contributor

@knassre-bodo knassre-bodo Jul 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Definitely not. It is only singular if we can guarantee it has <=1 rows.

case _:
return False
# The current level is fine, so check any levels above it next.
Expand Down
43 changes: 41 additions & 2 deletions pydough/conversion/relational_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
EmptySingleton,
ExpressionSortInfo,
Filter,
GeneratedTable,
Join,
JoinCardinality,
JoinType,
Expand All @@ -49,6 +50,7 @@
WindowCallExpression,
)
from pydough.types import BooleanType, NumericType, UnknownType
from pydough.types.pydough_type import PyDoughType

from .agg_removal import remove_redundant_aggs
from .agg_split import split_partial_aggregates
Expand Down Expand Up @@ -79,6 +81,7 @@
HybridPartition,
HybridPartitionChild,
HybridRoot,
HybridUserGeneratedCollection,
)
from .hybrid_translator import HybridTranslator
from .hybrid_tree import HybridTree
Expand Down Expand Up @@ -1166,6 +1169,29 @@ def translate_hybridroot(self, context: TranslationOutput) -> TranslationOutput:
new_expressions[shifted_expr] = column_ref
return TranslationOutput(context.relational_node, new_expressions)

def build_user_generated_table(
    self, node: HybridUserGeneratedCollection
) -> TranslationOutput:
    """
    Builds a user-generated table from the given hybrid user-generated
    collection.

    Args:
        `node`: The user-generated collection node to translate.

    Returns:
        The translated output payload: a `GeneratedTable` built from the
        node's underlying collection, along with a mapping from a hybrid
        reference of each of its columns to a column reference with the
        same name and type.
    """
    # Go through the public property rather than the private attribute.
    collection = node.user_collection.collection
    out_columns: dict[HybridExpr, ColumnReference] = {
        HybridRefExpr(column_name, column_type): ColumnReference(
            column_name, column_type
        )
        for column_name, column_type in collection.column_names_and_types
    }
    return TranslationOutput(GeneratedTable(collection), out_columns)

def rel_translation(
self,
hybrid: HybridTree,
Expand Down Expand Up @@ -1289,6 +1315,8 @@ def rel_translation(
case HybridRoot():
assert context is not None, "Malformed HybridTree pattern."
result = self.translate_hybridroot(context)
case HybridUserGeneratedCollection():
result = self.build_user_generated_table(operation)
case _:
raise NotImplementedError(
f"TODO: support relational conversion on {operation.__class__.__name__}"
Expand All @@ -1304,16 +1332,27 @@ def preprocess_root(
"""
Transforms the final PyDough collection by appending it with an extra
CALCULATE containing all of the columns that are output.
Args:
`node`: the PyDough QDAG collection node to be translated.
`output_cols`: a list of tuples in the form `(alias, column)`
describing every column that should be in the output, in the order
they should appear, and the alias they should be given. If None, uses
the most recent CALCULATE in the node to determine the columns.
Returns:
The PyDoughCollectionQDAG with an additional CALCULATE at the end
that contains all of the columns that should be in the output.
"""
# Fetch all of the expressions that should be kept in the final output
final_terms: list[tuple[str, PyDoughExpressionQDAG]] = []
if output_cols is None:
for name in node.calc_terms:
final_terms.append((name, Reference(node, name)))
name_typ: PyDoughType = node.get_expr(name).pydough_type
final_terms.append((name, Reference(node, name, name_typ)))
final_terms.sort(key=lambda term: node.get_expression_position(term[0]))
else:
for _, column in output_cols:
final_terms.append((column, Reference(node, column)))
column_typ: PyDoughType = node.get_expr(column).pydough_type
final_terms.append((column, Reference(node, column, column_typ)))
children: list[PyDoughCollectionQDAG] = []
final_calc: Calculate = Calculate(node, children).with_terms(final_terms)
return final_calc
Expand Down
17 changes: 13 additions & 4 deletions pydough/qdag/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ table_collection = builder.build_child_access("Nations", global_context_node)

# Build a reference node
# Equivalent PyDough code: `TPCH.Nations.name`
reference_node = builder.build_reference(table_collection, "name")
ref_name = "name"
pydough_type = table_collection.get_expr(ref_name).pydough_type
reference_node = builder.build_reference(table_collection, ref_name, pydough_type)

# Build an expression function call node
# Equivalent PyDough code: `LOWER(TPCH.Nations.name)`
Expand Down Expand Up @@ -99,7 +101,10 @@ regions_collection = builder.build_child_access("Regions", global_context_node)
# Access nations sub-collection
nations_sub_collection = builder.build_child_access("nations", regions_collection)
# Create WHERE(key == 4) condition
key_ref = builder.build_reference(nations_sub_collection, "key")

ref_name = "key"
pydough_type = nations_sub_collection.get_expr(ref_name).pydough_type
key_ref = builder.build_reference(nations_sub_collection, ref_name, pydough_type)
literal_4 = builder.build_literal(4, NumericType())
condition = builder.build_expression_function_call("EQU", [key_ref, literal_4])
# Build WHERE node with condition
Expand All @@ -108,7 +113,9 @@ where_node = where_node.with_condition(condition)
# Create SINGULAR node from filtered result
singular_node = builder.build_singular(where_node)
# Build reference node for name
reference_node = builder.build_reference(singular_node, "name")
ref_name = "name"
pydough_type = singular_node.get_expr(ref_name).pydough_type
reference_node = builder.build_reference(singular_node, ref_name, pydough_type)
# Build CALCULATE node with calculated term
calculate_node = builder.build_calc(regions_collection, [nations_sub_collection])
calculate_node = calculate_node.with_terms([("n_4_nation", reference_node)])
Expand All @@ -130,7 +137,9 @@ top_k_node = top_k_node.with_collation([collation_expression])
# Build a PARTITION BY node
# Equivalent PyDough code: `TPCH.PARTITION(Parts, name="p", by=part_type)`
part_collection = builder.build_child_access("Parts", global_context_node)
partition_key = builder.build_reference(part_collection, "part_type")
ref_name = "part_type"
pydough_type = part_collection.get_expr(ref_name).pydough_type
partition_key = builder.build_reference(part_collection, ref_name, pydough_type)
partition_by_node = builder.build_partition(part_collection, child_collection, "p")
partition_by_node = partition_by_node.with_keys([partition_key])

Expand Down
1 change: 1 addition & 0 deletions pydough/qdag/collections/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"TableCollection",
"TopK",
"Where",
"range_collection",
]

from .augmenting_child_operator import AugmentingChildOperator
Expand Down
3 changes: 2 additions & 1 deletion pydough/qdag/collections/augmenting_child_operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ def get_term(self, term_name: str) -> PyDoughQDAG:
if isinstance(term, ChildAccess):
term = term.clone_with_parent(self)
elif isinstance(term, PyDoughExpressionQDAG):
term = Reference(self.preceding_context, term_name)
typ = self.preceding_context.get_expr(term_name).pydough_type
term = Reference(self.preceding_context, term_name, typ)
return term

@cache
Expand Down
4 changes: 3 additions & 1 deletion pydough/qdag/collections/collection_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,9 @@ def get_term(self, term_name: str) -> PyDoughQDAG:
else:
assert context.ancestor_context is not None
context = context.ancestor_context
return Reference(context, term_name)
return Reference(
context, term_name, context.get_expr(term_name).pydough_type
)

if term_name not in self.all_terms:
raise PyDoughQDAGException(self.name_mismatch_error(term_name))
Expand Down
4 changes: 3 additions & 1 deletion pydough/qdag/collections/partition_child.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ def get_term(self, term_name: str):
else:
assert context.ancestor_context is not None
context = context.ancestor_context
return Reference(context, term_name)
return Reference(
context, term_name, context.get_expr(term_name).pydough_type
)

elif term_name not in self.all_terms:
raise PyDoughQDAGException(self.name_mismatch_error(term_name))
Expand Down
Loading