Skip to content

Commit e204a4a

Browse files
authored
Keep track of reverse-cardinality in Joins for optimizations (#413)
Modifying the cardinality setup for relational join nodes to have a notion of "reverse cardinality", i.e. the cardinality from the perspective of the RHS input with regard to the LHS input. For example, when the left hand side of the join is the tpch CUSTOMER table and the right hand side is the tpch ORDERS table, the cardinality is `PLURAL_FILTER` (since each customer can have 0, 1, or multiple matching orders) but the reverse cardinality is `SINGULAR_ACCESS` (since each order has exactly 1 matching customer). This reverse cardinality can be used for two things right away: - Adjusting the partial aggregation splitting protocol to infer when to push / not push an aggregate into the _right_ hand side based on the reverse cardinality (e.g. if the reverse cardinality is filtering, don't push because the join will actually reduce the number of rows). - Modifying the column pruning protocol for joins, which currently removes the RHS for certain kinds of joins if the RHS columns are unused and the cardinality is `SINGULAR_ACCESS`: we can now do the same to prune the LHS entirely if the reverse cardinality is `SINGULAR_ACCESS` (e.g. in the `CUSTOMER` -> `ORDERS` example, if it is an inner join and every column in `CUSTOMER` is unused, we can just prune the `CUSTOMER` side of the join entirely).
1 parent ccff7af commit e204a4a

File tree

449 files changed

+2438
-2002
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

449 files changed

+2438
-2002
lines changed

demos/metadata/tpch_demo_graph.json

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"name": "regions",
88
"type": "simple table",
99
"table path": "main.REGION",
10-
"unique properties": ["key"],
10+
"unique properties": ["key", "name"],
1111
"properties": [
1212
{
1313
"name": "key",
@@ -41,7 +41,7 @@
4141
"name": "nations",
4242
"type": "simple table",
4343
"table path": "main.NATION",
44-
"unique properties": ["key"],
44+
"unique properties": ["key", "name"],
4545
"properties": [
4646
{
4747
"name": "key",
@@ -83,7 +83,7 @@
8383
"name": "parts",
8484
"type": "simple table",
8585
"table path": "main.PART",
86-
"unique properties": ["key"],
86+
"unique properties": ["key", "name"],
8787
"properties": [
8888
{
8989
"name": "key",
@@ -170,7 +170,7 @@
170170
"name": "suppliers",
171171
"type": "simple table",
172172
"table path": "main.SUPPLIER",
173-
"unique properties": ["key", "name"],
173+
"unique properties": ["key", "name", "phone", "address"],
174174
"properties": [
175175
{
176176
"name": "key",
@@ -527,7 +527,7 @@
527527
"name": "customers",
528528
"type": "simple table",
529529
"table path": "main.CUSTOMER",
530-
"unique properties": ["key", "name"],
530+
"unique properties": ["key", "name", "address"],
531531
"properties": [
532532
{
533533
"name": "key",
@@ -813,6 +813,9 @@
813813
"description": "The orders that a customer has placed, each of which contains one or more line items",
814814
"synonyms": ["transactions", "purchases"]
815815
}
816-
]
817-
}
816+
],
817+
"additional definitions": [],
818+
"verified pydough analysis": [],
819+
"extra semantic info": {}
820+
},
818821
]

pydough/conversion/agg_split.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,7 @@ def attempt_join_aggregate_transpose(
355355
# if joining first will reduce the number of rows that get aggregated.
356356
if join.cardinality.filters:
357357
can_push_left = False
358+
if join.reverse_cardinality.filters:
358359
can_push_right = False
359360

360361
# If any of the aggregations to either side cannot be pushed down, then
@@ -468,6 +469,9 @@ def attempt_join_aggregate_transpose(
468469
)
469470
node.aggregations[count_call_name] = regular_sum
470471
node.columns[count_call_name] = regular_sum
472+
projection_columns[count_call_name] = ColumnReference(
473+
count_call_name, NumericType()
474+
)
471475

472476
# If the node requires projection at the end, create a new Project node on
473477
# top of the top aggregate.

pydough/conversion/filter_pushdown.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ def visit_join(self, join: Join) -> RelationalNode:
186186
# The join type, cardinality, and inputs for the output join node.
187187
join_type: JoinType = join.join_type
188188
cardinality: JoinCardinality = join.cardinality
189+
reverse_cardinality: JoinCardinality = join.reverse_cardinality
189190
new_inputs: list[RelationalNode] = []
190191

191192
# If the join type is LEFT or SEMI but the condition is TRUE, convert it
@@ -240,10 +241,14 @@ def visit_join(self, join: Join) -> RelationalNode:
240241
remaining_filters,
241242
lambda expr: only_references_columns(expr, input_cols[idx]),
242243
)
243-
# Ensure that if any filter is pushed into an input (besides
244-
# the first input) that the join is marked as filtering.
245-
if len(pushable_filters) > 0 and idx > 0:
246-
cardinality = join.cardinality.add_filter()
244+
# Ensure that if any filter is pushed into an input, the
245+
# corresponding join cardinality is updated to reflect that a filter
246+
# has been applied.
247+
if len(pushable_filters) > 0:
248+
if idx == 1:
249+
cardinality = join.cardinality.add_filter()
250+
else:
251+
reverse_cardinality = reverse_cardinality.add_filter()
247252
pushable_filters = {
248253
expr.accept_shuttle(transposer) for expr in pushable_filters
249254
}
@@ -271,6 +276,7 @@ def visit_join(self, join: Join) -> RelationalNode:
271276
else:
272277
new_conjunction.add(join._condition)
273278
cardinality = join.cardinality.add_filter()
279+
reverse_cardinality = join.reverse_cardinality.add_filter()
274280
join._condition = RelationalExpression.form_conjunction(
275281
sorted(new_conjunction, key=repr)
276282
)
@@ -281,6 +287,7 @@ def visit_join(self, join: Join) -> RelationalNode:
281287
new_node = join.copy(inputs=new_inputs)
282288
assert isinstance(new_node, Join)
283289
new_node.cardinality = cardinality
290+
new_node.reverse_cardinality = reverse_cardinality
284291
new_node.join_type = join_type
285292
return build_filter(new_node, remaining_filters)
286293

pydough/conversion/hybrid_connection.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from enum import Enum
1111
from typing import TYPE_CHECKING
1212

13-
from pydough.relational import JoinType
13+
from pydough.relational import JoinCardinality, JoinType
1414

1515
from .hybrid_expressions import (
1616
HybridFunctionExpr,
@@ -313,6 +313,8 @@ class HybridConnection:
313313
child can be defined at (exclusive).
314314
- `aggs`: a mapping of aggregation calls made onto expressions relative to the
315315
context of `subtree`.
316+
- `reverse_cardinality`: the JoinCardinality of the connection from the
317+
perspective of the child subtree back to the parent tree.
316318
"""
317319

318320
parent: "HybridTree"
@@ -349,6 +351,12 @@ class HybridConnection:
349351
expressions defined relative to the child subtree.
350352
"""
351353

354+
reverse_cardinality: JoinCardinality
355+
"""
356+
The JoinCardinality of the connection from the perspective of the child
357+
subtree back to the parent tree.
358+
"""
359+
352360
always_exists: bool | None = None
353361
"""
354362
Whether the connection is guaranteed to have at least one matching

pydough/conversion/hybrid_correlation_extraction.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,9 @@ def attempt_correlation_extraction(
238238
for _, rhs_key in new_equi_filters:
239239
bottom_subtree.agg_keys.append(rhs_key)
240240
connection.always_exists = False
241+
connection.reverse_cardinality = (
242+
connection.reverse_cardinality.add_filter()
243+
)
241244

242245
if len(new_general_filters) > 0:
243246
if bottom_subtree.general_join_condition is not None:
@@ -262,6 +265,9 @@ def attempt_correlation_extraction(
262265
pydop.BAN, new_general_filters, BooleanType()
263266
)
264267
connection.always_exists = False
268+
connection.reverse_cardinality = (
269+
connection.reverse_cardinality.add_filter()
270+
)
265271

266272
# Update the filter condition with the new conjunction of terms
267273
if new_conjunction != conjunction:

pydough/conversion/hybrid_decorrelater.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import copy
1010

1111
import pydough.pydough_operators as pydop
12+
from pydough.relational import JoinCardinality
1213
from pydough.types import BooleanType
1314

1415
from .hybrid_connection import ConnectionType, HybridConnection
@@ -430,6 +431,13 @@ def decorrelate_child(
430431
)
431432
if child.connection_type.is_aggregation or is_faux_agg:
432433
child.subtree.agg_keys = new_agg_keys
434+
435+
# Mark the reverse cardinality as SINGULAR_ACCESS since each record of
436+
# the de-correlated child can only match with one record of the
437+
# original parent due to the join keys being based on the uniqueness
438+
# keys of the original parent.
439+
child.reverse_cardinality = JoinCardinality.SINGULAR_ACCESS
440+
433441
# If the child is such that we don't need to keep rows from the parent
434442
# without a match, replace the parent & its ancestors with a
435443
# HybridPullUp node (and replace any other deleted nodes with no-ops).

pydough/conversion/hybrid_tree.py

Lines changed: 112 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@
1818
from pydough.metadata import (
1919
SubcollectionRelationshipMetadata,
2020
)
21+
from pydough.metadata.properties import ReversiblePropertyMetadata
2122
from pydough.qdag import (
2223
Literal,
2324
SubCollection,
2425
TableCollection,
2526
)
27+
from pydough.relational import JoinCardinality
2628
from pydough.types import BooleanType, NumericType
2729

2830
from .hybrid_connection import ConnectionType, HybridConnection
@@ -579,13 +581,29 @@ def add_child(
579581
# Return the index of the existing child.
580582
return idx
581583

584+
# Infer the cardinality of the join from the perspective of the new
585+
# collection to the existing data.
586+
reverse_cardinality: JoinCardinality = child.infer_root_reverse_cardinality(
587+
self
588+
)
589+
582590
# Create and insert the new child connection.
583591
new_child_idx = len(self.children)
584592
connection: HybridConnection = HybridConnection(
585-
self, child, connection_type, min_steps, max_steps, {}
593+
self,
594+
child,
595+
connection_type,
596+
min_steps,
597+
max_steps,
598+
{},
599+
reverse_cardinality,
586600
)
587601
self._children.append(connection)
588602

603+
# Augment the reverse cardinality if the parent does not always exist.
604+
if (not reverse_cardinality.filters) and (not self.always_exists()):
605+
connection.reverse_cardinality = reverse_cardinality.add_filter()
606+
589607
# If an operation prevents the child's presence from directly
590608
# filtering the current level, update its connection type to be either
591609
# SINGULAR or AGGREGATION, then insert a similar COUNT(*)/PRESENT
@@ -605,6 +623,96 @@ def add_child(
605623
# Return the index of the newly created child.
606624
return new_child_idx
607625

626+
@staticmethod
627+
def infer_metadata_reverse_cardinality(
628+
metadata: SubcollectionRelationshipMetadata,
629+
) -> JoinCardinality:
630+
"""
631+
Infers the cardinality of the reverse of a join (child → parent)
632+
based on the metadata of the reverse-relationship, if one exists.
633+
If no reverse metadata exists, defaults to PLURAL_FILTER (safest assumption)
634+
635+
Args:
636+
`metadata`: the metadata for the sub-collection property mapping
637+
the parent to the child.
638+
639+
Returns:
640+
The join cardinality for the connection from the child back to the
641+
parent, if it can be inferred. Uses `PLURAL_FILTER` as a fallback.
642+
"""
643+
# If there is no reverse, fall back to plural filter (which is the
644+
# safest default assumption).
645+
if (
646+
not isinstance(metadata, ReversiblePropertyMetadata)
647+
or metadata.reverse is None
648+
):
649+
return JoinCardinality.PLURAL_FILTER
650+
651+
# If the reverse property exists, use its properties to
652+
# infer if the reverse cardinality is singular or plural
653+
# and whether a match always exists or not.
654+
cardinality: JoinCardinality
655+
match (metadata.reverse.is_plural, metadata.reverse.always_matches):
656+
case (False, True):
657+
cardinality = JoinCardinality.SINGULAR_ACCESS
658+
case (False, False):
659+
cardinality = JoinCardinality.SINGULAR_FILTER
660+
case (True, True):
661+
cardinality = JoinCardinality.PLURAL_ACCESS
662+
case (True, False):
663+
cardinality = JoinCardinality.PLURAL_FILTER
664+
return cardinality
665+
666+
def infer_root_reverse_cardinality(self, context: "HybridTree") -> JoinCardinality:
667+
"""
668+
Infers the cardinality of the join connecting the root of the hybrid
669+
tree back to its parent context.
670+
671+
Args:
672+
`context`: the parent context that the root of the hybrid tree is
673+
being connected to.
674+
675+
Returns:
676+
The inferred cardinality of the join connecting the root of the
677+
hybrid tree to its parent context.
678+
"""
679+
# Keep traversing upward until we find the root of the current tree.
680+
if self.parent is not None:
681+
return self.parent.infer_root_reverse_cardinality(context)
682+
683+
# Once we find the root, infer the cardinality of the join that would
684+
# connect just this node to the parent context.
685+
# At the root, only this node’s type matters for reverse cardinality.
686+
# Deeper nodes do not affect parent-child match guarantees.
687+
match self.pipeline[0]:
688+
case HybridRoot():
689+
# If the parent of the child is a root, it means a cross join
690+
# is occurring, so the cardinality depends on whether
691+
# the parent context is singular or plural.
692+
return (
693+
JoinCardinality.SINGULAR_ACCESS
694+
if context.is_singular()
695+
else JoinCardinality.PLURAL_ACCESS
696+
)
697+
case HybridCollectionAccess():
698+
# For non sub-collection accesses, use plural access.
699+
# For a sub-collection, infer from the reverse property.
700+
if isinstance(self.pipeline[0].collection, SubCollection):
701+
return self.infer_metadata_reverse_cardinality(
702+
self.pipeline[0].collection.subcollection_property
703+
)
704+
else:
705+
return JoinCardinality.PLURAL_ACCESS
706+
# For partition & partition child, infer from the underlying child.
707+
case HybridPartition():
708+
return self.children[0].subtree.infer_root_reverse_cardinality(context)
709+
case HybridPartitionChild():
710+
return self.pipeline[0].subtree.infer_root_reverse_cardinality(context)
711+
case _:
712+
raise NotImplementedError(
713+
f"Invalid start of pipeline: {self.pipeline[0].__class__.__name__}"
714+
)
715+
608716
def add_successor(self, successor: "HybridTree") -> None:
609717
"""
610718
Marks two hybrid trees in a predecessor-successor relationship.
@@ -723,7 +831,7 @@ def is_singular(self) -> bool:
723831
match self.pipeline[0]:
724832
case HybridCollectionAccess():
725833
if isinstance(self.pipeline[0].collection, TableCollection):
726-
pass
834+
return False
727835
else:
728836
assert isinstance(self.pipeline[0].collection, SubCollection)
729837
meta: SubcollectionRelationshipMetadata = self.pipeline[
@@ -734,6 +842,8 @@ def is_singular(self) -> bool:
734842
case HybridChildPullUp():
735843
if not self.children[self.pipeline[0].child_idx].subtree.is_singular():
736844
return False
845+
case HybridRoot():
846+
pass
737847
case _:
738848
return False
739849
# The current level is fine, so check any levels above it next.

0 commit comments

Comments
 (0)