Skip to content

Commit d181a30

Browse files
authored
Add physical and logical plan conversion to and from protobuf (#892)
* Add physical and logical plan conversion to and from protobuf
* Add wrappers for LogicalPlan and ExecutionPlan
* Add unit tests for to_proto and from_proto for logical and physical plans
1 parent 976b700 commit d181a30

File tree

11 files changed

+398
-158
lines changed

11 files changed

+398
-158
lines changed

Cargo.lock

Lines changed: 117 additions & 134 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"]
4040
arrow = { version = "53", features = ["pyarrow"] }
4141
datafusion = { version = "42.0.0", features = ["pyarrow", "avro", "unicode_expressions"] }
4242
datafusion-substrait = { version = "42.0.0", optional = true }
43+
datafusion-proto = { version = "42.0.0" }
4344
prost = "0.13" # keep in line with `datafusion-substrait`
4445
prost-types = "0.13" # keep in line with `datafusion-substrait`
4546
uuid = { version = "1.9", features = ["v4"] }

python/datafusion/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
from .catalog import Catalog, Database, Table
3737

3838
# The following imports are okay to remain as opaque to the user.
39-
from ._internal import Config, LogicalPlan, ExecutionPlan, runtime
39+
from ._internal import Config, runtime
4040

4141
from .record_batch import RecordBatchStream, RecordBatch
4242

@@ -53,6 +53,8 @@
5353
WindowFrame,
5454
)
5555

56+
from .plan import LogicalPlan, ExecutionPlan
57+
5658
from . import functions, object_store, substrait
5759

5860
__version__ = importlib_metadata.version(__name__)

python/datafusion/context.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
from ._internal import RuntimeConfig as RuntimeConfigInternal
2424
from ._internal import SQLOptions as SQLOptionsInternal
2525
from ._internal import SessionContext as SessionContextInternal
26-
from ._internal import LogicalPlan, ExecutionPlan
2726

2827
from datafusion.catalog import Catalog, Table
2928
from datafusion.dataframe import DataFrame
@@ -39,6 +38,7 @@
3938
import pandas
4039
import polars
4140
import pathlib
41+
from datafusion.plan import LogicalPlan, ExecutionPlan
4242

4343

4444
class SessionConfig:
@@ -268,8 +268,10 @@ def with_disk_manager_specified(self, *paths: str | pathlib.Path) -> RuntimeConf
268268
Returns:
269269
A new :py:class:`RuntimeConfig` object with the updated setting.
270270
"""
271-
paths = [str(p) for p in paths]
272-
self.config_internal = self.config_internal.with_disk_manager_specified(paths)
271+
paths_list = [str(p) for p in paths]
272+
self.config_internal = self.config_internal.with_disk_manager_specified(
273+
paths_list
274+
)
273275
return self
274276

275277
def with_unbounded_memory_pool(self) -> RuntimeConfig:
@@ -558,7 +560,7 @@ def create_dataframe_from_logical_plan(self, plan: LogicalPlan) -> DataFrame:
558560
Returns:
559561
DataFrame representation of the logical plan.
560562
"""
561-
return DataFrame(self.ctx.create_dataframe_from_logical_plan(plan))
563+
return DataFrame(self.ctx.create_dataframe_from_logical_plan(plan._raw_plan))
562564

563565
def from_pylist(
564566
self, data: list[dict[str, Any]], name: str | None = None
@@ -1034,4 +1036,4 @@ def read_table(self, table: Table) -> DataFrame:
10341036

10351037
def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream:
10361038
"""Execute the ``plan`` and return the results."""
1037-
return RecordBatchStream(self.ctx.execute(plan, partitions))
1039+
return RecordBatchStream(self.ctx.execute(plan._raw_plan, partitions))

python/datafusion/dataframe.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from typing import Any, List, TYPE_CHECKING
2525
from datafusion.record_batch import RecordBatchStream
2626
from typing_extensions import deprecated
27+
from datafusion.plan import LogicalPlan, ExecutionPlan
2728

2829
if TYPE_CHECKING:
2930
import pyarrow as pa
@@ -34,10 +35,6 @@
3435

3536
from datafusion._internal import DataFrame as DataFrameInternal
3637
from datafusion.expr import Expr, SortExpr, sort_or_default
37-
from datafusion._internal import (
38-
LogicalPlan,
39-
ExecutionPlan,
40-
)
4138

4239

4340
class DataFrame:
@@ -316,23 +313,23 @@ def logical_plan(self) -> LogicalPlan:
316313
Returns:
317314
Unoptimized logical plan.
318315
"""
319-
return self.df.logical_plan()
316+
return LogicalPlan(self.df.logical_plan())
320317

321318
def optimized_logical_plan(self) -> LogicalPlan:
322319
"""Return the optimized ``LogicalPlan``.
323320
324321
Returns:
325322
Optimized logical plan.
326323
"""
327-
return self.df.optimized_logical_plan()
324+
return LogicalPlan(self.df.optimized_logical_plan())
328325

329326
def execution_plan(self) -> ExecutionPlan:
330327
"""Return the execution/physical plan.
331328
332329
Returns:
333330
Execution plan.
334331
"""
335-
return self.df.execution_plan()
332+
return ExecutionPlan(self.df.execution_plan())
336333

337334
def repartition(self, num: int) -> DataFrame:
338335
"""Repartition a DataFrame into ``num`` partitions.

python/datafusion/expr.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,18 @@
2222

2323
from __future__ import annotations
2424

25-
from typing import Any, Optional, Type
25+
from typing import Any, Optional, Type, TYPE_CHECKING
2626

2727
import pyarrow as pa
2828
from datafusion.common import DataTypeMap, NullTreatment, RexType
2929
from typing_extensions import deprecated
3030

31-
from ._internal import LogicalPlan
3231
from ._internal import expr as expr_internal
3332
from ._internal import functions as functions_internal
3433

34+
if TYPE_CHECKING:
35+
from datafusion.plan import LogicalPlan
36+
3537
# The following are imported from the internal representation. We may choose to
3638
# give these all proper wrappers, or to simply leave as is. These were added
3739
# in order to support passing the `test_imports` unit test.
@@ -485,7 +487,7 @@ def rex_call_operator(self) -> str:
485487

486488
def column_name(self, plan: LogicalPlan) -> str:
487489
"""Compute the output column name based on the provided logical plan."""
488-
return self.expr.column_name(plan)
490+
return self.expr.column_name(plan._raw_plan)
489491

490492
def order_by(self, *exprs: Expr | SortExpr) -> ExprFuncBuilder:
491493
"""Set the ordering for a window or aggregate function.

python/datafusion/plan.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
"""This module supports physical and logical plans in DataFusion."""
19+
20+
from __future__ import annotations
21+
22+
import datafusion._internal as df_internal
23+
24+
from typing import List, Any, TYPE_CHECKING
25+
26+
if TYPE_CHECKING:
27+
from datafusion.context import SessionContext
28+
29+
__all__ = [
30+
"LogicalPlan",
31+
"ExecutionPlan",
32+
]
33+
34+
35+
class LogicalPlan:
36+
"""Logical Plan.
37+
38+
A `LogicalPlan` is a node in a tree of relational operators (such as
39+
Projection or Filter).
40+
41+
Represents transforming an input relation (table) to an output relation
42+
(table) with a potentially different schema. Plans form a dataflow tree
43+
where data flows from leaves up to the root to produce the query result.
44+
45+
`LogicalPlan`s can be created by the SQL query planner, the DataFrame API,
46+
or programmatically (for example custom query languages).
47+
"""
48+
49+
def __init__(self, plan: df_internal.LogicalPlan) -> None:
50+
"""This constructor should not be called by the end user."""
51+
self._raw_plan = plan
52+
53+
def to_variant(self) -> Any:
54+
"""Convert the logical plan into its specific variant."""
55+
return self._raw_plan.to_variant()
56+
57+
def inputs(self) -> List[LogicalPlan]:
58+
"""Returns the list of inputs to the logical plan."""
59+
return [LogicalPlan(p) for p in self._raw_plan.inputs()]
60+
61+
def __repr__(self) -> str:
62+
"""Generate a printable representation of the plan."""
63+
return self._raw_plan.__repr__()
64+
65+
def display(self) -> str:
66+
"""Return a string representation of the logical plan."""
67+
return self._raw_plan.display()
68+
69+
def display_indent(self) -> str:
70+
"""Return an indented form of the logical plan as a string."""
71+
return self._raw_plan.display_indent()
72+
73+
def display_indent_schema(self) -> str:
74+
"""Return an indented form of the schema for the logical plan as a string."""
75+
return self._raw_plan.display_indent_schema()
76+
77+
def display_graphviz(self) -> str:
78+
"""Return the graph visualization of the logical plan as a string.
79+
80+
Returns a string whose lines describe the plan in the `DOT` language, meant for
81+
graphical display. This format can be visualized using software from
82+
[`graphviz`](https://graphviz.org/)
83+
"""
84+
return self._raw_plan.display_graphviz()
85+
86+
@staticmethod
87+
def from_proto(ctx: SessionContext, data: bytes) -> LogicalPlan:
88+
"""Create a LogicalPlan from protobuf bytes.
89+
90+
Tables created in memory from record batches are currently not supported.
91+
"""
92+
return LogicalPlan(df_internal.LogicalPlan.from_proto(ctx.ctx, data))
93+
94+
def to_proto(self) -> bytes:
95+
"""Convert a LogicalPlan to protobuf bytes.
96+
97+
Tables created in memory from record batches are currently not supported.
98+
"""
99+
return self._raw_plan.to_proto()
100+
101+
102+
class ExecutionPlan:
103+
"""Represent nodes in the DataFusion Physical Plan."""
104+
105+
def __init__(self, plan: df_internal.ExecutionPlan) -> None:
106+
"""This constructor should not be called by the end user."""
107+
self._raw_plan = plan
108+
109+
def children(self) -> List[ExecutionPlan]:
110+
"""Get a list of children `ExecutionPlan`s that act as inputs to this plan.
111+
112+
The returned list will be empty for leaf nodes such as scans, will contain a
113+
single value for unary nodes, or two values for binary nodes (such as joins).
114+
"""
115+
return [ExecutionPlan(e) for e in self._raw_plan.children()]
116+
117+
def display(self) -> str:
118+
"""Return a string representation of the physical plan."""
119+
return self._raw_plan.display()
120+
121+
def display_indent(self) -> str:
122+
"""Return an indented form of the physical plan as a string."""
123+
return self._raw_plan.display_indent()
124+
125+
def __repr__(self) -> str:
126+
"""Return a string representation of the physical plan."""
127+
return self._raw_plan.__repr__()
128+
129+
@property
130+
def partition_count(self) -> int:
131+
"""Returns the number of partitions in the physical plan."""
132+
return self._raw_plan.partition_count
133+
134+
@staticmethod
135+
def from_proto(ctx: SessionContext, data: bytes) -> ExecutionPlan:
136+
"""Create an ExecutionPlan from protobuf bytes.
137+
138+
Tables created in memory from record batches are currently not supported.
139+
"""
140+
return ExecutionPlan(df_internal.ExecutionPlan.from_proto(ctx.ctx, data))
141+
142+
def to_proto(self) -> bytes:
143+
"""Convert an ExecutionPlan into protobuf bytes.
144+
145+
Tables created in memory from record batches are currently not supported.
146+
"""
147+
return self._raw_plan.to_proto()

python/datafusion/substrait.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@
2828
from typing import TYPE_CHECKING
2929
from typing_extensions import deprecated
3030
import pathlib
31+
from datafusion.plan import LogicalPlan
3132

3233
if TYPE_CHECKING:
3334
from datafusion.context import SessionContext
34-
from datafusion._internal import LogicalPlan
3535

3636
__all__ = [
3737
"Plan",
@@ -156,7 +156,9 @@ def to_substrait_plan(logical_plan: LogicalPlan, ctx: SessionContext) -> Plan:
156156
Substrait plan.
157157
"""
158158
return Plan(
159-
substrait_internal.Producer.to_substrait_plan(logical_plan, ctx.ctx)
159+
substrait_internal.Producer.to_substrait_plan(
160+
logical_plan._raw_plan, ctx.ctx
161+
)
160162
)
161163

162164

@@ -181,8 +183,8 @@ def from_substrait_plan(ctx: SessionContext, plan: Plan) -> LogicalPlan:
181183
Returns:
182184
LogicalPlan.
183185
"""
184-
return substrait_internal.Consumer.from_substrait_plan(
185-
ctx.ctx, plan.plan_internal
186+
return LogicalPlan(
187+
substrait_internal.Consumer.from_substrait_plan(ctx.ctx, plan.plan_internal)
186188
)
187189

188190

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
from datafusion import SessionContext, LogicalPlan, ExecutionPlan
19+
import pytest
20+
21+
22+
# Note: We must use CSV because memory tables are currently not supported for
23+
# conversion to/from protobuf.
24+
@pytest.fixture
25+
def df():
26+
ctx = SessionContext()
27+
return ctx.read_csv(path="testing/data/csv/aggregate_test_100.csv").select("c1")
28+
29+
30+
def test_logical_plan_to_proto(ctx, df) -> None:
31+
logical_plan_bytes = df.logical_plan().to_proto()
32+
logical_plan = LogicalPlan.from_proto(ctx, logical_plan_bytes)
33+
34+
df_round_trip = ctx.create_dataframe_from_logical_plan(logical_plan)
35+
36+
assert df.collect() == df_round_trip.collect()
37+
38+
original_execution_plan = df.execution_plan()
39+
execution_plan_bytes = original_execution_plan.to_proto()
40+
execution_plan = ExecutionPlan.from_proto(ctx, execution_plan_bytes)
41+
42+
assert str(original_execution_plan) == str(execution_plan)

0 commit comments

Comments
 (0)