Skip to content

Commit 6ff8c7c

Browse files
committed
Removing datafusion-python as a rust dependency
1 parent 3990555 commit 6ff8c7c

File tree

9 files changed

+249
-775
lines changed

9 files changed

+249
-775
lines changed

Cargo.lock

Lines changed: 178 additions & 713 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,14 @@ rust-version = "1.62"
2929
build = "build.rs"
3030

3131
[dependencies]
32-
datafusion = { version = "41.0.0", features = ["pyarrow", "avro"] }
33-
datafusion-proto = "41.0.0"
34-
datafusion-python = "41.0.0"
32+
datafusion = { version = "42.0.0", features = ["pyarrow", "avro"] }
33+
datafusion-proto = "42.0.0"
3534
futures = "0.3"
3635
glob = "0.3"
3736
log = "0.4"
38-
prost = "0.12"
39-
prost-types = "0.12"
40-
pyo3 = { version = "0.21", features = ["extension-module", "abi3", "abi3-py38"] }
37+
prost = "0.13.1"
38+
prost-types = "0.13.1"
39+
pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] }
4140
tokio = { version = "1.40", features = ["macros", "rt", "rt-multi-thread", "sync"] }
4241
uuid = "1.2"
4342

build.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,14 @@ fn main() -> Result<(), String> {
3232

3333
// We don't include the proto files in releases so that downstreams
3434
// do not need to have PROTOC included
35-
if Path::new("src/proto/datafusion-ray.proto").exists() {
35+
if Path::new("src/proto/datafusion_ray.proto").exists() {
3636
println!("cargo:rerun-if-changed=src/proto/datafusion.proto");
37-
println!("cargo:rerun-if-changed=src/proto/datafusion-ray.proto");
37+
println!("cargo:rerun-if-changed=src/proto/datafusion_ray.proto");
3838
tonic_build::configure()
3939
.extern_path(".datafusion", "::datafusion_proto::protobuf")
40-
.compile(&["src/proto/datafusion-ray.proto"], &["src/proto"])
40+
.compile(&["src/proto/datafusion_ray.proto"], &["src/proto"])
4141
.map_err(|e| format!("protobuf compilation failed: {e}"))?;
42-
let generated_source_path = out.join("datafusion-ray.protobuf.rs");
42+
let generated_source_path = out.join("datafusion_ray.protobuf.rs");
4343
let code = std::fs::read_to_string(generated_source_path).unwrap();
4444
let mut file = std::fs::OpenOptions::new()
4545
.write(true)

datafusion_ray/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525
ExecutionGraph,
2626
QueryStage,
2727
execute_partition,
28-
serialize_execution_plan,
29-
deserialize_execution_plan,
28+
# serialize_execution_plan,
29+
# deserialize_execution_plan,
3030
)
3131
from .context import DatafusionRayContext
3232

datafusion_ray/tests/test_context.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,12 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
import pytest
1918
from datafusion_ray import Context
19+
from datafusion import SessionContext
20+
2021

2122
def test():
22-
ctx = Context(1, False)
23-
ctx.register_csv('tips', 'examples/tips.csv', True)
23+
df_ctx = SessionContext()
24+
ctx = Context(df_ctx, False)
25+
df_ctx.register_csv("tips", "examples/tips.csv", has_header=True)
2426
ctx.plan("SELECT * FROM tips")

src/context.rs

Lines changed: 47 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ use datafusion_proto::bytes::{
3737
};
3838
use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec};
3939
use datafusion_proto::protobuf;
40-
use datafusion_python::physical_plan::PyExecutionPlan;
4140
use futures::StreamExt;
4241
use prost::{DecodeError, Message};
4342
use pyo3::exceptions::PyRuntimeError;
@@ -56,6 +55,26 @@ pub struct PyContext {
5655
use_ray_shuffle: bool,
5756
}
5857

58+
pub(crate) fn execution_plan_from_pyany(
59+
py_plan: &Bound<PyAny>,
60+
) -> PyResult<Arc<dyn ExecutionPlan>> {
61+
let py_proto = py_plan.call_method0("to_proto")?;
62+
let plan_bytes: &[u8] = py_proto.extract()?;
63+
let plan_node = protobuf::PhysicalPlanNode::try_decode(plan_bytes).map_err(|e| {
64+
PyRuntimeError::new_err(format!(
65+
"Unable to decode physical plan protobuf message: {}",
66+
e
67+
))
68+
})?;
69+
70+
let codec = DefaultPhysicalExtensionCodec {};
71+
let runtime = RuntimeEnv::default();
72+
let registry = SessionContext::new();
73+
plan_node
74+
.try_into_physical_plan(&registry, &runtime, &codec)
75+
.map_err(|e| e.into())
76+
}
77+
5978
#[pymethods]
6079
impl PyContext {
6180
#[new]
@@ -117,20 +136,9 @@ impl PyContext {
117136
// let df = wait_for_future(py, self.ctx.sql(sql))?;
118137
let py_df = self.run_sql(sql, py)?;
119138
let py_plan = py_df.call_method0(py, "execution_plan")?;
120-
let py_proto = py_plan.call_method0(py, "to_proto")?;
121-
let plan_bytes: &[u8] = py_proto.extract(py)?;
122-
let plan_node = protobuf::PhysicalPlanNode::decode(plan_bytes).map_err(|e| {
123-
PyRuntimeError::new_err(format!(
124-
"Unable to decode physical plan protobuf message: {}",
125-
e
126-
))
127-
})?;
128-
129-
let codec = DefaultPhysicalExtensionCodec {};
130-
let runtime = RuntimeEnv::default();
131-
let registry = SessionContext::new();
132-
let plan = plan_node.try_into_physical_plan(&registry, &runtime, &codec)?;
139+
let py_plan = py_plan.bind(py);
133140

141+
let plan = execution_plan_from_pyany(py_plan)?;
134142
let graph = make_execution_graph(plan.clone(), self.use_ray_shuffle)?;
135143

136144
// debug logging
@@ -150,7 +158,7 @@ impl PyContext {
150158
/// Execute a partition of a query plan. This typically executes a shuffle write and writes the results to disk.
151159
pub fn execute_partition(
152160
&self,
153-
plan: PyExecutionPlan,
161+
plan: &Bound<'_, PyAny>,
154162
part: usize,
155163
inputs: PyObject,
156164
py: Python,
@@ -161,7 +169,7 @@ impl PyContext {
161169

162170
#[pyfunction]
163171
pub fn execute_partition(
164-
plan: PyExecutionPlan,
172+
plan: &Bound<'_, PyAny>,
165173
part: usize,
166174
inputs: PyObject,
167175
py: Python,
@@ -174,25 +182,25 @@ pub fn execute_partition(
174182
}
175183

176184
// TODO(@lsf) change this to use pickle
177-
#[pyfunction]
178-
pub fn serialize_execution_plan(plan: PyExecutionPlan) -> PyResult<Vec<u8>> {
179-
let codec = ShuffleCodec {};
180-
Ok(physical_plan_to_bytes_with_extension_codec(plan.plan, &codec)?.to_vec())
181-
}
185+
// #[pyfunction]
186+
// pub fn serialize_execution_plan(plan: Py<PyAny>) -> PyResult<Vec<u8>> {
187+
// let codec = ShuffleCodec {};
188+
// Ok(physical_plan_to_bytes_with_extension_codec(plan.plan, &codec)?.to_vec())
189+
// }
182190

183-
#[pyfunction]
184-
pub fn deserialize_execution_plan(bytes: Vec<u8>) -> PyResult<PyExecutionPlan> {
185-
let ctx = SessionContext::new();
186-
let codec = ShuffleCodec {};
187-
Ok(PyExecutionPlan::new(
188-
physical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?,
189-
))
190-
}
191+
// #[pyfunction]
192+
// pub fn deserialize_execution_plan(bytes: Vec<u8>) -> PyResult<PyExecutionPlan> {
193+
// let ctx = SessionContext::new();
194+
// let codec = ShuffleCodec {};
195+
// Ok(PyExecutionPlan::new(
196+
// physical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?,
197+
// ))
198+
// }
191199

192200
/// Iterate down an ExecutionPlan and set the input objects for RayShuffleReaderExec.
193201
fn _set_inputs_for_ray_shuffle_reader(
194202
plan: Arc<dyn ExecutionPlan>,
195-
input_partitions: &PyList,
203+
input_partitions: &Bound<'_, PyList>,
196204
) -> Result<()> {
197205
if let Some(reader_exec) = plan.as_any().downcast_ref::<RayShuffleReaderExec>() {
198206
let exec_stage_id = reader_exec.stage_id;
@@ -218,8 +226,8 @@ fn _set_inputs_for_ray_shuffle_reader(
218226
.map_err(|e| DataFusionError::Execution(format!("{}", e)))?
219227
.extract::<usize>()
220228
.map_err(|e| DataFusionError::Execution(format!("{}", e)))?;
221-
let batch = RecordBatch::from_pyarrow(
222-
pytuple
229+
let batch = RecordBatch::from_pyarrow_bound(
230+
&pytuple
223231
.get_item(2)
224232
.map_err(|e| DataFusionError::Execution(format!("{}", e)))?,
225233
)
@@ -238,7 +246,7 @@ fn _set_inputs_for_ray_shuffle_reader(
238246
/// write the results to disk, except for the final query stage, which will return the data.
239247
/// inputs is a list of tuples of (stage_id, partition_id, bytes) for each input partition.
240248
fn _execute_partition(
241-
plan: PyExecutionPlan,
249+
py_plan: &Bound<'_, PyAny>,
242250
part: usize,
243251
inputs: PyObject,
244252
) -> Result<Vec<RecordBatch>> {
@@ -251,19 +259,21 @@ fn _execute_partition(
251259
HashMap::new(),
252260
Arc::new(RuntimeEnv::default()),
253261
));
262+
263+
let plan = execution_plan_from_pyany(py_plan)
264+
.map_err(|e| DataFusionError::Execution(e.to_string()))?;
254265
Python::with_gil(|py| {
255266
let input_partitions = inputs
256-
.as_ref(py)
257-
.downcast::<PyList>()
267+
.downcast_bound::<PyList>(py)
258268
.map_err(|e| DataFusionError::Execution(format!("{}", e)))?;
259-
_set_inputs_for_ray_shuffle_reader(plan.plan.clone(), input_partitions)
269+
_set_inputs_for_ray_shuffle_reader(plan.clone(), input_partitions)
260270
})?;
261271

262272
// create a Tokio runtime to run the async code
263273
let rt = Runtime::new().unwrap();
264274

265275
let fut: JoinHandle<Result<Vec<RecordBatch>>> = rt.spawn(async move {
266-
let mut stream = plan.plan.execute(part, ctx)?;
276+
let mut stream = plan.execute(part, ctx)?;
267277
let mut results = vec![];
268278
while let Some(result) = stream.next().await {
269279
results.push(result?);

src/lib.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ extern crate core;
2020
use pyo3::prelude::*;
2121

2222
mod proto;
23-
use crate::context::{deserialize_execution_plan, execute_partition, serialize_execution_plan};
23+
// use crate::context::{deserialize_execution_plan, execute_partition, serialize_execution_plan};
24+
use crate::context::execute_partition;
2425
pub use proto::generated::protobuf;
2526

2627
pub mod context;
@@ -37,7 +38,7 @@ fn _datafusion_ray_internal(m: &Bound<'_, PyModule>) -> PyResult<()> {
3738
m.add_class::<planner::PyExecutionGraph>()?;
3839
m.add_class::<query_stage::PyQueryStage>()?;
3940
m.add_function(wrap_pyfunction!(execute_partition, m)?)?;
40-
m.add_function(wrap_pyfunction!(serialize_execution_plan, m)?)?;
41-
m.add_function(wrap_pyfunction!(deserialize_execution_plan, m)?)?;
41+
// m.add_function(wrap_pyfunction!(serialize_execution_plan, m)?)?;
42+
// m.add_function(wrap_pyfunction!(deserialize_execution_plan, m)?)?;
4243
Ok(())
4344
}

src/query_stage.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ use datafusion::error::Result;
2020
use datafusion::physical_plan::{ExecutionPlan, Partitioning};
2121
use datafusion::prelude::SessionContext;
2222
use datafusion_proto::bytes::physical_plan_from_bytes_with_extension_codec;
23-
use datafusion_python::physical_plan::PyExecutionPlan;
2423
use pyo3::prelude::*;
2524
use std::sync::Arc;
2625

@@ -51,9 +50,9 @@ impl PyQueryStage {
5150
self.stage.id
5251
}
5352

54-
pub fn get_execution_plan(&self) -> PyExecutionPlan {
55-
PyExecutionPlan::new(self.stage.plan.clone())
56-
}
53+
// pub fn get_execution_plan(&self) -> PyExecutionPlan {
54+
// PyExecutionPlan::new(self.stage.plan.clone())
55+
// }
5756

5857
pub fn get_child_stage_ids(&self) -> Vec<usize> {
5958
self.stage.get_child_stage_ids()

src/shuffle/codec.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,7 @@ fn encode_partitioning_scheme(partitioning: &Partitioning) -> Result<PhysicalHas
186186
Partitioning::Hash(expr, partition_count) => Ok(protobuf::PhysicalHashRepartition {
187187
hash_expr: expr
188188
.iter()
189-
.map(|expr| {
190-
serialize_physical_expr(expr.clone(), &DefaultPhysicalExtensionCodec {})
191-
})
189+
.map(|expr| serialize_physical_expr(expr, &DefaultPhysicalExtensionCodec {}))
192190
.collect::<Result<Vec<_>, DataFusionError>>()?,
193191
partition_count: *partition_count as u64,
194192
}),

0 commit comments

Comments
 (0)