Skip to content

Commit 09fc890

Browse files
committed
Implement query builder
1 parent a999c92 commit 09fc890

File tree

8 files changed

+169
-4
lines changed

8 files changed

+169
-4
lines changed

crates/core/src/delta_datafusion/mod.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -824,9 +824,12 @@ impl TableProvider for DeltaTableProvider {
824824

825825
/// Reports how this provider handles each filter offered for pushdown.
///
/// Every filter in `filter` is reported as `TableProviderFilterPushDown::Inexact`.
fn supports_filters_pushdown(
826826
&self,
827-
_filter: &[&Expr],
827+
filter: &[&Expr],
828828
) -> DataFusionResult<Vec<TableProviderFilterPushDown>> {
829-
Ok(vec![TableProviderFilterPushDown::Inexact])
829+
// One `Inexact` entry is produced per input filter, so the returned vector
// has the same length as `filter` (the removed line above always returned a
// single entry).
Ok(filter
830+
.iter()
831+
.map(|_| TableProviderFilterPushDown::Inexact)
832+
.collect())
830833
}
831834

832835
fn statistics(&self) -> Option<Statistics> {

python/deltalake/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from ._internal import __version__ as __version__
33
from ._internal import rust_core_version as rust_core_version
44
from .data_catalog import DataCatalog as DataCatalog
5+
from .query import QueryBuilder as QueryBuilder
56
from .schema import DataType as DataType
67
from .schema import Field as Field
78
from .schema import Schema as Schema

python/deltalake/_internal.pyi

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,11 @@ class DeltaFileSystemHandler:
873873
) -> ObjectOutputStream:
874874
"""Open an output stream for sequential writing."""
875875

876+
class PyQueryBuilder:
    """Native query builder: register Delta tables by name, then execute SQL."""

    def __init__(self) -> None: ...
    def register(self, table_name: str, delta_table: RawDeltaTable) -> None:
        """Register ``delta_table`` under ``table_name``."""
    def execute(self, sql: str) -> List[pyarrow.RecordBatch]:
        """Execute ``sql`` and return the resulting record batches."""
880+
876881
class DeltaDataChecker:
877882
def __init__(self, invariants: List[Tuple[str, str]]) -> None: ...
878883
def check_batch(self, batch: pyarrow.RecordBatch) -> None: ...

python/deltalake/query.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from __future__ import annotations
2+
3+
from typing import List
4+
5+
import pyarrow
6+
7+
from deltalake._internal import PyQueryBuilder
8+
from deltalake.table import DeltaTable
9+
10+
11+
class QueryBuilder:
    """Chainable wrapper around the native ``PyQueryBuilder``.

    Tables are registered under a name with :meth:`register`; the SQL passed
    to :meth:`execute` is forwarded to the same native builder.
    """

    def __init__(self) -> None:
        # All work is delegated to the native builder instance.
        self._query_builder = PyQueryBuilder()

    def register(self, table_name: str, delta_table: DeltaTable) -> QueryBuilder:
        """Register ``delta_table`` under ``table_name``.

        Returns:
            This builder, so that calls can be chained.
        """
        # The native builder takes the raw table held by the wrapper.
        raw_table = delta_table._table
        self._query_builder.register(table_name=table_name, delta_table=raw_table)
        return self

    def execute(self, sql: str) -> List[pyarrow.RecordBatch]:
        """Run ``sql`` through the native builder and return its record batches."""
        batches = self._query_builder.execute(sql)
        return batches

python/src/error.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use arrow_schema::ArrowError;
2+
use deltalake::datafusion::error::DataFusionError;
23
use deltalake::protocol::ProtocolError;
34
use deltalake::{errors::DeltaTableError, ObjectStoreError};
45
use pyo3::exceptions::{
@@ -79,6 +80,10 @@ fn checkpoint_to_py(err: ProtocolError) -> PyErr {
7980
}
8081
}
8182

83+
fn datafusion_to_py(err: DataFusionError) -> PyErr {
84+
DeltaError::new_err(err.to_string())
85+
}
86+
8287
#[derive(thiserror::Error, Debug)]
8388
pub enum PythonError {
8489
#[error("Error in delta table")]
@@ -89,6 +94,8 @@ pub enum PythonError {
8994
Arrow(#[from] ArrowError),
9095
#[error("Error in checkpoint")]
9196
Protocol(#[from] ProtocolError),
97+
#[error("Error in data fusion")]
98+
DataFusion(#[from] DataFusionError),
9299
}
93100

94101
impl From<PythonError> for pyo3::PyErr {
@@ -98,6 +105,7 @@ impl From<PythonError> for pyo3::PyErr {
98105
PythonError::ObjectStore(err) => object_store_to_py(err),
99106
PythonError::Arrow(err) => arrow_to_py(err),
100107
PythonError::Protocol(err) => checkpoint_to_py(err),
108+
PythonError::DataFusion(err) => datafusion_to_py(err),
101109
}
102110
}
103111
}

python/src/lib.rs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@ mod error;
22
mod features;
33
mod filesystem;
44
mod merge;
5+
mod query;
56
mod schema;
67
mod utils;
78

89
use std::collections::{HashMap, HashSet};
910
use std::future::IntoFuture;
1011
use std::str::FromStr;
12+
use std::sync::Arc;
1113
use std::time;
1214
use std::time::{SystemTime, UNIX_EPOCH};
1315

@@ -17,12 +19,18 @@ use delta_kernel::expressions::Scalar;
1719
use delta_kernel::schema::StructField;
1820
use deltalake::arrow::compute::concat_batches;
1921
use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
22+
use deltalake::arrow::pyarrow::ToPyArrow;
2023
use deltalake::arrow::record_batch::{RecordBatch, RecordBatchIterator};
2124
use deltalake::arrow::{self, datatypes::Schema as ArrowSchema};
2225
use deltalake::checkpoints::{cleanup_metadata, create_checkpoint};
26+
use deltalake::datafusion::datasource::provider_as_source;
27+
use deltalake::datafusion::logical_expr::{LogicalPlanBuilder, UNNAMED_TABLE};
2328
use deltalake::datafusion::physical_plan::ExecutionPlan;
24-
use deltalake::datafusion::prelude::SessionContext;
25-
use deltalake::delta_datafusion::DeltaDataChecker;
29+
use deltalake::datafusion::prelude::{DataFrame, SessionContext};
30+
use deltalake::delta_datafusion::{
31+
DataFusionMixins, DeltaDataChecker, DeltaScanConfigBuilder, DeltaSessionConfig,
32+
DeltaTableProvider,
33+
};
2634
use deltalake::errors::DeltaTableError;
2735
use deltalake::kernel::{
2836
scalars::ScalarExt, Action, Add, Invariant, LogicalFile, Remove, StructType, Transaction,
@@ -66,6 +74,7 @@ use crate::error::PythonError;
6674
use crate::features::TableFeatures;
6775
use crate::filesystem::FsConfig;
6876
use crate::merge::PyMergeBuilder;
77+
use crate::query::PyQueryBuilder;
6978
use crate::schema::{schema_to_pyobject, Field};
7079
use crate::utils::rt;
7180

@@ -2069,6 +2078,7 @@ fn _internal(m: &Bound<'_, PyModule>) -> PyResult<()> {
20692078
)?)?;
20702079
m.add_class::<RawDeltaTable>()?;
20712080
m.add_class::<PyMergeBuilder>()?;
2081+
m.add_class::<PyQueryBuilder>()?;
20722082
m.add_class::<RawDeltaTableMetaData>()?;
20732083
m.add_class::<PyDeltaDataChecker>()?;
20742084
m.add_class::<PyTransaction>()?;

python/src/query.rs

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
use std::sync::Arc;
2+
3+
use deltalake::{
4+
arrow::pyarrow::ToPyArrow,
5+
datafusion::prelude::SessionContext,
6+
delta_datafusion::{DeltaScanConfigBuilder, DeltaSessionConfig, DeltaTableProvider},
7+
};
8+
use pyo3::prelude::*;
9+
10+
use crate::{error::PythonError, utils::rt, RawDeltaTable};
11+
12+
#[pyclass(module = "deltalake._internal")]
13+
pub(crate) struct PyQueryBuilder {
14+
_ctx: SessionContext,
15+
}
16+
17+
#[pymethods]
18+
impl PyQueryBuilder {
19+
#[new]
20+
pub fn new() -> Self {
21+
let config = DeltaSessionConfig::default().into();
22+
let _ctx = SessionContext::new_with_config(config);
23+
24+
PyQueryBuilder { _ctx }
25+
}
26+
27+
pub fn register(&self, table_name: &str, delta_table: &RawDeltaTable) -> PyResult<()> {
28+
let snapshot = delta_table._table.snapshot().map_err(PythonError::from)?;
29+
let log_store = delta_table._table.log_store();
30+
31+
let scan_config = DeltaScanConfigBuilder::default()
32+
.with_parquet_pushdown(false)
33+
.build(snapshot)
34+
.map_err(PythonError::from)?;
35+
36+
let provider = Arc::new(
37+
DeltaTableProvider::try_new(snapshot.clone(), log_store, scan_config)
38+
.map_err(PythonError::from)?,
39+
);
40+
41+
self._ctx
42+
.register_table(table_name, provider)
43+
.map_err(PythonError::from)?;
44+
45+
Ok(())
46+
}
47+
48+
pub fn execute(&self, py: Python, sql: &str) -> PyResult<PyObject> {
49+
let batches = py.allow_threads(|| {
50+
rt().block_on(async {
51+
let df = self._ctx.sql(sql).await?;
52+
df.collect().await
53+
})
54+
.map_err(PythonError::from)
55+
})?;
56+
57+
batches.to_pyarrow(py)
58+
}
59+
}

python/tests/test_table_read.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from deltalake._util import encode_partition_value
1111
from deltalake.exceptions import DeltaProtocolError
12+
from deltalake.query import QueryBuilder
1213
from deltalake.table import ProtocolVersions
1314
from deltalake.writer import write_deltalake
1415

@@ -946,3 +947,56 @@ def test_is_deltatable_with_storage_opts():
946947
"DELTA_DYNAMO_TABLE_NAME": "custom_table_name",
947948
}
948949
assert DeltaTable.is_deltatable(table_path, storage_options=storage_options)
950+
951+
952+
def test_read_query_builder():
    """A registered table can be filtered and ordered with SQL."""
    dt = DeltaTable("../crates/test/tests/data/delta-0.8.0-partitioned")
    query = "SELECT * FROM tbl WHERE year >= 2021 ORDER BY value"

    batches = QueryBuilder().register("tbl", dt).execute(query)
    actual = pa.Table.from_batches(batches).to_pydict()

    expected = {
        "value": ["4", "5", "6", "7"],
        "year": ["2021", "2021", "2021", "2021"],
        "month": ["4", "12", "12", "12"],
        "day": ["5", "4", "20", "20"],
    }
    assert expected == actual
967+
968+
969+
def test_read_query_builder_join_multiple_tables(tmp_path):
    """Two registered tables can be joined within a single SQL statement."""
    # First side of the join: an existing fixture table.
    dt1 = DeltaTable("../crates/test/tests/data/delta-0.8.0-date")

    # Second side of the join: a table written into the temporary directory.
    rows = pa.table(
        {
            "date": ["2021-01-01", "2021-01-02", "2021-01-03", "2021-12-31"],
            "value": ["a", "b", "c", "d"],
        }
    )
    write_deltalake(tmp_path, rows)
    dt2 = DeltaTable(tmp_path)

    sql = (
        "\n"
        "SELECT tbl2.date, tbl1.dayOfYear, tbl2.value\n"
        "FROM tbl1\n"
        "INNER JOIN tbl2 ON tbl1.date = tbl2.date\n"
        "ORDER BY tbl1.date\n"
    )
    builder = QueryBuilder().register("tbl1", dt1).register("tbl2", dt2)
    actual = pa.Table.from_batches(builder.execute(sql)).to_pydict()

    # Only three of the four dates written above are expected in the output.
    expected = {
        "date": ["2021-01-01", "2021-01-02", "2021-01-03"],
        "dayOfYear": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
    assert expected == actual

0 commit comments

Comments
 (0)