Skip to content

Commit 4af9052

Browse files
committed
chore: add some more documentation to the new QueryBuilder interface
Signed-off-by: R. Tyler Croy <[email protected]>
1 parent 3335a03 commit 4af9052

File tree

3 files changed

+56
-8
lines changed

3 files changed

+56
-8
lines changed

python/deltalake/query.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@
1111

1212

1313
class QueryBuilder:
14+
"""
15+
QueryBuilder is an experimental API which exposes Apache DataFusion SQL to Python users of the deltalake library.
16+
17+
This API is subject to change.
18+
19+
>>> qb = QueryBuilder()
20+
"""
21+
1422
def __init__(self) -> None:
1523
warnings.warn(
1624
"QueryBuilder is experimental and subject to change",
@@ -19,13 +27,38 @@ def __init__(self) -> None:
1927
self._query_builder = PyQueryBuilder()
2028

2129
def register(self, table_name: str, delta_table: DeltaTable) -> QueryBuilder:
22-
"""Add a table to the query builder."""
30+
"""
31+
Add a table to the query builder instance by name. The `table_name`
32+
is the name by which the registered `DeltaTable` can be referenced in SQL
33+
queries.
34+
35+
For example:
36+
37+
>>> tmp = getfixture('tmp_path')
38+
>>> import pyarrow as pa
39+
>>> from deltalake import DeltaTable, QueryBuilder
40+
>>> dt = DeltaTable.create(table_uri=tmp, schema=pa.schema([pa.field('name', pa.string())]))
41+
>>> qb = QueryBuilder().register('test', dt)
42+
>>> assert qb is not None
43+
"""
2344
self._query_builder.register(
2445
table_name=table_name,
2546
delta_table=delta_table._table,
2647
)
2748
return self
2849

2950
def execute(self, sql: str) -> List[pyarrow.RecordBatch]:
30-
"""Execute the query and return a list of record batches."""
51+
"""
52+
Execute the query and return a list of record batches.
53+
54+
For example:
55+
56+
>>> tmp = getfixture('tmp_path')
57+
>>> import pyarrow as pa
58+
>>> from deltalake import DeltaTable, QueryBuilder
59+
>>> dt = DeltaTable.create(table_uri=tmp, schema=pa.schema([pa.field('name', pa.string())]))
60+
>>> qb = QueryBuilder().register('test', dt)
61+
>>> results = qb.execute('SELECT * FROM test')
62+
>>> assert results is not None
63+
"""
3164
return self._query_builder.execute(sql)

python/src/filesystem.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use std::sync::Arc;
1313

1414
const DEFAULT_MAX_BUFFER_SIZE: usize = 5 * 1024 * 1024;
1515

16-
#[derive(Debug, Clone, Serialize, Deserialize)]
16+
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
1717
pub(crate) struct FsConfig {
1818
pub(crate) root_url: String,
1919
pub(crate) options: HashMap<String, String>,

python/src/query.rs

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,31 @@ use pyo3::prelude::*;
99

1010
use crate::{error::PythonError, utils::rt, RawDeltaTable};
1111

12+
/// PyQueryBuilder supports the _experimental_ `QueryBuilder` Python interface which allows users
13+
/// to take advantage of the [Apache DataFusion](https://datafusion.apache.org) engine already
14+
/// present in the Python package.
1215
#[pyclass(module = "deltalake._internal")]
16+
#[derive(Default)]
1317
pub(crate) struct PyQueryBuilder {
14-
_ctx: SessionContext,
18+
/// DataFusion [SessionContext] to hold mappings of registered tables
19+
ctx: SessionContext,
1520
}
1621

1722
#[pymethods]
1823
impl PyQueryBuilder {
1924
#[new]
2025
pub fn new() -> Self {
2126
let config = DeltaSessionConfig::default().into();
22-
let _ctx = SessionContext::new_with_config(config);
27+
let ctx = SessionContext::new_with_config(config);
2328

24-
PyQueryBuilder { _ctx }
29+
PyQueryBuilder { ctx }
2530
}
2631

32+
/// Register the given [RawDeltaTable] into the [SessionContext] using the provided
33+
/// `table_name`
34+
///
35+
/// Once called, the provided `delta_table` will be referenceable in SQL queries so long as
36+
/// another table of the same name is not registered over it.
2737
pub fn register(&self, table_name: &str, delta_table: &RawDeltaTable) -> PyResult<()> {
2838
let snapshot = delta_table._table.snapshot().map_err(PythonError::from)?;
2939
let log_store = delta_table._table.log_store();
@@ -37,17 +47,22 @@ impl PyQueryBuilder {
3747
.map_err(PythonError::from)?,
3848
);
3949

40-
self._ctx
50+
self.ctx
4151
.register_table(table_name, provider)
4252
.map_err(PythonError::from)?;
4353

4454
Ok(())
4555
}
4656

57+
/// Execute the given SQL command within the [SessionContext] of this instance
58+
///
59+
/// **NOTE:** Since this function returns a materialized Python list of `RecordBatch`
60+
/// instances, it may result in unexpected memory consumption for queries which return large data
61+
/// sets.
4762
pub fn execute(&self, py: Python, sql: &str) -> PyResult<PyObject> {
4863
let batches = py.allow_threads(|| {
4964
rt().block_on(async {
50-
let df = self._ctx.sql(sql).await?;
65+
let df = self.ctx.sql(sql).await?;
5166
df.collect().await
5267
})
5368
.map_err(PythonError::from)

0 commit comments

Comments
 (0)