Skip to content

Commit a8275dc

Browse files
committed
Normalize & simplify TableProvider/DataFrame registration; add
utilities, docs, and robustness fixes * Normalized table-provider handling and simplified registration flow across the codebase; multiple commits centralize provider coercion and normalization. * Introduced utility helpers (`coerce_table_provider`, `extract_table_provider`, `_normalize_table_provider`) to centralize extraction, error handling, and improve clarity. * Simplified `from_dataframe` / `into_view` behavior: clearer implementations, direct returns of DataFrame views where appropriate, and added internal tests for DataFrame flows. * Fixed DataFrame registration semantics: enforce `TypeError` for invalid registrations; added handling for `DataFrameWrapper` by converting it to a view. * Added tests, including a schema registration test using a PyArrow dataset and internal DataFrame tests to cover new flows. * Documentation improvements: expanded `from_dataframe` docstrings with parameter details, added usage examples for `into_view`, and documented deprecations (e.g., `register_table_provider` → `register_table`). * Warning and UX fixes: synchronized deprecation `stacklevel` so warnings point to caller code; improved `__dir__` to return sorted, unique attributes. * Cleanup: removed unused imports (including an unused error import from `utils.rs`) and other dead code to reduce noise.
1 parent 512442b commit a8275dc

File tree

14 files changed

+268
-62
lines changed

14 files changed

+268
-62
lines changed

dev/changelog/49.0.0.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ This release consists of 16 commits from 7 contributors. See credits at the end
2525

2626
- fix(build): Include build.rs in published crates [#1199](https://github.com/apache/datafusion-python/pull/1199) (colinmarc)
2727

28+
**Deprecations:**
29+
30+
- Document that `SessionContext.register_table_provider` is deprecated in favor of `SessionContext.register_table`.
31+
2832
**Other:**
2933

3034
- 48.0.0 Release [#1175](https://github.com/apache/datafusion-python/pull/1175) (timsaucer)

docs/source/user-guide/data-sources.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,14 @@ as Delta Lake. This will require a recent version of
160160
df = ctx.table("my_delta_table")
161161
df.show()
162162
163-
On older versions of ``deltalake`` (prior to 0.22) you can use the
163+
.. note::
164+
165+
:py:meth:`~datafusion.context.SessionContext.register_table_provider` is
166+
deprecated. Use
167+
:py:meth:`~datafusion.context.SessionContext.register_table` with a
168+
:py:class:`~datafusion.TableProvider` instead.
169+
170+
On older versions of ``deltalake`` (prior to 0.22) you can use the
164171
`Arrow DataSet <https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html>`_
165172
interface to import to DataFusion, but this does not support features such as filter push down
166173
which can lead to a significant performance difference.

docs/source/user-guide/io/table_provider.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ Call the provider's ``__datafusion_table_provider__()`` method to obtain the cap
5454
before constructing a ``TableProvider``. The ``TableProvider.from_view()`` helper is
5555
deprecated; instead use ``TableProvider.from_dataframe()`` or ``DataFrame.into_view()``.
5656

57+
.. note::
58+
59+
:py:meth:`~datafusion.context.SessionContext.register_table_provider` is
60+
deprecated. Use
61+
:py:meth:`~datafusion.context.SessionContext.register_table` with the
62+
resulting :py:class:`~datafusion.TableProvider` instead.
63+
5764
.. code-block:: python
5865
5966
from datafusion import SessionContext, TableProvider

python/datafusion/catalog.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from typing import TYPE_CHECKING, Protocol
2424

2525
import datafusion._internal as df_internal
26+
from datafusion.utils import _normalize_table_provider
2627

2728
if TYPE_CHECKING:
2829
import pyarrow as pa
@@ -137,9 +138,8 @@ def register_table(
137138
Objects implementing ``__datafusion_table_provider__`` are also supported
138139
and treated as :class:`TableProvider` instances.
139140
"""
140-
if isinstance(table, Table):
141-
return self._raw_schema.register_table(name, table.table)
142-
return self._raw_schema.register_table(name, table)
141+
provider = _normalize_table_provider(table)
142+
return self._raw_schema.register_table(name, provider)
143143

144144
def deregister_table(self, name: str) -> None:
145145
"""Deregister a table provider from this schema."""

python/datafusion/context.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list
3535
from datafusion.record_batch import RecordBatchStream
3636
from datafusion.user_defined import AggregateUDF, ScalarUDF, TableFunction, WindowUDF
37+
from datafusion.utils import _normalize_table_provider
3738

3839
from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal
3940
from ._internal import SessionConfig as SessionConfigInternal
@@ -735,7 +736,7 @@ def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame:
735736
# https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
736737
# is the discussion on how we arrived at adding register_view
737738
def register_view(self, name: str, df: DataFrame) -> None:
738-
"""Register a :py:class:`~datafusion.dataframe.DataFrame` as a view.
739+
"""Register a :py:class:`~datafusion.dataframe.DataFrame` as a view.
739740
740741
Args:
741742
name (str): The name to register the view under.
@@ -747,28 +748,27 @@ def register_view(self, name: str, df: DataFrame) -> None:
747748
def register_table(
748749
self, name: str, table: Table | TableProvider | TableProviderExportable
749750
) -> None:
750-
"""Register a :py:class:`~datafusion.catalog.Table` or ``TableProvider``.
751+
"""Register a :py:class:`~datafusion.catalog.Table` or
752+
:py:class:`~datafusion.TableProvider`.
751753
752754
The registered table can be referenced from SQL statements executed against
753755
this context.
754756
755757
Plain :py:class:`~datafusion.dataframe.DataFrame` objects are not supported;
756758
convert them first with :meth:`datafusion.dataframe.DataFrame.into_view` or
757-
:meth:`datafusion.catalog.TableProvider.from_dataframe`.
759+
:meth:`datafusion.TableProvider.from_dataframe`.
758760
759761
Objects implementing ``__datafusion_table_provider__`` are also supported
760-
and treated as :class:`~datafusion.catalog.TableProvider` instances.
762+
and treated as :py:class:`~datafusion.TableProvider` instances.
761763
762764
Args:
763765
name: Name of the resultant table.
764766
table: DataFusion :class:`Table`, :class:`TableProvider`, or any object
765767
implementing ``__datafusion_table_provider__`` to add to the session
766768
context.
767769
"""
768-
if isinstance(table, Table):
769-
self.ctx.register_table(name, table.table)
770-
else:
771-
self.ctx.register_table(name, table)
770+
provider = _normalize_table_provider(table)
771+
self.ctx.register_table(name, provider)
772772

773773
def deregister_table(self, name: str) -> None:
774774
"""Remove a table from the session."""
@@ -795,7 +795,7 @@ def register_table_provider(
795795
Deprecated: use :meth:`register_table` instead.
796796
797797
Objects implementing ``__datafusion_table_provider__`` are also supported
798-
and treated as :class:`~datafusion.catalog.TableProvider` instances.
798+
and treated as :py:class:`~datafusion.TableProvider` instances.
799799
"""
800800
warnings.warn(
801801
"register_table_provider is deprecated; use register_table",

python/datafusion/dataframe.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,8 +313,20 @@ def into_view(self) -> TableProvider:
313313
314314
This is the preferred way to obtain a view for
315315
:py:meth:`~datafusion.context.SessionContext.register_table`.
316-
``TableProvider.from_dataframe`` calls this method under the hood,
316+
``datafusion.TableProvider.from_dataframe`` calls this method under the hood,
317317
and the older ``TableProvider.from_view`` helper is deprecated.
318+
319+
The ``DataFrame`` remains valid after conversion, so it can still be used for
320+
additional queries alongside the returned view.
321+
322+
Examples:
323+
>>> from datafusion import SessionContext
324+
>>> ctx = SessionContext()
325+
>>> df = ctx.sql("SELECT 1 AS value")
326+
>>> provider = df.into_view()
327+
>>> ctx.register_table("values_view", provider)
328+
>>> df.collect() # The DataFrame is still usable
329+
>>> ctx.sql("SELECT value FROM values_view").collect()
318330
"""
319331
from datafusion.table_provider import TableProvider as _TableProvider
320332

python/datafusion/table_provider.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626

2727
_InternalTableProvider = df_internal.TableProvider
2828

29+
# Keep in sync with ``datafusion._internal.TableProvider.from_view``.
30+
_FROM_VIEW_WARN_STACKLEVEL = 2
31+
2932

3033
class TableProvider:
3134
"""High level wrapper around :mod:`datafusion._internal.TableProvider`."""
@@ -50,14 +53,26 @@ def from_capsule(cls, capsule: Any) -> TableProvider:
5053

5154
@classmethod
5255
def from_dataframe(cls, df: Any) -> TableProvider:
53-
"""Create a :class:`TableProvider` from a :class:`DataFrame`."""
56+
"""Create a :class:`TableProvider` from tabular data.
57+
58+
Parameters
59+
----------
60+
df:
61+
Either a :class:`~datafusion.dataframe.DataFrame` wrapper or the
62+
corresponding :class:`~datafusion._internal.DataFrame`. When
63+
working with third-party DataFrame libraries, convert them via
64+
:meth:`~datafusion.SessionContext.from_arrow` before calling
65+
:meth:`~datafusion.dataframe.DataFrame.into_view` or this
66+
constructor.
67+
"""
5468
from datafusion.dataframe import DataFrame as DataFrameWrapper
5569

5670
if isinstance(df, DataFrameWrapper):
57-
df = df.df
71+
dataframe = df
72+
else:
73+
dataframe = DataFrameWrapper(df)
5874

59-
provider = _InternalTableProvider.from_dataframe(df)
60-
return cls(provider)
75+
return dataframe.into_view()
6176

6277
@classmethod
6378
def from_view(cls, df: Any) -> TableProvider:
@@ -74,8 +89,8 @@ def from_view(cls, df: Any) -> TableProvider:
7489
warnings.warn(
7590
"TableProvider.from_view is deprecated; use DataFrame.into_view or "
7691
"TableProvider.from_dataframe instead.",
77-
DeprecationWarning,
78-
stacklevel=2,
92+
category=DeprecationWarning,
93+
stacklevel=_FROM_VIEW_WARN_STACKLEVEL,
7994
)
8095
return cls(provider)
8196

@@ -88,7 +103,7 @@ def __getattr__(self, name: str) -> Any:
88103

89104
def __dir__(self) -> list[str]:
90105
"""Expose delegated attributes via :func:`dir`."""
91-
return dir(self._table_provider) + super().__dir__()
106+
return sorted(set(super().__dir__()) | set(dir(self._table_provider)))
92107

93108
def __repr__(self) -> str: # pragma: no cover - simple delegation
94109
"""Return a representation of the wrapped provider."""

python/datafusion/utils.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
"""Miscellaneous helper utilities for DataFusion's Python bindings."""
18+
19+
from __future__ import annotations
20+
21+
from importlib import import_module, util
22+
from typing import TYPE_CHECKING, Any
23+
24+
from datafusion._internal import EXPECTED_PROVIDER_MSG
25+
26+
_PYARROW_DATASET_TYPES: tuple[type[Any], ...]
27+
_dataset_spec = util.find_spec("pyarrow.dataset")
28+
if _dataset_spec is None: # pragma: no cover - optional dependency at runtime
29+
_PYARROW_DATASET_TYPES = ()
30+
else: # pragma: no cover - exercised in environments with pyarrow installed
31+
_dataset_module = import_module("pyarrow.dataset")
32+
dataset_base = getattr(_dataset_module, "Dataset", None)
33+
dataset_types: set[type[Any]] = set()
34+
if isinstance(dataset_base, type):
35+
dataset_types.add(dataset_base)
36+
for value in vars(_dataset_module).values():
37+
if isinstance(value, type) and issubclass(value, dataset_base):
38+
dataset_types.add(value)
39+
_PYARROW_DATASET_TYPES = tuple(dataset_types)
40+
41+
if TYPE_CHECKING: # pragma: no cover - imported for typing only
42+
from datafusion import TableProvider
43+
from datafusion.catalog import Table
44+
from datafusion.context import TableProviderExportable
45+
46+
47+
def _normalize_table_provider(
48+
table: Table | TableProvider | TableProviderExportable,
49+
) -> Any:
50+
"""Return the underlying provider for supported table inputs.
51+
52+
Args:
53+
table: A :class:`~datafusion.catalog.Table`,
54+
:class:`~datafusion.table_provider.TableProvider`, or object exporting a
55+
DataFusion table provider via ``__datafusion_table_provider__``.
56+
57+
Returns:
58+
The object expected by the Rust bindings for table registration.
59+
60+
Raises:
61+
TypeError: If ``table`` is not a supported table provider input.
62+
"""
63+
64+
from datafusion.catalog import Table as _Table
65+
from datafusion.table_provider import TableProvider as _TableProvider
66+
67+
if isinstance(table, _Table):
68+
return table.table
69+
70+
if isinstance(table, _TableProvider):
71+
return table._table_provider
72+
73+
if _PYARROW_DATASET_TYPES and isinstance(table, _PYARROW_DATASET_TYPES):
74+
return table
75+
76+
provider_factory = getattr(table, "__datafusion_table_provider__", None)
77+
if callable(provider_factory):
78+
return table
79+
80+
raise TypeError(EXPECTED_PROVIDER_MSG)

python/tests/test_catalog.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import pyarrow as pa
2121
import pyarrow.dataset as ds
2222
import pytest
23-
from datafusion import SessionContext, Table
23+
from datafusion import EXPECTED_PROVIDER_MSG, SessionContext, Table
2424

2525

2626
# Note we take in `database` as a variable even though we don't use
@@ -164,6 +164,38 @@ def test_python_table_provider(ctx: SessionContext):
164164
assert schema.table_names() == {"table4"}
165165

166166

167+
def test_schema_register_table_with_pyarrow_dataset(ctx: SessionContext):
168+
schema = ctx.catalog().schema()
169+
batch = pa.RecordBatch.from_arrays(
170+
[pa.array([1, 2, 3]), pa.array([4, 5, 6])],
171+
names=["a", "b"],
172+
)
173+
dataset = ds.dataset([batch])
174+
table_name = "pa_dataset"
175+
176+
try:
177+
schema.register_table(table_name, dataset)
178+
assert table_name in schema.table_names()
179+
180+
result = ctx.sql(f"SELECT a, b FROM {table_name}").collect()
181+
182+
assert len(result) == 1
183+
assert result[0].column(0) == pa.array([1, 2, 3])
184+
assert result[0].column(1) == pa.array([4, 5, 6])
185+
finally:
186+
schema.deregister_table(table_name)
187+
188+
189+
def test_schema_register_table_with_dataframe_errors(ctx: SessionContext):
190+
schema = ctx.catalog().schema()
191+
df = ctx.from_pydict({"a": [1]})
192+
193+
with pytest.raises(Exception) as exc_info:
194+
schema.register_table("bad", df)
195+
196+
assert str(exc_info.value) == EXPECTED_PROVIDER_MSG
197+
198+
167199
def test_in_end_to_end_python_providers(ctx: SessionContext):
168200
"""Test registering all python providers and running a query against them."""
169201

0 commit comments

Comments
 (0)