diff --git a/README.md b/README.md index 9c56b62dd..4f80dbe18 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,46 @@ This produces the following chart: ![Chart](examples/chart.png) +## Registering a DataFrame as a View + +You can use SessionContext's `register_view` method to convert a DataFrame into a view and register it with the context. + +```python +from datafusion import SessionContext, col, literal + +# Create a DataFusion context +ctx = SessionContext() + +# Create sample data +data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} + +# Create a DataFrame from the dictionary +df = ctx.from_pydict(data, "my_table") + +# Filter the DataFrame (for example, keep rows where a > 2) +df_filtered = df.filter(col("a") > literal(2)) + +# Register the dataframe as a view with the context +ctx.register_view("view1", df_filtered) + +# Now run a SQL query against the registered view +df_view = ctx.sql("SELECT * FROM view1") + +# Collect the results +results = df_view.collect() + +# Convert results to a list of dictionaries for display +result_dicts = [batch.to_pydict() for batch in results] + +print(result_dicts) +``` + +This will output: + +```python +[{'a': [3, 4, 5], 'b': [30, 40, 50]}] +``` + ## Configuration It is possible to configure runtime (memory and disk settings) and configuration settings when creating a context. diff --git a/docs/source/user-guide/common-operations/index.rst b/docs/source/user-guide/common-operations/index.rst index d7c708c21..7abd1f138 100644 --- a/docs/source/user-guide/common-operations/index.rst +++ b/docs/source/user-guide/common-operations/index.rst @@ -23,6 +23,7 @@ The contents of this section are designed to guide a new user through how to use .. toctree:: :maxdepth: 2 + views basic-info select-and-filter expressions diff --git a/docs/source/user-guide/common-operations/views.rst b/docs/source/user-guide/common-operations/views.rst new file mode 100644 index 000000000..df11e3abe --- /dev/null +++ b/docs/source/user-guide/common-operations/views.rst @@ -0,0 +1,58 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +====================== +Registering Views +====================== + +You can use the context's ``register_view`` method to register a DataFrame as a view + +.. code-block:: python + + from datafusion import SessionContext, col, literal + + # Create a DataFusion context + ctx = SessionContext() + + # Create sample data + data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} + + # Create a DataFrame from the dictionary + df = ctx.from_pydict(data, "my_table") + + # Filter the DataFrame (for example, keep rows where a > 2) + df_filtered = df.filter(col("a") > literal(2)) + + # Register the dataframe as a view with the context + ctx.register_view("view1", df_filtered) + + # Now run a SQL query against the registered view + df_view = ctx.sql("SELECT * FROM view1") + + # Collect the results + results = df_view.collect() + + # Convert results to a list of dictionaries for display + result_dicts = [batch.to_pydict() for batch in results] + + print(result_dicts) + +This will output: + +.. code-block:: python + + [{'a': [3, 4, 5], 'b': [30, 40, 50]}] diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 21955b6d1..befc4dce6 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -707,6 +707,18 @@ def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFr """ return DataFrame(self.ctx.from_polars(data, name)) + # https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 + # is the discussion on how we arrived at adding register_view + def register_view(self, name: str, df: DataFrame): + """Register a :py:class: `~datafusion.detaframe.DataFrame` as a view. + + Args: + name (str): The name to register the view under. + df (DataFrame): The DataFrame to be converted into a view and registered. + """ + view = df.into_view() + self.ctx.register_table(name, view) + def register_table(self, name: str, table: Table) -> None: """Register a :py:class: `~datafusion.catalog.Table` as a table. diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 23b5d630b..85a179ec9 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -124,6 +124,10 @@ def __init__(self, df: DataFrameInternal) -> None: """ self.df = df + def into_view(self) -> pa.Table: + """Convert DataFrame as a ViewTable which can be used in register_table.""" + return self.df.into_view() + def __getitem__(self, key: str | List[str]) -> DataFrame: """Return a new :py:class`DataFrame` with the specified column or columns. diff --git a/python/tests/test_view.py b/python/tests/test_view.py new file mode 100644 index 000000000..1d92cc0d4 --- /dev/null +++ b/python/tests/test_view.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +from datafusion import SessionContext, col, literal + + +def test_register_filtered_dataframe(): + ctx = SessionContext() + + data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} + + df = ctx.from_pydict(data, "my_table") + + df_filtered = df.filter(col("a") > literal(2)) + + ctx.register_view("view1", df_filtered) + + df_view = ctx.sql("SELECT * FROM view1") + + filtered_results = df_view.collect() + + result_dicts = [batch.to_pydict() for batch in filtered_results] + + expected_results = [{"a": [3, 4, 5], "b": [30, 40, 50]}] + + assert result_dicts == expected_results + + df_results = df.collect() + + df_result_dicts = [batch.to_pydict() for batch in df_results] + + expected_df_results = [{"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}] + + assert df_result_dicts == expected_df_results diff --git a/src/dataframe.rs b/src/dataframe.rs index ed9578a71..243e2e14f 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -30,6 +30,7 @@ use datafusion::arrow::util::pretty; use datafusion::common::UnnestOptions; use datafusion::config::{CsvOptions, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; +use datafusion::datasource::TableProvider; use datafusion::execution::SendableRecordBatchStream; use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; use datafusion::prelude::*; @@ -39,6 +40,7 @@ use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods}; use tokio::task::JoinHandle; +use crate::catalog::PyTable; use crate::errors::{py_datafusion_err, PyDataFusionError}; use crate::expr::sort_expr::to_sort_expressions; use crate::physical_plan::PyExecutionPlan; @@ -50,6 +52,25 @@ use crate::{ expr::{sort_expr::PySortExpr, PyExpr}, }; +// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 +// - we have not decided on the table_provider approach yet +// this is an interim implementation +#[pyclass(name = "TableProvider", module = "datafusion")] +pub struct PyTableProvider { + provider: Arc, +} + +impl PyTableProvider { + pub fn new(provider: Arc) -> Self { + Self { provider } + } + + pub fn as_table(&self) -> PyTable { + let table_provider: Arc = self.provider.clone(); + PyTable::new(table_provider) + } +} + /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. /// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. @@ -156,6 +177,24 @@ impl PyDataFrame { PyArrowType(self.df.schema().into()) } + /// Convert this DataFrame into a Table that can be used in register_table + /// By convention, into_... methods consume self and return the new object. + /// Disabling the clippy lint, so we can use &self + /// because we're working with Python bindings + /// where objects are shared + /// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 + /// - we have not decided on the table_provider approach yet + #[allow(clippy::wrong_self_convention)] + fn into_view(&self) -> PyDataFusionResult { + // Call the underlying Rust DataFrame::into_view method. + // Note that the Rust method consumes self; here we clone the inner Arc + // so that we don’t invalidate this PyDataFrame. + let table_provider = self.df.as_ref().clone().into_view(); + let table_provider = PyTableProvider::new(table_provider); + + Ok(table_provider.as_table()) + } + #[pyo3(signature = (*args))] fn select_columns(&self, args: Vec) -> PyDataFusionResult { let args = args.iter().map(|s| s.as_ref()).collect::>();