Enable Dataframe to be converted into views which can be used in register_table #1016
@@ -0,0 +1,58 @@

.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

..   http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

======================
Registering Views
======================

You can use the context's ``register_view`` method to register a DataFrame as a view.

.. code-block:: python

    from datafusion import SessionContext, col, literal

    # Create a DataFusion context
    ctx = SessionContext()

    # Create sample data
    data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}

    # Create a DataFrame from the dictionary
    df = ctx.from_pydict(data, "my_table")

    # Filter the DataFrame (for example, keep rows where a > 2)
    df_filtered = df.filter(col("a") > literal(2))

    # Register the DataFrame as a view with the context
    ctx.register_view("view1", df_filtered)

    # Now run a SQL query against the registered view
    df_view = ctx.sql("SELECT * FROM view1")

    # Collect the results
    results = df_view.collect()

    # Convert results to a list of dictionaries for display
    result_dicts = [batch.to_pydict() for batch in results]

    print(result_dicts)

This will output:

.. code-block:: python

    [{'a': [3, 4, 5], 'b': [30, 40, 50]}]
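For comparison, here is a minimal sketch of the lower-level path the PR title describes: converting the DataFrame into a view with into_view and registering it via register_table. This assumes into_view is exposed on the Python DataFrame the way the Rust binding below suggests; treat it as illustrative rather than documented API.

    from datafusion import SessionContext, col, literal

    ctx = SessionContext()
    data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}
    df = ctx.from_pydict(data, "my_table")
    df_filtered = df.filter(col("a") > literal(2))

    # Convert the filtered DataFrame into a view-backed table
    # (assumes into_view is exposed to Python by the binding)...
    view = df_filtered.into_view()

    # ...and register it explicitly; ctx.register_view("view1", df_filtered)
    # is the one-step equivalent shown above.
    ctx.register_table("view1", view)

    print(ctx.sql("SELECT * FROM view1").collect())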
@@ -0,0 +1,49 @@

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


from datafusion import SessionContext, col, literal


def test_register_filtered_dataframe():
    ctx = SessionContext()

    data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}

    df = ctx.from_pydict(data, "my_table")

    df_filtered = df.filter(col("a") > literal(2))

    ctx.register_view("view1", df_filtered)

    df_view = ctx.sql("SELECT * FROM view1")

    filtered_results = df_view.collect()

    result_dicts = [batch.to_pydict() for batch in filtered_results]

    expected_results = [{"a": [3, 4, 5], "b": [30, 40, 50]}]

    assert result_dicts == expected_results

    df_results = df.collect()

    df_result_dicts = [batch.to_pydict() for batch in df_results]

    expected_df_results = [{"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}]

    assert df_result_dicts == expected_df_results
@@ -30,6 +30,7 @@ use datafusion::arrow::util::pretty;
use datafusion::common::UnnestOptions;
use datafusion::config::{CsvOptions, TableParquetOptions};
use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
use datafusion::datasource::TableProvider;
use datafusion::execution::SendableRecordBatchStream;
use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel};
use datafusion::prelude::*;
@@ -39,6 +40,7 @@ use pyo3::pybacked::PyBackedStr;
use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods};
use tokio::task::JoinHandle;

use crate::catalog::PyTable;
use crate::errors::{py_datafusion_err, PyDataFusionError};
use crate::expr::sort_expr::to_sort_expressions;
use crate::physical_plan::PyExecutionPlan;
@@ -50,9 +52,79 @@ use crate::{
    expr::{sort_expr::PySortExpr, PyExpr},
};

// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
// - we have not decided on the table_provider approach yet
// this is an interim implementation
#[pyclass(name = "TableProvider", module = "datafusion")]
pub struct PyTableProvider {
    provider: Arc<dyn TableProvider>,
}

impl PyTableProvider {
    pub fn new(provider: Arc<dyn TableProvider>) -> Self {
        Self { provider }
    }

    pub fn as_table(&self) -> PyTable {
        let table_provider: Arc<dyn TableProvider> = self.provider.clone();
        PyTable::new(table_provider)
    }
}

/// A PyDataFrame is a representation of a logical plan and an API to compose statements.
/// Use it to build a plan and `.collect()` to execute the plan and collect the result.
/// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment.
///
/// # Methods
///
/// - `new`: Creates a new PyDataFrame.
/// - `__getitem__`: Enable selection for `df[col]`, `df[col1, col2, col3]`, and `df[[col1, col2, col3]]`.
/// - `__repr__`: Returns a string representation of the DataFrame.
/// - `_repr_html_`: Returns an HTML representation of the DataFrame.
/// - `describe`: Calculate summary statistics for a DataFrame.
/// - `schema`: Returns the schema from the logical plan.
/// - `into_view`: Convert this DataFrame into a Table that can be used in register_table. We have not finalized on PyTableProvider approach yet.
/// - `select_columns`: Select columns from the DataFrame.
/// - `select`: Select expressions from the DataFrame.
/// - `drop`: Drop columns from the DataFrame.
/// - `filter`: Filter the DataFrame based on a predicate.
/// - `with_column`: Add a new column to the DataFrame.
/// - `with_columns`: Add multiple new columns to the DataFrame.
/// - `with_column_renamed`: Rename a column in the DataFrame.
/// - `aggregate`: Aggregate the DataFrame based on group by and aggregation expressions.
/// - `sort`: Sort the DataFrame based on expressions.
/// - `limit`: Limit the number of rows in the DataFrame.
/// - `collect`: Executes the plan, returning a list of `RecordBatch`es.
/// - `cache`: Cache the DataFrame.
/// - `collect_partitioned`: Executes the DataFrame and collects all results into a vector of vector of RecordBatch maintaining the input partitioning.
/// - `show`: Print the result, 20 lines by default.
/// - `distinct`: Filter out duplicate rows.
/// - `join`: Join two DataFrames.
/// - `join_on`: Join two DataFrames based on expressions.
/// - `explain`: Print the query plan.
/// - `logical_plan`: Get the logical plan for this DataFrame.
/// - `optimized_logical_plan`: Get the optimized logical plan for this DataFrame.
/// - `execution_plan`: Get the execution plan for this DataFrame.
/// - `repartition`: Repartition the DataFrame based on a logical partitioning scheme.
/// - `repartition_by_hash`: Repartition the DataFrame based on a hash partitioning scheme.
/// - `union`: Calculate the union of two DataFrames, preserving duplicate rows.
/// - `union_distinct`: Calculate the distinct union of two DataFrames.
/// - `unnest_column`: Unnest a column in the DataFrame.
/// - `unnest_columns`: Unnest multiple columns in the DataFrame.
/// - `intersect`: Calculate the intersection of two DataFrames.
/// - `except_all`: Calculate the exception of two DataFrames.
/// - `write_csv`: Write the DataFrame to a CSV file.
/// - `write_parquet`: Write the DataFrame to a Parquet file.
/// - `write_json`: Write the DataFrame to a JSON file.
/// - `to_arrow_table`: Convert the DataFrame to an Arrow Table.
/// - `__arrow_c_stream__`: Convert the DataFrame to an Arrow C Stream.
/// - `execute_stream`: Execute the DataFrame and return a RecordBatchStream.
/// - `execute_stream_partitioned`: Execute the DataFrame and return partitioned RecordBatchStreams.
/// - `to_pandas`: Convert the DataFrame to a Pandas DataFrame.
/// - `to_pylist`: Convert the DataFrame to a Python list.
/// - `to_pydict`: Convert the DataFrame to a Python dictionary.
/// - `to_polars`: Convert the DataFrame to a Polars DataFrame.
/// - `count`: Execute the DataFrame to get the total number of rows.
#[pyclass(name = "DataFrame", module = "datafusion", subclass)]
#[derive(Clone)]
pub struct PyDataFrame {
@@ -156,6 +228,24 @@ impl PyDataFrame {
        PyArrowType(self.df.schema().into())
    }

    /// Convert this DataFrame into a Table that can be used in register_table.
    /// By convention, into_... methods consume self and return the new object.
    /// Disabling the clippy lint, so we can use &self,
    /// because we're working with Python bindings
    /// where objects are shared.
    /// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
    /// - we have not decided on the table_provider approach yet
    #[allow(clippy::wrong_self_convention)]
    fn into_view(&self) -> PyDataFusionResult<PyTable> {
        // Call the underlying Rust DataFrame::into_view method.
        // Note that the Rust method consumes self; here we clone the inner Arc<DataFrame>
        // so that we don't invalidate this PyDataFrame.
        let table_provider = self.df.as_ref().clone().into_view();
        let table_provider = PyTableProvider::new(table_provider);

        Ok(table_provider.as_table())
    }

    #[pyo3(signature = (*args))]
    fn select_columns(&self, args: Vec<PyBackedStr>) -> PyDataFusionResult<Self> {
        let args = args.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();
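Because the binding clones the inner DataFrame instead of consuming it, the original DataFrame stays valid after into_view. A short sketch of that behavior (again assuming into_view is exposed to Python as above), mirroring the assertions in the new test:

    from datafusion import SessionContext, col, literal

    ctx = SessionContext()
    df = ctx.from_pydict({"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}, "my_table")
    df_filtered = df.filter(col("a") > literal(2))

    # Registering the view does not invalidate df_filtered on the Python side.
    ctx.register_table("view1", df_filtered.into_view())

    view_rows = [b.to_pydict() for b in ctx.sql("SELECT * FROM view1").collect()]
    direct_rows = [b.to_pydict() for b in df_filtered.collect()]  # still usable
    assert view_rows == direct_rows == [{"a": [3, 4, 5], "b": [30, 40, 50]}]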
In general I think this is a good idea, but I'm worried about causing confusion with a table provider created from a view and a table provider that is passed from an external source using pycapsule. I can imagine a user would think that a table provider object from one place can be used with another. That is, if I create a table provider with into_view I should be able to register it with the session context. Now, I don't think that operation is strictly necessary, but I do expect it would cause some confusion.

What I think we want to do is to have a single common PyTableProvider that can be created either via a pycapsule or into_view.
Do you mean a constructor that takes a pycapsule argument, then extracts the provider to use in `PyTableProvider::new(provider)`?

Can I check how I can obtain the provider from `pub struct PyCapsule(PyAny)`?
@timsaucer Any chance you can give me some code pointers or reference PRs that would help with implementation?
What if we just skip the whole creating a view as a table provider and instead go straight to registering a view on the session context? We could do something like `register_view(df: DataFrame)`, which would under the hood do exactly what you've got, except not expose it back as a `PyTableProvider`, and eliminate any possible confusion. Then we'd also save the user a step. @matko would that solve your needs or do you need that view table provider exposed for other use?
Otherwise I think we have to plan for how we can have a common concept around two ways of creating table providers in Python code. Also we would want to think about how we would handle the return type of a UDTF, which we haven't even addressed.
Sounds good. Implemented.
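A minimal sketch of what the agreed-upon register_view amounts to under the hood, assuming into_view and register_table behave as in this PR; the actual implementation lives in the bindings, so the helper below is only illustrative:

    def register_view(ctx, name: str, df) -> None:
        """Register df as a named view without exposing the table provider."""
        ctx.register_table(name, df.into_view())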