Merged

Changes from 15 commits (21 commits total)

Commits
9d589a2
add test_view
kosiew Feb 6, 2025
648c185
feat: add into_view method to register DataFrame as a view
kosiew Feb 6, 2025
e55ac9f
add pytableprovider
kosiew Feb 6, 2025
ca42449
feat: add as_table method to PyTableProvider and update into_view to …
kosiew Feb 6, 2025
d0c3163
refactor: simplify as_table method and update documentation for into_…
kosiew Feb 6, 2025
8578713
test: improve test_register_filtered_dataframe by removing redundant …
kosiew Feb 6, 2025
9cdd0dc
test: enhance test_register_filtered_dataframe with additional assert…
kosiew Feb 6, 2025
c207b6c
ruff formatted
kosiew Feb 6, 2025
20dbfe8
cleanup: remove unused imports from test_view.py
kosiew Feb 6, 2025
4b4c641
docs: add example for registering a DataFrame as a view in README.md
kosiew Feb 7, 2025
12c4fe3
docs: update docstring for into_view method to clarify usage as ViewT…
kosiew Feb 7, 2025
15ead1f
chore: add license header to test_view.py
kosiew Feb 7, 2025
48eb8db
ruff correction
kosiew Feb 7, 2025
f73eebb
refactor: rename into_view method to _into_view
kosiew Feb 7, 2025
6bba2e2
ruff lint
kosiew Feb 12, 2025
7b0cbf1
refactor: simplify into_view method and update Rust binding convention
kosiew Feb 18, 2025
f594b46
docs: add views section to user guide with example on registering views
kosiew Feb 18, 2025
90a6a8b
feat: add register_view method to SessionContext for DataFrame regist…
kosiew Mar 7, 2025
c31395f
Merge branch 'main' into view
kosiew Mar 7, 2025
f0837de
docs: update README and user guide to reflect register_view method fo…
kosiew Mar 7, 2025
9d8cdb5
docs: remove some documentation from PyDataFrame
kosiew Mar 7, 2025
43 changes: 43 additions & 0 deletions README.md
@@ -81,6 +81,49 @@ This produces the following chart:

![Chart](examples/chart.png)

## Registering a DataFrame as a View

You can use the `into_view` method to convert a DataFrame into a view and register it with the context.

```python
from datafusion import SessionContext, col, literal

Member:

I think this is very good, but would be more helpful if moved into the appropriate docs section so it goes into the online documentation rather than the readme.

Contributor Author:

I created a view.rst for this.

# Create a DataFusion context
ctx = SessionContext()

# Create sample data
data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}

# Create a DataFrame from the dictionary
df = ctx.from_pydict(data, "my_table")

# Filter the DataFrame (for example, keep rows where a > 2)
df_filtered = df.filter(col("a") > literal(2))

# Convert the filtered DataFrame into a view
view = df_filtered.into_view()

# Register the view with the context
ctx.register_table("view1", view)

# Now run a SQL query against the registered view
df_view = ctx.sql("SELECT * FROM view1")

# Collect the results
results = df_view.collect()

# Convert results to a list of dictionaries for display
result_dicts = [batch.to_pydict() for batch in results]

print(result_dicts)
```

This will output:

```python
[{'a': [3, 4, 5], 'b': [30, 40, 50]}]
```
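Because `view1` is registered like any other table, it can be reused in later queries. A small hedged extension of the example above (the SQL and expected result are illustrative, not part of this PR's diff):

```python
# Continuing from the example above: the registered view behaves like a table
# in subsequent SQL statements.
df_agg = ctx.sql("SELECT SUM(b) AS total_b FROM view1")
print([batch.to_pydict() for batch in df_agg.collect()])
# Expected, given the data above: [{'total_b': [120]}]
```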

## Configuration

It is possible to configure runtime (memory and disk settings) and configuration settings when creating a context.
8 changes: 8 additions & 0 deletions python/datafusion/dataframe.py
@@ -121,6 +121,14 @@ def __init__(self, df: DataFrameInternal) -> None:
"""
self.df = df

def into_view(self) -> pa.Table:
"""Convert DataFrame as a ViewTable which can be used in register_table."""
return self._into_view()

def _into_view(self) -> pa.Table:
"""Convert DataFrame as a ViewTable which can be used in register_table."""
return self.df._into_view()

def __getitem__(self, key: str | List[str]) -> DataFrame:
"""Return a new :py:class`DataFrame` with the specified column or columns.

52 changes: 52 additions & 0 deletions python/tests/test_view.py
@@ -0,0 +1,52 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


from datafusion import SessionContext, col, literal


def test_register_filtered_dataframe():
    ctx = SessionContext()

    data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}

    df = ctx.from_pydict(data, "my_table")

    df_filtered = df.filter(col("a") > literal(2))
    view = df_filtered.into_view()

    assert view.kind == "view"

    ctx.register_table("view1", view)
Contributor Author:

This is modelled after how into_view is used in datafusion:

async fn with_column_renamed_ambiguous() -> Result<()> {
    let df = test_table().await?.select_columns(&["c1", "c2", "c3"])?;
    let ctx = SessionContext::new();

    let table = df.into_view();
    ctx.register_table("t1", table.clone())?;
    ctx.register_table("t2", table)?;


    df_view = ctx.sql("SELECT * FROM view1")

    filtered_results = df_view.collect()

    result_dicts = [batch.to_pydict() for batch in filtered_results]

    expected_results = [{"a": [3, 4, 5], "b": [30, 40, 50]}]

    assert result_dicts == expected_results

    df_results = df.collect()

    df_result_dicts = [batch.to_pydict() for batch in df_results]

    expected_df_results = [{"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}]

    assert df_result_dicts == expected_df_results
32 changes: 32 additions & 0 deletions src/dataframe.rs
@@ -30,6 +30,7 @@ use datafusion::arrow::util::pretty;
use datafusion::common::UnnestOptions;
use datafusion::config::{CsvOptions, TableParquetOptions};
use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
use datafusion::datasource::TableProvider;
use datafusion::execution::SendableRecordBatchStream;
use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel};
use datafusion::prelude::*;
@@ -39,6 +40,7 @@ use pyo3::pybacked::PyBackedStr;
use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods};
use tokio::task::JoinHandle;

use crate::catalog::PyTable;
use crate::errors::{py_datafusion_err, PyDataFusionError};
use crate::expr::sort_expr::to_sort_expressions;
use crate::physical_plan::PyExecutionPlan;
@@ -50,6 +52,22 @@ use crate::{
expr::{sort_expr::PySortExpr, PyExpr},
};

#[pyclass(name = "TableProvider", module = "datafusion")]
pub struct PyTableProvider {
    provider: Arc<dyn TableProvider>,
}

impl PyTableProvider {
    pub fn new(provider: Arc<dyn TableProvider>) -> Self {
        Self { provider }
    }

    pub fn as_table(&self) -> PyTable {
        let table_provider: Arc<dyn TableProvider> = self.provider.clone();
        PyTable::new(table_provider)
    }
}

Comment on lines +58 to +73
Member:

In general I think this is a good idea, but I'm worried about causing confusion between a table provider created from a view and a table provider that is passed from an external source using pycapsule. I can imagine a user would think that a table provider object from one place can be used with another. That is, if I create a table provider with into_view I should be able to register it with the session context. Now, I don't think that operation is strictly necessary, but I do expect it would cause some confusion.

What I think we want to do is to have a single common PyTableProvider that can be created either via a pycapsule or into_view.
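For readers following the discussion, the two creation paths being contrasted look roughly like this from Python. This is a hedged sketch, not code from this PR: `into_view` and `register_table` appear in the diff above, while `register_table_provider` and the `__datafusion_table_provider__` PyCapsule protocol are assumed from the existing FFI support, and `my_rust_extension.ExternalTableProvider` is a purely hypothetical stand-in for a capsule-exporting object.

```python
from datafusion import SessionContext, col, literal

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 2, 3]}, "t")

# Path 1: a table provider created inside datafusion-python via into_view,
# registered like any other table.
view_provider = df.filter(col("a") > literal(1)).into_view()
ctx.register_table("view1", view_provider)

# Path 2: a table provider handed in from an external Rust extension through
# the PyCapsule protocol. The import below is hypothetical and only
# illustrates the shape of that path.
# from my_rust_extension import ExternalTableProvider
# ctx.register_table_provider("external_table", ExternalTableProvider())
```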

Contributor Author:

a single common PyTableProvider that can be created either via a pycapsule or into_view.

Do you mean a constructor that takes a pycapsule argument, then extract provider to use in
PyTableProvider::new(provider)?

Can I check how I can obtain the provider from pub struct PyCapsule(PyAny)?

Contributor Author:

@timsaucer

Any chance you can give me some code pointers or reference PRs that would help with implementation?

Member:

What if we just skip the whole creating a view as a table provider and instead go straight to registering a view on the session context?

We could do something like register_view(df: DataFrame) which would under the hood do exactly what you've got except not expose it back as a PyTableProvider and eliminate any possible confusion. Then we'd also save the user a step.

@matko would that solve your needs or do you need that view table provider exposed for other use?

Member:

Otherwise I think we have to plan for how we can have a common concept around two ways of creating table providers in python code. Also we would want to think about how we would handle the return type of a udtf, which we haven't even addressed.

Contributor Author:

skip the whole creating a view as a table provider and instead go straight to registering a view on the session context?

Sounds good.
Implemented.
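The commit list above shows that this direction later landed as a `register_view` method on `SessionContext`, although that change is not part of the 15-commit diff rendered here. A minimal usage sketch, assuming a `register_view(name, df)` signature that mirrors `register_table`:

```python
from datafusion import SessionContext, col, literal

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}, "my_table")
df_filtered = df.filter(col("a") > literal(2))

# Register the filtered DataFrame directly as a view; no intermediate
# table-provider object is exposed to Python.
ctx.register_view("view1", df_filtered)

batches = ctx.sql("SELECT * FROM view1").collect()
print([batch.to_pydict() for batch in batches])
# Expected, under the assumptions above: [{'a': [3, 4, 5], 'b': [30, 40, 50]}]
```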

/// A PyDataFrame is a representation of a logical plan and an API to compose statements.
/// Use it to build a plan and `.collect()` to execute the plan and collect the result.
/// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment.
@@ -156,6 +174,20 @@ impl PyDataFrame {
        PyArrowType(self.df.schema().into())
    }

    /// Convert this DataFrame into a Table that can be used in register_table
    fn _into_view(&self) -> PyDataFusionResult<PyTable> {
Member:

I recommend disabling that specific clippy warning.

Contributor Author:

Done

        // Call the underlying Rust DataFrame::into_view method.
        // Note that the Rust method consumes self; here we clone the inner Arc<DataFrame>
        // so that we don't invalidate this PyDataFrame.
        // The method is named _into_view because clippy warns that `into_*` methods
        // usually take `self` by value, but we cannot own self here: Python objects
        // are shared, so `self` cannot be moved out of the Python interpreter.
        let table_provider = self.df.as_ref().clone().into_view();
        let table_provider = PyTableProvider::new(table_provider);

        Ok(table_provider.as_table())
    }

    #[pyo3(signature = (*args))]
    fn select_columns(&self, args: Vec<PyBackedStr>) -> PyDataFusionResult<Self> {
        let args = args.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();