Skip to content

Commit c82d617

Browse files
committed
merge main
2 parents 20099d2 + f08d5b0 commit c82d617

File tree

7 files changed

+118
-43
lines changed

7 files changed

+118
-43
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
[package]
1919
name = "datafusion-python"
20-
version = "49.0.0"
20+
version = "50.0.0"
2121
homepage = "https://datafusion.apache.org/python"
2222
repository = "https://github.com/apache/datafusion-python"
2323
authors = ["Apache DataFusion <[email protected]>"]

dev/changelog/50.0.0.md

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
<!--
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
-->
19+
20+
# Apache DataFusion Python 50.0.0 Changelog
21+
22+
This release consists of 12 commits from 7 contributors. See credits at the end of this changelog for more information.
23+
24+
**Implemented enhancements:**
25+
26+
- feat: allow passing a slice to an expression with the [] indexing [#1215](https://github.com/apache/datafusion-python/pull/1215) (timsaucer)
27+
28+
**Documentation updates:**
29+
30+
- docs: fix CaseBuilder documentation example [#1225](https://github.com/apache/datafusion-python/pull/1225) (IndexSeek)
31+
- docs: update link to user example for custom table provider [#1224](https://github.com/apache/datafusion-python/pull/1224) (IndexSeek)
32+
- docs: add apache iceberg as datafusion data source [#1240](https://github.com/apache/datafusion-python/pull/1240) (kevinjqliu)
33+
34+
**Other:**
35+
36+
- 49.0.0 release [#1211](https://github.com/apache/datafusion-python/pull/1211) (timsaucer)
37+
- Update development guide in README.md [#1213](https://github.com/apache/datafusion-python/pull/1213) (YKoustubhRao)
38+
- Add benchmark script and documentation for maximizing CPU usage in DataFusion Python [#1216](https://github.com/apache/datafusion-python/pull/1216) (kosiew)
39+
- Fixing a few Typos [#1220](https://github.com/apache/datafusion-python/pull/1220) (ntjohnson1)
40+
- Set fail on warning for documentation generation [#1218](https://github.com/apache/datafusion-python/pull/1218) (timsaucer)
41+
- chore: remove redundant error transformation [#1232](https://github.com/apache/datafusion-python/pull/1232) (mesejo)
42+
- Support string column identifiers for sort/aggregate/window and stricter Expr validation [#1221](https://github.com/apache/datafusion-python/pull/1221) (kosiew)
43+
- Prepare for DF50 [#1231](https://github.com/apache/datafusion-python/pull/1231) (timsaucer)
44+
45+
## Credits
46+
47+
Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
48+
49+
```
50+
4 Tim Saucer
51+
2 Tyler White
52+
2 kosiew
53+
1 Daniel Mesejo
54+
1 Kevin Liu
55+
1 Koustubh Rao
56+
1 Nick
57+
```
58+
59+
Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
60+

python/datafusion/dataframe.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -427,13 +427,30 @@ def select(self, *exprs: Expr | str) -> DataFrame:
427427
def drop(self, *columns: str) -> DataFrame:
428428
"""Drop arbitrary amount of columns.
429429
430+
Column names are case-sensitive and do not require double quotes like
431+
other operations such as `select`. Leading and trailing double quotes
432+
are allowed and will be automatically stripped if present.
433+
430434
Args:
431-
columns: Column names to drop from the dataframe.
435+
columns: Column names to drop from the dataframe. Both ``column_name``
436+
and ``"column_name"`` are accepted.
432437
433438
Returns:
434439
DataFrame with those columns removed in the projection.
440+
441+
Example Usage::
442+
443+
df.drop('ID_For_Students') # Works
444+
df.drop('"ID_For_Students"') # Also works (quotes stripped)
435445
"""
436-
return DataFrame(self.df.drop(*columns))
446+
normalized_columns = []
447+
for col in columns:
448+
if col.startswith('"') and col.endswith('"'):
449+
normalized_columns.append(col.strip('"')) # Strip double quotes
450+
else:
451+
normalized_columns.append(col)
452+
453+
return DataFrame(self.df.drop(*normalized_columns))
437454

438455
def filter(self, *predicates: Expr) -> DataFrame:
439456
"""Return a DataFrame for which ``predicate`` evaluates to ``True``.

python/tests/test_dataframe.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,16 @@ def test_select(df):
220220
assert result.column(1) == pa.array([1, 2, 3])
221221

222222

223+
def test_drop_quoted_columns():
224+
ctx = SessionContext()
225+
batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], names=["ID_For_Students"])
226+
df = ctx.create_dataframe([[batch]])
227+
228+
# Both should work
229+
assert df.drop('"ID_For_Students"').schema().names == []
230+
assert df.drop("ID_For_Students").schema().names == []
231+
232+
223233
def test_select_mixed_expr_string(df):
224234
df = df.select(column("b"), "a")
225235

src/context.rs

Lines changed: 26 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ impl PySQLOptions {
296296
/// `PySessionContext` is able to plan and execute DataFusion plans.
297297
/// It has a powerful optimizer, a physical planner for local execution, and a
298298
/// multi-threaded execution engine to perform the execution.
299-
#[pyclass(name = "SessionContext", module = "datafusion", subclass)]
299+
#[pyclass(frozen, name = "SessionContext", module = "datafusion", subclass)]
300300
#[derive(Clone)]
301301
pub struct PySessionContext {
302302
pub ctx: SessionContext,
@@ -348,7 +348,7 @@ impl PySessionContext {
348348
/// Register an object store with the given name
349349
#[pyo3(signature = (scheme, store, host=None))]
350350
pub fn register_object_store(
351-
&mut self,
351+
&self,
352352
scheme: &str,
353353
store: StorageContexts,
354354
host: Option<&str>,
@@ -380,7 +380,7 @@ impl PySessionContext {
380380
schema=None,
381381
file_sort_order=None))]
382382
pub fn register_listing_table(
383-
&mut self,
383+
&self,
384384
name: &str,
385385
path: &str,
386386
table_partition_cols: Vec<(String, PyArrowType<DataType>)>,
@@ -421,22 +421,22 @@ impl PySessionContext {
421421
Ok(())
422422
}
423423

424-
pub fn register_udtf(&mut self, func: PyTableFunction) {
424+
pub fn register_udtf(&self, func: PyTableFunction) {
425425
let name = func.name.clone();
426426
let func = Arc::new(func);
427427
self.ctx.register_udtf(&name, func);
428428
}
429429

430430
/// Returns a PyDataFrame whose plan corresponds to the SQL statement.
431-
pub fn sql(&mut self, query: &str, py: Python) -> PyDataFusionResult<PyDataFrame> {
431+
pub fn sql(&self, query: &str, py: Python) -> PyDataFusionResult<PyDataFrame> {
432432
let result = self.ctx.sql(query);
433433
let df = wait_for_future(py, result)??;
434434
Ok(PyDataFrame::new(df))
435435
}
436436

437437
#[pyo3(signature = (query, options=None))]
438438
pub fn sql_with_options(
439-
&mut self,
439+
&self,
440440
query: &str,
441441
options: Option<PySQLOptions>,
442442
py: Python,
@@ -453,7 +453,7 @@ impl PySessionContext {
453453

454454
#[pyo3(signature = (partitions, name=None, schema=None))]
455455
pub fn create_dataframe(
456-
&mut self,
456+
&self,
457457
partitions: PyArrowType<Vec<Vec<RecordBatch>>>,
458458
name: Option<&str>,
459459
schema: Option<PyArrowType<Schema>>,
@@ -488,14 +488,14 @@ impl PySessionContext {
488488
}
489489

490490
/// Create a DataFrame from an existing logical plan
491-
pub fn create_dataframe_from_logical_plan(&mut self, plan: PyLogicalPlan) -> PyDataFrame {
491+
pub fn create_dataframe_from_logical_plan(&self, plan: PyLogicalPlan) -> PyDataFrame {
492492
PyDataFrame::new(DataFrame::new(self.ctx.state(), plan.plan.as_ref().clone()))
493493
}
494494

495495
/// Construct datafusion dataframe from Python list
496496
#[pyo3(signature = (data, name=None))]
497497
pub fn from_pylist(
498-
&mut self,
498+
&self,
499499
data: Bound<'_, PyList>,
500500
name: Option<&str>,
501501
) -> PyResult<PyDataFrame> {
@@ -515,7 +515,7 @@ impl PySessionContext {
515515
/// Construct datafusion dataframe from Python dictionary
516516
#[pyo3(signature = (data, name=None))]
517517
pub fn from_pydict(
518-
&mut self,
518+
&self,
519519
data: Bound<'_, PyDict>,
520520
name: Option<&str>,
521521
) -> PyResult<PyDataFrame> {
@@ -535,7 +535,7 @@ impl PySessionContext {
535535
/// Construct datafusion dataframe from Arrow Table
536536
#[pyo3(signature = (data, name=None))]
537537
pub fn from_arrow(
538-
&mut self,
538+
&self,
539539
data: Bound<'_, PyAny>,
540540
name: Option<&str>,
541541
py: Python,
@@ -569,11 +569,7 @@ impl PySessionContext {
569569
/// Construct datafusion dataframe from pandas
570570
#[allow(clippy::wrong_self_convention)]
571571
#[pyo3(signature = (data, name=None))]
572-
pub fn from_pandas(
573-
&mut self,
574-
data: Bound<'_, PyAny>,
575-
name: Option<&str>,
576-
) -> PyResult<PyDataFrame> {
572+
pub fn from_pandas(&self, data: Bound<'_, PyAny>, name: Option<&str>) -> PyResult<PyDataFrame> {
577573
// Obtain GIL token
578574
let py = data.py();
579575

@@ -589,11 +585,7 @@ impl PySessionContext {
589585

590586
/// Construct datafusion dataframe from polars
591587
#[pyo3(signature = (data, name=None))]
592-
pub fn from_polars(
593-
&mut self,
594-
data: Bound<'_, PyAny>,
595-
name: Option<&str>,
596-
) -> PyResult<PyDataFrame> {
588+
pub fn from_polars(&self, data: Bound<'_, PyAny>, name: Option<&str>) -> PyResult<PyDataFrame> {
597589
// Convert Polars dataframe to Arrow Table
598590
let table = data.call_method0("to_arrow")?;
599591

@@ -602,24 +594,20 @@ impl PySessionContext {
602594
Ok(df)
603595
}
604596

605-
pub fn register_table(
606-
&mut self,
607-
name: &str,
608-
table: Bound<'_, PyAny>,
609-
) -> PyDataFusionResult<()> {
597+
pub fn register_table(&self, name: &str, table: Bound<'_, PyAny>) -> PyDataFusionResult<()> {
610598
let table = PyTable::new(&table)?;
611599

612600
self.ctx.register_table(name, table.table)?;
613601
Ok(())
614602
}
615603

616-
pub fn deregister_table(&mut self, name: &str) -> PyDataFusionResult<()> {
604+
pub fn deregister_table(&self, name: &str) -> PyDataFusionResult<()> {
617605
self.ctx.deregister_table(name)?;
618606
Ok(())
619607
}
620608

621609
pub fn register_catalog_provider(
622-
&mut self,
610+
&self,
623611
name: &str,
624612
provider: Bound<'_, PyAny>,
625613
) -> PyDataFusionResult<()> {
@@ -648,7 +636,7 @@ impl PySessionContext {
648636

649637
/// Construct datafusion dataframe from Arrow Table
650638
pub fn register_table_provider(
651-
&mut self,
639+
&self,
652640
name: &str,
653641
provider: Bound<'_, PyAny>,
654642
) -> PyDataFusionResult<()> {
@@ -657,7 +645,7 @@ impl PySessionContext {
657645
}
658646

659647
pub fn register_record_batches(
660-
&mut self,
648+
&self,
661649
name: &str,
662650
partitions: PyArrowType<Vec<Vec<RecordBatch>>>,
663651
) -> PyDataFusionResult<()> {
@@ -675,7 +663,7 @@ impl PySessionContext {
675663
schema=None,
676664
file_sort_order=None))]
677665
pub fn register_parquet(
678-
&mut self,
666+
&self,
679667
name: &str,
680668
path: &str,
681669
table_partition_cols: Vec<(String, PyArrowType<DataType>)>,
@@ -718,7 +706,7 @@ impl PySessionContext {
718706
file_extension=".csv",
719707
file_compression_type=None))]
720708
pub fn register_csv(
721-
&mut self,
709+
&self,
722710
name: &str,
723711
path: &Bound<'_, PyAny>,
724712
schema: Option<PyArrowType<Schema>>,
@@ -766,7 +754,7 @@ impl PySessionContext {
766754
table_partition_cols=vec![],
767755
file_compression_type=None))]
768756
pub fn register_json(
769-
&mut self,
757+
&self,
770758
name: &str,
771759
path: PathBuf,
772760
schema: Option<PyArrowType<Schema>>,
@@ -805,7 +793,7 @@ impl PySessionContext {
805793
file_extension=".avro",
806794
table_partition_cols=vec![]))]
807795
pub fn register_avro(
808-
&mut self,
796+
&self,
809797
name: &str,
810798
path: PathBuf,
811799
schema: Option<PyArrowType<Schema>>,
@@ -846,17 +834,17 @@ impl PySessionContext {
846834
Ok(())
847835
}
848836

849-
pub fn register_udf(&mut self, udf: PyScalarUDF) -> PyResult<()> {
837+
pub fn register_udf(&self, udf: PyScalarUDF) -> PyResult<()> {
850838
self.ctx.register_udf(udf.function);
851839
Ok(())
852840
}
853841

854-
pub fn register_udaf(&mut self, udaf: PyAggregateUDF) -> PyResult<()> {
842+
pub fn register_udaf(&self, udaf: PyAggregateUDF) -> PyResult<()> {
855843
self.ctx.register_udaf(udaf.function);
856844
Ok(())
857845
}
858846

859-
pub fn register_udwf(&mut self, udwf: PyWindowUDF) -> PyResult<()> {
847+
pub fn register_udwf(&self, udwf: PyWindowUDF) -> PyResult<()> {
860848
self.ctx.register_udwf(udwf.function);
861849
Ok(())
862850
}
@@ -928,7 +916,7 @@ impl PySessionContext {
928916
#[allow(clippy::too_many_arguments)]
929917
#[pyo3(signature = (path, schema=None, schema_infer_max_records=1000, file_extension=".json", table_partition_cols=vec![], file_compression_type=None))]
930918
pub fn read_json(
931-
&mut self,
919+
&self,
932920
path: PathBuf,
933921
schema: Option<PyArrowType<Schema>>,
934922
schema_infer_max_records: usize,

src/substrait.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ impl PySubstraitConsumer {
138138
/// Convert Substrait Plan to DataFusion DataFrame
139139
#[staticmethod]
140140
pub fn from_substrait_plan(
141-
ctx: &mut PySessionContext,
141+
ctx: &PySessionContext,
142142
plan: PyPlan,
143143
py: Python,
144144
) -> PyDataFusionResult<PyLogicalPlan> {

0 commit comments

Comments
 (0)