Skip to content

Commit c82d617

Browse files
committed
merge main
2 parents 20099d2 + f08d5b0 commit c82d617

File tree

7 files changed

+118
-43
lines changed

7 files changed

+118
-43
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
[package]
1919
name = "datafusion-python"
20-
version = "49.0.0"
20+
version = "50.0.0"
2121
homepage = "https://datafusion.apache.org/python"
2222
repository = "https://github.com/apache/datafusion-python"
2323
authors = ["Apache DataFusion <[email protected]>"]

dev/changelog/50.0.0.md

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
<!--
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
-->
19+
20+
# Apache DataFusion Python 50.0.0 Changelog
21+
22+
This release consists of 12 commits from 7 contributors. See credits at the end of this changelog for more information.
23+
24+
**Implemented enhancements:**
25+
26+
- feat: allow passing a slice to an expression with the [] indexing [#1215](https://github.com/apache/datafusion-python/pull/1215) (timsaucer)
27+
28+
**Documentation updates:**
29+
30+
- docs: fix CaseBuilder documentation example [#1225](https://github.com/apache/datafusion-python/pull/1225) (IndexSeek)
31+
- docs: update link to user example for custom table provider [#1224](https://github.com/apache/datafusion-python/pull/1224) (IndexSeek)
32+
- docs: add apache iceberg as datafusion data source [#1240](https://github.com/apache/datafusion-python/pull/1240) (kevinjqliu)
33+
34+
**Other:**
35+
36+
- 49.0.0 release [#1211](https://github.com/apache/datafusion-python/pull/1211) (timsaucer)
37+
- Update development guide in README.md [#1213](https://github.com/apache/datafusion-python/pull/1213) (YKoustubhRao)
38+
- Add benchmark script and documentation for maximizing CPU usage in DataFusion Python [#1216](https://github.com/apache/datafusion-python/pull/1216) (kosiew)
39+
- Fixing a few Typos [#1220](https://github.com/apache/datafusion-python/pull/1220) (ntjohnson1)
40+
- Set fail on warning for documentation generation [#1218](https://github.com/apache/datafusion-python/pull/1218) (timsaucer)
41+
- chore: remove redundant error transformation [#1232](https://github.com/apache/datafusion-python/pull/1232) (mesejo)
42+
- Support string column identifiers for sort/aggregate/window and stricter Expr validation [#1221](https://github.com/apache/datafusion-python/pull/1221) (kosiew)
43+
- Prepare for DF50 [#1231](https://github.com/apache/datafusion-python/pull/1231) (timsaucer)
44+
45+
## Credits
46+
47+
Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
48+
49+
```
50+
4 Tim Saucer
51+
2 Tyler White
52+
2 kosiew
53+
1 Daniel Mesejo
54+
1 Kevin Liu
55+
1 Koustubh Rao
56+
1 Nick
57+
```
58+
59+
Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
60+

python/datafusion/dataframe.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -427,13 +427,30 @@ def select(self, *exprs: Expr | str) -> DataFrame:
427427
def drop(self, *columns: str) -> DataFrame:
428428
"""Drop arbitrary amount of columns.
429429
430+
Column names are case-sensitive and do not require double quotes like
431+
other operations such as `select`. Leading and trailing double quotes
432+
are allowed and will be automatically stripped if present.
433+
430434
Args:
431-
columns: Column names to drop from the dataframe.
435+
columns: Column names to drop from the dataframe. Both ``column_name``
436+
and ``"column_name"`` are accepted.
432437
433438
Returns:
434439
DataFrame with those columns removed in the projection.
440+
441+
Example Usage::
442+
443+
df.drop('ID_For_Students') # Works
444+
df.drop('"ID_For_Students"') # Also works (quotes stripped)
435445
"""
436-
return DataFrame(self.df.drop(*columns))
446+
normalized_columns = []
447+
for col in columns:
448+
if col.startswith('"') and col.endswith('"'):
449+
normalized_columns.append(col.strip('"')) # Strip double quotes
450+
else:
451+
normalized_columns.append(col)
452+
453+
return DataFrame(self.df.drop(*normalized_columns))
437454

438455
def filter(self, *predicates: Expr) -> DataFrame:
439456
"""Return a DataFrame for which ``predicate`` evaluates to ``True``.

python/tests/test_dataframe.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,16 @@ def test_select(df):
220220
assert result.column(1) == pa.array([1, 2, 3])
221221

222222

223+
def test_drop_quoted_columns():
224+
ctx = SessionContext()
225+
batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], names=["ID_For_Students"])
226+
df = ctx.create_dataframe([[batch]])
227+
228+
# Both should work
229+
assert df.drop('"ID_For_Students"').schema().names == []
230+
assert df.drop("ID_For_Students").schema().names == []
231+
232+
223233
def test_select_mixed_expr_string(df):
224234
df = df.select(column("b"), "a")
225235

src/context.rs

Lines changed: 26 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ impl PySQLOptions {
296296
/// `PySessionContext` is able to plan and execute DataFusion plans.
297297
/// It has a powerful optimizer, a physical planner for local execution, and a
298298
/// multi-threaded execution engine to perform the execution.
299-
#[pyclass(name = "SessionContext", module = "datafusion", subclass)]
299+
#[pyclass(frozen, name = "SessionContext", module = "datafusion", subclass)]
300300
#[derive(Clone)]
301301
pub struct PySessionContext {
302302
pub ctx: SessionContext,
@@ -348,7 +348,7 @@ impl PySessionContext {
348348
/// Register an object store with the given name
349349
#[pyo3(signature = (scheme, store, host=None))]
350350
pub fn register_object_store(
351-
&mut self,
351+
&self,
352352
scheme: &str,
353353
store: StorageContexts,
354354
host: Option<&str>,
@@ -380,7 +380,7 @@ impl PySessionContext {
380380
schema=None,
381381
file_sort_order=None))]
382382
pub fn register_listing_table(
383-
&mut self,
383+
&self,
384384
name: &str,
385385
path: &str,
386386
table_partition_cols: Vec<(String, PyArrowType<DataType>)>,
@@ -421,22 +421,22 @@ impl PySessionContext {
421421
Ok(())
422422
}
423423

424-
pub fn register_udtf(&mut self, func: PyTableFunction) {
424+
pub fn register_udtf(&self, func: PyTableFunction) {
425425
let name = func.name.clone();
426426
let func = Arc::new(func);
427427
self.ctx.register_udtf(&name, func);
428428
}
429429

430430
/// Returns a PyDataFrame whose plan corresponds to the SQL statement.
431-
pub fn sql(&mut self, query: &str, py: Python) -> PyDataFusionResult<PyDataFrame> {
431+
pub fn sql(&self, query: &str, py: Python) -> PyDataFusionResult<PyDataFrame> {
432432
let result = self.ctx.sql(query);
433433
let df = wait_for_future(py, result)??;
434434
Ok(PyDataFrame::new(df))
435435
}
436436

437437
#[pyo3(signature = (query, options=None))]
438438
pub fn sql_with_options(
439-
&mut self,
439+
&self,
440440
query: &str,
441441
options: Option<PySQLOptions>,
442442
py: Python,
@@ -453,7 +453,7 @@ impl PySessionContext {
453453

454454
#[pyo3(signature = (partitions, name=None, schema=None))]
455455
pub fn create_dataframe(
456-
&mut self,
456+
&self,
457457
partitions: PyArrowType<Vec<Vec<RecordBatch>>>,
458458
name: Option<&str>,
459459
schema: Option<PyArrowType<Schema>>,
@@ -488,14 +488,14 @@ impl PySessionContext {
488488
}
489489

490490
/// Create a DataFrame from an existing logical plan
491-
pub fn create_dataframe_from_logical_plan(&mut self, plan: PyLogicalPlan) -> PyDataFrame {
491+
pub fn create_dataframe_from_logical_plan(&self, plan: PyLogicalPlan) -> PyDataFrame {
492492
PyDataFrame::new(DataFrame::new(self.ctx.state(), plan.plan.as_ref().clone()))
493493
}
494494

495495
/// Construct datafusion dataframe from Python list
496496
#[pyo3(signature = (data, name=None))]
497497
pub fn from_pylist(
498-
&mut self,
498+
&self,
499499
data: Bound<'_, PyList>,
500500
name: Option<&str>,
501501
) -> PyResult<PyDataFrame> {
@@ -515,7 +515,7 @@ impl PySessionContext {
515515
/// Construct datafusion dataframe from Python dictionary
516516
#[pyo3(signature = (data, name=None))]
517517
pub fn from_pydict(
518-
&mut self,
518+
&self,
519519
data: Bound<'_, PyDict>,
520520
name: Option<&str>,
521521
) -> PyResult<PyDataFrame> {
@@ -535,7 +535,7 @@ impl PySessionContext {
535535
/// Construct datafusion dataframe from Arrow Table
536536
#[pyo3(signature = (data, name=None))]
537537
pub fn from_arrow(
538-
&mut self,
538+
&self,
539539
data: Bound<'_, PyAny>,
540540
name: Option<&str>,
541541
py: Python,
@@ -569,11 +569,7 @@ impl PySessionContext {
569569
/// Construct datafusion dataframe from pandas
570570
#[allow(clippy::wrong_self_convention)]
571571
#[pyo3(signature = (data, name=None))]
572-
pub fn from_pandas(
573-
&mut self,
574-
data: Bound<'_, PyAny>,
575-
name: Option<&str>,
576-
) -> PyResult<PyDataFrame> {
572+
pub fn from_pandas(&self, data: Bound<'_, PyAny>, name: Option<&str>) -> PyResult<PyDataFrame> {
577573
// Obtain GIL token
578574
let py = data.py();
579575

@@ -589,11 +585,7 @@ impl PySessionContext {
589585

590586
/// Construct datafusion dataframe from polars
591587
#[pyo3(signature = (data, name=None))]
592-
pub fn from_polars(
593-
&mut self,
594-
data: Bound<'_, PyAny>,
595-
name: Option<&str>,
596-
) -> PyResult<PyDataFrame> {
588+
pub fn from_polars(&self, data: Bound<'_, PyAny>, name: Option<&str>) -> PyResult<PyDataFrame> {
597589
// Convert Polars dataframe to Arrow Table
598590
let table = data.call_method0("to_arrow")?;
599591

@@ -602,24 +594,20 @@ impl PySessionContext {
602594
Ok(df)
603595
}
604596

605-
pub fn register_table(
606-
&mut self,
607-
name: &str,
608-
table: Bound<'_, PyAny>,
609-
) -> PyDataFusionResult<()> {
597+
pub fn register_table(&self, name: &str, table: Bound<'_, PyAny>) -> PyDataFusionResult<()> {
610598
let table = PyTable::new(&table)?;
611599

612600
self.ctx.register_table(name, table.table)?;
613601
Ok(())
614602
}
615603

616-
pub fn deregister_table(&mut self, name: &str) -> PyDataFusionResult<()> {
604+
pub fn deregister_table(&self, name: &str) -> PyDataFusionResult<()> {
617605
self.ctx.deregister_table(name)?;
618606
Ok(())
619607
}
620608

621609
pub fn register_catalog_provider(
622-
&mut self,
610+
&self,
623611
name: &str,
624612
provider: Bound<'_, PyAny>,
625613
) -> PyDataFusionResult<()> {
@@ -648,7 +636,7 @@ impl PySessionContext {
648636

649637
/// Construct datafusion dataframe from Arrow Table
650638
pub fn register_table_provider(
651-
&mut self,
639+
&self,
652640
name: &str,
653641
provider: Bound<'_, PyAny>,
654642
) -> PyDataFusionResult<()> {
@@ -657,7 +645,7 @@ impl PySessionContext {
657645
}
658646

659647
pub fn register_record_batches(
660-
&mut self,
648+
&self,
661649
name: &str,
662650
partitions: PyArrowType<Vec<Vec<RecordBatch>>>,
663651
) -> PyDataFusionResult<()> {
@@ -675,7 +663,7 @@ impl PySessionContext {
675663
schema=None,
676664
file_sort_order=None))]
677665
pub fn register_parquet(
678-
&mut self,
666+
&self,
679667
name: &str,
680668
path: &str,
681669
table_partition_cols: Vec<(String, PyArrowType<DataType>)>,
@@ -718,7 +706,7 @@ impl PySessionContext {
718706
file_extension=".csv",
719707
file_compression_type=None))]
720708
pub fn register_csv(
721-
&mut self,
709+
&self,
722710
name: &str,
723711
path: &Bound<'_, PyAny>,
724712
schema: Option<PyArrowType<Schema>>,
@@ -766,7 +754,7 @@ impl PySessionContext {
766754
table_partition_cols=vec![],
767755
file_compression_type=None))]
768756
pub fn register_json(
769-
&mut self,
757+
&self,
770758
name: &str,
771759
path: PathBuf,
772760
schema: Option<PyArrowType<Schema>>,
@@ -805,7 +793,7 @@ impl PySessionContext {
805793
file_extension=".avro",
806794
table_partition_cols=vec![]))]
807795
pub fn register_avro(
808-
&mut self,
796+
&self,
809797
name: &str,
810798
path: PathBuf,
811799
schema: Option<PyArrowType<Schema>>,
@@ -846,17 +834,17 @@ impl PySessionContext {
846834
Ok(())
847835
}
848836

849-
pub fn register_udf(&mut self, udf: PyScalarUDF) -> PyResult<()> {
837+
pub fn register_udf(&self, udf: PyScalarUDF) -> PyResult<()> {
850838
self.ctx.register_udf(udf.function);
851839
Ok(())
852840
}
853841

854-
pub fn register_udaf(&mut self, udaf: PyAggregateUDF) -> PyResult<()> {
842+
pub fn register_udaf(&self, udaf: PyAggregateUDF) -> PyResult<()> {
855843
self.ctx.register_udaf(udaf.function);
856844
Ok(())
857845
}
858846

859-
pub fn register_udwf(&mut self, udwf: PyWindowUDF) -> PyResult<()> {
847+
pub fn register_udwf(&self, udwf: PyWindowUDF) -> PyResult<()> {
860848
self.ctx.register_udwf(udwf.function);
861849
Ok(())
862850
}
@@ -928,7 +916,7 @@ impl PySessionContext {
928916
#[allow(clippy::too_many_arguments)]
929917
#[pyo3(signature = (path, schema=None, schema_infer_max_records=1000, file_extension=".json", table_partition_cols=vec![], file_compression_type=None))]
930918
pub fn read_json(
931-
&mut self,
919+
&self,
932920
path: PathBuf,
933921
schema: Option<PyArrowType<Schema>>,
934922
schema_infer_max_records: usize,

src/substrait.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ impl PySubstraitConsumer {
138138
/// Convert Substrait Plan to DataFusion DataFrame
139139
#[staticmethod]
140140
pub fn from_substrait_plan(
141-
ctx: &mut PySessionContext,
141+
ctx: &PySessionContext,
142142
plan: PyPlan,
143143
py: Python,
144144
) -> PyDataFusionResult<PyLogicalPlan> {

0 commit comments

Comments
 (0)