-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Allow dropping qualified columns #19549
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+169
−9
Merged
Changes from 8 commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
7ee49d2
Add failing test
ntjohnson1 52d153e
rust/issues/85077
ntjohnson1 743ede5
Resolve failing test by coercing into column
ntjohnson1 224deb9
Add helper for qualified column access
ntjohnson1 95940c0
Run pre-commit
ntjohnson1 fe63c80
Handle duplicate unqualified name
ntjohnson1 978b973
Run ci fmt
ntjohnson1 3cd4e67
One more pass with fmt and clippy
ntjohnson1 849bcb8
Revert "rust/issues/85077"
ntjohnson1 66b65f2
Merge branch 'main' of github.com:rerun-io/arrow-datafusion into feat…
ntjohnson1 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -445,15 +445,31 @@ impl DataFrame { | |
| /// # Ok(()) | ||
| /// # } | ||
| /// ``` | ||
| pub fn drop_columns(self, columns: &[&str]) -> Result<DataFrame> { | ||
| pub fn drop_columns<T>(self, columns: &[T]) -> Result<DataFrame> | ||
| where | ||
| T: Into<Column> + Clone, | ||
| { | ||
| let fields_to_drop = columns | ||
| .iter() | ||
| .flat_map(|name| { | ||
| self.plan | ||
| .schema() | ||
| .qualified_fields_with_unqualified_name(name) | ||
| .flat_map(|col| { | ||
| let column: Column = col.clone().into(); | ||
| match column.relation.as_ref() { | ||
| Some(_) => { | ||
| // qualified_field_from_column returns Result<(Option<&TableReference>, &FieldRef)> | ||
| vec![self.plan.schema().qualified_field_from_column(&column)] | ||
| } | ||
| None => { | ||
| // qualified_fields_with_unqualified_name returns Vec<(Option<&TableReference>, &FieldRef)> | ||
| self.plan | ||
| .schema() | ||
| .qualified_fields_with_unqualified_name(&column.name) | ||
| .into_iter() | ||
| .map(Ok) | ||
| .collect::<Vec<_>>() | ||
| } | ||
| } | ||
| }) | ||
| .collect::<Vec<_>>(); | ||
| .collect::<Result<Vec<_>, _>>()?; | ||
| let expr: Vec<Expr> = self | ||
| .plan | ||
| .schema() | ||
|
|
@@ -2463,6 +2479,48 @@ impl DataFrame { | |
| .collect() | ||
| } | ||
|
|
||
| /// Find qualified columns for this dataframe from names | ||
| /// | ||
| /// # Arguments | ||
| /// * `names` - Unqualified names to find. | ||
| /// | ||
| /// # Example | ||
| /// ``` | ||
| /// # use datafusion::prelude::*; | ||
| /// # use datafusion::error::Result; | ||
| /// # use datafusion_common::ScalarValue; | ||
| /// # #[tokio::main] | ||
| /// # async fn main() -> Result<()> { | ||
| /// let ctx = SessionContext::new(); | ||
| /// ctx.register_csv("first_table", "tests/data/example.csv", CsvReadOptions::new()) | ||
| /// .await?; | ||
| /// let df = ctx.table("first_table").await?; | ||
| /// ctx.register_csv("second_table", "tests/data/example.csv", CsvReadOptions::new()) | ||
| /// .await?; | ||
| /// let df2 = ctx.table("second_table").await?; | ||
| /// let join_expr = df.find_qualified_columns(&["a"])?.iter() | ||
| /// .zip(df2.find_qualified_columns(&["a"])?.iter()) | ||
| /// .map(|(col1, col2)| col(*col1).eq(col(*col2))) | ||
| /// .collect::<Vec<Expr>>(); | ||
| /// let df3 = df.join_on(df2, JoinType::Inner, join_expr)?; | ||
| /// # Ok(()) | ||
| /// # } | ||
| /// ``` | ||
| pub fn find_qualified_columns( | ||
| &self, | ||
| names: &[&str], | ||
| ) -> Result<Vec<(Option<&TableReference>, &FieldRef)>> { | ||
| let schema = self.logical_plan().schema(); | ||
| names | ||
| .iter() | ||
| .map(|name| { | ||
| schema | ||
| .qualified_field_from_column(&Column::from_name(*name)) | ||
| .map_err(|_| plan_datafusion_err!("Column '{}' not found", name)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Much better than silently ignoring the error 👍 |
||
| }) | ||
| .collect() | ||
| } | ||
|
|
||
| /// Helper for creating DataFrame. | ||
| /// # Example | ||
| /// ``` | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -534,7 +534,8 @@ async fn drop_columns_with_nonexistent_columns() -> Result<()> { | |
| async fn drop_columns_with_empty_array() -> Result<()> { | ||
| // build plan using Table API | ||
| let t = test_table().await?; | ||
| let t2 = t.drop_columns(&[])?; | ||
| let drop_columns = vec![] as Vec<&str>; | ||
| let t2 = t.drop_columns(&drop_columns)?; | ||
| let plan = t2.logical_plan().clone(); | ||
|
|
||
| // build query using SQL | ||
|
|
@@ -549,6 +550,107 @@ async fn drop_columns_with_empty_array() -> Result<()> { | |
| Ok(()) | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn drop_columns_qualified() -> Result<()> { | ||
| // build plan using Table API | ||
| let mut t = test_table().await?; | ||
| t = t.select_columns(&["c1", "c2", "c11"])?; | ||
| let mut t2 = test_table_with_name("another_table").await?; | ||
| t2 = t2.select_columns(&["c1", "c2", "c11"])?; | ||
| let mut t3 = t.join_on( | ||
| t2, | ||
| JoinType::Inner, | ||
| [col("aggregate_test_100.c1").eq(col("another_table.c1"))], | ||
| )?; | ||
| t3 = t3.drop_columns(&["another_table.c2", "another_table.c11"])?; | ||
ntjohnson1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| let plan = t3.logical_plan().clone(); | ||
|
|
||
| let sql = "SELECT aggregate_test_100.c1, aggregate_test_100.c2, aggregate_test_100.c11, another_table.c1 FROM (SELECT c1, c2, c11 FROM aggregate_test_100) INNER JOIN (SELECT c1, c2, c11 FROM another_table) ON aggregate_test_100.c1 = another_table.c1"; | ||
| let ctx = SessionContext::new(); | ||
| register_aggregate_csv(&ctx, "aggregate_test_100").await?; | ||
| register_aggregate_csv(&ctx, "another_table").await?; | ||
| let sql_plan = ctx.sql(sql).await?.into_unoptimized_plan(); | ||
|
|
||
| // the two plans should be identical | ||
| assert_same_plan(&plan, &sql_plan); | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn drop_columns_qualified_find_qualified() -> Result<()> { | ||
| // build plan using Table API | ||
| let mut t = test_table().await?; | ||
| t = t.select_columns(&["c1", "c2", "c11"])?; | ||
| let mut t2 = test_table_with_name("another_table").await?; | ||
| t2 = t2.select_columns(&["c1", "c2", "c11"])?; | ||
| let mut t3 = t.join_on( | ||
| t2.clone(), | ||
| JoinType::Inner, | ||
| [col("aggregate_test_100.c1").eq(col("another_table.c1"))], | ||
| )?; | ||
| t3 = t3.drop_columns(&t2.find_qualified_columns(&["c2", "c11"])?)?; | ||
|
|
||
| let plan = t3.logical_plan().clone(); | ||
|
|
||
| let sql = "SELECT aggregate_test_100.c1, aggregate_test_100.c2, aggregate_test_100.c11, another_table.c1 FROM (SELECT c1, c2, c11 FROM aggregate_test_100) INNER JOIN (SELECT c1, c2, c11 FROM another_table) ON aggregate_test_100.c1 = another_table.c1"; | ||
| let ctx = SessionContext::new(); | ||
| register_aggregate_csv(&ctx, "aggregate_test_100").await?; | ||
| register_aggregate_csv(&ctx, "another_table").await?; | ||
| let sql_plan = ctx.sql(sql).await?.into_unoptimized_plan(); | ||
|
|
||
| // the two plans should be identical | ||
| assert_same_plan(&plan, &sql_plan); | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn test_find_qualified_names() -> Result<()> { | ||
| let t = test_table().await?; | ||
| let column_names = ["c1", "c2", "c3"]; | ||
| let columns = t.find_qualified_columns(&column_names)?; | ||
|
|
||
| // Expected results for each column | ||
| let binding = TableReference::bare("aggregate_test_100"); | ||
| let expected = [ | ||
| (Some(&binding), "c1"), | ||
| (Some(&binding), "c2"), | ||
| (Some(&binding), "c3"), | ||
| ]; | ||
|
|
||
| // Verify we got the expected number of results | ||
| assert_eq!( | ||
| columns.len(), | ||
| expected.len(), | ||
| "Expected {} columns, got {}", | ||
| expected.len(), | ||
| columns.len() | ||
| ); | ||
|
|
||
| // Iterate over the results and check each one individually | ||
| for (i, (actual, expected)) in columns.iter().zip(expected.iter()).enumerate() { | ||
| let (actual_table_ref, actual_field_ref) = actual; | ||
| let (expected_table_ref, expected_field_name) = expected; | ||
|
|
||
| // Check table reference | ||
| assert_eq!( | ||
| actual_table_ref, expected_table_ref, | ||
| "Column {i}: expected table reference {expected_table_ref:?}, got {actual_table_ref:?}" | ||
| ); | ||
|
|
||
| // Check field name | ||
| assert_eq!( | ||
| actual_field_ref.name(), | ||
| *expected_field_name, | ||
| "Column {i}: expected field name '{expected_field_name}', got '{actual_field_ref}'" | ||
| ); | ||
| } | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn drop_with_quotes() -> Result<()> { | ||
| // define data with a column name that has a "." in it: | ||
|
|
@@ -594,7 +696,7 @@ async fn drop_with_periods() -> Result<()> { | |
| let ctx = SessionContext::new(); | ||
| ctx.register_batch("t", batch)?; | ||
|
|
||
| let df = ctx.table("t").await?.drop_columns(&["f.c1"])?; | ||
| let df = ctx.table("t").await?.drop_columns(&["\"f.c1\""])?; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. While this is technically a breaking API change, I think it is reasonable to treat it as a bug fix. |
||
|
|
||
| let df_results = df.collect().await?; | ||
|
|
||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
I was wondering why the `Clone` bound is needed. In case anyone else is curious — it turns out it is needed because what is passed in is `&[T]` (vs something like `[T]`).