-
Notifications
You must be signed in to change notification settings - Fork 146
SNOW-807303 Use SELECT * EXCLUDE for DataFrame.drop() #3316
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
06dea35
524adc6
738f48a
6e46b87
63aa5e9
5df2f01
339d842
a0d3836
e65dcf3
ae4ff00
eae9e5f
9cd8a0b
4d89fcf
b456fca
f32c825
803739c
26504d0
d1465f6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1736,7 +1736,6 @@ def select_expr( | |
|
|
||
| selectExpr = select_expr | ||
|
|
||
| @df_api_usage | ||
| @publicapi | ||
| def drop( | ||
| self, *cols: Union[ColumnOrName, Iterable[ColumnOrName]], _emit_ast: bool = True | ||
|
|
@@ -1782,48 +1781,83 @@ def drop( | |
| build_expr_from_snowpark_column_or_col_name(ast.cols.args.add(), c) | ||
| ast.cols.variadic = is_variadic | ||
|
|
||
| names = [] | ||
| for c in exprs: | ||
| if isinstance(c, str): | ||
| names.append(c) | ||
| elif isinstance(c, Column) and isinstance(c._expression, Attribute): | ||
| from snowflake.snowpark.mock._connection import MockServerConnection | ||
| with ResourceUsageCollector() as resource_usage_collector: | ||
| names = [] | ||
| for c in exprs: | ||
| if isinstance(c, str): | ||
| names.append(c) | ||
| elif isinstance(c, Column) and isinstance(c._expression, Attribute): | ||
| from snowflake.snowpark.mock._connection import MockServerConnection | ||
|
|
||
| if isinstance(self._session._conn, MockServerConnection): | ||
| self.schema # to execute the plan and populate expr_to_alias | ||
| if isinstance(self._session._conn, MockServerConnection): | ||
| self.schema # to execute the plan and populate expr_to_alias | ||
|
|
||
| names.append( | ||
| self._plan.expr_to_alias.get( | ||
| c._expression.expr_id, c._expression.name | ||
| names.append( | ||
| self._plan.expr_to_alias.get( | ||
| c._expression.expr_id, c._expression.name | ||
| ) | ||
| ) | ||
| ) | ||
| elif ( | ||
| isinstance(c, Column) | ||
| and isinstance(c._expression, UnresolvedAttribute) | ||
| and c._expression.df_alias | ||
| elif ( | ||
| isinstance(c, Column) | ||
| and isinstance(c._expression, UnresolvedAttribute) | ||
| and c._expression.df_alias | ||
| ): | ||
| names.append( | ||
| self._plan.df_aliased_col_name_to_real_col_name.get( | ||
| c._expression.name, c._expression.name | ||
| ) | ||
| ) | ||
| elif isinstance(c, Column) and isinstance( | ||
| c._expression, NamedExpression | ||
| ): | ||
| names.append(c._expression.name) | ||
| else: | ||
| raise SnowparkClientExceptionMessages.DF_CANNOT_DROP_COLUMN_NAME( | ||
| str(c) | ||
| ) | ||
|
|
||
| normalized_names = {quote_name(n) for n in names} | ||
| existing_names = [attr.name for attr in self._output] | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it possible to not check existing names if using exclude? It will trigger a describe query
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as discussed offline, if we don't do this describe query, it may lead to a BCR since |
||
| keep_col_names = [c for c in existing_names if c not in normalized_names] | ||
| if not keep_col_names: | ||
| raise SnowparkClientExceptionMessages.DF_CANNOT_DROP_ALL_COLUMNS() | ||
|
|
||
| if self._select_statement and self._session.conf.get( | ||
| "use_simplified_query_generation" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there other functionality that will use this flag?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah. all new query generation improvement for dataframe APIs are protected with this parameter. .distinct(), .random_split(), .stat.sample_by(), .drop() and planning to add .rename() under this. |
||
| ): | ||
| names.append( | ||
| self._plan.df_aliased_col_name_to_real_col_name.get( | ||
| c._expression.name, c._expression.name | ||
| # Only drop the columns that exist in the DataFrame. | ||
| drop_normalized_names = [ | ||
| name for name in normalized_names if name in existing_names | ||
| ] | ||
| if not drop_normalized_names: | ||
| df = self._with_plan(self._select_statement) | ||
| else: | ||
| df = self._with_plan( | ||
| self._select_statement.exclude( | ||
| drop_normalized_names, keep_col_names | ||
| ) | ||
| ) | ||
| ) | ||
| elif isinstance(c, Column) and isinstance(c._expression, NamedExpression): | ||
| names.append(c._expression.name) | ||
| else: | ||
| raise SnowparkClientExceptionMessages.DF_CANNOT_DROP_COLUMN_NAME(str(c)) | ||
| df = self.select(list(keep_col_names), _emit_ast=False) | ||
|
|
||
| normalized_names = {quote_name(n) for n in names} | ||
| existing_names = [attr.name for attr in self._output] | ||
| keep_col_names = [c for c in existing_names if c not in normalized_names] | ||
| if not keep_col_names: | ||
| raise SnowparkClientExceptionMessages.DF_CANNOT_DROP_ALL_COLUMNS() | ||
| if self._session.conf.get("use_simplified_query_generation"): | ||
| add_api_call( | ||
| df, | ||
| "DataFrame.drop[exclude]", | ||
| resource_usage_collector.get_resource_usage(), | ||
| ) | ||
| else: | ||
| df = self.select(list(keep_col_names), _emit_ast=False) | ||
| adjust_api_subcalls( | ||
| df, | ||
| "DataFrame.drop[select]", | ||
| len_subcalls=1, | ||
| resource_usage=resource_usage_collector.get_resource_usage(), | ||
| ) | ||
|
|
||
| if _emit_ast: | ||
| df._ast_id = stmt.uid | ||
| if _emit_ast: | ||
| df._ast_id = stmt.uid | ||
|
|
||
| return df | ||
| return df | ||
|
|
||
| @df_api_usage | ||
| @publicapi | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.