From a26edea15f97abcf77b412ce4945d034a0d57fc8 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 28 Jan 2025 11:44:31 -0800 Subject: [PATCH 1/2] order by min(row position) in object describe --- .../compiler/snowflake_query_compiler.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 55bd416857..71532e5c6d 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -13558,8 +13558,14 @@ def describe( axis=0, ) # Compute top (the mode of each column) + freq (the number of times this mode appears). - top_freq_identifiers = padded_qc._modin_frame.ordered_dataframe.generate_snowflake_quoted_identifiers( - pandas_labels=["top", "freq"] + # Also create a new identifier to track min(__row_position__) of each group to ensure stability + new_identifiers = padded_qc._modin_frame.ordered_dataframe.generate_snowflake_quoted_identifiers( + pandas_labels=["top", "freq", "min_row_position"] + ) + top_freq_identifiers = new_identifiers[0:2] + min_row_position_ident = new_identifiers[2] + row_position_ident = ( + padded_qc._modin_frame.ordered_dataframe.row_position_snowflake_quoted_identifier ) # To accommodate multi-level columns in the source frame, we generate a new index column # in the top/freq frame for each level. We transpose this frame later, so the columns @@ -13608,7 +13614,7 @@ def count_freqs( # IFF(a IS NULL, NULL, COUNT(a)) AS freq # FROM df # GROUP BY a - # ORDER BY freq DESC NULLS LAST + # ORDER BY freq DESC, MIN(__row_position__) ASC NULLS LAST # LIMIT 1 # # The resulting 1-row frame for column "a": [1, 1, 2] will have the form @@ -13656,9 +13662,13 @@ def count_freqs( pandas_lit(None), count(col(col_ident)), ).as_(freq_ident), + min_(row_position_ident).as_(min_row_position_ident), ], ) - .sort(OrderingColumn(freq_ident, ascending=False, na_last=True)) + .sort( + OrderingColumn(freq_ident, ascending=False, na_last=True), + OrderingColumn(min_row_position_ident, ascending=True), + ) .limit(1) .select( *( From c13f7b30c4ba8e750c6b0e8c3542b47393887d4b Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 28 Jan 2025 14:32:06 -0800 Subject: [PATCH 2/2] rename variables for review --- .../compiler/snowflake_query_compiler.py | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 71532e5c6d..4ade32c816 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -13559,12 +13559,15 @@ def describe( ) # Compute top (the mode of each column) + freq (the number of times this mode appears). # Also create a new identifier to track min(__row_position__) of each group to ensure stability - new_identifiers = padded_qc._modin_frame.ordered_dataframe.generate_snowflake_quoted_identifiers( + ( + top_identifier, + freq_identifier, + min_row_position_identifier, + ) = padded_qc._modin_frame.ordered_dataframe.generate_snowflake_quoted_identifiers( pandas_labels=["top", "freq", "min_row_position"] ) - top_freq_identifiers = new_identifiers[0:2] - min_row_position_ident = new_identifiers[2] - row_position_ident = ( + top_freq_identifiers = (top_identifier, freq_identifier) + row_position_identifier = ( padded_qc._modin_frame.ordered_dataframe.row_position_snowflake_quoted_identifier ) # To accommodate multi-level columns in the source frame, we generate a new index column @@ -13579,7 +13582,7 @@ def describe( ) def count_freqs( - col_labels: Union[str, tuple[str, ...]], col_ident: str + col_labels: Union[str, tuple[str, ...]], col_identifier: str ) -> OrderedDataFrame: """ Helper function to compute the mode ("top") and frequency with which the mode @@ -13601,7 +13604,6 @@ def count_freqs( QC.value_counts(dropna=False) would correctly report NULL as the `top` item, but reports `freq` as the number of times NULL appears, which we do not want. """ - top_ident, freq_ident = top_freq_identifiers col_labels_tuple = ( col_labels if is_list_like(col_labels) else (col_labels,) ) @@ -13655,19 +13657,21 @@ def count_freqs( # +------+---+ return ( padded_qc._modin_frame.ordered_dataframe.group_by( - [col_ident], + [col_identifier], [ iff( - col(col_ident).is_null(), + col(col_identifier).is_null(), pandas_lit(None), - count(col(col_ident)), - ).as_(freq_ident), - min_(row_position_ident).as_(min_row_position_ident), + count(col(col_identifier)), + ).as_(freq_identifier), + min_(row_position_identifier).as_( + min_row_position_identifier + ), ], ) .sort( - OrderingColumn(freq_ident, ascending=False, na_last=True), - OrderingColumn(min_row_position_ident, ascending=True), + OrderingColumn(freq_identifier, ascending=False, na_last=True), + OrderingColumn(min_row_position_identifier, ascending=True), ) .limit(1) .select( @@ -13675,14 +13679,16 @@ def count_freqs( # If the original frame had multi-level columns, we must create # a multi-level index to transpose this frame later. [ - pandas_lit(col_label).as_(index_ident) - for col_label, index_ident in zip( + pandas_lit(col_label).as_(index_identifier) + for col_label, index_identifier in zip( col_labels_tuple, new_index_identifiers ) ] + [ - col(col_ident).cast(VariantType()).as_(top_ident), - freq_ident, + col(col_identifier) + .cast(VariantType()) + .as_(top_identifier), + freq_identifier, ] ) )