
Commit 94215a4

SNOW-2217486: Fix some apply-related Jenkins failures (#3572)
1 parent 25ebb69 commit 94215a4

File tree

3 files changed: +76 -79 lines changed

src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py

Lines changed: 68 additions & 71 deletions
@@ -410,17 +410,19 @@ def end_partition(self, df): # type: ignore[no-untyped-def] # pragma: no cover
         APPLY_LABEL_COLUMN_QUOTED_IDENTIFIER,
         APPLY_VALUE_COLUMN_QUOTED_IDENTIFIER,
     ]
-    cache_key = UDTFCacheKey(
-        pickle_function(ApplyFunc.end_partition),
-        tuple(col_types),
-        tuple(col_identifiers),
-        tuple([LongType()] + input_types),
-        tuple(pkg.__name__ if isinstance(pkg, ModuleType) else pkg for pkg in packages),
-    )
-    cache = session_apply_axis_1_udtf_cache[session]
-    if cache_key not in cache:
-        try:
-            new_udtf = sp_func.udtf(
+    try:
+        cache_key = UDTFCacheKey(
+            pickle_function(ApplyFunc.end_partition),
+            tuple(col_types),
+            tuple(col_identifiers),
+            tuple([LongType()] + input_types),
+            tuple(
+                pkg.__name__ if isinstance(pkg, ModuleType) else pkg for pkg in packages
+            ),
+        )
+        cache = session_apply_axis_1_udtf_cache[session]
+        if cache_key not in cache:
+            cache[cache_key] = sp_func.udtf(
                 ApplyFunc,
                 output_schema=PandasDataFrameType(
                     col_types,
@@ -432,13 +434,12 @@ def end_partition(self, df): # type: ignore[no-untyped-def] # pragma: no cover
                 session=session,
                 statement_params=get_default_snowpark_pandas_statement_params(),
             )
-            cache[cache_key] = new_udtf
-        except NotImplementedError:  # pragma: no cover
-            # When a Snowpark object is passed to a UDF, a NotImplementedError with message
-            # 'Snowpark pandas does not yet support the method DataFrame.__reduce__' is raised. Instead,
-            # catch this exception and return a more user-friendly error message.
-            raise ValueError(APPLY_WITH_SNOWPARK_OBJECT_ERROR_MSG)
-    return cache[cache_key]
+        return cache[cache_key]
+    except NotImplementedError:  # pragma: no cover
+        # When a Snowpark object is passed to a UDF, a NotImplementedError with message
+        # 'Snowpark pandas does not yet support the method DataFrame.__reduce__' is raised. Instead,
+        # catch this exception and return a more user-friendly error message.
+        raise ValueError(APPLY_WITH_SNOWPARK_OBJECT_ERROR_MSG)


 def convert_groupby_apply_dataframe_result_to_standard_schema(
@@ -797,17 +798,17 @@ def end_partition(self, df: native_pd.DataFrame): # type: ignore[no-untyped-def]
             func_result.insert(0, "__min_row_position__", min_row_position)
             return func_result

-    cache_key = UDTFCacheKey(
-        pickle_function(ApplyFunc.end_partition),
-        tuple(output_schema.column_types),
-        tuple(output_schema.column_ids),
-        tuple(input_column_types),
-        tuple(session.get_packages().values()),
-    )
-    cache = session_groupby_apply_no_pivot_udtf_cache[session]
-    if cache_key not in cache:
-        try:
-            new_udtf = sp_func.udtf(
+    try:
+        cache_key = UDTFCacheKey(
+            pickle_function(ApplyFunc.end_partition),
+            tuple(output_schema.column_types),
+            tuple(output_schema.column_ids),
+            tuple(input_column_types),
+            tuple(session.get_packages().values()),
+        )
+        cache = session_groupby_apply_no_pivot_udtf_cache[session]
+        if cache_key not in cache:
+            cache[cache_key] = sp_func.udtf(
                 ApplyFunc,
                 output_schema=PandasDataFrameType(
                     output_schema.column_types, output_schema.column_ids
@@ -819,13 +820,12 @@ def end_partition(self, df: native_pd.DataFrame): # type: ignore[no-untyped-def]
                 session=session,
                 statement_params=get_default_snowpark_pandas_statement_params(),
             )
-            cache[cache_key] = new_udtf
-        except NotImplementedError:  # pragma: no cover
-            # When a Snowpark object is passed to a UDF, a NotImplementedError with message
-            # 'Snowpark pandas does not yet support the method DataFrame.__reduce__' is raised. Instead,
-            # catch this exception and return a more user-friendly error message.
-            raise ValueError(APPLY_WITH_SNOWPARK_OBJECT_ERROR_MSG)
-    return cache[cache_key]
+        return cache[cache_key]
+    except NotImplementedError:  # pragma: no cover
+        # When a Snowpark object is passed to a UDF, a NotImplementedError with message
+        # 'Snowpark pandas does not yet support the method DataFrame.__reduce__' is raised. Instead,
+        # catch this exception and return a more user-friendly error message.
+        raise ValueError(APPLY_WITH_SNOWPARK_OBJECT_ERROR_MSG)


 def infer_output_schema_for_apply(
@@ -1183,18 +1183,17 @@ def end_partition(self, df: native_pd.DataFrame): # type: ignore[no-untyped-def]
         IntegerType(),
         IntegerType(),
     ]
-    cache_key = UDTFCacheKey(
-        pickle_function(ApplyFunc.end_partition),
-        tuple(col_types),
-        tuple(col_names),
-        tuple(input_types),
-        tuple(session.get_packages().values()),
-    )
-    cache = session_groupby_apply_udtf_cache[session]
-
-    if cache_key not in cache:
-        try:
-            new_udtf = sp_func.udtf(
+    try:
+        cache_key = UDTFCacheKey(
+            pickle_function(ApplyFunc.end_partition),
+            tuple(col_types),
+            tuple(col_names),
+            tuple(input_types),
+            tuple(session.get_packages().values()),
+        )
+        cache = session_groupby_apply_udtf_cache[session]
+        if cache_key not in cache:
+            cache[cache_key] = sp_func.udtf(
                 ApplyFunc,
                 output_schema=PandasDataFrameType(
                     col_types,
@@ -1207,13 +1206,12 @@ def end_partition(self, df: native_pd.DataFrame): # type: ignore[no-untyped-def]
                 session=session,
                 statement_params=get_default_snowpark_pandas_statement_params(),
             )
-            cache[cache_key] = new_udtf
-        except NotImplementedError:  # pragma: no cover
-            # When a Snowpark object is passed to a UDF, a NotImplementedError with message
-            # 'Snowpark pandas does not yet support the method DataFrame.__reduce__' is raised. Instead,
-            # catch this exception and return a more user-friendly error message.
-            raise ValueError(APPLY_WITH_SNOWPARK_OBJECT_ERROR_MSG)
-    return None, cache[cache_key]
+        return None, cache[cache_key]
+    except NotImplementedError:  # pragma: no cover
+        # When a Snowpark object is passed to a UDF, a NotImplementedError with message
+        # 'Snowpark pandas does not yet support the method DataFrame.__reduce__' is raised. Instead,
+        # catch this exception and return a more user-friendly error message.
+        raise ValueError(APPLY_WITH_SNOWPARK_OBJECT_ERROR_MSG)


 def create_udf_for_series_apply(
@@ -1279,16 +1277,15 @@ def apply_func(x): # type: ignore[no-untyped-def] # pragma: no cover
        return x.apply(func, args=args, **kwargs)

     strict = na_action == "ignore"
-    cache_key = UDFCacheKey(
-        pickle_function(apply_func),
-        return_type,
-        input_type,
-        strict,
-    )
-    cache = session_udf_cache[session]
-
-    if cache_key not in cache:
-        try:
+    try:
+        cache_key = UDFCacheKey(
+            pickle_function(apply_func),
+            return_type,
+            input_type,
+            strict,
+        )
+        cache = session_udf_cache[session]
+        if cache_key not in cache:
             cache[cache_key] = sp_func.udf(
                 apply_func,
                 return_type=PandasSeriesType(return_type),
@@ -1298,12 +1295,12 @@ def apply_func(x): # type: ignore[no-untyped-def] # pragma: no cover
                 packages=packages,
                 statement_params=get_default_snowpark_pandas_statement_params(),
             )
-        except NotImplementedError:  # pragma: no cover
-            # When a Snowpark object is passed to a UDF, a NotImplementedError with message
-            # 'Snowpark pandas does not yet support the method DataFrame.__reduce__' is raised. Instead,
-            # catch this exception and return a more user-friendly error message.
-            raise ValueError(APPLY_WITH_SNOWPARK_OBJECT_ERROR_MSG)
-    return cache[cache_key]
+        return cache[cache_key]
+    except NotImplementedError:  # pragma: no cover
+        # When a Snowpark object is passed to a UDF, a NotImplementedError with message
+        # 'Snowpark pandas does not yet support the method DataFrame.__reduce__' is raised. Instead,
+        # catch this exception and return a more user-friendly error message.
+        raise ValueError(APPLY_WITH_SNOWPARK_OBJECT_ERROR_MSG)


 def handle_missing_value_in_variant(value: Any) -> Any:
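Each call site above gets the same treatment: the `try` now also covers cache-key construction, because `pickle_function` itself raises the `NotImplementedError` when the applied function captures a Snowpark object, and the success path returns from inside the `try`. A minimal sketch of the resulting pattern, with hypothetical `make_key` and `register` callables standing in for the real `UDTFCacheKey` construction and `sp_func.udtf(...)` registration:

from typing import Any, Callable, Dict

# Stand-in for the real constant defined in apply_utils.py.
APPLY_WITH_SNOWPARK_OBJECT_ERROR_MSG = "Snowpark objects cannot be used in apply"


def get_or_register(
    cache: Dict[Any, Any],
    make_key: Callable[[], Any],  # hypothetical: builds the cache key (pickles the function)
    register: Callable[[], Any],  # hypothetical: registers the UDF/UDTF with the session
) -> Any:
    try:
        # Key construction pickles the user function, so a captured Snowpark
        # object fails here, before any registration queries are issued.
        key = make_key()
        if key not in cache:
            cache[key] = register()
        return cache[key]
    except NotImplementedError:
        # Surface the pickling failure as a user-friendly error instead.
        raise ValueError(APPLY_WITH_SNOWPARK_OBJECT_ERROR_MSG)

Because the failure now happens before any registration queries run, the tests below expect correspondingly lower query counts.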

tests/integ/modin/frame/test_apply.py

Lines changed: 6 additions & 6 deletions
@@ -1007,34 +1007,34 @@ def test_udfs_and_udtfs_with_snowpark_object_error_msg():
     )
     snow_df = pd.DataFrame([7, 8, 9])
     with SqlCounter(
-        query_count=16,
+        query_count=14,
         high_count_expected=True,
         high_count_reason="Series.apply has high query count",
     ):
         with pytest.raises(ValueError, match=expected_error_msg):  # Series.apply
             snow_df[0].apply(lambda row: snow_df.iloc[0, 0])
-    with SqlCounter(query_count=2):
+    with SqlCounter(query_count=0):
         with pytest.raises(
             ValueError, match=expected_error_msg
         ):  # DataFrame.apply axis=0
             snow_df.apply(lambda row: snow_df.iloc[0, 0])
-    with SqlCounter(query_count=2):
+    with SqlCounter(query_count=0):
         with pytest.raises(
             ValueError, match=expected_error_msg
         ):  # DataFrame.apply axis=1
             snow_df.apply(lambda row: snow_df.iloc[0, 0], axis=1)
-    with SqlCounter(query_count=2):
+    with SqlCounter(query_count=0):
         with pytest.raises(ValueError, match=expected_error_msg):  # DataFrame.transform
             snow_df.transform(lambda row: snow_df.iloc[0, 0])
     with SqlCounter(
-        query_count=16,
+        query_count=14,
         high_count_expected=True,
         high_count_reason="DataFrame.map has high query count",
     ):
         with pytest.raises(ValueError, match=expected_error_msg):  # DataFrame.map
             snow_df.map(lambda row: snow_df.iloc[0, 0])
     with SqlCounter(
-        query_count=16,
+        query_count=14,
         high_count_expected=True,
         high_count_reason="Series.map has high query count",
    ):
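The reduced counts reflect that the ValueError now surfaces while pickling the function, before any UDF or UDTF registration queries are issued. A minimal sketch of the user-facing behavior these tests exercise, assuming a configured Snowpark pandas session; the lambda closes over the Snowpark DataFrame itself, which cannot be pickled:

import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401  # enables the Snowpark pandas backend

snow_df = pd.DataFrame([7, 8, 9])
try:
    # The applied function captures snow_df, a Snowpark pandas object.
    snow_df.apply(lambda row: snow_df.iloc[0, 0], axis=1)
except ValueError as err:
    print(err)  # the user-friendly message matched by expected_error_msg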

tests/integ/modin/frame/test_cache_result.py

Lines changed: 2 additions & 2 deletions
@@ -213,7 +213,7 @@ def test_cache_result_post_applymap(self, inplace, simple_test_data):
             native_pd.DataFrame(simple_test_data).applymap(lambda x: x + x), native_pd
         )
         with SqlCounter(
-            query_count=11,
+            query_count=8,
             union_count=9,
             udf_count=1,
             high_count_expected=True,
@@ -227,7 +227,7 @@ def test_cache_result_post_applymap(self, inplace, simple_test_data):
         )

         with SqlCounter(
-            query_count=10,
+            query_count=7,
             high_count_expected=True,
             high_count_reason="applymap requires additional queries to setup the UDF.",
         ):
