Skip to content

Commit f83ae56

Browse files
Merge branch 'main' into helmeleegy-SNOW-1819523
2 parents 2b668f2 + 98330fa commit f83ae56

File tree

9 files changed

+262
-31
lines changed

9 files changed

+262
-31
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
- Added documentation for `DataFrame.map`.
6262
- Improved performance of `DataFrame.apply` by mapping numpy functions to snowpark functions if possible.
6363
- Added documentation on the extent of Snowpark pandas interoperability with scikit-learn
64+
- Infer return type of functions in `Series.map`, `Series.apply` and `DataFrame.map` if type-hint is not provided.
6465

6566
## 1.26.0 (2024-12-05)
6667

src/snowflake/snowpark/_internal/utils.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,13 @@
4545
)
4646

4747
import snowflake.snowpark
48+
from snowflake.connector.constants import FIELD_ID_TO_NAME
4849
from snowflake.connector.cursor import ResultMetadata, SnowflakeCursor
4950
from snowflake.connector.description import OPERATING_SYSTEM, PLATFORM
5051
from snowflake.connector.options import MissingOptionalDependency, ModuleLikeObject
5152
from snowflake.connector.version import VERSION as connector_version
5253
from snowflake.snowpark._internal.error_message import SnowparkClientExceptionMessages
54+
from snowflake.snowpark.context import _should_use_structured_type_semantics
5355
from snowflake.snowpark.row import Row
5456
from snowflake.snowpark.version import VERSION as snowpark_version
5557

@@ -698,19 +700,50 @@ def column_to_bool(col_):
698700
return bool(col_)
699701

700702

703+
def _parse_result_meta(
704+
result_meta: Union[List[ResultMetadata], List["ResultMetadataV2"]]
705+
) -> Tuple[Optional[List[str]], Optional[List[Callable]]]:
706+
"""
707+
Takes a list of result metadata objects and returns a list containing the names of all fields as
708+
well as a list of functions that wrap specific columns.
709+
710+
A column type may need to be wrapped if the connector is unable to provide the column's data in
711+
an expected format. For example, StructType columns are returned as dict objects, but are better
712+
represented as Row objects.
713+
"""
714+
if not result_meta:
715+
return None, None
716+
col_names = []
717+
wrappers = []
718+
for col in result_meta:
719+
col_names.append(col.name)
720+
if (
721+
_should_use_structured_type_semantics()
722+
and FIELD_ID_TO_NAME[col.type_code] == "OBJECT"
723+
and col.fields is not None
724+
):
725+
wrappers.append(lambda x: Row(**x))
726+
else:
727+
wrappers.append(None)
728+
return col_names, wrappers
729+
730+
701731
def result_set_to_rows(
702732
result_set: List[Any],
703733
result_meta: Optional[Union[List[ResultMetadata], List["ResultMetadataV2"]]] = None,
704734
case_sensitive: bool = True,
705735
) -> List[Row]:
706-
col_names = [col.name for col in result_meta] if result_meta else None
736+
col_names, wrappers = _parse_result_meta(result_meta or [])
707737
rows = []
708738
row_struct = Row
709739
if col_names:
710740
row_struct = (
711741
Row._builder.build(*col_names).set_case_sensitive(case_sensitive).to_row()
712742
)
713743
for data in result_set:
744+
if wrappers:
745+
data = [wrap(d) if wrap else d for wrap, d in zip(wrappers, data)]
746+
714747
if data is None:
715748
raise ValueError("Result returned from Python connector is None")
716749
row = row_struct(*data)
@@ -723,7 +756,7 @@ def result_set_to_iter(
723756
result_meta: Optional[List[ResultMetadata]] = None,
724757
case_sensitive: bool = True,
725758
) -> Iterator[Row]:
726-
col_names = [col.name for col in result_meta] if result_meta else None
759+
col_names, wrappers = _parse_result_meta(result_meta)
727760
row_struct = Row
728761
if col_names:
729762
row_struct = (
@@ -732,6 +765,8 @@ def result_set_to_iter(
732765
for data in result_set:
733766
if data is None:
734767
raise ValueError("Result returned from Python connector is None")
768+
if wrappers:
769+
data = [wrap(d) if wrap else d for wrap, d in zip(wrappers, data)]
735770
row = row_struct(*data)
736771
yield row
737772

src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py

Lines changed: 135 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from collections.abc import Hashable
99
from enum import Enum, auto
1010
from typing import Any, Callable, Literal, Optional, Union
11+
from datetime import datetime
1112

1213
import cloudpickle
1314
import numpy as np
@@ -21,6 +22,9 @@
2122
from snowflake.snowpark._internal.udf_utils import get_types_from_type_hints
2223
import functools
2324
from snowflake.snowpark.column import Column as SnowparkColumn
25+
from snowflake.snowpark.modin.plugin._internal.snowpark_pandas_types import (
26+
TimedeltaType,
27+
)
2428
from snowflake.snowpark.modin.plugin._internal.type_utils import (
2529
infer_object_type,
2630
pandas_lit,
@@ -45,13 +49,19 @@
4549
from snowflake.snowpark.session import Session
4650
from snowflake.snowpark.types import (
4751
ArrayType,
52+
BinaryType,
53+
BooleanType,
4854
DataType,
55+
_IntegralType,
56+
_FractionalType,
4957
IntegerType,
5058
LongType,
5159
MapType,
60+
NullType,
5261
PandasDataFrameType,
5362
PandasSeriesType,
5463
StringType,
64+
TimestampType,
5565
VariantType,
5666
)
5767
from snowflake.snowpark.udf import UserDefinedFunction
@@ -113,7 +123,7 @@ class GroupbyApplySortMethod(Enum):
113123

114124
def check_return_variant_and_get_return_type(func: Callable) -> tuple[bool, DataType]:
115125
"""Check whether the function returns a variant in Snowflake, and get its return type."""
116-
return_type = deduce_return_type_from_function(func)
126+
return_type = deduce_return_type_from_function(func, None)
117127
if return_type is None or isinstance(
118128
return_type, (VariantType, PandasSeriesType, PandasDataFrameType)
119129
):
@@ -756,6 +766,8 @@ def apply_func(x): # type: ignore[no-untyped-def] # pragma: no cover
756766
else:
757767

758768
def apply_func(x): # type: ignore[no-untyped-def] # pragma: no cover
769+
# TODO SNOW-1874779: Add verification here to ensure inferred type matches
770+
# actual type.
759771
return x.apply(func, args=args, **kwargs)
760772

761773
func_udf = sp_func.udf(
@@ -829,14 +841,128 @@ def convert_numpy_int_result_to_int(value: Any) -> Any:
829841
)
830842

831843

844+
DUMMY_BOOL_INPUT = native_pd.Series([False, True])
845+
DUMMY_INT_INPUT = native_pd.Series(
846+
[-37, -9, -2, -1, 0, 2, 3, 5, 7, 9, 13, 16, 20]
847+
+ np.power(10, np.arange(19)).tolist()
848+
+ np.multiply(-1, np.power(10, np.arange(19))).tolist()
849+
)
850+
DUMMY_FLOAT_INPUT = native_pd.Series(
851+
[-9.9, -2.2, -1.0, 0.0, 0.5, 0.33, None, 0.99, 2.0, 3.0, 5.0, 7.7, 9.898989]
852+
+ np.power(10.1, np.arange(19)).tolist()
853+
+ np.multiply(-1.0, np.power(10.1, np.arange(19))).tolist()
854+
)
855+
DUMMY_STRING_INPUT = native_pd.Series(
856+
["", "a", "A", "0", "1", "01", "123", "-1", "-12", "true", "True", "false", "False"]
857+
+ [None, "null", "Jane Smith", "[email protected]", "[email protected]"]
858+
+ ["650-592-4563", "Jane Smith, 123 Main St., Anytown, CA 12345"]
859+
+ ["2020-12-23", "2020-12-23 12:34:56", "08/08/2024", "07-08-2022", "12:34:56"]
860+
+ ["ABC", "bat-man", "super_man", "1@#$%^&*()_+", "<>?:{}|[]\\;'/.,", "<tag>"]
861+
)
862+
DUMMY_BINARY_INPUT = native_pd.Series(
863+
[bytes("snow", "utf-8"), bytes("flake", "utf-8"), bytes("12", "utf-8"), None]
864+
)
865+
DUMMY_TIMESTAMP_INPUT = native_pd.to_datetime(
866+
["2020-12-31 00:00:00", "2020-01-01 00:00:00", native_pd.Timestamp.min] # past
867+
+ ["2090-01-01 00:00:00", "2090-12-31 00:00:00", native_pd.Timestamp.max] # future
868+
+ [datetime.today(), None], # current
869+
format="mixed",
870+
)
871+
872+
873+
def infer_return_type_using_dummy_data(
874+
func: Callable, input_type: DataType, **kwargs: Any
875+
) -> Optional[DataType]:
876+
"""
877+
Infer the return type of given function by applying it to a dummy input.
878+
This method only supports the following input types: _IntegralType, _FractionalType,
879+
StringType, BooleanType, TimestampType, BinaryType.
880+
Args:
881+
func: The function to infer the return type from.
882+
input_type: The input type of the function.
883+
**kwargs : Additional keyword arguments to pass as keyword arguments to func.
884+
Returns:
885+
The inferred return type of the function. If the return type cannot be inferred,
886+
return None.
887+
"""
888+
if input_type is None:
889+
return None
890+
input_data = None
891+
if isinstance(input_type, _IntegralType):
892+
input_data = DUMMY_INT_INPUT
893+
elif isinstance(input_type, _FractionalType):
894+
input_data = DUMMY_FLOAT_INPUT
895+
elif isinstance(input_type, StringType):
896+
input_data = DUMMY_STRING_INPUT
897+
elif isinstance(input_type, BooleanType):
898+
input_data = DUMMY_BOOL_INPUT
899+
elif isinstance(input_type, TimestampType):
900+
input_data = DUMMY_TIMESTAMP_INPUT
901+
elif isinstance(input_type, BinaryType):
902+
input_data = DUMMY_BINARY_INPUT
903+
else:
904+
return None
905+
906+
def merge_types(t1: DataType, t2: DataType) -> DataType:
907+
"""
908+
Merge two types into one as per the following rules:
909+
- Null + T = T
910+
- T + Null = T
911+
- T1 + T2 = T1 where T1 == T2
912+
- T1 + T2 = Variant where T1 != T2
913+
Args:
914+
t1: first type to merge.
915+
t2: second type to merge.
916+
917+
Returns:
918+
Merged type of t1 and t2.
919+
"""
920+
# treat NullType as None
921+
t1 = None if t1 == NullType() else t1
922+
t2 = None if t2 == NullType() else t2
923+
924+
if t1 is None:
925+
return t2
926+
if t2 is None:
927+
return t1
928+
if t1 == t2:
929+
return t1
930+
if isinstance(t1, MapType) and isinstance(t2, MapType):
931+
return MapType(
932+
merge_types(t1.key_type, t2.key_type),
933+
merge_types(t1.value_type, t2.value_type),
934+
)
935+
if isinstance(t1, ArrayType) and isinstance(t2, ArrayType):
936+
return ArrayType(merge_types(t1.element_type, t2.element_type))
937+
return VariantType()
938+
939+
inferred_type = None
940+
for x in input_data:
941+
try:
942+
inferred_type = merge_types(
943+
inferred_type, infer_object_type(func(x, **kwargs))
944+
)
945+
except Exception:
946+
pass
947+
948+
if isinstance(inferred_type, TimedeltaType):
949+
# TODO: SNOW-1619940: pd.Timedelta is encoded as string.
950+
return StringType()
951+
return inferred_type
952+
953+
832954
def deduce_return_type_from_function(
833-
func: Union[AggFuncType, UserDefinedFunction]
955+
func: Union[AggFuncType, UserDefinedFunction],
956+
input_type: Optional[DataType],
957+
**kwargs: Any,
834958
) -> Optional[DataType]:
835959
"""
836960
Deduce return type if possible from a function, list, dict or type object. List will be mapped to ArrayType(),
837961
dict to MapType(), and if a type object (e.g., str) is given a mapping will be consulted.
838962
Args:
839963
func: callable function, object or Snowpark UserDefinedFunction that can be passed in pandas to reference a function.
964+
input_type: input data type this function is applied to.
965+
**kwargs : Additional keyword arguments to pass as keywords arguments to func.
840966
841967
Returns:
842968
Snowpark Datatype or None if no return type could be deduced.
@@ -860,13 +986,17 @@ def deduce_return_type_from_function(
860986
else:
861987
# handle special case 'object' type, in this case use Variant Type.
862988
# Catch potential TypeError exception here from python_type_to_snow_type.
863-
# If it is not the object type, return None to indicate that type hint could not be extracted successfully.
989+
# If it is not the object type, return None to indicate that type hint could not
990+
# be extracted successfully.
864991
try:
865-
return get_types_from_type_hints(func, TempObjectType.FUNCTION)[0]
992+
return_type = get_types_from_type_hints(func, TempObjectType.FUNCTION)[0]
993+
if return_type is not None:
994+
return return_type
866995
except TypeError as te:
867996
if str(te) == "invalid type <class 'object'>":
868997
return VariantType()
869-
return None
998+
# infer return type using dummy data.
999+
return infer_return_type_using_dummy_data(func, input_type, **kwargs)
8701000

8711001

8721002
def sort_apply_udtf_result_columns_by_pandas_positions(

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8672,8 +8672,8 @@ def wrapped_func(*args, **kwargs): # type: ignore[no-untyped-def] # pragma: no
86728672
)
86738673

86748674
# Extract return type from annotations (or lookup for known pandas functions) for func object,
8675-
# if not return type could be extracted the variable will hold None.
8676-
return_type = deduce_return_type_from_function(func)
8675+
# if no return type could be extracted the variable will hold None.
8676+
return_type = deduce_return_type_from_function(func, None)
86778677

86788678
# Check whether return_type has been extracted. If return type is not
86798679
# a Series, tuple or list object, compute df.apply using a vUDF. In this case no column expansion needs to
@@ -8766,7 +8766,9 @@ def applymap(
87668766
Function to apply to each element of the QueryCompiler.
87678767
na_action: If 'ignore', propagate NULL values
87688768
*args : iterable
8769+
Positional arguments passed to func after the input data.
87698770
**kwargs : dict
8771+
Additional keyword arguments to pass as keyword arguments to func.
87708772
"""
87718773
self._raise_not_implemented_error_for_timedelta()
87728774

@@ -8799,15 +8801,17 @@ def applymap(
87998801
ErrorMessage.not_implemented(
88008802
"Snowpark pandas applymap API doesn't yet support na_action == 'ignore'"
88018803
)
8802-
return_type = deduce_return_type_from_function(func)
8803-
if not return_type:
8804-
return_type = VariantType()
88058804

88068805
# create and apply udfs on all data columns
88078806
replace_mapping = {}
88088807
for f in self._modin_frame.ordered_dataframe.schema.fields:
88098808
identifier = f.column_identifier.quoted_name
88108809
if identifier in self._modin_frame.data_column_snowflake_quoted_identifiers:
8810+
return_type = deduce_return_type_from_function(
8811+
func, f.datatype, **kwargs
8812+
)
8813+
if not return_type:
8814+
return_type = VariantType()
88118815
func_udf = create_udf_for_series_apply(
88128816
func,
88138817
return_type,

src/snowflake/snowpark/modin/plugin/docstrings/series.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -687,7 +687,8 @@ def apply():
687687
Notes
688688
-----
689689
1. When ``func`` has a type annotation for its return value, the result will be cast
690-
to the corresponding dtype. When no type annotation is provided, data will be converted
690+
to the corresponding dtype. When no type annotation is provided, we try to infer
691+
return type using dummy data. If return type inference is not successful, data will be converted
691692
to VARIANT type in Snowflake, and the result will have ``dtype=object``. In this case, the return value must
692693
be JSON-serializable, which can be a valid input to ``json.dumps`` (e.g., ``dict`` and
693694
``list`` objects are JSON-serializable, but ``bytes`` and ``datetime.datetime`` objects

tests/integ/modin/frame/test_applymap.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,12 @@ def test_preserve_order():
166166
eval_snowpark_pandas_result(df, native_df, lambda x: x.applymap(lambda y: -y))
167167

168168

169+
@sql_count_checker(
170+
query_count=10,
171+
udf_count=1,
172+
high_count_expected=True,
173+
high_count_reason="udf creation",
174+
)
169175
def test_applymap_variant_json_null():
170176
def f(x):
171177
if native_pd.isna(x):
@@ -182,11 +188,5 @@ def f(x):
182188
# the last column is a variant column [None, pd.NA], where both None and pd.NA
183189
# are mapped to SQL null by Python UDF in the input
184190
df = pd.DataFrame([[1, 2, None], [3, 4, pd.NA]])
185-
with SqlCounter(query_count=9):
186-
df = df.applymap(f)
187-
188-
with SqlCounter(query_count=1, udf_count=1):
189-
assert df.isna().to_numpy().tolist() == [
190-
[False, True, True],
191-
[True, False, True],
192-
]
191+
native_df = native_pd.DataFrame([[1, 2, None], [3, 4, pd.NA]])
192+
eval_snowpark_pandas_result(df, native_df, lambda x: x.applymap(f).isna())

0 commit comments

Comments
 (0)