@@ -15395,6 +15395,209 @@ def _quantiles_single_col(
1539515395
1539615396 return SnowflakeQueryCompiler(internal_frame)
1539715397
@register_query_compiler_method_not_implemented(
    "BasePandasDataset",
    "interpolate",
    UnsupportedArgsRule(
        unsupported_conditions=[
            (
                lambda args: args.get("method")
                not in {"linear", "ffill", "pad", "bfill", "backfill"},
                lambda args: f"method = '{args.get('method')}' is not supported. Snowpark pandas currently only supports method = 'linear', 'ffill', 'pad', 'bfill', and 'backfill'",
            ),
            ("axis", 1),
            (
                lambda args: args.get("limit") is not None,
                lambda args: f"limit = {args.get('limit')} is not supported. Snowpark pandas currently only supports limit = None",
            ),
            (
                lambda args: args.get("downcast") is not None,
                lambda args: f"downcast = '{args.get('downcast')}' is not supported. Snowpark pandas currently only supports downcast = None",
            ),
        ]
    ),
)
def interpolate(
    self,
    method: str = "linear",
    axis: int = 0,
    limit: Optional[int] = None,
    inplace: bool = False,
    limit_direction: Literal["forward", "backward", "both", None] = None,
    limit_area: Literal[None, "inside", "outside"] = None,
    downcast: Literal["infer", None] = None,
) -> "SnowflakeQueryCompiler":
    """
    Interpolate missing values in a dataframe.

    With ``method="linear"`` only numeric and datetime columns are affected and other
    columns are left untouched; the fill methods ("pad"/"ffill", "backfill"/"bfill")
    are applied to every data column.

    Parameters
    ----------
    method: str, default: "linear"
        The method of interpolation. Native pandas supports a wide range of values for this argument,
        and uses it to call an appropriate scipy interpolation function. Snowpark pandas only supports
        "linear", "pad"/"ffill", and "backfill"/"bfill"; the "index"/"values" method can also be easily
        supported but is left as an exercise for some future implementor.
    axis: int, default: 0
        The axis across which to interpolate. Snowflake only supports 0 (columnar).
    limit: Optional[int], default: None
        The maximum number of consecutive NaN values to fill. Not supported by Snowpark pandas.
    inplace: bool, default: False
        Whether or not the interpolation occurs in-place. This argument is ignored and only provided
        for compatibility with Modin.
    limit_direction: Literal["forward", "backward", "both", None], default: None
        The direction in which to fill consecutive NaN values. If `method` is "pad" or "ffill"
        this must be "forward"; if `method` is "bfill" or "backfill" this must be "backward".

        The default value is "backward" for "bfill"/"backfill", and "forward" otherwise.
    limit_area: Literal["inside", "outside", None], default: None
        Restrictions on how consecutive NaN values should be filled. None means all NaN values
        are replaced, "inside" means only NaNs between valid values are replaced, and "outside"
        means only NaNs outside valid values are replaced.

        If the method is "linear", only "inside" and None are supported.

        If the method is "pad"/"ffill" or "backfill"/"bfill", only None is supported.
    downcast: Literal["infer", None], default: None
        Whether to downcast dtypes if possible. Not supported by Snowpark pandas.

    Returns
    -------
    SnowflakeQueryCompiler
        A query compiler containing the interpolated result.

    Raises
    ------
    NotImplementedError
        Raised through ``ErrorMessage.not_implemented`` for an unsupported `method`, or an
        unsupported combination of `method` and `limit_area`.
    ValueError
        If `limit_area` or `limit_direction` is not one of the accepted values, if
        `limit_direction` conflicts with a "pad"/"ffill" or "backfill"/"bfill" `method`, or if a
        non-linear `method` is used on a frame with MultiIndex rows.
    """
    if method == "linear":
        sql_fill_method = "interpolate_linear"
    elif method == "pad" or method == "ffill":
        sql_fill_method = "interpolate_ffill"
    elif method == "backfill" or method == "bfill":
        sql_fill_method = "interpolate_bfill"
    else:
        ErrorMessage.not_implemented(
            f"Snowpark pandas does not yet support interpolate with method = {method}"
        )
    # Reject unknown limit_area values up front instead of silently treating them as "inside"
    # further down. Comparison is case-insensitive, mirroring native pandas validation.
    if limit_area is not None:
        valid_limit_areas = ["inside", "outside"]
        limit_area = limit_area.lower()
        if limit_area not in valid_limit_areas:
            raise ValueError(
                f"Invalid limit_area: expecting one of {valid_limit_areas}, got "
                f"{limit_area}."
            )
    # The high-level approaches for each supported fill method are as follows.
    # Linear fill:
    # - limit_area=None: INTERPOLATE_LINEAR, then
    #   - INTERPOLATE_FFILL if limit_direction = "forward"
    #   - INTERPOLATE_BFILL if limit_direction = "backward"
    #   - do both FFILL and BFILL if limit_direction = "both"
    # - limit_area="inside": INTERPOLATE_LINEAR only
    # - limit_area="outside": unsupported
    # Forwards fill: (direction is restricted to "forwards")
    # - limit_area=None: FFILL once
    # - limit_area="inside": unsupported
    # - limit_area="outside": unsupported
    # Backwards fill: (direction is restricted to "backwards")
    # - limit_area=None: BFILL once
    # - limit_area="inside": unsupported
    # - limit_area="outside": unsupported
    #
    # "outside" configurations could theoretically be done by finding the max/min row position
    # of non-null values in the table, but this gets complicated.
    if (
        (
            sql_fill_method == "interpolate_ffill"
            or sql_fill_method == "interpolate_bfill"
        )
        and limit_area is not None
    ) or (sql_fill_method == "interpolate_linear" and limit_area == "outside"):
        ErrorMessage.not_implemented(
            f"Snowpark pandas does not yet support interpolate with limit_area = {limit_area} for method = {method}"
        )
    # Validate limit_direction (these are actual ValueErrors, not unimplemented parameter combinations)
    if (
        sql_fill_method == "interpolate_ffill"
        and limit_direction is not None
        and limit_direction != "forward"
    ):
        raise ValueError(
            f"`limit_direction` must be 'forward' for method `{method}`"
        )
    if (
        sql_fill_method == "interpolate_bfill"
        and limit_direction is not None
        and limit_direction != "backward"
    ):
        raise ValueError(
            f"`limit_direction` must be 'backward' for method `{method}`"
        )
    # pandas only supports linear interpolation for MultiIndex rows.
    if self.is_multiindex(axis=0) and sql_fill_method != "interpolate_linear":
        raise ValueError(
            "Only `method=linear` interpolation is supported on MultiIndexes."
        )
    if limit_direction is None:
        limit_direction = (
            "backward" if sql_fill_method == "interpolate_bfill" else "forward"
        )
    else:
        # Only the linear method can reach this point with an arbitrary value (the fill
        # methods were constrained above). Without this check an unknown direction would
        # silently skip the leading/trailing fill in the linear branch below.
        valid_limit_directions = ["forward", "backward", "both"]
        limit_direction = limit_direction.lower()
        if limit_direction not in valid_limit_directions:
            raise ValueError(
                "Invalid limit_direction: expecting one of "
                f"{valid_limit_directions}, got '{limit_direction}'."
            )
    if self.get_axis_len(1) == 0:
        # If there's no columns, do nothing.
        return self
    frame = self._modin_frame.ensure_row_position_column()
    original_identifiers = (
        self._modin_frame.data_column_snowflake_quoted_identifiers
    )
    # Linear interpolation touches only numeric and datetime columns, but ffill and bfill work
    # on non-numeric data as well.
    # SNOW-2405318: Tests that hit this branch are skipped due to a SQL bug with INTERPOLATE_LINEAR.
    if sql_fill_method == "interpolate_linear":  # pragma: no cover
        columns_to_interpolate = [
            identifier
            for identifier, dtype in zip(
                original_identifiers, self._get_dtypes(original_identifiers)
            )
            if is_datetime64_any_dtype(dtype) or is_numeric_dtype(dtype)
        ]
    else:
        columns_to_interpolate = original_identifiers
    # Every interpolation window is ordered by row position.
    pos_window = Window.order_by(frame.row_position_snowflake_quoted_identifier)
    # SNOW-2405318: Tests that hit this branch are skipped due to a SQL bug with INTERPOLATE_LINEAR.
    # The branch was tested manually with the INTERPOLATE_LINEAR invocation replaced with
    # INTERPOLATE_FFILL to ensure it otherwise works; coverage should be returned after the
    # server-side bug is addressed.
    if (
        sql_fill_method == "interpolate_linear" and limit_area is None
    ):  # pragma: no cover
        # If the fill method is linear and limit_area is None, we need to fill leading/trailing
        # NULL values as well since the SQL function ordinarily does not touch them. Because
        # window functions cannot be nested, we implement this by adding 1 column with the FFILL
        # result (covers trailing NULLs), and 1 column with the BFILL result (covers leading
        # NULLs), then coalescing each interpolation together.
        # Note that this may create a SQL expression with 3x the columns of the original frame,
        # so it may become expensive. However, we expect most interpolations to occur on
        # single-column frames or Series, so this overhead is acceptable.
        #
        # Example:
        # pd.Series([nan, 0.0, nan, 1.0, nan]).interpolate(method="linear", limit_area=None)
        # 1. input column                 nan 0.0 nan 1.0 nan
        # 2. after linear interpolation   nan 0.0 0.5 1.0 nan
        # 3. after ffill                  nan 0.0 0.5 1.0 1.0
        # 4. after bfill                  0.0 0.0 0.5 1.0 1.0
        update_expr = {}
        for column_identifier in columns_to_interpolate:
            column = col(column_identifier)
            cols = [builtin("interpolate_linear")(column).over(pos_window)]
            if limit_direction == "forward" or limit_direction == "both":
                cols.append(builtin("interpolate_ffill")(column).over(pos_window))
            if limit_direction == "backward" or limit_direction == "both":
                cols.append(builtin("interpolate_bfill")(column).over(pos_window))
            update_expr[column_identifier] = coalesce(*cols)
    else:
        # Other parameter combinations map directly to SQL behavior.
        update_expr = {
            column_identifier: builtin(sql_fill_method)(
                col(column_identifier)
            ).over(pos_window)
            for column_identifier in columns_to_interpolate
        }
    return SnowflakeQueryCompiler(
        frame.update_snowflake_quoted_identifiers_with_expressions(update_expr)[
            0
        ].ensure_row_position_column()
    )
15600+
1539815601 @register_query_compiler_method_not_implemented(
1539915602 "BasePandasDataset",
1540015603 "skew",
0 commit comments