Commit 6ee8696

Revert "feat: Add user-defined window function (UDWF) decorator and tests"

This reverts commit da691b4.

1 parent da691b4 · commit 6ee8696

File tree

3 files changed: +60 −129 lines changed

python/datafusion/udf.py
python/tests/test_dataframe.py
python/tests/test_udwf.py

python/datafusion/udf.py

Lines changed: 40 additions & 57 deletions
@@ -623,76 +623,59 @@ def __call__(self, *args: Expr) -> Expr:
 
     @staticmethod
     def udwf(
-        func: Callable[[], WindowEvaluator] | None = None,
-        input_types: pa.DataType | list[pa.DataType] | None = None,
-        return_type: pa.DataType | None = None,
-        volatility: Volatility | str | None = None,
-        name: str | None = None,
-    ) -> Union[WindowUDF, Callable[[Callable[[], WindowEvaluator]], WindowUDF]]:
-        """Create a new User-Defined Window Function (UDWF).
-
-        This method can be used both as a function and as a decorator:
+        func: Callable[[], WindowEvaluator],
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: pa.DataType,
+        volatility: Volatility | str,
+        name: Optional[str] = None,
+    ) -> WindowUDF:
+        """Create a new User-Defined Window Function.
 
-        As a function:
-            udwf(func, input_types, return_type, volatility, name)
+        If your :py:class:`WindowEvaluator` can be instantiated with no arguments, you
+        can simply pass its type as ``func``. If you need to pass additional arguments
+        to its constructor, you can define a lambda or a factory method. During runtime
+        the :py:class:`WindowEvaluator` will be constructed for every instance in
+        which this UDWF is used. The following examples are all valid.
 
-        As a decorator:
-            @udwf(input_types, return_type, volatility, name)
-            def func():
-                return WindowEvaluator()
+        .. code-block:: python
 
-        Args:
-            func: The window evaluator factory function
-            input_types: The input types for the window function
-            return_type: The return type for the window function
-            volatility: The volatility of the function
-            name: Optional name for the function
+            import pyarrow as pa
 
-        Returns:
-            Either a WindowUDF instance or a decorator function
-        """
-        # Used as decorator without arguments: @udwf
-        if func is not None and all(
-            x is None for x in (input_types, return_type, volatility)
-        ):
-            return WindowUDF._create(
-                func, [pa.float64()], pa.float64(), "volatile", None
-            )
+            class BiasedNumbers(WindowEvaluator):
+                def __init__(self, start: int = 0) -> None:
+                    self.start = start
 
-        # Used as decorator with arguments: @udwf(...)
-        if func is None:
+                def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
+                    return pa.array([self.start + i for i in range(num_rows)])
 
-            def decorator(f: Callable[[], WindowEvaluator]) -> WindowUDF:
-                if input_types is None or return_type is None or volatility is None:
-                    raise ValueError(
-                        "Must provide input_types, return_type, and volatility"
-                    )
-                return WindowUDF._create(f, input_types, return_type, volatility, name)
+            def bias_10() -> BiasedNumbers:
+                return BiasedNumbers(10)
 
-            return decorator
+            udwf1 = udwf(BiasedNumbers, pa.int64(), pa.int64(), "immutable")
+            udwf2 = udwf(bias_10, pa.int64(), pa.int64(), "immutable")
+            udwf3 = udwf(lambda: BiasedNumbers(20), pa.int64(), pa.int64(), "immutable")
 
-        # Used as function: udwf(...)
-        if input_types is None or return_type is None or volatility is None:
-            raise ValueError("Must provide input_types, return_type, and volatility")
-        return WindowUDF._create(func, input_types, return_type, volatility, name)
+        Args:
+            func: A callable to create the window function.
+            input_types: The data types of the arguments to ``func``.
+            return_type: The data type of the return value.
+            volatility: See :py:class:`Volatility` for allowed values.
+            arguments: A list of arguments to pass in to the __init__ method for accum.
+            name: A descriptive name for the function.
 
-    @staticmethod
-    def _create(
-        func: Callable[[], WindowEvaluator],
-        input_types: pa.DataType | list[pa.DataType],
-        return_type: pa.DataType,
-        volatility: Volatility | str,
-        name: str | None = None,
-    ) -> WindowUDF:
-        """Internal method to create a WindowUDF instance."""
+        Returns:
+            A user-defined window function.
+        """  # noqa: W505, E501
         if not callable(func):
-            raise TypeError("`func` must be callable")
+            msg = "`func` must be callable."
+            raise TypeError(msg)
         if not isinstance(func(), WindowEvaluator):
-            raise TypeError("`func` must implement WindowEvaluator")
-        if isinstance(input_types, pa.DataType):
-            input_types = [input_types]
+            msg = "`func` must implement the abstract base class WindowEvaluator"
+            raise TypeError(msg)
         if name is None:
             name = func().__class__.__qualname__.lower()
+        if isinstance(input_types, pa.DataType):
+            input_types = [input_types]
         return WindowUDF(
             name=name,
             func=func,
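
For context, a minimal runnable sketch of how the restored, explicit-argument API is exercised end to end. It assumes the top-level `datafusion` re-exports (`SessionContext`, `col`, `udwf`), imports `WindowEvaluator` from `datafusion.udf`, and reuses the `BiasedNumbers` evaluator from the docstring above; the `from_pydict` setup is illustrative, not part of this commit.

import pyarrow as pa
from datafusion import SessionContext, col, udwf
from datafusion.udf import WindowEvaluator

class BiasedNumbers(WindowEvaluator):
    """Emit start, start + 1, ... for each row of the partition."""

    def __init__(self, start: int = 0) -> None:
        self.start = start

    def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
        return pa.array([self.start + i for i in range(num_rows)])

# Both argument styles from the docstring tell udwf how to construct
# the evaluator at runtime:
udwf1 = udwf(BiasedNumbers, pa.int64(), pa.int64(), "immutable")
udwf2 = udwf(lambda: BiasedNumbers(20), pa.int64(), pa.int64(), "immutable")

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 2, 3]})
df.select(udwf1(col("a"))).show()  # emits 0, 1, 2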

python/tests/test_dataframe.py

Lines changed: 20 additions & 20 deletions
@@ -339,7 +339,7 @@ def test_join():
 
     # Verify we don't make a breaking change to pre-43.0.0
     # where users would pass join_keys as a positional argument
-    df2 = df.join(df1, (["a"], ["a"]), how="inner")  # type: ignore
+    df2 = df.join(df1, (["a"], ["a"]), how="inner")
     df2.show()
     df2 = df2.sort(column("l.a"))
     table = pa.Table.from_batches(df2.collect())

@@ -375,17 +375,17 @@ def test_join_invalid_params():
     with pytest.raises(
         ValueError, match=r"`left_on` or `right_on` should not provided with `on`"
     ):
-        df2 = df.join(df1, on="a", how="inner", right_on="test")  # type: ignore
+        df2 = df.join(df1, on="a", how="inner", right_on="test")
 
     with pytest.raises(
         ValueError, match=r"`left_on` and `right_on` should both be provided."
     ):
-        df2 = df.join(df1, left_on="a", how="inner")  # type: ignore
+        df2 = df.join(df1, left_on="a", how="inner")
 
     with pytest.raises(
         ValueError, match=r"either `on` or `left_on` and `right_on` should be provided."
     ):
-        df2 = df.join(df1, how="inner")  # type: ignore
+        df2 = df.join(df1, how="inner")
 
 
 def test_join_on():

@@ -567,7 +567,7 @@ def test_distinct():
     ]
 
 
-@pytest.mark.parametrize("name,expr,result", data_test_window_functions)
+@pytest.mark.parametrize(("name", "expr", "result"), data_test_window_functions)
 def test_window_functions(partitioned_df, name, expr, result):
     df = partitioned_df.select(
         column("a"), column("b"), column("c"), f.alias(expr, name)

@@ -730,7 +730,9 @@ def test_optimized_logical_plan(aggregate_df):
 def test_execution_plan(aggregate_df):
     plan = aggregate_df.execution_plan()
 
-    expected = "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[sum(test.c2)]\n"  # noqa: E501
+    expected = (
+        "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[sum(test.c2)]\n"
+    )
 
     assert expected == plan.display()
 

@@ -754,7 +756,7 @@ def test_execution_plan(aggregate_df):
 
     ctx = SessionContext()
     rows_returned = 0
-    for idx in range(0, plan.partition_count):
+    for idx in range(plan.partition_count):
         stream = ctx.execute(plan, idx)
         try:
             batch = stream.next()

@@ -883,7 +885,7 @@ def test_union_distinct(ctx):
     )
     df_c = ctx.create_dataframe([[batch]]).sort(column("a"))
 
-    df_a_u_b = df_a.union(df_b, True).sort(column("a"))
+    df_a_u_b = df_a.union(df_b, distinct=True).sort(column("a"))
 
     assert df_c.collect() == df_a_u_b.collect()

@@ -952,8 +954,6 @@ def test_to_arrow_table(df):
 
 def test_execute_stream(df):
     stream = df.execute_stream()
-    for s in stream:
-        print(type(s))
     assert all(batch is not None for batch in stream)
     assert not list(stream)  # after one iteration the generator must be exhausted
 

@@ -967,7 +967,7 @@ def test_execute_stream_to_arrow_table(df, schema):
             (batch.to_pyarrow() for batch in stream), schema=df.schema()
         )
     else:
-        pyarrow_table = pa.Table.from_batches((batch.to_pyarrow() for batch in stream))
+        pyarrow_table = pa.Table.from_batches(batch.to_pyarrow() for batch in stream)
 
     assert isinstance(pyarrow_table, pa.Table)
     assert pyarrow_table.shape == (3, 3)

@@ -1031,7 +1031,7 @@ def test_describe(df):
     }
 
 
-@pytest.mark.parametrize("path_to_str", (True, False))
+@pytest.mark.parametrize("path_to_str", [True, False])
 def test_write_csv(ctx, df, tmp_path, path_to_str):
     path = str(tmp_path) if path_to_str else tmp_path
 

@@ -1044,7 +1044,7 @@ def test_write_csv(ctx, df, tmp_path, path_to_str):
     assert result == expected
 
 
-@pytest.mark.parametrize("path_to_str", (True, False))
+@pytest.mark.parametrize("path_to_str", [True, False])
 def test_write_json(ctx, df, tmp_path, path_to_str):
     path = str(tmp_path) if path_to_str else tmp_path
 

@@ -1057,7 +1057,7 @@ def test_write_json(ctx, df, tmp_path, path_to_str):
     assert result == expected
 
 
-@pytest.mark.parametrize("path_to_str", (True, False))
+@pytest.mark.parametrize("path_to_str", [True, False])
 def test_write_parquet(df, tmp_path, path_to_str):
     path = str(tmp_path) if path_to_str else tmp_path
 

@@ -1069,7 +1069,7 @@ def test_write_parquet(df, tmp_path, path_to_str):
 
 
 @pytest.mark.parametrize(
-    "compression, compression_level",
+    ("compression", "compression_level"),
     [("gzip", 6), ("brotli", 7), ("zstd", 15)],
 )
 def test_write_compressed_parquet(df, tmp_path, compression, compression_level):

@@ -1080,7 +1080,7 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level):
     )
 
     # test that the actual compression scheme is the one written
-    for root, dirs, files in os.walk(path):
+    for _root, _dirs, files in os.walk(path):
         for file in files:
             if file.endswith(".parquet"):
                 metadata = pq.ParquetFile(tmp_path / file).metadata.to_dict()

@@ -1095,7 +1095,7 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level):
 
 
 @pytest.mark.parametrize(
-    "compression, compression_level",
+    ("compression", "compression_level"),
     [("gzip", 12), ("brotli", 15), ("zstd", 23), ("wrong", 12)],
 )
 def test_write_compressed_parquet_wrong_compression_level(

@@ -1150,7 +1150,7 @@ def test_dataframe_export(df) -> None:
     table = pa.table(df, schema=desired_schema)
     assert table.num_columns == 1
     assert table.num_rows == 3
-    for i in range(0, 3):
+    for i in range(3):
         assert table[0][i].as_py() is None
 
     # Expect an error when we cannot convert schema

@@ -1184,8 +1184,8 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame:
     result = df.to_pydict()
 
     assert result["a"] == [1, 2, 3]
-    assert result["string_col"] == ["string data" for _i in range(0, 3)]
-    assert result["new_col"] == [3 for _i in range(0, 3)]
+    assert result["string_col"] == ["string data" for _i in range(3)]
+    assert result["new_col"] == [3 for _i in range(3)]
 
 
 def test_dataframe_repr_html(df) -> None:
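
Most of the edits above are mechanical lint fixes, and the `parametrize` changes look like ruff's pytest-style rules (PT006 for argument names, PT007 for argument values): several names are passed as a tuple of strings instead of one comma-separated string, and value collections become lists. A small sketch of the two styles; the test name and body here are illustrative only.

import pytest

# Before: names in one comma-separated string, values in a tuple.
# @pytest.mark.parametrize("compression, compression_level", (("gzip", 6),))

# After: names as a tuple of strings, values as a list.
@pytest.mark.parametrize(("compression", "compression_level"), [("gzip", 6), ("zstd", 15)])
def test_compression_pair(compression, compression_level):
    assert isinstance(compression, str)
    assert compression_level > 0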

python/tests/test_udwf.py

Lines changed: 0 additions & 52 deletions
@@ -306,55 +306,3 @@ def test_udwf_functions(df, name, expr, expected):
     result = df.sort(column("a")).select(column(name)).collect()[0]
 
     assert result.column(0) == pa.array(expected)
-
-
-def test_udwf_decorator(df):
-    @udwf
-    def smooth_default():
-        return ExponentialSmoothDefault()
-
-    df1 = df.select(smooth_default()(column("a")))
-    result = df1.collect()[0].column(0)
-    # Test just the first few values with more lenient comparison
-    assert abs(result[0].as_py() - 0.0) < 1e-6
-    assert abs(result[1].as_py() - 1.0) < 1e-6
-    assert abs(result[2].as_py() - 2.1) < 1e-6
-
-    # Test with explicit types
-    @udwf(pa.float64(), pa.float64(), "immutable")
-    def smooth_with_args():
-        return ExponentialSmoothDefault(alpha=0.8)
-
-    df2 = df.select(smooth_with_args()(column("a")))
-    result = df2.collect()[0].column(0)
-    # Test just the first few values
-    assert abs(result[0].as_py() - 0.0) < 1e-6
-    assert abs(result[1].as_py() - 1.0) < 1e-6
-    assert abs(result[2].as_py() - 1.8) < 1e-6
-
-
-def test_udwf_with_window_frame_decorator(df):
-    @udwf(pa.float64(), pa.float64(), "immutable")
-    def smooth_frame():
-        return ExponentialSmoothFrame(alpha=0.9)
-
-    # Create window function and apply transformations
-    window_fn = smooth_frame()(column("a"))
-    window_fn = window_fn.window_frame(WindowFrame("rows", None, 0))
-    window_fn = window_fn.build()
-
-    result = df.select(window_fn).collect()[0].column(0)
-    # Test just the first few values
-    assert abs(result[0].as_py() - 0.0) < 1e-6
-    assert abs(result[1].as_py() - 0.9) < 1e-6
-
-    # With order by
-    window_fn = smooth_frame()(column("a"))
-    window_fn = window_fn.window_frame(WindowFrame("rows", None, 0))
-    window_fn = window_fn.order_by(column("b"))
-    window_fn = window_fn.build()
-
-    result = df.select(window_fn).collect()[0].column(0)
-    # Test just the first few values
-    assert abs(result[0].as_py() - 0.551) < 1e-3
-    assert abs(result[1].as_py() - 1.13) < 1e-3
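
With the decorator forms removed, the same evaluators are registered through the plain `udwf` call that the surviving tests use. A sketch under the assumption that the `ExponentialSmoothDefault` evaluator and the `df` fixture defined earlier in test_udwf.py are in scope:

import pyarrow as pa
from datafusion import column, udwf

# No constructor arguments needed: pass the WindowEvaluator type directly.
smooth_default = udwf(ExponentialSmoothDefault, pa.float64(), pa.float64(), "immutable")

# Constructor arguments needed: wrap the evaluator in a lambda or factory.
smooth_with_args = udwf(
    lambda: ExponentialSmoothDefault(alpha=0.8),
    pa.float64(),
    pa.float64(),
    "immutable",
)

result = df.select(smooth_default(column("a"))).collect()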
