
Commit da691b4

feat: Add user-defined window function (UDWF) decorator and tests
- Implemented the `udwf` decorator to create user-defined window functions, allowing for more flexible function definitions.
- Enhanced the `udwf` method to support both function and decorator usage (a usage sketch follows the udf.py diff below).
- Added tests for `udwf` decorator functionality, including default and parameterized use cases.
- Included tests for window frame decorators to validate behavior with and without ordering.
1 parent b194a87 commit da691b4

File tree: 3 files changed (+129, -60 lines)
- python/datafusion/udf.py
- python/tests/test_dataframe.py
- python/tests/test_udwf.py

python/datafusion/udf.py

Lines changed: 57 additions & 40 deletions
@@ -623,59 +623,76 @@ def __call__(self, *args: Expr) -> Expr:
 
     @staticmethod
     def udwf(
-        func: Callable[[], WindowEvaluator],
-        input_types: pa.DataType | list[pa.DataType],
-        return_type: pa.DataType,
-        volatility: Volatility | str,
-        name: Optional[str] = None,
-    ) -> WindowUDF:
-        """Create a new User-Defined Window Function.
+        func: Callable[[], WindowEvaluator] | None = None,
+        input_types: pa.DataType | list[pa.DataType] | None = None,
+        return_type: pa.DataType | None = None,
+        volatility: Volatility | str | None = None,
+        name: str | None = None,
+    ) -> Union[WindowUDF, Callable[[Callable[[], WindowEvaluator]], WindowUDF]]:
+        """Create a new User-Defined Window Function (UDWF).
 
-        If your :py:class:`WindowEvaluator` can be instantiated with no arguments, you
-        can simply pass it's type as ``func``. If you need to pass additional arguments
-        to it's constructor, you can define a lambda or a factory method. During runtime
-        the :py:class:`WindowEvaluator` will be constructed for every instance in
-        which this UDWF is used. The following examples are all valid.
+        This method can be used both as a function and as a decorator:
 
-        .. code-block:: python
+        As a function:
+            udwf(func, input_types, return_type, volatility, name)
 
-            import pyarrow as pa
+        As a decorator:
+            @udwf(input_types, return_type, volatility, name)
+            def func():
+                return WindowEvaluator()
 
-            class BiasedNumbers(WindowEvaluator):
-                def __init__(self, start: int = 0) -> None:
-                    self.start = start
+        Args:
+            func: The window evaluator factory function
+            input_types: The input types for the window function
+            return_type: The return type for the window function
+            volatility: The volatility of the function
+            name: Optional name for the function
 
-                def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
-                    return pa.array([self.start + i for i in range(num_rows)])
+        Returns:
+            Either a WindowUDF instance or a decorator function
+        """
+        # Used as decorator without arguments: @udwf
+        if func is not None and all(
+            x is None for x in (input_types, return_type, volatility)
+        ):
+            return WindowUDF._create(
+                func, [pa.float64()], pa.float64(), "volatile", None
+            )
 
-            def bias_10() -> BiasedNumbers:
-                return BiasedNumbers(10)
+        # Used as decorator with arguments: @udwf(...)
+        if func is None:
 
-            udwf1 = udwf(BiasedNumbers, pa.int64(), pa.int64(), "immutable")
-            udwf2 = udwf(bias_10, pa.int64(), pa.int64(), "immutable")
-            udwf3 = udwf(lambda: BiasedNumbers(20), pa.int64(), pa.int64(), "immutable")
+            def decorator(f: Callable[[], WindowEvaluator]) -> WindowUDF:
+                if input_types is None or return_type is None or volatility is None:
+                    raise ValueError(
+                        "Must provide input_types, return_type, and volatility"
+                    )
+                return WindowUDF._create(f, input_types, return_type, volatility, name)
 
-        Args:
-            func: A callable to create the window function.
-            input_types: The data types of the arguments to ``func``.
-            return_type: The data type of the return value.
-            volatility: See :py:class:`Volatility` for allowed values.
-            arguments: A list of arguments to pass in to the __init__ method for accum.
-            name: A descriptive name for the function.
+            return decorator
 
-        Returns:
-            A user-defined window function.
-        """  # noqa: W505, E501
+        # Used as function: udwf(...)
+        if input_types is None or return_type is None or volatility is None:
+            raise ValueError("Must provide input_types, return_type, and volatility")
+        return WindowUDF._create(func, input_types, return_type, volatility, name)
+
+    @staticmethod
+    def _create(
+        func: Callable[[], WindowEvaluator],
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: pa.DataType,
+        volatility: Volatility | str,
+        name: str | None = None,
+    ) -> WindowUDF:
+        """Internal method to create a WindowUDF instance."""
         if not callable(func):
-            msg = "`func` must be callable."
-            raise TypeError(msg)
+            raise TypeError("`func` must be callable")
         if not isinstance(func(), WindowEvaluator):
-            msg = "`func` must implement the abstract base class WindowEvaluator"
-            raise TypeError(msg)
-        if name is None:
-            name = func().__class__.__qualname__.lower()
+            raise TypeError("`func` must implement WindowEvaluator")
         if isinstance(input_types, pa.DataType):
             input_types = [input_types]
+        if name is None:
+            name = func().__class__.__qualname__.lower()
         return WindowUDF(
             name=name,
             func=func,
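
Read together, the new dispatch supports three call styles. The sketch below is illustrative only: it assumes `udwf` and `WindowEvaluator` can be imported from `datafusion.udf`, and it uses a made-up evaluator class. Note that the decorator-with-arguments form is written with keyword arguments here, since with the signature above a positional first argument would bind to `func`.

import pyarrow as pa
from datafusion.udf import WindowEvaluator, udwf  # assumed import path

class RowIndex(WindowEvaluator):  # hypothetical evaluator, for illustration only
    def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
        # Emit each row's index within the partition as a float column.
        return pa.array([float(i) for i in range(num_rows)])

# 1. Plain function call, as before:
udwf1 = udwf(RowIndex, pa.float64(), pa.float64(), "immutable")

# 2. Bare decorator: picks up the float64/"volatile" defaults from the first branch.
@udwf
def row_index():
    return RowIndex()

# 3. Decorator with arguments, passed by keyword:
@udwf(input_types=pa.float64(), return_type=pa.float64(), volatility="immutable")
def row_index_typed():
    return RowIndex()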

python/tests/test_dataframe.py

Lines changed: 20 additions & 20 deletions
@@ -339,7 +339,7 @@ def test_join():
 
     # Verify we don't make a breaking change to pre-43.0.0
     # where users would pass join_keys as a positional argument
-    df2 = df.join(df1, (["a"], ["a"]), how="inner")
+    df2 = df.join(df1, (["a"], ["a"]), how="inner")  # type: ignore
     df2.show()
     df2 = df2.sort(column("l.a"))
     table = pa.Table.from_batches(df2.collect())
@@ -375,17 +375,17 @@ def test_join_invalid_params():
     with pytest.raises(
         ValueError, match=r"`left_on` or `right_on` should not provided with `on`"
     ):
-        df2 = df.join(df1, on="a", how="inner", right_on="test")
+        df2 = df.join(df1, on="a", how="inner", right_on="test")  # type: ignore
 
     with pytest.raises(
         ValueError, match=r"`left_on` and `right_on` should both be provided."
     ):
-        df2 = df.join(df1, left_on="a", how="inner")
+        df2 = df.join(df1, left_on="a", how="inner")  # type: ignore
 
     with pytest.raises(
         ValueError, match=r"either `on` or `left_on` and `right_on` should be provided."
     ):
-        df2 = df.join(df1, how="inner")
+        df2 = df.join(df1, how="inner")  # type: ignore
 
 
 def test_join_on():
@@ -567,7 +567,7 @@ def test_distinct():
     ]
 
 
-@pytest.mark.parametrize(("name", "expr", "result"), data_test_window_functions)
+@pytest.mark.parametrize("name,expr,result", data_test_window_functions)
 def test_window_functions(partitioned_df, name, expr, result):
     df = partitioned_df.select(
         column("a"), column("b"), column("c"), f.alias(expr, name)
@@ -730,9 +730,7 @@ def test_optimized_logical_plan(aggregate_df):
 def test_execution_plan(aggregate_df):
     plan = aggregate_df.execution_plan()
 
-    expected = (
-        "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[sum(test.c2)]\n"
-    )
+    expected = "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[sum(test.c2)]\n"  # noqa: E501
 
     assert expected == plan.display()
 
@@ -756,7 +754,7 @@ def test_execution_plan(aggregate_df):
 
     ctx = SessionContext()
     rows_returned = 0
-    for idx in range(plan.partition_count):
+    for idx in range(0, plan.partition_count):
         stream = ctx.execute(plan, idx)
         try:
             batch = stream.next()
@@ -885,7 +883,7 @@ def test_union_distinct(ctx):
     )
     df_c = ctx.create_dataframe([[batch]]).sort(column("a"))
 
-    df_a_u_b = df_a.union(df_b, distinct=True).sort(column("a"))
+    df_a_u_b = df_a.union(df_b, True).sort(column("a"))
 
     assert df_c.collect() == df_a_u_b.collect()
     assert df_c.collect() == df_a_u_b.collect()
@@ -954,6 +952,8 @@ def test_to_arrow_table(df):
 
 def test_execute_stream(df):
     stream = df.execute_stream()
+    for s in stream:
+        print(type(s))
     assert all(batch is not None for batch in stream)
     assert not list(stream)  # after one iteration the generator must be exhausted
 
@@ -967,7 +967,7 @@ def test_execute_stream_to_arrow_table(df, schema):
             (batch.to_pyarrow() for batch in stream), schema=df.schema()
         )
     else:
-        pyarrow_table = pa.Table.from_batches(batch.to_pyarrow() for batch in stream)
+        pyarrow_table = pa.Table.from_batches((batch.to_pyarrow() for batch in stream))
 
     assert isinstance(pyarrow_table, pa.Table)
     assert pyarrow_table.shape == (3, 3)
@@ -1031,7 +1031,7 @@ def test_describe(df):
     }
 
 
-@pytest.mark.parametrize("path_to_str", [True, False])
+@pytest.mark.parametrize("path_to_str", (True, False))
 def test_write_csv(ctx, df, tmp_path, path_to_str):
     path = str(tmp_path) if path_to_str else tmp_path
 
@@ -1044,7 +1044,7 @@ def test_write_csv(ctx, df, tmp_path, path_to_str):
     assert result == expected
 
 
-@pytest.mark.parametrize("path_to_str", [True, False])
+@pytest.mark.parametrize("path_to_str", (True, False))
 def test_write_json(ctx, df, tmp_path, path_to_str):
     path = str(tmp_path) if path_to_str else tmp_path
 
@@ -1057,7 +1057,7 @@ def test_write_json(ctx, df, tmp_path, path_to_str):
     assert result == expected
 
 
-@pytest.mark.parametrize("path_to_str", [True, False])
+@pytest.mark.parametrize("path_to_str", (True, False))
 def test_write_parquet(df, tmp_path, path_to_str):
     path = str(tmp_path) if path_to_str else tmp_path
 
@@ -1069,7 +1069,7 @@ def test_write_parquet(df, tmp_path, path_to_str):
 
 
 @pytest.mark.parametrize(
-    ("compression", "compression_level"),
+    "compression, compression_level",
     [("gzip", 6), ("brotli", 7), ("zstd", 15)],
 )
 def test_write_compressed_parquet(df, tmp_path, compression, compression_level):
@@ -1080,7 +1080,7 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level):
     )
 
     # test that the actual compression scheme is the one written
-    for _root, _dirs, files in os.walk(path):
+    for root, dirs, files in os.walk(path):
         for file in files:
             if file.endswith(".parquet"):
                 metadata = pq.ParquetFile(tmp_path / file).metadata.to_dict()
@@ -1095,7 +1095,7 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level):
 
 
 @pytest.mark.parametrize(
-    ("compression", "compression_level"),
+    "compression, compression_level",
     [("gzip", 12), ("brotli", 15), ("zstd", 23), ("wrong", 12)],
 )
 def test_write_compressed_parquet_wrong_compression_level(
@@ -1150,7 +1150,7 @@ def test_dataframe_export(df) -> None:
     table = pa.table(df, schema=desired_schema)
     assert table.num_columns == 1
    assert table.num_rows == 3
-    for i in range(3):
+    for i in range(0, 3):
         assert table[0][i].as_py() is None
 
     # Expect an error when we cannot convert schema
@@ -1184,8 +1184,8 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame:
     result = df.to_pydict()
 
     assert result["a"] == [1, 2, 3]
-    assert result["string_col"] == ["string data" for _i in range(3)]
-    assert result["new_col"] == [3 for _i in range(3)]
+    assert result["string_col"] == ["string data" for _i in range(0, 3)]
+    assert result["new_col"] == [3 for _i in range(0, 3)]
 
 
 def test_dataframe_repr_html(df) -> None:
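
A side note on the parametrize rewrites in this file: pytest accepts argnames either as a sequence of strings or as a single comma-separated string, so the two spellings behave identically. A minimal self-contained illustration:

import pytest

# Tuple form and comma-separated string form are interchangeable in pytest:
@pytest.mark.parametrize(("value", "expected"), [(1, 2), (3, 4)])
def test_tuple_form(value, expected):
    assert value + 1 == expected

@pytest.mark.parametrize("value,expected", [(1, 2), (3, 4)])
def test_string_form(value, expected):
    assert value + 1 == expected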

python/tests/test_udwf.py

Lines changed: 52 additions & 0 deletions
@@ -306,3 +306,55 @@ def test_udwf_functions(df, name, expr, expected):
     result = df.sort(column("a")).select(column(name)).collect()[0]
 
     assert result.column(0) == pa.array(expected)
+
+
+def test_udwf_decorator(df):
+    @udwf
+    def smooth_default():
+        return ExponentialSmoothDefault()
+
+    df1 = df.select(smooth_default()(column("a")))
+    result = df1.collect()[0].column(0)
+    # Test just the first few values with more lenient comparison
+    assert abs(result[0].as_py() - 0.0) < 1e-6
+    assert abs(result[1].as_py() - 1.0) < 1e-6
+    assert abs(result[2].as_py() - 2.1) < 1e-6
+
+    # Test with explicit types
+    @udwf(pa.float64(), pa.float64(), "immutable")
+    def smooth_with_args():
+        return ExponentialSmoothDefault(alpha=0.8)
+
+    df2 = df.select(smooth_with_args()(column("a")))
+    result = df2.collect()[0].column(0)
+    # Test just the first few values
+    assert abs(result[0].as_py() - 0.0) < 1e-6
+    assert abs(result[1].as_py() - 1.0) < 1e-6
+    assert abs(result[2].as_py() - 1.8) < 1e-6
+
+
+def test_udwf_with_window_frame_decorator(df):
+    @udwf(pa.float64(), pa.float64(), "immutable")
+    def smooth_frame():
+        return ExponentialSmoothFrame(alpha=0.9)
+
+    # Create window function and apply transformations
+    window_fn = smooth_frame()(column("a"))
+    window_fn = window_fn.window_frame(WindowFrame("rows", None, 0))
+    window_fn = window_fn.build()
+
+    result = df.select(window_fn).collect()[0].column(0)
+    # Test just the first few values
+    assert abs(result[0].as_py() - 0.0) < 1e-6
+    assert abs(result[1].as_py() - 0.9) < 1e-6
+
+    # With order by
+    window_fn = smooth_frame()(column("a"))
+    window_fn = window_fn.window_frame(WindowFrame("rows", None, 0))
+    window_fn = window_fn.order_by(column("b"))
+    window_fn = window_fn.build()
+
+    result = df.select(window_fn).collect()[0].column(0)
+    # Test just the first few values
+    assert abs(result[0].as_py() - 0.551) < 1e-3
+    assert abs(result[1].as_py() - 1.13) < 1e-3
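
The `ExponentialSmoothDefault` and `ExponentialSmoothFrame` helpers these tests call are defined earlier in test_udwf.py and are not part of this diff. For orientation, here is a minimal sketch of the shape such an evaluator takes, based on the `WindowEvaluator.evaluate_all` interface shown in the udf.py docstring above; it is a hypothetical simplification, not the actual test helper:

import pyarrow as pa
from datafusion.udf import WindowEvaluator  # assumed import path

class SmoothSketch(WindowEvaluator):  # hypothetical, for illustration only
    def __init__(self, alpha: float = 0.9) -> None:
        self.alpha = alpha

    def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
        # Exponentially smooth the first argument column across the partition.
        results: list[float] = []
        prev = 0.0
        for i in range(num_rows):
            curr = values[0][i].as_py()
            prev = curr if i == 0 else self.alpha * curr + (1.0 - self.alpha) * prev
            results.append(prev)
        return pa.array(results)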
