Commit 5654587

Merge branch 'apache:main' into 1056/refactor/add-additional-ruff-suggestions
2 parents: 914252c + 7c1c08f

File tree: 11 files changed, +422 −50 lines

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -133,6 +133,7 @@ dev = [
     "maturin>=1.8.1",
     "numpy>1.25.0",
     "pytest>=7.4.4",
+    "pytest-asyncio>=0.23.3",
     "ruff>=0.9.1",
     "toml>=0.10.2",
     "pygithub==2.5.0",

python/datafusion/expr.py

Lines changed: 4 additions & 4 deletions
@@ -193,7 +193,7 @@ class Expr:
     :ref:`Expressions` in the online documentation for more information.
     """
 
-    def __init__(self, expr: expr_internal.Expr) -> None:
+    def __init__(self, expr: expr_internal.RawExpr) -> None:
         """This constructor should not be called by the end user."""
         self.expr = expr
 
@@ -383,7 +383,7 @@ def literal(value: Any) -> Expr:
             value = pa.scalar(value, type=pa.string_view())
         if not isinstance(value, pa.Scalar):
             value = pa.scalar(value)
-        return Expr(expr_internal.Expr.literal(value))
+        return Expr(expr_internal.RawExpr.literal(value))
 
     @staticmethod
     def string_literal(value: str) -> Expr:
@@ -398,13 +398,13 @@ def string_literal(value: str) -> Expr:
         """
         if isinstance(value, str):
             value = pa.scalar(value, type=pa.string())
-            return Expr(expr_internal.Expr.literal(value))
+            return Expr(expr_internal.RawExpr.literal(value))
         return Expr.literal(value)
 
     @staticmethod
     def column(value: str) -> Expr:
         """Creates a new expression representing a column."""
-        return Expr(expr_internal.Expr.column(value))
+        return Expr(expr_internal.RawExpr.column(value))
 
     def alias(self, name: str) -> Expr:
         """Assign a name to the expression."""

python/datafusion/functions.py

Lines changed: 18 additions & 0 deletions
@@ -217,6 +217,7 @@
     "random",
     "range",
     "rank",
+    "regexp_count",
     "regexp_like",
     "regexp_match",
     "regexp_replace",
@@ -779,6 +780,23 @@ def regexp_replace(
     return Expr(f.regexp_replace(string.expr, pattern.expr, replacement.expr, flags))
 
 
+def regexp_count(
+    string: Expr, pattern: Expr, start: Expr, flags: Expr | None = None
+) -> Expr:
+    """Returns the number of matches in a string.
+
+    Optional start position (the first position is 1) to search for the regular
+    expression.
+    """
+    if flags is not None:
+        flags = flags.expr
+    if start is not None:
+        start = start.expr
+    else:
+        start = Expr.expr
+    return Expr(f.regexp_count(string.expr, pattern.expr, start, flags))
+
+
 def repeat(string: Expr, n: Expr) -> Expr:
     """Repeats the ``string`` to ``n`` times."""
     return Expr(f.repeat(string.expr, n.expr))
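A short usage sketch for the new `regexp_count` wrapper; the data and pattern mirror the test added in `python/tests/test_functions.py` further down, and the imports are the ones the test suite already uses:

```python
import pyarrow as pa
from datafusion import SessionContext, column, literal
from datafusion import functions as f

ctx = SessionContext()
batch = pa.RecordBatch.from_arrays([pa.array(["Hello", "World", "!"])], names=["a"])
df = ctx.create_dataframe([[batch]])

# Count matches of the pattern in column "a", searching from position 1.
# The test below expects [1, 1, 0] for this input.
df = df.select(f.regexp_count(column("a"), literal("(ell|orl)"), literal(1)).alias("n"))
df.show()
```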

python/datafusion/udf.py

Lines changed: 99 additions & 24 deletions
@@ -621,31 +621,48 @@ def __call__(self, *args: Expr) -> Expr:
         args_raw = [arg.expr for arg in args]
         return Expr(self._udwf.__call__(*args_raw))
 
+    @overload
+    @staticmethod
+    def udwf(
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: pa.DataType,
+        volatility: Volatility | str,
+        name: Optional[str] = None,
+    ) -> Callable[..., WindowUDF]: ...
+
+    @overload
     @staticmethod
     def udwf(
         func: Callable[[], WindowEvaluator],
         input_types: pa.DataType | list[pa.DataType],
         return_type: pa.DataType,
         volatility: Volatility | str,
         name: Optional[str] = None,
-    ) -> WindowUDF:
-        """Create a new User-Defined Window Function.
+    ) -> WindowUDF: ...
 
-        If your :py:class:`WindowEvaluator` can be instantiated with no arguments, you
-        can simply pass it's type as ``func``. If you need to pass additional arguments
-        to it's constructor, you can define a lambda or a factory method. During runtime
-        the :py:class:`WindowEvaluator` will be constructed for every instance in
-        which this UDWF is used. The following examples are all valid.
+    @staticmethod
+    def udwf(*args: Any, **kwargs: Any):  # noqa: D417
+        """Create a new User-Defined Window Function (UDWF).
 
-        .. code-block:: python
+        This class can be used both as a **function** and as a **decorator**.
+
+        Usage:
+        - **As a function**: Call `udwf(func, input_types, return_type, volatility,
+          name)`.
+        - **As a decorator**: Use `@udwf(input_types, return_type, volatility,
+          name)`. When using `udwf` as a decorator, **do not pass `func`
+          explicitly**.
 
+        **Function example:**
+        ```
         import pyarrow as pa
 
         class BiasedNumbers(WindowEvaluator):
             def __init__(self, start: int = 0) -> None:
                 self.start = start
 
-            def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
+            def evaluate_all(self, values: list[pa.Array],
+                num_rows: int) -> pa.Array:
                 return pa.array([self.start + i for i in range(num_rows)])
 
         def bias_10() -> BiasedNumbers:
@@ -655,35 +672,93 @@ def bias_10() -> BiasedNumbers:
         udwf2 = udwf(bias_10, pa.int64(), pa.int64(), "immutable")
         udwf3 = udwf(lambda: BiasedNumbers(20), pa.int64(), pa.int64(), "immutable")
 
+        ```
+
+        **Decorator example:**
+        ```
+        @udwf(pa.int64(), pa.int64(), "immutable")
+        def biased_numbers() -> BiasedNumbers:
+            return BiasedNumbers(10)
+        ```
+
         Args:
-            func: A callable to create the window function.
-            input_types: The data types of the arguments to ``func``.
+            func: **Only needed when calling as a function. Skip this argument when
+                using `udwf` as a decorator.**
+            input_types: The data types of the arguments.
             return_type: The data type of the return value.
             volatility: See :py:class:`Volatility` for allowed values.
-            arguments: A list of arguments to pass in to the __init__ method for accum.
             name: A descriptive name for the function.
 
         Returns:
-            A user-defined window function.
-        """  # noqa: W505, E501
+            A user-defined window function that can be used in window function calls.
+        """
+        if args and callable(args[0]):
+            # Case 1: Used as a function, require the first parameter to be callable
+            return WindowUDF._create_window_udf(*args, **kwargs)
+        # Case 2: Used as a decorator with parameters
+        return WindowUDF._create_window_udf_decorator(*args, **kwargs)
+
+    @staticmethod
+    def _create_window_udf(
+        func: Callable[[], WindowEvaluator],
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: pa.DataType,
+        volatility: Volatility | str,
+        name: Optional[str] = None,
+    ) -> WindowUDF:
+        """Create a WindowUDF instance from function arguments."""
         if not callable(func):
             msg = "`func` must be callable."
             raise TypeError(msg)
         if not isinstance(func(), WindowEvaluator):
             msg = "`func` must implement the abstract base class WindowEvaluator"
             raise TypeError(msg)
-        if name is None:
-            name = func().__class__.__qualname__.lower()
-        if isinstance(input_types, pa.DataType):
-            input_types = [input_types]
-        return WindowUDF(
-            name=name,
-            func=func,
-            input_types=input_types,
-            return_type=return_type,
-            volatility=volatility,
+
+        name = name or func.__qualname__.lower()
+        input_types = (
+            [input_types] if isinstance(input_types, pa.DataType) else input_types
         )
 
+        return WindowUDF(name, func, input_types, return_type, volatility)
+
+    @staticmethod
+    def _get_default_name(func: Callable) -> str:
+        """Get the default name for a function based on its attributes."""
+        if hasattr(func, "__qualname__"):
+            return func.__qualname__.lower()
+        return func.__class__.__name__.lower()
+
+    @staticmethod
+    def _normalize_input_types(
+        input_types: pa.DataType | list[pa.DataType],
+    ) -> list[pa.DataType]:
+        """Convert a single DataType to a list if needed."""
+        if isinstance(input_types, pa.DataType):
+            return [input_types]
+        return input_types
+
+    @staticmethod
+    def _create_window_udf_decorator(
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: pa.DataType,
+        volatility: Volatility | str,
+        name: Optional[str] = None,
+    ) -> Callable[[Callable[[], WindowEvaluator]], Callable[..., Expr]]:
+        """Create a decorator for a WindowUDF."""
+
+        def decorator(func: Callable[[], WindowEvaluator]) -> Callable[..., Expr]:
+            udwf_caller = WindowUDF._create_window_udf(
+                func, input_types, return_type, volatility, name
+            )
+
+            @functools.wraps(func)
+            def wrapper(*args: Any, **kwargs: Any) -> Expr:
+                return udwf_caller(*args, **kwargs)
+
+            return wrapper
+
+        return decorator
+
 
 # Convenience exports so we can import instead of treating as
 # variables at the package root
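For context, a sketch of the new decorator path end to end: the overloads dispatch to `_create_window_udf_decorator`, and the `functools.wraps` wrapper returns an `Expr` when called with column expressions. It reuses the `BiasedNumbers` evaluator from the docstring; the package-root import of `udwf` is assumed here:

```python
import pyarrow as pa
from datafusion import udwf  # assumed re-export at the package root
from datafusion.udf import WindowEvaluator

class BiasedNumbers(WindowEvaluator):
    """Evaluator from the docstring example above."""

    def __init__(self, start: int = 0) -> None:
        self.start = start

    def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
        return pa.array([self.start + i for i in range(num_rows)])

# Decorator form: no `func` argument is passed. The decorated factory becomes a
# callable that, when invoked with column expressions, returns an Expr.
@udwf(pa.int64(), pa.int64(), "immutable")
def biased_numbers() -> BiasedNumbers:
    return BiasedNumbers(10)

# Hypothetical DataFrame usage: df.select(biased_numbers(col("a")).alias("biased"))
```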

python/tests/test_dataframe.py

Lines changed: 54 additions & 0 deletions
@@ -771,6 +771,16 @@ def test_execution_plan(aggregate_df):
     assert rows_returned == 5
 
 
+@pytest.mark.asyncio
+async def test_async_iteration_of_df(aggregate_df):
+    rows_returned = 0
+    async for batch in aggregate_df.execute_stream():
+        assert batch is not None
+        rows_returned += len(batch.to_pyarrow()[0])
+
+    assert rows_returned == 5
+
+
 def test_repartition(df):
     df.repartition(2)
 
@@ -958,6 +968,18 @@ def test_execute_stream(df):
     assert not list(stream)  # after one iteration the generator must be exhausted
 
 
+@pytest.mark.asyncio
+async def test_execute_stream_async(df):
+    stream = df.execute_stream()
+    batches = [batch async for batch in stream]
+
+    assert all(batch is not None for batch in batches)
+
+    # After consuming all batches, the stream should be exhausted
+    remaining_batches = [batch async for batch in stream]
+    assert not remaining_batches
+
+
 @pytest.mark.parametrize("schema", [True, False])
 def test_execute_stream_to_arrow_table(df, schema):
     stream = df.execute_stream()
@@ -974,6 +996,25 @@ def test_execute_stream_to_arrow_table(df, schema):
     assert set(pyarrow_table.column_names) == {"a", "b", "c"}
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("schema", [True, False])
+async def test_execute_stream_to_arrow_table_async(df, schema):
+    stream = df.execute_stream()
+
+    if schema:
+        pyarrow_table = pa.Table.from_batches(
+            [batch.to_pyarrow() async for batch in stream], schema=df.schema()
+        )
+    else:
+        pyarrow_table = pa.Table.from_batches(
+            [batch.to_pyarrow() async for batch in stream]
+        )
+
+    assert isinstance(pyarrow_table, pa.Table)
+    assert pyarrow_table.shape == (3, 3)
+    assert set(pyarrow_table.column_names) == {"a", "b", "c"}
+
+
 def test_execute_stream_partitioned(df):
     streams = df.execute_stream_partitioned()
     assert all(batch is not None for stream in streams for batch in stream)
@@ -982,6 +1023,19 @@ def test_execute_stream_partitioned(df):
     )  # after one iteration all generators must be exhausted
 
 
+@pytest.mark.asyncio
+async def test_execute_stream_partitioned_async(df):
+    streams = df.execute_stream_partitioned()
+
+    for stream in streams:
+        batches = [batch async for batch in stream]
+        assert all(batch is not None for batch in batches)
+
+        # Ensure the stream is exhausted after iteration
+        remaining_batches = [batch async for batch in stream]
+        assert not remaining_batches
+
+
 def test_empty_to_arrow_table(df):
     # Convert empty datafusion dataframe to pyarrow Table
     pyarrow_table = df.limit(0).to_arrow_table()
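Outside of pytest, the async iteration these tests exercise can be driven directly with `asyncio`; a minimal sketch, assuming a `SessionContext` and a trivial query (the `to_pyarrow()` and `schema()` calls are the same ones the tests above use):

```python
import asyncio

import pyarrow as pa
from datafusion import SessionContext

async def collect(df) -> pa.Table:
    # Each item yielded by the async iterator wraps a RecordBatch and exposes
    # to_pyarrow(), exactly as exercised in the tests above.
    batches = [batch.to_pyarrow() async for batch in df.execute_stream()]
    return pa.Table.from_batches(batches, schema=df.schema())

ctx = SessionContext()
df = ctx.sql("SELECT 1 AS a, 2 AS b, 3 AS c")
table = asyncio.run(collect(df))
print(table)
```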

python/tests/test_functions.py

Lines changed: 4 additions & 0 deletions
@@ -740,6 +740,10 @@ def test_array_function_obj_tests(stmt, py_expr):
             f.regexp_replace(column("a"), literal("(ell|orl)"), literal("-")),
             pa.array(["H-o", "W-d", "!"]),
         ),
+        (
+            f.regexp_count(column("a"), literal("(ell|orl)"), literal(1)),
+            pa.array([1, 1, 0], type=pa.int64()),
+        ),
     ],
 )
 def test_string_functions(df, function, expected_result):
