Add docstring examples for Scalar regex, crypto, struct and other functions

ntjohnson1 · claude · ntjohnson1 · commit 6ad9e10bb98d · 2026-03-11T12:38:36.000-04:00
Add example usage to docstrings for Scalar regex, crypto, struct and other functions to improve documentation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/python/datafusion/conftest.py b/python/datafusion/conftest.py
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Pytest configuration for doctest namespace injection."""
+
+import numpy as np
+import pytest
+
+import datafusion as dfn
+
+
+@pytest.fixture(autouse=True)
+def _doctest_namespace(doctest_namespace: dict) -> None:
+    """Add common imports to the doctest namespace."""
+    doctest_namespace["dfn"] = dfn
+    doctest_namespace["np"] = np
diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
@@ -637,7 +637,18 @@ def chr(arg: Expr) -> Expr:
 
 
 def coalesce(*args: Expr) -> Expr:
-    """Returns the value of the first expr in ``args`` which is not NULL."""
+    """Returns the value of the first expr in ``args`` which is not NULL.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [None, 1], "b": [2, 3]})
+    >>> result = df.select(
+    ...     dfn.functions.coalesce(dfn.col("a"), dfn.col("b")).alias("c")
+    ... )
+    >>> result.collect_column("c")[0].as_py()
+    2
+    """
     args = [arg.expr for arg in args]
     return Expr(f.coalesce(*args))
 
@@ -820,7 +831,16 @@ def ltrim(arg: Expr) -> Expr:
 
 
 def md5(arg: Expr) -> Expr:
-    """Computes an MD5 128-bit checksum for a string expression."""
+    """Computes an MD5 128-bit checksum for a string expression.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": ["hello"]})
+    >>> result = df.select(dfn.functions.md5(dfn.col("a")).alias("md5"))
+    >>> result.collect_column("md5")[0].as_py()
+    '5d41402abc4b2a76b9719d911017c592'
+    """
     return Expr(f.md5(arg.expr))
 
 
@@ -830,7 +850,18 @@ def nanvl(x: Expr, y: Expr) -> Expr:
 
 
 def nvl(x: Expr, y: Expr) -> Expr:
-    """Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``."""
+    """Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]})
+    >>> nvl_df = df.select(dfn.functions.nvl(dfn.col("a"), dfn.col("b")).alias("nvl"))
+    >>> nvl_df.collect_column("nvl")[0].as_py()
+    0
+    >>> nvl_df.collect_column("nvl")[1].as_py()
+    1
+    """
     return Expr(f.nvl(x.expr, y.expr))
 
 
@@ -899,21 +930,45 @@ def radians(arg: Expr) -> Expr:
 
 
 def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
-    """Find if any regular expression (regex) matches exist.
+    r"""Find if any regular expression (regex) matches exist.
 
     Tests a string using a regular expression returning true if at least one match,
     false otherwise.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": ["hello123"]})
+    >>> result = df.select(
+    ...     dfn.functions.regexp_like(
+    ...         dfn.col("a"), dfn.lit("\\d+")
+    ...     ).alias("m")
+    ... )
+    >>> result.collect_column("m")[0].as_py()
+    True
     """
     if flags is not None:
         flags = flags.expr
     return Expr(f.regexp_like(string.expr, regex.expr, flags))
 
 
 def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
-    """Perform regular expression (regex) matching.
+    r"""Perform regular expression (regex) matching.
 
     Returns an array with each element containing the leftmost-first match of the
     corresponding index in ``regex`` to string in ``string``.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": ["hello 42 world"]})
+    >>> result = df.select(
+    ...     dfn.functions.regexp_match(
+    ...         dfn.col("a"), dfn.lit("(\\d+)")
+    ...     ).alias("m")
+    ... )
+    >>> result.collect_column("m")[0].as_py()
+    ['42']
     """
     if flags is not None:
         flags = flags.expr
@@ -923,13 +978,26 @@ def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
 def regexp_replace(
     string: Expr, pattern: Expr, replacement: Expr, flags: Expr | None = None
 ) -> Expr:
-    """Replaces substring(s) matching a PCRE-like regular expression.
+    r"""Replaces substring(s) matching a PCRE-like regular expression.
 
     The full list of supported features and syntax can be found at
     <https://docs.rs/regex/latest/regex/#syntax>
 
     Supported flags with the addition of 'g' can be found at
     <https://docs.rs/regex/latest/regex/#grouping-and-flags>
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": ["hello 42"]})
+    >>> result = df.select(
+    ...     dfn.functions.regexp_replace(
+    ...         dfn.col("a"), dfn.lit("\\d+"),
+    ...         dfn.lit("XX")
+    ...     ).alias("r")
+    ... )
+    >>> result.collect_column("r")[0].as_py()
+    'hello XX'
     """
     if flags is not None:
         flags = flags.expr
@@ -943,6 +1011,16 @@ def regexp_count(
 
     Optional start position (the first position is 1) to search for the regular
     expression.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": ["abcabc"]})
+    >>> result = df.select(
+    ...     dfn.functions.regexp_count(dfn.col("a"), dfn.lit("abc")).alias("c")
+    ... )
+    >>> result.collect_column("c")[0].as_py()
+    2
     """
     if flags is not None:
         flags = flags.expr
@@ -958,12 +1036,24 @@ def regexp_instr(
     flags: Expr | None = None,
     sub_expr: Expr | None = None,
 ) -> Expr:
-    """Returns the position of a regular expression match in a string.
+    r"""Returns the position of a regular expression match in a string.
 
     Searches ``values`` for the ``n``-th occurrence of ``regex``, starting at position
     ``start`` (the first position is 1). Returns the starting or ending position based
     on ``end_position``. Use ``flags`` to control regex behavior and ``sub_expr`` to
     return the position of a specific capture group instead of the entire match.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": ["hello 42 world"]})
+    >>> result = df.select(
+    ...     dfn.functions.regexp_instr(
+    ...         dfn.col("a"), dfn.lit("\\d+")
+    ...     ).alias("pos")
+    ... )
+    >>> result.collect_column("pos")[0].as_py()
+    7
     """
     start = start.expr if start is not None else None
     n = n.expr if n is not None else None
@@ -1030,22 +1120,66 @@ def rtrim(arg: Expr) -> Expr:
 
 
 def sha224(arg: Expr) -> Expr:
-    """Computes the SHA-224 hash of a binary string."""
+    """Computes the SHA-224 hash of a binary string.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": ["hello"]})
+    >>> result = df.select(
+    ...     dfn.functions.sha224(dfn.col("a")).alias("h")
+    ... )
+    >>> len(result.collect_column("h")[0].as_py()) > 0
+    True
+    """
     return Expr(f.sha224(arg.expr))
 
 
 def sha256(arg: Expr) -> Expr:
-    """Computes the SHA-256 hash of a binary string."""
+    """Computes the SHA-256 hash of a binary string.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": ["hello"]})
+    >>> result = df.select(
+    ...     dfn.functions.sha256(dfn.col("a")).alias("h")
+    ... )
+    >>> len(result.collect_column("h")[0].as_py()) > 0
+    True
+    """
     return Expr(f.sha256(arg.expr))
 
 
 def sha384(arg: Expr) -> Expr:
-    """Computes the SHA-384 hash of a binary string."""
+    """Computes the SHA-384 hash of a binary string.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": ["hello"]})
+    >>> result = df.select(
+    ...     dfn.functions.sha384(dfn.col("a")).alias("h")
+    ... )
+    >>> len(result.collect_column("h")[0].as_py()) > 0
+    True
+    """
     return Expr(f.sha384(arg.expr))
 
 
 def sha512(arg: Expr) -> Expr:
-    """Computes the SHA-512 hash of a binary string."""
+    """Computes the SHA-512 hash of a binary string.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": ["hello"]})
+    >>> result = df.select(
+    ...     dfn.functions.sha512(dfn.col("a")).alias("h")
+    ... )
+    >>> len(result.collect_column("h")[0].as_py()) > 0
+    True
+    """
     return Expr(f.sha512(arg.expr))
 
 
@@ -1370,18 +1504,55 @@ def range(start: Expr, stop: Expr, step: Expr) -> Expr:
 
 
 def uuid() -> Expr:
-    """Returns uuid v4 as a string value."""
+    """Returns uuid v4 as a string value.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1]})
+    >>> result = df.select(
+    ...     dfn.functions.uuid().alias("u")
+    ... )
+    >>> len(result.collect_column("u")[0].as_py()) == 36
+    True
+    """
     return Expr(f.uuid())
 
 
 def struct(*args: Expr) -> Expr:
-    """Returns a struct with the given arguments."""
+    """Returns a struct with the given arguments.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1], "b": [2]})
+    >>> result = df.select(
+    ...     dfn.functions.struct(
+    ...         dfn.col("a"), dfn.col("b")
+    ...     ).alias("s")
+    ... )
+    >>> result.collect_column("s")[0].as_py() == {"c0": 1, "c1": 2}
+    True
+    """
     args = [arg.expr for arg in args]
     return Expr(f.struct(*args))
 
 
 def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr:
-    """Returns a struct with the given names and arguments pairs."""
+    """Returns a struct with the given names and arguments pairs.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1]})
+    >>> result = df.select(
+    ...     dfn.functions.named_struct(
+    ...         [("x", dfn.lit(10)), ("y", dfn.lit(20))]
+    ...     ).alias("s")
+    ... )
+    >>> result.collect_column("s")[0].as_py() == {"x": 10, "y": 20}
+    True
+    """
     name_pair_exprs = [
         [Expr.literal(pa.scalar(pair[0], type=pa.string())), pair[1]]
         for pair in name_pairs
@@ -1398,12 +1569,31 @@ def from_unixtime(arg: Expr) -> Expr:
 
 
 def arrow_typeof(arg: Expr) -> Expr:
-    """Returns the Arrow type of the expression."""
+    """Returns the Arrow type of the expression.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1]})
+    >>> result = df.select(dfn.functions.arrow_typeof(dfn.col("a")).alias("t"))
+    >>> result.collect_column("t")[0].as_py()
+    'Int64'
+    """
     return Expr(f.arrow_typeof(arg.expr))
 
 
 def arrow_cast(expr: Expr, data_type: Expr) -> Expr:
-    """Casts an expression to a specified data type."""
+    """Casts an expression to a specified data type.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> result = ctx.sql(
+    ...     "SELECT arrow_cast(1, 'Float64') as c"
+    ... )
+    >>> result.collect_column("c")[0].as_py()
+    1.0
+    """
     return Expr(f.arrow_cast(expr.expr, data_type.expr))