Skip to content

Commit 8582e49

Browse files
ntjohnson1 and claude committed
Add docstring examples for Scalar regex, crypto, struct and other functions
Add example usage to docstrings for Scalar regex, crypto, struct and other functions to improve documentation. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1160d5a commit 8582e49

File tree

1 file changed

+204
-16
lines changed

1 file changed

+204
-16
lines changed

python/datafusion/functions.py

Lines changed: 204 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -637,7 +637,17 @@ def chr(arg: Expr) -> Expr:
637637

638638

639639
def coalesce(*args: Expr) -> Expr:
640-
"""Returns the value of the first expr in ``args`` which is not NULL."""
640+
"""Returns the value of the first expr in ``args`` which is not NULL.
641+
642+
Examples:
643+
---------
644+
>>> ctx = dfn.SessionContext()
645+
>>> df = ctx.from_pydict({"a": [None, 1], "b": [2, 3]})
646+
>>> result = df.select(
647+
... dfn.functions.coalesce(dfn.col("a"), dfn.col("b")).alias("c"))
648+
>>> result.collect_column("c")[0].as_py()
649+
2
650+
"""
641651
args = [arg.expr for arg in args]
642652
return Expr(f.coalesce(*args))
643653

@@ -820,7 +830,16 @@ def ltrim(arg: Expr) -> Expr:
820830

821831

822832
def md5(arg: Expr) -> Expr:
823-
"""Computes an MD5 128-bit checksum for a string expression."""
833+
"""Computes an MD5 128-bit checksum for a string expression.
834+
835+
Examples:
836+
---------
837+
>>> ctx = dfn.SessionContext()
838+
>>> df = ctx.from_pydict({"a": ["hello"]})
839+
>>> result = df.select(dfn.functions.md5(dfn.col("a")).alias("md5"))
840+
>>> result.collect_column("md5")[0].as_py()
841+
'5d41402abc4b2a76b9719d911017c592'
842+
"""
824843
return Expr(f.md5(arg.expr))
825844

826845

@@ -830,7 +849,18 @@ def nanvl(x: Expr, y: Expr) -> Expr:
830849

831850

832851
def nvl(x: Expr, y: Expr) -> Expr:
833-
"""Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``."""
852+
"""Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``.
853+
854+
Examples:
855+
---------
856+
>>> ctx = dfn.SessionContext()
857+
>>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]})
858+
>>> nvl_df = df.select(dfn.functions.nvl(dfn.col("a"), dfn.col("b")).alias("nvl"))
859+
>>> nvl_df.collect_column("nvl")[0].as_py()
860+
0
861+
>>> nvl_df.collect_column("nvl")[1].as_py()
862+
1
863+
"""
834864
return Expr(f.nvl(x.expr, y.expr))
835865

836866

@@ -899,21 +929,45 @@ def radians(arg: Expr) -> Expr:
899929

900930

901931
def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
902-
"""Find if any regular expression (regex) matches exist.
932+
r"""Find if any regular expression (regex) matches exist.
903933
904934
Tests a string using a regular expression returning true if at least one match,
905935
false otherwise.
936+
937+
Examples:
938+
---------
939+
>>> ctx = dfn.SessionContext()
940+
>>> df = ctx.from_pydict({"a": ["hello123"]})
941+
>>> result = df.select(
942+
... dfn.functions.regexp_like(
943+
... dfn.col("a"), dfn.lit("\\d+")
944+
... ).alias("m")
945+
... )
946+
>>> result.collect_column("m")[0].as_py()
947+
True
906948
"""
907949
if flags is not None:
908950
flags = flags.expr
909951
return Expr(f.regexp_like(string.expr, regex.expr, flags))
910952

911953

912954
def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
913-
"""Perform regular expression (regex) matching.
955+
r"""Perform regular expression (regex) matching.
914956
915957
Returns an array with each element containing the leftmost-first match of the
916958
corresponding index in ``regex`` to string in ``string``.
959+
960+
Examples:
961+
---------
962+
>>> ctx = dfn.SessionContext()
963+
>>> df = ctx.from_pydict({"a": ["hello 42 world"]})
964+
>>> result = df.select(
965+
... dfn.functions.regexp_match(
966+
... dfn.col("a"), dfn.lit("(\\d+)")
967+
... ).alias("m")
968+
... )
969+
>>> result.collect_column("m")[0].as_py()
970+
['42']
917971
"""
918972
if flags is not None:
919973
flags = flags.expr
@@ -923,13 +977,26 @@ def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
923977
def regexp_replace(
924978
string: Expr, pattern: Expr, replacement: Expr, flags: Expr | None = None
925979
) -> Expr:
926-
"""Replaces substring(s) matching a PCRE-like regular expression.
980+
r"""Replaces substring(s) matching a PCRE-like regular expression.
927981
928982
The full list of supported features and syntax can be found at
929983
<https://docs.rs/regex/latest/regex/#syntax>
930984
931985
Supported flags with the addition of 'g' can be found at
932986
<https://docs.rs/regex/latest/regex/#grouping-and-flags>
987+
988+
Examples:
989+
---------
990+
>>> ctx = dfn.SessionContext()
991+
>>> df = ctx.from_pydict({"a": ["hello 42"]})
992+
>>> result = df.select(
993+
... dfn.functions.regexp_replace(
994+
... dfn.col("a"), dfn.lit("\\d+"),
995+
... dfn.lit("XX")
996+
... ).alias("r")
997+
... )
998+
>>> result.collect_column("r")[0].as_py()
999+
'hello XX'
9331000
"""
9341001
if flags is not None:
9351002
flags = flags.expr
@@ -943,6 +1010,15 @@ def regexp_count(
9431010
9441011
Optional start position (the first position is 1) to search for the regular
9451012
expression.
1013+
1014+
Examples:
1015+
---------
1016+
>>> ctx = dfn.SessionContext()
1017+
>>> df = ctx.from_pydict({"a": ["abcabc"]})
1018+
>>> result = df.select(
1019+
... dfn.functions.regexp_count(dfn.col("a"), dfn.lit("abc")).alias("c"))
1020+
>>> result.collect_column("c")[0].as_py()
1021+
2
9461022
"""
9471023
if flags is not None:
9481024
flags = flags.expr
@@ -958,12 +1034,24 @@ def regexp_instr(
9581034
flags: Expr | None = None,
9591035
sub_expr: Expr | None = None,
9601036
) -> Expr:
961-
"""Returns the position of a regular expression match in a string.
1037+
r"""Returns the position of a regular expression match in a string.
9621038
9631039
Searches ``values`` for the ``n``-th occurrence of ``regex``, starting at position
9641040
``start`` (the first position is 1). Returns the starting or ending position based
9651041
on ``end_position``. Use ``flags`` to control regex behavior and ``sub_expr`` to
9661042
return the position of a specific capture group instead of the entire match.
1043+
1044+
Examples:
1045+
---------
1046+
>>> ctx = dfn.SessionContext()
1047+
>>> df = ctx.from_pydict({"a": ["hello 42 world"]})
1048+
>>> result = df.select(
1049+
... dfn.functions.regexp_instr(
1050+
... dfn.col("a"), dfn.lit("\\d+")
1051+
... ).alias("pos")
1052+
... )
1053+
>>> result.collect_column("pos")[0].as_py()
1054+
7
9671055
"""
9681056
start = start.expr if start is not None else None
9691057
n = n.expr if n is not None else None
@@ -1030,22 +1118,66 @@ def rtrim(arg: Expr) -> Expr:
10301118

10311119

10321120
def sha224(arg: Expr) -> Expr:
1033-
"""Computes the SHA-224 hash of a binary string."""
1121+
"""Computes the SHA-224 hash of a binary string.
1122+
1123+
Examples:
1124+
---------
1125+
>>> ctx = dfn.SessionContext()
1126+
>>> df = ctx.from_pydict({"a": ["hello"]})
1127+
>>> result = df.select(
1128+
... dfn.functions.sha224(dfn.col("a")).alias("h")
1129+
... )
1130+
>>> len(result.collect_column("h")[0].as_py()) > 0
1131+
True
1132+
"""
10341133
return Expr(f.sha224(arg.expr))
10351134

10361135

10371136
def sha256(arg: Expr) -> Expr:
1038-
"""Computes the SHA-256 hash of a binary string."""
1137+
"""Computes the SHA-256 hash of a binary string.
1138+
1139+
Examples:
1140+
---------
1141+
>>> ctx = dfn.SessionContext()
1142+
>>> df = ctx.from_pydict({"a": ["hello"]})
1143+
>>> result = df.select(
1144+
... dfn.functions.sha256(dfn.col("a")).alias("h")
1145+
... )
1146+
>>> len(result.collect_column("h")[0].as_py()) > 0
1147+
True
1148+
"""
10391149
return Expr(f.sha256(arg.expr))
10401150

10411151

10421152
def sha384(arg: Expr) -> Expr:
1043-
"""Computes the SHA-384 hash of a binary string."""
1153+
"""Computes the SHA-384 hash of a binary string.
1154+
1155+
Examples:
1156+
---------
1157+
>>> ctx = dfn.SessionContext()
1158+
>>> df = ctx.from_pydict({"a": ["hello"]})
1159+
>>> result = df.select(
1160+
... dfn.functions.sha384(dfn.col("a")).alias("h")
1161+
... )
1162+
>>> len(result.collect_column("h")[0].as_py()) > 0
1163+
True
1164+
"""
10441165
return Expr(f.sha384(arg.expr))
10451166

10461167

10471168
def sha512(arg: Expr) -> Expr:
1048-
"""Computes the SHA-512 hash of a binary string."""
1169+
"""Computes the SHA-512 hash of a binary string.
1170+
1171+
Examples:
1172+
---------
1173+
>>> ctx = dfn.SessionContext()
1174+
>>> df = ctx.from_pydict({"a": ["hello"]})
1175+
>>> result = df.select(
1176+
... dfn.functions.sha512(dfn.col("a")).alias("h")
1177+
... )
1178+
>>> len(result.collect_column("h")[0].as_py()) > 0
1179+
True
1180+
"""
10491181
return Expr(f.sha512(arg.expr))
10501182

10511183

@@ -1370,18 +1502,55 @@ def range(start: Expr, stop: Expr, step: Expr) -> Expr:
13701502

13711503

13721504
def uuid() -> Expr:
1373-
"""Returns uuid v4 as a string value."""
1505+
"""Returns uuid v4 as a string value.
1506+
1507+
Examples:
1508+
---------
1509+
>>> ctx = dfn.SessionContext()
1510+
>>> df = ctx.from_pydict({"a": [1]})
1511+
>>> result = df.select(
1512+
... dfn.functions.uuid().alias("u")
1513+
... )
1514+
>>> len(result.collect_column("u")[0].as_py()) == 36
1515+
True
1516+
"""
13741517
return Expr(f.uuid())
13751518

13761519

13771520
def struct(*args: Expr) -> Expr:
1378-
"""Returns a struct with the given arguments."""
1521+
"""Returns a struct with the given arguments.
1522+
1523+
Examples:
1524+
---------
1525+
>>> ctx = dfn.SessionContext()
1526+
>>> df = ctx.from_pydict({"a": [1], "b": [2]})
1527+
>>> result = df.select(
1528+
... dfn.functions.struct(
1529+
... dfn.col("a"), dfn.col("b")
1530+
... ).alias("s")
1531+
... )
1532+
>>> result.collect_column("s")[0].as_py() == {"c0": 1, "c1": 2}
1533+
True
1534+
"""
13791535
args = [arg.expr for arg in args]
13801536
return Expr(f.struct(*args))
13811537

13821538

13831539
def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr:
1384-
"""Returns a struct with the given names and arguments pairs."""
1540+
"""Returns a struct with the given names and arguments pairs.
1541+
1542+
Examples:
1543+
---------
1544+
>>> ctx = dfn.SessionContext()
1545+
>>> df = ctx.from_pydict({"a": [1]})
1546+
>>> result = df.select(
1547+
... dfn.functions.named_struct(
1548+
... [("x", dfn.lit(10)), ("y", dfn.lit(20))]
1549+
... ).alias("s")
1550+
... )
1551+
>>> result.collect_column("s")[0].as_py() == {"x": 10, "y": 20}
1552+
True
1553+
"""
13851554
name_pair_exprs = [
13861555
[Expr.literal(pa.scalar(pair[0], type=pa.string())), pair[1]]
13871556
for pair in name_pairs
@@ -1398,12 +1567,31 @@ def from_unixtime(arg: Expr) -> Expr:
13981567

13991568

14001569
def arrow_typeof(arg: Expr) -> Expr:
1401-
"""Returns the Arrow type of the expression."""
1570+
"""Returns the Arrow type of the expression.
1571+
1572+
Examples:
1573+
---------
1574+
>>> ctx = dfn.SessionContext()
1575+
>>> df = ctx.from_pydict({"a": [1]})
1576+
>>> result = df.select(dfn.functions.arrow_typeof(dfn.col("a")).alias("t"))
1577+
>>> result.collect_column("t")[0].as_py()
1578+
'Int64'
1579+
"""
14021580
return Expr(f.arrow_typeof(arg.expr))
14031581

14041582

14051583
def arrow_cast(expr: Expr, data_type: Expr) -> Expr:
1406-
"""Casts an expression to a specified data type."""
1584+
"""Casts an expression to a specified data type.
1585+
1586+
Examples:
1587+
---------
1588+
>>> ctx = dfn.SessionContext()
1589+
>>> df = ctx.from_pydict({"a": [1]})
1590+
>>> data_type = dfn.string_literal("Float64")
1591+
>>> result = df.select(dfn.functions.arrow_cast(dfn.col("a"), data_type).alias("c"))
1592+
>>> result.collect_column("c")[0].as_py()
1593+
1.0
1594+
"""
14071595
return Expr(f.arrow_cast(expr.expr, data_type.expr))
14081596

14091597

0 commit comments

Comments
 (0)