Skip to content

Commit 6ad9e10

Browse files
ntjohnson1 and claude
committed
Add docstring examples for Scalar regex, crypto, struct and other functions
Add example usage to docstrings for Scalar regex, crypto, struct and other functions to improve documentation. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1160d5a commit 6ad9e10

File tree

2 files changed

+236
-16
lines changed

2 files changed

+236
-16
lines changed

python/datafusion/conftest.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,30 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
"""Pytest configuration for doctest namespace injection."""
19+
20+
import numpy as np
21+
import pytest
22+
23+
import datafusion as dfn
24+
25+
26+
@pytest.fixture(autouse=True)
27+
def _doctest_namespace(doctest_namespace: dict) -> None:
28+
"""Add common imports to the doctest namespace."""
29+
doctest_namespace["dfn"] = dfn
30+
doctest_namespace["np"] = np

python/datafusion/functions.py

Lines changed: 206 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -637,7 +637,18 @@ def chr(arg: Expr) -> Expr:
637637

638638

639639
def coalesce(*args: Expr) -> Expr:
640-
"""Returns the value of the first expr in ``args`` which is not NULL."""
640+
"""Returns the value of the first expr in ``args`` which is not NULL.
641+
642+
Examples:
643+
---------
644+
>>> ctx = dfn.SessionContext()
645+
>>> df = ctx.from_pydict({"a": [None, 1], "b": [2, 3]})
646+
>>> result = df.select(
647+
... dfn.functions.coalesce(dfn.col("a"), dfn.col("b")).alias("c")
648+
... )
649+
>>> result.collect_column("c")[0].as_py()
650+
2
651+
"""
641652
args = [arg.expr for arg in args]
642653
return Expr(f.coalesce(*args))
643654

@@ -820,7 +831,16 @@ def ltrim(arg: Expr) -> Expr:
820831

821832

822833
def md5(arg: Expr) -> Expr:
823-
"""Computes an MD5 128-bit checksum for a string expression."""
834+
"""Computes an MD5 128-bit checksum for a string expression.
835+
836+
Examples:
837+
---------
838+
>>> ctx = dfn.SessionContext()
839+
>>> df = ctx.from_pydict({"a": ["hello"]})
840+
>>> result = df.select(dfn.functions.md5(dfn.col("a")).alias("md5"))
841+
>>> result.collect_column("md5")[0].as_py()
842+
'5d41402abc4b2a76b9719d911017c592'
843+
"""
824844
return Expr(f.md5(arg.expr))
825845

826846

@@ -830,7 +850,18 @@ def nanvl(x: Expr, y: Expr) -> Expr:
830850

831851

832852
def nvl(x: Expr, y: Expr) -> Expr:
833-
"""Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``."""
853+
"""Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``.
854+
855+
Examples:
856+
---------
857+
>>> ctx = dfn.SessionContext()
858+
>>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]})
859+
>>> nvl_df = df.select(dfn.functions.nvl(dfn.col("a"), dfn.col("b")).alias("nvl"))
860+
>>> nvl_df.collect_column("nvl")[0].as_py()
861+
0
862+
>>> nvl_df.collect_column("nvl")[1].as_py()
863+
1
864+
"""
834865
return Expr(f.nvl(x.expr, y.expr))
835866

836867

@@ -899,21 +930,45 @@ def radians(arg: Expr) -> Expr:
899930

900931

901932
def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
902-
"""Find if any regular expression (regex) matches exist.
933+
r"""Find if any regular expression (regex) matches exist.
903934
904935
Tests a string using a regular expression returning true if at least one match,
905936
false otherwise.
937+
938+
Examples:
939+
---------
940+
>>> ctx = dfn.SessionContext()
941+
>>> df = ctx.from_pydict({"a": ["hello123"]})
942+
>>> result = df.select(
943+
... dfn.functions.regexp_like(
944+
... dfn.col("a"), dfn.lit("\\d+")
945+
... ).alias("m")
946+
... )
947+
>>> result.collect_column("m")[0].as_py()
948+
True
906949
"""
907950
if flags is not None:
908951
flags = flags.expr
909952
return Expr(f.regexp_like(string.expr, regex.expr, flags))
910953

911954

912955
def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
913-
"""Perform regular expression (regex) matching.
956+
r"""Perform regular expression (regex) matching.
914957
915958
Returns an array with each element containing the leftmost-first match of the
916959
corresponding index in ``regex`` to string in ``string``.
960+
961+
Examples:
962+
---------
963+
>>> ctx = dfn.SessionContext()
964+
>>> df = ctx.from_pydict({"a": ["hello 42 world"]})
965+
>>> result = df.select(
966+
... dfn.functions.regexp_match(
967+
... dfn.col("a"), dfn.lit("(\\d+)")
968+
... ).alias("m")
969+
... )
970+
>>> result.collect_column("m")[0].as_py()
971+
['42']
917972
"""
918973
if flags is not None:
919974
flags = flags.expr
@@ -923,13 +978,26 @@ def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
923978
def regexp_replace(
924979
string: Expr, pattern: Expr, replacement: Expr, flags: Expr | None = None
925980
) -> Expr:
926-
"""Replaces substring(s) matching a PCRE-like regular expression.
981+
r"""Replaces substring(s) matching a PCRE-like regular expression.
927982
928983
The full list of supported features and syntax can be found at
929984
<https://docs.rs/regex/latest/regex/#syntax>
930985
931986
Supported flags with the addition of 'g' can be found at
932987
<https://docs.rs/regex/latest/regex/#grouping-and-flags>
988+
989+
Examples:
990+
---------
991+
>>> ctx = dfn.SessionContext()
992+
>>> df = ctx.from_pydict({"a": ["hello 42"]})
993+
>>> result = df.select(
994+
... dfn.functions.regexp_replace(
995+
... dfn.col("a"), dfn.lit("\\d+"),
996+
... dfn.lit("XX")
997+
... ).alias("r")
998+
... )
999+
>>> result.collect_column("r")[0].as_py()
1000+
'hello XX'
9331001
"""
9341002
if flags is not None:
9351003
flags = flags.expr
@@ -943,6 +1011,16 @@ def regexp_count(
9431011
9441012
Optional start position (the first position is 1) to search for the regular
9451013
expression.
1014+
1015+
Examples:
1016+
---------
1017+
>>> ctx = dfn.SessionContext()
1018+
>>> df = ctx.from_pydict({"a": ["abcabc"]})
1019+
>>> result = df.select(
1020+
... dfn.functions.regexp_count(dfn.col("a"), dfn.lit("abc")).alias("c")
1021+
... )
1022+
>>> result.collect_column("c")[0].as_py()
1023+
2
9461024
"""
9471025
if flags is not None:
9481026
flags = flags.expr
@@ -958,12 +1036,24 @@ def regexp_instr(
9581036
flags: Expr | None = None,
9591037
sub_expr: Expr | None = None,
9601038
) -> Expr:
961-
"""Returns the position of a regular expression match in a string.
1039+
r"""Returns the position of a regular expression match in a string.
9621040
9631041
Searches ``values`` for the ``n``-th occurrence of ``regex``, starting at position
9641042
``start`` (the first position is 1). Returns the starting or ending position based
9651043
on ``end_position``. Use ``flags`` to control regex behavior and ``sub_expr`` to
9661044
return the position of a specific capture group instead of the entire match.
1045+
1046+
Examples:
1047+
---------
1048+
>>> ctx = dfn.SessionContext()
1049+
>>> df = ctx.from_pydict({"a": ["hello 42 world"]})
1050+
>>> result = df.select(
1051+
... dfn.functions.regexp_instr(
1052+
... dfn.col("a"), dfn.lit("\\d+")
1053+
... ).alias("pos")
1054+
... )
1055+
>>> result.collect_column("pos")[0].as_py()
1056+
7
9671057
"""
9681058
start = start.expr if start is not None else None
9691059
n = n.expr if n is not None else None
@@ -1030,22 +1120,66 @@ def rtrim(arg: Expr) -> Expr:
10301120

10311121

10321122
def sha224(arg: Expr) -> Expr:
1033-
"""Computes the SHA-224 hash of a binary string."""
1123+
"""Computes the SHA-224 hash of a binary string.
1124+
1125+
Examples:
1126+
---------
1127+
>>> ctx = dfn.SessionContext()
1128+
>>> df = ctx.from_pydict({"a": ["hello"]})
1129+
>>> result = df.select(
1130+
... dfn.functions.sha224(dfn.col("a")).alias("h")
1131+
... )
1132+
>>> len(result.collect_column("h")[0].as_py()) > 0
1133+
True
1134+
"""
10341135
return Expr(f.sha224(arg.expr))
10351136

10361137

10371138
def sha256(arg: Expr) -> Expr:
1038-
"""Computes the SHA-256 hash of a binary string."""
1139+
"""Computes the SHA-256 hash of a binary string.
1140+
1141+
Examples:
1142+
---------
1143+
>>> ctx = dfn.SessionContext()
1144+
>>> df = ctx.from_pydict({"a": ["hello"]})
1145+
>>> result = df.select(
1146+
... dfn.functions.sha256(dfn.col("a")).alias("h")
1147+
... )
1148+
>>> len(result.collect_column("h")[0].as_py()) > 0
1149+
True
1150+
"""
10391151
return Expr(f.sha256(arg.expr))
10401152

10411153

10421154
def sha384(arg: Expr) -> Expr:
1043-
"""Computes the SHA-384 hash of a binary string."""
1155+
"""Computes the SHA-384 hash of a binary string.
1156+
1157+
Examples:
1158+
---------
1159+
>>> ctx = dfn.SessionContext()
1160+
>>> df = ctx.from_pydict({"a": ["hello"]})
1161+
>>> result = df.select(
1162+
... dfn.functions.sha384(dfn.col("a")).alias("h")
1163+
... )
1164+
>>> len(result.collect_column("h")[0].as_py()) > 0
1165+
True
1166+
"""
10441167
return Expr(f.sha384(arg.expr))
10451168

10461169

10471170
def sha512(arg: Expr) -> Expr:
1048-
"""Computes the SHA-512 hash of a binary string."""
1171+
"""Computes the SHA-512 hash of a binary string.
1172+
1173+
Examples:
1174+
---------
1175+
>>> ctx = dfn.SessionContext()
1176+
>>> df = ctx.from_pydict({"a": ["hello"]})
1177+
>>> result = df.select(
1178+
... dfn.functions.sha512(dfn.col("a")).alias("h")
1179+
... )
1180+
>>> len(result.collect_column("h")[0].as_py()) > 0
1181+
True
1182+
"""
10491183
return Expr(f.sha512(arg.expr))
10501184

10511185

@@ -1370,18 +1504,55 @@ def range(start: Expr, stop: Expr, step: Expr) -> Expr:
13701504

13711505

13721506
def uuid() -> Expr:
1373-
"""Returns uuid v4 as a string value."""
1507+
"""Returns uuid v4 as a string value.
1508+
1509+
Examples:
1510+
---------
1511+
>>> ctx = dfn.SessionContext()
1512+
>>> df = ctx.from_pydict({"a": [1]})
1513+
>>> result = df.select(
1514+
... dfn.functions.uuid().alias("u")
1515+
... )
1516+
>>> len(result.collect_column("u")[0].as_py()) == 36
1517+
True
1518+
"""
13741519
return Expr(f.uuid())
13751520

13761521

13771522
def struct(*args: Expr) -> Expr:
1378-
"""Returns a struct with the given arguments."""
1523+
"""Returns a struct with the given arguments.
1524+
1525+
Examples:
1526+
---------
1527+
>>> ctx = dfn.SessionContext()
1528+
>>> df = ctx.from_pydict({"a": [1], "b": [2]})
1529+
>>> result = df.select(
1530+
... dfn.functions.struct(
1531+
... dfn.col("a"), dfn.col("b")
1532+
... ).alias("s")
1533+
... )
1534+
>>> result.collect_column("s")[0].as_py() == {"c0": 1, "c1": 2}
1535+
True
1536+
"""
13791537
args = [arg.expr for arg in args]
13801538
return Expr(f.struct(*args))
13811539

13821540

13831541
def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr:
1384-
"""Returns a struct with the given names and arguments pairs."""
1542+
"""Returns a struct with the given names and arguments pairs.
1543+
1544+
Examples:
1545+
---------
1546+
>>> ctx = dfn.SessionContext()
1547+
>>> df = ctx.from_pydict({"a": [1]})
1548+
>>> result = df.select(
1549+
... dfn.functions.named_struct(
1550+
... [("x", dfn.lit(10)), ("y", dfn.lit(20))]
1551+
... ).alias("s")
1552+
... )
1553+
>>> result.collect_column("s")[0].as_py() == {"x": 10, "y": 20}
1554+
True
1555+
"""
13851556
name_pair_exprs = [
13861557
[Expr.literal(pa.scalar(pair[0], type=pa.string())), pair[1]]
13871558
for pair in name_pairs
@@ -1398,12 +1569,31 @@ def from_unixtime(arg: Expr) -> Expr:
13981569

13991570

14001571
def arrow_typeof(arg: Expr) -> Expr:
1401-
"""Returns the Arrow type of the expression."""
1572+
"""Returns the Arrow type of the expression.
1573+
1574+
Examples:
1575+
---------
1576+
>>> ctx = dfn.SessionContext()
1577+
>>> df = ctx.from_pydict({"a": [1]})
1578+
>>> result = df.select(dfn.functions.arrow_typeof(dfn.col("a")).alias("t"))
1579+
>>> result.collect_column("t")[0].as_py()
1580+
'Int64'
1581+
"""
14021582
return Expr(f.arrow_typeof(arg.expr))
14031583

14041584

14051585
def arrow_cast(expr: Expr, data_type: Expr) -> Expr:
1406-
"""Casts an expression to a specified data type."""
1586+
"""Casts an expression to a specified data type.
1587+
1588+
Examples:
1589+
---------
1590+
>>> ctx = dfn.SessionContext()
1591+
>>> result = ctx.sql(
1592+
... "SELECT arrow_cast(1, 'Float64') as c"
1593+
... )
1594+
>>> result.collect_column("c")[0].as_py()
1595+
1.0
1596+
"""
14071597
return Expr(f.arrow_cast(expr.expr, data_type.expr))
14081598

14091599

0 commit comments

Comments
 (0)