Skip to content

Commit 51b38bb

Browse files
ntjohnson1 and claude committed
Add docstring examples for Aggregate basic and bitwise/boolean functions
Add example usage to docstrings for Aggregate basic and bitwise/boolean functions to improve documentation. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1160d5a commit 51b38bb

File tree

1 file changed

+141
-0
lines changed

1 file changed

+141
-0
lines changed

python/datafusion/functions.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1894,6 +1894,15 @@ def approx_distinct(
18941894
Args:
18951895
expression: Values to check for distinct entries
18961896
filter: If provided, only compute against rows for which the filter is True
1897+
1898+
Examples:
1899+
---------
1900+
>>> ctx = dfn.SessionContext()
1901+
>>> df = ctx.from_pydict({"a": [1, 1, 2, 3]})
1902+
>>> result = df.aggregate(
1903+
... [], [dfn.functions.approx_distinct(dfn.col("a")).alias("v")])
1904+
>>> result.collect_column("v")[0].as_py() >= 2
1905+
True
18971906
"""
18981907
filter_raw = filter.expr if filter is not None else None
18991908

@@ -1912,6 +1921,15 @@ def approx_median(expression: Expr, filter: Expr | None = None) -> Expr:
19121921
Args:
19131922
expression: Values to find the median for
19141923
filter: If provided, only compute against rows for which the filter is True
1924+
1925+
Examples:
1926+
---------
1927+
>>> ctx = dfn.SessionContext()
1928+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
1929+
>>> result = df.aggregate(
1930+
... [], [dfn.functions.approx_median(dfn.col("a")).alias("v")])
1931+
>>> result.collect_column("v")[0].as_py()
1932+
2.0
19151933
"""
19161934
filter_raw = filter.expr if filter is not None else None
19171935
return Expr(f.approx_median(expression.expr, filter=filter_raw))
@@ -1943,6 +1961,15 @@ def approx_percentile_cont(
19431961
percentile: This must be between 0.0 and 1.0, inclusive
19441962
num_centroids: Max bin size for the t-digest algorithm
19451963
filter: If provided, only compute against rows for which the filter is True
1964+
1965+
Examples:
1966+
---------
1967+
>>> ctx = dfn.SessionContext()
1968+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0, 4.0, 5.0]})
1969+
>>> result = df.aggregate(
1970+
... [], [dfn.functions.approx_percentile_cont(dfn.col("a"), 0.5).alias("v")])
1971+
>>> result.collect_column("v")[0].as_py()
1972+
3.0
19461973
"""
19471974
sort_expr_raw = sort_or_default(sort_expression)
19481975
filter_raw = filter.expr if filter is not None else None
@@ -1975,6 +2002,15 @@ def approx_percentile_cont_with_weight(
19752002
num_centroids: Max bin size for the t-digest algorithm
19762003
filter: If provided, only compute against rows for which the filter is True
19772004
2005+
Examples:
2006+
---------
2007+
>>> ctx = dfn.SessionContext()
2008+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "w": [1.0, 1.0, 1.0]})
2009+
>>> result = df.aggregate(
2010+
... [], [dfn.functions.approx_percentile_cont_with_weight(dfn.col("a"),
2011+
... dfn.col("w"), 0.5).alias("v")])
2012+
>>> result.collect_column("v")[0].as_py()
2013+
2.0
19782014
"""
19792015
sort_expr_raw = sort_or_default(sort_expression)
19802016
filter_raw = filter.expr if filter is not None else None
@@ -2038,6 +2074,14 @@ def avg(
20382074
Args:
20392075
expression: Values to compute the average of
20402076
filter: If provided, only compute against rows for which the filter is True
2077+
2078+
Examples:
2079+
---------
2080+
>>> ctx = dfn.SessionContext()
2081+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2082+
>>> result = df.aggregate([], [dfn.functions.avg(dfn.col("a")).alias("v")])
2083+
>>> result.collect_column("v")[0].as_py()
2084+
2.0
20412085
"""
20422086
filter_raw = filter.expr if filter is not None else None
20432087
return Expr(f.avg(expression.expr, filter=filter_raw))
@@ -2076,6 +2120,14 @@ def count(
20762120
expressions: Values to count
20772121
distinct: If True, a single entry for each distinct value will be in the result
20782122
filter: If provided, only compute against rows for which the filter is True
2123+
2124+
Examples:
2125+
---------
2126+
>>> ctx = dfn.SessionContext()
2127+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
2128+
>>> result = df.aggregate([], [dfn.functions.count(dfn.col("a")).alias("v")])
2129+
>>> result.collect_column("v")[0].as_py()
2130+
3
20792131
"""
20802132
filter_raw = filter.expr if filter is not None else None
20812133

@@ -2140,6 +2192,14 @@ def max(expression: Expr, filter: Expr | None = None) -> Expr:
21402192
Args:
21412193
expression: The value to find the maximum of
21422194
filter: If provided, only compute against rows for which the filter is True
2195+
2196+
Examples:
2197+
---------
2198+
>>> ctx = dfn.SessionContext()
2199+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
2200+
>>> result = df.aggregate([], [dfn.functions.max(dfn.col("a")).alias("v")])
2201+
>>> result.collect_column("v")[0].as_py()
2202+
3
21432203
"""
21442204
filter_raw = filter.expr if filter is not None else None
21452205
return Expr(f.max(expression.expr, filter=filter_raw))
@@ -2149,6 +2209,14 @@ def mean(expression: Expr, filter: Expr | None = None) -> Expr:
21492209
"""Returns the average (mean) value of the argument.
21502210
21512211
This is an alias for :py:func:`avg`.
2212+
2213+
Examples:
2214+
---------
2215+
>>> ctx = dfn.SessionContext()
2216+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2217+
>>> result = df.aggregate([], [dfn.functions.mean(dfn.col("a")).alias("v")])
2218+
>>> result.collect_column("v")[0].as_py()
2219+
2.0
21522220
"""
21532221
return avg(expression, filter)
21542222

@@ -2168,6 +2236,14 @@ def median(
21682236
expression: The value to compute the median of
21692237
distinct: If True, a single entry for each distinct value will be in the result
21702238
filter: If provided, only compute against rows for which the filter is True
2239+
2240+
Examples:
2241+
---------
2242+
>>> ctx = dfn.SessionContext()
2243+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2244+
>>> result = df.aggregate([], [dfn.functions.median(dfn.col("a")).alias("v")])
2245+
>>> result.collect_column("v")[0].as_py()
2246+
2.0
21712247
"""
21722248
filter_raw = filter.expr if filter is not None else None
21732249
return Expr(f.median(expression.expr, distinct=distinct, filter=filter_raw))
@@ -2182,6 +2258,14 @@ def min(expression: Expr, filter: Expr | None = None) -> Expr:
21822258
Args:
21832259
expression: The value to find the minimum of
21842260
filter: If provided, only compute against rows for which the filter is True
2261+
2262+
Examples:
2263+
---------
2264+
>>> ctx = dfn.SessionContext()
2265+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
2266+
>>> result = df.aggregate([], [dfn.functions.min(dfn.col("a")).alias("v")])
2267+
>>> result.collect_column("v")[0].as_py()
2268+
1
21852269
"""
21862270
filter_raw = filter.expr if filter is not None else None
21872271
return Expr(f.min(expression.expr, filter=filter_raw))
@@ -2201,6 +2285,14 @@ def sum(
22012285
Args:
22022286
expression: Values to sum
22032287
filter: If provided, only compute against rows for which the filter is True
2288+
2289+
Examples:
2290+
---------
2291+
>>> ctx = dfn.SessionContext()
2292+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
2293+
>>> result = df.aggregate([], [dfn.functions.sum(dfn.col("a")).alias("v")])
2294+
>>> result.collect_column("v")[0].as_py()
2295+
6
22042296
"""
22052297
filter_raw = filter.expr if filter is not None else None
22062298
return Expr(f.sum(expression.expr, filter=filter_raw))
@@ -2618,6 +2710,14 @@ def bit_and(expression: Expr, filter: Expr | None = None) -> Expr:
26182710
Args:
26192711
expression: Argument to perform bitwise calculation on
26202712
filter: If provided, only compute against rows for which the filter is True
2713+
2714+
Examples:
2715+
---------
2716+
>>> ctx = dfn.SessionContext()
2717+
>>> df = ctx.from_pydict({"a": [7, 3]})
2718+
>>> result = df.aggregate([], [dfn.functions.bit_and(dfn.col("a")).alias("v")])
2719+
>>> result.collect_column("v")[0].as_py()
2720+
3
26212721
"""
26222722
filter_raw = filter.expr if filter is not None else None
26232723
return Expr(f.bit_and(expression.expr, filter=filter_raw))
@@ -2634,6 +2734,14 @@ def bit_or(expression: Expr, filter: Expr | None = None) -> Expr:
26342734
Args:
26352735
expression: Argument to perform bitwise calculation on
26362736
filter: If provided, only compute against rows for which the filter is True
2737+
2738+
Examples:
2739+
---------
2740+
>>> ctx = dfn.SessionContext()
2741+
>>> df = ctx.from_pydict({"a": [1, 2]})
2742+
>>> result = df.aggregate([], [dfn.functions.bit_or(dfn.col("a")).alias("v")])
2743+
>>> result.collect_column("v")[0].as_py()
2744+
3
26372745
"""
26382746
filter_raw = filter.expr if filter is not None else None
26392747
return Expr(f.bit_or(expression.expr, filter=filter_raw))
@@ -2653,6 +2761,14 @@ def bit_xor(
26532761
expression: Argument to perform bitwise calculation on
26542762
distinct: If True, evaluate each unique value of expression only once
26552763
filter: If provided, only compute against rows for which the filter is True
2764+
2765+
Examples:
2766+
---------
2767+
>>> ctx = dfn.SessionContext()
2768+
>>> df = ctx.from_pydict({"a": [5, 3]})
2769+
>>> result = df.aggregate([], [dfn.functions.bit_xor(dfn.col("a")).alias("v")])
2770+
>>> result.collect_column("v")[0].as_py()
2771+
6
26562772
"""
26572773
filter_raw = filter.expr if filter is not None else None
26582774
return Expr(f.bit_xor(expression.expr, distinct=distinct, filter=filter_raw))
@@ -2670,6 +2786,14 @@ def bool_and(expression: Expr, filter: Expr | None = None) -> Expr:
26702786
Args:
26712787
expression: Argument to perform calculation on
26722788
filter: If provided, only compute against rows for which the filter is True
2789+
2790+
Examples:
2791+
---------
2792+
>>> ctx = dfn.SessionContext()
2793+
>>> df = ctx.from_pydict({"a": [True, True, False]})
2794+
>>> result = df.aggregate([], [dfn.functions.bool_and(dfn.col("a")).alias("v")])
2795+
>>> result.collect_column("v")[0].as_py()
2796+
False
26732797
"""
26742798
filter_raw = filter.expr if filter is not None else None
26752799
return Expr(f.bool_and(expression.expr, filter=filter_raw))
@@ -2687,6 +2811,14 @@ def bool_or(expression: Expr, filter: Expr | None = None) -> Expr:
26872811
Args:
26882812
expression: Argument to perform calculation on
26892813
filter: If provided, only compute against rows for which the filter is True
2814+
2815+
Examples:
2816+
---------
2817+
>>> ctx = dfn.SessionContext()
2818+
>>> df = ctx.from_pydict({"a": [False, False, True]})
2819+
>>> result = df.aggregate([], [dfn.functions.bool_or(dfn.col("a")).alias("v")])
2820+
>>> result.collect_column("v")[0].as_py()
2821+
True
26902822
"""
26912823
filter_raw = filter.expr if filter is not None else None
26922824
return Expr(f.bool_or(expression.expr, filter=filter_raw))
@@ -3077,6 +3209,15 @@ def string_agg(
30773209
For example::
30783210
30793211
df.aggregate([], string_agg(col("a"), ",", order_by="b"))
3212+
3213+
Examples:
3214+
---------
3215+
>>> ctx = dfn.SessionContext()
3216+
>>> df = ctx.from_pydict({"a": ["x", "y", "z"]})
3217+
>>> result = df.aggregate(
3218+
... [], [dfn.functions.string_agg(dfn.col("a"), ",", order_by="a").alias("s")])
3219+
>>> result.collect_column("s")[0].as_py()
3220+
'x,y,z'
30803221
"""
30813222
order_by_raw = sort_list_to_raw_sort_list(order_by)
30823223
filter_raw = filter.expr if filter is not None else None

0 commit comments

Comments
 (0)