Skip to content

Commit aca9d13

Browse files
ntjohnson1 and claude
committed
Add docstring examples for Aggregate statistical and regression functions
Add example usage to docstrings for Aggregate statistical and regression functions to improve documentation. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1160d5a commit aca9d13

File tree

1 file changed

+180
-0
lines changed

1 file changed

+180
-0
lines changed

python/datafusion/functions.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2055,6 +2055,15 @@ def corr(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
20552055
value_y: The dependent variable for correlation
20562056
value_x: The independent variable for correlation
20572057
filter: If provided, only compute against rows for which the filter is True
2058+
2059+
Examples:
2060+
---------
2061+
>>> ctx = dfn.SessionContext()
2062+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [1.0, 2.0, 3.0]})
2063+
>>> result = df.aggregate(
2064+
... [], [dfn.functions.corr(dfn.col("a"), dfn.col("b")).alias("v")])
2065+
>>> result.collect_column("v")[0].as_py()
2066+
1.0
20582067
"""
20592068
filter_raw = filter.expr if filter is not None else None
20602069
return Expr(f.corr(value_y.expr, value_x.expr, filter=filter_raw))
@@ -2101,6 +2110,22 @@ def covar_pop(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
21012110
value_y: The dependent variable for covariance
21022111
value_x: The independent variable for covariance
21032112
filter: If provided, only compute against rows for which the filter is True
2113+
2114+
Examples:
2115+
---------
2116+
>>> import builtins
2117+
>>> ctx = dfn.SessionContext()
2118+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
2119+
>>> result = df.aggregate(
2120+
... [],
2121+
... [dfn.functions.covar_pop(
2122+
... dfn.col("a"), dfn.col("b")
2123+
... ).alias("v")]
2124+
... )
2125+
>>> builtins.round(
2126+
... result.collect_column("v")[0].as_py(), 4
2127+
... )
2128+
0.6667
21042129
"""
21052130
filter_raw = filter.expr if filter is not None else None
21062131
return Expr(f.covar_pop(value_y.expr, value_x.expr, filter=filter_raw))
@@ -2118,6 +2143,15 @@ def covar_samp(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr
21182143
value_y: The dependent variable for covariance
21192144
value_x: The independent variable for covariance
21202145
filter: If provided, only compute against rows for which the filter is True
2146+
2147+
Examples:
2148+
---------
2149+
>>> ctx = dfn.SessionContext()
2150+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
2151+
>>> result = df.aggregate(
2152+
... [], [dfn.functions.covar_samp(dfn.col("a"), dfn.col("b")).alias("v")])
2153+
>>> result.collect_column("v")[0].as_py()
2154+
1.0
21212155
"""
21222156
filter_raw = filter.expr if filter is not None else None
21232157
return Expr(f.covar_samp(value_y.expr, value_x.expr, filter=filter_raw))
@@ -2127,6 +2161,15 @@ def covar(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
21272161
"""Computes the sample covariance.
21282162
21292163
This is an alias for :py:func:`covar_samp`.
2164+
2165+
Examples:
2166+
---------
2167+
>>> ctx = dfn.SessionContext()
2168+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
2169+
>>> result = df.aggregate(
2170+
... [], [dfn.functions.covar(dfn.col("a"), dfn.col("b")).alias("v")])
2171+
>>> result.collect_column("v")[0].as_py()
2172+
1.0
21302173
"""
21312174
return covar_samp(value_y, value_x, filter)
21322175

@@ -2215,6 +2258,14 @@ def stddev(expression: Expr, filter: Expr | None = None) -> Expr:
22152258
Args:
22162259
expression: The value to compute the sample standard deviation of
22172260
filter: If provided, only compute against rows for which the filter is True
2261+
2262+
Examples:
2263+
---------
2264+
>>> ctx = dfn.SessionContext()
2265+
>>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]})
2266+
>>> result = df.aggregate([], [dfn.functions.stddev(dfn.col("a")).alias("v")])
2267+
>>> result.collect_column("v")[0].as_py()
2268+
2.0
22182269
"""
22192270
filter_raw = filter.expr if filter is not None else None
22202271
return Expr(f.stddev(expression.expr, filter=filter_raw))
@@ -2229,6 +2280,14 @@ def stddev_pop(expression: Expr, filter: Expr | None = None) -> Expr:
22292280
Args:
22302281
expression: The value to compute the population standard deviation of
22312282
filter: If provided, only compute against rows for which the filter is True
2283+
2284+
Examples:
2285+
---------
2286+
>>> ctx = dfn.SessionContext()
2287+
>>> df = ctx.from_pydict({"a": [1.0, 3.0]})
2288+
>>> result = df.aggregate([], [dfn.functions.stddev_pop(dfn.col("a")).alias("v")])
2289+
>>> result.collect_column("v")[0].as_py()
2290+
1.0
22322291
"""
22332292
filter_raw = filter.expr if filter is not None else None
22342293
return Expr(f.stddev_pop(expression.expr, filter=filter_raw))
@@ -2238,6 +2297,14 @@ def stddev_samp(arg: Expr, filter: Expr | None = None) -> Expr:
22382297
"""Computes the sample standard deviation of the argument.
22392298
22402299
This is an alias for :py:func:`stddev`.
2300+
2301+
Examples:
2302+
---------
2303+
>>> ctx = dfn.SessionContext()
2304+
>>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]})
2305+
>>> result = df.aggregate([], [dfn.functions.stddev_samp(dfn.col("a")).alias("v")])
2306+
>>> result.collect_column("v")[0].as_py()
2307+
2.0
22412308
"""
22422309
return stddev(arg, filter=filter)
22432310

@@ -2246,6 +2313,14 @@ def var(expression: Expr, filter: Expr | None = None) -> Expr:
22462313
"""Computes the sample variance of the argument.
22472314
22482315
This is an alias for :py:func:`var_samp`.
2316+
2317+
Examples:
2318+
---------
2319+
>>> ctx = dfn.SessionContext()
2320+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2321+
>>> result = df.aggregate([], [dfn.functions.var(dfn.col("a")).alias("v")])
2322+
>>> result.collect_column("v")[0].as_py()
2323+
1.0
22492324
"""
22502325
return var_samp(expression, filter)
22512326

@@ -2259,6 +2334,14 @@ def var_pop(expression: Expr, filter: Expr | None = None) -> Expr:
22592334
Args:
22602335
expression: The variable to compute the variance for
22612336
filter: If provided, only compute against rows for which the filter is True
2337+
2338+
Examples:
2339+
---------
2340+
>>> ctx = dfn.SessionContext()
2341+
>>> df = ctx.from_pydict({"a": [0.0, 2.0]})
2342+
>>> result = df.aggregate([], [dfn.functions.var_pop(dfn.col("a")).alias("v")])
2343+
>>> result.collect_column("v")[0].as_py()
2344+
1.0
22622345
"""
22632346
filter_raw = filter.expr if filter is not None else None
22642347
return Expr(f.var_pop(expression.expr, filter=filter_raw))
@@ -2273,6 +2356,14 @@ def var_samp(expression: Expr, filter: Expr | None = None) -> Expr:
22732356
Args:
22742357
expression: The variable to compute the variance for
22752358
filter: If provided, only compute against rows for which the filter is True
2359+
2360+
Examples:
2361+
---------
2362+
>>> ctx = dfn.SessionContext()
2363+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2364+
>>> result = df.aggregate([], [dfn.functions.var_samp(dfn.col("a")).alias("v")])
2365+
>>> result.collect_column("v")[0].as_py()
2366+
1.0
22762367
"""
22772368
filter_raw = filter.expr if filter is not None else None
22782369
return Expr(f.var_sample(expression.expr, filter=filter_raw))
@@ -2282,6 +2373,14 @@ def var_sample(expression: Expr, filter: Expr | None = None) -> Expr:
22822373
"""Computes the sample variance of the argument.
22832374
22842375
This is an alias for :py:func:`var_samp`.
2376+
2377+
Examples:
2378+
---------
2379+
>>> ctx = dfn.SessionContext()
2380+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2381+
>>> result = df.aggregate([], [dfn.functions.var_sample(dfn.col("a")).alias("v")])
2382+
>>> result.collect_column("v")[0].as_py()
2383+
1.0
22852384
"""
22862385
return var_samp(expression, filter)
22872386

@@ -2303,6 +2402,15 @@ def regr_avgx(
23032402
y: The linear regression dependent variable
23042403
x: The linear regression independent variable
23052404
filter: If provided, only compute against rows for which the filter is True
2405+
2406+
Examples:
2407+
---------
2408+
>>> ctx = dfn.SessionContext()
2409+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
2410+
>>> result = df.aggregate(
2411+
... [], [dfn.functions.regr_avgx(dfn.col("y"), dfn.col("x")).alias("v")])
2412+
>>> result.collect_column("v")[0].as_py()
2413+
5.0
23062414
"""
23072415
filter_raw = filter.expr if filter is not None else None
23082416

@@ -2326,6 +2434,15 @@ def regr_avgy(
23262434
y: The linear regression dependent variable
23272435
x: The linear regression independent variable
23282436
filter: If provided, only compute against rows for which the filter is True
2437+
2438+
Examples:
2439+
---------
2440+
>>> ctx = dfn.SessionContext()
2441+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
2442+
>>> result = df.aggregate(
2443+
... [], [dfn.functions.regr_avgy(dfn.col("y"), dfn.col("x")).alias("v")])
2444+
>>> result.collect_column("v")[0].as_py()
2445+
2.0
23292446
"""
23302447
filter_raw = filter.expr if filter is not None else None
23312448

@@ -2349,6 +2466,15 @@ def regr_count(
23492466
y: The linear regression dependent variable
23502467
x: The linear regression independent variable
23512468
filter: If provided, only compute against rows for which the filter is True
2469+
2470+
Examples:
2471+
---------
2472+
>>> ctx = dfn.SessionContext()
2473+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
2474+
>>> result = df.aggregate(
2475+
... [], [dfn.functions.regr_count(dfn.col("y"), dfn.col("x")).alias("v")])
2476+
>>> result.collect_column("v")[0].as_py()
2477+
3
23522478
"""
23532479
filter_raw = filter.expr if filter is not None else None
23542480

@@ -2372,6 +2498,15 @@ def regr_intercept(
23722498
y: The linear regression dependent variable
23732499
x: The linear regression independent variable
23742500
filter: If provided, only compute against rows for which the filter is True
2501+
2502+
Examples:
2503+
---------
2504+
>>> ctx = dfn.SessionContext()
2505+
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
2506+
>>> result = df.aggregate(
2507+
... [], [dfn.functions.regr_intercept(dfn.col("y"), dfn.col("x")).alias("v")])
2508+
>>> result.collect_column("v")[0].as_py()
2509+
0.0
23752510
"""
23762511
filter_raw = filter.expr if filter is not None else None
23772512

@@ -2395,6 +2530,15 @@ def regr_r2(
23952530
y: The linear regression dependent variable
23962531
x: The linear regression independent variable
23972532
filter: If provided, only compute against rows for which the filter is True
2533+
2534+
Examples:
2535+
---------
2536+
>>> ctx = dfn.SessionContext()
2537+
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
2538+
>>> result = df.aggregate(
2539+
... [], [dfn.functions.regr_r2(dfn.col("y"), dfn.col("x")).alias("v")])
2540+
>>> result.collect_column("v")[0].as_py()
2541+
1.0
23982542
"""
23992543
filter_raw = filter.expr if filter is not None else None
24002544

@@ -2418,6 +2562,15 @@ def regr_slope(
24182562
y: The linear regression dependent variable
24192563
x: The linear regression independent variable
24202564
filter: If provided, only compute against rows for which the filter is True
2565+
2566+
Examples:
2567+
---------
2568+
>>> ctx = dfn.SessionContext()
2569+
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
2570+
>>> result = df.aggregate(
2571+
... [], [dfn.functions.regr_slope(dfn.col("y"), dfn.col("x")).alias("v")])
2572+
>>> result.collect_column("v")[0].as_py()
2573+
2.0
24212574
"""
24222575
filter_raw = filter.expr if filter is not None else None
24232576

@@ -2441,6 +2594,15 @@ def regr_sxx(
24412594
y: The linear regression dependent variable
24422595
x: The linear regression independent variable
24432596
filter: If provided, only compute against rows for which the filter is True
2597+
2598+
Examples:
2599+
---------
2600+
>>> ctx = dfn.SessionContext()
2601+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
2602+
>>> result = df.aggregate(
2603+
... [], [dfn.functions.regr_sxx(dfn.col("y"), dfn.col("x")).alias("v")])
2604+
>>> result.collect_column("v")[0].as_py()
2605+
2.0
24442606
"""
24452607
filter_raw = filter.expr if filter is not None else None
24462608

@@ -2464,6 +2626,15 @@ def regr_sxy(
24642626
y: The linear regression dependent variable
24652627
x: The linear regression independent variable
24662628
filter: If provided, only compute against rows for which the filter is True
2629+
2630+
Examples:
2631+
---------
2632+
>>> ctx = dfn.SessionContext()
2633+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
2634+
>>> result = df.aggregate(
2635+
... [], [dfn.functions.regr_sxy(dfn.col("y"), dfn.col("x")).alias("v")])
2636+
>>> result.collect_column("v")[0].as_py()
2637+
2.0
24672638
"""
24682639
filter_raw = filter.expr if filter is not None else None
24692640

@@ -2487,6 +2658,15 @@ def regr_syy(
24872658
y: The linear regression dependent variable
24882659
x: The linear regression independent variable
24892660
filter: If provided, only compute against rows for which the filter is True
2661+
2662+
Examples:
2663+
---------
2664+
>>> ctx = dfn.SessionContext()
2665+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
2666+
>>> result = df.aggregate(
2667+
... [], [dfn.functions.regr_syy(dfn.col("y"), dfn.col("x")).alias("v")])
2668+
>>> result.collect_column("v")[0].as_py()
2669+
2.0
24902670
"""
24912671
filter_raw = filter.expr if filter is not None else None
24922672

0 commit comments

Comments
 (0)