Skip to content

Commit 51b451f

Browse files
count and countDistinct kdocs
1 parent 9ded9e4 commit 51b451f

File tree

7 files changed

+378
-2
lines changed

7 files changed

+378
-2
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/DataFrame.kt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,11 @@ public interface DataFrame<out T> :
6868

6969
// region rows
7070

71+
/**
72+
* Returns the total number of rows of this [DataFrame].
73+
*
74+
* @return The number of rows in the [DataFrame].
75+
*/
7176
public fun rowsCount(): Int
7277

7378
/** Returns an iterator over the rows of this [DataFrame], obtained from `rows().iterator()`. */
public operator fun iterator(): Iterator<DataRow<T>> = rows().iterator()

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/count.kt

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,24 @@ import org.jetbrains.kotlinx.dataframe.Predicate
88
import org.jetbrains.kotlinx.dataframe.RowFilter
99
import org.jetbrains.kotlinx.dataframe.annotations.Interpretable
1010
import org.jetbrains.kotlinx.dataframe.annotations.Refine
11+
import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls
12+
import org.jetbrains.kotlinx.dataframe.documentation.RowFilterDescription
1113
import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateValue
1214

1315
// region DataColumn
1416

17+
/**
18+
* Counts the elements in this [DataColumn] that satisfy a given [predicate] or returns the total count
19+
* if no predicate is provided.
20+
*
21+
* For more information: {@include [DocumentationUrls.Count]}
22+
*
23+
* @param predicate An optional predicate used to filter the elements.
24+
* The predicate should return `true` for elements to be counted.
25+
* If `null` (by default), all elements are counted.
26+
* @return The number of elements in the column that match the predicate,
27+
* or the total number of elements if no predicate is provided.
28+
*/
1529
public fun <T> DataColumn<T>.count(predicate: Predicate<T>? = null): Int =
1630
if (predicate == null) {
1731
size()
@@ -23,27 +37,126 @@ public fun <T> DataColumn<T>.count(predicate: Predicate<T>? = null): Int =
2337

2438
// region DataRow
2539

40+
/**
41+
* Returns the number of columns in this [DataRow].
42+
*
43+
* For more information: {@include [DocumentationUrls.Count]}
44+
*
45+
* @return the number of columns in this row.
46+
*/
2647
public fun AnyRow.count(): Int = columnsCount()
2748

49+
/**
50+
* Counts the number of elements in the current row that satisfy the given [predicate].
51+
*
52+
* For more information: {@include [DocumentationUrls.Count]}
53+
*
54+
* @param predicate A predicate function to test each element.
55+
* The predicate should return `true` for elements to be counted.
56+
* @return The number of elements that satisfy the predicate.
57+
*/
2858
public inline fun AnyRow.count(predicate: Predicate<Any?>): Int =
    // Test every cell value of the row and tally the ones the predicate accepts.
    values().count { cell -> predicate(cell) }
2959

3060
// endregion
3161

3262
// region DataFrame
3363

64+
/**
65+
* Returns the total number of rows of this [DataFrame].
66+
*
67+
* For more information: {@include [DocumentationUrls.Count]}
68+
*
69+
* @return The number of rows in the [DataFrame].
70+
*/
3471
public fun <T> DataFrame<T>.count(): Int = rowsCount()
3572

73+
/**
74+
* Counts the number of rows in this [DataFrame] that satisfy the given [predicate].
75+
*
76+
* {@include [RowFilterDescription]}
77+
*
78+
* See also:
79+
* - [filter][DataFrame.filter] — filters rows using a [RowFilter] condition.
80+
* - [countDistinct][DataFrame.countDistinct] — counts distinct rows or values.
81+
*
82+
* For more information: {@include [DocumentationUrls.Count]}
83+
*
84+
* ### Example
85+
* ```kotlin
86+
* // Count rows where the value in the "age" column is greater than 18
87+
* // and the "name/firstName" column starts with 'A'
88+
* df.count { age > 18 && name.firstName.startsWith("A") }
89+
* // Count rows whose previous row has a "length" value of at least 50.0
90+
* df.count { prev()?.let { it.length >= 50.0 } ?: false }
91+
* ```
92+
*
93+
* @param T The schema marker type of the [DataFrame].
94+
* @param predicate A [RowFilter] that returns `true` for rows that should be counted.
95+
* @return The number of rows that satisfy the predicate.
96+
*/
3697
public inline fun <T> DataFrame<T>.count(predicate: RowFilter<T>): Int =
    // Each row is handed to the filter twice: once as the receiver and once as the explicit argument.
    rows().count { row -> predicate(row, row) }
3798

3899
// endregion
39100

40101
// region GroupBy
41102

103+
/**
104+
* Aggregates this [GroupBy] by counting the number of rows in each group.
105+
*
106+
* Returns a new [DataFrame] where each row corresponds to a group.
107+
* The resulting frame contains:
108+
* - the original group key columns,
109+
* - a new column (named [resultName], default is `"count"`) that contains the number of rows in each group.
110+
*
111+
* This is equivalent to applying `.aggregate { count() }`, but more efficient.
112+
*
113+
* See also: common [aggregate][Grouped.aggregate].
114+
*
115+
* For more information: {@include [DocumentationUrls.Count]}
116+
*
117+
* ### Example
118+
* ```kotlin
119+
* // Counts number of rows for each city, returning
120+
* // a new DataFrame with columns "city" and "count"
121+
* df.groupBy { city }.count()
122+
* ```
123+
*
124+
* @param resultName The name of the result column that will store the group sizes. Defaults to `"count"`.
125+
* @return A new [DataFrame] with group keys and corresponding group sizes.
126+
*/
42127
@Refine
43128
@Interpretable("GroupByCount0")
44129
public fun <T> Grouped<T>.count(resultName: String = "count"): DataFrame<T> =
45130
aggregateValue(resultName) { count() default 0 }
46131

132+
/**
133+
* Aggregates this [GroupBy] by counting the number of rows in each group
134+
* that satisfy the given [predicate].
135+
*
136+
* {@include [RowFilterDescription]}
137+
*
138+
* Returns a new [DataFrame] where each row corresponds to a group.
139+
* The resulting frame contains:
140+
* - the original group key columns,
141+
* - a new column (named [resultName], defaults to `"count"`)
142+
* that stores the number of rows in each group matching the [predicate].
143+
*
144+
* This is equivalent to calling `.aggregate { count(predicate) }`, but more efficient.
145+
*
146+
* See also: common [aggregate][Grouped.aggregate].
147+
*
148+
* For more information: {@include [DocumentationUrls.Count]}
149+
*
150+
* ### Example
151+
* ```kotlin
152+
* // Count rows for each city where the "income" value is greater than 30.0.
153+
* // Returns a new DataFrame with columns "city" and "pointsCount".
154+
* df.groupBy { city }.count("pointsCount") { income > 30.0 }
155+
* ```
156+
*
157+
* @param resultName The name of the result column containing the group sizes. Defaults to `"count"`.
158+
* @return A new [DataFrame] with group keys and filtered row counts per group.
159+
*/
47160
@Refine
48161
@Interpretable("GroupByCount0")
49162
public inline fun <T> Grouped<T>.count(
@@ -55,16 +168,150 @@ public inline fun <T> Grouped<T>.count(
55168

56169
// region Pivot
57170

171+
/**
172+
* Aggregates this [Pivot] by counting the number of rows in each group.
173+
*
174+
* Returns a single [DataRow] where:
175+
* - each column corresponds to a [pivot] group — if multiple pivot keys were used,
176+
* the result will contain column groups for each pivot key, with columns inside
177+
* corresponding to the values of that key;
178+
* - each value contains the number of rows in that group.
179+
*
180+
* The original [Pivot] column structure is preserved.
181+
* If the [Pivot] was created using multiple or nested keys
182+
* (e.g., via [and][PivotDsl.and] or [then][PivotDsl.then]),
183+
* the structure remains unchanged — only the contents of each group
184+
* are replaced with the number of rows in that group.
185+
*
186+
* This is equivalent to calling `.aggregate { count() }`, but more efficient.
187+
*
188+
* See also:
189+
* - common [aggregate][Pivot.aggregate].
190+
* - [pivotCounts][DataFrame.pivotCounts] shortcut.
191+
*
192+
* For more information: {@include [DocumentationUrls.Count]}
193+
*
194+
* ### Example
195+
* ```kotlin
196+
* // Count the number of rows for each city.
197+
* // Returns a single DataRow with one column per city and the count of rows in each.
198+
* df.pivot { city }.count()
199+
* ```
200+
*
201+
* @return A single [DataRow] with one column per group and the corresponding group size as its value.
202+
*/
58203
public fun <T> Pivot<T>.count(): DataRow<T> = delegate { count() }
59204

205+
/**
206+
* Aggregates this [Pivot] by counting the number of rows in each group
207+
* that satisfy the given [predicate].
208+
*
209+
* {@include [RowFilterDescription]}
210+
*
211+
* Returns a single [DataRow] where:
212+
* - each column corresponds to a [pivot] group — if multiple pivot keys were used,
213+
* the result will contain column groups for each pivot key, with columns inside
214+
* corresponding to the values of that key;
215+
* - each value contains the number of rows in that group matching the [predicate].
216+
*
217+
* The original [Pivot] column structure is preserved.
218+
* If the [Pivot] was created using multiple or nested keys
219+
* (e.g., via [and][PivotDsl.and] or [then][PivotDsl.then]),
220+
* the structure remains unchanged — only the contents of each group
221+
* are replaced with the number of rows (matching the [predicate]) in that group.
222+
*
223+
* This is equivalent to calling `.aggregate { count(predicate) }`, but more efficient.
224+
*
225+
* See also:
226+
* - common [aggregate][Pivot.aggregate].
227+
* - [pivotCounts][DataFrame.pivotCounts] shortcut.
228+
*
229+
* For more information: {@include [DocumentationUrls.Count]}
230+
*
231+
* ### Example
232+
* ```kotlin
233+
* // Count rows for each city where the "income" value is greater than 30.0.
234+
* // Returns a single DataRow with one column per city and the count of matching rows.
235+
* df.pivot { city }.count { income > 30.0 }
236+
* ```
237+
*
238+
* @return A single [DataRow] with original [Pivot] columns and filtered row counts per group.
239+
*/
60240
public inline fun <T> Pivot<T>.count(crossinline predicate: RowFilter<T>): DataRow<T> = delegate { count(predicate) }
61241

62242
// endregion
63243

64244
// region PivotGroupBy
65245

246+
/**
247+
* Aggregates this [PivotGroupBy] by counting the number of rows in each
248+
* combined [pivot] + [groupBy] group.
249+
*
250+
* Returns a new [DataFrame] containing the following matrix:
251+
* - one row per [groupBy] key (or set of keys);
252+
* - one column group per [pivot] key, where each inner column corresponds to a value of that key;
253+
* - each cell contains the number of rows in the corresponding pivot–group pair.
254+
*
255+
* The original [Pivot] column structure is preserved.
256+
* If the [Pivot] was created using multiple or nested keys
257+
* (e.g., via [and][PivotDsl.and] or [then][PivotDsl.then]),
258+
* the result will contain nested column groups reflecting that key structure,
259+
* with each group containing columns for the values of the corresponding key.
260+
*
261+
* This is equivalent to calling `.aggregate { count() }`, but more efficient.
262+
*
263+
* See also:
264+
* - common [aggregate][PivotGroupBy.aggregate];
265+
* - [GroupBy.pivotCounts] shortcut.
266+
*
267+
* For more information: {@include [DocumentationUrls.Count]}
268+
*
269+
* ### Example
270+
* ```kotlin
271+
* // Compute a matrix with "city" values horizontally and
272+
* // "age" values vertically, where each cell contains
273+
* // the number of rows with the corresponding age–city pair.
274+
* df.pivot { city }.groupBy { age }.count()
275+
* ```
276+
*
277+
* @return A [DataFrame] with [groupBy] rows and pivoted counts as columns.
278+
*/
66279
public fun <T> PivotGroupBy<T>.count(): DataFrame<T> = aggregate { count() default 0 }
67280

281+
/**
282+
* Aggregates this [PivotGroupBy] by counting the number of rows in each
283+
* combined [pivot] + [groupBy] group, that satisfy the given [predicate].
284+
*
285+
* Returns a new [DataFrame] containing the following matrix:
286+
* - one row per [groupBy] key (or set of keys);
287+
* - one column group per [pivot] key, where each inner column corresponds to a value of that key;
288+
* - each cell contains the number of rows in the corresponding pivot–group pair that match the [predicate].
289+
*
290+
* The original [Pivot] column structure is preserved.
291+
* If the [Pivot] was created using multiple or nested keys
292+
* (e.g., via [and][PivotDsl.and] or [then][PivotDsl.then]),
293+
* the result will contain nested column groups reflecting that key structure,
294+
* with each group containing columns for the values
295+
* (matching the [predicate]) of the corresponding key.
296+
*
297+
* This is equivalent to calling `.aggregate { count(predicate) }`, but more efficient.
298+
*
299+
* See also:
300+
* - common [aggregate][PivotGroupBy.aggregate];
301+
* - [GroupBy.pivotCounts] shortcut.
302+
*
303+
* For more information: {@include [DocumentationUrls.Count]}
304+
*
305+
* ### Example
306+
* ```kotlin
307+
* // Compute a matrix with "city" values horizontally and
308+
* // "age" values vertically, where each cell contains
309+
* // the number of rows with the corresponding age–city pair and "income" greater than 30.0.
310+
* df.pivot { city }.groupBy { age }.count { income > 30.0 }
311+
* ```
312+
*
313+
* @return A [DataFrame] with [groupBy] rows and, as columns, pivoted counts of the rows matching the [predicate].
314+
*/
68315
public inline fun <T> PivotGroupBy<T>.count(crossinline predicate: RowFilter<T>): DataFrame<T> =
69316
aggregate {
70317
count(predicate) default

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,80 @@ import org.jetbrains.kotlinx.dataframe.ColumnsSelector
66
import org.jetbrains.kotlinx.dataframe.DataFrame
77
import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload
88
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
9+
import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls
10+
import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns
911
import org.jetbrains.kotlinx.dataframe.indices
1012
import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API
1113
import kotlin.reflect.KProperty
1214

1315
// region DataFrame
1416

17+
/**
18+
* Returns the number of distinct rows in this [DataFrame].
19+
*
20+
* Compares rows based on the values in all columns and returns
21+
* the number of unique row combinations.
22+
*
23+
* See also:
24+
* - [distinct][DataFrame.distinct], which removes duplicate rows and returns a new [DataFrame].
25+
* - [count][DataFrame.count], which counts the number of rows satisfying a given predicate.
26+
*
27+
* For more information: {@include [DocumentationUrls.CountDistinct]}
28+
*
29+
* @return The number of distinct rows in this [DataFrame].
30+
*/
1531
public fun AnyFrame.countDistinct(): Int =
    // Delegates to the selector-based overload, selecting all columns.
    countDistinct { all() }
1632

33+
/**
34+
* Returns the number of distinct combinations of values in selected [columns] in this [DataFrame].
35+
*
36+
* Compares values in the selected columns and returns
37+
* the number of unique value combinations.
38+
*
39+
* See also:
40+
* - [distinct][DataFrame.distinct], which removes duplicate rows and returns a new [DataFrame].
41+
* - [count][DataFrame.count], which counts the number of rows satisfying a given predicate.
42+
*
43+
* For more information: {@include [DocumentationUrls.CountDistinct]}
44+
*
45+
* ### This [countDistinct] overload
46+
*/
47+
internal interface CountDistinctDocs
48+
49+
50+
/**
51+
* {@include [CountDistinctDocs]}
52+
* {@include [SelectingColumns.Dsl]}
53+
*
54+
* #### Example
55+
*
56+
* ```kotlin
57+
* // Counts unique combinations of values in the "year" and "title" columns
58+
* // across all rows in the DataFrame
59+
* df.countDistinct { year and title }
60+
* ```
61+
*
62+
* @return The number of distinct combinations of values in the selected columns of this [DataFrame].
63+
*/
1764
public fun <T, C> DataFrame<T>.countDistinct(columns: ColumnsSelector<T, C>): Int {
    // Resolve the selector once; every row index is then keyed by its values in these columns.
    val selected = get(columns)
    val distinctRowIndices = indices.distinctBy { rowIndex ->
        // The key is the list of this row's values across the selected columns.
        selected.map { column -> column[rowIndex] }
    }
    return distinctRowIndices.size
}
2168

69+
/**
70+
* {@include [CountDistinctDocs]}
71+
* {@include [SelectingColumns.ColumnNames]}
72+
*
73+
* #### Example
74+
*
75+
* ```kotlin
76+
* // Counts unique combinations of values in the "year" and "title" columns
77+
* // across all rows in the DataFrame
78+
* df.countDistinct("year", "title")
79+
* ```
80+
*
81+
* @return The number of distinct combinations of values in the selected columns of this [DataFrame].
82+
*/
2283
public fun <T> DataFrame<T>.countDistinct(vararg columns: String): Int =
    // Converts the given names with toColumnSet() and delegates to the selector-based overload.
    countDistinct { columns.toColumnSet() }
2384

2485
@Deprecated(DEPRECATED_ACCESS_API)

0 commit comments

Comments
 (0)