Skip to content

Commit 3c98b9a

Browse files
authored
Merge pull request #1165 from Kotlin/statistics-docs
statistics documentation update
2 parents 1d8d41a + 74df4a4 commit 3c98b9a

File tree

15 files changed

+352
-70
lines changed

15 files changed

+352
-70
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/NumberTypeUtils.kt

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import java.math.BigDecimal
66
import java.math.BigInteger
77
import kotlin.reflect.KClass
88
import kotlin.reflect.KType
9+
import kotlin.reflect.full.isSubtypeOf
910
import kotlin.reflect.full.withNullability
1011
import kotlin.reflect.typeOf
1112

@@ -224,10 +225,24 @@ internal fun Sequence<Number?>.convertToUnifiedNumberType(
224225
"Cannot find unified number type of types: ${types.joinToString { renderType(it) }}",
225226
)
226227
}
227-
val converter = createConverter(typeOf<Number>(), commonNumberType)!! as (Number) -> Number?
228-
return map {
229-
if (it == null) return@map null
230-
converter(it) ?: error("Can not convert $it to $commonNumberType")
228+
require(commonNumberType.isSubtypeOf(typeOf<Number?>())) {
229+
"Cannot convert numbers to $commonNumberType; it is not a subtype of Number?"
230+
}
231+
return when (commonNumberType) {
232+
nothingType -> {
233+
require(null !in this) { "Cannot unify numbers to Nothing; it contains nulls" }
234+
this
235+
}
236+
237+
nullableNothingType -> this
238+
239+
else -> {
240+
val converter = createConverter(typeOf<Number>(), commonNumberType)!! as (Number) -> Number?
241+
this.map {
242+
if (it == null) return@map null
243+
converter(it) ?: error("Can not convert $it to $commonNumberType")
244+
}
245+
}
231246
}
232247
}
233248

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Analyze.kt

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import org.jetbrains.kotlinx.dataframe.api.mean
3535
import org.jetbrains.kotlinx.dataframe.api.meanFor
3636
import org.jetbrains.kotlinx.dataframe.api.meanOf
3737
import org.jetbrains.kotlinx.dataframe.api.median
38+
import org.jetbrains.kotlinx.dataframe.api.medianBy
3839
import org.jetbrains.kotlinx.dataframe.api.medianFor
3940
import org.jetbrains.kotlinx.dataframe.api.medianOf
4041
import org.jetbrains.kotlinx.dataframe.api.min
@@ -43,6 +44,7 @@ import org.jetbrains.kotlinx.dataframe.api.minFor
4344
import org.jetbrains.kotlinx.dataframe.api.minOf
4445
import org.jetbrains.kotlinx.dataframe.api.minOrNull
4546
import org.jetbrains.kotlinx.dataframe.api.percentile
47+
import org.jetbrains.kotlinx.dataframe.api.percentileBy
4648
import org.jetbrains.kotlinx.dataframe.api.percentileFor
4749
import org.jetbrains.kotlinx.dataframe.api.percentileOf
4850
import org.jetbrains.kotlinx.dataframe.api.pivot
@@ -179,7 +181,7 @@ class Analyze : TestBase() {
179181
// SampleStart
180182
df.sum() // sum of values per every numeric column
181183
df.sum { age and weight } // sum of all values in `age` and `weight`
182-
df.sumFor { age and weight } // sum of values per `age` and `weight` separately
184+
df.sumFor(skipNaN = true) { age and weight } // sum of values per `age` and `weight` separately
183185
df.sumOf { (weight ?: 0) / age } // sum of expression evaluated for every row
184186
// SampleEnd
185187
}
@@ -190,7 +192,7 @@ class Analyze : TestBase() {
190192
// SampleStart
191193
df.min() // min of values per every comparable column
192194
df.min { age and weight } // min of all values in `age` and `weight`
193-
df.minFor { age and weight } // min of values per `age` and `weight` separately
195+
df.minFor(skipNaN = true) { age and weight } // min of values per `age` and `weight` separately
194196
df.minOf { (weight ?: 0) / age } // min of expression evaluated for every row
195197
df.minBy { age } // DataRow with minimal `age`
196198
// SampleEnd
@@ -214,8 +216,9 @@ class Analyze : TestBase() {
214216
// SampleStart
215217
df.median() // median of values per every comparable column
216218
df.median { age and weight } // median of all values in `age` and `weight`
217-
df.medianFor { age and weight } // median of values per `age` and `weight` separately
219+
df.medianFor(skipNaN = true) { age and weight } // median of values per `age` and `weight` separately
218220
df.medianOf { (weight ?: 0) / age } // median of expression evaluated for every row
221+
df.medianBy { age } // DataRow where the median age lies (lower-median for an even number of values)
219222
// SampleEnd
220223
}
221224

@@ -235,10 +238,11 @@ class Analyze : TestBase() {
235238
@TransformDataFrameExpressions
236239
fun percentileModes() {
237240
// SampleStart
238-
df.percentile(25.0) // percentile of values per every comparable column
239-
df.percentile(25.0) { age and weight } // percentile of all values in `age` and `weight`
240-
df.percentileFor(25.0) { age and weight } // percentile of values per `age` and `weight` separately
241-
df.percentileOf(25.0) { (weight ?: 0) / age } // percentile of expression evaluated for every row
241+
df.percentile(25.0) // 25th percentile of values per every comparable column
242+
df.percentile(75.0) { age and weight } // 75th percentile of all values in `age` and `weight`
243+
df.percentileFor(50.0, skipNaN = true) { age and weight } // 50th percentile of values per `age` and `weight` separately
244+
df.percentileOf(75.0) { (weight ?: 0) / age } // 75th percentile of expression evaluated for every row
245+
df.percentileBy(25.0) { age } // DataRow where the 25th percentile of `age` lies (index rounded using R3)
242246
// SampleEnd
243247
}
244248

@@ -247,9 +251,9 @@ class Analyze : TestBase() {
247251
fun percentileAggregations() {
248252
// SampleStart
249253
df.percentile(25.0)
250-
df.age.percentile(25.0)
251-
df.groupBy { city }.percentile(25.0)
252-
df.pivot { city }.percentile(25.0)
254+
df.age.percentile(75.0)
255+
df.groupBy { city }.percentile(50.0)
256+
df.pivot { city }.percentile(75.0)
253257
df.pivot { city }.groupBy { name.lastName }.percentile(25.0)
254258
// SampleEnd
255259
}
@@ -259,8 +263,8 @@ class Analyze : TestBase() {
259263
fun meanModes() {
260264
// SampleStart
261265
df.mean() // mean of values per every numeric column
262-
df.mean(skipNaN = true) { age and weight } // mean of all values in `age` and `weight`, skips NA
263-
df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NA
266+
df.mean { age and weight } // mean of all values in `age` and `weight`
267+
df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NaN
264268
df.meanOf { (weight ?: 0) / age } // mean of expression evaluated for every row
265269
// SampleEnd
266270
}
@@ -283,7 +287,7 @@ class Analyze : TestBase() {
283287
// SampleStart
284288
df.std() // std of values per every numeric column
285289
df.std { age and weight } // std of all values in `age` and `weight`
286-
df.stdFor { age and weight } // std of values per `age` and `weight` separately, skips NA
290+
df.stdFor(skipNaN = true) { age and weight } // std of values per `age` and `weight` separately, skips NaN
287291
df.stdOf { (weight ?: 0) / age } // std of expression evaluated for every row
288292
// SampleEnd
289293
}

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/statistics/sum.kt

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import io.kotest.matchers.doubles.shouldBeNaN
55
import io.kotest.matchers.floats.shouldBeNaN
66
import io.kotest.matchers.shouldBe
77
import io.kotest.matchers.string.shouldContain
8+
import org.jetbrains.kotlinx.dataframe.DataColumn
9+
import org.jetbrains.kotlinx.dataframe.api.cast
810
import org.jetbrains.kotlinx.dataframe.api.columnOf
911
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
1012
import org.jetbrains.kotlinx.dataframe.api.isEmpty
@@ -14,7 +16,9 @@ import org.jetbrains.kotlinx.dataframe.api.sum
1416
import org.jetbrains.kotlinx.dataframe.api.sumFor
1517
import org.jetbrains.kotlinx.dataframe.api.sumOf
1618
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
19+
import org.jetbrains.kotlinx.dataframe.impl.nullableNothingType
1720
import org.junit.Test
21+
import kotlin.reflect.typeOf
1822

1923
class SumTests {
2024

@@ -46,6 +50,28 @@ class SumTests {
4650
df.sumOf { value() } shouldBe expected
4751
}
4852

53+
@Test
54+
fun `empty column with types`() {
55+
val emptyIntCol by columnOf<Int?>(null, null)
56+
emptyIntCol.sum() shouldBe 0
57+
58+
// empty column with Number type
59+
val emptyNumberColumn = DataColumn.createValueColumn<Number?>(
60+
"emptyNumberColumn",
61+
listOf(null, null),
62+
typeOf<Number?>(),
63+
)
64+
emptyNumberColumn.sum() shouldBe 0.0
65+
66+
// empty column with nullable Nothing type
67+
val emptyNothingColumn = DataColumn.createValueColumn(
68+
"emptyNothingColumn",
69+
listOf(null, null),
70+
nullableNothingType,
71+
)
72+
emptyNothingColumn.cast<Number?>().sum() shouldBe 0.0
73+
}
74+
4975
@Test
5076
fun `test multiple columns`() {
5177
val value1 by columnOf(1, 2, 3)

docs/StardustDocs/d.tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
<toc-element topic="DataRow.md"/>
3838
</toc-element>
3939
<toc-element topic="nanAndNa.md"/>
40+
<toc-element topic="numberUnification.md"/>
4041
<toc-element topic="operations.md"/>
4142
<toc-element toc-title="Operations">
4243
<toc-element topic="create.md">

docs/StardustDocs/topics/DataRow.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -83,21 +83,21 @@ Row condition signature: ```DataRow.(DataRow) -> Boolean```
8383
<snippet id="rowStatistics">
8484

8585
The following [statistics](summaryStatistics.md) are available for `DataRow`:
86-
* `rowMax`
87-
* `rowMin`
8886
* `rowSum`
8987
* `rowMean`
9088
* `rowStd`
91-
* `rowMedian`
9289

93-
These statistics will be applied only to values of appropriate types and incompatible values will be ignored.
94-
For example, if [`DataFrame`](DataFrame.md) has columns of type `String` and `Int`, `rowSum()` will successfully compute sum of `Int` values in a row and ignore `String` values.
90+
These statistics will be applied only to values of appropriate types, and incompatible values will be ignored.
91+
For example, if a [dataframe](DataFrame.md) has columns of types `String` and `Int`,
92+
`rowSum()` will compute the sum of the `Int` values in the row and ignore `String` values.
9593

96-
To apply statistics only to values of particular type use `-Of` versions:
97-
* `rowMaxOf<T>`
98-
* `rowMinOf<T>`
94+
To apply statistics only to values of a particular type use `-Of` versions:
9995
* `rowSumOf<T>`
10096
* `rowMeanOf<T>`
97+
* `rowStdOf<T>`
98+
* `rowMinOf<T>`
99+
* `rowMaxOf<T>`
101100
* `rowMedianOf<T>`
101+
* `rowPercentileOf<T>`
102102

103103
</snippet>
Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
[//]: # (title: Column statistics)
22

3-
// TODO
3+
Statistics on columns are described:
4+
- [here](summaryStatistics.md) for summary statistics, like [sum](sum.md) and [mean](mean.md)
5+
- [here](cumSum.md) for cumulative statistics, like [cumSum](cumSum.md)

docs/StardustDocs/topics/mean.md

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,28 @@
22

33
<!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Analyze-->
44

5-
Computes the mean of values.
5+
Computes the [mean (average)](https://en.wikipedia.org/wiki/Arithmetic_mean) of values.
66

7-
Is available for numeric columns. Computed value has type `Double`.
8-
Use `skipNA` flag to skip [`NA` values](nanAndNa.md#na) (`null` and `NaN`).
7+
`null` values are ignored.
8+
9+
All primitive numeric types are supported: `Byte`, `Short`, `Int`, `Long`, `Float`, and `Double`.
10+
11+
`mean` also supports the "mixed" `Number` type, as long as the column consists only of the aforementioned
12+
primitive numbers.
13+
The numbers are automatically converted to a [common type](numberUnification.md) for the operation.
14+
15+
The return type is always `Double`; `Double.NaN` for empty columns.
16+
17+
All operations on `Double`/`Float`/`Number` have the `skipNaN` option, which is
18+
set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
19+
When it's set to `true`, `NaN` values are ignored.
920

1021
<!---FUN meanModes-->
1122

1223
```kotlin
1324
df.mean() // mean of values per every numeric column
14-
df.mean(skipNaN = true) { age and weight } // mean of all values in `age` and `weight`, skips NA
15-
df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NA
25+
df.mean { age and weight } // mean of all values in `age` and `weight`
26+
df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NaN
1627
df.meanOf { (weight ?: 0) / age } // mean of expression evaluated for every row
1728
```
1829

@@ -31,3 +42,18 @@ df.pivot { city }.groupBy { name.lastName }.mean()
3142
<!---END-->
3243

3344
See [statistics](summaryStatistics.md#groupby-statistics) for details on complex data aggregations.
45+
46+
### Type Conversion
47+
48+
The following automatic type conversions are performed for the `mean` operation:
49+
50+
| Conversion | Result for Empty Input |
51+
|----------------------------------------------------------------------------|------------------------|
52+
| Int -> Double | Double.NaN |
53+
| Byte -> Double | Double.NaN |
54+
| Short -> Double | Double.NaN |
55+
| Long -> Double | Double.NaN |
56+
| Double -> Double | Double.NaN |
57+
| Float -> Double | Double.NaN |
58+
| Number -> Conversion([Common number type](numberUnification.md)) -> Double | Double.NaN |
59+
| Nothing -> Double | Double.NaN |

docs/StardustDocs/topics/median.md

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,37 @@
22

33
<!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Analyze-->
44

5-
Computes the median of values.
5+
Computes the [median](https://en.wikipedia.org/wiki/Median) of values.
66

7-
Is available for `Comparable` columns. [`NA` values](nanAndNa.md#na) (`null` and `NaN`) are ignored.
7+
This is also called the "middle" of a sorted list, the "50th [percentile](percentile.md)", or
8+
the 2-[quantile](https://en.wikipedia.org/wiki/Quantile).
9+
10+
`null` values in the input are ignored.
11+
The operations either throw an exception when the input is empty (after filtering `null` or `NaN` values),
12+
or they return `null` when using the `-OrNull` overloads.
13+
14+
All primitive numeric types are supported: `Byte`, `Short`, `Int`, `Long`, `Float`, and `Double`,
15+
but no mix of different number types.
16+
In these cases, the return type is always `Double?`.
17+
When the number of values is even, the median is the average of the two middle values.
18+
19+
The operation is also available for self-comparable columns
20+
(so columns of type `T : Comparable<T>`, like `DateTime`, `String`, etc.).
21+
In this case, the return type remains `T?`.
22+
When the number of values is even, the median is the lower of the two middle values.
23+
24+
All operations on `Double`/`Float` have the `skipNaN` option, which is
25+
set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
26+
When it's set to `true`, `NaN` values are ignored.
827

928
<!---FUN medianModes-->
1029

1130
```kotlin
1231
df.median() // median of values per every comparable column
1332
df.median { age and weight } // median of all values in `age` and `weight`
14-
df.medianFor { age and weight } // median of values per `age` and `weight` separately
33+
df.medianFor(skipNaN = true) { age and weight } // median of values per `age` and `weight` separately
1534
df.medianOf { (weight ?: 0) / age } // median of expression evaluated for every row
35+
df.medianBy { age } // DataRow where the median age lies (lower-median for an even number of values)
1636
```
1737

1838
<!---END-->
@@ -30,3 +50,19 @@ df.pivot { city }.groupBy { name.lastName }.median()
3050
<!---END-->
3151

3252
See [statistics](summaryStatistics.md#groupby-statistics) for details on complex data aggregations.
53+
54+
### Type Conversion
55+
56+
The following automatic type conversions are performed for the `median` operation.
57+
(Note that `null` only appears in the return type when using `-OrNull` overloads).
58+
59+
| Conversion | Result for Empty Input |
60+
|--------------------------------|------------------------|
61+
| T -> T where T : Comparable<T> | null |
62+
| Int -> Double | null |
63+
| Byte -> Double | null |
64+
| Short -> Double | null |
65+
| Long -> Double | null |
66+
| Double -> Double | null |
67+
| Float -> Double | null |
68+
| Nothing -> Nothing | null |

docs/StardustDocs/topics/minmax.md

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,26 @@
22

33
<!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Analyze-->
44

5-
Computes the minimum / maximum of values.
5+
Computes the [minimum / maximum](https://en.wikipedia.org/wiki/Maximum_and_minimum) of values.
66

7-
Is available for [`Comparable`](https://kotlinlang.org/api/latest/jvm/stdlib/kotlin/-comparable/) columns.
8-
[`NA` values](nanAndNa.md#na) (`null` and `NaN`) are ignored.
7+
`null` values in the input are ignored.
8+
The operations either throw an exception when the input is empty (after filtering `null` or `NaN` values),
9+
or they return `null` when using the `-OrNull` overloads.
10+
11+
They are available for self-comparable columns
12+
(so columns of type `T : Comparable<T>`, like `DateTime`, `String`, `Int`, etc.)
13+
which includes all primitive number columns, but no mix of different number types.
14+
15+
All operations on `Double`/`Float` have the `skipNaN` option, which is
16+
set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
17+
When it's set to `true`, `NaN` values are ignored.
918

1019
<!---FUN minmaxModes-->
1120

1221
```kotlin
1322
df.min() // min of values per every comparable column
1423
df.min { age and weight } // min of all values in `age` and `weight`
15-
df.minFor { age and weight } // min of values per `age` and `weight` separately
24+
df.minFor(skipNaN = true) { age and weight } // min of values per `age` and `weight` separately
1625
df.minOf { (weight ?: 0) / age } // min of expression evaluated for every row
1726
df.minBy { age } // DataRow with minimal `age`
1827
```
@@ -32,3 +41,19 @@ df.pivot { city }.groupBy { name.lastName }.min()
3241
<!---END-->
3342

3443
See [statistics](summaryStatistics.md#groupby-statistics) for details on complex data aggregations.
44+
45+
### Type Conversion
46+
47+
The following automatic type conversions are performed for the `min` and `max` operations.
48+
(Note that `null` only appears in the return type when using `-OrNull` overloads).
49+
50+
| Conversion | Result for Empty Input |
51+
|--------------------------------|------------------------|
52+
| T -> T where T : Comparable<T> | null |
53+
| Int -> Int | null |
54+
| Byte -> Byte | null |
55+
| Short -> Short | null |
56+
| Long -> Long | null |
57+
| Double -> Double | null |
58+
| Float -> Float | null |
59+
| Nothing -> Nothing | null |
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[//]: # (title: Number Unification)
2+
3+
// TODO

0 commit comments

Comments
 (0)