Merge pull request #1165 from Kotlin/statistics-docs

Jolanrensen · web-flow · commit 3c98b9a8aecc · 2025-05-01T16:22:24.000+02:00
statistics documentation update
diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/NumberTypeUtils.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/NumberTypeUtils.kt
@@ -6,6 +6,7 @@ import java.math.BigDecimal
 import java.math.BigInteger
 import kotlin.reflect.KClass
 import kotlin.reflect.KType
+import kotlin.reflect.full.isSubtypeOf
 import kotlin.reflect.full.withNullability
 import kotlin.reflect.typeOf
 
@@ -224,10 +225,24 @@ internal fun Sequence<Number?>.convertToUnifiedNumberType(
                 "Cannot find unified number type of types: ${types.joinToString { renderType(it) }}",
             )
     }
-    val converter = createConverter(typeOf<Number>(), commonNumberType)!! as (Number) -> Number?
-    return map {
-        if (it == null) return@map null
-        converter(it) ?: error("Can not convert $it to $commonNumberType")
+    require(commonNumberType.isSubtypeOf(typeOf<Number?>())) {
+        "Cannot convert numbers to $commonNumberType; it is not a subtype of Number?"
+    }
+    return when (commonNumberType) {
+        nothingType -> {
+            require(null !in this) { "Cannot unify numbers to Nothing; it contains nulls" }
+            this
+        }
+
+        nullableNothingType -> this
+
+        else -> {
+            val converter = createConverter(typeOf<Number>(), commonNumberType)!! as (Number) -> Number?
+            this.map {
+                if (it == null) return@map null
+                converter(it) ?: error("Can not convert $it to $commonNumberType")
+            }
+        }
     }
 }
 
diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Analyze.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Analyze.kt
@@ -35,6 +35,7 @@ import org.jetbrains.kotlinx.dataframe.api.mean
 import org.jetbrains.kotlinx.dataframe.api.meanFor
 import org.jetbrains.kotlinx.dataframe.api.meanOf
 import org.jetbrains.kotlinx.dataframe.api.median
+import org.jetbrains.kotlinx.dataframe.api.medianBy
 import org.jetbrains.kotlinx.dataframe.api.medianFor
 import org.jetbrains.kotlinx.dataframe.api.medianOf
 import org.jetbrains.kotlinx.dataframe.api.min
@@ -43,6 +44,7 @@ import org.jetbrains.kotlinx.dataframe.api.minFor
 import org.jetbrains.kotlinx.dataframe.api.minOf
 import org.jetbrains.kotlinx.dataframe.api.minOrNull
 import org.jetbrains.kotlinx.dataframe.api.percentile
+import org.jetbrains.kotlinx.dataframe.api.percentileBy
 import org.jetbrains.kotlinx.dataframe.api.percentileFor
 import org.jetbrains.kotlinx.dataframe.api.percentileOf
 import org.jetbrains.kotlinx.dataframe.api.pivot
@@ -179,7 +181,7 @@ class Analyze : TestBase() {
         // SampleStart
         df.sum() // sum of values per every numeric column
         df.sum { age and weight } // sum of all values in `age` and `weight`
-        df.sumFor { age and weight } // sum of values per `age` and `weight` separately
+        df.sumFor(skipNaN = true) { age and weight } // sum of values per `age` and `weight` separately
         df.sumOf { (weight ?: 0) / age } // sum of expression evaluated for every row
         // SampleEnd
     }
@@ -190,7 +192,7 @@ class Analyze : TestBase() {
         // SampleStart
         df.min() // min of values per every comparable column
         df.min { age and weight } // min of all values in `age` and `weight`
-        df.minFor { age and weight } // min of values per `age` and `weight` separately
+        df.minFor(skipNaN = true) { age and weight } // min of values per `age` and `weight` separately
         df.minOf { (weight ?: 0) / age } // min of expression evaluated for every row
         df.minBy { age } // DataRow with minimal `age`
         // SampleEnd
@@ -214,8 +216,9 @@ class Analyze : TestBase() {
         // SampleStart
         df.median() // median of values per every comparable column
         df.median { age and weight } // median of all values in `age` and `weight`
-        df.medianFor { age and weight } // median of values per `age` and `weight` separately
+        df.medianFor(skipNaN = true) { age and weight } // median of values per `age` and `weight` separately
         df.medianOf { (weight ?: 0) / age } // median of expression evaluated for every row
+        df.medianBy { age } // DataRow where the median age lies (lower-median for an even number of values)
         // SampleEnd
     }
 
@@ -235,10 +238,11 @@ class Analyze : TestBase() {
     @TransformDataFrameExpressions
     fun percentileModes() {
         // SampleStart
-        df.percentile(25.0) // percentile of values per every comparable column
-        df.percentile(25.0) { age and weight } // percentile of all values in `age` and `weight`
-        df.percentileFor(25.0) { age and weight } // percentile of values per `age` and `weight` separately
-        df.percentileOf(25.0) { (weight ?: 0) / age } // percentile of expression evaluated for every row
+        df.percentile(25.0) // 25th percentile of values per every comparable column
+        df.percentile(75.0) { age and weight } // 75th percentile of all values in `age` and `weight`
+        df.percentileFor(50.0, skipNaN = true) { age and weight } // 50th percentile of values per `age` and `weight` separately
+        df.percentileOf(75.0) { (weight ?: 0) / age } // 75th percentile of expression evaluated for every row
+        df.percentileBy(25.0) { age } // DataRow where the 25th percentile of `age` lies (index rounded using R3)
         // SampleEnd
     }
 
@@ -247,9 +251,9 @@ class Analyze : TestBase() {
     fun percentileAggregations() {
         // SampleStart
         df.percentile(25.0)
-        df.age.percentile(25.0)
-        df.groupBy { city }.percentile(25.0)
-        df.pivot { city }.percentile(25.0)
+        df.age.percentile(75.0)
+        df.groupBy { city }.percentile(50.0)
+        df.pivot { city }.percentile(75.0)
         df.pivot { city }.groupBy { name.lastName }.percentile(25.0)
         // SampleEnd
     }
@@ -259,8 +263,8 @@ class Analyze : TestBase() {
     fun meanModes() {
         // SampleStart
         df.mean() // mean of values per every numeric column
-        df.mean(skipNaN = true) { age and weight } // mean of all values in `age` and `weight`, skips NA
-        df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NA
+        df.mean { age and weight } // mean of all values in `age` and `weight`
+        df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NaN
         df.meanOf { (weight ?: 0) / age } // mean of expression evaluated for every row
         // SampleEnd
     }
@@ -283,7 +287,7 @@ class Analyze : TestBase() {
         // SampleStart
         df.std() // std of values per every numeric column
         df.std { age and weight } // std of all values in `age` and `weight`
-        df.stdFor { age and weight } // std of values per `age` and `weight` separately, skips NA
+        df.stdFor(skipNaN = true) { age and weight } // std of values per `age` and `weight` separately, skips NaN
         df.stdOf { (weight ?: 0) / age } // std of expression evaluated for every row
         // SampleEnd
     }
diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/statistics/sum.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/statistics/sum.kt
@@ -5,6 +5,8 @@ import io.kotest.matchers.doubles.shouldBeNaN
 import io.kotest.matchers.floats.shouldBeNaN
 import io.kotest.matchers.shouldBe
 import io.kotest.matchers.string.shouldContain
+import org.jetbrains.kotlinx.dataframe.DataColumn
+import org.jetbrains.kotlinx.dataframe.api.cast
 import org.jetbrains.kotlinx.dataframe.api.columnOf
 import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
 import org.jetbrains.kotlinx.dataframe.api.isEmpty
@@ -14,7 +16,9 @@ import org.jetbrains.kotlinx.dataframe.api.sum
 import org.jetbrains.kotlinx.dataframe.api.sumFor
 import org.jetbrains.kotlinx.dataframe.api.sumOf
 import org.jetbrains.kotlinx.dataframe.api.toDataFrame
+import org.jetbrains.kotlinx.dataframe.impl.nullableNothingType
 import org.junit.Test
+import kotlin.reflect.typeOf
 
 class SumTests {
 
@@ -46,6 +50,28 @@ class SumTests {
         df.sumOf { value() } shouldBe expected
     }
 
+    @Test
+    fun `empty column with types`() {
+        val emptyIntCol by columnOf<Int?>(null, null)
+        emptyIntCol.sum() shouldBe 0
+
+        // empty column with Number type
+        val emptyNumberColumn = DataColumn.createValueColumn<Number?>(
+            "emptyNumberColumn",
+            listOf(null, null),
+            typeOf<Number?>(),
+        )
+        emptyNumberColumn.sum() shouldBe 0.0
+
+        // empty column with nullable Nothing type
+        val emptyNothingColumn = DataColumn.createValueColumn(
+            "emptyNothingColumn",
+            listOf(null, null),
+            nullableNothingType,
+        )
+        emptyNothingColumn.cast<Number?>().sum() shouldBe 0.0
+    }
+
     @Test
     fun `test multiple columns`() {
         val value1 by columnOf(1, 2, 3)
diff --git a/docs/StardustDocs/d.tree b/docs/StardustDocs/d.tree
@@ -37,6 +37,7 @@
         <toc-element topic="DataRow.md"/>
     </toc-element>
     <toc-element topic="nanAndNa.md"/>
+    <toc-element topic="numberUnification.md"/>
     <toc-element topic="operations.md"/>
     <toc-element toc-title="Operations">
         <toc-element topic="create.md">
diff --git a/docs/StardustDocs/topics/DataRow.md b/docs/StardustDocs/topics/DataRow.md
@@ -83,21 +83,21 @@ Row condition signature: ```DataRow.(DataRow) -> Boolean```
 <snippet id="rowStatistics">
 
 The following [statistics](summaryStatistics.md) are available for `DataRow`:
-* `rowMax`
-* `rowMin`
 * `rowSum`
 * `rowMean`
 * `rowStd`
-* `rowMedian`
 
-These statistics will be applied only to values of appropriate types and incompatible values will be ignored.
-For example, if [`DataFrame`](DataFrame.md) has columns of type `String` and `Int`, `rowSum()` will successfully compute sum of `Int` values in a row and ignore `String` values.
+These statistics will be applied only to values of appropriate types, and incompatible values will be ignored.
+For example, if a [dataframe](DataFrame.md) has columns of types `String` and `Int`,
+`rowSum()` will compute the sum of the `Int` values in the row and ignore `String` values.
 
-To apply statistics only to values of particular type use `-Of` versions:
-* `rowMaxOf<T>`
-* `rowMinOf<T>`
+To apply statistics only to values of a particular type, use the `-Of` versions:
 * `rowSumOf<T>`
 * `rowMeanOf<T>`
+* `rowStdOf<T>`
+* `rowMinOf<T>`
+* `rowMaxOf<T>`
 * `rowMedianOf<T>`
+* `rowPercentileOf<T>`
 
 </snippet>
diff --git a/docs/StardustDocs/topics/columnStatistics.md b/docs/StardustDocs/topics/columnStatistics.md
@@ -1,3 +1,5 @@
 [//]: # (title: Column statistics)
 
-// TODO
+Statistics on columns are described:
+- [here](summaryStatistics.md) for summary statistics, like [sum](sum.md) and [mean](mean.md)
+- [here](cumSum.md) for cumulative statistics, like [cumSum](cumSum.md)
diff --git a/docs/StardustDocs/topics/mean.md b/docs/StardustDocs/topics/mean.md
@@ -2,17 +2,28 @@
 
 <!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Analyze-->
 
-Computes the mean of values.
+Computes the [mean (average)](https://en.wikipedia.org/wiki/Arithmetic_mean) of values.
 
-Is available for numeric columns. Computed value has type `Double`.
-Use `skipNA` flag to skip [`NA` values](nanAndNa.md#na) (`null` and `NaN`).
+`null` values are ignored.
+
+All primitive numeric types are supported: `Byte`, `Short`, `Int`, `Long`, `Float`, and `Double`.
+
+`mean` also supports the "mixed" `Number` type, as long as the column consists only of the aforementioned
+primitive numbers.
+The numbers are automatically converted to a [common type](numberUnification.md) for the operation.
+
+The return type is always `Double`; `Double.NaN` for empty columns.
+
+All operations on `Double`/`Float`/`Number` have the `skipNaN` option, which is
+set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
+When it's set to `true`, `NaN` values are ignored.
 
 <!---FUN meanModes-->
 
 ```kotlin
 df.mean() // mean of values per every numeric column
-df.mean(skipNaN = true) { age and weight } // mean of all values in `age` and `weight`, skips NA
-df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NA
+df.mean { age and weight } // mean of all values in `age` and `weight`
+df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NaN
 df.meanOf { (weight ?: 0) / age } // mean of expression evaluated for every row
 ```
 
@@ -31,3 +42,18 @@ df.pivot { city }.groupBy { name.lastName }.mean()
 <!---END-->
 
 See [statistics](summaryStatistics.md#groupby-statistics) for details on complex data aggregations.
+
+### Type Conversion
+
+The following automatic type conversions are performed for the `mean` operation:
+
+| Conversion                                                                 | Result for Empty Input |
+|----------------------------------------------------------------------------|------------------------|
+| Int -> Double                                                              | Double.NaN             |
+| Byte -> Double                                                             | Double.NaN             |
+| Short -> Double                                                            | Double.NaN             |
+| Long -> Double                                                             | Double.NaN             |
+| Double -> Double                                                           | Double.NaN             |
+| Float -> Double                                                            | Double.NaN             |
+| Number -> Conversion([Common number type](numberUnification.md)) -> Double | Double.NaN             |
+| Nothing -> Double                                                          | Double.NaN             |
diff --git a/docs/StardustDocs/topics/median.md b/docs/StardustDocs/topics/median.md
@@ -2,17 +2,37 @@
 
 <!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Analyze-->
 
-Computes the median of values.
+Computes the [median](https://en.wikipedia.org/wiki/Median) of values.
 
-Is available for `Comparable` columns. [`NA` values](nanAndNa.md#na) (`null` and `NaN`) are ignored.
+This is also called the "middle" of a sorted list, the "50th [percentile](percentile.md)", or
+the 2-[quantile](https://en.wikipedia.org/wiki/Quantile).
+
+`null` values in the input are ignored.
+The operations either throw an exception when the input is empty (after filtering out `null` or `NaN` values),
+or they return `null` when using the `-OrNull` overloads.
+
+All primitive numeric types are supported: `Byte`, `Short`, `Int`, `Long`, `Float`, and `Double`,
+but no mix of different number types.
+In these cases, the return type is always `Double?`.
+When the number of values is even, the median is the average of the two middle values.
+
+The operation is also available for self-comparable columns
+(so columns of type `T : Comparable<T>`, like `DateTime`, `String`, etc.).
+In this case, the return type remains `T?`.
+When the number of values is even, the median is the lower of the two middle values.
+
+All operations on `Double`/`Float` have the `skipNaN` option, which is
+set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
+When it's set to `true`, `NaN` values are ignored.
 
 <!---FUN medianModes-->
 
 ```kotlin
 df.median() // median of values per every comparable column
 df.median { age and weight } // median of all values in `age` and `weight`
-df.medianFor { age and weight } // median of values per `age` and `weight` separately
+df.medianFor(skipNaN = true) { age and weight } // median of values per `age` and `weight` separately
 df.medianOf { (weight ?: 0) / age } // median of expression evaluated for every row
+df.medianBy { age } // DataRow where the median age lies (lower-median for an even number of values)
 ```
 
 <!---END-->
@@ -30,3 +50,19 @@ df.pivot { city }.groupBy { name.lastName }.median()
 <!---END-->
 
 See [statistics](summaryStatistics.md#groupby-statistics) for details on complex data aggregations.
+
+### Type Conversion
+
+The following automatic type conversions are performed for the `median` operation.
+(Note that `null` only appears in the return type when using `-OrNull` overloads.)
+
+| Conversion                     | Result for Empty Input |
+|--------------------------------|------------------------|
+| T -> T where T : Comparable<T> | null                   |
+| Int -> Double                  | null                   |
+| Byte -> Double                 | null                   |
+| Short -> Double                | null                   |
+| Long -> Double                 | null                   |
+| Double -> Double               | null                   |
+| Float -> Double                | null                   |
+| Nothing -> Nothing             | null                   |
diff --git a/docs/StardustDocs/topics/minmax.md b/docs/StardustDocs/topics/minmax.md
@@ -2,17 +2,26 @@
 
 <!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Analyze-->
 
-Computes the minimum / maximum of values.
+Computes the [minimum / maximum](https://en.wikipedia.org/wiki/Maximum_and_minimum) of values.
 
-Is available for [`Comparable`](https://kotlinlang.org/api/latest/jvm/stdlib/kotlin/-comparable/) columns.
-[`NA` values](nanAndNa.md#na) (`null` and `NaN`) are ignored.
+`null` values in the input are ignored.
+The operations either throw an exception when the input is empty (after filtering out `null` or `NaN` values),
+or they return `null` when using the `-OrNull` overloads.
+
+They are available for self-comparable columns
+(so columns of type `T : Comparable<T>`, like `DateTime`, `String`, `Int`, etc.)
+which includes all primitive number columns, but no mix of different number types.
+
+All operations on `Double`/`Float` have the `skipNaN` option, which is
+set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
+When it's set to `true`, `NaN` values are ignored.
 
 <!---FUN minmaxModes-->
 
 ```kotlin
 df.min() // min of values per every comparable column
 df.min { age and weight } // min of all values in `age` and `weight`
-df.minFor { age and weight } // min of values per `age` and `weight` separately
+df.minFor(skipNaN = true) { age and weight } // min of values per `age` and `weight` separately
 df.minOf { (weight ?: 0) / age } // min of expression evaluated for every row
 df.minBy { age } // DataRow with minimal `age`
 ```
@@ -32,3 +41,19 @@ df.pivot { city }.groupBy { name.lastName }.min()
 <!---END-->
 
 See [statistics](summaryStatistics.md#groupby-statistics) for details on complex data aggregations.
+
+### Type Conversion
+
+The following automatic type conversions are performed for the `min` and `max` operations.
+(Note that `null` only appears in the return type when using `-OrNull` overloads.)
+
+| Conversion                     | Result for Empty Input |
+|--------------------------------|------------------------|
+| T -> T where T : Comparable<T> | null                   |
+| Int -> Int                     | null                   |
+| Byte -> Byte                   | null                   |
+| Short -> Short                 | null                   |
+| Long -> Long                   | null                   |
+| Double -> Double               | null                   |
+| Float -> Float                 | null                   |
+| Nothing -> Nothing             | null                   |
diff --git a/docs/StardustDocs/topics/numberUnification.md b/docs/StardustDocs/topics/numberUnification.md
@@ -0,0 +1,3 @@
+[//]: # (title: Number Unification)
+
+// TODO
diff --git a/docs/StardustDocs/topics/percentile.md b/docs/StardustDocs/topics/percentile.md
diff --git a/docs/StardustDocs/topics/std.md b/docs/StardustDocs/topics/std.md
diff --git a/docs/StardustDocs/topics/sum.md b/docs/StardustDocs/topics/sum.md
diff --git a/docs/StardustDocs/topics/summaryStatistics.md b/docs/StardustDocs/topics/summaryStatistics.md
diff --git a/docs/StardustDocs/topics/valueCounts.md b/docs/StardustDocs/topics/valueCounts.md

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+[//]: # (title: Number Unification)`
	`2`	`+`
	`3`	`+// TODO`