|
1 | 1 | package org.jetbrains.kotlinx.dataframe.math
|
2 | 2 |
|
| 3 | +import io.github.oshai.kotlinlogging.KotlinLogging |
| 4 | +import org.jetbrains.kotlinx.dataframe.api.isNaN |
| 5 | +import org.jetbrains.kotlinx.dataframe.impl.aggregation.aggregators.CalculateReturnType |
| 6 | +import org.jetbrains.kotlinx.dataframe.impl.canBeNaN |
| 7 | +import org.jetbrains.kotlinx.dataframe.impl.isIntraComparable |
| 8 | +import org.jetbrains.kotlinx.dataframe.impl.isPrimitiveNumber |
| 9 | +import org.jetbrains.kotlinx.dataframe.impl.nothingType |
| 10 | +import org.jetbrains.kotlinx.dataframe.impl.renderType |
| 11 | +import org.jetbrains.kotlinx.dataframe.math.quickSelect |
| 12 | +import java.math.BigDecimal |
| 13 | +import java.math.BigInteger |
3 | 14 | import kotlin.reflect.KType
|
| 15 | +import kotlin.reflect.full.withNullability |
4 | 16 | import kotlin.reflect.typeOf
|
5 | 17 |
|
| 18 | +private val logger = KotlinLogging.logger { } |
| 19 | + |
6 | 20 | // TODO median always returns the same type, but this can be confusing for iterables of even length
|
7 | 21 | // TODO (e.g. median of [1, 2] should be 1.5, but the type is Int, so it returns 1), Issue #558
|
| 22 | + |
/**
 * Returns the median of the comparable input:
 * - `null` if [type] is `Nothing` (the sequence is empty)
 * - `null` if empty and not a primitive number
 * - `Double.NaN` if empty and primitive number (this includes input that only contains NaN)
 * - `Double` if primitive number
 * - `Double.NaN` if ![skipNaN] and contains NaN
 * - (lower) middle else
 *
 * For an even number of primitive numbers the result is the mean of the lower and upper middle values.
 * `Long` values are converted to `Double` for this, which is logged as a warning.
 *
 * TODO migrate back to percentile when it's flexible enough
 *
 * @param type the non-nullable runtime type of the elements of this sequence
 * @param skipNaN whether NaN values are ignored (`true`) or propagated to the result (`false`)
 * @throws IllegalStateException if [type] is marked nullable or is not comparable to itself
 * @throws IllegalArgumentException if [type] is [BigDecimal] or [BigInteger]
 */
@PublishedApi
internal fun <T : Comparable<T>> Sequence<T>.medianOrNull(type: KType, skipNaN: Boolean): Any? {
    when {
        type.isMarkedNullable ->
            error("Encountered nullable type ${renderType(type)} in median function. This should not occur.")

        !type.isIntraComparable() ->
            error(
                "Unable to compute the median for ${
                    renderType(type)
                }. Only primitive numbers or self-comparables are supported.",
            )

        type == typeOf<BigDecimal>() || type == typeOf<BigInteger>() ->
            throw IllegalArgumentException(
                "Cannot calculate the median for big numbers in DataFrame. Only primitive numbers are supported.",
            )

        type == typeOf<Long>() ->
            logger.warn { "Converting Longs to Doubles to calculate the median, loss of precision may occur." }

        // this means the sequence is empty
        type == nothingType -> return null
    }

    // propagate NaN to return if they are not to be skipped
    if (type.canBeNaN && !skipNaN && any { it.isNaN }) return Double.NaN

    // any NaN still present at this point must be skipped
    val list = when {
        type.canBeNaN -> filter { !it.isNaN }
        else -> this
    }.toList()

    val size = list.size
    if (size == 0) return if (type.isPrimitiveNumber()) Double.NaN else null

    val isOdd = size % 2 != 0

    // 0-based rank of the (lower) middle element
    // NOTE(review): assumes quickSelect(k) returns the k-th smallest element, 0-based — confirm
    val middleIndex = (size - 1) / 2
    val lower = list.quickSelect(middleIndex)

    // An odd-sized input has exactly one middle element. Selecting rank `middleIndex + 1` here
    // would pick the element *after* the median (and would be out of range for a single element).
    if (isOdd) {
        return if (type.isPrimitiveNumber()) (lower as Number).toDouble() else lower
    }

    // even-sized input: the upper middle is the next rank
    val upper = list.quickSelect(middleIndex + 1)

    return when {
        type == typeOf<Double>() -> (lower as Double + upper as Double) / 2.0
        type == typeOf<Float>() -> ((lower as Float).toDouble() + (upper as Float).toDouble()) / 2.0
        type == typeOf<Int>() -> ((lower as Int).toDouble() + (upper as Int).toDouble()) / 2.0
        type == typeOf<Short>() -> ((lower as Short).toDouble() + (upper as Short).toDouble()) / 2.0
        type == typeOf<Byte>() -> ((lower as Byte).toDouble() + (upper as Byte).toDouble()) / 2.0
        type == typeOf<Long>() -> ((lower as Long).toDouble() + (upper as Long).toDouble()) / 2.0
        // non-number comparables cannot be averaged, so the lower middle is preferred
        else -> lower
    }
}
| 92 | + |
/**
 * Calculates the return type of the median for a given input type:
 * - primitive number -> `Double`
 * - `T : Comparable<T>` -> `T`, made nullable when the input is empty
 *
 * Any other type results in an [IllegalStateException].
 */
internal val medianConversion: CalculateReturnType = { type, isEmpty ->
    if (type.isPrimitiveNumber()) {
        // uses linear interpolation, number 7 of Hyndman and Fan "Sample quantiles in statistical packages"
        typeOf<Double>()
    } else if (type.isIntraComparable()) {
        // closest rank method, preferring lower middle,
        // number 3 of Hyndman and Fan "Sample quantiles in statistical packages"
        type.withNullability(isEmpty)
    } else {
        error("Can not calculate median for type ${renderType(type)}")
    }
}
| 109 | + |
/**
 * Returns the index of the median of the comparable input:
 * - `-1` if empty, all `null`, or nothing is left after dropping `null` and NaN values
 * - index of first NaN if ![skipNaN] and contains NaN
 * - index (lower) middle else
 *
 * The returned index refers to the position in the original sequence, `null` and NaN values included.
 *
 * NOTE: For primitive numbers the `seq.elementAt(seq.indexOfMedian())` might be different from `seq.medianOrNull()`
 *
 * TODO migrate back to percentile when it's flexible enough
 *
 * @param type the runtime type of the elements of this sequence; its nullability is ignored
 * @param skipNaN whether NaN values are ignored (`true`) or the index of the first NaN is returned (`false`)
 * @throws IllegalStateException if [type] is not comparable to itself
 * @throws IllegalArgumentException if [type] is [BigDecimal] or [BigInteger]
 */
internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfMedian(type: KType, skipNaN: Boolean): Int {
    val nonNullType = type.withNullability(false)
    when {
        !nonNullType.isIntraComparable() ->
            error(
                "Unable to compute the median for ${
                    renderType(type)
                }. Only primitive numbers or self-comparables are supported.",
            )

        nonNullType == typeOf<BigDecimal>() || nonNullType == typeOf<BigInteger>() ->
            throw IllegalArgumentException(
                "Cannot calculate the median for big numbers in DataFrame. Only primitive numbers are supported.",
            )

        // this means the sequence is empty
        nonNullType == nothingType -> return -1
    }

    // propagate NaN to return if they are not to be skipped
    if (nonNullType.canBeNaN && !skipNaN) {
        for ((i, it) in this.withIndex()) {
            if (it.isNaN) return i
        }
    }

    // remember the original position of each non-null value so it survives the selection
    val indexedSequence = this.mapIndexedNotNull { i, it ->
        if (it == null) {
            null
        } else {
            IndexedComparable(i, it)
        }
    }
    // any NaN still present at this point must be skipped
    val list = when {
        nonNullType.canBeNaN -> indexedSequence.filterNot { it.value.isNaN }
        else -> indexedSequence
    }.toList()

    val size = list.size
    if (size == 0) return -1

    // 0-based rank of the (lower) middle element; only this element is needed for the index.
    // Selecting rank `middleIndex + 1` as well would pick the element *after* the median for
    // odd-sized input (and would be out of range for a single element).
    // NOTE(review): assumes quickSelect(k) returns the k-th smallest element, 0-based — confirm
    val middleIndex = (size - 1) / 2
    return list.quickSelect(middleIndex).index
}
| 173 | + |
/**
 * Pairs a [value] with the [index] it was found at.
 *
 * Ordering is determined by [value] only; [index] is ignored by [compareTo]
 * (but it does take part in the generated `equals`/`hashCode`).
 */
private data class IndexedComparable<T : Comparable<T>>(val index: Int, val value: T) :
    Comparable<IndexedComparable<T>> {
    override fun compareTo(other: IndexedComparable<T>): Int {
        val otherValue = other.value
        return value.compareTo(otherValue)
    }
}
0 commit comments