Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ import kotlin.experimental.ExperimentalTypeInference
import kotlin.reflect.KProperty

/* TODO KDocs
* numbers -> Double or null
* primitive numbers -> Double or null
* comparable -> itself or null
*
* Careful! Non-primitive numbers (like BigDecimal or BigInteger) thus follow the comparable rules: they are not interpolated
*
* TODO cases where the lambda dictates the return type require explicit type arguments for
* non-number, comparable overloads: https://youtrack.jetbrains.com/issue/KT-76683
* so, `df.median { intCol }` works, but needs `df.median<_, String> { stringCol }` or `df.median({ dateCol })`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ import kotlin.experimental.ExperimentalTypeInference
import kotlin.reflect.KProperty

/* TODO KDocs
* numbers -> Double or null
* primitive numbers -> Double or null
* comparable -> itself or null
*
* Careful! Non-primitive numbers (like BigDecimal or BigInteger) thus follow the comparable rules: they are not interpolated
*
* TODO cases where the lambda dictates the return type require explicit type arguments for
* non-number, comparable overloads: https://youtrack.jetbrains.com/issue/KT-76683
* so, `df.percentile { intCol }` works, but needs `df.percentile<_, String> { stringCol }` or `df.percentile({ dateCol })`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ import org.jetbrains.kotlinx.dataframe.impl.isIntraComparable
import org.jetbrains.kotlinx.dataframe.impl.isPrimitiveNumber
import org.jetbrains.kotlinx.dataframe.impl.nothingType
import org.jetbrains.kotlinx.dataframe.impl.renderType
import java.math.BigDecimal
import java.math.BigInteger
import kotlin.math.round
import kotlin.reflect.KType
import kotlin.reflect.full.withNullability
Expand Down Expand Up @@ -40,11 +38,6 @@ internal fun <T : Comparable<T>> Sequence<T>.medianOrNull(type: KType, skipNaN:
}. Only primitive numbers or self-comparables are supported.",
)

type == typeOf<BigDecimal>() || type == typeOf<BigInteger>() ->
throw IllegalArgumentException(
"Cannot calculate the median for big numbers in DataFrame. Only primitive numbers are supported.",
)

// TODO kdocs: note about loss of precision for Long
}

Expand Down Expand Up @@ -107,11 +100,6 @@ internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfMedian(type: KType, s
renderType(type)
}. Only primitive numbers or self-comparables are supported.",
)

nonNullType == typeOf<BigDecimal>() || nonNullType == typeOf<BigInteger>() ->
throw IllegalArgumentException(
"Cannot calculate the median for big numbers in DataFrame. Only primitive numbers are supported.",
)
}

// propagate NaN to return if they are not to be skipped
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ import org.jetbrains.kotlinx.dataframe.impl.isIntraComparable
import org.jetbrains.kotlinx.dataframe.impl.isPrimitiveNumber
import org.jetbrains.kotlinx.dataframe.impl.nothingType
import org.jetbrains.kotlinx.dataframe.impl.renderType
import java.math.BigDecimal
import java.math.BigInteger
import kotlin.math.round
import kotlin.reflect.KType
import kotlin.reflect.full.withNullability
Expand All @@ -33,11 +31,6 @@ internal fun <T : Comparable<T>> Sequence<T>.percentileOrNull(percentile: Double
}. Only primitive numbers or self-comparables are supported.",
)

type == typeOf<BigDecimal>() || type == typeOf<BigInteger>() ->
throw IllegalArgumentException(
"Cannot calculate the percentile for big numbers in DataFrame. Only primitive numbers are supported.",
)

// TODO kdocs: note about loss of precision for Long
}

Expand Down Expand Up @@ -99,11 +92,6 @@ internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfPercentile(
renderType(type)
}. Only primitive numbers or self-comparables are supported.",
)

nonNullType == typeOf<BigDecimal>() || nonNullType == typeOf<BigInteger>() ->
throw IllegalArgumentException(
"Cannot calculate the percentile for big numbers in DataFrame. Only primitive numbers are supported.",
)
}

val indexedSequence = this.mapIndexedNotNull { i, it ->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ import org.jetbrains.kotlinx.dataframe.impl.isIntraComparable
import org.jetbrains.kotlinx.dataframe.impl.isPrimitiveNumber
import org.jetbrains.kotlinx.dataframe.impl.nothingType
import org.jetbrains.kotlinx.dataframe.impl.renderType
import java.math.BigDecimal
import java.math.BigInteger
import kotlin.math.ceil
import kotlin.math.floor
import kotlin.math.round
Expand Down Expand Up @@ -52,11 +50,6 @@ internal fun <T : Comparable<T>> Sequence<Any>.quantileOrNull(
renderType(type)
}. Only primitive numbers or self-comparables are supported.",
)

type == typeOf<BigDecimal>() || type == typeOf<BigInteger>() ->
throw IllegalArgumentException(
"Cannot calculate the $name for big numbers in DataFrame. Only primitive numbers are supported.",
)
}

// propagate NaN to return if they are not to be skipped
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,23 +251,25 @@ class MedianTests {
@[Test Suppress("ktlint:standard:argument-list-wrapping")]
fun `dataframe median`() {
val df = dataFrameOf(
"a", "b", "c",
"a", "b", "c", "d",
)(
1, 2f, 3.0,
4, 5f, 6.0,
7, 8f, 9.0,
1, 2f, 3.0, 1.toBigInteger(),
4, 5f, 6.0, 2.toBigInteger(),
7, 8f, 9.0, 4.toBigInteger(),
)

// Get row with median values for each column
val medians = df.median()
medians["a"] shouldBe 4
medians["b"] shouldBe 5f
medians["a"] shouldBe 4.0
medians["b"] shouldBe 5.0
medians["c"] shouldBe 6.0
medians["d"] shouldBe 2.toBigInteger() // not interpolated!

// Test median for specific columns
val medianFor = df.medianFor("a", "c")
medianFor["a"] shouldBe 4
val medianFor = df.medianFor("a", "c", "d")
medianFor["a"] shouldBe 4.0
medianFor["c"] shouldBe 6.0
medianFor["d"] shouldBe 2.toBigInteger() // not interpolated!
}

@[Test Suppress("ktlint:standard:argument-list-wrapping")]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,33 +225,37 @@ class PercentileTests {
@[Test Suppress("ktlint:standard:argument-list-wrapping")]
fun `dataframe percentile`() {
val df = dataFrameOf(
"a", "b", "c",
"a", "b", "c", "d",
)(
1, 2f, 3.0,
4, 5f, 6.0,
7, 8f, 9.0,
1, 2f, 3.0, 1.toBigInteger(),
4, 5f, 6.0, 2.toBigInteger(),
7, 8f, 9.0, 4.toBigInteger(),
)

// Get row with percentile values for each column
val percentiles50 = df.percentile(50.0)
percentiles50["a"] shouldBe 4
percentiles50["b"] shouldBe 5f
percentiles50["a"] shouldBe 4.0
percentiles50["b"] shouldBe 5.0
percentiles50["c"] shouldBe 6.0
percentiles50["d"] shouldBe 2.toBigInteger() // not interpolated!

val percentiles25 = df.percentile(25.0)
percentiles25["a"] shouldBe 1.5000000000000002
percentiles25["b"] shouldBe 2.5f
percentiles25["b"] shouldBe 2.5
percentiles25["c"] shouldBe 3.5
percentiles25["d"] shouldBe 1.toBigInteger() // not interpolated!

val percentiles75 = df.percentile(75.0)
percentiles75["a"] shouldBe 6.5
percentiles75["b"] shouldBe 7.5f
percentiles75["b"] shouldBe 7.5
percentiles75["c"] shouldBe 8.5
percentiles75["d"] shouldBe 2.toBigInteger() // not interpolated!

// Test percentile for specific columns
val percentileFor50 = df.percentileFor(50.0, "a", "c")
percentileFor50["a"] shouldBe 4
val percentileFor50 = df.percentileFor(50.0, "a", "c", "d")
percentileFor50["a"] shouldBe 4.0
percentileFor50["c"] shouldBe 6.0
percentileFor50["d"] shouldBe 2.toBigInteger() // not interpolated!
}

@[Test Suppress("ktlint:standard:argument-list-wrapping")]
Expand Down
2 changes: 2 additions & 0 deletions docs/StardustDocs/topics/median.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ The operation is also available for self-comparable columns
(so columns of type `T : Comparable<T>`, like `DateTime`, `String`, etc.)
In this case, the return type remains `T?`.
When the number of values is even, the median is the lower of the two middle values.
NOTE: This logic also applies to non-primitive, self-comparable `Number` types, like `BigDecimal` and `BigInteger`.
Their result is always one of the input values; it is never interpolated.

All operations on `Double`/`Float` have the `skipNaN` option, which is
set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
Expand Down
2 changes: 2 additions & 0 deletions docs/StardustDocs/topics/percentile.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ The operation is also available for self-comparable columns
In this case, the return type remains `T?`.
The index of the result of the operation on these types is rounded using
[Quantile Estimation Method](#quantile-estimation-methods) R3.
NOTE: This logic also applies to non-primitive, self-comparable `Number` types, like `BigDecimal` and `BigInteger`.
Their result is always one of the input values; it is never interpolated.

All operations on `Double`/`Float` have the `skipNaN` option, which is
set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
Expand Down