Skip to content

Commit 1e98ec9

Browse files
Merge pull request #1275 from Kotlin/corr_kdocs
Corr kdocs
2 parents d75d723 + a2a0931 commit 1e98ec9

File tree

2 files changed

+230
-0
lines changed

2 files changed

+230
-0
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/corr.kt

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,26 +4,180 @@ import org.jetbrains.kotlinx.dataframe.AnyCol
44
import org.jetbrains.kotlinx.dataframe.ColumnsSelector
55
import org.jetbrains.kotlinx.dataframe.DataFrame
66
import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload
7+
import org.jetbrains.kotlinx.dataframe.api.CorrDocs.Grammar
8+
import org.jetbrains.kotlinx.dataframe.api.CorrDocs.SelectingOptions
9+
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
710
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
811
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
12+
import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls
13+
import org.jetbrains.kotlinx.dataframe.documentation.DslGrammarLink
14+
import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources
15+
import org.jetbrains.kotlinx.dataframe.documentation.Indent
16+
import org.jetbrains.kotlinx.dataframe.documentation.LineBreak
17+
import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns
918
import org.jetbrains.kotlinx.dataframe.impl.api.corrImpl
1019
import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API
1120
import kotlin.reflect.KProperty
1221
import kotlin.reflect.typeOf
1322

23+
/**
24+
* Calculates the Pearson pairwise correlation between values in the specified [columns\].
25+
*
26+
* This function does not compute the correlation immediately.
27+
* Instead, it defines the primary set of columns
28+
* and returns a [Corr] instance that allows configuring how the correlation should be computed.
29+
*
30+
* The function is available for numeric- and [Boolean] columns.
31+
* [Boolean] values are converted into 1 for true and 0 for false.
32+
* All other columns are ignored.
33+
* If a [ColumnGroup] instance is passed as the target column for correlation,
34+
* it will be unpacked into suitable nested columns.
35+
*
36+
* The [Corr] object provides two methods to perform correlation calculations:
37+
* - [with][Corr.with] — computes correlations between the initially selected columns and a second set of columns.
38+
* - [withItself][Corr.withItself] — computes pairwise correlations within the initially selected columns.
39+
*
40+
* Each method returns a square or rectangular correlation matrix represented by a [DataFrame],
41+
* where rows and columns correspond to the selected column sets,
42+
* and each cell contains the Pearson correlation coefficient between the corresponding pair of columns.
43+
*
44+
* To compute correlations between all suitable columns in the [DataFrame], use [DataFrame.corr()][DataFrame.corr].
45+
*
46+
* Check out [Grammar].
47+
*
48+
* @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention]
49+
*
50+
* See also: [Selecting Columns][SelectingOptions].
51+
*
52+
* For more information, see: {@include [DocumentationUrls.Corr]}
53+
*/
54+
internal interface CorrDocs {
55+
56+
/**
57+
* {@comment Version of [SelectingColumns] with correctly filled in examples}
58+
* @include [SelectingColumns] {@include [SetCorrOperationArg]}
59+
*/
60+
interface SelectingOptions
61+
62+
/**
63+
* ## Corr Operation Grammar
64+
* {@include [LineBreak]}
65+
* {@include [DslGrammarLink]}
66+
* {@include [LineBreak]}
67+
*
68+
* **[`corr`][corr]**` { columnsSelector: `[`ColumnsSelector`][ColumnsSelector]` }`
69+
*
70+
* {@include [Indent]}
71+
* __`.`__[**`with`**][Corr.with]` { columnsSelector: `[`ColumnsSelector`][ColumnsSelector]` }`
72+
*
73+
* {@include [Indent]}
74+
*`| `__`.`__[**`withItself`**][Corr.withItself]`()`
75+
*/
76+
interface Grammar
77+
}
78+
79+
/** {@set [SelectingColumns.OPERATION] [corr][corr]} */
80+
@ExcludeFromSources
81+
private interface SetCorrOperationArg
82+
83+
/**
84+
* {@include [CorrDocs]}
85+
* ### This Corr Overload
86+
*/
87+
@ExcludeFromSources
88+
private interface CommonCorrDocs
89+
1490
/**
 * Tells whether this column can take part in a correlation computation:
 * `true` when the column is a subtype of [Number] or its type equals `typeOf<Boolean>()`.
 *
 * NOTE(review): the [Boolean] check is an exact type comparison, so nullable `Boolean?` columns
 * presumably do not match — confirm against `corrImpl`.
 */
internal fun AnyCol.isSuitableForCorr() = isSubtypeOf<Number>() || type() == typeOf<Boolean>()
1591

1692
// region DataFrame
1793

94+
/**
95+
* An intermediate class used in the [corr] operation.
96+
*
97+
* This class does not perform any computation by itself — it serves as a transitional step
98+
* before specifying how the correlation should be calculated.
99+
* It must be followed by one of the computation methods to produce a correlation [DataFrame].
100+
*
101+
* The resulting [DataFrame] is a correlation matrix where rows correspond to one set of columns,
102+
* columns to the other set, and each cell contains the Pearson correlation coefficient
103+
* between the respective pair of columns.
104+
*
105+
* Use the following methods to perform the computation:
106+
* - [with] — selects a second set of columns and computes correlations between
107+
* the initially selected columns and this second set.
108+
* - [withItself] — computes pairwise correlations within the initially selected columns.
109+
*
110+
* See [Grammar][CorrDocs.Grammar] for more details.
111+
*/
18112
public data class Corr<T, C>(internal val df: DataFrame<T>, internal val columns: ColumnsSelector<T, C>)
19113

114+
/**
115+
* Computes the Pearson correlation between all suitable columns in this [DataFrame],
116+
* including nested columns at any depth.
117+
*
118+
* The result is a square correlation matrix represented by a [DataFrame],
119+
* where both rows and columns correspond to the original columns,
120+
* and each cell contains the Pearson correlation coefficient between the respective pair of columns.
121+
*
122+
* The function is available for numeric- and [Boolean] columns.
123+
* [Boolean] values are converted into 1 for true and 0 for false.
124+
* All other columns are ignored.
125+
*
126+
* For more information, see: {@include [DocumentationUrls.Corr]}
127+
*
128+
* @return A square correlation matrix as a [DataFrame], where both rows and columns correspond to the original columns.
129+
*/
20130
public fun <T> DataFrame<T>.corr(): DataFrame<T> =
21131
corr {
22132
colsAtAnyDepth().filter { it.isSuitableForCorr() }
23133
}.withItself()
24134

135+
/**
136+
* {@include [CommonCorrDocs]}
137+
* @include [SelectingColumns.Dsl] {@include [SetCorrOperationArg]}
138+
*
139+
* The function is available for numeric- and [Boolean] columns.
140+
* [Boolean] values are converted into 1 for true and 0 for false.
141+
* All other columns are ignored.
142+
* If a [ColumnGroup] instance is passed as the target column for correlation,
143+
* it will be unpacked into suitable nested columns.
144+
*
145+
* ### Examples
146+
* ```kotlin
147+
* // Compute correlations between the "age" column and the "weight" and "height" columns
148+
* df.corr { age }.with { weight and height }
149+
*
150+
* // Compute pairwise correlations between all columns of type `Number`
151+
* df.corr { colsOf<Number>() }.withItself()
152+
* ```
153+
* @param [columns\] The [Columns Selector][ColumnsSelector] used to select the columns
154+
* of this [DataFrame] to compute a correlation.
155+
* @return A [Corr] intermediate object with the selected columns.
156+
*/
25157
public fun <T, C> DataFrame<T>.corr(columns: ColumnsSelector<T, C>): Corr<T, C> = Corr(this, columns)
26158

159+
/**
160+
* {@include [CommonCorrDocs]}
161+
* @include [SelectingColumns.ColumnNames] {@include [SetCorrOperationArg]}
162+
*
163+
* The function is available for numeric- and [Boolean] columns.
164+
* [Boolean] values are converted into 1 for true and 0 for false.
165+
* All other columns are ignored.
166+
* If a [ColumnGroup] instance is passed as the target column for correlation,
167+
* it will be unpacked into suitable nested columns.
168+
*
169+
* ### Examples
170+
* ```kotlin
171+
* // Compute correlations between the "age" column and the "weight" and "height" columns
172+
* df.corr("age").with("weight", "height")
173+
*
174+
* // Compute pairwise correlations between the "age", "weight" and "height" columns
175+
* df.corr("age", "weight", "height").withItself()
176+
* ```
177+
* @param [columns\] The [Column Names][String] used to select the columns
178+
* of this [DataFrame] to compute a correlation.
179+
* @return A [Corr] intermediate object with the selected columns.
180+
*/
27181
public fun <T> DataFrame<T>.corr(vararg columns: String): Corr<T, Any?> = corr { columns.toColumnSet() }
28182

29183
@Deprecated(DEPRECATED_ACCESS_API)
@@ -34,8 +188,67 @@ public fun <T, C> DataFrame<T>.corr(vararg columns: KProperty<C>): Corr<T, C> =
34188
@AccessApiOverload
35189
public fun <T, C> DataFrame<T>.corr(vararg columns: ColumnReference<C>): Corr<T, C> = corr { columns.toColumnSet() }
36190

191+
/**
192+
* Calculates the correlation of specified [columns][otherColumns]
193+
* with values in the columns previously selected with [corr].
194+
*
195+
* Returns a correlation matrix represented by a [DataFrame],
196+
* where rows and columns correspond to the selected column sets,
197+
* and each cell contains the Pearson correlation coefficient between the corresponding pair of columns.
198+
*
199+
* Check out [Grammar].
200+
*
201+
* @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention]
202+
*
203+
* See also: [Selecting Columns][SelectingOptions].
204+
*
205+
* For more information, see: {@include [DocumentationUrls.Corr]}
206+
*/
207+
internal interface CorrWithDocs
208+
209+
/**
210+
* {@include [CorrWithDocs]}
211+
* ### This Corr With Overload
212+
*/
213+
@ExcludeFromSources
214+
private interface CommonCorrWithDocs
215+
216+
/**
217+
* {@include [CommonCorrWithDocs]}
218+
* @include [SelectingColumns.Dsl] {@include [SetCorrOperationArg]}
219+
*
220+
* ### Examples
221+
* ```kotlin
222+
* // Compute correlations between the "age" column and the "weight" and "height" columns
223+
* df.corr { age }.with { weight and height }
224+
*
225+
* // Compute correlations between the "speed" column and all columns of type `Double` (excluding itself)
226+
* df.corr { speed }.with { colsOf<Double>() except speed }
227+
* ```
228+
*
229+
* @param otherColumns The [ColumnsSelector] used to select the second set of columns
230+
* from this [DataFrame] to compute correlations against the initially selected columns.
231+
* @return A [DataFrame] containing the resulting correlation matrix.
232+
*/
37233
public fun <T, C, R> Corr<T, C>.with(otherColumns: ColumnsSelector<T, R>): DataFrame<T> = corrImpl(otherColumns)
38234

235+
/**
236+
* {@include [CommonCorrWithDocs]}
237+
* @include [SelectingColumns.ColumnNames] {@include [SetCorrOperationArg]}
238+
*
239+
* ### Examples
240+
* ```kotlin
241+
* // Compute correlations between the "age" column and the "weight" and "height" columns
242+
* df.corr("age").with("weight", "height")
243+
*
244+
* // Compute correlations between the "speed" column and all columns of type `Number`
245+
* df.corr { colsOf<Number>() }.with("speed")
246+
* ```
247+
*
248+
* @param otherColumns The [Column Names][String] used to select the second set of columns
249+
* from this [DataFrame] to compute correlations against the initially selected columns.
250+
* @return A [DataFrame] containing the resulting correlation matrix.
251+
*/
39252
public fun <T, C> Corr<T, C>.with(vararg otherColumns: String): DataFrame<T> = with { otherColumns.toColumnSet() }
40253

41254
@Deprecated(DEPRECATED_ACCESS_API)
@@ -48,6 +261,20 @@ public fun <T, C, R> Corr<T, C>.with(vararg otherColumns: KProperty<R>): DataFra
48261
public fun <T, C, R> Corr<T, C>.with(vararg otherColumns: ColumnReference<R>): DataFrame<T> =
49262
with { otherColumns.toColumnSet() }
50263

264+
/**
265+
* Calculates Pearson pairwise correlations between the columns
266+
* previously selected with [corr].
267+
*
268+
* Returns a square correlation matrix represented by a [DataFrame],
269+
* where both rows and columns correspond to the selected columns,
270+
* and each cell contains the Pearson correlation coefficient between the respective pair of columns.
271+
*
272+
* Check out [Grammar].
273+
*
274+
* For more information, see: {@include [DocumentationUrls.Corr]}
275+
*
276+
* @return A [DataFrame] containing the pairwise correlation matrix.
277+
*/
51278
public fun <T, C> Corr<T, C>.withItself(): DataFrame<T> = with(columns)
52279

53280
// endregion

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,9 @@ internal interface DocumentationUrls {
102102
/** [See `convert` on the documentation website.]({@include [Url]}/convert.html) */
103103
interface Convert
104104

105+
/** [See `corr` on the documentation website.]({@include [Url]}/corr.html) */
106+
interface Corr
107+
105108
/** [See `add` on the documentation website.]({@include [Url]}/add.html) */
106109
interface Add
107110

0 commit comments

Comments
 (0)