@@ -4,26 +4,180 @@ import org.jetbrains.kotlinx.dataframe.AnyCol
4
4
import org.jetbrains.kotlinx.dataframe.ColumnsSelector
5
5
import org.jetbrains.kotlinx.dataframe.DataFrame
6
6
import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload
7
+ import org.jetbrains.kotlinx.dataframe.api.CorrDocs.Grammar
8
+ import org.jetbrains.kotlinx.dataframe.api.CorrDocs.SelectingOptions
9
+ import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
7
10
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
8
11
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
12
+ import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls
13
+ import org.jetbrains.kotlinx.dataframe.documentation.DslGrammarLink
14
+ import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources
15
+ import org.jetbrains.kotlinx.dataframe.documentation.Indent
16
+ import org.jetbrains.kotlinx.dataframe.documentation.LineBreak
17
+ import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns
9
18
import org.jetbrains.kotlinx.dataframe.impl.api.corrImpl
10
19
import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API
11
20
import kotlin.reflect.KProperty
12
21
import kotlin.reflect.typeOf
13
22
23
/**
 * Calculates the Pearson pairwise correlation between values in the specified [columns\].
 *
 * This function does not compute the correlation immediately.
 * Instead, it defines the primary set of columns
 * and returns a [Corr] instance that allows configuring how the correlation should be computed.
 *
 * The function is available for numeric and [Boolean] columns.
 * [Boolean] values are converted into 1 for `true` and 0 for `false`.
 * All other columns are ignored.
 * If a [ColumnGroup] instance is passed as the target column for correlation,
 * it will be unpacked into suitable nested columns.
 *
 * The [Corr] object provides two methods to perform correlation calculations:
 * - [with][Corr.with] — computes correlations between the initially selected columns and a second set of columns.
 * - [withItself][Corr.withItself] — computes pairwise correlations within the initially selected columns.
 *
 * Each method returns a square or rectangular correlation matrix represented by a [DataFrame],
 * where rows and columns correspond to the selected column sets,
 * and each cell contains the Pearson correlation coefficient between the corresponding pair of columns.
 *
 * To compute correlations between all suitable columns in the [DataFrame], use [DataFrame.corr()][DataFrame.corr].
 *
 * Check out [Grammar].
 *
 * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention]
 *
 * See also: [Selecting Columns][SelectingOptions].
 *
 * For more information, see: {@include [DocumentationUrls.Corr]}
 */
internal interface CorrDocs {

    /**
     * {@comment Version of [SelectingColumns] with correctly filled in examples}
     * @include [SelectingColumns] {@include [SetCorrOperationArg]}
     */
    interface SelectingOptions

    /**
     * ## Corr Operation Grammar
     * {@include [LineBreak]}
     * {@include [DslGrammarLink]}
     * {@include [LineBreak]}
     *
     * **[`corr`][corr]**` { columnsSelector: `[`ColumnsSelector`][ColumnsSelector]` }`
     *
     * {@include [Indent]}
     * __`.`__[**`with`**][Corr.with]` { columnsSelector: `[`ColumnsSelector`][ColumnsSelector]` }`
     *
     * {@include [Indent]}
     * `| `__`.`__[**`withItself`**][Corr.withItself]`()`
     */
    interface Grammar
}
78
+
79
// Argument holder for documentation includes: sets the SelectingColumns.OPERATION placeholder to `corr`.
/** {@set [SelectingColumns.OPERATION] [corr][corr]} */
@ExcludeFromSources
private interface SetCorrOperationArg
82
+
83
// Shared KDoc header for the `corr` overloads: the general CorrDocs text followed by the overload heading.
/**
 * {@include [CorrDocs]}
 * ### This Corr Overload
 */
@ExcludeFromSources
private interface CommonCorrDocs
89
+
14
90
/**
 * Tells whether this column is accepted by the `corr` operation.
 *
 * A column qualifies when `isSubtypeOf<Number>()` holds for it,
 * or when its `type()` is equal to `typeOf<Boolean>()`.
 *
 * @return `true` if either of the two checks succeeds, `false` otherwise.
 */
internal fun AnyCol.isSuitableForCorr(): Boolean {
    // Numeric check first; the Boolean comparison is only evaluated when it fails.
    if (isSubtypeOf<Number>()) return true
    return type() == typeOf<Boolean>()
}
15
91
16
92
// region DataFrame
17
93
94
/**
 * Intermediate step of the [corr] operation.
 *
 * A [Corr] only remembers the source [DataFrame] and the first column selection;
 * it performs no computation on its own.
 * To obtain a result, follow it with one of the computation methods:
 * - [with] — selects a second set of columns and computes correlations between
 * the initially selected columns and this second set.
 * - [withItself] — computes pairwise correlations within the initially selected columns.
 *
 * Both methods produce a correlation matrix as a [DataFrame]: rows correspond to one set of columns,
 * columns to the other set, and each cell contains the Pearson correlation coefficient
 * between the respective pair of columns.
 *
 * See [Grammar][CorrDocs.Grammar] for more details.
 *
 * @property df The [DataFrame] the columns are selected from.
 * @property columns The [ColumnsSelector] describing the initially selected columns.
 */
public data class Corr<T, C>(
    internal val df: DataFrame<T>,
    internal val columns: ColumnsSelector<T, C>,
)
19
113
114
/**
 * Computes the Pearson correlation between all suitable columns in this [DataFrame],
 * including nested columns at any depth.
 *
 * The result is a square correlation matrix represented by a [DataFrame],
 * where both rows and columns correspond to the original columns,
 * and each cell contains the Pearson correlation coefficient between the respective pair of columns.
 *
 * The function is available for numeric and [Boolean] columns.
 * [Boolean] values are converted into 1 for `true` and 0 for `false`.
 * All other columns are ignored.
 *
 * For more information, see: {@include [DocumentationUrls.Corr]}
 *
 * @return A square correlation matrix as a [DataFrame], where both rows and columns correspond to the original columns.
 */
public fun <T> DataFrame<T>.corr(): DataFrame<T> {
    // Select every column, at any nesting depth, that passes the suitability check.
    val suitableColumns = corr {
        colsAtAnyDepth().filter { it.isSuitableForCorr() }
    }
    return suitableColumns.withItself()
}
24
134
135
/**
 * {@include [CommonCorrDocs]}
 * @include [SelectingColumns.Dsl] {@include [SetCorrOperationArg]}
 *
 * The function is available for numeric and [Boolean] columns.
 * [Boolean] values are converted into 1 for `true` and 0 for `false`.
 * All other columns are ignored.
 * If a [ColumnGroup] instance is passed as the target column for correlation,
 * it will be unpacked into suitable nested columns.
 *
 * ### Examples
 * ```kotlin
 * // Compute correlations between the "age" column and the "weight" and "height" columns
 * df.corr { age }.with { weight and height }
 *
 * // Compute pairwise correlations between all columns of type `Number`
 * df.corr { colsOf<Number>() }.withItself()
 * ```
 * @param [columns\] The [Columns Selector][ColumnsSelector] used to select the columns
 * of this [DataFrame] to compute a correlation.
 * @return A [Corr] intermediate object with the selected columns.
 */
public fun <T, C> DataFrame<T>.corr(columns: ColumnsSelector<T, C>): Corr<T, C> {
    // Nothing is computed here; the selection is only recorded.
    return Corr(df = this, columns = columns)
}
26
158
159
/**
 * {@include [CommonCorrDocs]}
 * @include [SelectingColumns.ColumnNames] {@include [SetCorrOperationArg]}
 *
 * The function is available for numeric and [Boolean] columns.
 * [Boolean] values are converted into 1 for `true` and 0 for `false`.
 * All other columns are ignored.
 * If a [ColumnGroup] instance is passed as the target column for correlation,
 * it will be unpacked into suitable nested columns.
 *
 * ### Examples
 * ```kotlin
 * // Compute correlations between the "age" column and the "weight" and "height" columns
 * df.corr("age").with("weight", "height")
 *
 * // Compute pairwise correlations between the "age", "weight" and "height" columns
 * df.corr("age", "weight", "height").withItself()
 * ```
 * @param [columns\] The [Column Names][String] used to select the columns
 * of this [DataFrame] to compute a correlation.
 * @return A [Corr] intermediate object with the selected columns.
 */
public fun <T> DataFrame<T>.corr(vararg columns: String): Corr<T, Any?> = corr { columns.toColumnSet() }
28
182
29
183
@Deprecated(DEPRECATED_ACCESS_API )
@@ -34,8 +188,67 @@ public fun <T, C> DataFrame<T>.corr(vararg columns: KProperty<C>): Corr<T, C> =
34
188
@AccessApiOverload
35
189
public fun <T , C > DataFrame<T>.corr (vararg columns : ColumnReference <C >): Corr <T , C > = corr { columns.toColumnSet() }
36
190
191
/**
 * Computes the correlation between the specified [columns][otherColumns]
 * and the columns previously selected with [corr].
 *
 * The result is a correlation matrix represented by a [DataFrame],
 * where rows and columns correspond to the selected column sets,
 * and each cell contains the Pearson correlation coefficient between the corresponding pair of columns.
 *
 * Check out [Grammar].
 *
 * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention]
 *
 * See also: [Selecting Columns][SelectingOptions].
 *
 * For more information, see: {@include [DocumentationUrls.Corr]}
 */
internal interface CorrWithDocs
208
+
209
// Shared KDoc header for the `with` overloads: the general CorrWithDocs text followed by the overload heading.
/**
 * {@include [CorrWithDocs]}
 * ### This Corr With Overload
 */
@ExcludeFromSources
private interface CommonCorrWithDocs
215
+
216
/**
 * {@include [CommonCorrWithDocs]}
 * @include [SelectingColumns.Dsl] {@include [SetCorrOperationArg]}
 *
 * ### Examples
 * ```kotlin
 * // Compute correlations between the "age" column and the "weight" and "height" columns
 * df.corr { age }.with { weight and height }
 *
 * // Compute correlations between the "speed" column and all columns of type `Double` (excluding itself)
 * df.corr { speed }.with { colsOf<Double>() except speed }
 * ```
 *
 * @param otherColumns The [ColumnsSelector] used to select the second set of columns
 * from this [DataFrame] to compute correlations against the initially selected columns.
 * @return A [DataFrame] containing the resulting correlation matrix.
 */
public fun <T, C, R> Corr<T, C>.with(otherColumns: ColumnsSelector<T, R>): DataFrame<T> {
    // The actual computation is delegated to the internal implementation.
    return corrImpl(otherColumns)
}
38
234
235
/**
 * {@include [CommonCorrWithDocs]}
 * @include [SelectingColumns.ColumnNames] {@include [SetCorrOperationArg]}
 *
 * ### Examples
 * ```kotlin
 * // Compute correlations between the "age" column and the "weight" and "height" columns
 * df.corr("age").with("weight", "height")
 *
 * // Compute correlations between the "speed" column and all columns of type `Number`
 * df.corr { colsOf<Number>() }.with("speed")
 * ```
 *
 * @param otherColumns The [Column Names][String] used to select the second set of columns
 * from this [DataFrame] to compute correlations against the initially selected columns.
 * @return A [DataFrame] containing the resulting correlation matrix.
 */
public fun <T, C> Corr<T, C>.with(vararg otherColumns: String): DataFrame<T> {
    // Turn the names into a column set and reuse the selector-based overload.
    return with { otherColumns.toColumnSet() }
}
40
253
41
254
@Deprecated(DEPRECATED_ACCESS_API )
@@ -48,6 +261,20 @@ public fun <T, C, R> Corr<T, C>.with(vararg otherColumns: KProperty<R>): DataFra
48
261
public fun <T , C , R > Corr <T , C >.with (vararg otherColumns : ColumnReference <R >): DataFrame <T > =
49
262
with { otherColumns.toColumnSet() }
50
263
264
/**
 * Calculates Pearson pairwise correlations between the columns
 * previously selected with [corr].
 *
 * The result is a square correlation matrix represented by a [DataFrame],
 * where both rows and columns correspond to the selected columns,
 * and each cell contains the Pearson correlation coefficient between the respective pair of columns.
 *
 * Check out [Grammar].
 *
 * For more information, see: {@include [DocumentationUrls.Corr]}
 *
 * @return A [DataFrame] containing the pairwise correlation matrix.
 */
public fun <T, C> Corr<T, C>.withItself(): DataFrame<T> {
    // The selector stored in this Corr doubles as the second column set.
    return with(columns)
}
52
279
53
280
// endregion
0 commit comments