diff --git a/core/api/core.api b/core/api/core.api index e76e11eaba..4f5c393e97 100644 --- a/core/api/core.api +++ b/core/api/core.api @@ -2662,8 +2662,6 @@ public final class org/jetbrains/kotlinx/dataframe/api/JoinKt { public static final fun fullJoin (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lkotlin/jvm/functions/Function2;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static final fun fullJoin (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static synthetic fun fullJoin$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lkotlin/jvm/functions/Function2;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; - public static final fun getAllowLeftNulls (Lorg/jetbrains/kotlinx/dataframe/api/JoinType;)Z - public static final fun getAllowRightNulls (Lorg/jetbrains/kotlinx/dataframe/api/JoinType;)Z public static final fun innerJoin (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lkotlin/jvm/functions/Function2;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static final fun innerJoin (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static synthetic fun innerJoin$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lkotlin/jvm/functions/Function2;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/join.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/join.kt index 99cecc1baf..9d49458831 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/join.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/join.kt @@ -1,6 +1,7 @@ package 
org.jetbrains.kotlinx.dataframe.api import org.jetbrains.kotlinx.dataframe.ColumnsContainer +import org.jetbrains.kotlinx.dataframe.ColumnsSelector import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload import org.jetbrains.kotlinx.dataframe.annotations.Interpretable @@ -12,6 +13,9 @@ import org.jetbrains.kotlinx.dataframe.columns.ColumnWithPath import org.jetbrains.kotlinx.dataframe.columns.ColumnsResolver import org.jetbrains.kotlinx.dataframe.columns.UnresolvedColumnsPolicy import org.jetbrains.kotlinx.dataframe.columns.toColumnSet +import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls +import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources +import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns import org.jetbrains.kotlinx.dataframe.impl.DataFrameReceiver import org.jetbrains.kotlinx.dataframe.impl.api.extractJoinColumns import org.jetbrains.kotlinx.dataframe.impl.api.joinImpl @@ -19,6 +23,94 @@ import org.jetbrains.kotlinx.dataframe.impl.columns.ColumnListImpl import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API import kotlin.reflect.KProperty +/** + * If no join columns are specified, all columns with matching names in both [DataFrame]s are used. + * + * If both [DataFrame]s contain columns with the same name that are *not* part of the join keys, + * such columns are treated as distinct. Such a column from the right [DataFrame] will be + * [automatically renamed][org.jetbrains.kotlinx.dataframe.documentation.AutoRenaming] + * in the resulting [DataFrame]. + */ +@ExcludeFromSources +private interface JoinBehavior + +/** + * Joins this [DataFrame] with the [other][\other] [DataFrame] using the selected key columns. + * + * Creates a new [DataFrame] by combining [rows][org.jetbrains.kotlinx.dataframe.DataRow] + * from two input dataframes according to one or more matching key columns. 
+ * + * {@include [JoinTypeDescription]} + * + * @include [JoinBehavior] + * + * Each join type has a corresponding shortcut function: + * [innerJoin], [leftJoin], [rightJoin], [fullJoin], [filterJoin], and [excludeJoin]. + * + * See also [joinWith], which performs a join based on a row-matching condition. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * See [Selecting Columns][SelectingColumns]. + * + * For more information, {@include [DocumentationUrls.Join]}. + * + * ### This `join` overload + */ +@ExcludeFromSources +private interface JoinDocs + +// `join` method used in the example +@Suppress("ClassName") +@ExcludeFromSources +private interface JOIN_METHOD + +/** + * [JoinDsl] allows you to define the columns used for joining [DataFrame]s + * and provides methods to match columns with different names + * between the left and right side. + * + * Provides the left [DataFrame] both as the receiver (`this`) and as the argument (`it`), + * allowing you to reference its columns directly. + * Use [right][JoinDsl.right] to access columns from the right [DataFrame], + * and [match][JoinDsl.match] to explicitly pair columns with different names. + * + * See also [Columns selection via DSL][SelectingColumns.Dsl]. + * + * ### Examples + * ```kotlin + * // Join by two columns with the same names in both dataframes + * dfLeft.{@get [JOIN_METHOD] join}(dfRight) { name and city } + * + * // Join by one column with different names — + * // "firstName" in the left dataframe and "name" in the right one + * dfLeft.{@get [JOIN_METHOD] join}(dfRight) { left -> left.firstName match right.name } + * + * // Match columns using String API + * dfLeft.{@get [JOIN_METHOD] join}(dfRight) { "symbol" match right.getValue("char") } + * ``` + */ +@ExcludeFromSources +internal interface JoinDslDescription + +/** + * Select join columns (including those that have different names in different [DataFrame]s) + * using [JoinDsl]. 
+ * + * @include [JoinDslDescription] + */ +@ExcludeFromSources +private interface SelectingColumnsJoinDsl + +/** + * @include [JoinDocs] + * @include [SelectingColumnsJoinDsl] + * @param other [DataFrame] to join with. + * @param type [JoinType] defining how the resulting rows are constructed. + * @param selector [JoinColumnsSelector] specifying join columns; + * if `null`, same-name columns are used. + * @return joined [DataFrame]. + */ @Refine @Interpretable("Join0") public fun DataFrame.join( @@ -27,12 +119,62 @@ public fun DataFrame.join( selector: JoinColumnsSelector? = null, ): DataFrame = joinImpl(other, type, addNewColumns = type.addNewColumns, selector) +/** + * ### Example + * ```kotlin + * // Join by two columns with the same names in both dataframes + * dfLeft.{@get [JOIN_METHOD] join}(dfRight, "name", "city") + * ``` + */ +@ExcludeFromSources +private interface JoinStringApiExample + +/** + * @include [JoinDocs] + * @include [SelectingColumns.ColumnNames] + * @include [JoinStringApiExample] + * @param other [DataFrame] to join with. + * @param columns [Column Names][String] specifying join columns. + * @param type [JoinType] defining how the resulting rows are constructed. + * @return joined [DataFrame]. + */ public fun DataFrame.join( other: DataFrame, vararg columns: String, type: JoinType = JoinType.Inner, ): DataFrame = join(other, type) { columns.toColumnSet() } +/** + * Performs an [inner join][JoinType.Inner] of this [DataFrame] with the [other][\other] [DataFrame] + * using the selected key columns. + * @include [InnerJoinTypeDocs] + * + * This is a shortcut for [join] with [JoinType.Inner]. + * + * @include [JoinBehavior] + * + * See also general [join], as well as other shortcuts with each of join types: + * [leftJoin], [rightJoin], [fullJoin], [filterJoin], [excludeJoin]. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * See [Selecting Columns][SelectingColumns]. 
+ * + * For more information, {@include [DocumentationUrls.Join]}. + * + * ### This `innerJoin` overload + */ +@ExcludeFromSources +private interface InnerJoinDocs + +/** + * @include [InnerJoinDocs] + * @include [SelectingColumnsJoinDsl] {@set [JOIN_METHOD] innerJoin} + * @param other [DataFrame] to join with. + * @param selector [JoinColumnsSelector] specifying join columns; + * if `null`, same-name columns are used. + * @return joined [DataFrame]. + */ @Refine @Interpretable("InnerJoin") public fun DataFrame.innerJoin( @@ -40,9 +182,48 @@ public fun DataFrame.innerJoin( selector: JoinColumnsSelector? = null, ): DataFrame = join(other, JoinType.Inner, selector = selector) +/** + * @include [InnerJoinDocs] + * @include [SelectingColumns.ColumnNames] + * @include [JoinStringApiExample] {@set [JOIN_METHOD] innerJoin} + * @param other [DataFrame] to join with. + * @param columns [Column Names][String] specifying join columns. + * @return joined [DataFrame]. + */ public fun DataFrame.innerJoin(other: DataFrame, vararg columns: String): DataFrame = innerJoin(other) { columns.toColumnSet() } +/** + * Performs a [left join][JoinType.Left] of this [DataFrame] with the [other][\other] [DataFrame] + * using the selected key columns. + * @include [LeftJoinTypeDocs] + * + * This is a shortcut for [join] with [JoinType.Left]. + * + * @include [JoinBehavior] + * + * See also general [join], as well as other shortcuts with each of join types: + * [innerJoin], [rightJoin], [fullJoin], [filterJoin], [excludeJoin]. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * See [Selecting Columns][SelectingColumns]. + * + * For more information, {@include [DocumentationUrls.Join]}. + * + * ### This `leftJoin` overload + */ +@ExcludeFromSources +private interface LeftJoinDocs + +/** + * @include [LeftJoinDocs] + * @include [SelectingColumnsJoinDsl] {@set [JOIN_METHOD] leftJoin} + * @param other [DataFrame] to join with. 
+ * @param selector [JoinColumnsSelector] specifying join columns; + * if `null`, same-name columns are used. + * @return joined [DataFrame]. + */ @Refine @Interpretable("LeftJoin") public fun DataFrame.leftJoin( @@ -50,9 +231,48 @@ public fun DataFrame.leftJoin( selector: JoinColumnsSelector? = null, ): DataFrame = join(other, JoinType.Left, selector = selector) +/** + * @include [LeftJoinDocs] + * @include [SelectingColumns.ColumnNames] + * @include [JoinStringApiExample] {@set [JOIN_METHOD] leftJoin} + * @param other [DataFrame] to join with. + * @param columns [Column Names][String] specifying join columns. + * @return joined [DataFrame]. + */ public fun DataFrame.leftJoin(other: DataFrame, vararg columns: String): DataFrame = leftJoin(other) { columns.toColumnSet() } +/** + * Performs a [right join][JoinType.Right] of this [DataFrame] with the [other][\other] [DataFrame] + * using the selected key columns. + * @include [RightJoinTypeDocs] + * + * This is a shortcut for [join] with [JoinType.Right]. + * + * @include [JoinBehavior] + * + * See also general [join], as well as other shortcuts with each of join types: + * [innerJoin], [leftJoin], [fullJoin], [filterJoin], [excludeJoin]. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * See [Selecting Columns][SelectingColumns]. + * + * For more information, {@include [DocumentationUrls.Join]}. + * + * ### This `rightJoin` overload + */ +@ExcludeFromSources +private interface RightJoinDocs + +/** + * @include [RightJoinDocs] + * @include [SelectingColumnsJoinDsl] {@set [JOIN_METHOD] rightJoin} + * @param other [DataFrame] to join with. + * @param selector [JoinColumnsSelector] specifying join columns; + * if `null`, same-name columns are used. + * @return joined [DataFrame]. + */ @Refine @Interpretable("RightJoin") public fun DataFrame.rightJoin( @@ -60,9 +280,48 @@ public fun DataFrame.rightJoin( selector: JoinColumnsSelector? 
= null, ): DataFrame = join(other, JoinType.Right, selector = selector) +/** + * @include [RightJoinDocs] + * @include [SelectingColumns.ColumnNames] + * @include [JoinStringApiExample] {@set [JOIN_METHOD] rightJoin} + * @param other [DataFrame] to join with. + * @param columns [Column Names][String] specifying join columns. + * @return joined [DataFrame]. + */ public fun DataFrame.rightJoin(other: DataFrame, vararg columns: String): DataFrame = rightJoin(other) { columns.toColumnSet() } +/** + * Performs a [full join][JoinType.Full] of this [DataFrame] with the [other][\other] [DataFrame] + * using the selected key columns. + * @include [FullJoinTypeDocs] + * + * This is a shortcut for [join] with [JoinType.Full]. + * + * @include [JoinBehavior] + * + * See also general [join], as well as other shortcuts with each of join types: + * [innerJoin], [leftJoin], [rightJoin], [filterJoin], [excludeJoin]. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * See [Selecting Columns][SelectingColumns]. + * + * For more information, {@include [DocumentationUrls.Join]}. + * + * ### This `fullJoin` overload + */ +@ExcludeFromSources +private interface FullJoinDocs + +/** + * @include [FullJoinDocs] + * @include [SelectingColumnsJoinDsl] {@set [JOIN_METHOD] fullJoin} + * @param other [DataFrame] to join with. + * @param selector [JoinColumnsSelector] specifying join columns; + * if `null`, same-name columns are used. + * @return joined [DataFrame]. + */ @Refine @Interpretable("FullJoin") public fun DataFrame.fullJoin( @@ -70,9 +329,48 @@ public fun DataFrame.fullJoin( selector: JoinColumnsSelector? = null, ): DataFrame = join(other, JoinType.Full, selector = selector) +/** + * @include [FullJoinDocs] + * @include [SelectingColumns.ColumnNames] + * @include [JoinStringApiExample] {@set [JOIN_METHOD] fullJoin} + * @param other [DataFrame] to join with. + * @param columns [Column Names][String] specifying join columns. + * @return joined [DataFrame]. 
+ */ public fun DataFrame.fullJoin(other: DataFrame, vararg columns: String): DataFrame = fullJoin(other) { columns.toColumnSet() } +/** + * Performs a [filter join][JoinType.Filter] of this [DataFrame] with the [other][\other] [DataFrame] + * using the selected key columns. + * @include [FilterJoinTypeDocs] + * + * This is a shortcut for [join] with [JoinType.Filter]. + * + * @include [JoinBehavior] + * + * See also general [join], as well as other shortcuts with each of join types: + * [innerJoin], [leftJoin], [rightJoin], [fullJoin], [excludeJoin]. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * See [Selecting Columns][SelectingColumns]. + * + * For more information, {@include [DocumentationUrls.Join]}. + * + * ### This `filterJoin` overload + */ +@ExcludeFromSources +private interface FilterJoinDocs + +/** + * @include [FilterJoinDocs] + * @include [SelectingColumnsJoinDsl] {@set [JOIN_METHOD] filterJoin} + * @param other [DataFrame] to join with. + * @param selector [JoinColumnsSelector] specifying join columns; + * if `null`, same-name columns are used. + * @return joined [DataFrame]. + */ @Refine @Interpretable("FilterJoin") public fun DataFrame.filterJoin( @@ -80,9 +378,48 @@ public fun DataFrame.filterJoin( selector: JoinColumnsSelector? = null, ): DataFrame = joinImpl(other, JoinType.Inner, addNewColumns = false, selector = selector) +/** + * @include [FilterJoinDocs] + * @include [SelectingColumns.ColumnNames] + * @include [JoinStringApiExample] {@set [JOIN_METHOD] filterJoin} + * @param other [DataFrame] to join with. + * @param columns [Column Names][String] specifying join columns. + * @return joined [DataFrame]. + */ public fun DataFrame.filterJoin(other: DataFrame, vararg columns: String): DataFrame = filterJoin(other) { columns.toColumnSet() } +/** + * Performs an [exclude join][JoinType.Exclude] of this [DataFrame] with the [other][\other] [DataFrame] + * using the selected key columns. 
+ * @include [ExcludeJoinTypeDocs] + * + * This is a shortcut for [join] with [JoinType.Exclude]. + * + * @include [JoinBehavior] + * + * See also general [join], as well as other shortcuts with each of join types: + * [innerJoin], [leftJoin], [rightJoin], [filterJoin], [fullJoin]. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * See [Selecting Columns][SelectingColumns]. + * + * For more information, {@include [DocumentationUrls.Join]}. + * + * ### This `excludeJoin` overload + */ +@ExcludeFromSources +private interface ExcludeJoinDocs + +/** + * @include [ExcludeJoinDocs] + * @include [SelectingColumnsJoinDsl] {@set [JOIN_METHOD] excludeJoin} + * @param other [DataFrame] to join with. + * @param selector [JoinColumnsSelector] specifying join columns; + * if `null`, same-name columns are used. + * @return joined [DataFrame]. + */ @Refine @Interpretable("ExcludeJoin") public fun DataFrame.excludeJoin( @@ -90,26 +427,71 @@ public fun DataFrame.excludeJoin( selector: JoinColumnsSelector? = null, ): DataFrame = joinImpl(other, JoinType.Exclude, addNewColumns = false, selector = selector) +/** + * @include [ExcludeJoinDocs] + * @include [SelectingColumns.ColumnNames] + * @include [JoinStringApiExample] {@set [JOIN_METHOD] excludeJoin} + * @param other [DataFrame] to join with. + * @param columns [Column Names][String] specifying join columns. + * @return joined [DataFrame]. + */ public fun DataFrame.excludeJoin(other: DataFrame, vararg columns: String): DataFrame = excludeJoin(other) { columns.toColumnSet() } +/** + * Joins all [DataFrame]s in this iterable into a single [DataFrame]. + * + * Sequentially applies the [join] operation to each [DataFrame] in order. + * Returns `null` if the iterable is empty. + * + * @param [joinType] [JoinType] defining how rows are matched and combined. + * @param [selector] optional [JoinColumnsSelector] specifying key columns. + * @return resulting [DataFrame], or `null` if the iterable is empty. 
+ */ public fun Iterable>.joinOrNull( joinType: JoinType = JoinType.Inner, selector: JoinColumnsSelector? = null, ): DataFrame? = fold, DataFrame?>(null) { joined, new -> joined?.join(new, joinType, selector = selector) ?: new } +/** + * A specialized [ColumnsSelectionDsl] that allows specifying [join] matching columns + * with different names in left and right [DataFrame]s. + * + * @include [JoinDslDescription] + */ public interface JoinDsl : ColumnsSelectionDsl { + /** + * Provides access to columns of the right [DataFrame] + * for further matching with left columns [match]. + */ public val right: DataFrame + /** + * Matches columns from the left and right [DataFrame]s for [joining][join]. + * + * The receiver column must belong to the left [DataFrame], + * and the argument ([\other]) column must belong to the right [DataFrame]. + * + * @receiver column from the left [DataFrame]. + * @param [other] column from the right [DataFrame]. + * @return [ColumnMatch] representing the column pair used for joining. + */ + @ExcludeFromSources + private interface MatchDocs + + /** @include [MatchDocs] */ @Interpretable("Match0") public infix fun ColumnReference.match(other: ColumnReference): ColumnMatch = ColumnMatch(this, other) + /** @include [MatchDocs] */ public infix fun String.match(other: ColumnReference): ColumnMatch = ColumnMatch(toColumnOf(), other) + /** @include [MatchDocs] */ public infix fun ColumnReference.match(other: String): ColumnMatch = ColumnMatch(this, other.toColumnOf()) + /** @include [MatchDocs] */ public infix fun String.match(other: String): ColumnMatch = ColumnMatch(toColumnAccessor(), other.toColumnAccessor()) @@ -129,6 +511,12 @@ public interface JoinDsl : ColumnsSelectionDsl { ColumnMatch(toColumnAccessor(), other) public companion object { + /** + * **For internal use only.** + * Not intended for public API consumption. + * + * Used in Compiler Plugin. 
+ */ public fun defaultJoinColumns(left: DataFrame, right: DataFrame): JoinColumnsSelector = { left.columnNames().intersect(right.columnNames().toSet()) @@ -136,6 +524,12 @@ public interface JoinDsl : ColumnsSelectionDsl { .let { ColumnListImpl(it) } } + /** + * **For internal use only.** + * Not intended for public API consumption. + * + * Used in Compiler Plugin. + */ public fun getColumns( left: DataFrame, other: DataFrame, @@ -150,6 +544,9 @@ public interface JoinDsl : ColumnsSelectionDsl { } } +/** + * A special [ColumnSet] that specifies a [column match][JoinDsl.match] for the [join] operation. + */ public interface ColumnMatch : ColumnSet { public val left: ColumnReference public val right: ColumnReference @@ -162,30 +559,142 @@ internal class ColumnMatchImpl(override val left: ColumnReference, overrid throw UnsupportedOperationException() } +/** + * Creates a [ColumnMatch]. + * + * Not intended for public API consumption. Please use [match][JoinDsl.match] instead. + */ public fun ColumnMatch(left: ColumnReference, right: ColumnReference): ColumnMatch = ColumnMatchImpl(left, right) +/** + * A specialized [ColumnsSelector] used for matching columns in a [join] operation. + * + * Provides [JoinDsl] both as the receiver and the lambda parameter, and expects + * a [ColumnsResolver] as the return value. + * + * Enables defining matching columns from left and right [DataFrame]s + * using [right][JoinDsl.right] and [match][JoinDsl.match]. + */ public typealias JoinColumnsSelector = JoinDsl.(ColumnsContainer) -> ColumnsResolver<*> +/** + * Includes only matching rows from both [DataFrame]s; + * rows are merged. + */ +@ExcludeFromSources +internal interface InnerJoinTypeDocs + +/** + * Includes all rows from the left [DataFrame]; matching rows are merged, + * unmatched right-side values are filled with `null`. 
+ */ +@ExcludeFromSources +internal interface LeftJoinTypeDocs + +/** + * Includes all rows from the right [DataFrame]; matching rows are merged, + * unmatched left-side values are filled with `null`. + */ +@ExcludeFromSources +internal interface RightJoinTypeDocs + +/** + * Includes only rows from the left [DataFrame] that have a match in the right one; + * right-side columns are not merged. + */ +@ExcludeFromSources +internal interface FilterJoinTypeDocs + +/** + * Includes all rows from both [DataFrame]s; matching rows are merged, + * all mismatches are filled with `null`. + */ +@ExcludeFromSources +internal interface FullJoinTypeDocs + +/** + * Includes only rows from the left [DataFrame] that do *not* have a match in the right one; + * right-side columns are not merged. + */ +@ExcludeFromSources +internal interface ExcludeJoinTypeDocs + +/** + * Represents the type of [join] operation. + * + * {@include [JoinTypeDescription]} + */ public enum class JoinType { - Left, // all data from left dataframe, nulls for mismatches in right dataframe - Right, // all data from right dataframe, nulls for mismatches in left dataframe - Inner, // only matched data from right and left dataframe - Filter, // only matched data from left dataframe - Full, // all data from left and from right dataframe, nulls for any mismatches - Exclude, // mismatched rows from left dataframe + + /** + * Includes all rows from the left [DataFrame]; matching rows are merged, + * unmatched right-side values are filled with `null`. + */ + Left, + + /** + * Includes all rows from the right [DataFrame]; matching rows are merged, + * unmatched left-side values are filled with `null`. + */ + Right, + + /** + * Includes only matching rows from both [DataFrame]s; + * rows are merged. + */ + Inner, + + /** + * Includes only rows from the left [DataFrame] that have a match in the right one; + * right-side columns are not merged. 
+ */ + Filter, + + /** + * Includes all rows from both [DataFrame]s; matching rows are merged, + * all mismatches are filled with `null`. + */ + Full, + + /** + * Includes only rows from the left [DataFrame] that do *not* have a match in the right one; + * right-side columns are not merged. + */ + Exclude, } +/** + * There are two categories of joins: + * * **Merging joins** — merge matching rows from both [DataFrame]s into a single row. + * * **Non-merging joins** — select rows from the left [DataFrame] based on whether + * a match exists in the right one, without merging columns. + * + * The exact behavior depends on the specified [join type][\type]: + * + * **Merging joins:** + * * [JoinType.Inner] (default) — {@include [InnerJoinTypeDocs]} + * * [JoinType.Left] — {@include [LeftJoinTypeDocs]} + * * [JoinType.Right] — {@include [RightJoinTypeDocs]} + * * [JoinType.Full] — {@include [FullJoinTypeDocs]} + * + * **Non-merging joins:** + * * [JoinType.Filter] — {@include [FilterJoinTypeDocs]} + * * [JoinType.Exclude] — {@include [ExcludeJoinTypeDocs]} + */ +@ExcludeFromSources +internal interface JoinTypeDescription + internal val JoinType.addNewColumns: Boolean get() = when (this) { JoinType.Filter, JoinType.Exclude -> false JoinType.Left, JoinType.Right, JoinType.Inner, JoinType.Full -> true } -public val JoinType.allowLeftNulls: Boolean +internal val JoinType.allowLeftNulls: Boolean get() = this == JoinType.Right || this == JoinType.Full -public val JoinType.allowRightNulls: Boolean +internal val JoinType.allowRightNulls: Boolean get() = this == JoinType.Left || this == JoinType.Full || this == JoinType.Exclude diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/joinWith.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/joinWith.kt index b0a17ae44f..87db6a0ba2 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/joinWith.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/joinWith.kt @@ -5,14 +5,101 @@ 
import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.Selector import org.jetbrains.kotlinx.dataframe.annotations.Interpretable import org.jetbrains.kotlinx.dataframe.annotations.Refine +import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls +import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources +import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns import org.jetbrains.kotlinx.dataframe.impl.api.joinWithImpl +/** + * A [JoinExpression] defines the matching condition between [rows][DataRow] of the two [DataFrame]s. + * It provides access to row values from both the left and right [DataFrame]s + * and expects a [Boolean] result indicating whether the rows match. + * All combinations of rows from the left and right [DataFrame]s that satisfy + * this condition are matched. + * + * This method is useful when rows should be matched based on custom logic + * rather than simple value equality. + * + * Creates a new [DataFrame] by combining [rows][DataRow] + * from both inputs according to the [\joinExpression] matching rule. + */ +@ExcludeFromSources +private interface JoinWithCommonDescription + +// `joinWith` method used in the example +@Suppress("ClassName") +@ExcludeFromSources +private interface JOIN_WITH_METHOD + +/** + * ### Examples + * ```kotlin + * // Join rows where the `fullName` value in the left `DataFrame` + * // contains the `firstName` value in the right `DataFrame`. + * dfLeft.{@get [JOIN_WITH_METHOD] joinWith}(dfRight) { left -> left.fullName.contains(right.firstName) } + * + * // Join rows where the `date` value in the right `DataFrame` + * // falls within the interval defined by the `startDate` and `endDate` + * // values in the left `DataFrame`. 
+ * dfLeft.{@get [JOIN_WITH_METHOD] joinWith}(dfRight) { right.date in startDate..endDate } + * + * // String API; join rows where the `score` value in the left `DataFrame` is higher than 3.4 + * // and the `passed` value in the right `DataFrame` is `true`. + * dfLeft.{@get [JOIN_WITH_METHOD] joinWith}(dfRight) { "score"() > 3.4 && right["passed"] as Boolean } + * ``` + */ +@ExcludeFromSources +private interface JoinWithExample + +/** + * A specialized [DataRow] used in a [JoinExpression]. + * + * Represents a row from the left [DataFrame] (as the receiver) + * and provides access to the row from the right [DataFrame] via [right]. + */ public interface JoinedDataRow : DataRow { public val right: DataRow } +/** + * A special [row][DataRow] expression used to define + * the row-matching condition in a [joinWith] operation. + * + * Provides the [row][DataRow] of the left [DataFrame] both + * as the receiver (`this`) and as the argument (`it`), + * allowing you to reference its values directly. + * + * The [row][DataRow] of the right [DataFrame] is available + * as [right][JoinedDataRow.right]. + * + * The expression must return a [Boolean] indicating whether + * the rows from the left and right [DataFrame]s match. + */ public typealias JoinExpression = Selector, Boolean> +/** + * Joins this [DataFrame] with the [right][\right] [DataFrame] + * using the provided [\joinExpression]. + * + * @include [JoinWithCommonDescription] + * + * {@include [JoinTypeDescription]} + * + * Each join type has a corresponding shortcut function: + * [innerJoinWith], [leftJoinWith], [rightJoinWith], [fullJoinWith], [filterJoinWith], and [excludeJoinWith]. + * + * See also [join], which performs a join by exact value equality in the selected columns. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * For more information, {@include [DocumentationUrls.JoinWith]}. + * + * @include [JoinWithExample] + * @param [right] [DataFrame] to join with. 
+ * @param [type] [JoinType] defining how rows are matched and combined. + * @param [joinExpression] [JoinExpression] specifying the rows join condition. + * @return joined [DataFrame]. + */ @Refine @Interpretable("JoinWith") public fun DataFrame.joinWith( @@ -21,31 +108,163 @@ public fun DataFrame.joinWith( joinExpression: JoinExpression, ): DataFrame = joinWithImpl(right, type, addNewColumns = type.addNewColumns, joinExpression) +/** + * Performs an [inner join][JoinType.Inner] of this [DataFrame] with the [right][\right] [DataFrame] + * using the provided [\joinExpression]. {@include [InnerJoinTypeDocs]} + * + * This is a shortcut for [joinWith] with [JoinType.Inner]. + * + * @include [JoinWithCommonDescription] + * + * See also general [joinWith] as well as other shortcuts with each of join types: + * [leftJoinWith], [rightJoinWith], [fullJoinWith], [filterJoinWith], [excludeJoinWith]. + * + * See also [join], which performs a join by exact value equality in the selected columns. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * For more information, {@include [DocumentationUrls.JoinWith]}. + * + * @include [JoinWithExample] {@set [JOIN_WITH_METHOD] innerJoinWith} + * @param [right] [DataFrame] to join with. + * @param [joinExpression] [JoinExpression] specifying the rows join condition. + * @return joined [DataFrame]. + */ @Refine @Interpretable("InnerJoinWith") public fun DataFrame.innerJoinWith(right: DataFrame, joinExpression: JoinExpression): DataFrame = joinWith(right, JoinType.Inner, joinExpression) +/** + * Performs a [left join][JoinType.Left] of this [DataFrame] with the [right][\right] [DataFrame] + * using the provided [\joinExpression]. {@include [LeftJoinTypeDocs]} + * + * This is a shortcut for [joinWith] with [JoinType.Left]. 
+ * + * @include [JoinWithCommonDescription] + * + * See also general [joinWith] as well as other shortcuts with each of join types: + * [innerJoinWith], [rightJoinWith], [fullJoinWith], [filterJoinWith], [excludeJoinWith]. + * + * See also [join], which performs a join by exact value equality in the selected columns. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * For more information, {@include [DocumentationUrls.JoinWith]}. + * + * @include [JoinWithExample] {@set [JOIN_WITH_METHOD] leftJoinWith} + * @param [right] [DataFrame] to join with. + * @param [joinExpression] [JoinExpression] specifying the rows join condition. + * @return joined [DataFrame]. + */ @Refine @Interpretable("LeftJoinWith") public fun DataFrame.leftJoinWith(right: DataFrame, joinExpression: JoinExpression): DataFrame = joinWith(right, JoinType.Left, joinExpression) +/** + * Performs a [right join][JoinType.Right] of this [DataFrame] with the [right][\right] [DataFrame] + * using the provided [\joinExpression]. {@include [RightJoinTypeDocs]} + * + * This is a shortcut for [joinWith] with [JoinType.Right]. + * + * @include [JoinWithCommonDescription] + * + * See also general [joinWith] as well as other shortcuts with each of join types: + * [innerJoinWith], [leftJoinWith], [fullJoinWith], [filterJoinWith], [excludeJoinWith]. + * + * See also [join], which performs a join by exact value equality in the selected columns. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * For more information, {@include [DocumentationUrls.JoinWith]}. + * + * @include [JoinWithExample] {@set [JOIN_WITH_METHOD] rightJoinWith} + * @param [right] [DataFrame] to join with. + * @param [joinExpression] [JoinExpression] specifying the rows join condition. + * @return joined [DataFrame]. 
+ */ @Refine @Interpretable("RightJoinWith") public fun DataFrame.rightJoinWith(right: DataFrame, joinExpression: JoinExpression): DataFrame = joinWith(right, JoinType.Right, joinExpression) +/** + * Performs a [full join][JoinType.Full] of this [DataFrame] with the [right][\right] [DataFrame] + * using the provided [\joinExpression]. {@include [FullJoinTypeDocs]} + * + * This is a shortcut for [joinWith] with [JoinType.Full]. + * + * @include [JoinWithCommonDescription] + * + * See also the general [joinWith], as well as the other shortcuts for each of the join types: + * [leftJoinWith], [rightJoinWith], [innerJoinWith], [filterJoinWith], [excludeJoinWith]. + * + * See also [join], which performs a join by exact value equality in the selected columns. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * For more information, {@include [DocumentationUrls.JoinWith]}. + * + * @include [JoinWithExample] {@set [JOIN_WITH_METHOD] fullJoinWith} + * @param [right] [DataFrame] to join with. + * @param [joinExpression] [JoinExpression] specifying the rows join condition. + * @return joined [DataFrame]. + */ @Refine @Interpretable("FullJoinWith") public fun DataFrame.fullJoinWith(right: DataFrame, joinExpression: JoinExpression): DataFrame = joinWith(right, JoinType.Full, joinExpression) +/** + * Performs a [filter join][JoinType.Filter] of this [DataFrame] with the [right][\right] [DataFrame] + * using the provided [\joinExpression]. {@include [FilterJoinTypeDocs]} + * + * This is a shortcut for [joinWith] with [JoinType.Filter]. + * + * @include [JoinWithCommonDescription] + * + * See also the general [joinWith], as well as the other shortcuts for each of the join types: + * [leftJoinWith], [rightJoinWith], [fullJoinWith], [innerJoinWith], [excludeJoinWith]. + * + * See also [join], which performs a join by exact value equality in the selected columns.
+ * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * For more information, {@include [DocumentationUrls.JoinWith]}. + * + * @include [JoinWithExample] {@set [JOIN_WITH_METHOD] filterJoinWith} + * @param [right] [DataFrame] to join with. + * @param [joinExpression] [JoinExpression] specifying the rows join condition. + * @return joined [DataFrame]. + */ @Refine @Interpretable("FilterJoinWith") public fun DataFrame.filterJoinWith(right: DataFrame, joinExpression: JoinExpression): DataFrame = - joinWithImpl(right, JoinType.Inner, addNewColumns = false, joinExpression) + joinWithImpl(right, JoinType.Filter, addNewColumns = false, joinExpression) +/** + * Performs an [exclude join][JoinType.Exclude] of this [DataFrame] with the [right][\right] [DataFrame] + * using the provided [\joinExpression]. {@include [ExcludeJoinTypeDocs]} + * + * This is a shortcut for [joinWith] with [JoinType.Exclude]. + * + * @include [JoinWithCommonDescription] + * + * See also the general [joinWith], as well as the other shortcuts for each of the join types: + * [leftJoinWith], [rightJoinWith], [fullJoinWith], [filterJoinWith], [innerJoinWith]. + * + * See also [join], which performs a join by exact value equality in the selected columns. + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * For more information, {@include [DocumentationUrls.JoinWith]}. + * + * @include [JoinWithExample] {@set [JOIN_WITH_METHOD] excludeJoinWith} + * @param [right] [DataFrame] to join with. + * @param [joinExpression] [JoinExpression] specifying the rows join condition. + * @return joined [DataFrame].
+ */ @Refine @Interpretable("ExcludeJoinWith") public fun DataFrame.excludeJoinWith( diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/AutoRenaming.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/AutoRenaming.kt new file mode 100644 index 0000000000..1624f0a8c6 --- /dev/null +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/AutoRenaming.kt @@ -0,0 +1,19 @@ +package org.jetbrains.kotlinx.dataframe.documentation + +import org.jetbrains.kotlinx.dataframe.DataFrame + +/** + * ## Auto-renaming in [DataFrame] + * + * In some operations, multiple columns with the same name may appear + * in the resulting [DataFrame]. + * + * In such cases, columns with duplicate names are automatically renamed + * using the pattern `"\$name\$n"`, where `name` is the original column name + * and `n` is a unique index (1, 2, 3, and so on); + * the first time the name of the column is encountered, no number is appended. + * + * It is recommended to [rename][org.jetbrains.kotlinx.dataframe.api.rename] them + * to maintain clarity and improve code readability. 
+ */ +internal interface AutoRenaming diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt index 959cda40b3..006c494270 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DocumentationUrls.kt @@ -179,4 +179,10 @@ internal interface DocumentationUrls { /** [See "`pivot` inside aggregation" on the documentation website.]({@include [Url]}/pivot.html#pivot-inside-aggregate) */ interface PivotInsideAggregationStatistics + + /** [See `join` on the documentation website.]({@include [Url]}/join.html) */ + interface Join + + /** [See `joinWith` on the documentation website.]({@include [Url]}/joinWith.html) */ + interface JoinWith }