@@ -538,7 +538,7 @@ public actor DataFrame: Sendable {
538538 /// - right: Right side of the join operation.
539539 /// - usingColumn: Name of the column to join on. This column must exist on both sides.
540540 /// - joinType: Type of join to perform. Default `inner`.
541- /// - Returns: <#description#>
541+ /// - Returns: A `DataFrame`.
public func join(_ right: DataFrame, _ usingColumn: String, _ joinType: String = "inner") async -> DataFrame {
  // Delegate to the overload that takes an array of column names, wrapping the
  // single column in a one-element array. The default join type is the bare
  // string "inner" (no surrounding spaces) so it matches the documented default.
  await join(right, [usingColumn], joinType)
}
@@ -588,7 +588,7 @@ public actor DataFrame: Sendable {
588588
589589 /// Explicit cartesian join with another `DataFrame`.
590590 /// - Parameter right: Right side of the join operation.
591- /// - Returns: Cartesian joins are very expensive without an extra filter that can be pushed down .
591+ /// - Returns: A `DataFrame` .
592592 public func crossJoin( _ right: DataFrame ) async -> DataFrame {
593593 let rightPlan = await ( right. getPlan ( ) as! Plan ) . root
594594 let plan = SparkConnectClient . getJoin ( self . plan. root, rightPlan, JoinType . cross)
@@ -676,6 +676,63 @@ public actor DataFrame: Sendable {
676676 return DataFrame ( spark: self . spark, plan: plan)
677677 }
678678
// Builds a repartition plan from this frame's plan root via
// `SparkConnectClient.getRepartition` and wraps it in a new `DataFrame`
// created with the same `spark` value.
private func buildRepartition(numPartitions: Int32, shuffle: Bool) -> DataFrame {
  DataFrame(
    spark: spark,
    plan: SparkConnectClient.getRepartition(plan.root, numPartitions, shuffle)
  )
}
683+
// Builds a repartition-by-expression plan from this frame's plan root via
// `SparkConnectClient.getRepartitionByExpression` and wraps it in a new
// `DataFrame` created with the same `spark` value. `numPartitions` is
// forwarded as-is, including `nil`.
private func buildRepartitionByExpression(numPartitions: Int32?, partitionExprs: [String]) -> DataFrame {
  DataFrame(
    spark: spark,
    plan: SparkConnectClient.getRepartitionByExpression(plan.root, partitionExprs, numPartitions)
  )
}
688+
/// Creates a new ``DataFrame`` with exactly `numPartitions` partitions.
///
/// The repartition is requested with shuffling enabled.
/// - Parameter numPartitions: The number of partitions.
/// - Returns: A `DataFrame`.
public func repartition(_ numPartitions: Int32) -> DataFrame {
  buildRepartition(numPartitions: numPartitions, shuffle: true)
}
695+
/// Creates a new ``DataFrame`` partitioned by the given partitioning expressions.
///
/// No explicit partition count is passed, so `spark.sql.shuffle.partitions` is used as the
/// number of partitions. The resulting Dataset is hash partitioned.
/// - Parameter partitionExprs: The partition expression strings.
/// - Returns: A `DataFrame`.
public func repartition(_ partitionExprs: String...) -> DataFrame {
  buildRepartitionByExpression(numPartitions: nil, partitionExprs: partitionExprs)
}
704+
/// Returns a new ``DataFrame`` partitioned by the given partitioning expressions into
/// `numPartitions` partitions. The resulting Dataset is hash partitioned.
/// - Parameters:
///   - numPartitions: The number of partitions.
///   - partitionExprs: The partition expression strings.
/// - Returns: A `DataFrame`.
public func repartition(_ numPartitions: Int32, _ partitionExprs: String...) -> DataFrame {
  // The explicit count is forwarded to the plan builder together with the expressions.
  return buildRepartitionByExpression(numPartitions: numPartitions, partitionExprs: partitionExprs)
}
715+
/// Returns a new ``DataFrame`` partitioned by the given partitioning expressions. The resulting
/// Dataset is hash partitioned.
/// - Parameters:
///   - numPartitions: The number of partitions, or `nil` to use `spark.sql.shuffle.partitions`
///     as the number of partitions.
///   - partitionExprs: The partition expression strings.
/// - Returns: A `DataFrame`.
public func repartitionByExpression(_ numPartitions: Int32?, _ partitionExprs: String...) -> DataFrame {
  // The optional count is forwarded unchanged; `nil` means no explicit count is set in the plan.
  return buildRepartitionByExpression(numPartitions: numPartitions, partitionExprs: partitionExprs)
}
724+
/// Creates a new ``DataFrame`` with exactly `numPartitions` partitions when fewer partitions
/// than the current count are requested.
///
/// Requesting more partitions than currently exist leaves the partition count unchanged.
/// Like coalesce defined on an `RDD`, this operation results in a narrow dependency: going from
/// 1000 partitions to 100 does not shuffle; instead each of the 100 new partitions claims 10 of
/// the current partitions. The repartition is requested with shuffling disabled.
/// - Parameter numPartitions: The number of partitions.
/// - Returns: A `DataFrame`.
public func coalesce(_ numPartitions: Int32) -> DataFrame {
  buildRepartition(numPartitions: numPartitions, shuffle: false)
}
735+
679736 /// Returns a ``DataFrameWriter`` that can be used to write non-streaming data.
680737 public var write : DataFrameWriter {
681738 get {
0 commit comments