@@ -538,7 +538,7 @@ public actor DataFrame: Sendable {
538
538
/// - right: Right side of the join operation.
539
539
/// - usingColumn: Name of the column to join on. This column must exist on both sides.
540
540
/// - joinType: Type of join to perform. Default `inner`.
541
- /// - Returns: <#description#>
541
+ /// - Returns: A `DataFrame`.
542
542
public func join( _ right: DataFrame , _ usingColumn: String , _ joinType: String = " inner " ) async -> DataFrame {
543
543
await join ( right, [ usingColumn] , joinType)
544
544
}
@@ -588,7 +588,7 @@ public actor DataFrame: Sendable {
588
588
589
589
/// Explicit cartesian join with another `DataFrame`.
590
590
/// - Parameter right: Right side of the join operation.
591
- /// - Returns: Cartesian joins are very expensive without an extra filter that can be pushed down .
591
+ /// - Returns: A `DataFrame` .
592
592
public func crossJoin( _ right: DataFrame ) async -> DataFrame {
593
593
let rightPlan = await ( right. getPlan ( ) as! Plan ) . root
594
594
let plan = SparkConnectClient . getJoin ( self . plan. root, rightPlan, JoinType . cross)
@@ -676,6 +676,63 @@ public actor DataFrame: Sendable {
676
676
return DataFrame ( spark: self . spark, plan: plan)
677
677
}
678
678
679
+ private func buildRepartition( numPartitions: Int32 , shuffle: Bool ) -> DataFrame {
680
+ let plan = SparkConnectClient . getRepartition ( self . plan. root, numPartitions, shuffle)
681
+ return DataFrame ( spark: self . spark, plan: plan)
682
+ }
683
+
684
+ private func buildRepartitionByExpression( numPartitions: Int32 ? , partitionExprs: [ String ] ) -> DataFrame {
685
+ let plan = SparkConnectClient . getRepartitionByExpression ( self . plan. root, partitionExprs, numPartitions)
686
+ return DataFrame ( spark: self . spark, plan: plan)
687
+ }
688
+
689
+ /// Returns a new ``DataFrame`` that has exactly `numPartitions` partitions.
690
+ /// - Parameter numPartitions: The number of partitions.
691
+ /// - Returns: A `DataFrame`.
692
+ public func repartition( _ numPartitions: Int32 ) -> DataFrame {
693
+ return buildRepartition ( numPartitions: numPartitions, shuffle: true )
694
+ }
695
+
696
+ /// Returns a new ``DataFrame`` partitioned by the given partitioning expressions, using
697
+ /// `spark.sql.shuffle.partitions` as number of partitions. The resulting Dataset is hash
698
+ /// partitioned.
699
+ /// - Parameter partitionExprs: The partition expression strings.
700
+ /// - Returns: A `DataFrame`.
701
+ public func repartition( _ partitionExprs: String ... ) -> DataFrame {
702
+ return buildRepartitionByExpression ( numPartitions: nil , partitionExprs: partitionExprs)
703
+ }
704
+
705
+ /// Returns a new ``DataFrame`` partitioned by the given partitioning expressions, using
706
+ /// `numPartitions` as number of partitions. The resulting Dataset is hash
707
+ /// partitioned.
708
+ /// - Parameters:
709
+ /// - numPartitions: The number of partitions.
710
+ /// - partitionExprs: The partition expression strings.
711
+ /// - Returns: A `DataFrame`.
712
+ public func repartition( _ numPartitions: Int32 , _ partitionExprs: String ... ) -> DataFrame {
713
+ return buildRepartitionByExpression ( numPartitions: numPartitions, partitionExprs: partitionExprs)
714
+ }
715
+
716
+ /// Returns a new ``DataFrame`` partitioned by the given partitioning expressions, using
717
+ /// `numPartitions` (or `spark.sql.shuffle.partitions` when it is `nil`) as number of partitions. The resulting Dataset is hash
718
+ /// partitioned.
719
+ /// - Parameters:
+ /// - numPartitions: The optional number of partitions.
+ /// - partitionExprs: The partition expression strings.
720
+ /// - Returns: A `DataFrame`.
721
+ public func repartitionByExpression( _ numPartitions: Int32 ? , _ partitionExprs: String ... ) -> DataFrame {
722
+ return buildRepartitionByExpression ( numPartitions: numPartitions, partitionExprs: partitionExprs)
723
+ }
724
+
725
+ /// Returns a new ``DataFrame`` that has exactly `numPartitions` partitions, when fewer partitions
726
+ /// are requested. If a larger number of partitions is requested, it will stay at the current
727
+ /// number of partitions. Similar to coalesce defined on an `RDD`, this operation results in a
728
+ /// narrow dependency, e.g. if you go from 1000 partitions to 100 partitions, there will not be a
729
+ /// shuffle, instead each of the 100 new partitions will claim 10 of the current partitions.
730
+ /// - Parameter numPartitions: The number of partitions.
731
+ /// - Returns: A `DataFrame`.
732
+ public func coalesce( _ numPartitions: Int32 ) -> DataFrame {
733
+ return buildRepartition ( numPartitions: numPartitions, shuffle: false )
734
+ }
735
+
679
736
/// Returns a ``DataFrameWriter`` that can be used to write non-streaming data.
680
737
public var write : DataFrameWriter {
681
738
get {
0 commit comments