@@ -91,6 +91,7 @@ import Synchronization
 /// - ``show(_:_:_:)``
 ///
 /// ### Transformation Operations
+/// - ``toDF(_:)``
 /// - ``select(_:)``
 /// - ``selectExpr(_:)``
 /// - ``filter(_:)``
@@ -100,6 +101,9 @@ import Synchronization
 /// - ``limit(_:)``
 /// - ``offset(_:)``
 /// - ``drop(_:)``
+/// - ``dropDuplicates(_:)``
+/// - ``dropDuplicatesWithinWatermark(_:)``
+/// - ``distinct()``
 /// - ``withColumnRenamed(_:_:)``
 ///
 /// ### Join Operations
@@ -440,13 +444,25 @@ public actor DataFrame: Sendable {
     return DataFrame(spark: self.spark, plan: plan)
   }
 
-  /// Projects a set of expressions and returns a new ``DataFrame``.
+  /// Selects a subset of existing columns using column names.
   /// - Parameter cols: Column names
   /// - Returns: A ``DataFrame`` with a subset of columns.
   public func select(_ cols: String...) -> DataFrame {
     return DataFrame(spark: self.spark, plan: SparkConnectClient.getProject(self.plan.root, cols))
   }
 
+  /// Selects a subset of existing columns using column names.
+  /// - Parameter cols: Column names
+  /// - Returns: A ``DataFrame`` with a subset of columns.
+  public func toDF(_ cols: String...) -> DataFrame {
+    let df = if cols.isEmpty {
+      DataFrame(spark: self.spark, plan: self.plan)
+    } else {
+      DataFrame(spark: self.spark, plan: SparkConnectClient.getProject(self.plan.root, cols))
+    }
+    return df
+  }
+
   /// Projects a set of expressions and returns a new ``DataFrame``.
   /// - Parameter exprs: Expression strings
   /// - Returns: A ``DataFrame`` with a subset of columns.
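
A minimal sketch of how the new `toDF(_:)` reads at a call site, assuming the `SparkSession.builder.getOrCreate()` bootstrap and the `range(_:)`/`show()` helpers from this repository's README; the setup is an assumption, not part of this diff. Since `DataFrame` is an actor, cross-actor calls are awaited.

```swift
import SparkConnect

// Assumed session bootstrap, mirroring the README.
let spark = try await SparkSession.builder.getOrCreate()
let df = try await spark.range(3)  // one column, named "id"

// No arguments: a new DataFrame over the same plan.
let same = await df.toDF()

// With names: projects existing columns, like select.
let projected = await df.toDF("id")
try await projected.show()

await spark.stop()
```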
@@ -461,6 +477,24 @@ public actor DataFrame: Sendable {
     return DataFrame(spark: self.spark, plan: SparkConnectClient.getDrop(self.plan.root, cols))
   }
 
+  /// Returns a new ``DataFrame`` that contains only the unique rows from this ``DataFrame``.
+  /// This is an alias for `distinct`. If column names are given, Spark considers only those columns.
+  /// - Parameter cols: Column names
+  /// - Returns: A ``DataFrame``.
+  public func dropDuplicates(_ cols: String...) -> DataFrame {
+    let plan = SparkConnectClient.getDropDuplicates(self.plan.root, cols, withinWatermark: false)
+    return DataFrame(spark: self.spark, plan: plan)
+  }
+
+  /// Returns a new ``DataFrame`` with duplicate rows removed, within the watermark.
+  /// If column names are given, Spark considers only those columns.
+  /// - Parameter cols: Column names
+  /// - Returns: A ``DataFrame``.
+  public func dropDuplicatesWithinWatermark(_ cols: String...) -> DataFrame {
+    let plan = SparkConnectClient.getDropDuplicates(self.plan.root, cols, withinWatermark: true)
+    return DataFrame(spark: self.spark, plan: plan)
+  }
+
   /// Returns a new ``DataFrame`` with a column renamed. This is a no-op if the schema doesn't contain existingName.
   /// - Parameters:
   ///   - existingName: An existing column name to be renamed.
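
To make the dedup semantics concrete, a hedged call-site sketch: with no arguments every column participates in the comparison, while a column list restricts it. `dropDuplicatesWithinWatermark(_:)` has the same shape but is only meaningful on a streaming `DataFrame` with a watermark, so only the batch form is shown. The `spark.sql(_:)` bootstrap is an assumption borrowed from the README.

```swift
import SparkConnect

let spark = try await SparkSession.builder.getOrCreate()

// Hypothetical sample data: (1, 'a') appears twice.
let df = try await spark.sql(
  "SELECT * FROM VALUES (1, 'a'), (1, 'a'), (1, 'b') AS t(id, value)")

// All columns considered: (1, 'a') collapses to one row.
try await df.dropDuplicates().show()

// Only `id` considered: one arbitrary row kept per id.
try await df.dropDuplicates("id").show()

await spark.stop()
```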
@@ -1108,6 +1142,13 @@ public actor DataFrame: Sendable {
     return buildRepartition(numPartitions: numPartitions, shuffle: false)
   }
 
+  /// Returns a new ``DataFrame`` that contains only the unique rows from this ``DataFrame``.
+  /// This is an alias for `dropDuplicates`.
+  /// - Returns: A ``DataFrame``.
+  public func distinct() -> DataFrame {
+    return dropDuplicates()
+  }
+
   /// Groups the DataFrame using the specified columns.
   ///
   /// This method is used to perform aggregations on groups of data.
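
Because `distinct()` simply forwards to `dropDuplicates()`, the two spellings below should build the same deduplication plan; a small sketch under the same assumed session setup, using the `count()` action the client already exposes.

```swift
import SparkConnect

let spark = try await SparkSession.builder.getOrCreate()
let df = try await spark.sql("SELECT * FROM VALUES (1), (1), (2) AS t(id)")

// distinct() is an alias for dropDuplicates() over all columns.
let viaDistinct = try await df.distinct().count()              // 2
let viaDropDuplicates = try await df.dropDuplicates().count()  // 2
assert(viaDistinct == viaDropDuplicates)

await spark.stop()
```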