@@ -91,6 +91,7 @@ import Synchronization
/// - ``show(_:_:_:)``
///
/// ### Transformation Operations
/// - ``toDF(_:)``
/// - ``select(_:)``
/// - ``selectExpr(_:)``
/// - ``filter(_:)``
@@ -100,6 +101,9 @@ import Synchronization
/// - ``limit(_:)``
/// - ``offset(_:)``
/// - ``drop(_:)``
/// - ``dropDuplicates(_:)``
/// - ``dropDuplicatesWithinWatermark(_:)``
/// - ``distinct()``
/// - ``withColumnRenamed(_:_:)``
///
/// ### Join Operations
@@ -440,13 +444,25 @@ public actor DataFrame: Sendable {
440444 return DataFrame ( spark: self . spark, plan: plan)
441445 }
442446
443- /// Projects a set of expressions and returns a new ``DataFrame`` .
447+ /// Selects a subset of existing columns using column names .
444448 /// - Parameter cols: Column names
445449 /// - Returns: A ``DataFrame`` with subset of columns.
446450 public func select( _ cols: String ... ) -> DataFrame {
447451 return DataFrame ( spark: self . spark, plan: SparkConnectClient . getProject ( self . plan. root, cols) )
448452 }
449453
454+ /// Selects a subset of existing columns using column names.
455+ /// - Parameter cols: Column names
456+ /// - Returns: A ``DataFrame`` with subset of columns.
457+ public func toDF( _ cols: String ... ) -> DataFrame {
458+ let df = if cols. isEmpty {
459+ DataFrame ( spark: self . spark, plan: self . plan)
460+ } else {
461+ DataFrame ( spark: self . spark, plan: SparkConnectClient . getProject ( self . plan. root, cols) )
462+ }
463+ return df
464+ }
465+
450466 /// Projects a set of expressions and returns a new ``DataFrame``.
451467 /// - Parameter exprs: Expression strings
452468 /// - Returns: A ``DataFrame`` with subset of columns.
@@ -461,6 +477,24 @@ public actor DataFrame: Sendable {
461477 return DataFrame ( spark: self . spark, plan: SparkConnectClient . getDrop ( self . plan. root, cols) )
462478 }
463479
480+ /// Returns a new ``DataFrame`` that contains only the unique rows from this ``DataFrame``.
481+ /// This is an alias for `distinct`. If column names are given, Spark considers only those columns.
482+ /// - Parameter cols: Column names
483+ /// - Returns: A ``DataFrame``.
484+ public func dropDuplicates( _ cols: String ... ) -> DataFrame {
485+ let plan = SparkConnectClient . getDropDuplicates ( self . plan. root, cols, withinWatermark: false )
486+ return DataFrame ( spark: self . spark, plan: plan)
487+ }
488+
489+ /// Returns a new Dataset with duplicates rows removed, within watermark.
490+ /// If column names are given, Spark considers only those columns.
491+ /// - Parameter cols: Column names
492+ /// - Returns: A ``DataFrame``.
493+ public func dropDuplicatesWithinWatermark( _ cols: String ... ) -> DataFrame {
494+ let plan = SparkConnectClient . getDropDuplicates ( self . plan. root, cols, withinWatermark: true )
495+ return DataFrame ( spark: self . spark, plan: plan)
496+ }
497+
464498 /// Returns a new Dataset with a column renamed. This is a no-op if schema doesn't contain existingName.
465499 /// - Parameters:
466500 /// - existingName: A existing column name to be renamed.
@@ -1108,6 +1142,13 @@ public actor DataFrame: Sendable {
11081142 return buildRepartition ( numPartitions: numPartitions, shuffle: false )
11091143 }
11101144
1145+ /// Returns a new ``Dataset`` that contains only the unique rows from this ``Dataset``.
1146+ /// This is an alias for `dropDuplicates`.
1147+ /// - Returns: A `DataFrame`.
1148+ public func distinct( ) -> DataFrame {
1149+ return dropDuplicates ( )
1150+ }
1151+
11111152 /// Groups the DataFrame using the specified columns.
11121153 ///
11131154 /// This method is used to perform aggregations on groups of data.
0 commit comments