Skip to content

Commit 5684325

Browse files
committed
[SPARK-51996] Support describe and summary in DataFrame
### What changes were proposed in this pull request? This PR aims to support `describe` and `summary` API of `DataFrame`. ### Why are the changes needed? For feature parity. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CIs. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #112 from dongjoon-hyun/SPARK-51996. Authored-by: Dongjoon Hyun <[email protected]> Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent ccaa92b commit 5684325

File tree

3 files changed

+67
-0
lines changed

3 files changed

+67
-0
lines changed

Sources/SparkConnect/DataFrame.swift

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,10 @@ import Synchronization
164164
/// - ``sample(_:_:)``
165165
/// - ``sample(_:)``
166166
///
167+
/// ### Statistics
168+
/// - ``describe(_:)``
169+
/// - ``summary(_:)``
170+
///
167171
/// ### Utility Methods
168172
/// - ``isEmpty()``
169173
/// - ``isLocal()``
@@ -495,6 +499,25 @@ public actor DataFrame: Sendable {
495499
return DataFrame(spark: self.spark, plan: plan)
496500
}
497501

502+
/// Computes basic statistics for numeric and string columns: count, mean, stddev, min,
/// and max. When no column names are supplied, statistics are computed for every
/// numerical or string column.
/// - Parameter cols: Column names.
/// - Returns: A ``DataFrame`` containing basic statistics.
public func describe(_ cols: String...) -> DataFrame {
  let plan = SparkConnectClient.getDescribe(self.plan.root, cols)
  return DataFrame(spark: self.spark, plan: plan)
}
510+
511+
/// Computes the specified statistics for numeric and string columns. Available statistics are:
/// count, mean, stddev, min, max, arbitrary approximate percentiles specified as a percentage
/// (e.g. 75%), count_distinct, and approx_count_distinct. When no statistics are given, this
/// function computes count, mean, stddev, min, approximate quartiles (percentiles at 25%, 50%,
/// and 75%), and max.
/// - Parameter statistics: Statistics names.
/// - Returns: A ``DataFrame`` containing the specified statistics.
public func summary(_ statistics: String...) -> DataFrame {
  let plan = SparkConnectClient.getSummary(self.plan.root, statistics)
  return DataFrame(spark: self.spark, plan: plan)
}
520+
498521
/// Returns a new Dataset with a column renamed. This is a no-op if schema doesn't contain existingName.
499522
/// - Parameters:
500523
/// - existingName: An existing column name to be renamed.

Sources/SparkConnect/SparkConnectClient.swift

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,28 @@ public actor SparkConnectClient {
474474
return plan
475475
}
476476

477+
/// Builds a `Plan` whose root is a `StatDescribe` relation over `child`.
/// - Parameters:
///   - child: The input relation.
///   - cols: Column names to describe; an empty array requests all applicable columns.
/// - Returns: A `Plan` wrapping the describe relation.
static func getDescribe(_ child: Relation, _ cols: [String]) -> Plan {
  var statDescribe = Spark_Connect_StatDescribe()
  statDescribe.input = child
  statDescribe.cols = cols

  var rel = Relation()
  rel.describe = statDescribe

  var result = Plan()
  result.opType = .root(rel)
  return result
}
487+
488+
/// Builds a `Plan` whose root is a `StatSummary` relation over `child`.
/// - Parameters:
///   - child: The input relation.
///   - statistics: Statistic names to compute; an empty array requests the defaults.
/// - Returns: A `Plan` wrapping the summary relation.
static func getSummary(_ child: Relation, _ statistics: [String]) -> Plan {
  var statSummary = Spark_Connect_StatSummary()
  statSummary.input = child
  statSummary.statistics = statistics

  var rel = Relation()
  rel.summary = statSummary

  var result = Plan()
  result.opType = .root(rel)
  return result
}
498+
477499
static func getSort(_ child: Relation, _ cols: [String]) -> Plan {
478500
var sort = Sort()
479501
sort.input = child

Tests/SparkConnectTests/DataFrameTests.swift

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,28 @@ struct DataFrameTests {
682682
await spark.stop()
683683
}
684684

685+
@Test
func describe() async throws {
  let spark = try await SparkSession.builder.getOrCreate()
  // count, mean, stddev, min, max of range(10), rendered as strings.
  let answer = [Row("10"), Row("4.5"), Row("3.0276503540974917"), Row("0"), Row("9")]
  let df = try await spark.range(10)
  // With no arguments, all numerical/string columns are described; here that is `id` alone.
  #expect(try await df.describe().select("id").collect() == answer)
  #expect(try await df.describe("id").select("id").collect() == answer)
  await spark.stop()
}
694+
695+
@Test
func summary() async throws {
  let spark = try await SparkSession.builder.getOrCreate()
  // Default statistics: count, mean, stddev, min, 25%, 50%, 75%, max — as strings.
  let defaultStats = [
    Row("10"), Row("4.5"), Row("3.0276503540974917"),
    Row("0"), Row("2"), Row("4"), Row("7"), Row("9")
  ]
  #expect(try await spark.range(10).summary().select("id").collect() == defaultStats)
  // Explicit statistic names restrict the output rows.
  #expect(try await spark.range(10).summary("min", "max").select("id").collect() == [Row("0"), Row("9")])
  await spark.stop()
}
706+
685707
@Test
686708
func groupBy() async throws {
687709
let spark = try await SparkSession.builder.getOrCreate()

0 commit comments

Comments
 (0)