Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions Sources/SparkConnect/DataFrame.swift
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,10 @@ import Synchronization
/// - ``sample(_:_:)``
/// - ``sample(_:)``
///
/// ### Statistics
/// - ``describe(_:)``
/// - ``summary(_:)``
///
/// ### Utility Methods
/// - ``isEmpty()``
/// - ``isLocal()``
Expand Down Expand Up @@ -495,6 +499,25 @@ public actor DataFrame: Sendable {
return DataFrame(spark: self.spark, plan: plan)
}

/// Computes basic statistics (count, mean, stddev, min, and max) for numeric and string columns.
///
/// When no column names are passed, statistics are computed for all numerical or string columns.
/// - Parameter cols: Column names.
/// - Returns: A ``DataFrame`` containing basic statistics.
public func describe(_ cols: String...) -> DataFrame {
  // Wrap the current root relation in a describe plan, then expose it as a new DataFrame.
  let describePlan = SparkConnectClient.getDescribe(self.plan.root, cols)
  return DataFrame(spark: self.spark, plan: describePlan)
}

/// Computes the specified statistics for numeric and string columns.
///
/// Available statistics are: count, mean, stddev, min, max, arbitrary approximate percentiles
/// specified as a percentage (e.g. 75%), count_distinct, and approx_count_distinct.
/// If no statistics are given, this function computes count, mean, stddev, min,
/// approximate quartiles (percentiles at 25%, 50%, and 75%), and max.
/// - Parameter statistics: Statistics names.
/// - Returns: A ``DataFrame`` containing specified statistics.
public func summary(_ statistics: String...) -> DataFrame {
  // Wrap the current root relation in a summary plan, then expose it as a new DataFrame.
  let summaryPlan = SparkConnectClient.getSummary(self.plan.root, statistics)
  return DataFrame(spark: self.spark, plan: summaryPlan)
}

/// Returns a new Dataset with a column renamed. This is a no-op if schema doesn't contain existingName.
/// - Parameters:
/// - existingName: An existing column name to be renamed.
Expand Down
22 changes: 22 additions & 0 deletions Sources/SparkConnect/SparkConnectClient.swift
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,28 @@ public actor SparkConnectClient {
return plan
}

/// Builds a ``Plan`` whose root relation is a `StatDescribe` over the given child relation.
/// - Parameters:
///   - child: The input relation to describe.
///   - cols: Column names forwarded to the describe relation.
/// - Returns: A plan rooted at the describe relation.
static func getDescribe(_ child: Relation, _ cols: [String]) -> Plan {
  var statDescribe = Spark_Connect_StatDescribe()
  statDescribe.cols = cols
  statDescribe.input = child

  var rootRelation = Relation()
  rootRelation.describe = statDescribe

  var result = Plan()
  result.opType = .root(rootRelation)
  return result
}

/// Builds a ``Plan`` whose root relation is a `StatSummary` over the given child relation.
/// - Parameters:
///   - child: The input relation to summarize.
///   - statistics: Statistic names forwarded to the summary relation.
/// - Returns: A plan rooted at the summary relation.
static func getSummary(_ child: Relation, _ statistics: [String]) -> Plan {
  var statSummary = Spark_Connect_StatSummary()
  statSummary.statistics = statistics
  statSummary.input = child

  var rootRelation = Relation()
  rootRelation.summary = statSummary

  var result = Plan()
  result.opType = .root(rootRelation)
  return result
}

static func getSort(_ child: Relation, _ cols: [String]) -> Plan {
var sort = Sort()
sort.input = child
Expand Down
22 changes: 22 additions & 0 deletions Tests/SparkConnectTests/DataFrameTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,28 @@ struct DataFrameTests {
await spark.stop()
}

/// Checks `describe` on the `id` column of `range(10)`, both with no arguments and with
/// the column named explicitly.
@Test
func describe() async throws {
  let session = try await SparkSession.builder.getOrCreate()
  let numbers = try await session.range(10)
  // One row per statistic reported for `id`.
  let expectedStats = [
    Row("10"),
    Row("4.5"),
    Row("3.0276503540974917"),
    Row("0"),
    Row("9"),
  ]
  #expect(try await numbers.describe().select("id").collect() == expectedStats)
  #expect(try await numbers.describe("id").select("id").collect() == expectedStats)
  await session.stop()
}

/// Checks `summary` on the `id` column of `range(10)`, first with the default statistics
/// and then with only `min` and `max` requested.
@Test
func summary() async throws {
  let session = try await SparkSession.builder.getOrCreate()
  // One row per statistic reported for `id` when no statistics are named.
  let defaultStats = [
    Row("10"),
    Row("4.5"),
    Row("3.0276503540974917"),
    Row("0"),
    Row("2"),
    Row("4"),
    Row("7"),
    Row("9"),
  ]
  let minAndMax = [Row("0"), Row("9")]
  #expect(try await session.range(10).summary().select("id").collect() == defaultStats)
  #expect(try await session.range(10).summary("min", "max").select("id").collect() == minAndMax)
  await session.stop()
}

@Test
func groupBy() async throws {
let spark = try await SparkSession.builder.getOrCreate()
Expand Down
Loading