Skip to content

Commit 6139773

Browse files
committed
[SPARK-51784] Support xml in DataFrame(Reader/Writer)
### What changes were proposed in this pull request?

This PR aims to support the `xml` API in `DataFrameReader` and `DataFrameWriter`.

### Why are the changes needed?

The `xml` API was newly added in `Apache Spark 4.0.0`. We should support it for feature parity.

https://github.com/apache/spark/blob/e0801d9d8e33cd8835f3e3beed99a3588c16b776/sql/api/src/main/scala/org/apache/spark/sql/DataFrameReader.scala#L394-L403

```scala
/**
 * Loads a XML file and returns the result as a `DataFrame`. See the documentation on the other
 * overloaded `xml()` method for more details.
 *
 * since 4.0.0
 */
def xml(path: String): DataFrame = {
  // This method ensures that calls that explicit need single argument works, see SPARK-16009
  xml(Seq(path): _*)
}
```

### Does this PR introduce _any_ user-facing change?

No, this is a new addition.

### How was this patch tested?

Pass the CIs.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #53 from dongjoon-hyun/SPARK-51784.

Authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 00f2b47 commit 6139773

File tree

4 files changed

+43
-0
lines changed

4 files changed

+43
-0
lines changed

Sources/SparkConnect/DataFrameReader.swift

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,22 @@ public actor DataFrameReader: Sendable {
160160
return load(paths)
161161
}
162162

163+
/// Loads an XML file and returns the result as a `DataFrame`.
164+
/// - Parameter path: A path string
165+
/// - Returns: A `DataFrame`.
166+
public func xml(_ path: String) -> DataFrame {
167+
self.source = "xml"
168+
return load(path)
169+
}
170+
171+
/// Loads XML files and returns the result as a `DataFrame`.
172+
/// - Parameter paths: Path strings
173+
/// - Returns: A `DataFrame`.
174+
public func xml(_ paths: String...) -> DataFrame {
175+
self.source = "xml"
176+
return load(paths)
177+
}
178+
163179
/// Loads an ORC file and returns the result as a `DataFrame`.
164180
/// - Parameter path: A path string
165181
/// - Returns: A `DataFrame`.

Sources/SparkConnect/DataFrameWriter.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,14 @@ public actor DataFrameWriter: Sendable {
171171
return try await save(path)
172172
}
173173

174+
/// Saves the content of the `DataFrame` in XML format at the specified path.
175+
/// - Parameter path: A path string
176+
/// - Throws: Any error thrown while saving to `path`.
177+
public func xml(_ path: String) async throws {
178+
self.source = "xml"
179+
return try await save(path)
180+
}
181+
174182
/// Saves the content of the `DataFrame` in ORC format at the specified path.
175183
/// - Parameter path: A path string
176184
/// - Returns: A `DataFrame`.

Tests/SparkConnectTests/DataFrameReaderTests.swift

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,16 @@ struct DataFrameReaderTests {
4545
await spark.stop()
4646
}
4747

48+
@Test
49+
func xml() async throws {
50+
let spark = try await SparkSession.builder.getOrCreate()
51+
let path = "../examples/src/main/resources/people.xml"
52+
#expect(try await spark.read.option("rowTag", "person").format("xml").load(path).count() == 3)
53+
#expect(try await spark.read.option("rowTag", "person").xml(path).count() == 3)
54+
#expect(try await spark.read.option("rowTag", "person").xml(path, path).count() == 6)
55+
await spark.stop()
56+
}
57+
4858
@Test
4959
func orc() async throws {
5060
let spark = try await SparkSession.builder.getOrCreate()

Tests/SparkConnectTests/DataFrameWriterTests.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,15 @@ struct DataFrameWriterTests {
4343
await spark.stop()
4444
}
4545

46+
@Test
47+
func xml() async throws {
48+
let tmpDir = "/tmp/" + UUID().uuidString
49+
let spark = try await SparkSession.builder.getOrCreate()
50+
try await spark.range(2025).write.option("rowTag", "person").xml(tmpDir)
51+
#expect(try await spark.read.option("rowTag", "person").xml(tmpDir).count() == 2025)
52+
await spark.stop()
53+
}
54+
4655
@Test
4756
func orc() async throws {
4857
let tmpDir = "/tmp/" + UUID().uuidString

0 commit comments

Comments
 (0)