Skip to content

Commit 93060c1

Browse files
committed
[SPARK-52072] Add option() variants to SparkSession.Builder and Data(Frame|Stream)Reader
### What changes were proposed in this pull request? This PR aims to add more `option()` API variants to `SparkSession.Builder`, `DataFrameReader`, and `DataStreamReader`. While reviewing these files, I revised documentations too. ### Why are the changes needed? For feature parity. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CIs. ### Was this patch authored or co-authored using generative AI tooling? No. Closes apache#129 from dongjoon-hyun/SPARK-52072. Authored-by: Dongjoon Hyun <[email protected]> Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent ce65f4a commit 93060c1

File tree

3 files changed

+162
-84
lines changed

3 files changed

+162
-84
lines changed

Sources/SparkConnect/DataFrameReader.swift

Lines changed: 66 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
//
1919
import Foundation
2020

21-
/// An interface used to load a `DataFrame` from external storage systems
21+
/// An interface used to load a ``DataFrame`` from external storage systems
2222
/// (e.g. file systems, key-value stores, etc). Use `SparkSession.read` to access this.
2323
public actor DataFrameReader: Sendable {
2424
var source: String = ""
@@ -64,7 +64,7 @@ public actor DataFrameReader: Sendable {
6464

6565
/// Specifies the input data source format.
6666
/// - Parameter source: A string.
67-
/// - Returns: A `DataFrameReader`.
67+
/// - Returns: A ``DataFrameReader``.
6868
public func format(_ source: String) -> DataFrameReader {
6969
self.source = source
7070
return self
@@ -74,17 +74,53 @@ public actor DataFrameReader: Sendable {
7474
/// - Parameters:
7575
/// - key: A key string.
7676
/// - value: A value string.
77-
/// - Returns: A `DataFrameReader`.
77+
/// - Returns: A ``DataFrameReader``.
7878
public func option(_ key: String, _ value: String) -> DataFrameReader {
7979
self.extraOptions[key] = value
8080
return self
8181
}
8282

83+
/// Adds an input option for the underlying data source.
84+
/// - Parameters:
85+
/// - key: A key string.
86+
/// - value: A bool value.
87+
/// - Returns: A ``DataFrameReader``.
88+
public func option(_ key: String, _ value: Bool) -> DataFrameReader {
89+
return option(key, String(value))
90+
}
91+
92+
/// Adds an input option for the underlying data source.
93+
/// - Parameters:
94+
/// - key: A key string.
95+
/// - value: A `Int` value.
96+
/// - Returns: A ``DataFrameReader``.
97+
public func option(_ key: String, _ value: Int) -> DataFrameReader {
98+
return option(key, String(value))
99+
}
100+
101+
/// Adds an input option for the underlying data source.
102+
/// - Parameters:
103+
/// - key: A key string.
104+
/// - value: An `Int64` value.
105+
/// - Returns: A ``DataFrameReader``.
106+
public func option(_ key: String, _ value: Int64) -> DataFrameReader {
107+
return option(key, String(value))
108+
}
109+
110+
/// Adds an input option for the underlying data source.
111+
/// - Parameters:
112+
/// - key: A key string.
113+
/// - value: A `Double` value.
114+
/// - Returns: A ``DataFrameReader``.
115+
public func option(_ key: String, _ value: Double) -> DataFrameReader {
116+
return option(key, String(value))
117+
}
118+
83119
/// Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema
84120
/// automatically from data. By specifying the schema here, the underlying data source can skip
85121
/// the schema inference step, and thus speed up data loading.
86122
/// - Parameter schema: A DDL schema string.
87-
/// - Returns: A `DataFrameReader`.
123+
/// - Returns: A ``DataFrameReader``.
88124
@discardableResult
89125
public func schema(_ schema: String) async throws -> DataFrameReader {
90126
// Validate by parsing.
@@ -97,25 +133,25 @@ public actor DataFrameReader: Sendable {
97133
return self
98134
}
99135

100-
/// Loads input in as a `DataFrame`, for data sources that don't require a path (e.g. external
136+
/// Loads input in as a ``DataFrame``, for data sources that don't require a path (e.g. external
101137
/// key-value stores).
102-
/// - Returns: A `DataFrame`.
138+
/// - Returns: A ``DataFrame``.
103139
public func load() -> DataFrame {
104140
return load([])
105141
}
106142

107-
/// Loads input in as a `DataFrame`, for data sources that require a path (e.g. data backed by a
143+
/// Loads input in as a ``DataFrame``, for data sources that require a path (e.g. data backed by a
108144
/// local or distributed file system).
109145
/// - Parameter path: A path string.
110-
/// - Returns: A `DataFrame`.
146+
/// - Returns: A ``DataFrame``.
111147
public func load(_ path: String) -> DataFrame {
112148
return load([path])
113149
}
114150

115-
/// Loads input in as a `DataFrame`, for data sources that support multiple paths. Only works if
151+
/// Loads input in as a ``DataFrame``, for data sources that support multiple paths. Only works if
116152
/// the source is a HadoopFsRelationProvider.
117153
/// - Parameter paths: An array of path strings.
118-
/// - Returns: A `DataFrame`.
154+
/// - Returns: A ``DataFrame``.
119155
public func load(_ paths: [String]) -> DataFrame {
120156
self.paths = paths
121157

@@ -139,85 +175,85 @@ public actor DataFrameReader: Sendable {
139175
return DataFrame(spark: sparkSession, plan: plan)
140176
}
141177

142-
/// Loads a CSV file and returns the result as a `DataFrame`. See the documentation on the other
178+
/// Loads a CSV file and returns the result as a ``DataFrame``. See the documentation on the other
143179
/// overloaded `csv()` method for more details.
144180
/// - Parameter path: A path string
145-
/// - Returns: A `DataFrame`.
181+
/// - Returns: A ``DataFrame``.
146182
public func csv(_ path: String) -> DataFrame {
147183
self.source = "csv"
148184
return load(path)
149185
}
150186

151-
/// Loads CSV files and returns the result as a `DataFrame`.
187+
/// Loads CSV files and returns the result as a ``DataFrame``.
152188
/// This function will go through the input once to determine the input schema if `inferSchema`
153189
/// is enabled. To avoid going through the entire data once, disable `inferSchema` option or
154190
/// specify the schema explicitly using `schema`.
155191
/// - Parameter paths: Path strings.
156-
/// - Returns: A `DataFrame`.
192+
/// - Returns: A ``DataFrame``.
157193
public func csv(_ paths: String...) -> DataFrame {
158194
self.source = "csv"
159195
return load(paths)
160196
}
161197

162-
/// Loads a JSON file and returns the result as a `DataFrame`.
198+
/// Loads a JSON file and returns the result as a ``DataFrame``.
163199
/// - Parameter path: A path string
164-
/// - Returns: A `DataFrame`.
200+
/// - Returns: A ``DataFrame``.
165201
public func json(_ path: String) -> DataFrame {
166202
self.source = "json"
167203
return load(path)
168204
}
169205

170-
/// Loads JSON files and returns the result as a `DataFrame`.
206+
/// Loads JSON files and returns the result as a ``DataFrame``.
171207
/// - Parameter paths: Path strings
172-
/// - Returns: A `DataFrame`.
208+
/// - Returns: A ``DataFrame``.
173209
public func json(_ paths: String...) -> DataFrame {
174210
self.source = "json"
175211
return load(paths)
176212
}
177213

178-
/// Loads an XML file and returns the result as a `DataFrame`.
214+
/// Loads an XML file and returns the result as a ``DataFrame``.
179215
/// - Parameter path: A path string
180-
/// - Returns: A `DataFrame`.
216+
/// - Returns: A ``DataFrame``.
181217
public func xml(_ path: String) -> DataFrame {
182218
self.source = "xml"
183219
return load(path)
184220
}
185221

186-
/// Loads XML files and returns the result as a `DataFrame`.
222+
/// Loads XML files and returns the result as a ``DataFrame``.
187223
/// - Parameter paths: Path strings
188-
/// - Returns: A `DataFrame`.
224+
/// - Returns: A ``DataFrame``.
189225
public func xml(_ paths: String...) -> DataFrame {
190226
self.source = "xml"
191227
return load(paths)
192228
}
193229

194-
/// Loads an ORC file and returns the result as a `DataFrame`.
230+
/// Loads an ORC file and returns the result as a ``DataFrame``.
195231
/// - Parameter path: A path string
196-
/// - Returns: A `DataFrame`.
232+
/// - Returns: A ``DataFrame``.
197233
public func orc(_ path: String) -> DataFrame {
198234
self.source = "orc"
199235
return load(path)
200236
}
201237

202-
/// Loads ORC files and returns the result as a `DataFrame`.
238+
/// Loads ORC files and returns the result as a ``DataFrame``.
203239
/// - Parameter paths: Path strings
204-
/// - Returns: A `DataFrame`.
240+
/// - Returns: A ``DataFrame``.
205241
public func orc(_ paths: String...) -> DataFrame {
206242
self.source = "orc"
207243
return load(paths)
208244
}
209245

210-
/// Loads a Parquet file and returns the result as a `DataFrame`.
246+
/// Loads a Parquet file and returns the result as a ``DataFrame``.
211247
/// - Parameter path: A path string
212-
/// - Returns: A `DataFrame`.
248+
/// - Returns: A ``DataFrame``.
213249
public func parquet(_ path: String) -> DataFrame {
214250
self.source = "parquet"
215251
return load(path)
216252
}
217253

218-
/// Loads Parquet files, returning the result as a `DataFrame`.
254+
/// Loads Parquet files, returning the result as a ``DataFrame``.
219255
/// - Parameter paths: Path strings
220-
/// - Returns: A `DataFrame`.
256+
/// - Returns: A ``DataFrame``.
221257
public func parquet(_ paths: String...) -> DataFrame {
222258
self.source = "parquet"
223259
return load(paths)

Sources/SparkConnect/DataStreamReader.swift

Lines changed: 36 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
//
1919
import Foundation
2020

21-
/// An actor to load a streaming `Dataset` from external storage systems
21+
/// An actor to load a streaming ``DataFrame`` from external storage systems
2222
/// (e.g. file systems, key-value stores, etc). Use `SparkSession.readStream` to access this.
2323
public actor DataStreamReader: Sendable {
2424
var source: String = ""
@@ -47,7 +47,7 @@ public actor DataStreamReader: Sendable {
4747
/// automatically from data. By specifying the schema here, the underlying data source can skip
4848
/// the schema inference step, and thus speed up data loading.
4949
/// - Parameter schema: A DDL schema string.
50-
/// - Returns: A `DataStreamReader`.
50+
/// - Returns: A ``DataStreamReader``.
5151
@discardableResult
5252
public func schema(_ schema: String) async throws -> DataStreamReader {
5353
// Validate by parsing.
@@ -64,7 +64,7 @@ public actor DataStreamReader: Sendable {
6464
/// - Parameters:
6565
/// - key: A key string.
6666
/// - value: A value string.
67-
/// - Returns: A `DataStreamReader`.
67+
/// - Returns: A ``DataStreamReader``.
6868
public func option(_ key: String, _ value: String) -> DataStreamReader {
6969
self.extraOptions[key] = value
7070
return self
@@ -74,53 +74,59 @@ public actor DataStreamReader: Sendable {
7474
/// - Parameters:
7575
/// - key: A key string.
7676
/// - value: A `Bool` value.
77-
/// - Returns: A `DataStreamReader`.
77+
/// - Returns: A ``DataStreamReader``.
7878
public func option(_ key: String, _ value: Bool) -> DataStreamReader {
79-
self.extraOptions[key] = String(value)
80-
return self
79+
return option(key, String(value))
80+
}
81+
82+
/// Adds an input option for the underlying data source.
83+
/// - Parameters:
84+
/// - key: A key string.
85+
/// - value: An `Int` value.
86+
/// - Returns: A ``DataStreamReader``.
87+
public func option(_ key: String, _ value: Int) -> DataStreamReader {
88+
return option(key, String(value))
8189
}
8290

8391
/// Adds an input option for the underlying data source.
8492
/// - Parameters:
8593
/// - key: A key string.
8694
/// - value: An `Int64` value.
87-
/// - Returns: A `DataStreamReader`.
95+
/// - Returns: A ``DataStreamReader``.
8896
public func option(_ key: String, _ value: Int64) -> DataStreamReader {
89-
self.extraOptions[key] = String(value)
90-
return self
97+
return option(key, String(value))
9198
}
9299

93100
/// Adds an input option for the underlying data source.
94101
/// - Parameters:
95102
/// - key: A key string.
96103
/// - value: A `Double` value.
97-
/// - Returns: A `DataStreamReader`.
104+
/// - Returns: A ``DataStreamReader``.
98105
public func option(_ key: String, _ value: Double) -> DataStreamReader {
99-
self.extraOptions[key] = String(value)
100-
return self
106+
return option(key, String(value))
101107
}
102108

103109
/// Adds input options for the underlying data source.
104110
/// - Parameter options: A string-string dictionary.
105-
/// - Returns: A `DataStreamReader`.
111+
/// - Returns: A ``DataStreamReader``.
106112
public func options(_ options: [String: String]) -> DataStreamReader {
107113
for (key, value) in options {
108114
self.extraOptions[key] = value
109115
}
110116
return self
111117
}
112118

113-
/// Loads input data stream in as a `DataFrame`, for data streams that don't require a path
119+
/// Loads input data stream in as a ``DataFrame``, for data streams that don't require a path
114120
/// (e.g. external key-value stores).
115-
/// - Returns: A `DataFrame`.
121+
/// - Returns: A ``DataFrame``.
116122
public func load() -> DataFrame {
117123
return load([])
118124
}
119125

120-
/// Loads input data stream in as a `DataFrame`, for data streams that require a path
126+
/// Loads input data stream in as a ``DataFrame``, for data streams that require a path
121127
/// (e.g. data backed by a local or distributed file system).
122128
/// - Parameter path: A path string.
123-
/// - Returns: A `DataFrame`.
129+
/// - Returns: A ``DataFrame``.
124130
public func load(_ path: String) -> DataFrame {
125131
return load([path])
126132
}
@@ -149,7 +155,7 @@ public actor DataStreamReader: Sendable {
149155
return DataFrame(spark: sparkSession, plan: plan)
150156
}
151157

152-
/// Define a Streaming DataFrame on a Table. The DataSource corresponding to the table should
158+
/// Define a Streaming ``DataFrame`` on a Table. The DataSource corresponding to the table should
153159
/// support streaming mode.
154160
/// - Parameter tableName: The name of the table.
155161
/// - Returns: A ``DataFrame``.
@@ -171,49 +177,49 @@ public actor DataStreamReader: Sendable {
171177
return DataFrame(spark: sparkSession, plan: plan)
172178
}
173179

174-
/// Loads a text file stream and returns the result as a `DataFrame`.
180+
/// Loads a text file stream and returns the result as a ``DataFrame``.
175181
/// - Parameter path: A path string
176-
/// - Returns: A `DataFrame`.
182+
/// - Returns: A ``DataFrame``.
177183
public func text(_ path: String) -> DataFrame {
178184
self.source = "text"
179185
return load(path)
180186
}
181187

182-
/// Loads a CSV file stream and returns the result as a `DataFrame`.
188+
/// Loads a CSV file stream and returns the result as a ``DataFrame``.
183189
/// - Parameter path: A path string
184-
/// - Returns: A `DataFrame`.
190+
/// - Returns: A ``DataFrame``.
185191
public func csv(_ path: String) -> DataFrame {
186192
self.source = "csv"
187193
return load(path)
188194
}
189195

190-
/// Loads a JSON file stream and returns the result as a `DataFrame`.
196+
/// Loads a JSON file stream and returns the result as a ``DataFrame``.
191197
/// - Parameter path: A path string
192-
/// - Returns: A `DataFrame`.
198+
/// - Returns: A ``DataFrame``.
193199
public func json(_ path: String) -> DataFrame {
194200
self.source = "json"
195201
return load(path)
196202
}
197203

198-
/// Loads an XML file stream and returns the result as a `DataFrame`.
204+
/// Loads an XML file stream and returns the result as a ``DataFrame``.
199205
/// - Parameter path: A path string
200-
/// - Returns: A `DataFrame`.
206+
/// - Returns: A ``DataFrame``.
201207
public func xml(_ path: String) -> DataFrame {
202208
self.source = "xml"
203209
return load(path)
204210
}
205211

206-
/// Loads an ORC file stream and returns the result as a `DataFrame`.
212+
/// Loads an ORC file stream and returns the result as a ``DataFrame``.
207213
/// - Parameter path: A path string
208-
/// - Returns: A `DataFrame`.
214+
/// - Returns: A ``DataFrame``.
209215
public func orc(_ path: String) -> DataFrame {
210216
self.source = "orc"
211217
return load(path)
212218
}
213219

214-
/// Loads a Parquet file stream and returns the result as a `DataFrame`.
220+
/// Loads a Parquet file stream and returns the result as a ``DataFrame``.
215221
/// - Parameter path: A path string
216-
/// - Returns: A `DataFrame`.
222+
/// - Returns: A ``DataFrame``.
217223
public func parquet(_ path: String) -> DataFrame {
218224
self.source = "parquet"
219225
return load(path)

0 commit comments

Comments
 (0)