@@ -34,6 +34,8 @@ public actor DataFrameReader: Sendable {
34
34
35
35
var extraOptions : CaseInsensitiveDictionary = CaseInsensitiveDictionary ( [ : ] )
36
36
37
+ var userSpecifiedSchemaDDL : String ? = nil
38
+
37
39
let sparkSession : SparkSession
38
40
39
41
init ( sparkSession: SparkSession ) {
@@ -85,6 +87,22 @@ public actor DataFrameReader: Sendable {
85
87
return self
86
88
}
87
89
90
+ /// Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema
91
+ /// automatically from data. By specifying the schema here, the underlying data source can skip
92
+ /// the schema inference step, and thus speed up data loading.
93
+ /// - Parameter schema: A DDL schema string.
94
+ /// - Returns: A `DataFrameReader`.
95
+ public func schema( _ schema: String ) async throws -> DataFrameReader {
96
+ // Validate by parsing.
97
+ do {
98
+ _ = try await sparkSession. client. ddlParse ( schema)
99
+ } catch {
100
+ throw SparkConnectError . InvalidTypeException
101
+ }
102
+ self . userSpecifiedSchemaDDL = schema
103
+ return self
104
+ }
105
+
88
106
/// Loads input in as a `DataFrame`, for data sources that don't require a path (e.g. external
89
107
/// key-value stores).
90
108
/// - Returns: A `DataFrame`.
@@ -111,6 +129,9 @@ public actor DataFrameReader: Sendable {
111
129
dataSource. format = self . source
112
130
dataSource. paths = self . paths
113
131
dataSource. options = self . extraOptions. toStringDictionary ( )
132
+ if let userSpecifiedSchemaDDL = self . userSpecifiedSchemaDDL {
133
+ dataSource. schema = userSpecifiedSchemaDDL
134
+ }
114
135
115
136
var read = Read ( )
116
137
read. dataSource = dataSource
0 commit comments