Skip to content

Commit 476d1c0

Browse files
mgrazianocMarcokou
authored
feat: Add support for Timestamp data type (#33)
### Rationale for this change Currently, the Swift implementation of Arrow does not support the Timestamp data type, although it is available in the base C interface. This PR adds that support by following the design pattern already used for the existing types. ### What changes are included in this PR? 1. `TimestampArray` with some basic formatting utilities 2. `TimestampArrayBuilder` 3. `Timestamp` alias 4. `ArrowTimestampUnit`, which covers all unit variants (seconds, milliseconds, microseconds and nanoseconds) 5. `ArrowTypeTimestamp`, derived from the base `ArrowType` 6. `ArrowType` support for timestamp 7. `ArrowWriterHelper` support for timestamp 8. `fromProto` support for timestamp The implementation properly handles the presence or absence of `timezone`. ### Are these changes tested? Tests are included in both `ArrayTests.swift` and `CDataTests.swift`. ### Are there any user-facing changes? Yes - users can now work with Timestamp data types in Swift Arrow implementations. This is additive and doesn't break existing functionality. Closes #32. --------- Co-authored-by: Marco <[email protected]> Co-authored-by: Sutou Kouhei <[email protected]>
1 parent a4eb49e commit 476d1c0

File tree

8 files changed

+320
-2
lines changed

8 files changed

+320
-2
lines changed

Arrow/Sources/Arrow/ArrowArray.swift

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ public class ArrowArrayHolderImpl: ArrowArrayHolder {
105105
return try ArrowArrayHolderImpl(Time32Array(with))
106106
case .time64:
107107
return try ArrowArrayHolderImpl(Time64Array(with))
108+
case .timestamp:
109+
return try ArrowArrayHolderImpl(TimestampArray(with))
108110
case .string:
109111
return try ArrowArrayHolderImpl(StringArray(with))
110112
case .boolean:
@@ -233,6 +235,84 @@ public class Date64Array: ArrowArray<Date> {
233235
/// Fixed-width array whose elements are `Time32` values; adds no behavior of its own on top of `FixedArray`.
public class Time32Array: FixedArray<Time32> {}
234236
/// Fixed-width array whose elements are `Time64` values; adds no behavior of its own on top of `FixedArray`.
public class Time64Array: FixedArray<Time64> {}
235237

238+
/// Fixed-width array whose elements are `Timestamp` values.
///
/// In addition to the element access inherited from `FixedArray`, an element can be
/// rendered as a date string through a `DateFormatter` configured by `FormattingOptions`.
public class TimestampArray: FixedArray<Timestamp> {

    /// Settings consumed by `formattedDate(at:options:)`.
    public struct FormattingOptions: Equatable {
        /// Pattern assigned to `DateFormatter.dateFormat`.
        public var dateFormat: String = "yyyy-MM-dd HH:mm:ss.SSS"
        /// Locale assigned to the formatter.
        public var locale: Locale = .current
        /// When `true` and the array's type carries a timezone, the formatter's
        /// time zone is set from that timezone identifier.
        public var includeTimezone: Bool = true
        /// When `true`, an array whose type is not `ArrowTypeTimestamp` is rendered
        /// as the raw integer value instead of producing `nil`.
        public var fallbackToRaw: Bool = true

        public init(dateFormat: String = "yyyy-MM-dd HH:mm:ss.SSS",
                    locale: Locale = .current,
                    includeTimezone: Bool = true,
                    fallbackToRaw: Bool = true) {
            self.dateFormat = dateFormat
            self.locale = locale
            self.includeTimezone = includeTimezone
            self.fallbackToRaw = fallbackToRaw
        }

        /// Two option sets are equal when every field matches; locales are
        /// compared by their `identifier`.
        public static func == (lhs: FormattingOptions, rhs: FormattingOptions) -> Bool {
            guard lhs.dateFormat == rhs.dateFormat else { return false }
            guard lhs.locale.identifier == rhs.locale.identifier else { return false }
            guard lhs.includeTimezone == rhs.includeTimezone else { return false }
            return lhs.fallbackToRaw == rhs.fallbackToRaw
        }
    }

    // Most recently built formatter and the options it was built for.
    private var cachedFormatter: DateFormatter?
    private var cachedOptions: FormattingOptions?

    /// Renders the element at `index` as a date string.
    ///
    /// - Parameters:
    ///   - index: Position of the element.
    ///   - options: Formatting settings; defaults to `FormattingOptions()`.
    /// - Returns: `nil` when the element is `nil`. When the array's type is not an
    ///   `ArrowTypeTimestamp`, the raw value as a string if `options.fallbackToRaw`
    ///   is set, otherwise `nil`. In all other cases the formatted date.
    public func formattedDate(at index: UInt, options: FormattingOptions = FormattingOptions()) -> String? {
        guard let rawValue = self[index] else { return nil }

        guard let timestampType = self.arrowData.type as? ArrowTypeTimestamp else {
            return options.fallbackToRaw ? "\(rawValue)" : nil
        }

        let date = dateFromTimestamp(rawValue, unit: timestampType.unit)
        let formatter = resolveFormatter(for: options, timezone: timestampType.timezone)
        return formatter.string(from: date)
    }

    /// Returns the cached formatter when it was built for `options`; otherwise
    /// builds a new one, stores it together with `options`, and returns it.
    private func resolveFormatter(for options: FormattingOptions, timezone: String?) -> DateFormatter {
        if let reusable = cachedFormatter, cachedOptions == options {
            return reusable
        }

        let fresh = DateFormatter()
        fresh.dateFormat = options.dateFormat
        fresh.locale = options.locale
        if options.includeTimezone, let identifier = timezone {
            fresh.timeZone = TimeZone(identifier: identifier)
        }
        cachedFormatter = fresh
        cachedOptions = options
        return fresh
    }

    /// Converts a raw timestamp expressed in `unit` into a `Date` measured from
    /// the 1970 reference date.
    private func dateFromTimestamp(_ timestamp: Int64, unit: ArrowTimestampUnit) -> Date {
        // Number of `unit` ticks that make up one second.
        let ticksPerSecond: TimeInterval
        switch unit {
        case .seconds: ticksPerSecond = 1
        case .milliseconds: ticksPerSecond = 1_000
        case .microseconds: ticksPerSecond = 1_000_000
        case .nanoseconds: ticksPerSecond = 1_000_000_000
        }

        return Date(timeIntervalSince1970: TimeInterval(timestamp) / ticksPerSecond)
    }

    /// String form of the element at `index`: the formatted date when one is
    /// available, otherwise whatever the superclass produces.
    public override func asString(_ index: UInt) -> String {
        guard let rendered = formattedDate(at: index) else {
            return super.asString(index)
        }
        return rendered
    }
}
315+
236316
public class BinaryArray: ArrowArray<Data> {
237317
public struct Options {
238318
public var printAsHex = false

Arrow/Sources/Arrow/ArrowArrayBuilder.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,12 @@ public class Time64ArrayBuilder: ArrowArrayBuilder<FixedBufferBuilder<Time64>, T
119119
}
120120
}
121121

122+
/// Builder producing `TimestampArray` values from a fixed-width `Int64` buffer builder.
public class TimestampArrayBuilder: ArrowArrayBuilder<FixedBufferBuilder<Int64>, TimestampArray> {
    /// Creates a builder whose Arrow type is a timestamp with the given unit
    /// and optional timezone.
    fileprivate convenience init(_ unit: ArrowTimestampUnit, timezone: String? = nil) throws {
        let timestampType = ArrowTypeTimestamp(unit, timezone: timezone)
        try self.init(timestampType)
    }
}
127+
122128
public class StructArrayBuilder: ArrowArrayBuilder<StructBufferBuilder, StructArray> {
123129
let builders: [any ArrowArrayHolderBuilder]
124130
let fields: [ArrowField]
@@ -279,6 +285,11 @@ public class ArrowArrayBuilders {
279285
throw ArrowError.invalid("Expected arrow type for \(arrowType.id) not found")
280286
}
281287
return try Time64ArrayBuilder(timeType.unit)
288+
case .timestamp:
    guard let timestampType = arrowType as? ArrowTypeTimestamp else {
        throw ArrowError.invalid("Expected arrow type for \(arrowType.id) not found")
    }
    // Forward the timezone together with the unit; passing only the unit would
    // silently turn a timezone-aware type into a timezone-less builder.
    return try TimestampArrayBuilder(timestampType.unit, timezone: timestampType.timezone)
282293
default:
283294
throw ArrowError.unknownType("Builder not found for arrow type: \(arrowType.id)")
284295
}
@@ -338,4 +349,8 @@ public class ArrowArrayBuilders {
338349
/// Creates a `Time64ArrayBuilder` for the given unit.
///
/// - Throws: Whatever the `Time64ArrayBuilder` initializer throws.
public static func loadTime64ArrayBuilder(_ unit: ArrowTime64Unit) throws -> Time64ArrayBuilder {
    try Time64ArrayBuilder(unit)
}
352+
353+
/// Creates a `TimestampArrayBuilder` for the given unit and optional timezone.
///
/// - Throws: Whatever the `TimestampArrayBuilder` initializer throws.
public static func loadTimestampArrayBuilder(
    _ unit: ArrowTimestampUnit,
    timezone: String? = nil
) throws -> TimestampArrayBuilder {
    try TimestampArrayBuilder(unit, timezone: timezone)
}
341356
}

Arrow/Sources/Arrow/ArrowReaderHelper.swift

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,24 @@ private func makeTimeHolder(_ field: ArrowField,
9090
}
9191
}
9292

93+
/// Wraps the given buffers in a `TimestampArray` and returns it inside an array holder.
///
/// - Parameters:
///   - field: Field whose `type` must be an `ArrowTypeTimestamp`.
///   - buffers: Buffers handed to `ArrowData`.
///   - nullCount: Null count handed to `ArrowData`.
/// - Returns: `.success` with the holder, or `.failure` when the field type is not
///   a timestamp type or when building the data/array throws.
private func makeTimestampHolder(_ field: ArrowField,
                                 buffers: [ArrowBuffer],
                                 nullCount: UInt
) -> Result<ArrowArrayHolder, ArrowError> {
    guard let timestampType = field.type as? ArrowTypeTimestamp else {
        return .failure(.invalid("Incorrect field type for timestamp: \(field.type)"))
    }

    do {
        let data = try ArrowData(timestampType, buffers: buffers, nullCount: nullCount)
        let array = try TimestampArray(data)
        return .success(ArrowArrayHolderImpl(array))
    } catch let arrowError as ArrowError {
        return .failure(arrowError)
    } catch {
        // Anything that is not an ArrowError is reported through its description.
        return .failure(.unknownError("\(error)"))
    }
}
110+
93111
private func makeBoolHolder(_ buffers: [ArrowBuffer],
94112
nullCount: UInt) -> Result<ArrowArrayHolder, ArrowError> {
95113
do {
@@ -186,6 +204,8 @@ func makeArrayHolder( // swiftlint:disable:this cyclomatic_complexity
186204
return makeDateHolder(field, buffers: buffers, nullCount: nullCount)
187205
case .time32, .time64:
188206
return makeTimeHolder(field, buffers: buffers, nullCount: nullCount)
207+
case .timestamp:
208+
return makeTimestampHolder(field, buffers: buffers, nullCount: nullCount)
189209
case .strct:
190210
return makeStructHolder(field, buffers: buffers, nullCount: nullCount, children: children!, rbLength: rbLength)
191211
default:
@@ -203,7 +223,7 @@ func makeBuffer(_ buffer: org_apache_arrow_flatbuf_Buffer, fileData: Data,
203223

204224
func isFixedPrimitive(_ type: org_apache_arrow_flatbuf_Type_) -> Bool {
205225
switch type {
206-
case .int, .bool, .floatingpoint, .date, .time:
226+
case .int, .bool, .floatingpoint, .date, .time, .timestamp:
207227
return true
208228
default:
209229
return false
@@ -261,6 +281,22 @@ func findArrowType( // swiftlint:disable:this cyclomatic_complexity function_bod
261281
}
262282

263283
return ArrowTypeTime64(timeType.unit == .microsecond ? .microseconds : .nanoseconds)
284+
case .timestamp:
    let timestampType = field.type(type: org_apache_arrow_flatbuf_Timestamp.self)!
    let arrowUnit: ArrowTimestampUnit
    switch timestampType.unit {
    case .second:
        arrowUnit = .seconds
    case .millisecond:
        arrowUnit = .milliseconds
    case .microsecond:
        arrowUnit = .microseconds
    case .nanosecond:
        arrowUnit = .nanoseconds
    }

    // Treat an empty timezone string as "no timezone", the same way `fromProto`
    // does, so both readers produce the same type for the same schema.
    let timezone = timestampType.timezone
    return ArrowTypeTimestamp(arrowUnit, timezone: timezone?.isEmpty == true ? nil : timezone)
264300
case .struct_:
265301
_ = field.type(type: org_apache_arrow_flatbuf_Struct_.self)!
266302
var fields = [ArrowField]()

Arrow/Sources/Arrow/ArrowType.swift

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ public typealias Time32 = Int32
2121
public typealias Time64 = Int64
2222
public typealias Date32 = Int32
2323
public typealias Date64 = Int64
24+
public typealias Timestamp = Int64
2425

2526
func FlatBuffersVersion_23_1_4() { // swiftlint:disable:this identifier_name
2627
}
@@ -65,6 +66,7 @@ public enum ArrowTypeId {
6566
case strct
6667
case time32
6768
case time64
69+
case timestamp
6870
case time
6971
case uint16
7072
case uint32
@@ -122,6 +124,47 @@ public class ArrowTypeTime64: ArrowType {
122124
}
123125
}
124126

127+
/// Unit in which a timestamp's integer value is expressed.
public enum ArrowTimestampUnit {
    case seconds, milliseconds, microseconds, nanoseconds
}
133+
134+
/// Arrow type for timestamp columns: a unit plus an optional timezone string.
public class ArrowTypeTimestamp: ArrowType {
    /// Unit in which the stored integer values are expressed.
    let unit: ArrowTimestampUnit
    /// Timezone attached to the type, or `nil` when there is none.
    let timezone: String?

    /// Creates a timestamp type.
    ///
    /// - Parameters:
    ///   - unit: Unit of the stored values.
    ///   - timezone: Optional timezone string; defaults to `nil`.
    public init(_ unit: ArrowTimestampUnit, timezone: String? = nil) {
        self.unit = unit
        self.timezone = timezone

        super.init(ArrowType.ArrowTimestamp)
    }

    /// Creates a millisecond timestamp type without a timezone.
    ///
    /// NOTE(review): the `type` argument is not read; the result is always
    /// milliseconds / no timezone regardless of the id passed in.
    public convenience init(type: ArrowTypeId) {
        self.init(.milliseconds, timezone: nil)
    }

    /// Format string for the Arrow C data interface: `ts<unit>:<timezone>`,
    /// where `<unit>` is `s`, `m`, `u` or `n`.
    public override var cDataFormatId: String {
        get throws {
            let unitChar: String
            switch self.unit {
            case .seconds: unitChar = "s"
            case .milliseconds: unitChar = "m"
            case .microseconds: unitChar = "u"
            case .nanoseconds: unitChar = "n"
            }

            // The C data interface requires the colon even when the timezone is
            // empty ("tsu:"); importers such as Arrow C++ reject the colon-less form.
            return "ts\(unitChar):\(self.timezone ?? "")"
        }
    }
}
167+
125168
public class ArrowNestedType: ArrowType {
126169
let fields: [ArrowField]
127170
public init(_ info: ArrowType.Info, fields: [ArrowField]) {
@@ -150,6 +193,7 @@ public class ArrowType {
150193
public static let ArrowBinary = Info.variableInfo(ArrowTypeId.binary)
151194
public static let ArrowTime32 = Info.timeInfo(ArrowTypeId.time32)
152195
public static let ArrowTime64 = Info.timeInfo(ArrowTypeId.time64)
196+
public static let ArrowTimestamp = Info.timeInfo(ArrowTypeId.timestamp)
153197
public static let ArrowStruct = Info.complexInfo(ArrowTypeId.strct)
154198

155199
public init(_ info: ArrowType.Info) {
@@ -270,6 +314,8 @@ public class ArrowType {
270314
return MemoryLayout<Time32>.stride
271315
case .time64:
272316
return MemoryLayout<Time64>.stride
317+
case .timestamp:
318+
return MemoryLayout<Timestamp>.stride
273319
case .binary:
274320
return MemoryLayout<Int8>.stride
275321
case .string:
@@ -320,6 +366,11 @@ public class ArrowType {
320366
return try time64.cDataFormatId
321367
}
322368
return "ttu"
369+
case ArrowTypeId.timestamp:
370+
if let timestamp = self as? ArrowTypeTimestamp {
371+
return try timestamp.cDataFormatId
372+
}
373+
return "tsu"
323374
case ArrowTypeId.binary:
324375
return "z"
325376
case ArrowTypeId.string:
@@ -366,6 +417,24 @@ public class ArrowType {
366417
return ArrowTypeTime64(.microseconds)
367418
} else if from == "ttn" {
368419
return ArrowTypeTime64(.nanoseconds)
420+
} else if from.starts(with: "ts") {
421+
let components = from.split(separator: ":", maxSplits: 1)
422+
guard let unitPart = components.first, unitPart.count == 3 else {
423+
throw ArrowError.invalid("Invalid timestamp format '\(from)'. Expected format 'ts[s|m|u|n][:timezone]'")
424+
}
425+
426+
let unitChar = unitPart.suffix(1)
427+
let unit: ArrowTimestampUnit
428+
switch unitChar {
429+
case "s": unit = .seconds
430+
case "m": unit = .milliseconds
431+
case "u": unit = .microseconds
432+
case "n": unit = .nanoseconds
433+
default: throw ArrowError.invalid("Unrecognized timestamp unit '\(unitChar)'. Expected 's', 'm', 'u', or 'n'.")
434+
}
435+
436+
let timezone = components.count > 1 ? String(components[1]) : nil
437+
return ArrowTypeTimestamp(unit, timezone: timezone)
369438
} else if from == "z" {
370439
return ArrowType(ArrowType.ArrowBinary)
371440
} else if from == "u" {

Arrow/Sources/Arrow/ArrowWriterHelper.swift

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ func toFBTypeEnum(_ arrowType: ArrowType) -> Result<org_apache_arrow_flatbuf_Typ
4141
return .success(org_apache_arrow_flatbuf_Type_.date)
4242
case .time32, .time64:
4343
return .success(org_apache_arrow_flatbuf_Type_.time)
44+
case .timestamp:
45+
return .success(org_apache_arrow_flatbuf_Type_.timestamp)
4446
case .strct:
4547
return .success(org_apache_arrow_flatbuf_Type_.struct_)
4648
default:
@@ -103,6 +105,32 @@ func toFBType( // swiftlint:disable:this cyclomatic_complexity function_body_len
103105
}
104106

105107
return .failure(.invalid("Unable to case to Time64"))
108+
case .timestamp:
    guard let timestampType = arrowType as? ArrowTypeTimestamp else {
        return .failure(.invalid("Unable to cast to Timestamp"))
    }

    let fbUnit: org_apache_arrow_flatbuf_TimeUnit
    switch timestampType.unit {
    case .seconds:
        fbUnit = .second
    case .milliseconds:
        fbUnit = .millisecond
    case .microseconds:
        fbUnit = .microsecond
    case .nanoseconds:
        fbUnit = .nanosecond
    }

    // FlatBuffers does not allow a string to be created while a table is being
    // built (the builder asserts that serialization is not nested), so the
    // timezone string is written before the Timestamp table is started.
    let timezoneOffset = timestampType.timezone.map { fbb.create(string: $0) }

    let startOffset = org_apache_arrow_flatbuf_Timestamp.startTimestamp(&fbb)
    org_apache_arrow_flatbuf_Timestamp.add(unit: fbUnit, &fbb)
    if let timezoneOffset = timezoneOffset {
        org_apache_arrow_flatbuf_Timestamp.add(timezone: timezoneOffset, &fbb)
    }

    return .success(org_apache_arrow_flatbuf_Timestamp.endTimestamp(&fbb, start: startOffset))
106134
case .strct:
107135
let startOffset = org_apache_arrow_flatbuf_Struct_.startStruct_(&fbb)
108136
return .success(org_apache_arrow_flatbuf_Struct_.endStruct_(&fbb, start: startOffset))

Arrow/Sources/Arrow/ProtoUtil.swift

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,22 @@ func fromProto( // swiftlint:disable:this cyclomatic_complexity function_body_le
6464
let arrowUnit: ArrowTime64Unit = timeType.unit == .microsecond ? .microseconds : .nanoseconds
6565
arrowType = ArrowTypeTime64(arrowUnit)
6666
}
67+
case .timestamp:
68+
let timestampType = field.type(type: org_apache_arrow_flatbuf_Timestamp.self)!
69+
let arrowUnit: ArrowTimestampUnit
70+
switch timestampType.unit {
71+
case .second:
72+
arrowUnit = .seconds
73+
case .millisecond:
74+
arrowUnit = .milliseconds
75+
case .microsecond:
76+
arrowUnit = .microseconds
77+
case .nanosecond:
78+
arrowUnit = .nanoseconds
79+
}
80+
81+
let timezone = timestampType.timezone
82+
arrowType = ArrowTypeTimestamp(arrowUnit, timezone: timezone?.isEmpty == true ? nil : timezone)
6783
case .struct_:
6884
var children = [ArrowField]()
6985
for index in 0..<field.childrenCount {

0 commit comments

Comments
 (0)