Skip to content

Commit 26a2bac

Browse files
Merge pull request #24 from willtemperley/main
Add null counts to arrays. Initial work on ArrowWriter.
2 parents 866463f + 6e5c94e commit 26a2bac

38 files changed

+1595
-987
lines changed

README.md

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,104 @@ A Swift implementation of Apache Arrow, the universal columnar format for fast d
66

77
This is a **work in progress**. Do not use in production. Progress is fast, however; expect a beta in December.
88

9+
## Array interface
10+
11+
Arrow arrays are backed by a standard memory layout:
12+
https://arrow.apache.org/docs/format/Columnar.html
13+
14+
In Swift-Arrow, every array conforms to:
15+
16+
```swift
17+
public protocol ArrowArrayProtocol {
18+
associatedtype ItemType
19+
subscript(_ index: Int) -> ItemType? { get }
20+
var offset: Int { get }
21+
var length: Int { get }
22+
func slice(offset: Int, length: Int) -> Self
23+
func any(at index: Int) -> Any?
24+
}
25+
```
26+
27+
The in-memory contiguous buffers allow constant-time random access.
28+
29+
Every Arrow array supports nullable elements. This is encoded as an optional bit-packed validity buffer, also known as a null array or bitfield.
30+
In pseudocode, `bitfield[index] == 0` means null or invalid, and `bitfield[index] == 1` means not null or valid.
31+
Fixed-width types are encoded back-to-back, with placeholder values for nulls. For example the array:
32+
33+
```swift
34+
let swiftArray: [Int8?] = [1, nil, 2, 3, nil, 4]
35+
let arrayBuilder: ArrayBuilderFixedWidth<Int8> = .init()
36+
for value in swiftArray {
37+
if let value {
38+
arrayBuilder.append(value)
39+
} else {
40+
arrayBuilder.appendNull()
41+
}
42+
}
43+
let arrowArray = arrayBuilder.finish()
44+
for i in 0..<swiftArray.count {
45+
#expect(arrowArray[i] == swiftArray[i])
46+
}
47+
```
48+
49+
would be backed by a values buffer of `Int8`:
50+
51+
`[1, 0, 2, 3, 0, 4]`
52+
53+
and a bit-packed validity buffer of UInt8:
54+
`[45]` or `[0b00101101]` (bits are read least-significant first, so bit 0 corresponds to index 0)
55+
56+
Note that the validity buffer may be empty if all values are null, or if all values are non-null.
57+
58+
Arrow Arrays of variable-length types such as `String` have an offsets buffer. For example:
59+
60+
```swift
61+
let swiftArray: [String?] = ["ab", nil, "c", "", "."]
62+
let arrayBuilder: ArrayBuilderVariable<String> = .init()
63+
for value in swiftArray {
64+
if let value {
65+
arrayBuilder.append(value)
66+
} else {
67+
arrayBuilder.appendNull()
68+
}
69+
}
70+
let arrowArray = arrayBuilder.finish()
71+
#expect(arrowArray[0] == "ab")
72+
#expect(arrowArray[1] == nil)
73+
#expect(arrowArray[2] == "c")
74+
#expect(arrowArray[3] == "")
75+
#expect(arrowArray[4] == ".")
76+
```
77+
78+
would have an offsets array of array length + 1 integers:
79+
`[0, 2, 2, 3, 3, 4]`
80+
81+
This is a lookup into the value array, i.e.:
82+
83+
```swift
84+
let values: [UInt8] = [97, 98, 99, 46]
85+
print(values[0..<2]) // [97, 98]
86+
print(values[2..<2]) // []
87+
print(values[2..<3]) // [99]
88+
print(values[3..<4]) // [46]
89+
```
90+
91+
In practice, buffers can be any contiguous storage. In Swift-Arrow, arrays created in memory are usually backed by pointers, whereas arrays loaded from IPC files are backed by memory-mapped `Data` instances.
92+
93+
Arrays can be configured to use different buffer types, by specifying the types as
94+
`public struct ArrowArrayVariable<OffsetsBuffer, ValueBuffer>`
95+
96+
this allows the buffer types to be user-specified, e.g.:
97+
```
98+
typealias ArrowArrayUtf8 = ArrowArrayVariable<
99+
FixedWidthBufferIPC<Int32>,
100+
VariableLengthBufferIPC<String>
101+
>
102+
```
103+
104+
105+
## Relationship to Arrow-Swift
106+
9107
This project is based on Arrow-Swift, the official Swift implementation of Apache Arrow. The decision was made to at least temporarily operate independently of the Apache Software Foundation (ASF). Currently there are no active ASF maintainers with knowledge of Swift, and the only [Apache approved CI for Swift](https://github.com/apache/infrastructure-actions/blob/main/approved_patterns.yml) is [setup-swift which is unmaintained](https://github.com/swift-actions/setup-swift/issues), leading to intermittent CI failures. This has led to delays in much-needed fixes being implemented.
10108
11109
The intention is to continue contributing to the official Arrow-Swift repository; however, changes can be iterated on more quickly here.

Sources/Arrow/Array/Array.swift

Lines changed: 101 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@ public protocol ArrowArrayProtocol {
1919
subscript(_ index: Int) -> ItemType? { get }
2020
var offset: Int { get }
2121
var length: Int { get }
22+
var nullCount: Int { get }
2223
func slice(offset: Int, length: Int) -> Self
2324
func any(at index: Int) -> Any?
25+
var bufferSizes: [Int] { get }
2426
}
2527

2628
// This exists to support type-erased struct arrays.
@@ -35,6 +37,8 @@ public struct ArrowArrayBoolean: ArrowArrayProtocol {
3537
public typealias ItemType = Bool
3638
public let offset: Int
3739
public let length: Int
40+
public var bufferSizes: [Int] { [nullBuffer.length, valueBuffer.length] }
41+
public var nullCount: Int { nullBuffer.nullCount }
3842
let nullBuffer: NullBuffer
3943
let valueBuffer: NullBuffer
4044

@@ -75,11 +79,12 @@ where
7579
ValueBuffer: FixedWidthBufferProtocol,
7680
ValueBuffer.ElementType: Numeric
7781
{
78-
public typealias ItemType = ValueBuffer.ElementType
7982

80-
// public typealias ItemType = Element
83+
public typealias ItemType = ValueBuffer.ElementType
8184
public let offset: Int
8285
public let length: Int
86+
public var bufferSizes: [Int] { [nullBuffer.length, valueBuffer.length] }
87+
public var nullCount: Int { nullBuffer.nullCount }
8388
let nullBuffer: NullBuffer
8489
let valueBuffer: ValueBuffer
8590

@@ -115,16 +120,20 @@ where
115120
}
116121

117122
/// An Arrow array of variable-length types.
118-
public struct ArrowArrayVariable<Element, OffsetsBuffer, ValueBuffer>:
123+
public struct ArrowArrayVariable<OffsetsBuffer, ValueBuffer>:
119124
ArrowArrayProtocol
120125
where
121-
Element: VariableLength,
122126
OffsetsBuffer: FixedWidthBufferProtocol<Int32>,
123-
ValueBuffer: VariableLengthBufferProtocol<Element>
127+
ValueBuffer: VariableLengthBufferProtocol<ValueBuffer.ElementType>,
128+
ValueBuffer.ElementType: VariableLength
124129
{
125-
public typealias ItemType = Element
130+
public typealias ItemType = ValueBuffer.ElementType
126131
public let offset: Int
127132
public let length: Int
133+
public var bufferSizes: [Int] {
134+
[nullBuffer.length, offsetsBuffer.length, valueBuffer.length]
135+
}
136+
public var nullCount: Int { nullBuffer.nullCount }
128137
let nullBuffer: NullBuffer
129138
let offsetsBuffer: OffsetsBuffer
130139
let valueBuffer: ValueBuffer
@@ -143,10 +152,8 @@ where
143152
self.valueBuffer = valueBuffer
144153
}
145154

146-
public subscript(index: Int) -> Element? {
147-
155+
public subscript(index: Int) -> ValueBuffer.ElementType? {
148156
let offsetIndex = self.offset + index
149-
150157
if !self.nullBuffer.isSet(offsetIndex) {
151158
return nil
152159
}
@@ -170,23 +177,18 @@ where
170177
}
171178

172179
/// An Arrow array of `Date`s with a resolution of 1 day.
173-
struct ArrowArrayDate32<ValueBuffer>: ArrowArrayProtocol
180+
public struct ArrowArrayDate32<ValueBuffer>: ArrowArrayProtocol
174181
where
175182
ValueBuffer: FixedWidthBufferProtocol<Int32>
176183
{
177-
typealias ItemType = Date
178-
184+
public typealias ItemType = Date
185+
public var bufferSizes: [Int] { array.bufferSizes }
186+
public var nullCount: Int { array.nullCount }
187+
public var offset: Int { array.offset }
188+
public var length: Int { array.length }
179189
let array: ArrowArrayFixed<ValueBuffer>
180190

181-
var offset: Int {
182-
array.offset
183-
}
184-
185-
var length: Int {
186-
array.length
187-
}
188-
189-
subscript(index: Int) -> Date? {
191+
public subscript(index: Int) -> Date? {
190192
precondition(index >= 0 && index < length, "Invalid index.")
191193
let offsetIndex = self.offset + index
192194
let days: Int32? = array[offsetIndex]
@@ -197,30 +199,25 @@ where
197199
}
198200
}
199201

200-
func slice(offset: Int, length: Int) -> Self {
202+
public func slice(offset: Int, length: Int) -> Self {
201203
let internalSlice = array.slice(offset: offset, length: length)
202204
return .init(array: internalSlice)
203205
}
204206
}
205207

206208
/// An Arrow array of `Date`s with a resolution of 1 millisecond.
207-
struct ArrowArrayDate64<ValueBuffer>: ArrowArrayProtocol
209+
public struct ArrowArrayDate64<ValueBuffer>: ArrowArrayProtocol
208210
where
209211
ValueBuffer: FixedWidthBufferProtocol<Date64>
210212
{
211-
typealias ItemType = Date
212-
213+
public typealias ItemType = Date
214+
public var bufferSizes: [Int] { array.bufferSizes }
215+
public var nullCount: Int { array.nullCount }
216+
public var offset: Int { array.offset }
217+
public var length: Int { array.length }
213218
let array: ArrowArrayFixed<ValueBuffer>
214219

215-
var offset: Int {
216-
array.offset
217-
}
218-
219-
var length: Int {
220-
array.length
221-
}
222-
223-
subscript(index: Int) -> Date? {
220+
public subscript(index: Int) -> Date? {
224221
precondition(index >= 0 && index < length, "Invalid index.")
225222
let offsetIndex = self.offset + index
226223
let milliseconds: Int64? = array[offsetIndex]
@@ -231,26 +228,44 @@ where
231228
}
232229
}
233230

234-
func slice(offset: Int, length: Int) -> Self {
231+
public func slice(offset: Int, length: Int) -> Self {
235232
let internalSlice = array.slice(offset: offset, length: length)
236233
return .init(array: internalSlice)
237234
}
238235
}
239236

240-
/// An Arrow list array which may be nested arbitrarily.
241-
struct ArrowListArray<Element>: ArrowArrayProtocol
237+
/// A strongly-typed Arrow list array which may be nested arbitrarily.
238+
public struct ArrowListArray<Element, OffsetsBuffer>: ArrowArrayProtocol
242239
where
240+
OffsetsBuffer: FixedWidthBufferProtocol<Int32>,
243241
Element: ArrowArrayProtocol
244242
{
245-
typealias ItemType = Element
246-
247-
let offset: Int
248-
let length: Int
243+
public typealias ItemType = Element
244+
public let offset: Int
245+
public let length: Int
246+
public var bufferSizes: [Int] {
247+
[nullBuffer.length, offsetsBuffer.length, values.length]
248+
}
249+
public var nullCount: Int { nullBuffer.nullCount }
249250
let nullBuffer: NullBuffer
250-
let offsetsBuffer: FixedWidthBuffer<Int32>
251+
let offsetsBuffer: OffsetsBuffer
251252
let values: Element
252253

253-
subscript(index: Int) -> Element? {
254+
public init(
255+
offset: Int = 0,
256+
length: Int,
257+
nullBuffer: NullBuffer,
258+
offsetsBuffer: OffsetsBuffer,
259+
values: Element
260+
) {
261+
self.offset = offset
262+
self.length = length
263+
self.nullBuffer = nullBuffer
264+
self.offsetsBuffer = offsetsBuffer
265+
self.values = values
266+
}
267+
268+
public subscript(index: Int) -> Element? {
254269
precondition(index >= 0 && index < length, "Invalid index.")
255270
let offsetIndex = self.offset + index
256271
if !self.nullBuffer.isSet(offsetIndex) {
@@ -263,7 +278,7 @@ where
263278
return values.slice(offset: Int(startIndex), length: Int(length))
264279
}
265280

266-
func slice(offset: Int, length: Int) -> Self {
281+
public func slice(offset: Int, length: Int) -> Self {
267282
.init(
268283
offset: self.offset + offset,
269284
length: length,
@@ -274,14 +289,54 @@ where
274289
}
275290
}
276291

292+
/// A type-erased wrapper for an Arrow list array.
293+
public struct AnyArrowListArray: ArrowArrayProtocol {
294+
295+
public typealias ItemType = any ArrowArrayProtocol
296+
public var bufferSizes: [Int] {
297+
_base.bufferSizes
298+
}
299+
300+
private let _base: any ArrowArrayProtocol
301+
private let _subscriptImpl: (Int) -> (any ArrowArrayProtocol)?
302+
private let _sliceImpl: (Int, Int) -> AnyArrowListArray
303+
304+
public let offset: Int
305+
public let length: Int
306+
public var nullCount: Int { _base.nullCount }
307+
308+
public init<Element, OffsetsBuffer>(
309+
_ list: ArrowListArray<Element, OffsetsBuffer>
310+
)
311+
where
312+
OffsetsBuffer: FixedWidthBufferProtocol<Int32>,
313+
Element: ArrowArrayProtocol
314+
{
315+
self._base = list
316+
self.offset = list.offset
317+
self.length = list.length
318+
self._subscriptImpl = { list[$0] }
319+
self._sliceImpl = { AnyArrowListArray(list.slice(offset: $0, length: $1)) }
320+
}
321+
322+
public subscript(index: Int) -> (any ArrowArrayProtocol)? {
323+
_subscriptImpl(index)
324+
}
325+
326+
public func slice(offset: Int, length: Int) -> AnyArrowListArray {
327+
_sliceImpl(offset, length)
328+
}
329+
}
330+
277331
/// An Arrow struct array.
278332
public struct ArrowStructArray: ArrowArrayProtocol {
279333
public typealias ItemType = [String: Any]
280-
281-
let nullBuffer: NullBuffer
282334
public let offset: Int
283335
public let length: Int
284336
public let fields: [(name: String, array: any ArrowArrayProtocol)]
337+
public var bufferSizes: [Int] { [nullBuffer.length] }
338+
public var nullCount: Int { nullBuffer.nullCount }
339+
let nullBuffer: NullBuffer
285340

286341
public init(
287342
offset: Int = 0,

0 commit comments

Comments
 (0)