Skip to content

Commit b52586d

Browse files
Merge pull request #30 from willtemperley/main
Initial work for dictionary arrays.
2 parents 1da474f + e823d45 commit b52586d

19 files changed

+232
-437
lines changed

Package.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ import PackageDescription
2020
let package = Package(
2121
name: "Arrow",
2222
platforms: [
23-
.macOS(.v26)
23+
.macOS(.v26), .iOS(.v26), .watchOS(.v26), .tvOS(.v26), .visionOS(.v26),
2424
],
2525
products: [
2626
.library(name: "Arrow", targets: ["Arrow"]),

Sources/Arrow/Array/Builder.swift

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ public class ArrayBuilderBoolean: AnyArrayBuilder {
7575
}
7676

7777
/// A builder for Arrow arrays holding fixed-width types.
78-
public class ArrayBuilderFixedWidth<T: Numeric & BitwiseCopyable>:
78+
public class ArrayBuilderNumeric<T: Numeric & BitwiseCopyable>:
7979
AnyArrayBuilder
8080
{
8181

@@ -234,7 +234,7 @@ typealias ArrayBuilderBinary = ArrayBuilderVariableLength<Data, Int32>
234234
/// A builder for Arrow arrays holding `Date`s with a resolution of one day.
235235
public struct ArrayBuilderDate32: AnyArrayBuilder {
236236
public typealias ArrayType = ArrowArrayDate32
237-
let builder: ArrayBuilderFixedWidth<Date32> = .init()
237+
let builder: ArrayBuilderNumeric<Date32> = .init()
238238

239239
public init() {}
240240

@@ -259,7 +259,7 @@ public struct ArrayBuilderDate32: AnyArrayBuilder {
259259
/// A builder for Arrow arrays holding `Date`s with a resolution of one millisecond.
260260
public struct ArrayBuilderDate64: AnyArrayBuilder {
261261
public typealias ArrayType = ArrowArrayDate64
262-
let builder: ArrayBuilderFixedWidth<Date64> = .init()
262+
let builder: ArrayBuilderNumeric<Date64> = .init()
263263

264264
public init() {}
265265

@@ -282,13 +282,13 @@ public struct ArrayBuilderDate64: AnyArrayBuilder {
282282
}
283283

284284
/// A builder for Arrow arrays holding Time32 values.
285-
public typealias ArrayBuilderTime32 = ArrayBuilderFixedWidth<Time32>
285+
public typealias ArrayBuilderTime32 = ArrayBuilderNumeric<Time32>
286286

287287
/// A builder for Arrow arrays holding Time64 values.
288-
public typealias ArrayBuilderTime64 = ArrayBuilderFixedWidth<Time64>
288+
public typealias ArrayBuilderTime64 = ArrayBuilderNumeric<Time64>
289289

290290
/// A builder for Arrow arrays holding Timestamp values.
291-
public typealias ArrayBuilderTimestamp = ArrayBuilderFixedWidth<Timestamp>
291+
public typealias ArrayBuilderTimestamp = ArrayBuilderNumeric<Timestamp>
292292

293293
public class ArrayBuilderList<T: AnyArrayBuilder> {
294294

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// Copyright 2026 The Columnar Swift Contributors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
/// A reference-type box around the array that holds a dictionary's values.
///
/// Being a class, every holder of the same instance reads whatever array is
/// presently stored in `currentArray`.
public class DictionaryValues: @unchecked Sendable {
  /// The array currently stored in this box.
  ///
  /// NOTE(review): this is a mutable `var` on an `@unchecked Sendable` class and
  /// no synchronization is visible here — confirm that callers serialize access.
  var currentArray: AnyArrowArrayProtocol

  /// Creates a box that initially stores `initialArray`.
  init(_ initialArray: AnyArrowArrayProtocol) {
    currentArray = initialArray
  }
}
23+
24+
/// An Arrow dictionary array.
///
/// Each logical element is stored as an integer key in `keys`; subscripting
/// resolves that key against the array currently held by `values`.
public struct ArrowDictionaryArray<
  IndexType: FixedWidthInteger & BitwiseCopyable
>: ArrowArrayProtocol {
  /// The position in `keys` of this array's first logical element.
  public let offset: Int
  /// The number of logical elements in this array.
  public let length: Int
  /// The buffer sizes of the underlying `keys` array.
  public var bufferSizes: [Int] { keys.bufferSizes }
  /// The buffers of the underlying `keys` array.
  public var buffers: [ArrowBufferProtocol] { keys.buffers }
  /// The null count reported by the underlying `keys` array.
  public var nullCount: Int { keys.nullCount }

  /// The integer keys; a null key denotes a null element.
  public let keys: ArrowArrayNumeric<IndexType>
  /// The box holding the dictionary values the keys are resolved against.
  public let values: DictionaryValues

  /// Creates a dictionary array, wrapping `values` in a new `DictionaryValues` box.
  ///
  /// - Parameters:
  ///   - offset: The position in `keys` of the first logical element.
  ///   - length: The number of logical elements.
  ///   - keys: The integer keys.
  ///   - values: The array the keys are resolved against.
  public init(
    offset: Int = 0,
    length: Int,
    keys: ArrowArrayNumeric<IndexType>,
    values: AnyArrowArrayProtocol
  ) {
    self.init(
      offset: offset,
      length: length,
      keys: keys,
      sharedValues: DictionaryValues(values)
    )
  }

  /// Creates a dictionary array that reuses an existing `DictionaryValues` box.
  ///
  /// Used by `slice(offset:length:)` so that a slice keeps resolving its keys
  /// against the same box instance as the array it was cut from.
  init(
    offset: Int,
    length: Int,
    keys: ArrowArrayNumeric<IndexType>,
    sharedValues: DictionaryValues
  ) {
    self.offset = offset
    self.length = length
    self.keys = keys
    self.values = sharedValues
  }

  /// Returns the dictionary value referenced by the key at `index`, or `nil`
  /// when the key at that position is null.
  ///
  /// Traps when `index` is outside `0..<length`, or when the key does not
  /// address an element of the current dictionary values array.
  public subscript(index: Int) -> Any? {
    precondition(index >= 0 && index < length, "Invalid index.")
    guard let key = keys[offset + index] else {
      return nil
    }
    // Read the box once so the bounds check and the lookup use the same array.
    let dictionary = values.currentArray
    // `Int(exactly:)` rejects keys that do not fit in `Int`; the explicit lower
    // bound rejects negative keys from signed index types.
    guard let position = Int(exactly: key),
      position >= 0, position < dictionary.length
    else {
      preconditionFailure("Key out of bounds for dictionary")
    }
    return dictionary.any(at: position)
  }

  /// Returns a view of `length` elements starting at `offset`, relative to
  /// this array.
  ///
  /// The keys are sliced, so the result has an offset of zero. The result
  /// holds the same `DictionaryValues` instance as this array.
  public func slice(offset: Int, length: Int) -> Self {
    Self(
      offset: 0,
      length: length,
      keys: keys.slice(offset: self.offset + offset, length: length),
      sharedValues: values
    )
  }
}

Sources/Arrow/ArrowField.swift

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ public struct ArrowField: Codable, Sendable {
3131
///
3232
/// If true, the field *may* contain null values.
3333
public var isNullable: Bool
34-
public let orderedDict: Bool
3534
/// A map of key-value pairs containing additional custom meta data.
3635
public var metadata: [String: String]
3736
}
@@ -60,7 +59,6 @@ extension ArrowField {
6059
self.name = name
6160
self.type = dataType
6261
self.isNullable = isNullable
63-
self.orderedDict = false
6462
self.metadata = metadata
6563
}
6664

@@ -76,21 +74,20 @@ extension ArrowField {
7674
)
7775
}
7876

79-
/// Create a new `ArrowField` suitable for `ArrowType::Dictionary`.
80-
///
81-
public init(
82-
dictWithName: String,
83-
key: ArrowType,
84-
value: ArrowType,
85-
isNullable: Bool
86-
) {
87-
precondition(
88-
key.isDictionaryKeyType,
89-
"\(key) is not a valid dictionary key"
90-
)
91-
let dataType: ArrowType = .dictionary(key, value)
92-
self = Self(name: dictWithName, dataType: dataType, isNullable: isNullable)
93-
}
77+
// /// Create a new `ArrowField` suitable for `ArrowType::Dictionary`.
78+
// public init(
79+
// dictWithName: String,
80+
// key: ArrowType,
81+
// value: ArrowType,
82+
// isNullable: Bool
83+
// ) {
84+
// precondition(
85+
// key.isDictionaryKeyType,
86+
// "\(key) is not a valid dictionary key"
87+
// )
88+
// let dataType: ArrowType = .dictionary(key, value)
89+
// self = Self(name: dictWithName, dataType: dataType, isNullable: isNullable)
90+
// }
9491

9592
/// Create a new struct `ArrowField`.
9693
///
@@ -218,7 +215,7 @@ extension ArrowField {
218215
@inlinable
219216
public var dictIsOrdered: Bool {
220217
switch self.type {
221-
case .dictionary: return self.orderedDict
218+
case .dictionary(_, let isOrdered, _, _): return isOrdered
222219
default: return false
223220
}
224221
}

Sources/Arrow/ArrowType.swift

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ public indirect enum ArrowType: Codable, Sendable, Equatable {
293293
///
294294
/// This type is mostly used to represent low cardinality string
295295
/// arrays or a limited set of primitive types as integers.
296-
case dictionary(ArrowType, ArrowType)
296+
case dictionary(id: Int64, isOrdered: Bool, key: ArrowType, value: ArrowType)
297297
/// Exact 32-bit width decimal value with precision and scale
298298
///
299299
/// * precision is the total number of digits
@@ -496,8 +496,8 @@ extension ArrowType: CustomStringConvertible {
496496
return "LargeListView(\(elementType))"
497497
case .union(let mode, let fields):
498498
return "Union(\(mode), \(fields) fields)"
499-
case .dictionary(let keyType, let valueType):
500-
return "Dictionary(\(keyType), \(valueType))"
499+
case .dictionary(let id, let isOrdered, let keyType, let valueType):
500+
return "Dictionary(\(id), \(isOrdered), \(keyType), \(valueType))"
501501
case .decimal32(let precision, let scale):
502502
return "Decimal32(\(precision), \(scale))"
503503
case .decimal64(let precision, let scale):
@@ -627,7 +627,7 @@ extension ArrowType {
627627
@inlinable
628628
public var isNested: Bool {
629629
switch self {
630-
case .dictionary(_, let v):
630+
case .dictionary(_, _, _, let v):
631631
return v.isNested
632632
case .runEndEncoded(_, let v):
633633
return v.type.isNested
@@ -699,7 +699,8 @@ extension ArrowType {
699699
&& aField.type.equalsDataType(bField.type) && aSorted == bSorted
700700

701701
// Dictionary
702-
case (.dictionary(let aKey, let aValue), .dictionary(let bKey, let bValue)):
702+
case (.dictionary(_, _, let aKey, let aValue), .dictionary(_, _, let bKey, let bValue)):
703+
// Ignoring dictionary id here.
703704
return aKey.equalsDataType(bKey) && aValue.equalsDataType(bValue)
704705

705706
// RunEndEncoded
@@ -855,7 +856,7 @@ extension ArrowType {
855856
}
856857

857858
// Dictionary
858-
case (.dictionary(let k1, let v1), .dictionary(let k2, let v2)):
859+
case (.dictionary(_, _, let k1, let v1), .dictionary(_, _, let k2, let v2)):
859860
return k1.contains(k2) && v1.contains(v2)
860861

861862
// Base case: equality

Sources/ArrowIPC/ArrowReader.swift

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,51 @@ public struct ArrowReader {
9090
let arrowSchema = try Self.loadSchema(schema: schema)
9191
var recordBatches: [RecordBatch] = []
9292

93+
for block in footer.dictionaries {
94+
let (message, bodyOffset) = try data.withParserSpan { input in
95+
try input.seek(toAbsoluteOffset: block.offset)
96+
let marker = try UInt32(parsingLittleEndian: &input)
97+
if marker != continuationMarker {
98+
throw ArrowError(.invalid("Missing continuation marker."))
99+
}
100+
let messageLength = try UInt32(parsingLittleEndian: &input)
101+
let data = try [UInt8](parsing: &input, byteCount: Int(messageLength))
102+
var mbb = ByteBuffer(data: Data(data))
103+
let message: FMessage = getRoot(byteBuffer: &mbb)
104+
let offset = Int64(input.startPosition)
105+
return (message, offset)
106+
}
107+
108+
guard message.headerType == .dictionarybatch else {
109+
throw ArrowError(.invalid("Expected DictionaryBatch message."))
110+
}
111+
112+
guard let dictMessage = message.header(type: FDictionaryBatch.self) else {
113+
throw ArrowError(.invalid("Expected DictionaryBatch as message header"))
114+
}
115+
116+
// 1. Get the Dictionary ID and 'isDelta' flag
117+
let dictId = dictMessage.id
118+
let isDelta = dictMessage.isDelta
119+
120+
// 2. The dictionary data is actually just a RecordBatch with ONE column
121+
// The schema for this internal batch is defined by the dictionary type
122+
// found in the global Schema for this specific ID.
123+
guard let rbMessage = dictMessage.data else {
124+
throw ArrowError(.invalid("DictionaryBatch has no data"))
125+
}
126+
127+
let dictBatch = try Self.loadRecordBatch(
128+
data: self.data,
129+
arrowSchema: arrowSchema,
130+
rbMessage: rbMessage,
131+
offset: bodyOffset
132+
)
133+
134+
// 3. Update the "Box" in your provider
135+
// try dictionaryProvider.update(id: dictId, array: dictionaryArray, isDelta: isDelta)
136+
}
137+
93138
// MARK: Record batch parsing
94139
for block in footer.recordBatches {
95140

Sources/ArrowIPC/FlatBuffersTypeAliases.swift

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,22 @@
1313
// limitations under the License.
1414

1515
typealias FFooter = org_apache_arrow_flatbuf_Footer
16+
17+
typealias FMessageHeader = org_apache_arrow_flatbuf_MessageHeader
1618
typealias FMessage = org_apache_arrow_flatbuf_Message
19+
1720
typealias FBlock = org_apache_arrow_flatbuf_Block
1821
typealias FField = org_apache_arrow_flatbuf_Field
1922
typealias FSchema = org_apache_arrow_flatbuf_Schema
2023
typealias FBuffer = org_apache_arrow_flatbuf_Buffer
2124
typealias FFieldNode = org_apache_arrow_flatbuf_FieldNode
22-
typealias FRecordBatch = org_apache_arrow_flatbuf_RecordBatch
23-
typealias FMessageHeader = org_apache_arrow_flatbuf_MessageHeader
2425
typealias FKeyValue = org_apache_arrow_flatbuf_KeyValue
2526

27+
// MARK: Record batches.
28+
typealias FRecordBatch = org_apache_arrow_flatbuf_RecordBatch
29+
typealias FDictionaryBatch = org_apache_arrow_flatbuf_DictionaryBatch
30+
typealias FDictionaryEncoding = org_apache_arrow_flatbuf_DictionaryEncoding
31+
2632
// MARK: Top level type.
2733
typealias FType = org_apache_arrow_flatbuf_Type_
2834

Tests/ArrowIPCTests/ArrowTestingGold.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,10 @@ struct ArrowTestingGold {
8282
"generated_union",
8383
]
8484

85-
@Test
85+
// @Test
8686
func readFile() throws {
8787

88-
let name = "generated_nested_large_offsets"
88+
let name = "generated_dictionary"
8989
let (testFile, testCase) = try loadTestCase(
9090
name: name, fileExtension: "arrow_file")
9191
let arrowReader = try ArrowReader(url: testFile)

Tests/ArrowTests/ArrayNullBufferTests.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ struct ArrayNullBufferTests {
2121

2222
@Test func allValidValues() throws {
2323
// Should be able to omit null buffer entirely
24-
let arrayBuilder: ArrayBuilderFixedWidth<Int64> = .init()
24+
let arrayBuilder: ArrayBuilderNumeric<Int64> = .init()
2525
for i in 0..<1000 {
2626
arrayBuilder.append(Int64(i)) // No nulls
2727
}
@@ -35,7 +35,7 @@ struct ArrayNullBufferTests {
3535
}
3636

3737
@Test func allNullValues() throws {
38-
let arrayBuilder: ArrayBuilderFixedWidth<Int64> = .init()
38+
let arrayBuilder: ArrayBuilderNumeric<Int64> = .init()
3939
for _ in 0..<1000 {
4040
arrayBuilder.appendNull()
4141
}

Tests/ArrowTests/BasicArrayTests.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ struct BasicArrayTests {
3838
}
3939

4040
@Test func uint8Array() throws {
41-
let arrayBuilder: ArrayBuilderFixedWidth<UInt8> = .init()
41+
let arrayBuilder: ArrayBuilderNumeric<UInt8> = .init()
4242
for index: UInt8 in 0..<100 {
4343
arrayBuilder.append(index)
4444
}
@@ -105,7 +105,7 @@ struct BasicArrayTests {
105105
}
106106

107107
@Test func doubleArray() throws {
108-
let builder: ArrayBuilderFixedWidth<Double> = .init()
108+
let builder: ArrayBuilderNumeric<Double> = .init()
109109
builder.append(14)
110110
builder.appendNull()
111111
builder.append(40.4)

0 commit comments

Comments
 (0)