Skip to content

Commit 28f225d

Browse files
author
Marco
committed
feat(Arrow): Implement ListArray and ListArrayBuilder with associated functionality
1 parent 36e45ef commit 28f225d

11 files changed

+365
-32
lines changed

Arrow/Sources/Arrow/ArrowArray.swift

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ public class ArrowArrayHolderImpl: ArrowArrayHolder {
113113
return try ArrowArrayHolderImpl(BinaryArray(with))
114114
case .strct:
115115
return try ArrowArrayHolderImpl(StructArray(with))
116+
case .list:
117+
return try ArrowArrayHolderImpl(ListArray(with))
116118
default:
117119
throw ArrowError.invalid("Array not found for type: \(arrowType)")
118120
}
@@ -325,3 +327,69 @@ public class StructArray: ArrowArray<[Any?]> {
325327
return output
326328
}
327329
}
330+
331+
/// An Arrow array whose slots are variable-length lists.
///
/// Each slot is materialized as `[Any?]`: the elements of the single child
/// array that lie between two consecutive 32-bit offsets read from buffer 1.
public class ListArray: ArrowArray<[Any?]> {
    /// Holder for the child array that stores the flattened list elements.
    public private(set) var values: ArrowArrayHolder?

    /// Creates a list array over `arrowData` and loads its child array.
    ///
    /// - Parameter arrowData: Data whose type is an `ArrowTypeList` and which
    ///   carries exactly one child.
    /// - Throws: `ArrowError.invalid` when the child count is not one or the
    ///   type is not an `ArrowTypeList`; also rethrows whatever the superclass
    ///   initializer or `ArrowArrayHolderImpl.loadArray` throws.
    public required init(_ arrowData: ArrowData) throws {
        try super.init(arrowData)
        guard arrowData.children.count == 1 else {
            throw ArrowError.invalid("List array must have exactly one child")
        }

        guard let listType = arrowData.type as? ArrowTypeList else {
            throw ArrowError.invalid("Expected ArrowTypeList")
        }

        self.values = try ArrowArrayHolderImpl.loadArray(
            listType.elementType,
            with: arrowData.children[0]
        )
    }

    /// Returns the list stored at `index`.
    ///
    /// - Parameter index: Slot to read.
    /// - Returns: The slot's elements, or `nil` when the child array is
    ///   missing, the slot is null, or the stored offsets are malformed
    ///   (negative or descending).
    public override subscript(_ index: UInt) -> [Any?]? {
        guard let values = self.values else { return nil }

        if self.arrowData.isNull(index) {
            return nil
        }

        // Slot i spans the child elements [offsets[i], offsets[i + 1]).
        let offsetStride = MemoryLayout<Int32>.stride
        let slotPointer = self.arrowData.buffers[1].rawPointer.advanced(by: Int(index) * offsetStride)
        let startOffset = slotPointer.load(as: Int32.self)
        let endOffset = slotPointer.advanced(by: offsetStride).load(as: Int32.self)

        // Forming `start..<end` with start > end, or converting a negative
        // offset to UInt, traps at runtime. Offsets can come from external
        // data, so reject malformed pairs instead of crashing.
        guard startOffset >= 0, startOffset <= endOffset else {
            return nil
        }

        var items = [Any?]()
        items.reserveCapacity(Int(endOffset - startOffset))
        for childIndex in startOffset..<endOffset {
            items.append(values.array.asAny(UInt(childIndex)))
        }

        return items
    }

    /// Renders the list at `index` as `[a,b,...]`.
    ///
    /// - Parameter index: Slot to render.
    /// - Returns: `"null"` when the subscript yields `nil`; otherwise the
    ///   comma-separated elements in square brackets, with `nil` elements
    ///   written as `null`.
    public override func asString(_ index: UInt) -> String {
        guard let list = self[index] else {
            return "null"
        }

        let rendered: [String] = list.map { item in
            guard let element = item else {
                return "null"
            }
            // NOTE(review): an element that itself conforms to AsString is
            // rendered through its slot 0 — confirm this is the intended index.
            if let printable = element as? AsString {
                return printable.asString(0)
            }
            return "\(element)"
        }

        return "[" + rendered.joined(separator: ",") + "]"
    }
}

Arrow/Sources/Arrow/ArrowArrayBuilder.swift

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ public class StructArrayBuilder: ArrowArrayBuilder<StructBufferBuilder, StructAr
125125
public init(_ fields: [ArrowField], builders: [any ArrowArrayHolderBuilder]) throws {
126126
self.fields = fields
127127
self.builders = builders
128-
try super.init(ArrowNestedType(ArrowType.ArrowStruct, fields: fields))
128+
try super.init(ArrowTypeStruct(ArrowType.ArrowStruct, fields: fields))
129129
self.bufferBuilder.initializeTypeInfo(fields)
130130
}
131131

@@ -137,7 +137,7 @@ public class StructArrayBuilder: ArrowArrayBuilder<StructBufferBuilder, StructAr
137137
}
138138

139139
self.builders = builders
140-
try super.init(ArrowNestedType(ArrowType.ArrowStruct, fields: fields))
140+
try super.init(ArrowTypeStruct(ArrowType.ArrowStruct, fields: fields))
141141
}
142142

143143
public override func append(_ values: [Any?]?) {
@@ -168,6 +168,31 @@ public class StructArrayBuilder: ArrowArrayBuilder<StructBufferBuilder, StructAr
168168
}
169169
}
170170

171+
/// Builder that assembles a `ListArray` one list at a time.
///
/// Validity and offsets are recorded by the `ListBufferBuilder`, while the
/// elements of every appended list are forwarded to a child builder obtained
/// for the element type.
public class ListArrayBuilder: ArrowArrayBuilder<ListBufferBuilder, ListArray> {
    /// Child builder receiving the elements of every appended list.
    let valueBuilder: any ArrowArrayHolderBuilder

    /// Creates a builder for lists whose elements have `elementType`.
    ///
    /// - Parameter elementType: Arrow type of the list elements.
    /// - Throws: Any error raised while loading the child builder or by the
    ///   superclass initializer.
    public override init(_ elementType: ArrowType) throws {
        self.valueBuilder = try ArrowArrayBuilders.loadBuilder(arrowType: elementType)
        try super.init(ArrowTypeList(elementType))
    }

    /// Appends one list, or a null slot when `values` is `nil`.
    ///
    /// - Parameter values: Elements of the list to append.
    public override func append(_ values: [Any?]?) {
        self.bufferBuilder.append(values)

        guard let elements = values else {
            return
        }
        elements.forEach { element in
            self.valueBuilder.appendAny(element)
        }
    }

    /// Finalizes the builder into a `ListArray`.
    ///
    /// - Returns: A list array backed by the finished buffers and the child
    ///   builder's data.
    /// - Throws: Any error raised while finishing the child builder or while
    ///   constructing the `ArrowData` / `ListArray`.
    public override func finish() throws -> ListArray {
        let listBuffers = self.bufferBuilder.finish()
        let elementData = try self.valueBuilder.toHolder().array.arrowData
        return try ListArray(
            ArrowData(
                self.type,
                buffers: listBuffers,
                children: [elementData],
                nullCount: self.nullCount,
                length: self.length
            )
        )
    }
}
195+
171196
public class ArrowArrayBuilders {
172197
public static func loadBuilder( // swiftlint:disable:this cyclomatic_complexity
173198
_ builderType: Any.Type) throws -> ArrowArrayHolderBuilder {
@@ -279,6 +304,16 @@ public class ArrowArrayBuilders {
279304
throw ArrowError.invalid("Expected arrow type for \(arrowType.id) not found")
280305
}
281306
return try Time64ArrayBuilder(timeType.unit)
307+
case .list:
308+
guard let listType = arrowType as? ArrowTypeList else {
309+
throw ArrowError.invalid("Expected ArrowTypeList for \(arrowType.id)")
310+
}
311+
return try ListArrayBuilder(listType.elementType)
312+
case .strct:
313+
guard let structType = arrowType as? ArrowTypeStruct else {
314+
throw ArrowError.invalid("Expected ArrowStructType for \(arrowType.id)")
315+
}
316+
return try StructArrayBuilder(structType.fields)
282317
default:
283318
throw ArrowError.unknownType("Builder not found for arrow type: \(arrowType.id)")
284319
}
@@ -338,4 +373,12 @@ public class ArrowArrayBuilders {
338373
public static func loadTime64ArrayBuilder(_ unit: ArrowTime64Unit) throws -> Time64ArrayBuilder {
339374
return try Time64ArrayBuilder(unit)
340375
}
376+
377+
/// Creates a `StructArrayBuilder` for the given fields.
///
/// - Parameter fields: Fields passed to the `StructArrayBuilder` initializer.
/// - Returns: The newly created builder.
/// - Throws: Any error raised by the `StructArrayBuilder` initializer.
public static func loadStructArrayBuilder(_ fields: [ArrowField]) throws -> StructArrayBuilder {
    let structBuilder = try StructArrayBuilder(fields)
    return structBuilder
}
380+
381+
/// Creates a `ListArrayBuilder` for lists of `elementType`.
///
/// - Parameter elementType: Arrow type passed to the `ListArrayBuilder` initializer.
/// - Returns: The newly created builder.
/// - Throws: Any error raised by the `ListArrayBuilder` initializer.
public static func loadListArrayBuilder(_ elementType: ArrowType) throws -> ListArrayBuilder {
    let listBuilder = try ListArrayBuilder(elementType)
    return listBuilder
}
341384
}

Arrow/Sources/Arrow/ArrowBufferBuilder.swift

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -338,14 +338,14 @@ public class Date64BufferBuilder: AbstractWrapperBufferBuilder<Date, Int64> {
338338

339339
public final class StructBufferBuilder: BaseBufferBuilder, ArrowBufferBuilder {
340340
public typealias ItemType = [Any?]
341-
var info: ArrowNestedType?
341+
var info: ArrowTypeStruct?
342342
public init() throws {
343343
let nulls = ArrowBuffer.createBuffer(0, size: UInt(MemoryLayout<UInt8>.stride))
344344
super.init(nulls)
345345
}
346346

347347
public func initializeTypeInfo(_ fields: [ArrowField]) {
348-
info = ArrowNestedType(ArrowType.ArrowStruct, fields: fields)
348+
info = ArrowTypeStruct(ArrowType.ArrowStruct, fields: fields)
349349
}
350350

351351
public func append(_ newValue: [Any?]?) {
@@ -379,3 +379,62 @@ public final class StructBufferBuilder: BaseBufferBuilder, ArrowBufferBuilder {
379379
return [nulls]
380380
}
381381
}
382+
383+
/// Buffer builder for list arrays.
///
/// Maintains the validity bitmap together with a buffer of `Int32` offsets:
/// entry 0 is written as 0 and every append writes the entry following the
/// appended slot.
public class ListBufferBuilder: BaseBufferBuilder, ArrowBufferBuilder {
    public typealias ItemType = [Any?]

    /// `Int32` offsets; slot `i` owns the entries at positions `i` and `i + 1`.
    var offsets: ArrowBuffer

    /// Creates an empty builder and writes the leading zero offset.
    public required init() throws {
        self.offsets = ArrowBuffer.createBuffer(1, size: UInt(MemoryLayout<Int32>.stride))
        super.init(ArrowBuffer.createBuffer(0, size: UInt(MemoryLayout<UInt8>.stride)))
        self.offsets.rawPointer.storeBytes(of: Int32(0), as: Int32.self)
    }

    /// Records one list slot.
    ///
    /// A non-nil list sets the slot's validity bit and advances the end
    /// offset by the element count; `nil` clears the bit, bumps `nullCount`
    /// and repeats the previous offset.
    ///
    /// - Parameter newValue: Elements of the list, or `nil` for a null slot.
    public func append(_ newValue: [Any?]?) {
        let slot = UInt(self.length)
        self.length += 1

        if length >= self.offsets.length {
            self.resize(length + 1)
        }

        // Pointer is taken after the potential resize so it targets the live buffer.
        let offsetStride = MemoryLayout<Int32>.stride
        let slotPointer = self.offsets.rawPointer.advanced(by: Int(slot) * offsetStride)
        let previousEnd = slotPointer.load(as: Int32.self)
        let nextEnd: Int32

        if let elements = newValue {
            BitUtility.setBit(slot + self.offset, buffer: self.nulls)
            nextEnd = previousEnd + Int32(elements.count)
        } else {
            self.nullCount += 1
            BitUtility.clearBit(slot + self.offset, buffer: self.nulls)
            nextEnd = previousEnd
        }

        slotPointer.advanced(by: offsetStride).storeBytes(of: nextEnd, as: Int32.self)
    }

    /// Reports whether the validity bit for `index` is unset.
    ///
    /// - Parameter index: Slot to test.
    /// - Returns: `true` when the slot's validity bit is not set.
    public override func isNull(_ index: UInt) -> Bool {
        let isValid = BitUtility.isSet(index + self.offset, buffer: self.nulls)
        return !isValid
    }

    /// Grows the offsets and validity buffers when `length` exceeds the
    /// current offsets length; otherwise does nothing.
    ///
    /// - Parameter length: Requested number of offset entries.
    public func resize(_ length: UInt) {
        guard length > self.offsets.length else {
            return
        }

        let grownLength = resizeLength(self.offsets)
        var grownOffsets = ArrowBuffer.createBuffer(grownLength, size: UInt(MemoryLayout<Int32>.size))
        var grownNulls = ArrowBuffer.createBuffer(grownLength/8 + 1, size: UInt(MemoryLayout<UInt8>.size))
        ArrowBuffer.copyCurrent(self.offsets, to: &grownOffsets, len: self.offsets.capacity)
        ArrowBuffer.copyCurrent(self.nulls, to: &grownNulls, len: self.nulls.capacity)
        self.offsets = grownOffsets
        self.nulls = grownNulls
    }

    /// Copies the accumulated state into freshly created buffers.
    ///
    /// - Returns: `[nulls, offsets]`, in that order.
    public func finish() -> [ArrowBuffer] {
        let slotCount = self.length
        var finalNulls = ArrowBuffer.createBuffer(slotCount/8 + 1, size: UInt(MemoryLayout<UInt8>.size))
        var finalOffsets = ArrowBuffer.createBuffer(slotCount + 1, size: UInt(MemoryLayout<Int32>.size))
        ArrowBuffer.copyCurrent(self.nulls, to: &finalNulls, len: finalNulls.capacity)
        ArrowBuffer.copyCurrent(self.offsets, to: &finalOffsets, len: finalOffsets.capacity)
        return [finalNulls, finalOffsets]
    }
}

Arrow/Sources/Arrow/ArrowReader.swift

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,35 @@ public class ArrowReader { // swiftlint:disable:this type_body_length
116116
rbLength: UInt(loadInfo.batchData.recordBatch.length))
117117
}
118118

119+
/// Loads one list field from the current record batch.
///
/// Consumes the next field node and the next two buffers (validity, then
/// offsets), then loads the single child field that stores the list elements.
///
/// - Parameters:
///   - loadInfo: Source of the batch nodes/buffers and the file data.
///   - field: Flatbuffer description of the list field; must have exactly one child.
/// - Returns: The loaded list array holder, or a failure when a node or
///   buffer is missing, the child count is not one, or the child fails to load.
private func loadListData(
    _ loadInfo: DataLoadInfo,
    field: org_apache_arrow_flatbuf_Field
) -> Result<ArrowArrayHolder, ArrowError> {
    guard let node = loadInfo.batchData.nextNode() else {
        return .failure(.invalid("Node not found"))
    }

    guard let nullBuffer = loadInfo.batchData.nextBuffer() else {
        return .failure(.invalid("Null buffer not found"))
    }

    guard let offsetBuffer = loadInfo.batchData.nextBuffer() else {
        return .failure(.invalid("Offset buffer not found"))
    }

    // One validity bit per slot, rounded up to whole bytes.
    let nullLength = UInt(ceil(Double(node.length) / 8))
    let arrowNullBuffer = makeBuffer(
        nullBuffer,
        fileData: loadInfo.fileData,
        length: nullLength,
        messageOffset: loadInfo.messageOffset
    )
    // A list with n slots stores n + 1 offsets.
    let arrowOffsetBuffer = makeBuffer(
        offsetBuffer,
        fileData: loadInfo.fileData,
        length: UInt(node.length + 1),
        messageOffset: loadInfo.messageOffset
    )

    guard field.childrenCount == 1, let childField = field.children(at: 0) else {
        return .failure(.invalid("List must have exactly one child"))
    }

    switch loadField(loadInfo, field: childField) {
    case .success(let childHolder):
        // Use this node's own slot count as the array length. A list nested
        // inside another list has as many slots as the parent has elements,
        // which generally differs from the record batch row count; for a
        // top-level column the two values coincide.
        return makeArrayHolder(
            field,
            buffers: [arrowNullBuffer, arrowOffsetBuffer],
            nullCount: UInt(node.nullCount),
            children: [childHolder.array.arrowData],
            rbLength: UInt(node.length)
        )
    case .failure(let error):
        return .failure(error)
    }
}
147+
119148
private func loadPrimitiveData(
120149
_ loadInfo: DataLoadInfo,
121150
field: org_apache_arrow_flatbuf_Field)
@@ -178,12 +207,17 @@ public class ArrowReader { // swiftlint:disable:this type_body_length
178207
_ loadInfo: DataLoadInfo,
179208
field: org_apache_arrow_flatbuf_Field)
180209
-> Result<ArrowArrayHolder, ArrowError> {
181-
if isNestedType(field.typeType) {
210+
switch field.typeType {
211+
case .struct_:
182212
return loadStructData(loadInfo, field: field)
183-
} else if isFixedPrimitive(field.typeType) {
184-
return loadPrimitiveData(loadInfo, field: field)
185-
} else {
186-
return loadVariableData(loadInfo, field: field)
213+
case .list:
214+
return loadListData(loadInfo, field: field)
215+
default:
216+
if isFixedPrimitive(field.typeType) {
217+
return loadPrimitiveData(loadInfo, field: field)
218+
} else {
219+
return loadVariableData(loadInfo, field: field)
220+
}
187221
}
188222
}
189223

Arrow/Sources/Arrow/ArrowReaderHelper.swift

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,23 @@ func makeStructHolder(
136136
}
137137
}
138138

139+
/// Wraps the supplied buffers and child data in a `ListArray` holder.
///
/// - Parameters:
///   - field: Field whose `type` is used for the array data.
///   - buffers: Buffers handed to `ArrowData`.
///   - nullCount: Number of null slots.
///   - children: Child data handed to `ArrowData`.
///   - rbLength: Value used as the array length.
/// - Returns: `.success` with the holder, `.failure` with the thrown
///   `ArrowError`, or `.failure(.unknownError)` describing any other error.
func makeListHolder(
    _ field: ArrowField,
    buffers: [ArrowBuffer],
    nullCount: UInt,
    children: [ArrowData],
    rbLength: UInt
) -> Result<ArrowArrayHolder, ArrowError> {
    let attempt = Result<ArrowArrayHolder, Error> {
        let listData = try ArrowData(
            field.type,
            buffers: buffers,
            children: children,
            nullCount: nullCount,
            length: rbLength
        )
        return ArrowArrayHolderImpl(try ListArray(listData))
    }

    // ArrowError values pass through unchanged; anything else is wrapped.
    return attempt.mapError { error in
        if let arrowError = error as? ArrowError {
            return arrowError
        }
        return ArrowError.unknownError("\(error)")
    }
}
155+
139156
func makeArrayHolder(
140157
_ field: org_apache_arrow_flatbuf_Field,
141158
buffers: [ArrowBuffer],
@@ -188,6 +205,8 @@ func makeArrayHolder( // swiftlint:disable:this cyclomatic_complexity
188205
return makeTimeHolder(field, buffers: buffers, nullCount: nullCount)
189206
case .strct:
190207
return makeStructHolder(field, buffers: buffers, nullCount: nullCount, children: children!, rbLength: rbLength)
208+
case .list:
209+
return makeListHolder(field, buffers: buffers, nullCount: nullCount, children: children!, rbLength: rbLength)
191210
default:
192211
return .failure(.unknownType("Type \(typeId) currently not supported"))
193212
}
@@ -210,15 +229,6 @@ func isFixedPrimitive(_ type: org_apache_arrow_flatbuf_Type_) -> Bool {
210229
}
211230
}
212231

213-
func isNestedType(_ type: org_apache_arrow_flatbuf_Type_) -> Bool {
214-
switch type {
215-
case .struct_:
216-
return true
217-
default:
218-
return false
219-
}
220-
}
221-
222232
func findArrowType( // swiftlint:disable:this cyclomatic_complexity function_body_length
223233
_ field: org_apache_arrow_flatbuf_Field) -> ArrowType {
224234
let type = field.typeType
@@ -271,7 +281,13 @@ func findArrowType( // swiftlint:disable:this cyclomatic_complexity function_bod
271281
ArrowField(childField.name ?? "", type: childType, isNullable: childField.nullable))
272282
}
273283

274-
return ArrowNestedType(ArrowType.ArrowStruct, fields: fields)
284+
return ArrowTypeStruct(ArrowType.ArrowStruct, fields: fields)
285+
case .list:
286+
guard field.childrenCount == 1, let childField = field.children(at: 0) else {
287+
return ArrowType(ArrowType.ArrowUnknown)
288+
}
289+
let childType = findArrowType(childField)
290+
return ArrowTypeList(childType)
275291
default:
276292
return ArrowType(ArrowType.ArrowUnknown)
277293
}

0 commit comments

Comments
 (0)