diff --git a/README.md b/README.md index 41e8058..8a0a9c9 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ A Swift implementation of Apache Arrow, the universal columnar format for fast data interchange and in-memory analytics. +This is a **work in progress**. Do not use in production. Progress is fast however, expect a beta in December. + This project is based on Arrow-Swift, the official Swift implementation of Apache Arrow. The decision was made to at least temporarily operate independently of the Apache Software Foundation (ASF). Currently there are no active ASF maintaners with knowledge of Swift, and the only [Apache approved CI for Swift](https://github.com/apache/infrastructure-actions/blob/main/approved_patterns.yml) is [setup-swift which is unmaintained](https://github.com/swift-actions/setup-swift/issues), leading to intermittent CI failures. This has led to delays in much-needed fixes being implemented. The intention is to continue contributing to the official Apache-Swift repository, however changes can be iterated on more quickly here. diff --git a/Sources/Arrow/ArrowArray.swift b/Sources/Arrow/ArrowArray.swift index 24034d7..d938f93 100644 --- a/Sources/Arrow/ArrowArray.swift +++ b/Sources/Arrow/ArrowArray.swift @@ -22,15 +22,13 @@ public protocol AnyArrowArray { var length: UInt { get } var nullCount: UInt { get } var arrowData: ArrowData { get } - var bufferData: [Data] { get } - var bufferDataSizes: [Int] { get } + var bufferData: [Data] { get } // TODO: remove + var bufferDataSizes: [Int] { get } // TODO: remove func asAny(_ index: UInt) -> Any? func asString(_ index: UInt) -> String func setCArrayPtr(_ cArrayPtr: UnsafePointer?) } -// MARK: Core Protocol - /// The interface for Arrow array types. public protocol ArrowArray: AnyArrowArray { associatedtype ItemType @@ -86,16 +84,14 @@ extension ArrowArrayBase { arrowData.type } + // TODO: Remove public var bufferData: [Data] { - arrowData.buffers.map { buffer in - var data = Data() - buffer.append(to: &data) - return data - } + arrowData.bufferData } + // TODO: Remove public var bufferDataSizes: [Int] { - arrowData.buffers.map { Int($0.capacity) } + arrowData.bufferDataSizes } public func isNull(at index: UInt) throws(ArrowError) -> Bool { @@ -114,17 +110,8 @@ public class FixedArray: ArrowArrayBase where T: BitwiseCopyable { if arrowData.isNull(index) { return nil } - let byteOffset = arrowData.stride * Int(index) - - // FIXME: Can probably do this and remove BitwiseCopyable constraint. - // let buffer = UnsafeBufferPointer( - // start: arrowData.buffers[1].rawPointer.assumingMemoryBound(to: ItemType.self), - // count: Int(arrowData.length) - // ) - // return buffer[Int(index)] - return arrowData.buffers[1].rawPointer - .advanced(by: byteOffset) - .load(as: ItemType.self) + let value: ItemType = arrowData.load(at: index) + return value } } @@ -134,28 +121,12 @@ public class StringArray: ArrowArrayBase { if self.arrowData.isNull(index) { return nil } - - let offsets = self.arrowData.buffers[1] - let offsetIndex = MemoryLayout.stride * Int(index) - var startIndex: Int32 = 0 - if index > 0 { - startIndex = offsets.rawPointer.advanced(by: offsetIndex) - .load(as: Int32.self) - } - let endIndex = offsets.rawPointer.advanced( - by: offsetIndex + MemoryLayout.stride - ) - .load(as: Int32.self) - + let offsetBuffer: OffsetsBuffer = arrowData.offsets + let (startIndex, endIndex) = offsetBuffer.offsets(at: Int(index)) let arrayLength = Int(endIndex - startIndex) - let values = self.arrowData.buffers[2] - let rawPointer = values.rawPointer.advanced(by: Int(startIndex)) - .bindMemory(to: UInt8.self, capacity: arrayLength) - let buffer = UnsafeBufferPointer( - start: rawPointer, - count: arrayLength - ) - return String(bytes: buffer, encoding: .utf8) + let value: String = self.arrowData.loadVariable( + at: Int(startIndex), arrayLength: arrayLength) + return value } } @@ -165,8 +136,7 @@ public class BoolArray: ArrowArrayBase { if self.arrowData.isNull(index) { return nil } - let valueBuffer = self.arrowData.buffers[1] - return BitUtility.isSet(index, buffer: valueBuffer) + return arrowData.isNullValue(at: index) } } @@ -176,10 +146,7 @@ public class Date32Array: ArrowArrayBase { if self.arrowData.isNull(index) { return nil } - let byteOffset = self.arrowData.stride * Int(index) - let milliseconds = self.arrowData.buffers[1].rawPointer.advanced( - by: byteOffset - ).load(as: UInt32.self) + let milliseconds: UInt32 = arrowData.load(at: index) return Date(timeIntervalSince1970: TimeInterval(milliseconds * 86400)) } } @@ -190,10 +157,8 @@ public class Date64Array: ArrowArrayBase { if self.arrowData.isNull(index) { return nil } - let byteOffset = self.arrowData.stride * Int(index) - let milliseconds = self.arrowData.buffers[1].rawPointer.advanced( - by: byteOffset - ).load(as: UInt64.self) + + let milliseconds: UInt64 = self.arrowData.load(at: index) return Date(timeIntervalSince1970: TimeInterval(milliseconds / 1000)) } } @@ -298,28 +263,17 @@ public class BinaryArray: ArrowArrayBase { public var options = Options() public override subscript(_ index: UInt) -> Data? { - let offsetIndex = MemoryLayout.stride * Int(index) if self.arrowData.isNull(index) { return nil } - let offsets = self.arrowData.buffers[1] - let values = self.arrowData.buffers[2] - var startIndex: Int32 = 0 - if index > 0 { - startIndex = offsets.rawPointer.advanced(by: offsetIndex) - .load(as: Int32.self) - } - let endIndex = offsets.rawPointer.advanced( - by: offsetIndex + MemoryLayout.stride - ) - .load(as: Int32.self) + + let (startIndex, endIndex) = arrowData.offsets.offsets(at: Int(index)) + let arrayLength = Int(endIndex - startIndex) - let rawPointer = values.rawPointer.advanced(by: Int(startIndex)) - .bindMemory(to: UInt8.self, capacity: arrayLength) - let buffer = UnsafeBufferPointer( - start: rawPointer, count: arrayLength) - let byteArray = Array(buffer) - return Data(byteArray) + + let data: Data = self.arrowData.loadVariable( + at: Int(startIndex), arrayLength: arrayLength) + return data } public override func asString(_ index: UInt) -> String { @@ -381,16 +335,10 @@ public class NestedArray: ArrowArrayBase<[Any?]> { switch arrowData.type { case .list(let _): guard let values = children.first else { return nil } - let offsets = self.arrowData.buffers[1] - let offsetIndex = Int(index) * MemoryLayout.stride - let startOffset = offsets.rawPointer.advanced(by: offsetIndex) - .load(as: Int32.self) - let endOffset = offsets.rawPointer.advanced( - by: offsetIndex + MemoryLayout.stride - ) - .load(as: Int32.self) + + let (startIndex, endIndex) = arrowData.offsets.offsets(at: Int(index)) var items: [Any?] = [] - for i in startOffset...allocate( @@ -132,7 +135,7 @@ public class ArrowCExporter { cArray.buffers = exportArray.buffers cArray.length = Int64(arrowData.length) cArray.null_count = Int64(arrowData.nullCount) - cArray.n_buffers = Int64(arrowData.buffers.count) + cArray.n_buffers = Int64(arrowData.bufferCount) // Swift Arrow does not currently support children or dictionaries // This will need to be updated once support has been added cArray.n_children = 0 diff --git a/Sources/Arrow/ArrowData.swift b/Sources/Arrow/ArrowData.swift index a647d8e..e6b79f9 100644 --- a/Sources/Arrow/ArrowData.swift +++ b/Sources/Arrow/ArrowData.swift @@ -14,15 +14,73 @@ import Foundation +protocol VariableLength { + init(_ value: UnsafeBufferPointer) +} + +extension String: VariableLength { + init(_ value: UnsafeBufferPointer) { + self.init(decoding: value, as: Unicode.UTF8.self) + } +} + +extension Data: VariableLength { + init(value: UnsafeBufferPointer) { + self.init(value) + } +} + public struct ArrowData { + + // FIXME: Remove + public var bufferData: [Data] { + buffers.map { buffer in + var data = Data() + buffer.append(to: &data) + return data + } + } + + // FIXME: Remove + public var bufferDataSizes: [Int] { + buffers.map { Int($0.capacity) } + } + + // FIXME: Remove + public var data: [UnsafeMutableRawPointer] { + buffers.map { $0.rawPointer } + } + + // FIXME: Remove + public var bufferCount: Int { + buffers.count + } + + // TODO: Typed accessors - migration + var offsets: OffsetsBuffer { + if !type.isVariable && !type.isNested { + fatalError() + } + return ArrowBufferBackedOffsets(buffers[1]) + } + + // TODO: this should replace nullBuffer + var nulls: NullBuffer { + let buffer = buffers[0] + let pointer = buffer.rawPointer.assumingMemoryBound(to: UInt8.self) + return NullBuffer( + length: Int(buffer.length), capacity: 0, ownsMemory: false, + buffer: pointer) + } + public let type: ArrowType - public let buffers: [ArrowBuffer] public let children: [ArrowData] public let nullCount: UInt public let length: UInt - public let stride: Int - let nullBuffer: ArrowBuffer + private let nullBuffer: ArrowBuffer + // FIXME: Remove + private let buffers: [ArrowBuffer] init( _ arrowType: ArrowType, @@ -49,12 +107,49 @@ public struct ArrowData { self.children = children self.nullCount = nullCount self.length = length - self.stride = arrowType.getStride() - self.nullBuffer = buffers[0] } + // TODO: Temporary while removing ArrowBuffer + public func load(at index: UInt) -> T where T: BitwiseCopyable { + let valueType = T.self + let byteOffset = type.getStride() * Int(index) + let milliseconds = buffers[1].rawPointer.advanced( + by: byteOffset + ).load(as: valueType) + return milliseconds + } + + // TODO: Temporary while removing ArrowBuffer + func loadVariable( + at startIndex: Int, + arrayLength: Int + ) -> T where T: VariableLength { + let values = buffers[2] + let rawPointer = values.rawPointer.advanced(by: startIndex) + .bindMemory(to: UInt8.self, capacity: arrayLength) + let buffer = UnsafeBufferPointer( + start: rawPointer, count: arrayLength) + return T(buffer) + } + + // TODO: Temporary while removing ArrowBuffer public func isNull(_ at: UInt) -> Bool { - nullBuffer.length > 0 && !BitUtility.isSet(at, buffer: nullBuffer) + let a = nulls.length > 0 && !nulls.isSet(Int(at)) + let b = nullBuffer.length > 0 && !BitUtility.isSet(at, buffer: nullBuffer) + if nulls.length != nullBuffer.length { + fatalError("Check new null handling") + } + if a != b { + fatalError("Check new null handling") + } + return a } + + // TODO: Temporary while removing ArrowBuffer + func isNullValue(at index: UInt) -> Bool { + let valueBuffer = buffers[1] + return BitUtility.isSet(index, buffer: valueBuffer) + } + } diff --git a/Sources/Arrow/ArrowReader.swift b/Sources/Arrow/ArrowReader.swift index 166839a..a2310ab 100644 --- a/Sources/Arrow/ArrowReader.swift +++ b/Sources/Arrow/ArrowReader.swift @@ -30,6 +30,8 @@ public struct ArrowReader: Sendable { let startOffset = messageOffset + buffer.offset let endOffset = startOffset + buffer.length + let range = Int(startOffset).. (start: Int32, end: Int32) { + + let offsets = arrowBuffer + let offsetIndex = MemoryLayout.stride * Int(index) + var startIndex: Int32 = 0 + if index > 0 { + startIndex = offsets.rawPointer.advanced(by: offsetIndex) + .load(as: Int32.self) + } + let endIndex = offsets.rawPointer.advanced( + by: offsetIndex + MemoryLayout.stride + ) + .load(as: Int32.self) + + return (start: startIndex, end: endIndex) + } +} diff --git a/Sources/Arrow/Buffer/NullBufferBuilder.swift b/Sources/Arrow/Buffer/NullBufferBuilder.swift index f3046ed..024af9b 100644 --- a/Sources/Arrow/Buffer/NullBufferBuilder.swift +++ b/Sources/Arrow/Buffer/NullBufferBuilder.swift @@ -51,7 +51,6 @@ final class NullBufferBuilder { // ensure we have space to write at index `length` if length >= capacity { resize(to: capacity * 2) - print("capacity: \(capacity)") } buffer[length] = currentByte currentByte = 0 diff --git a/Sources/Arrow/Buffer/OffsetProtocol.swift b/Sources/Arrow/Buffer/OffsetProtocol.swift index 70092af..67f8ae5 100644 --- a/Sources/Arrow/Buffer/OffsetProtocol.swift +++ b/Sources/Arrow/Buffer/OffsetProtocol.swift @@ -15,7 +15,7 @@ /// A type which provides offset ranges in Arrow arrays. protocol OffsetsBuffer { /// Number of offset pairs available - var count: Int { get } + // var count: Int { get } /// Get the start and end offsets for the element at index /// - Parameter index: Zero-based index of the element diff --git a/Sources/Arrow/ChunkedArray.swift b/Sources/Arrow/ChunkedArray.swift index 182caf0..90bd7e1 100644 --- a/Sources/Arrow/ChunkedArray.swift +++ b/Sources/Arrow/ChunkedArray.swift @@ -20,64 +20,11 @@ public class ChunkedArrayHolder { public let nullCount: UInt public let holder: Any - public let getBufferData: () -> Result<[Data], ArrowError> - public let getBufferDataSizes: () -> Result<[Int], ArrowError> public init(_ chunked: ChunkedArray) { self.holder = chunked self.length = chunked.length self.type = chunked.type self.nullCount = chunked.nullCount - self.getBufferData = { () -> Result<[Data], ArrowError> in - var bufferData: [Data] = [] - var numBuffers = 2 - switch toFBTypeEnum(chunked.type) { - case .success(let fbType): - if !isFixedPrimitive(fbType) { - numBuffers = 3 - } - case .failure(let error): - return .failure(error) - } - - for _ in 0.. Result<[Int], ArrowError> in - var bufferDataSizes: [Int] = [] - var numBuffers = 2 - - switch toFBTypeEnum(chunked.type) { - case .success(let fbType): - if !isFixedPrimitive(fbType) { - numBuffers = 3 - } - case .failure(let error): - return .failure(error) - } - - for _ in 0.. { throw ArrowError.arrayHasNoElements } - self.type = arrays[0].arrowData.type + self.type = arrays[0].type var len: UInt = 0 var nullCount: UInt = 0 for array in arrays { diff --git a/Tests/ArrowTests/ArrayTests.swift b/Tests/ArrowTests/ArrayTests.swift index 46df65f..e665c4e 100644 --- a/Tests/ArrowTests/ArrayTests.swift +++ b/Tests/ArrowTests/ArrayTests.swift @@ -40,10 +40,10 @@ struct ArrayTests { #expect(array[10]! == 10) #expect(try array.isNull(at: 100) == true) - for buffer in array.arrowData.buffers { - let dataAddress = UInt(bitPattern: buffer.rawPointer) - #expect(dataAddress % 64 == 0, "Buffer should be 64-byte aligned") - } + // for buffer in array.arrowData.buffers { + // let dataAddress = UInt(bitPattern: buffer.rawPointer) + // #expect(dataAddress % 64 == 0, "Buffer should be 64-byte aligned") + // } } @Test func doubleArray() throws { @@ -91,11 +91,6 @@ struct ArrayTests { @Test func boolArray() throws { - for i in 0..<100 { - let bytesNeeded = (i + 7) / 8 - print("\(i) bits requires \(bytesNeeded) bytes") - } - let boolBuilder = try ArrowArrayBuilders.loadBoolArrayBuilder() boolBuilder.append(true) boolBuilder.append(nil) diff --git a/Tests/ArrowTests/BufferTests.swift b/Tests/ArrowTests/BufferTests.swift index 3335ef4..79d04a9 100644 --- a/Tests/ArrowTests/BufferTests.swift +++ b/Tests/ArrowTests/BufferTests.swift @@ -21,6 +21,9 @@ import Testing struct BufferTests { @Test func nullBufferBuilder() { + + // TODO: consider empty buffers + let mutableNullBuffer = NullBufferBuilder() for i in 0..<10000 { if i % 7 == 0 {