diff --git a/Sources/Arrow/Array/Array.swift b/Sources/Arrow/Array/Array.swift index 12026ea..c4d427c 100644 --- a/Sources/Arrow/Array/Array.swift +++ b/Sources/Arrow/Array/Array.swift @@ -38,26 +38,30 @@ extension ArrowArrayProtocol { // MARK: Capability protocols. -public protocol ArrowArrayOfString { +public protocol StringArrayProtocol { + var length: Int { get } subscript(index: Int) -> String? { get } } -extension ArrowArrayVariable: ArrowArrayOfString where ItemType == String {} +extension ArrowArrayVariable: StringArrayProtocol where ItemType == String {} -public protocol ArrowArrayOfData { - subscript(index: Int) -> Data? { get } -} -extension ArrowArrayFixedSizeBinary: ArrowArrayOfData where ItemType == Data {} -extension ArrowArrayVariable: ArrowArrayOfData where ItemType == Data {} +protocol BinaryArrayProtocol: ArrowArrayProtocol where ItemType == Data {} +extension ArrowArrayFixedSizeBinary: BinaryArrayProtocol {} +extension ArrowArrayVariable: BinaryArrayProtocol +where ItemType == Data, OffsetType: FixedWidthInteger & SignedInteger {} -public protocol ArrowArrayOfInt8 { - subscript(index: Int) -> Int8? { get } -} -extension ArrowArrayFixed: ArrowArrayOfInt8 where ItemType == Int8 {} +protocol Utf8ArrayProtocol: ArrowArrayProtocol where ItemType == String {} +extension ArrowArrayVariable: Utf8ArrayProtocol +where ItemType == String, OffsetType: FixedWidthInteger & SignedInteger {} -public protocol ArrowArrayOfInt32 { - subscript(index: Int) -> Int32? { get } +public protocol ListArrayProtocol { + var length: Int { get } + var values: AnyArrowArrayProtocol { get } + subscript(index: Int) -> AnyArrowArrayProtocol? { get } } -extension ArrowArrayFixed: ArrowArrayOfInt32 where ItemType == Int32 {} +extension ArrowListArray: ListArrayProtocol {} +extension ArrowFixedSizeListArray: ListArrayProtocol {} + +// MARK: Array implementations. /// An Arrow array of booleans using the three-valued logical model (true / false / null). public struct ArrowArrayBoolean: ArrowArrayProtocol { @@ -102,34 +106,32 @@ public struct ArrowArrayBoolean: ArrowArrayProtocol { } /// An Arrow array of fixed-width types. -public struct ArrowArrayFixed: ArrowArrayProtocol -where - ValueBuffer: FixedWidthBufferProtocol, - ValueBuffer.ElementType: Numeric +public struct ArrowArrayNumeric: + ArrowArrayProtocol { - - public typealias ItemType = ValueBuffer.ElementType public let offset: Int public let length: Int + public var nullCount: Int { nullBuffer.nullCount } public var bufferSizes: [Int] { [nullBuffer.length, valueBuffer.length] } public var buffers: [ArrowBufferProtocol] { [nullBuffer, valueBuffer] } - public var nullCount: Int { nullBuffer.nullCount } + let nullBuffer: NullBuffer - let valueBuffer: ValueBuffer + private let valueBuffer: any FixedWidthBufferProtocol - public init( + // Initialize from concrete buffer type + public init( offset: Int = 0, length: Int, nullBuffer: NullBuffer, valueBuffer: ValueBuffer - ) { + ) where ValueBuffer.ElementType == ItemType { self.offset = offset self.length = length self.nullBuffer = nullBuffer self.valueBuffer = valueBuffer } - public subscript(index: Int) -> ValueBuffer.ElementType? { + public subscript(index: Int) -> ItemType? { precondition(index >= 0 && index < length, "Invalid index.") let offsetIndex = self.offset + index if !self.nullBuffer.isSet(offsetIndex) { @@ -148,10 +150,7 @@ where } } -public struct ArrowArrayFixedSizeBinary: ArrowArrayProtocol -where - ValueBuffer: VariableLengthBufferProtocol -{ +public struct ArrowArrayFixedSizeBinary: ArrowArrayProtocol { public typealias ItemType = Data public let offset: Int public let length: Int @@ -163,14 +162,14 @@ where public var nullCount: Int { nullBuffer.nullCount } let nullBuffer: NullBuffer - let valueBuffer: ValueBuffer + let valueBuffer: any VariableLengthBufferProtocol public init( offset: Int = 0, length: Int, byteWidth: Int, nullBuffer: NullBuffer, - valueBuffer: ValueBuffer + valueBuffer: any VariableLengthBufferProtocol ) { self.offset = offset self.length = length @@ -179,7 +178,7 @@ where self.valueBuffer = valueBuffer } - public subscript(index: Int) -> ValueBuffer.ElementType? { + public subscript(index: Int) -> ItemType? { guard nullBuffer.isSet(index) else { return nil } let startIndex = index * byteWidth return valueBuffer.loadVariable(at: startIndex, arrayLength: byteWidth) @@ -197,34 +196,36 @@ where } /// An Arrow array of variable-length types. -public struct ArrowArrayVariable: - ArrowArrayProtocol -where - OffsetsBuffer: FixedWidthBufferProtocol, - ValueBuffer: VariableLengthBufferProtocol, - ValueBuffer.ElementType: VariableLength -{ - public typealias ItemType = ValueBuffer.ElementType +public struct ArrowArrayVariable< + ItemType: VariableLength, + OffsetType: FixedWidthInteger & SignedInteger +>: ArrowArrayProtocol { public let offset: Int public let length: Int + private let nullBuffer: NullBuffer + private let offsetsBuffer: any FixedWidthBufferProtocol + private let valueBuffer: any VariableLengthBufferProtocol + public var bufferSizes: [Int] { [nullBuffer.length, offsetsBuffer.length, valueBuffer.length] } + public var buffers: [ArrowBufferProtocol] { [nullBuffer, offsetsBuffer, valueBuffer] } + public var nullCount: Int { nullBuffer.nullCount } - let nullBuffer: NullBuffer - let offsetsBuffer: OffsetsBuffer - let valueBuffer: ValueBuffer - public init( + public init< + Offsets: FixedWidthBufferProtocol, + Values: VariableLengthBufferProtocol + >( offset: Int = 0, length: Int, nullBuffer: NullBuffer, - offsetsBuffer: OffsetsBuffer, - valueBuffer: ValueBuffer - ) { + offsetsBuffer: Offsets, + valueBuffer: Values + ) where Values.ElementType == ItemType { self.offset = offset self.length = length self.nullBuffer = nullBuffer @@ -232,16 +233,19 @@ where self.valueBuffer = valueBuffer } - public subscript(index: Int) -> ValueBuffer.ElementType? { + public subscript(index: Int) -> ItemType? { let offsetIndex = self.offset + index - if !self.nullBuffer.isSet(offsetIndex) { + guard self.nullBuffer.isSet(offsetIndex) else { return nil } - let startIndex = offsetsBuffer[offsetIndex] - let endIndex = offsetsBuffer[offsetIndex + 1] + + // Use runtime dispatch through the existential + let startOffset = offsetsBuffer[offsetIndex] + let endOffset = offsetsBuffer[offsetIndex + 1] + return valueBuffer.loadVariable( - at: Int(startIndex), - arrayLength: Int(endIndex - startIndex) + at: Int(startOffset), + arrayLength: Int(endOffset - startOffset) ) } @@ -257,17 +261,14 @@ where } /// An Arrow array of `Date`s with a resolution of 1 day. -public struct ArrowArrayDate32: ArrowArrayProtocol -where - ValueBuffer: FixedWidthBufferProtocol -{ +public struct ArrowArrayDate32: ArrowArrayProtocol { public typealias ItemType = Date public var bufferSizes: [Int] { array.bufferSizes } public var buffers: [ArrowBufferProtocol] { array.buffers } public var nullCount: Int { array.nullCount } public var offset: Int { array.offset } public var length: Int { array.length } - let array: ArrowArrayFixed + let array: ArrowArrayNumeric public subscript(index: Int) -> Date? { precondition(index >= 0 && index < length, "Invalid index.") @@ -287,17 +288,14 @@ where } /// An Arrow array of `Date`s with a resolution of 1 second. -public struct ArrowArrayDate64: ArrowArrayProtocol -where - ValueBuffer: FixedWidthBufferProtocol -{ +public struct ArrowArrayDate64: ArrowArrayProtocol { public typealias ItemType = Date public var bufferSizes: [Int] { array.bufferSizes } public var buffers: [ArrowBufferProtocol] { array.buffers } public var nullCount: Int { array.nullCount } public var offset: Int { array.offset } public var length: Int { array.length } - let array: ArrowArrayFixed + let array: ArrowArrayNumeric public subscript(index: Int) -> Date? { precondition(index >= 0 && index < length, "Invalid index.") @@ -316,13 +314,12 @@ where } } -/// A strongly-typed Arrow list array which may be nested arbitrarily. -public struct ArrowListArray: ArrowArrayProtocol +///// An Arrow list array which may be nested arbitrarily. +public struct ArrowListArray: ArrowArrayProtocol where - OffsetsBuffer: FixedWidthBufferProtocol, - Element: AnyArrowArrayProtocol + OffsetsBuffer: FixedWidthBufferProtocol, + OffsetsBuffer.ElementType: FixedWidthInteger & SignedInteger { - public typealias ItemType = Element public let offset: Int public let length: Int public var bufferSizes: [Int] { @@ -332,16 +329,17 @@ where [nullBuffer, offsetsBuffer] } public var nullCount: Int { nullBuffer.nullCount } + let nullBuffer: NullBuffer let offsetsBuffer: OffsetsBuffer - let values: Element + public let values: AnyArrowArrayProtocol public init( offset: Int = 0, length: Int, nullBuffer: NullBuffer, offsetsBuffer: OffsetsBuffer, - values: Element + values: AnyArrowArrayProtocol ) { self.offset = offset self.length = length @@ -350,7 +348,7 @@ where self.values = values } - public subscript(index: Int) -> Element? { + public subscript(index: Int) -> AnyArrowArrayProtocol? { precondition(index >= 0 && index < length, "Invalid index.") let offsetIndex = self.offset + index if !self.nullBuffer.isSet(offsetIndex) { @@ -373,45 +371,59 @@ where } } -/// A type-erased wrapper for an Arrow list array. -public struct AnyArrowListArray: ArrowArrayProtocol { +/// An Arrow list array with fixed size elements. +public struct ArrowFixedSizeListArray: ArrowArrayProtocol { + public let offset: Int + public let length: Int + public let listSize: Int - public typealias ItemType = AnyArrowArrayProtocol public var bufferSizes: [Int] { - _base.bufferSizes + [nullBuffer.length] } + public var buffers: [ArrowBufferProtocol] { - _base.buffers + [nullBuffer] } - private let _base: any ArrowArrayProtocol - private let _subscriptImpl: (Int) -> AnyArrowArrayProtocol? - private let _sliceImpl: (Int, Int) -> AnyArrowListArray + public var nullCount: Int { nullBuffer.nullCount } + + let nullBuffer: NullBuffer + public let values: AnyArrowArrayProtocol - public let offset: Int - public let length: Int - public var nullCount: Int { _base.nullCount } - - public init( - _ list: ArrowListArray - ) - where - OffsetsBuffer: FixedWidthBufferProtocol, - Element: AnyArrowArrayProtocol - { - self._base = list - self.offset = list.offset - self.length = list.length - self._subscriptImpl = { list[$0] } - self._sliceImpl = { AnyArrowListArray(list.slice(offset: $0, length: $1)) } + public init( + offset: Int = 0, + length: Int, + listSize: Int, + nullBuffer: NullBuffer, + values: AnyArrowArrayProtocol + ) { + self.offset = offset + self.length = length + self.listSize = listSize + self.nullBuffer = nullBuffer + self.values = values } public subscript(index: Int) -> AnyArrowArrayProtocol? { - _subscriptImpl(index) + precondition(index >= 0 && index < length, "Invalid index.") + let offsetIndex = self.offset + index + + if !self.nullBuffer.isSet(offsetIndex) { + return nil + } + + let startIndex = offsetIndex * listSize + return values.slice(offset: startIndex, length: listSize) } - public func slice(offset: Int, length: Int) -> AnyArrowListArray { - _sliceImpl(offset, length) + public func slice(offset: Int, length: Int) -> Self { + .init( + offset: self.offset + offset, + length: length, + listSize: listSize, + nullBuffer: nullBuffer, + values: values + ) } } diff --git a/Sources/Arrow/Array/Builder.swift b/Sources/Arrow/Array/Builder.swift index d79db89..1913f40 100644 --- a/Sources/Arrow/Array/Builder.swift +++ b/Sources/Arrow/Array/Builder.swift @@ -75,9 +75,11 @@ public class ArrayBuilderBoolean: AnyArrayBuilder { } /// A builder for Arrow arrays holding fixed-width types. -public class ArrayBuilderFixedWidth: AnyArrayBuilder { +public class ArrayBuilderFixedWidth: + AnyArrayBuilder +{ - public typealias ArrayType = ArrowArrayFixed> + public typealias ArrayType = ArrowArrayNumeric public var length: Int let nullBuilder: NullBufferBuilder @@ -118,9 +120,7 @@ public class ArrayBuilderFixedWidth: AnyArrayBuilder { public class ArrayBuilderFixedSizedBinary: AnyArrayBuilder { - public typealias ArrayType = ArrowArrayFixedSizeBinary< - VariableLengthTypeBuffer - > + public typealias ArrayType = ArrowArrayFixedSizeBinary var length: Int let byteWidth: Int @@ -171,24 +171,23 @@ public class ArrayBuilderFixedSizedBinary: } /// A builder for Arrow arrays holding variable length types. -public class ArrayBuilderVariableLength: - AnyArrayBuilder -{ - public typealias ArrayType = ArrowArrayVariable< - FixedWidthBuffer, VariableLengthTypeBuffer - > +public class ArrayBuilderVariableLength< + Element: VariableLength, OffsetType: FixedWidthInteger & SignedInteger +>: AnyArrayBuilder { + + public typealias ArrayType = ArrowArrayVariable var length: Int let nullBuilder: NullBufferBuilder - let offsetsBuilder: FixedWidthBufferBuilder + let offsetsBuilder: FixedWidthBufferBuilder let valueBuilder: VariableLengthTypeBufferBuilder public init() { self.length = 0 self.nullBuilder = NullBufferBuilder() - self.offsetsBuilder = FixedWidthBufferBuilder() + self.offsetsBuilder = FixedWidthBufferBuilder() self.valueBuilder = VariableLengthTypeBufferBuilder() - self.offsetsBuilder.append(Int32.zero) + self.offsetsBuilder.append(OffsetType.zero) } public func append(_ value: Element) { @@ -204,40 +203,36 @@ public class ArrayBuilderVariableLength: valueBuilder.increaseCapacity(to: newCapacity) } valueBuilder.append(data) - let newOffset = Int32(valueBuilder.length) + let newOffset = OffsetType(valueBuilder.length) offsetsBuilder.append(newOffset) } public func appendNull() { length += 1 nullBuilder.appendValid(false) - let newOffset = Int32(valueBuilder.length) + let newOffset = OffsetType(valueBuilder.length) offsetsBuilder.append(newOffset) } public func finish() -> ArrayType { - let nullBuffer = nullBuilder.finish() - let offsetsBuffer = offsetsBuilder.finish() - let valueBuffer = valueBuilder.finish() - return .init( - offset: 0, + ArrayType( length: length, - nullBuffer: nullBuffer, - offsetsBuffer: offsetsBuffer, - valueBuffer: valueBuffer + nullBuffer: nullBuilder.finish(), + offsetsBuffer: offsetsBuilder.finish(), + valueBuffer: valueBuilder.finish() ) } } /// A builder for Arrow arrays holding `String` values. -typealias ArrayBuilderString = ArrayBuilderVariableLength +typealias ArrayBuilderString = ArrayBuilderVariableLength /// A builder for Arrow arrays holding `Data` values. -typealias ArrayBuilderBinary = ArrayBuilderVariableLength +typealias ArrayBuilderBinary = ArrayBuilderVariableLength /// A builder for Arrow arrays holding `Date`s with a resolution of one day. public struct ArrayBuilderDate32: AnyArrayBuilder { - public typealias ArrayType = ArrowArrayDate32> + public typealias ArrayType = ArrowArrayDate32 let builder: ArrayBuilderFixedWidth = .init() public init() {} @@ -262,7 +257,7 @@ public struct ArrayBuilderDate32: AnyArrayBuilder { /// A builder for Arrow arrays holding `Date`s with a resolution of one day. public struct ArrayBuilderDate64: AnyArrayBuilder { - public typealias ArrayType = ArrowArrayDate64> + public typealias ArrayType = ArrowArrayDate64 let builder: ArrayBuilderFixedWidth = .init() public init() {} @@ -294,11 +289,9 @@ public typealias ArrayBuilderTime64 = ArrayBuilderFixedWidth /// A builder for Arrow arrays holding Timestamp values. public typealias ArrayBuilderTimestamp = ArrayBuilderFixedWidth -class ArrayBuilderList: AnyArrayBuilder { - - func append(_ value: T.ArrayType) {} +class ArrayBuilderList { - typealias ArrayType = ArrowListArray> + typealias ArrayType = ArrowListArray> var length: Int let nullBuilder: NullBufferBuilder @@ -318,7 +311,6 @@ class ArrayBuilderList: AnyArrayBuilder { length += 1 nullBuilder.appendValid(true) - // let startLength = valueBuilder.length builder(valueBuilder) // User adds items to child builder let endLength = valueBuilder.length @@ -342,7 +334,7 @@ class ArrayBuilderList: AnyArrayBuilder { length: length, nullBuffer: nullBuffer, offsetsBuffer: offsetsBuffer, - values: valuesArray + values: valuesArray // Now accepts AnyArrowArrayProtocol ) } } diff --git a/Sources/Arrow/ArrowField.swift b/Sources/Arrow/ArrowField.swift index 57ab839..600dea2 100644 --- a/Sources/Arrow/ArrowField.swift +++ b/Sources/Arrow/ArrowField.swift @@ -50,13 +50,18 @@ extension ArrowField { /// Default list member field name. public static let listFieldDefaultName = "item" - /// Creates a new field with the given name, data type, and nullability. - public init(name: String, dataType: ArrowType, isNullable: Bool) { + /// Creates a new field with the given name, data type, nullability and metadata. + public init( + name: String, + dataType: ArrowType, + isNullable: Bool, + metadata: [String: String] = [:] + ) { self.name = name self.type = dataType self.isNullable = isNullable self.orderedDict = false - self.metadata = .init() + self.metadata = metadata } /// Creates a new `ArrowField` suitable for `ArrowType::List`. diff --git a/Sources/Arrow/ArrowSchema.swift b/Sources/Arrow/ArrowSchema.swift index 5549f37..735f39d 100644 --- a/Sources/Arrow/ArrowSchema.swift +++ b/Sources/Arrow/ArrowSchema.swift @@ -18,14 +18,16 @@ import Foundation public final class ArrowSchema: Sendable { public let fields: [ArrowField] public let fieldLookup: [String: Int] - init(_ fields: [ArrowField]) { + let metadata: [String: String]? + + public init(_ fields: [ArrowField], metadata: [String: String]? = nil) { var fieldLookup: [String: Int] = [:] for (index, field) in fields.enumerated() { fieldLookup[field.name] = index } - self.fields = fields self.fieldLookup = fieldLookup + self.metadata = metadata } public func field(_ index: Int) -> ArrowField { diff --git a/Sources/Arrow/ArrowType.swift b/Sources/Arrow/ArrowType.swift index ddfc539..829c9de 100644 --- a/Sources/Arrow/ArrowType.swift +++ b/Sources/Arrow/ArrowType.swift @@ -518,7 +518,7 @@ extension ArrowType { /// Returns true if the type is primitive: (numeric, temporal). @inlinable - public func isPrimitive() -> Bool { + public var isPrimitive: Bool { self.isNumeric || self.isTemporal } @@ -1023,7 +1023,6 @@ extension ArrowType { } else if from == "u" { return .utf8 } - throw .notImplemented } } diff --git a/Sources/Arrow/Buffer/ArrowBufferProtocol.swift b/Sources/Arrow/Buffer/ArrowBufferProtocol.swift index b192c3c..e4d00de 100644 --- a/Sources/Arrow/Buffer/ArrowBufferProtocol.swift +++ b/Sources/Arrow/Buffer/ArrowBufferProtocol.swift @@ -16,13 +16,13 @@ import Foundation /// An Arrow buffer. public protocol ArrowBufferProtocol { + var length: Int { get } func withUnsafeBytes( _ body: (UnsafeRawBufferPointer) throws -> R ) rethrows -> R } internal protocol ArrowBufferUInt8: ArrowBufferProtocol { - var length: Int { get } var buffer: UnsafePointer { get } } diff --git a/Sources/Arrow/Buffer/FixedWidthBuffer.swift b/Sources/Arrow/Buffer/FixedWidthBuffer.swift index a90379e..a0023f1 100644 --- a/Sources/Arrow/Buffer/FixedWidthBuffer.swift +++ b/Sources/Arrow/Buffer/FixedWidthBuffer.swift @@ -19,9 +19,8 @@ public protocol FixedWidthBufferProtocol: ArrowBufferProtocol { } /// A buffer used in Arrow arrays that hold fixed-width types. -public final class FixedWidthBuffer: FixedWidthBufferProtocol +final class FixedWidthBuffer: FixedWidthBufferProtocol where T: Numeric { - public typealias ElementType = T public var length: Int var capacity: Int diff --git a/Sources/ArrowIPC/Array+IPC.swift b/Sources/ArrowIPC/Array+IPC.swift index 79c5f04..83bfb71 100644 --- a/Sources/ArrowIPC/Array+IPC.swift +++ b/Sources/ArrowIPC/Array+IPC.swift @@ -15,65 +15,65 @@ import Arrow import Foundation -/// A `Data` backed Arrow utf8 array. -typealias ArrowArrayUtf8 = ArrowArrayVariable< - FixedWidthBufferIPC, - VariableLengthBufferIPC -> - -extension ArrowArrayUtf8 { - - /// Build a `Data` backed Arrow utf8 array. - /// - Parameters: - /// - length: The array length. - /// - nullBuffer: The null buffer. - /// - offsetsBuffer: A view over file-backed data. - /// - valueBuffer: A view over file-backed data. - /// - Returns: A file-backed Arrow utf8 array. - static func utf8( - length: Int, - nullBuffer: NullBuffer, - offsetsBuffer: FileDataBuffer, - valueBuffer: FileDataBuffer - ) -> Self { - let offsetsBufferTyped = FixedWidthBufferIPC(buffer: offsetsBuffer) - let valueBufferTyped = VariableLengthBufferIPC(buffer: valueBuffer) - return Self( - length: length, - nullBuffer: nullBuffer, - offsetsBuffer: offsetsBufferTyped, - valueBuffer: valueBufferTyped - ) - } -} - -typealias ArrowArrayBinary = ArrowArrayVariable< - FixedWidthBufferIPC, - VariableLengthBufferIPC -> - -extension ArrowArrayBinary { - - /// Build a `Data` backed Arrow binary array. - /// - Parameters: - /// - length: The array length. - /// - nullBuffer: The null buffer. - /// - offsetsBuffer: A view over file-backed data. - /// - valueBuffer: A view over file-backed data. - /// - Returns: A file-backed Arrow utf8 array. - static func binary( - length: Int, - nullBuffer: NullBuffer, - offsetsBuffer: FileDataBuffer, - valueBuffer: FileDataBuffer - ) -> Self { - let offsetsBufferTyped = FixedWidthBufferIPC(buffer: offsetsBuffer) - let valueBufferTyped = VariableLengthBufferIPC(buffer: valueBuffer) - return Self( - length: length, - nullBuffer: nullBuffer, - offsetsBuffer: offsetsBufferTyped, - valueBuffer: valueBufferTyped - ) - } -} +///// A `Data` backed Arrow utf8 array. +//typealias ArrowArrayUtf8 = ArrowArrayVariable< +// FixedWidthBufferIPC, +// VariableLengthBufferIPC +//> +// +//extension ArrowArrayUtf8 { +// +// /// Build a `Data` backed Arrow utf8 array. +// /// - Parameters: +// /// - length: The array length. +// /// - nullBuffer: The null buffer. +// /// - offsetsBuffer: A view over file-backed data. +// /// - valueBuffer: A view over file-backed data. +// /// - Returns: A file-backed Arrow utf8 array. +// static func utf8( +// length: Int, +// nullBuffer: NullBuffer, +// offsetsBuffer: FileDataBuffer, +// valueBuffer: FileDataBuffer +// ) -> Self { +// let offsetsBufferTyped = FixedWidthBufferIPC(buffer: offsetsBuffer) +// let valueBufferTyped = VariableLengthBufferIPC(buffer: valueBuffer) +// return Self( +// length: length, +// nullBuffer: nullBuffer, +// offsetsBuffer: offsetsBufferTyped, +// valueBuffer: valueBufferTyped +// ) +// } +//} +// +//typealias ArrowArrayBinary = ArrowArrayVariable< +// FixedWidthBufferIPC, +// VariableLengthBufferIPC +//> +// +//extension ArrowArrayBinary { +// +// /// Build a `Data` backed Arrow binary array. +// /// - Parameters: +// /// - length: The array length. +// /// - nullBuffer: The null buffer. +// /// - offsetsBuffer: A view over file-backed data. +// /// - valueBuffer: A view over file-backed data. +// /// - Returns: A file-backed Arrow utf8 array. +// static func binary( +// length: Int, +// nullBuffer: NullBuffer, +// offsetsBuffer: FileDataBuffer, +// valueBuffer: FileDataBuffer +// ) -> Self { +// let offsetsBufferTyped = FixedWidthBufferIPC(buffer: offsetsBuffer) +// let valueBufferTyped = VariableLengthBufferIPC(buffer: valueBuffer) +// return Self( +// length: length, +// nullBuffer: nullBuffer, +// offsetsBuffer: offsetsBufferTyped, +// valueBuffer: valueBufferTyped +// ) +// } +//} diff --git a/Sources/ArrowIPC/ArrowReader.swift b/Sources/ArrowIPC/ArrowReader.swift index 1b6f643..cb21629 100644 --- a/Sources/ArrowIPC/ArrowReader.swift +++ b/Sources/ArrowIPC/ArrowReader.swift @@ -85,7 +85,9 @@ where } /// A `Data` backed buffer for variable-length types. -struct VariableLengthBufferIPC: +struct VariableLengthBufferIPC< + Element: VariableLength, OffsetType: FixedWidthInteger +>: VariableLengthBufferProtocol, ArrowBufferIPC { typealias ElementType = Element @@ -152,7 +154,7 @@ public struct ArrowReader { guard let schema = footer.schema else { throw ArrowError.invalid("Missing schema in footer") } - let arrowSchema = try loadSchema(schema) + let arrowSchema = try loadSchema(schema: schema) var recordBatches: [RecordBatch] = [] // MARK: Record batch parsing @@ -281,20 +283,36 @@ public struct ArrowReader { return makeFixedArray( length: length, elementType: Int8.self, nullBuffer: nullBuffer, buffer: buffer1) + case .uint8: + return makeFixedArray( + length: length, elementType: UInt8.self, + nullBuffer: nullBuffer, buffer: buffer1) case .int16: return makeFixedArray( length: length, elementType: Int16.self, nullBuffer: nullBuffer, buffer: buffer1) + case .uint16: + return makeFixedArray( + length: length, elementType: UInt16.self, + nullBuffer: nullBuffer, buffer: buffer1) case .int32: return makeFixedArray( length: length, elementType: Int32.self, nullBuffer: nullBuffer, buffer: buffer1) + case .uint32: + return makeFixedArray( + length: length, elementType: UInt32.self, + nullBuffer: nullBuffer, buffer: buffer1) case .int64: return makeFixedArray( length: length, elementType: Int64.self, nullBuffer: nullBuffer, buffer: buffer1) + case .uint64: + return makeFixedArray( + length: length, elementType: UInt64.self, + nullBuffer: nullBuffer, buffer: buffer1) default: - throw ArrowError.notImplemented + throw ArrowError.invalid("TODO: Unimplemented arrow type: \(arrowType)") } } else if arrowType.isVariable { let buffer1 = try nextBuffer( @@ -302,47 +320,66 @@ public struct ArrowReader { let buffer2 = try nextBuffer( message: rbMessage, index: &bufferIndex, offset: offset, data: data) + let offsetsBufferTyped = FixedWidthBufferIPC(buffer: buffer1) + if arrowType == .utf8 { - return ArrowArrayVariable.utf8( + let valueBufferTyped = VariableLengthBufferIPC( + buffer: buffer2) + return ArrowArrayVariable( length: length, nullBuffer: nullBuffer, - offsetsBuffer: buffer1, - valueBuffer: buffer2 + offsetsBuffer: offsetsBufferTyped, + valueBuffer: valueBufferTyped ) } else if arrowType == .binary { - return ArrowArrayVariable.binary( + let valueBufferTyped = VariableLengthBufferIPC( + buffer: buffer2) + return ArrowArrayVariable( length: length, nullBuffer: nullBuffer, - offsetsBuffer: buffer1, - valueBuffer: buffer2 + offsetsBuffer: offsetsBufferTyped, + valueBuffer: valueBufferTyped ) } else { throw ArrowError.notImplemented } } else if arrowType.isNested { switch arrowType { - case .list(let field): + case .list(let childField): + let buffer1 = try nextBuffer( + message: rbMessage, index: &bufferIndex, offset: offset, data: data) + var offsetsBuffer = FixedWidthBufferIPC(buffer: buffer1) + let array: AnyArrowArrayProtocol = try loadField( rbMessage: rbMessage, - field: field, + field: childField, offset: offset, nodeIndex: &nodeIndex, bufferIndex: &bufferIndex ) - let buffer1 = try nextBuffer( - message: rbMessage, index: &bufferIndex, offset: offset, data: data) - var offsetsBuffer = FixedWidthBufferIPC(buffer: buffer1) - // TODO: This is a hack for the special-case where buffer length 0 means all-zero offset. - // Can follow the null buffer example. - if offsetsBuffer.length != length + 1 { - let offsetCount = length + 1 - let byteCount = offsetCount * MemoryLayout.stride - let fileDataBuffer = FileDataBuffer( - data: Data(count: byteCount), // Zero-initialized - range: 0..(buffer: fileDataBuffer) + if offsetsBuffer.length == 0 { + // Empty offsets buffer is valid when child array is empty + // There could be any number of empty lists referencing into an empty list + guard array.length == 0 else { + throw ArrowError.invalid( + "Empty offsets buffer but non-empty child array") + } + let emptyBuffer = emptyOffsetBuffer(offsetCount: length + 1) + offsetsBuffer = FixedWidthBufferIPC(buffer: emptyBuffer) + } else { + let requiredBytes = (length + 1) * MemoryLayout.stride + guard offsetsBuffer.length >= requiredBytes else { + throw ArrowError.invalid( + "Offsets buffer too small: need \(requiredBytes) bytes for \(length) lists" + ) + } + // Verify last offset matches child array length + let lastOffset = offsetsBuffer[length] + guard lastOffset == Int32(array.length) else { + throw ArrowError.invalid( + "Expected last offset to match child array length.") + } } return makeListArray( length: length, @@ -350,6 +387,20 @@ public struct ArrowReader { offsetsBuffer: offsetsBuffer, values: array ) + case .fixedSizeList(let field, let listSize): + let array: AnyArrowArrayProtocol = try loadField( + rbMessage: rbMessage, + field: field, + offset: offset, + nodeIndex: &nodeIndex, + bufferIndex: &bufferIndex + ) + return ArrowFixedSizeListArray( + length: length, + listSize: Int(listSize), + nullBuffer: nullBuffer, + values: array + ) case .strct(let fields): var arrays: [(String, AnyArrowArrayProtocol)] = [] for field in fields { @@ -375,7 +426,7 @@ public struct ArrowReader { if case .fixedSizeBinary(let byteWidth) = arrowType { let valueBuffer = try nextBuffer( message: rbMessage, index: &bufferIndex, offset: offset, data: data) - let valueBufferTyped = VariableLengthBufferIPC( + let valueBufferTyped = VariableLengthBufferIPC( buffer: valueBuffer) return ArrowArrayFixedSizeBinary( length: length, @@ -408,48 +459,82 @@ public struct ArrowReader { elementType: T.Type, nullBuffer: NullBuffer, buffer: FileDataBuffer - ) -> ArrowArrayFixed> { + ) -> ArrowArrayNumeric { let fixedBuffer = FixedWidthBufferIPC(buffer: buffer) - return ArrowArrayFixed( + return ArrowArrayNumeric( length: length, nullBuffer: nullBuffer, valueBuffer: fixedBuffer ) } - func makeListArray( + func makeListArray( length: Int, nullBuffer: NullBuffer, - offsetsBuffer: FixedWidthBufferIPC, - values: Element - ) -> AnyArrowListArray where Element: AnyArrowArrayProtocol { - let list = ArrowListArray( + offsetsBuffer: OffsetsBuffer, + values: AnyArrowArrayProtocol + ) -> ArrowListArray + where + OffsetsBuffer: FixedWidthBufferProtocol, + OffsetsBuffer.ElementType: FixedWidthInteger & SignedInteger + { + ArrowListArray( length: length, nullBuffer: nullBuffer, offsetsBuffer: offsetsBuffer, values: values ) - return AnyArrowListArray(list) } - private func loadSchema(_ schema: FSchema) throws(ArrowError) -> ArrowSchema { - let builder = ArrowSchema.Builder() + private func loadSchema(schema: FSchema) throws(ArrowError) -> ArrowSchema { + let metadata = (0.. FileDataBuffer { + let byteCount = offsetCount * MemoryLayout.stride + return FileDataBuffer( + data: Data(count: byteCount), // Zero-initialized + range: 0.. Self { + let fieldType: ArrowType = try .type(for: field) + guard let fieldName = field.name else { + throw .invalid("Field name not found") + } + let fieldMetadata = (0..> + as? ArrowArrayNumeric else { Issue.record("Failed to cast column 0 to ArrowArrayDouble") return @@ -53,7 +53,10 @@ struct ArrowReaderTests { #expect(doubleColumn[4] == 5.5) // Test the String column (index 1) - guard let stringColumn = recordBatch.arrays[1] as? ArrowArrayUtf8 else { + guard + let stringColumn = recordBatch.arrays[1] + as? ArrowArrayVariable + else { Issue.record("Failed to cast column 1 to ArrowArrayString") return } diff --git a/Tests/ArrowIPCTests/Gold/ArrowGold.swift b/Tests/ArrowIPCTests/Gold/ArrowGold.swift index 6e952dc..555bd67 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowGold.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowGold.swift @@ -15,35 +15,108 @@ import Foundation /// The JSON file structure used to validate gold-standard Arrow test files. -struct ArrowGold: Codable { +struct ArrowGold: Codable, Equatable { let schema: Schema let batches: [Batch] let dictionaries: [Dictionary]? - struct Dictionary: Codable { + struct Dictionary: Codable, Equatable { let id: Int let data: Batch } - struct DictionaryInfo: Codable { + struct DictionaryInfo: Codable, Equatable { let id: Int let indexType: FieldType let isOrdered: Bool? } - struct Schema: Codable { + struct Schema: Codable, Equatable { let fields: [Field] + let metadata: [String: String]? + + enum CodingKeys: String, CodingKey { + case fields + case metadata + } + + init(fields: [Field], metadata: [String: String]?) { + self.fields = fields + self.metadata = metadata + } + + init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + self.fields = try container.decode([Field].self, forKey: .fields) + if container.contains(.metadata) { + var metadataArray = try container.nestedUnkeyedContainer( + forKey: .metadata + ) + try self.metadata = buildDictionary(from: &metadataArray) + } else { + self.metadata = nil + } + } } - struct Field: Codable { + struct Field: Codable, Equatable { let name: String let type: FieldType let nullable: Bool let children: [Field]? let dictionary: DictionaryInfo? + let metadata: [String: String]? + + init( + name: String, + type: FieldType, + nullable: Bool, + children: [Field]? = nil, + dictionary: DictionaryInfo? = nil, + metadata: [String: String]? = nil + ) { + self.name = name + self.type = type + self.nullable = nullable + self.children = children + self.dictionary = dictionary + self.metadata = metadata + } + + init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + self.name = try container.decode(String.self, forKey: .name) + self.type = try container.decode(FieldType.self, forKey: .type) + self.nullable = try container.decode(Bool.self, forKey: .nullable) + self.children = try container.decodeIfPresent( + [Field].self, + forKey: .children + ) + self.dictionary = try container.decodeIfPresent( + DictionaryInfo.self, + forKey: .dictionary + ) + if container.contains(.metadata) { + var metadataArray = try container.nestedUnkeyedContainer( + forKey: .metadata + ) + try self.metadata = buildDictionary(from: &metadataArray) + } else { + self.metadata = nil + } + } + + enum CodingKeys: String, CodingKey { + case name + case type + case nullable + case children + case dictionary + case metadata + } } - struct FieldType: Codable { + struct FieldType: Codable, Equatable { let name: String let byteWidth: Int? let bitWidth: Int? @@ -52,14 +125,15 @@ struct ArrowGold: Codable { let scale: Int? let unit: String? let timezone: String? + let listSize: Int? } - struct Batch: Codable { + struct Batch: Codable, Equatable { let count: Int let columns: [Column] } - struct Column: Codable { + struct Column: Codable, Equatable { let name: String let count: Int let validity: [Int]? @@ -77,18 +151,24 @@ struct ArrowGold: Codable { } } - enum Value: Codable { + enum Value: Codable, Equatable { case int(Int) case string(String) case bool(Bool) } } +/// A metadata key-value entry. +private struct KeyValue: Codable, Equatable, Hashable { + let key: String + let value: String +} + /// Arrow gold files data values have variable types. -enum DataValue: Codable { +enum DataValue: Codable, Equatable { case string(String) case int(Int) - case double(Double) + case bool(Bool) case null init(from decoder: Decoder) throws { @@ -99,9 +179,11 @@ enum DataValue: Codable { } else if let intValue = try? container.decode(Int.self) { self = .int(intValue) } else if let doubleValue = try? container.decode(Double.self) { - self = .double(doubleValue) + self = .string(String(doubleValue)) } else if let stringValue = try? container.decode(String.self) { self = .string(stringValue) + } else if let boolValue = try? container.decode(Bool.self) { + self = .bool(boolValue) } else { throw DecodingError.typeMismatch( DataValue.self, @@ -112,3 +194,40 @@ enum DataValue: Codable { } } } + +extension ArrowGold.Column { + + /// Filter for the valid values. + /// - Returns: The test column data with nulls in place of junk values. + func withoutJunkData() -> Self { + guard let validity = self.validity else { + fatalError() + } + let filteredData = data?.enumerated().map { index, value in + validity[index] == 1 ? value : .null + } + return Self( + name: name, + count: count, + validity: validity, + offset: offset, + data: filteredData, + children: children?.map { $0.withoutJunkData() } + ) + } +} + +/// Decode a list of `KeyValue` to a dictionary. +/// - Parameter keyValues: The key values to convert. +/// - Throws: If decoding fails. +/// - Returns: A metadata dictionary. +private func buildDictionary( + from keyValues: inout any UnkeyedDecodingContainer +) throws -> [String: String]? { + var dict: [String: String] = [:] + while !keyValues.isAtEnd { + let pair = try keyValues.decode(KeyValue.self) + dict[pair.key] = pair.value + } + return dict.isEmpty ? nil : dict +} diff --git a/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift new file mode 100644 index 0000000..ed923e5 --- /dev/null +++ b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift @@ -0,0 +1,236 @@ +// Copyright 2025 The Columnar Swift Contributors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +@testable import Arrow + +/// Encode an array to the gold testing JSON format. +/// - Parameters: +/// - array: The array to encode. +/// - field: The field associated with the array. +/// - Throws: An `ArrowError` if encoding fails. +/// - Returns: The column exactly as the test format expects it. +/// Note the junk values present in the test data are not replicated here therefore these need to be +/// removed from test data before comparison happens. +func encodeColumn( + array: AnyArrowArrayProtocol, + field: ArrowField +) throws(ArrowError) -> ArrowGold.Column { + guard let array = array as? (any ArrowArrayProtocol) else { + throw .invalid("Expected ArrowArray, got \(type(of: array))") + } + // Validity is always present in the gold files. + let validity: [Int] = (0.. 0 { + switch field.type { + case .list(let listField): + guard let listArray = array as? ListArrayProtocol else { + throw ArrowError.invalid("Expected list array") + } + let childColumn = try encodeColumn( + array: listArray.values, field: listField) + children = [childColumn] + // List arrays point to child arrays therefore have nil data buffers. + data = nil + case .fixedSizeList(let listField, _): + guard let listArray = array as? ListArrayProtocol else { + throw ArrowError.invalid("Expected list array") + } + let childColumn = try encodeColumn( + array: listArray.values, field: listField) + children = [childColumn] + data = nil + case .strct(let arrowFields): + guard let structArray = array as? ArrowStructArray else { + throw ArrowError.invalid("Expected list array") + } + children = [] + for (arrowField, (_, array)) in zip(arrowFields, structArray.fields) { + let childColumn = try encodeColumn( + array: array, field: arrowField) + children?.append(childColumn) + data = nil + } + children = try arrowFields.enumerated().map { + index, arrowField throws(ArrowError) in + try encodeColumn(array: structArray.fields[index].1, field: arrowField) + } + data = nil + case .boolean: + data = try extractBoolData(from: array) + case .int8: + data = try extractIntData(from: array, expectedType: Int8.self) + case .int16: + data = try extractIntData(from: array, expectedType: Int16.self) + case .int32: + data = try extractIntData(from: array, expectedType: Int32.self) + case .int64: + data = try extractIntData(from: array, expectedType: Int64.self) + case .uint8: + data = try extractIntData(from: array, expectedType: UInt8.self) + case .uint16: + data = try extractIntData(from: array, expectedType: UInt16.self) + case .uint32: + data = try extractIntData(from: array, expectedType: UInt32.self) + case .uint64: + data = try extractIntData(from: array, expectedType: UInt64.self) + case .float16: + data = try extractFloatData(from: array, expectedType: Float16.self) + case .float32: + data = try extractFloatData(from: array, expectedType: Float32.self) + case .float64: + data = try extractFloatData(from: array, expectedType: Float64.self) + case .binary: + try extractBinaryData(from: array, into: &data) + case .fixedSizeBinary(_): + try extractBinaryData(from: array, into: &data) + case .utf8: + try extractUtf8Data(from: array, into: &data) + default: + throw .invalid("Encoder did not handle a field type: \(field.type)") + } + } + return .init( + name: field.name, + count: array.length, + validity: validity, + offset: offsets, + data: data, + children: children + ) +} + +func extractIntData( + from array: AnyArrowArrayProtocol, + expectedType: T.Type +) throws(ArrowError) -> [DataValue] { + guard let typedArray = array as? ArrowArrayNumeric else { + throw .invalid("Expected \(T.self) array, got \(type(of: array))") + } + do { + return try (0..( + from array: AnyArrowArrayProtocol, + expectedType: T.Type +) throws(ArrowError) -> [DataValue] { + guard let typedArray = array as? ArrowArrayNumeric else { + throw ArrowError.invalid("Expected \(T.self) array, got \(type(of: array))") + } + let encoder = JSONEncoder() + let decoder = JSONDecoder() + do { + return try (0.. [DataValue] { + guard let typedArray = array as? ArrowArrayBoolean else { + throw .invalid("Expected boolean array, got \(type(of: array))") + } + return (0..( + func testBoolean( + actual: AnyArrowArrayProtocol, + expected: ArrowGold.Column + ) throws { + guard let expectedValidity = expected.validity, + let expectedValues = expected.data + else { + throw ArrowError.invalid("Test column is incomplete.") + } + guard let array = actual as? ArrowArrayBoolean, + array.length == expectedValidity.count + else { + Issue.record("Array type mismatch") + return + } + for (i, isNull) in expectedValidity.enumerated() { + guard case .bool(let expectedValue) = expectedValues[i] else { + throw ArrowError.invalid("Expected boolean value") + } + if isNull == 0 { + #expect(array[i] == nil) + } else { + #expect(array[i] == expectedValue) + } + } + } + + func testFixedWidth( actual: AnyArrowArrayProtocol, expected: ArrowGold.Column, as type: T.Type - ) throws where T: BinaryInteger { + ) throws where T: BinaryInteger & LosslessStringConvertible { guard let expectedValidity = expected.validity, let expectedValues = expected.data else { throw ArrowError.invalid("Test column is incomplete.") } - guard let array = actual as? any ArrowArrayProtocol, array.length == expectedValidity.count else { Issue.record("Array type mismatch") return } - for (i, isNull) in expectedValidity.enumerated() { - guard case .int(let val) = expectedValues[i] else { - throw ArrowError.invalid("Expected integer value") + let expected: T + if case .int(let intVal) = expectedValues[i] { + expected = try T(throwingOnOverflow: intVal) + } else if case .string(let strVal) = expectedValues[i], + let parsed = T(strVal) + { + expected = parsed + } else { + throw ArrowError.invalid("Expected integer value or numeric string") } - let expected = try T(throwingOnOverflow: val) - if isNull == 0 { #expect(array[i] == nil) } else { @@ -153,6 +274,66 @@ struct ArrowTestingIPC { } } + func testFixedWidth( + actual: AnyArrowArrayProtocol, + expected: ArrowGold.Column, + as type: T.Type + ) throws where T: BinaryFloatingPoint & LosslessStringConvertible { + guard let expectedValidity = expected.validity, + let expectedValues = expected.data + else { + throw ArrowError.invalid("Test column is incomplete.") + } + guard let array = actual as? any ArrowArrayProtocol, + array.length == expectedValidity.count + else { + Issue.record("Array type mismatch") + return + } + for (i, isValid) in expectedValidity.enumerated() { + guard case .string(let strVal) = expectedValues[i], + let expected = T(strVal) + else { + throw ArrowError.invalid("Expected float value or numeric string") + } + if isValid == 1 { + #expect(array[i] as? T == expected) + print("comparing \(array[i]) to \(expected)") + } else { + #expect(array[i] == nil) + } + } + } + + func validateFixedWidthListArray( + actual: AnyArrowArrayProtocol, + expected: ArrowGold.Column, + listSize: Int32 + ) throws { + + guard let expectedValidity = expected.validity + else { + throw ArrowError.invalid("Test column is incomplete.") + } + guard let listArray = actual as? ArrowFixedSizeListArray + else { + Issue.record("Unexpected array type: \(type(of: actual))") + return + } + + for (i, isNull) in expectedValidity.enumerated() { + if isNull == 0 { + #expect(listArray[i] == nil) + } else { + guard let actualChildSlice = listArray[i] else { + Issue.record("Expected non-null list at index \(i)") + continue + } + #expect(actualChildSlice.length == listSize) + } + } + } + func validateListArray( actual: AnyArrowArrayProtocol, expected: ArrowGold.Column @@ -168,12 +349,15 @@ struct ArrowTestingIPC { let offsets = ptr.bindMemory(to: Int32.self) #expect(offsets.count == expectedOffsets.count) for (i, expectedOffset) in expectedOffsets.enumerated() { - #expect(offsets[i] == expectedOffset) + let actualOffset = offsets[i] + #expect(actualOffset == expectedOffset) } } - guard let listArray = actual as? AnyArrowListArray else { - Issue.record("Unexpected array type") + // TODO: Need a simpler type signature at call site. + guard let listArray = actual as? ArrowListArray> + else { + Issue.record("Unexpected array type: \(type(of: actual))") return } @@ -248,7 +432,7 @@ struct ArrowTestingIPC { } switch type { case .binary: - guard let binaryArray = actual as? ArrowArrayOfData else { + guard let binaryArray = actual as? any BinaryArrayProtocol else { Issue.record("Binary array expected.") return } @@ -267,7 +451,7 @@ struct ArrowTestingIPC { } } case .utf8: - guard let binaryArray = actual as? ArrowArrayOfString else { + guard let binaryArray = actual as? StringArrayProtocol else { Issue.record("Binary array expected.") return } diff --git a/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift b/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift new file mode 100644 index 0000000..ac22c98 --- /dev/null +++ b/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift @@ -0,0 +1,128 @@ +// Copyright 2025 The Columnar Swift Contributors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation +import Testing + +@testable import Arrow +@testable import ArrowIPC + +/// Tests round trip from JSON -> Array -> JSON. +/// +/// See https://arrow.apache.org/docs/format/Integration.html#strategy +/// +/// The producer typically reads a JSON file, converts it to in-memory Arrow data, and exposes this data +/// using the format under test. The consumer reads the data in the said format and converts it back to +/// Arrow in-memory data; it also reads the same JSON file as the producer, and validates that both +/// datasets are identical. +/// +struct ArrowTestingJSON { + + static let testCases: [String] = [ + "generated_primitive", + "generated_primitive_no_batches", + "generated_primitive_zerolength", + "generated_binary", + "generated_binary_zerolength", + "generated_custom_metadata", + "generated_nested", + "generated_recursive_nested", + ] + + @Test(arguments: testCases) + func json(name: String) throws { + let resourceURL = try loadTestResource( + name: name, + withExtension: "json.lz4", + subdirectory: "integration/cpp-21.0.0" + ) + let lz4Data = try Data(contentsOf: resourceURL) + let lz4 = try LZ4(parsing: lz4Data) + let testCase = try JSONDecoder().decode(ArrowGold.self, from: lz4.data) + let testFile = try loadTestResource( + name: name, + withExtension: "arrow_file", + subdirectory: "integration/cpp-21.0.0" + ) + let arrowReader = try ArrowReader(url: testFile) + let (arrowSchema, recordBatches) = try arrowReader.read() + + #expect(testCase.batches.count == recordBatches.count) + + // Strip placeholder values. + let expectedBatches = testCase.batches.map { batch in + ArrowGold.Batch( + count: batch.count, + columns: batch.columns.map { $0.withoutJunkData() } + ) + } + let expectedSchema = testCase.schema + let expectedDictionaries = testCase.dictionaries + let _ = ArrowGold( + schema: expectedSchema, + batches: expectedBatches, + dictionaries: expectedDictionaries + ) + let actualSchema = encodeSchema(schema: arrowSchema) + #expect(actualSchema == expectedSchema) + for (testBatch, recordBatch) in zip(expectedBatches, recordBatches) { + for ( + (arrowField, arrowArray), + (_, expected) + ) in zip( + zip(arrowSchema.fields, recordBatch.arrays), + zip(testCase.schema.fields, testBatch.columns) + ) { + let actual = try encodeColumn(array: arrowArray, field: arrowField) + + #expect(actual == expected) + + // This is just useful for pin-pointing differences. + if actual != expected { + print("==== \(expected.name) ====") + #expect(actual.validity == expected.validity) + #expect(actual.offset == expected.offset) + if actual.data != expected.data { + guard let actualData = actual.data, + let expectedData = expected.data, let validity = actual.validity + else { + throw ArrowError.invalid("Expected and actual data both nil") + } + for (i, isValid) in validity.enumerated() { + if isValid == 1 { + let aV = actualData[i] + let eV = expectedData[i] + #expect(aV == eV) + } + } + } + } + } + } + } + +} + +private func encodeSchema(schema: ArrowSchema) -> ArrowGold.Schema { + let fields = schema.fields.map { arrowField in + arrowField.toGoldField() + } + let encodedMetadata: [String: String]? = + switch schema.metadata { + case .none: nil + case .some(let metadata): metadata.isEmpty ? nil : metadata + } + + return .init(fields: fields, metadata: encodedMetadata) +} diff --git a/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift b/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift new file mode 100644 index 0000000..c802ee5 --- /dev/null +++ b/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift @@ -0,0 +1,306 @@ +// Copyright 2025 The Columnar Swift Contributors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Arrow + +extension ArrowType { + + /// Recursively check this type matches the expected field type.. + /// - Parameter expectedField: The Arrow integration test field. + /// - Returns: True if this type and the field match exactly. + func matches(expectedField: ArrowGold.Field) -> Bool { + let fieldType = expectedField.type + switch self { + case .int8: + return fieldType.name == "int" && fieldType.bitWidth == 8 + && fieldType.isSigned == true + case .int16: + return fieldType.name == "int" && fieldType.bitWidth == 16 + && fieldType.isSigned == true + case .int32: + return fieldType.name == "int" && fieldType.bitWidth == 32 + && fieldType.isSigned == true + case .int64: + return fieldType.name == "int" && fieldType.bitWidth == 64 + && fieldType.isSigned == true + case .uint8: + return fieldType.name == "int" && fieldType.bitWidth == 8 + && fieldType.isSigned == false + case .uint16: + return fieldType.name == "int" && fieldType.bitWidth == 16 + && fieldType.isSigned == false + case .uint32: + return fieldType.name == "int" && fieldType.bitWidth == 32 + && fieldType.isSigned == false + case .uint64: + return fieldType.name == "int" && fieldType.bitWidth == 64 + && fieldType.isSigned == false + case .float16: + return fieldType.name == "floatingpoint" && fieldType.precision == "HALF" + case .float32: + return fieldType.name == "floatingpoint" + && fieldType.precision == "SINGLE" + case .float64: + return fieldType.name == "floatingpoint" + && fieldType.precision == "DOUBLE" + case .boolean: + return fieldType.name == "bool" + case .utf8: + return fieldType.name == "utf8" + case .binary: + return fieldType.name == "binary" + case .fixedSizeBinary(let byteWidth): + guard let expectedByteWidth = fieldType.byteWidth else { + fatalError("FieldType does not contain byteWidth.") + } + return fieldType.name == "fixedsizebinary" + && expectedByteWidth == byteWidth + case .date32: + return fieldType.name == "date" && fieldType.unit == "DAY" + case .date64: + return fieldType.name == "date" && fieldType.unit == "MILLISECOND" + case .timestamp(let unit, let timezone): + return fieldType.name == "timestamp" && fieldType.unit == unit.jsonName + && fieldType.timezone == timezone + case .time32(let unit): + return fieldType.name == "time" && fieldType.unit == unit.jsonName + && fieldType.bitWidth == 32 + case .time64(let unit): + return fieldType.name == "time" && fieldType.unit == unit.jsonName + && fieldType.bitWidth == 64 + case .duration(let unit): + return fieldType.name == "duration" && fieldType.unit == unit.jsonName + case .decimal128(let precision, let scale): + guard let expectedScale = fieldType.scale else { + fatalError("FieldType does not contain scale.") + } + return fieldType.name == "decimal" && fieldType.bitWidth == 128 + && fieldType.precision == String(precision) && expectedScale == scale + case .decimal256(let precision, let scale): + guard let expectedScale = fieldType.scale else { + fatalError("FieldType does not contain scale.") + } + return fieldType.name == "decimal" && fieldType.bitWidth == 256 + && fieldType.precision == String(precision) && expectedScale == scale + case .list(let arrowField), .largeList(let arrowField): + + guard fieldType.name == "list" || fieldType.name == "largelist", + let children = expectedField.children, + children.count == 1 + else { + return false + } + return arrowField.type.matches(expectedField: children[0]) + case .fixedSizeList(let arrowField, let listSize): + guard fieldType.name == "fixedsizelist", + let children = expectedField.children, + children.count == 1, + let expectedListSize = fieldType.listSize, + expectedListSize == listSize + else { + return false + } + return arrowField.type.matches(expectedField: children[0]) + case .strct(let arrowFields): + guard fieldType.name == "struct", let children = expectedField.children + else { + return false + } + for (arrowField, child) in zip(arrowFields, children) { + let matches = arrowField.type.matches(expectedField: child) + if !matches { + return false + } + } + return true + case .map: + // return fieldType.name == self.jsonTypeName + fatalError("Not implemented.") + + default: + fatalError("Not implemented.") + } + } + + var jsonTypeName: String { + switch self { + case .list: return "list" + case .largeList: return "largelist" + case .fixedSizeList: return "fixedsizelist" + case .strct: return "struct" + case .map: return "map" + default: fatalError("Not a container type") + } + } +} + +extension TimeUnit { + var jsonName: String { + switch self { + case .second: return "SECOND" + case .millisecond: return "MILLISECOND" + case .microsecond: return "MICROSECOND" + case .nanosecond: return "NANOSECOND" + } + } +} + +extension ArrowField { + func toGoldField() -> ArrowGold.Field { + ArrowGold.Field( + name: name, + type: type.toGoldFieldType(), + nullable: isNullable, + children: type.goldChildren(), + dictionary: nil, // TODO: handle dictionary encoding if needed + metadata: self.metadata.isEmpty ? nil : self.metadata + ) + } +} + +extension ArrowType { + func toGoldFieldType() -> ArrowGold.FieldType { + let name: String + var byteWidth: Int? + var bitWidth: Int? + var isSigned: Bool? = nil + var precision: String? = nil + var scale: Int? = nil + var unit: String? = nil + var timezone: String? = nil + var listSize: Int? = nil + + switch self { + case .int8: + name = "int" + bitWidth = 8 + isSigned = true + case .int16: + name = "int" + bitWidth = 16 + isSigned = true + case .int32: + name = "int" + bitWidth = 32 + isSigned = true + case .int64: + name = "int" + bitWidth = 64 + isSigned = true + case .uint8: + name = "int" + bitWidth = 8 + isSigned = false + case .uint16: + name = "int" + bitWidth = 16 + isSigned = false + case .uint32: + name = "int" + bitWidth = 32 + isSigned = false + case .uint64: + name = "int" + bitWidth = 64 + isSigned = false + case .float16: + name = "floatingpoint" + precision = "HALF" + case .float32: + name = "floatingpoint" + precision = "SINGLE" + case .float64: + name = "floatingpoint" + precision = "DOUBLE" + case .boolean: + name = "bool" + case .utf8: + name = "utf8" + case .binary: + name = "binary" + case .fixedSizeBinary(let byteWidth_): + byteWidth = Int(byteWidth_) + name = "fixedsizebinary" + case .date32: + name = "date" + unit = "DAY" + case .date64: + name = "date" + unit = "MILLISECOND" + case .timestamp(let unit_, let timezone_): + name = "timestamp" + unit = unit_.jsonName + timezone = timezone_ + case .time32(let unit_): + name = "time" + bitWidth = 32 + unit = unit_.jsonName + case .time64(let unit_): + name = "time" + bitWidth = 64 + unit = unit_.jsonName + case .duration(let unit_): + name = "duration" + bitWidth = nil + unit = unit_.jsonName + case .decimal128(let precision_, let scale_): + name = "decimal" + bitWidth = 128 + precision = String(precision_) + scale = Int(scale_) + case .decimal256(let precision_, let scale_): + name = "decimal" + bitWidth = 256 + precision = String(precision_) + scale = Int(scale_) + case .list(let field): + name = "list" + case .largeList(let field): + name = "largelist" + case .fixedSizeList(let field, let listSize_): + name = "fixedsizelist" + listSize = Int(listSize_) + case .strct(let fields): + name = "struct" + case .map: + name = "struct" + default: + fatalError("Unhandled type: \(self)") + } + return ArrowGold.FieldType( + name: name, + byteWidth: byteWidth, + bitWidth: bitWidth, + isSigned: isSigned, + precision: precision, + scale: scale, + unit: unit, + timezone: timezone, + listSize: listSize + ) + } + + func goldChildren() -> [ArrowGold.Field]? { + switch self { + case .list(let field), .largeList(let field), .fixedSizeList(let field, _): + return [field.toGoldField()] + case .strct(let fields): + return fields.map { $0.toGoldField() } + default: + // May need to implement different nested types. + if isNested { fatalError("Not implemented for nested ArrowType") } + return [] + } + } +} diff --git a/Tests/ArrowIPCTests/TestSupport.swift b/Tests/ArrowIPCTests/TestSupport.swift index 0910884..f19c28a 100644 --- a/Tests/ArrowIPCTests/TestSupport.swift +++ b/Tests/ArrowIPCTests/TestSupport.swift @@ -64,7 +64,7 @@ func checkBoolRecordBatch(recordBatch: RecordBatch) { #expect(one[4] == true) guard - let utf8Column = recordBatch.arrays[1] as? ArrowArrayOfString + let utf8Column = recordBatch.arrays[1] as? StringArrayProtocol else { Issue.record("Failed to cast column to ArrowUtf8Array") return diff --git a/Tests/ArrowTests/Array/BasicArrayTests.swift b/Tests/ArrowTests/Array/BasicArrayTests.swift index 7fe77dd..76b9c81 100644 --- a/Tests/ArrowTests/Array/BasicArrayTests.swift +++ b/Tests/ArrowTests/Array/BasicArrayTests.swift @@ -76,7 +76,7 @@ struct BasicArrayTests { } @Test func stringArray() throws { - let builder: ArrayBuilderVariableLength = .init() + let builder: ArrayBuilderVariableLength = .init() builder.appendNull() builder.append("abc") builder.append("def") diff --git a/Tests/ArrowTests/Array/FuzzedArrayTests.swift b/Tests/ArrowTests/Array/FuzzedArrayTests.swift index e7fb937..ebada6b 100644 --- a/Tests/ArrowTests/Array/FuzzedArrayTests.swift +++ b/Tests/ArrowTests/Array/FuzzedArrayTests.swift @@ -56,7 +56,7 @@ struct FuzzedArrayTests { testArray[i] = nil } } - let builder: ArrayBuilderVariableLength = .init() + let builder: ArrayBuilderVariableLength = .init() for value in testArray { if let value { builder.append(value) @@ -84,7 +84,7 @@ struct FuzzedArrayTests { } @Test func binaryStringArray() throws { - let builder: ArrayBuilderVariableLength = .init() + let builder: ArrayBuilderVariableLength = .init() var byteCount: Int = 0 let count: Int = 100 var nullCount: Int = 0 @@ -143,7 +143,7 @@ struct FuzzedArrayTests { nullCount += 1 } } - let builder: ArrayBuilderVariableLength = .init() + let builder: ArrayBuilderVariableLength = .init() for value in expected { if let value { builder.append(value) @@ -223,7 +223,7 @@ struct FuzzedArrayTests { expected[i] = randomString(length: length, using: &rng) } } - let arrayBuilder: ArrayBuilderVariableLength = .init() + let arrayBuilder: ArrayBuilderVariableLength = .init() for value in expected { if let value { arrayBuilder.append(value) @@ -281,7 +281,7 @@ struct FuzzedArrayTests { byteCount += value.utf8.count } } - let builder: ArrayBuilderVariableLength = .init() + let builder: ArrayBuilderVariableLength = .init() for value in expected { if let value { builder.append(value) diff --git a/Tests/ArrowTests/Array/ListArrayTests.swift b/Tests/ArrowTests/Array/ListArrayTests.swift index eae945b..a79e3ac 100644 --- a/Tests/ArrowTests/Array/ListArrayTests.swift +++ b/Tests/ArrowTests/Array/ListArrayTests.swift @@ -38,8 +38,8 @@ struct ListArrayTests { let listArray = builder.finish() - let list0 = listArray[0] - let list1 = listArray[2] + let list0 = listArray[0] as? ArrowArrayNumeric + let list1 = listArray[2] as? ArrowArrayNumeric #expect(list0?.length == 2) #expect(list0?[0] == 1) #expect(list0?[1] == 2) @@ -71,8 +71,8 @@ struct ListArrayTests { let listArray = builder.finish() - let list0 = listArray[0] - let list1 = listArray[2] + let list0 = listArray[0] as? StringArrayProtocol + let list1 = listArray[2] as? StringArrayProtocol #expect(list0?.length == 2) #expect(list0?[0] == "a") #expect(list0?[1] == "b") diff --git a/Tests/ArrowTests/Array/ReadmeExamples.swift b/Tests/ArrowTests/Array/ReadmeExamples.swift index a6d1515..41ed43e 100644 --- a/Tests/ArrowTests/Array/ReadmeExamples.swift +++ b/Tests/ArrowTests/Array/ReadmeExamples.swift @@ -43,7 +43,7 @@ struct ReadmeExamples { @Test func stringArray() throws { let swiftArray: [String?] = ["ab", nil, "c", "", "."] - let arrayBuilder: ArrayBuilderVariableLength = .init() + let arrayBuilder: ArrayBuilderVariableLength = .init() for value in swiftArray { if let value { arrayBuilder.append(value) diff --git a/Tests/ArrowTests/Array/StructArrayTests.swift b/Tests/ArrowTests/Array/StructArrayTests.swift index eefbcbf..97c4be3 100644 --- a/Tests/ArrowTests/Array/StructArrayTests.swift +++ b/Tests/ArrowTests/Array/StructArrayTests.swift @@ -22,7 +22,7 @@ struct StructArrayTests { @Test func testStructArray() { // Create builders for struct fields let idBuilder = ArrayBuilderFixedWidth() - let nameBuilder = ArrayBuilderVariableLength() + let nameBuilder = ArrayBuilderVariableLength() // Create struct builder let structBuilder = ArrayBuilderStruct(fields: [