From 345bfe2daf0a6515b8161e615e91b4d7f2795305 Mon Sep 17 00:00:00 2001 From: Will Temperley Date: Mon, 24 Nov 2025 17:16:31 +0800 Subject: [PATCH 01/10] Test schema metadata. Add type checks for fields in gold tests. --- Sources/Arrow/ArrowField.swift | 11 +- Sources/Arrow/ArrowSchema.swift | 5 +- Sources/ArrowIPC/ArrowReader.swift | 27 +++- Tests/ArrowIPCTests/Gold/ArrowGold.swift | 14 ++ ...estingIPC.swift => ArrowTestingGold.swift} | 19 ++- .../Gold/ArrowType+validation.swift | 135 ++++++++++++++++++ 6 files changed, 198 insertions(+), 13 deletions(-) rename Tests/ArrowIPCTests/Gold/{ArrowTestingIPC.swift => ArrowTestingGold.swift} (93%) create mode 100644 Tests/ArrowIPCTests/Gold/ArrowType+validation.swift diff --git a/Sources/Arrow/ArrowField.swift b/Sources/Arrow/ArrowField.swift index 57ab839..600dea2 100644 --- a/Sources/Arrow/ArrowField.swift +++ b/Sources/Arrow/ArrowField.swift @@ -50,13 +50,18 @@ extension ArrowField { /// Default list member field name. public static let listFieldDefaultName = "item" - /// Creates a new field with the given name, data type, and nullability. - public init(name: String, dataType: ArrowType, isNullable: Bool) { + /// Creates a new field with the given name, data type, nullability and metadata. + public init( + name: String, + dataType: ArrowType, + isNullable: Bool, + metadata: [String: String] = [:] + ) { self.name = name self.type = dataType self.isNullable = isNullable self.orderedDict = false - self.metadata = .init() + self.metadata = metadata } /// Creates a new `ArrowField` suitable for `ArrowType::List`. diff --git a/Sources/Arrow/ArrowSchema.swift b/Sources/Arrow/ArrowSchema.swift index 5549f37..af93450 100644 --- a/Sources/Arrow/ArrowSchema.swift +++ b/Sources/Arrow/ArrowSchema.swift @@ -18,7 +18,9 @@ import Foundation public final class ArrowSchema: Sendable { public let fields: [ArrowField] public let fieldLookup: [String: Int] - init(_ fields: [ArrowField]) { + let metadata: [String: String]? + + public init(_ fields: [ArrowField], metadata: [String: String]? = nil) { var fieldLookup: [String: Int] = [:] for (index, field) in fields.enumerated() { fieldLookup[field.name] = index @@ -26,6 +28,7 @@ public final class ArrowSchema: Sendable { self.fields = fields self.fieldLookup = fieldLookup + self.metadata = metadata } public func field(_ index: Int) -> ArrowField { diff --git a/Sources/ArrowIPC/ArrowReader.swift b/Sources/ArrowIPC/ArrowReader.swift index 1b6f643..13d19a4 100644 --- a/Sources/ArrowIPC/ArrowReader.swift +++ b/Sources/ArrowIPC/ArrowReader.swift @@ -152,7 +152,7 @@ public struct ArrowReader { guard let schema = footer.schema else { throw ArrowError.invalid("Missing schema in footer") } - let arrowSchema = try loadSchema(schema) + let arrowSchema = try loadSchema(schema: schema) var recordBatches: [RecordBatch] = [] // MARK: Record batch parsing @@ -432,8 +432,15 @@ public struct ArrowReader { return AnyArrowListArray(list) } - private func loadSchema(_ schema: FSchema) throws(ArrowError) -> ArrowSchema { - let builder = ArrowSchema.Builder() + private func loadSchema(schema: FSchema) throws(ArrowError) -> ArrowSchema { + let metadata = (0.. 0 { + print(expectedMetadata) + } + for (testBatch, recordBatch) in zip(testCase.batches, recordBatches) { - for ((expectedField, expectedColumn), (arrowField, arrowArray)) in zip( - zip(testCase.schema.fields, testBatch.columns), - zip(arrowSchema.fields, recordBatch.arrays) + for ( + (arrowField, arrowArray), + (expectedField, expectedColumn) + ) in zip( + zip(arrowSchema.fields, recordBatch.arrays), + zip(testCase.schema.fields, testBatch.columns) ) { + + #expect(arrowField.name == expectedField.name) + #expect(arrowField.isNullable == expectedField.nullable) + #expect(arrowField.type.matches(expectedField: expectedField)) #expect(arrowArray.length == expectedColumn.count) #expect(arrowField.name == expectedColumn.name) diff --git a/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift b/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift new file mode 100644 index 0000000..106cb63 --- /dev/null +++ b/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift @@ -0,0 +1,135 @@ +// Copyright 2025 The Columnar Swift Contributors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Arrow + +extension ArrowType { + + /// Recursively check this type matches the expected field type.. + /// - Parameter expectedField: The Arrow integration test field. + /// - Returns: True if this type and the field match exactly. + func matches(expectedField: ArrowGold.Field) -> Bool { + let fieldType = expectedField.type + switch self { + case .int8: + return fieldType.name == "int" && fieldType.bitWidth == 8 + && fieldType.isSigned == true + case .int16: + return fieldType.name == "int" && fieldType.bitWidth == 16 + && fieldType.isSigned == true + case .int32: + return fieldType.name == "int" && fieldType.bitWidth == 32 + && fieldType.isSigned == true + case .int64: + return fieldType.name == "int" && fieldType.bitWidth == 64 + && fieldType.isSigned == true + case .uint8: + return fieldType.name == "int" && fieldType.bitWidth == 8 + && fieldType.isSigned == false + case .uint16: + return fieldType.name == "int" && fieldType.bitWidth == 16 + && fieldType.isSigned == false + case .uint32: + return fieldType.name == "int" && fieldType.bitWidth == 32 + && fieldType.isSigned == false + case .uint64: + return fieldType.name == "int" && fieldType.bitWidth == 64 + && fieldType.isSigned == false + case .float16: + return fieldType.name == "floatingpoint" && fieldType.precision == "HALF" + case .float32: + return fieldType.name == "floatingpoint" + && fieldType.precision == "SINGLE" + case .float64: + return fieldType.name == "floatingpoint" + && fieldType.precision == "DOUBLE" + case .boolean: + return fieldType.name == "bool" + case .utf8: + return fieldType.name == "utf8" + case .binary: + return fieldType.name == "binary" + case .fixedSizeBinary(let byteWidth): + guard let expectedByteWidth = fieldType.byteWidth else { + fatalError("FieldType does not contain byteWidth.") + } + return fieldType.name == "fixedsizebinary" + && expectedByteWidth == byteWidth + case .date32: + return fieldType.name == "date" && fieldType.unit == "DAY" + case .date64: + return fieldType.name == "date" && fieldType.unit == "MILLISECOND" + case .timestamp(let unit, let timezone): + return fieldType.name == "timestamp" && fieldType.unit == unit.jsonName + && fieldType.timezone == timezone + case .time32(let unit): + return fieldType.name == "time" && fieldType.unit == unit.jsonName + && fieldType.bitWidth == 32 + case .time64(let unit): + return fieldType.name == "time" && fieldType.unit == unit.jsonName + && fieldType.bitWidth == 64 + case .duration(let unit): + return fieldType.name == "duration" && fieldType.unit == unit.jsonName + case .decimal128(let precision, let scale): + guard let expectedScale = fieldType.scale else { + fatalError("FieldType does not contain scale.") + } + return fieldType.name == "decimal" && fieldType.bitWidth == 128 + && fieldType.precision == String(precision) && expectedScale == scale + case .decimal256(let precision, let scale): + guard let expectedScale = fieldType.scale else { + fatalError("FieldType does not contain scale.") + } + return fieldType.name == "decimal" && fieldType.bitWidth == 256 + && fieldType.precision == String(precision) && expectedScale == scale + case .list(let arrowField), .largeList(let arrowField): + + guard fieldType.name == "list" || fieldType.name == "largelist", + let children = expectedField.children, + children.count == 1 + else { + return false + } + return arrowField.type.matches(expectedField: children[0]) + case .fixedSizeList, .strct, .map: + // return fieldType.name == self.jsonTypeName + fatalError("Not implemented.") + + default: + fatalError("Not implemented.") + } + } + + var jsonTypeName: String { + switch self { + case .list: return "list" + case .largeList: return "largelist" + case .fixedSizeList: return "fixedsizelist" + case .strct: return "struct" + case .map: return "map" + default: fatalError("Not a container type") + } + } +} + +extension TimeUnit { + var jsonName: String { + switch self { + case .second: return "SECOND" + case .millisecond: return "MILLISECOND" + case .microsecond: return "MICROSECOND" + case .nanosecond: return "NANOSECOND" + } + } +} From 8c3220cc72c3af7c6eb85a9ac242f89446c5c242 Mon Sep 17 00:00:00 2001 From: Will Temperley Date: Mon, 24 Nov 2025 18:29:11 +0800 Subject: [PATCH 02/10] Primitive gold tests passing. --- Sources/ArrowIPC/ArrowReader.swift | 18 +- Tests/ArrowIPCTests/Gold/ArrowGold.swift | 3 + .../ArrowIPCTests/Gold/ArrowTestingGold.swift | 157 ++++++++++++++++-- 3 files changed, 164 insertions(+), 14 deletions(-) diff --git a/Sources/ArrowIPC/ArrowReader.swift b/Sources/ArrowIPC/ArrowReader.swift index 13d19a4..3f5c61e 100644 --- a/Sources/ArrowIPC/ArrowReader.swift +++ b/Sources/ArrowIPC/ArrowReader.swift @@ -281,20 +281,36 @@ public struct ArrowReader { return makeFixedArray( length: length, elementType: Int8.self, nullBuffer: nullBuffer, buffer: buffer1) + case .uint8: + return makeFixedArray( + length: length, elementType: UInt8.self, + nullBuffer: nullBuffer, buffer: buffer1) case .int16: return makeFixedArray( length: length, elementType: Int16.self, nullBuffer: nullBuffer, buffer: buffer1) + case .uint16: + return makeFixedArray( + length: length, elementType: UInt16.self, + nullBuffer: nullBuffer, buffer: buffer1) case .int32: return makeFixedArray( length: length, elementType: Int32.self, nullBuffer: nullBuffer, buffer: buffer1) + case .uint32: + return makeFixedArray( + length: length, elementType: UInt32.self, + nullBuffer: nullBuffer, buffer: buffer1) case .int64: return makeFixedArray( length: length, elementType: Int64.self, nullBuffer: nullBuffer, buffer: buffer1) + case .uint64: + return makeFixedArray( + length: length, elementType: UInt64.self, + nullBuffer: nullBuffer, buffer: buffer1) default: - throw ArrowError.notImplemented + throw ArrowError.invalid("TODO: Unimplemented arrow type: \(arrowType)") } } else if arrowType.isVariable { let buffer1 = try nextBuffer( diff --git a/Tests/ArrowIPCTests/Gold/ArrowGold.swift b/Tests/ArrowIPCTests/Gold/ArrowGold.swift index 3800954..aed2806 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowGold.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowGold.swift @@ -103,6 +103,7 @@ enum DataValue: Codable { case string(String) case int(Int) case double(Double) + case bool(Bool) case null init(from decoder: Decoder) throws { @@ -116,6 +117,8 @@ enum DataValue: Codable { self = .double(doubleValue) } else if let stringValue = try? container.decode(String.self) { self = .string(stringValue) + } else if let boolValue = try? container.decode(Bool.self) { + self = .bool(boolValue) } else { throw DecodingError.typeMismatch( DataValue.self, diff --git a/Tests/ArrowIPCTests/Gold/ArrowTestingGold.swift b/Tests/ArrowIPCTests/Gold/ArrowTestingGold.swift index c048f05..76af002 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowTestingGold.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowTestingGold.swift @@ -20,9 +20,46 @@ import Testing struct ArrowTestingIPC { + static let allTests = [ + "generated_binary", + "generated_binary_no_batches", + "generated_binary_view", + "generated_binary_zerolength", + "generated_custom_metadata", + "generated_datetime", + "generated_decimal", + "generated_decimal256", + "generated_decimal32", + "generated_decimal64", + "generated_dictionary", + "generated_dictionary_unsigned", + "generated_duplicate_fieldnames", + "generated_duration", + "generated_extension", + "generated_interval", + "generated_interval_mdn", + "generated_large_binary", + "generated_list_view", + "generated_map", + "generated_map_non_canonical", + "generated_nested", + "generated_nested_dictionary", + "generated_nested_large_offsets", + "generated_null", + "generated_null_trivial", + "generated_primitive", + "generated_primitive_no_batches", + "generated_primitive_zerolength", + "generated_recursive_nested", + "generated_run_end_encoded", + "generated_union", + ] + static let testCases: [String] = [ + "generated_primitive", + "generated_primitive_no_batches", + "generated_primitive_zerolength", "generated_binary", - // "generated_binary_view", "generated_binary_zerolength", "generated_binary_no_batches", "generated_custom_metadata", @@ -31,6 +68,11 @@ struct ArrowTestingIPC { @Test(arguments: testCases) func gold(name: String) throws { + let todos = Set(Self.allTests).subtracting(Set(Self.testCases)) + for todo in todos.sorted() { + print(todo) + } + let resourceURL = try loadTestResource( name: name, withExtension: "json.lz4", @@ -51,9 +93,6 @@ struct ArrowTestingIPC { let expectedMetadata = testCase.schema.metadata?.asDictionary ?? [:] #expect(expectedMetadata == arrowSchema.metadata) - if expectedMetadata.keys.count > 0 { - print(expectedMetadata) - } for (testBatch, recordBatch) in zip(testCase.batches, recordBatches) { for ( @@ -69,6 +108,8 @@ struct ArrowTestingIPC { #expect(arrowField.type.matches(expectedField: expectedField)) #expect(arrowArray.length == expectedColumn.count) #expect(arrowField.name == expectedColumn.name) + let expectedMetadata = expectedField.metadata?.asDictionary ?? [:] + #expect(arrowField.metadata == expectedMetadata) switch arrowField.type { case .fixedSizeBinary(let byteWidth): @@ -94,15 +135,41 @@ struct ArrowTestingIPC { case .int8: try testFixedWidth( actual: arrowArray, expected: expectedColumn, as: Int8.self) + case .uint8: + try testFixedWidth( + actual: arrowArray, expected: expectedColumn, as: UInt8.self) + case .int16: + try testFixedWidth( + actual: arrowArray, expected: expectedColumn, as: Int16.self) + case .uint16: + try testFixedWidth( + actual: arrowArray, expected: expectedColumn, as: UInt16.self) case .int32: try testFixedWidth( actual: arrowArray, expected: expectedColumn, as: Int32.self) + case .uint32: + try testFixedWidth( + actual: arrowArray, expected: expectedColumn, as: UInt32.self) + case .int64: + try testFixedWidth( + actual: arrowArray, expected: expectedColumn, as: Int64.self) + case .uint64: + try testFixedWidth( + actual: arrowArray, expected: expectedColumn, as: UInt64.self) + case .float32: + try testFixedWidth( + actual: arrowArray, expected: expectedColumn, as: Float.self) + case .float64: + try testFixedWidth( + actual: arrowArray, expected: expectedColumn, as: Double.self) case .list(_): try validateListArray(actual: arrowArray, expected: expectedColumn) break + case .boolean: + try testBoolean(actual: arrowArray, expected: expectedColumn) default: throw ArrowError.invalid( - "Unsupported arrow field type: \(arrowField.type)") + "TODO: Implement test for arrow field type: \(arrowField.type)") } } } @@ -116,7 +183,6 @@ struct ArrowTestingIPC { else { throw ArrowError.invalid("Test column is incomplete.") } - for (i, isNull) in validity.enumerated() { guard case .string(let hex) = dataValues[i] else { throw ArrowError.invalid("Data values are not all strings.") @@ -133,31 +199,96 @@ struct ArrowTestingIPC { } } - func testFixedWidth( + func testBoolean( + actual: AnyArrowArrayProtocol, + expected: ArrowGold.Column + ) throws { + guard let expectedValidity = expected.validity, + let expectedValues = expected.data + else { + throw ArrowError.invalid("Test column is incomplete.") + } + guard let array = actual as? ArrowArrayBoolean, + array.length == expectedValidity.count + else { + Issue.record("Array type mismatch") + return + } + for (i, isNull) in expectedValidity.enumerated() { + guard case .bool(let expectedValue) = expectedValues[i] else { + throw ArrowError.invalid("Expected boolean value") + } + if isNull == 0 { + #expect(array[i] == nil) + } else { + #expect(array[i] == expectedValue) + } + } + } + + func testFixedWidth( actual: AnyArrowArrayProtocol, expected: ArrowGold.Column, as type: T.Type - ) throws where T: BinaryInteger { + ) throws where T: BinaryInteger & LosslessStringConvertible { guard let expectedValidity = expected.validity, let expectedValues = expected.data else { throw ArrowError.invalid("Test column is incomplete.") } - guard let array = actual as? any ArrowArrayProtocol, array.length == expectedValidity.count else { Issue.record("Array type mismatch") return } - for (i, isNull) in expectedValidity.enumerated() { - guard case .int(let val) = expectedValues[i] else { - throw ArrowError.invalid("Expected integer value") + let expected: T + if case .int(let intVal) = expectedValues[i] { + expected = try T(throwingOnOverflow: intVal) + } else if case .string(let strVal) = expectedValues[i], + let parsed = T(strVal) + { + expected = parsed + } else { + throw ArrowError.invalid("Expected integer value or numeric string") } - let expected = try T(throwingOnOverflow: val) + if isNull == 0 { + #expect(array[i] == nil) + } else { + #expect(array[i] as? T == expected) + } + } + } + func testFixedWidth( + actual: AnyArrowArrayProtocol, + expected: ArrowGold.Column, + as type: T.Type + ) throws where T: BinaryFloatingPoint & LosslessStringConvertible { + guard let expectedValidity = expected.validity, + let expectedValues = expected.data + else { + throw ArrowError.invalid("Test column is incomplete.") + } + guard let array = actual as? any ArrowArrayProtocol, + array.length == expectedValidity.count + else { + Issue.record("Array type mismatch") + return + } + for (i, isNull) in expectedValidity.enumerated() { + let expected: T + if case .double(let doubleVal) = expectedValues[i] { + expected = T(doubleVal) + } else if case .string(let strVal) = expectedValues[i], + let parsed = T(strVal) + { + expected = parsed + } else { + throw ArrowError.invalid("Expected float value or numeric string") + } if isNull == 0 { #expect(array[i] == nil) } else { From 729de6b7bd39d5935c58ad987932d66bd6fda1f3 Mon Sep 17 00:00:00 2001 From: Will Temperley Date: Mon, 24 Nov 2025 19:53:13 +0800 Subject: [PATCH 03/10] Redesign list array. --- Sources/Arrow/Array/Array.swift | 62 ++++--------------- Sources/Arrow/Array/Builder.swift | 9 +-- .../Arrow/Buffer/ArrowBufferProtocol.swift | 2 +- Sources/ArrowIPC/ArrowReader.swift | 15 +++-- Sources/ArrowIPC/ArrowType+IPC.swift | 7 +++ .../ArrowIPC/Generated/FlatBuffersTypes.swift | 1 + Tests/ArrowIPCTests/Gold/ArrowGold.swift | 1 + .../ArrowIPCTests/Gold/ArrowTestingGold.swift | 5 +- Tests/ArrowTests/Array/ListArrayTests.swift | 8 +-- 9 files changed, 42 insertions(+), 68 deletions(-) diff --git a/Sources/Arrow/Array/Array.swift b/Sources/Arrow/Array/Array.swift index 12026ea..3bacb62 100644 --- a/Sources/Arrow/Array/Array.swift +++ b/Sources/Arrow/Array/Array.swift @@ -39,22 +39,26 @@ extension ArrowArrayProtocol { // MARK: Capability protocols. public protocol ArrowArrayOfString { + var length: Int { get } subscript(index: Int) -> String? { get } } extension ArrowArrayVariable: ArrowArrayOfString where ItemType == String {} public protocol ArrowArrayOfData { + var length: Int { get } subscript(index: Int) -> Data? { get } } extension ArrowArrayFixedSizeBinary: ArrowArrayOfData where ItemType == Data {} extension ArrowArrayVariable: ArrowArrayOfData where ItemType == Data {} public protocol ArrowArrayOfInt8 { + var length: Int { get } subscript(index: Int) -> Int8? { get } } extension ArrowArrayFixed: ArrowArrayOfInt8 where ItemType == Int8 {} public protocol ArrowArrayOfInt32 { + var length: Int { get } subscript(index: Int) -> Int32? { get } } extension ArrowArrayFixed: ArrowArrayOfInt32 where ItemType == Int32 {} @@ -316,13 +320,12 @@ where } } -/// A strongly-typed Arrow list array which may be nested arbitrarily. -public struct ArrowListArray: ArrowArrayProtocol +///// An Arrow list array which may be nested arbitrarily. +public struct ArrowListArray: ArrowArrayProtocol where - OffsetsBuffer: FixedWidthBufferProtocol, - Element: AnyArrowArrayProtocol + OffsetsBuffer: FixedWidthBufferProtocol, + OffsetsBuffer.ElementType: FixedWidthInteger & SignedInteger { - public typealias ItemType = Element public let offset: Int public let length: Int public var bufferSizes: [Int] { @@ -332,16 +335,17 @@ where [nullBuffer, offsetsBuffer] } public var nullCount: Int { nullBuffer.nullCount } + let nullBuffer: NullBuffer let offsetsBuffer: OffsetsBuffer - let values: Element + public let values: AnyArrowArrayProtocol public init( offset: Int = 0, length: Int, nullBuffer: NullBuffer, offsetsBuffer: OffsetsBuffer, - values: Element + values: AnyArrowArrayProtocol ) { self.offset = offset self.length = length @@ -350,7 +354,7 @@ where self.values = values } - public subscript(index: Int) -> Element? { + public subscript(index: Int) -> AnyArrowArrayProtocol? { precondition(index >= 0 && index < length, "Invalid index.") let offsetIndex = self.offset + index if !self.nullBuffer.isSet(offsetIndex) { @@ -373,48 +377,6 @@ where } } -/// A type-erased wrapper for an Arrow list array. -public struct AnyArrowListArray: ArrowArrayProtocol { - - public typealias ItemType = AnyArrowArrayProtocol - public var bufferSizes: [Int] { - _base.bufferSizes - } - public var buffers: [ArrowBufferProtocol] { - _base.buffers - } - - private let _base: any ArrowArrayProtocol - private let _subscriptImpl: (Int) -> AnyArrowArrayProtocol? - private let _sliceImpl: (Int, Int) -> AnyArrowListArray - - public let offset: Int - public let length: Int - public var nullCount: Int { _base.nullCount } - - public init( - _ list: ArrowListArray - ) - where - OffsetsBuffer: FixedWidthBufferProtocol, - Element: AnyArrowArrayProtocol - { - self._base = list - self.offset = list.offset - self.length = list.length - self._subscriptImpl = { list[$0] } - self._sliceImpl = { AnyArrowListArray(list.slice(offset: $0, length: $1)) } - } - - public subscript(index: Int) -> AnyArrowArrayProtocol? { - _subscriptImpl(index) - } - - public func slice(offset: Int, length: Int) -> AnyArrowListArray { - _sliceImpl(offset, length) - } -} - /// An Arrow struct array. public struct ArrowStructArray: ArrowArrayProtocol { public typealias ItemType = [String: Any] diff --git a/Sources/Arrow/Array/Builder.swift b/Sources/Arrow/Array/Builder.swift index d79db89..e587464 100644 --- a/Sources/Arrow/Array/Builder.swift +++ b/Sources/Arrow/Array/Builder.swift @@ -294,11 +294,9 @@ public typealias ArrayBuilderTime64 = ArrayBuilderFixedWidth /// A builder for Arrow arrays holding Timestamp values. public typealias ArrayBuilderTimestamp = ArrayBuilderFixedWidth -class ArrayBuilderList: AnyArrayBuilder { +class ArrayBuilderList { - func append(_ value: T.ArrayType) {} - - typealias ArrayType = ArrowListArray> + typealias ArrayType = ArrowListArray> var length: Int let nullBuilder: NullBufferBuilder @@ -318,7 +316,6 @@ class ArrayBuilderList: AnyArrayBuilder { length += 1 nullBuilder.appendValid(true) - // let startLength = valueBuilder.length builder(valueBuilder) // User adds items to child builder let endLength = valueBuilder.length @@ -342,7 +339,7 @@ class ArrayBuilderList: AnyArrayBuilder { length: length, nullBuffer: nullBuffer, offsetsBuffer: offsetsBuffer, - values: valuesArray + values: valuesArray // Now accepts AnyArrowArrayProtocol ) } } diff --git a/Sources/Arrow/Buffer/ArrowBufferProtocol.swift b/Sources/Arrow/Buffer/ArrowBufferProtocol.swift index b192c3c..e4d00de 100644 --- a/Sources/Arrow/Buffer/ArrowBufferProtocol.swift +++ b/Sources/Arrow/Buffer/ArrowBufferProtocol.swift @@ -16,13 +16,13 @@ import Foundation /// An Arrow buffer. public protocol ArrowBufferProtocol { + var length: Int { get } func withUnsafeBytes( _ body: (UnsafeRawBufferPointer) throws -> R ) rethrows -> R } internal protocol ArrowBufferUInt8: ArrowBufferProtocol { - var length: Int { get } var buffer: UnsafePointer { get } } diff --git a/Sources/ArrowIPC/ArrowReader.swift b/Sources/ArrowIPC/ArrowReader.swift index 3f5c61e..932e1dc 100644 --- a/Sources/ArrowIPC/ArrowReader.swift +++ b/Sources/ArrowIPC/ArrowReader.swift @@ -433,19 +433,22 @@ public struct ArrowReader { ) } - func makeListArray( + func makeListArray( length: Int, nullBuffer: NullBuffer, - offsetsBuffer: FixedWidthBufferIPC, - values: Element - ) -> AnyArrowListArray where Element: AnyArrowArrayProtocol { - let list = ArrowListArray( + offsetsBuffer: OffsetsBuffer, + values: AnyArrowArrayProtocol + ) -> ArrowListArray + where + OffsetsBuffer: FixedWidthBufferProtocol, + OffsetsBuffer.ElementType: FixedWidthInteger & SignedInteger + { + ArrowListArray( length: length, nullBuffer: nullBuffer, offsetsBuffer: offsetsBuffer, values: values ) - return AnyArrowListArray(list) } private func loadSchema(schema: FSchema) throws(ArrowError) -> ArrowSchema { diff --git a/Sources/ArrowIPC/ArrowType+IPC.swift b/Sources/ArrowIPC/ArrowType+IPC.swift index 6bd964c..24dc53e 100644 --- a/Sources/ArrowIPC/ArrowType+IPC.swift +++ b/Sources/ArrowIPC/ArrowType+IPC.swift @@ -143,6 +143,13 @@ extension ArrowType { isNullable: childField.nullable ) return .list(arrowField) + case .fixedsizelist: + guard let fType = field.type(type: FFixedSizeList.self) else { + throw .invalid("Could not get byteWidth from fixed binary field.") + } + let listSize = fType.listSize + + fatalError("Not implemented") default: throw .invalid("Unhandled field type: \(field.typeType)") } diff --git a/Sources/ArrowIPC/Generated/FlatBuffersTypes.swift b/Sources/ArrowIPC/Generated/FlatBuffersTypes.swift index 97a567d..1e0f927 100644 --- a/Sources/ArrowIPC/Generated/FlatBuffersTypes.swift +++ b/Sources/ArrowIPC/Generated/FlatBuffersTypes.swift @@ -33,5 +33,6 @@ typealias FStruct = org_apache_arrow_flatbuf_Struct_ typealias FUtf8 = org_apache_arrow_flatbuf_Utf8 typealias FBinary = org_apache_arrow_flatbuf_Binary typealias FFixedSizeBinary = org_apache_arrow_flatbuf_FixedSizeBinary +typealias FFixedSizeList = org_apache_arrow_flatbuf_FixedSizeList typealias FMessageHeader = org_apache_arrow_flatbuf_MessageHeader diff --git a/Tests/ArrowIPCTests/Gold/ArrowGold.swift b/Tests/ArrowIPCTests/Gold/ArrowGold.swift index aed2806..12f51e0 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowGold.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowGold.swift @@ -54,6 +54,7 @@ struct ArrowGold: Codable { let scale: Int? let unit: String? let timezone: String? + let listSize: Int? } struct Batch: Codable { diff --git a/Tests/ArrowIPCTests/Gold/ArrowTestingGold.swift b/Tests/ArrowIPCTests/Gold/ArrowTestingGold.swift index 76af002..f523429 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowTestingGold.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowTestingGold.swift @@ -63,6 +63,7 @@ struct ArrowTestingIPC { "generated_binary_zerolength", "generated_binary_no_batches", "generated_custom_metadata", + // "generated_nested", ] @Test(arguments: testCases) @@ -316,7 +317,9 @@ struct ArrowTestingIPC { } } - guard let listArray = actual as? AnyArrowListArray else { + // TODO: Need a simpler type signature at call site. + guard let listArray = actual as? ArrowListArray> + else { Issue.record("Unexpected array type") return } diff --git a/Tests/ArrowTests/Array/ListArrayTests.swift b/Tests/ArrowTests/Array/ListArrayTests.swift index eae945b..0d0b5db 100644 --- a/Tests/ArrowTests/Array/ListArrayTests.swift +++ b/Tests/ArrowTests/Array/ListArrayTests.swift @@ -38,8 +38,8 @@ struct ListArrayTests { let listArray = builder.finish() - let list0 = listArray[0] - let list1 = listArray[2] + let list0 = listArray[0] as? ArrowArrayOfInt32 + let list1 = listArray[2] as? ArrowArrayOfInt32 #expect(list0?.length == 2) #expect(list0?[0] == 1) #expect(list0?[1] == 2) @@ -71,8 +71,8 @@ struct ListArrayTests { let listArray = builder.finish() - let list0 = listArray[0] - let list1 = listArray[2] + let list0 = listArray[0] as? ArrowArrayOfString + let list1 = listArray[2] as? ArrowArrayOfString #expect(list0?.length == 2) #expect(list0?[0] == "a") #expect(list0?[1] == "b") From b6022a02f451829d4643e2b07007582df2b1017f Mon Sep 17 00:00:00 2001 From: Will Temperley Date: Thu, 27 Nov 2025 12:32:37 +0800 Subject: [PATCH 04/10] Remove explicit buffer type from fixed array signature. Add Archery style JSON testing. --- Sources/Arrow/Array/Array.swift | 113 +++++++++++++----- Sources/Arrow/Array/Builder.swift | 10 +- Sources/ArrowIPC/ArrowReader.swift | 71 ++++++++--- Sources/ArrowIPC/ArrowType+IPC.swift | 16 ++- Tests/ArrowIPCTests/ArrowReaderTests.swift | 6 +- Tests/ArrowIPCTests/Gold/ArrowGold.swift | 44 +++++-- .../ArrowIPCTests/Gold/ArrowJSONEncoder.swift | 100 ++++++++++++++++ .../ArrowIPCTests/Gold/ArrowTestingGold.swift | 76 +++++++++--- .../ArrowIPCTests/Gold/ArrowTestingJSON.swift | 102 ++++++++++++++++ .../Gold/ArrowType+validation.swift | 24 +++- Tests/ArrowTests/Array/ListArrayTests.swift | 4 +- 11 files changed, 480 insertions(+), 86 deletions(-) create mode 100644 Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift create mode 100644 Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift diff --git a/Sources/Arrow/Array/Array.swift b/Sources/Arrow/Array/Array.swift index 3bacb62..62b7070 100644 --- a/Sources/Arrow/Array/Array.swift +++ b/Sources/Arrow/Array/Array.swift @@ -51,17 +51,12 @@ public protocol ArrowArrayOfData { extension ArrowArrayFixedSizeBinary: ArrowArrayOfData where ItemType == Data {} extension ArrowArrayVariable: ArrowArrayOfData where ItemType == Data {} -public protocol ArrowArrayOfInt8 { +public protocol ArrowArrayOfList { var length: Int { get } - subscript(index: Int) -> Int8? { get } + var values: AnyArrowArrayProtocol { get } + subscript(index: Int) -> AnyArrowArrayProtocol? { get } } -extension ArrowArrayFixed: ArrowArrayOfInt8 where ItemType == Int8 {} - -public protocol ArrowArrayOfInt32 { - var length: Int { get } - subscript(index: Int) -> Int32? { get } -} -extension ArrowArrayFixed: ArrowArrayOfInt32 where ItemType == Int32 {} +extension ArrowListArray: ArrowArrayOfList {} /// An Arrow array of booleans using the three-valued logical model (true / false / null). public struct ArrowArrayBoolean: ArrowArrayProtocol { @@ -106,34 +101,32 @@ public struct ArrowArrayBoolean: ArrowArrayProtocol { } /// An Arrow array of fixed-width types. -public struct ArrowArrayFixed: ArrowArrayProtocol -where - ValueBuffer: FixedWidthBufferProtocol, - ValueBuffer.ElementType: Numeric +public struct ArrowArrayNumeric: + ArrowArrayProtocol { - - public typealias ItemType = ValueBuffer.ElementType public let offset: Int public let length: Int + public var nullCount: Int { nullBuffer.nullCount } public var bufferSizes: [Int] { [nullBuffer.length, valueBuffer.length] } public var buffers: [ArrowBufferProtocol] { [nullBuffer, valueBuffer] } - public var nullCount: Int { nullBuffer.nullCount } + let nullBuffer: NullBuffer - let valueBuffer: ValueBuffer + private let valueBuffer: any FixedWidthBufferProtocol - public init( + // Initialize from concrete buffer type + public init( offset: Int = 0, length: Int, nullBuffer: NullBuffer, valueBuffer: ValueBuffer - ) { + ) where ValueBuffer.ElementType == ItemType { self.offset = offset self.length = length self.nullBuffer = nullBuffer self.valueBuffer = valueBuffer } - public subscript(index: Int) -> ValueBuffer.ElementType? { + public subscript(index: Int) -> ItemType? { precondition(index >= 0 && index < length, "Invalid index.") let offsetIndex = self.offset + index if !self.nullBuffer.isSet(offsetIndex) { @@ -261,17 +254,14 @@ where } /// An Arrow array of `Date`s with a resolution of 1 day. -public struct ArrowArrayDate32: ArrowArrayProtocol -where - ValueBuffer: FixedWidthBufferProtocol -{ +public struct ArrowArrayDate32: ArrowArrayProtocol { public typealias ItemType = Date public var bufferSizes: [Int] { array.bufferSizes } public var buffers: [ArrowBufferProtocol] { array.buffers } public var nullCount: Int { array.nullCount } public var offset: Int { array.offset } public var length: Int { array.length } - let array: ArrowArrayFixed + let array: ArrowArrayNumeric public subscript(index: Int) -> Date? { precondition(index >= 0 && index < length, "Invalid index.") @@ -291,17 +281,14 @@ where } /// An Arrow array of `Date`s with a resolution of 1 second. -public struct ArrowArrayDate64: ArrowArrayProtocol -where - ValueBuffer: FixedWidthBufferProtocol -{ +public struct ArrowArrayDate64: ArrowArrayProtocol { public typealias ItemType = Date public var bufferSizes: [Int] { array.bufferSizes } public var buffers: [ArrowBufferProtocol] { array.buffers } public var nullCount: Int { array.nullCount } public var offset: Int { array.offset } public var length: Int { array.length } - let array: ArrowArrayFixed + let array: ArrowArrayNumeric public subscript(index: Int) -> Date? { precondition(index >= 0 && index < length, "Invalid index.") @@ -377,6 +364,72 @@ where } } +protocol ListArrayProtocol: ArrowArrayProtocol { + var length: Int { get } + var values: AnyArrowArrayProtocol { get } +} + +extension ArrowListArray: ListArrayProtocol { + + // No implementation needed - offsetsBuffer and values already exist + // Swift automatically satisfies the protocol requirements +} + +public struct ArrowFixedSizeListArray: ArrowArrayProtocol { + public let offset: Int + public let length: Int + public let listSize: Int + + public var bufferSizes: [Int] { + [nullBuffer.length] + } + + public var buffers: [ArrowBufferProtocol] { + [nullBuffer] + } + + public var nullCount: Int { nullBuffer.nullCount } + + let nullBuffer: NullBuffer + public let values: AnyArrowArrayProtocol + + public init( + offset: Int = 0, + length: Int, + listSize: Int, + nullBuffer: NullBuffer, + values: AnyArrowArrayProtocol + ) { + self.offset = offset + self.length = length + self.listSize = listSize + self.nullBuffer = nullBuffer + self.values = values + } + + public subscript(index: Int) -> AnyArrowArrayProtocol? { + precondition(index >= 0 && index < length, "Invalid index.") + let offsetIndex = self.offset + index + + if !self.nullBuffer.isSet(offsetIndex) { + return nil + } + + let startIndex = offsetIndex * listSize + return values.slice(offset: startIndex, length: listSize) + } + + public func slice(offset: Int, length: Int) -> Self { + .init( + offset: self.offset + offset, + length: length, + listSize: listSize, + nullBuffer: nullBuffer, + values: values + ) + } +} + /// An Arrow struct array. public struct ArrowStructArray: ArrowArrayProtocol { public typealias ItemType = [String: Any] diff --git a/Sources/Arrow/Array/Builder.swift b/Sources/Arrow/Array/Builder.swift index e587464..19b2f48 100644 --- a/Sources/Arrow/Array/Builder.swift +++ b/Sources/Arrow/Array/Builder.swift @@ -75,9 +75,11 @@ public class ArrayBuilderBoolean: AnyArrayBuilder { } /// A builder for Arrow arrays holding fixed-width types. -public class ArrayBuilderFixedWidth: AnyArrayBuilder { +public class ArrayBuilderFixedWidth: + AnyArrayBuilder +{ - public typealias ArrayType = ArrowArrayFixed> + public typealias ArrayType = ArrowArrayNumeric public var length: Int let nullBuilder: NullBufferBuilder @@ -237,7 +239,7 @@ typealias ArrayBuilderBinary = ArrayBuilderVariableLength /// A builder for Arrow arrays holding `Date`s with a resolution of one day. public struct ArrayBuilderDate32: AnyArrayBuilder { - public typealias ArrayType = ArrowArrayDate32> + public typealias ArrayType = ArrowArrayDate32 let builder: ArrayBuilderFixedWidth = .init() public init() {} @@ -262,7 +264,7 @@ public struct ArrayBuilderDate32: AnyArrayBuilder { /// A builder for Arrow arrays holding `Date`s with a resolution of one day. public struct ArrayBuilderDate64: AnyArrayBuilder { - public typealias ArrayType = ArrowArrayDate64> + public typealias ArrayType = ArrowArrayDate64 let builder: ArrayBuilderFixedWidth = .init() public init() {} diff --git a/Sources/ArrowIPC/ArrowReader.swift b/Sources/ArrowIPC/ArrowReader.swift index 932e1dc..68cfad1 100644 --- a/Sources/ArrowIPC/ArrowReader.swift +++ b/Sources/ArrowIPC/ArrowReader.swift @@ -337,28 +337,41 @@ public struct ArrowReader { } } else if arrowType.isNested { switch arrowType { - case .list(let field): + case .list(let childField): + let buffer1 = try nextBuffer( + message: rbMessage, index: &bufferIndex, offset: offset, data: data) + var offsetsBuffer = FixedWidthBufferIPC(buffer: buffer1) + let array: AnyArrowArrayProtocol = try loadField( rbMessage: rbMessage, - field: field, + field: childField, offset: offset, nodeIndex: &nodeIndex, bufferIndex: &bufferIndex ) - let buffer1 = try nextBuffer( - message: rbMessage, index: &bufferIndex, offset: offset, data: data) - var offsetsBuffer = FixedWidthBufferIPC(buffer: buffer1) - // TODO: This is a hack for the special-case where buffer length 0 means all-zero offset. - // Can follow the null buffer example. - if offsetsBuffer.length != length + 1 { - let offsetCount = length + 1 - let byteCount = offsetCount * MemoryLayout.stride - let fileDataBuffer = FileDataBuffer( - data: Data(count: byteCount), // Zero-initialized - range: 0..(buffer: fileDataBuffer) + if offsetsBuffer.length == 0 { + // Empty offsets buffer is valid when child array is empty + // There could be any number of empty lists referencing into an empty list + guard array.length == 0 else { + throw ArrowError.invalid( + "Empty offsets buffer but non-empty child array") + } + let emptyBuffer = emptyOffsetBuffer(offsetCount: length + 1) + offsetsBuffer = FixedWidthBufferIPC(buffer: emptyBuffer) + } else { + let requiredBytes = (length + 1) * MemoryLayout.stride + guard offsetsBuffer.length >= requiredBytes else { + throw ArrowError.invalid( + "Offsets buffer too small: need \(requiredBytes) bytes for \(length) lists" + ) + } + // Verify last offset matches child array length + let lastOffset = offsetsBuffer[length] + guard lastOffset == Int32(array.length) else { + throw ArrowError.invalid( + "Expected last offset to match child array length.") + } } return makeListArray( length: length, @@ -366,6 +379,20 @@ public struct ArrowReader { offsetsBuffer: offsetsBuffer, values: array ) + case .fixedSizeList(let field, let listSize): + let array: AnyArrowArrayProtocol = try loadField( + rbMessage: rbMessage, + field: field, + offset: offset, + nodeIndex: &nodeIndex, + bufferIndex: &bufferIndex + ) + return ArrowFixedSizeListArray( + length: length, + listSize: Int(listSize), + nullBuffer: nullBuffer, + values: array + ) case .strct(let fields): var arrays: [(String, AnyArrowArrayProtocol)] = [] for field in fields { @@ -424,9 +451,9 @@ public struct ArrowReader { elementType: T.Type, nullBuffer: NullBuffer, buffer: FileDataBuffer - ) -> ArrowArrayFixed> { + ) -> ArrowArrayNumeric { let fixedBuffer = FixedWidthBufferIPC(buffer: buffer) - return ArrowArrayFixed( + return ArrowArrayNumeric( length: length, nullBuffer: nullBuffer, valueBuffer: fixedBuffer @@ -486,4 +513,14 @@ public struct ArrowReader { return ArrowSchema(fields, metadata: metadata) } + //TODO: This is for the special-case where buffer length 0 means all-zero offset. + // Would be better to have a specialised empty null buffer + func emptyOffsetBuffer(offsetCount: Int) -> FileDataBuffer { + let byteCount = offsetCount * MemoryLayout.stride + return FileDataBuffer( + data: Data(count: byteCount), // Zero-initialized + range: 0..> + as? ArrowArrayNumeric else { Issue.record("Failed to cast column 0 to ArrowArrayDouble") return diff --git a/Tests/ArrowIPCTests/Gold/ArrowGold.swift b/Tests/ArrowIPCTests/Gold/ArrowGold.swift index 12f51e0..b2c5f29 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowGold.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowGold.swift @@ -15,28 +15,28 @@ import Foundation /// The JSON file structure used to validate gold-standard Arrow test files. -struct ArrowGold: Codable { +struct ArrowGold: Codable, Equatable { let schema: Schema let batches: [Batch] let dictionaries: [Dictionary]? - struct Dictionary: Codable { + struct Dictionary: Codable, Equatable { let id: Int let data: Batch } - struct DictionaryInfo: Codable { + struct DictionaryInfo: Codable, Equatable { let id: Int let indexType: FieldType let isOrdered: Bool? } - struct Schema: Codable { + struct Schema: Codable, Equatable { let fields: [Field] let metadata: [KeyValue]? } - struct Field: Codable { + struct Field: Codable, Equatable { let name: String let type: FieldType let nullable: Bool @@ -45,7 +45,7 @@ struct ArrowGold: Codable { let metadata: [KeyValue]? } - struct FieldType: Codable { + struct FieldType: Codable, Equatable { let name: String let byteWidth: Int? let bitWidth: Int? @@ -57,12 +57,12 @@ struct ArrowGold: Codable { let listSize: Int? } - struct Batch: Codable { + struct Batch: Codable, Equatable { let count: Int let columns: [Column] } - struct Column: Codable { + struct Column: Codable, Equatable { let name: String let count: Int let validity: [Int]? @@ -80,7 +80,7 @@ struct ArrowGold: Codable { } } - enum Value: Codable { + enum Value: Codable, Equatable { case int(Int) case string(String) case bool(Bool) @@ -88,7 +88,7 @@ struct ArrowGold: Codable { } /// A metadata key-value entry. -struct KeyValue: Codable { +struct KeyValue: Codable, Equatable { let key: String let value: String } @@ -100,7 +100,7 @@ extension [KeyValue] { } /// Arrow gold files data values have variable types. -enum DataValue: Codable { +enum DataValue: Codable, Equatable { case string(String) case int(Int) case double(Double) @@ -130,3 +130,25 @@ enum DataValue: Codable { } } } + +extension ArrowGold.Column { + + /// Filter for the valid values. + /// - Returns: The test column data with nulls in place of junk values. + func withoutJunkData() -> Self { + guard let data = self.data, let validity = self.validity else { + return self + } + let filteredData = data.enumerated().map { index, value in + validity[index] == 1 ? value : .null + } + return Self( + name: name, + count: count, + validity: validity, + offset: offset, + data: filteredData.isEmpty ? nil : filteredData, + children: children?.map { $0.withoutJunkData() } + ) + } +} diff --git a/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift new file mode 100644 index 0000000..1f6e9ec --- /dev/null +++ b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift @@ -0,0 +1,100 @@ +// Copyright 2025 The Columnar Swift Contributors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +@testable import Arrow + +func encodeColumn( + array: AnyArrowArrayProtocol, + field: ArrowField +) throws -> ArrowGold.Column { + + guard let array = array as? (any ArrowArrayProtocol) else { + throw ArrowError.invalid("Expected ArrowArray, got \(type(of: array))") + } + + var validity: [Int] = [] + + for i in 0..( + from array: AnyArrowArrayProtocol, + expectedType: T.Type +) throws -> [DataValue] { + guard let typedArray = array as? ArrowArrayNumeric else { + throw ArrowError.invalid("Expected \(T.self) array, got \(type(of: array))") + } + + return try (0..> else { - Issue.record("Unexpected array type") + Issue.record("Unexpected array type: \(type(of: actual))") return } diff --git a/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift b/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift new file mode 100644 index 0000000..2a135e2 --- /dev/null +++ b/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift @@ -0,0 +1,102 @@ +// ArrowTestingIPC.swift +// Arrow +// +// Created by Will Temperley on 26/11/2025. All rights reserved. +// Copyright 2025 Will Temperley. +// +// Copying or reproduction of this file via any medium requires prior express +// written permission from the copyright holder. +// ----------------------------------------------------------------------------- +/// +/// Implementation notes, links and internal documentation go here. +/// +// ----------------------------------------------------------------------------- + +// Copyright 2025 The Columnar Swift Contributors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation +import Testing + +@testable import Arrow +@testable import ArrowIPC + +/// Tests round trip from JSON -> Array -> JSON. +/// +/// See https://arrow.apache.org/docs/format/Integration.html#strategy +/// +/// The producer typically reads a JSON file, converts it to in-memory Arrow data, and exposes this data +/// using the format under test. The consumer reads the data in the said format and converts it back to +/// Arrow in-memory data; it also reads the same JSON file as the producer, and validates that both +/// datasets are identical. +/// +struct ArrowTestingJSON { + + static let testCases: [String] = [ + "generated_primitive" + // "generated_primitive_no_batches", + // "generated_primitive_zerolength", + // "generated_binary", + // "generated_binary_zerolength", + // "generated_binary_no_batches", + // "generated_custom_metadata", + // "generated_nested", + ] + + // @Test(.serialized, arguments: testCases) + @Test(arguments: testCases) + func json(name: String) throws { + + let resourceURL = try loadTestResource( + name: name, + withExtension: "json.lz4", + subdirectory: "integration/cpp-21.0.0" + ) + let lz4Data = try Data(contentsOf: resourceURL) + let lz4 = try LZ4(parsing: lz4Data) + let testCase = try JSONDecoder().decode(ArrowGold.self, from: lz4.data) + let testFile = try loadTestResource( + name: name, + withExtension: "arrow_file", + subdirectory: "integration/cpp-21.0.0" + ) + let arrowReader = try ArrowReader(url: testFile) + let (arrowSchema, recordBatches) = try arrowReader.read() + + #expect(testCase.batches.count == recordBatches.count) + + // for recordBatch in recordBatches { + // for (field, array) in zip(arrowSchema.fields, recordBatch.arrays) { + // let result = try encodeColumn(array: array, field: field) + // } + // } + + for (testBatch, recordBatch) in zip(testCase.batches, recordBatches) { + for ( + (arrowField, arrowArray), + (expectedField, expectedColumn) + ) in zip( + zip(arrowSchema.fields, recordBatch.arrays), + zip(testCase.schema.fields, testBatch.columns) + ) { + + if arrowField.type == .int8 || arrowField.type == .uint8 { + let result = try encodeColumn(array: arrowArray, field: arrowField) + #expect(result == expectedColumn.withoutJunkData()) + } + } + } + + } +} diff --git a/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift b/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift index 106cb63..363ccf0 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift @@ -102,7 +102,29 @@ extension ArrowType { return false } return arrowField.type.matches(expectedField: children[0]) - case .fixedSizeList, .strct, .map: + case .fixedSizeList(let arrowField, let listSize): + guard fieldType.name == "fixedsizelist", + let children = expectedField.children, + children.count == 1, + let expectedListSize = fieldType.listSize, + expectedListSize == listSize + else { + return false + } + return arrowField.type.matches(expectedField: children[0]) + case .strct(let arrowFields): + guard fieldType.name == "struct", let children = expectedField.children + else { + return false + } + for (arrowField, child) in zip(arrowFields, children) { + let matches = arrowField.type.matches(expectedField: child) + if !matches { + return false + } + } + return true + case .map: // return fieldType.name == self.jsonTypeName fatalError("Not implemented.") diff --git a/Tests/ArrowTests/Array/ListArrayTests.swift b/Tests/ArrowTests/Array/ListArrayTests.swift index 0d0b5db..ce575d8 100644 --- a/Tests/ArrowTests/Array/ListArrayTests.swift +++ b/Tests/ArrowTests/Array/ListArrayTests.swift @@ -38,8 +38,8 @@ struct ListArrayTests { let listArray = builder.finish() - let list0 = listArray[0] as? ArrowArrayOfInt32 - let list1 = listArray[2] as? ArrowArrayOfInt32 + let list0 = listArray[0] as? ArrowArrayNumeric + let list1 = listArray[2] as? ArrowArrayNumeric #expect(list0?.length == 2) #expect(list0?[0] == 1) #expect(list0?[1] == 2) From 21922d7a2ec04a58c7b7a860708c36e90b3d2098 Mon Sep 17 00:00:00 2001 From: Will Temperley Date: Thu, 27 Nov 2025 15:59:54 +0800 Subject: [PATCH 05/10] Primtive test works with JSON approach. --- Tests/ArrowIPCTests/Gold/ArrowGold.swift | 3 +- .../ArrowIPCTests/Gold/ArrowJSONEncoder.swift | 151 +++++++++++++----- .../ArrowIPCTests/Gold/ArrowTestingGold.swift | 20 +-- .../ArrowIPCTests/Gold/ArrowTestingJSON.swift | 21 +-- 4 files changed, 130 insertions(+), 65 deletions(-) diff --git a/Tests/ArrowIPCTests/Gold/ArrowGold.swift b/Tests/ArrowIPCTests/Gold/ArrowGold.swift index b2c5f29..199f7a7 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowGold.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowGold.swift @@ -103,7 +103,6 @@ extension [KeyValue] { enum DataValue: Codable, Equatable { case string(String) case int(Int) - case double(Double) case bool(Bool) case null @@ -115,7 +114,7 @@ enum DataValue: Codable, Equatable { } else if let intValue = try? container.decode(Int.self) { self = .int(intValue) } else if let doubleValue = try? container.decode(Double.self) { - self = .double(doubleValue) + self = .string(String(doubleValue)) } else if let stringValue = try? container.decode(String.self) { self = .string(stringValue) } else if let boolValue = try? container.decode(Bool.self) { diff --git a/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift index 1f6e9ec..f80bdfc 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift @@ -36,45 +36,60 @@ func encodeColumn( var data: [DataValue]? = nil var children: [ArrowGold.Column]? = nil - switch field.type { - // Test the actual array interface - case .list(let listField): - guard let listArray = array as? ArrowArrayOfList else { - throw ArrowError.invalid("Expected list array") - } - // Build offsets by using the array interface - var computedOffsets: [Int] = [0] - var currentOffset = 0 + if array.length > 0 { - for i in 0..( guard let typedArray = array as? ArrowArrayNumeric else { throw ArrowError.invalid("Expected \(T.self) array, got \(type(of: array))") } + return try (0..( + from array: AnyArrowArrayProtocol, + expectedType: T.Type +) throws -> [DataValue] { + guard let typedArray = array as? ArrowArrayNumeric else { + throw ArrowError.invalid("Expected \(T.self) array, got \(type(of: array))") + } + + let encoder = JSONEncoder() + let decoder = JSONDecoder() return try (0.. [DataValue] { + guard let typedArray = array as? ArrowArrayBoolean else { + throw ArrowError.invalid("Expected boolean array, got \(type(of: array))") + } + return (0.. [DataValue] { +// guard let binaryArray = array as? ArrowArrayBinary else { +// throw ArrowError.invalid("Expected binary array") +// } +// +// return (0.. Date: Fri, 28 Nov 2025 10:31:25 +0800 Subject: [PATCH 06/10] Variabe length has configurable offset type. --- Sources/Arrow/Array/Array.swift | 52 +++++--- Sources/Arrow/Array/Builder.swift | 37 +++--- Sources/Arrow/Buffer/FixedWidthBuffer.swift | 3 +- Sources/ArrowIPC/Array+IPC.swift | 124 +++++++++--------- Sources/ArrowIPC/ArrowReader.swift | 24 ++-- Tests/ArrowIPCTests/ArrowReaderTests.swift | 5 +- Tests/ArrowTests/Array/BasicArrayTests.swift | 2 +- Tests/ArrowTests/Array/FuzzedArrayTests.swift | 10 +- Tests/ArrowTests/Array/ReadmeExamples.swift | 2 +- Tests/ArrowTests/Array/StructArrayTests.swift | 2 +- 10 files changed, 138 insertions(+), 123 deletions(-) diff --git a/Sources/Arrow/Array/Array.swift b/Sources/Arrow/Array/Array.swift index 62b7070..06cd331 100644 --- a/Sources/Arrow/Array/Array.swift +++ b/Sources/Arrow/Array/Array.swift @@ -194,34 +194,36 @@ where } /// An Arrow array of variable-length types. -public struct ArrowArrayVariable: - ArrowArrayProtocol -where - OffsetsBuffer: FixedWidthBufferProtocol, - ValueBuffer: VariableLengthBufferProtocol, - ValueBuffer.ElementType: VariableLength -{ - public typealias ItemType = ValueBuffer.ElementType +public struct ArrowArrayVariable< + ItemType: VariableLength, + OffsetType: FixedWidthInteger & SignedInteger +>: ArrowArrayProtocol { public let offset: Int public let length: Int + private let nullBuffer: NullBuffer + private let offsetsBuffer: any FixedWidthBufferProtocol + private let valueBuffer: any VariableLengthBufferProtocol + public var bufferSizes: [Int] { [nullBuffer.length, offsetsBuffer.length, valueBuffer.length] } + public var buffers: [ArrowBufferProtocol] { [nullBuffer, offsetsBuffer, valueBuffer] } + public var nullCount: Int { nullBuffer.nullCount } - let nullBuffer: NullBuffer - let offsetsBuffer: OffsetsBuffer - let valueBuffer: ValueBuffer - public init( + public init< + Offsets: FixedWidthBufferProtocol, + Values: VariableLengthBufferProtocol + >( offset: Int = 0, length: Int, nullBuffer: NullBuffer, - offsetsBuffer: OffsetsBuffer, - valueBuffer: ValueBuffer - ) { + offsetsBuffer: Offsets, + valueBuffer: Values + ) where Values.ElementType == ItemType { self.offset = offset self.length = length self.nullBuffer = nullBuffer @@ -229,16 +231,19 @@ where self.valueBuffer = valueBuffer } - public subscript(index: Int) -> ValueBuffer.ElementType? { + public subscript(index: Int) -> ItemType? { let offsetIndex = self.offset + index - if !self.nullBuffer.isSet(offsetIndex) { + guard self.nullBuffer.isSet(offsetIndex) else { return nil } - let startIndex = offsetsBuffer[offsetIndex] - let endIndex = offsetsBuffer[offsetIndex + 1] + + // Use runtime dispatch through the existential + let startOffset = offsetsBuffer[offsetIndex] + let endOffset = offsetsBuffer[offsetIndex + 1] + return valueBuffer.loadVariable( - at: Int(startIndex), - arrayLength: Int(endIndex - startIndex) + at: Int(startOffset), + arrayLength: Int(endOffset - startOffset) ) } @@ -253,6 +258,11 @@ where } } +public typealias ArrowArrayVariableInt32 = + ArrowArrayVariable +public typealias ArrowArrayVariableInt64 = + ArrowArrayVariable + /// An Arrow array of `Date`s with a resolution of 1 day. public struct ArrowArrayDate32: ArrowArrayProtocol { public typealias ItemType = Date diff --git a/Sources/Arrow/Array/Builder.swift b/Sources/Arrow/Array/Builder.swift index 19b2f48..9cabf4c 100644 --- a/Sources/Arrow/Array/Builder.swift +++ b/Sources/Arrow/Array/Builder.swift @@ -173,24 +173,23 @@ public class ArrayBuilderFixedSizedBinary: } /// A builder for Arrow arrays holding variable length types. -public class ArrayBuilderVariableLength: - AnyArrayBuilder -{ - public typealias ArrayType = ArrowArrayVariable< - FixedWidthBuffer, VariableLengthTypeBuffer - > +public class ArrayBuilderVariableLength< + Element: VariableLength, OffsetType: FixedWidthInteger & SignedInteger +>: AnyArrayBuilder { + + public typealias ArrayType = ArrowArrayVariable var length: Int let nullBuilder: NullBufferBuilder - let offsetsBuilder: FixedWidthBufferBuilder + let offsetsBuilder: FixedWidthBufferBuilder let valueBuilder: VariableLengthTypeBufferBuilder public init() { self.length = 0 self.nullBuilder = NullBufferBuilder() - self.offsetsBuilder = FixedWidthBufferBuilder() + self.offsetsBuilder = FixedWidthBufferBuilder() self.valueBuilder = VariableLengthTypeBufferBuilder() - self.offsetsBuilder.append(Int32.zero) + self.offsetsBuilder.append(OffsetType.zero) } public func append(_ value: Element) { @@ -206,36 +205,32 @@ public class ArrayBuilderVariableLength: valueBuilder.increaseCapacity(to: newCapacity) } valueBuilder.append(data) - let newOffset = Int32(valueBuilder.length) + let newOffset = OffsetType(valueBuilder.length) offsetsBuilder.append(newOffset) } public func appendNull() { length += 1 nullBuilder.appendValid(false) - let newOffset = Int32(valueBuilder.length) + let newOffset = OffsetType(valueBuilder.length) offsetsBuilder.append(newOffset) } public func finish() -> ArrayType { - let nullBuffer = nullBuilder.finish() - let offsetsBuffer = offsetsBuilder.finish() - let valueBuffer = valueBuilder.finish() - return .init( - offset: 0, + ArrayType( length: length, - nullBuffer: nullBuffer, - offsetsBuffer: offsetsBuffer, - valueBuffer: valueBuffer + nullBuffer: nullBuilder.finish(), + offsetsBuffer: offsetsBuilder.finish(), + valueBuffer: valueBuilder.finish() ) } } /// A builder for Arrow arrays holding `String` values. -typealias ArrayBuilderString = ArrayBuilderVariableLength +typealias ArrayBuilderString = ArrayBuilderVariableLength /// A builder for Arrow arrays holding `Data` values. -typealias ArrayBuilderBinary = ArrayBuilderVariableLength +typealias ArrayBuilderBinary = ArrayBuilderVariableLength /// A builder for Arrow arrays holding `Date`s with a resolution of one day. public struct ArrayBuilderDate32: AnyArrayBuilder { diff --git a/Sources/Arrow/Buffer/FixedWidthBuffer.swift b/Sources/Arrow/Buffer/FixedWidthBuffer.swift index a90379e..a0023f1 100644 --- a/Sources/Arrow/Buffer/FixedWidthBuffer.swift +++ b/Sources/Arrow/Buffer/FixedWidthBuffer.swift @@ -19,9 +19,8 @@ public protocol FixedWidthBufferProtocol: ArrowBufferProtocol { } /// A buffer used in Arrow arrays that hold fixed-width types. -public final class FixedWidthBuffer: FixedWidthBufferProtocol +final class FixedWidthBuffer: FixedWidthBufferProtocol where T: Numeric { - public typealias ElementType = T public var length: Int var capacity: Int diff --git a/Sources/ArrowIPC/Array+IPC.swift b/Sources/ArrowIPC/Array+IPC.swift index 79c5f04..83bfb71 100644 --- a/Sources/ArrowIPC/Array+IPC.swift +++ b/Sources/ArrowIPC/Array+IPC.swift @@ -15,65 +15,65 @@ import Arrow import Foundation -/// A `Data` backed Arrow utf8 array. -typealias ArrowArrayUtf8 = ArrowArrayVariable< - FixedWidthBufferIPC, - VariableLengthBufferIPC -> - -extension ArrowArrayUtf8 { - - /// Build a `Data` backed Arrow utf8 array. - /// - Parameters: - /// - length: The array length. - /// - nullBuffer: The null buffer. - /// - offsetsBuffer: A view over file-backed data. - /// - valueBuffer: A view over file-backed data. - /// - Returns: A file-backed Arrow utf8 array. - static func utf8( - length: Int, - nullBuffer: NullBuffer, - offsetsBuffer: FileDataBuffer, - valueBuffer: FileDataBuffer - ) -> Self { - let offsetsBufferTyped = FixedWidthBufferIPC(buffer: offsetsBuffer) - let valueBufferTyped = VariableLengthBufferIPC(buffer: valueBuffer) - return Self( - length: length, - nullBuffer: nullBuffer, - offsetsBuffer: offsetsBufferTyped, - valueBuffer: valueBufferTyped - ) - } -} - -typealias ArrowArrayBinary = ArrowArrayVariable< - FixedWidthBufferIPC, - VariableLengthBufferIPC -> - -extension ArrowArrayBinary { - - /// Build a `Data` backed Arrow binary array. - /// - Parameters: - /// - length: The array length. - /// - nullBuffer: The null buffer. - /// - offsetsBuffer: A view over file-backed data. - /// - valueBuffer: A view over file-backed data. - /// - Returns: A file-backed Arrow utf8 array. - static func binary( - length: Int, - nullBuffer: NullBuffer, - offsetsBuffer: FileDataBuffer, - valueBuffer: FileDataBuffer - ) -> Self { - let offsetsBufferTyped = FixedWidthBufferIPC(buffer: offsetsBuffer) - let valueBufferTyped = VariableLengthBufferIPC(buffer: valueBuffer) - return Self( - length: length, - nullBuffer: nullBuffer, - offsetsBuffer: offsetsBufferTyped, - valueBuffer: valueBufferTyped - ) - } -} +///// A `Data` backed Arrow utf8 array. +//typealias ArrowArrayUtf8 = ArrowArrayVariable< +// FixedWidthBufferIPC, +// VariableLengthBufferIPC +//> +// +//extension ArrowArrayUtf8 { +// +// /// Build a `Data` backed Arrow utf8 array. +// /// - Parameters: +// /// - length: The array length. +// /// - nullBuffer: The null buffer. +// /// - offsetsBuffer: A view over file-backed data. +// /// - valueBuffer: A view over file-backed data. +// /// - Returns: A file-backed Arrow utf8 array. +// static func utf8( +// length: Int, +// nullBuffer: NullBuffer, +// offsetsBuffer: FileDataBuffer, +// valueBuffer: FileDataBuffer +// ) -> Self { +// let offsetsBufferTyped = FixedWidthBufferIPC(buffer: offsetsBuffer) +// let valueBufferTyped = VariableLengthBufferIPC(buffer: valueBuffer) +// return Self( +// length: length, +// nullBuffer: nullBuffer, +// offsetsBuffer: offsetsBufferTyped, +// valueBuffer: valueBufferTyped +// ) +// } +//} +// +//typealias ArrowArrayBinary = ArrowArrayVariable< +// FixedWidthBufferIPC, +// VariableLengthBufferIPC +//> +// +//extension ArrowArrayBinary { +// +// /// Build a `Data` backed Arrow binary array. +// /// - Parameters: +// /// - length: The array length. +// /// - nullBuffer: The null buffer. +// /// - offsetsBuffer: A view over file-backed data. +// /// - valueBuffer: A view over file-backed data. +// /// - Returns: A file-backed Arrow utf8 array. +// static func binary( +// length: Int, +// nullBuffer: NullBuffer, +// offsetsBuffer: FileDataBuffer, +// valueBuffer: FileDataBuffer +// ) -> Self { +// let offsetsBufferTyped = FixedWidthBufferIPC(buffer: offsetsBuffer) +// let valueBufferTyped = VariableLengthBufferIPC(buffer: valueBuffer) +// return Self( +// length: length, +// nullBuffer: nullBuffer, +// offsetsBuffer: offsetsBufferTyped, +// valueBuffer: valueBufferTyped +// ) +// } +//} diff --git a/Sources/ArrowIPC/ArrowReader.swift b/Sources/ArrowIPC/ArrowReader.swift index 68cfad1..1aac278 100644 --- a/Sources/ArrowIPC/ArrowReader.swift +++ b/Sources/ArrowIPC/ArrowReader.swift @@ -85,7 +85,9 @@ where } /// A `Data` backed buffer for variable-length types. -struct VariableLengthBufferIPC: +struct VariableLengthBufferIPC< + Element: VariableLength, OffsetType: FixedWidthInteger +>: VariableLengthBufferProtocol, ArrowBufferIPC { typealias ElementType = Element @@ -318,19 +320,25 @@ public struct ArrowReader { let buffer2 = try nextBuffer( message: rbMessage, index: &bufferIndex, offset: offset, data: data) + let offsetsBufferTyped = FixedWidthBufferIPC(buffer: buffer1) + if arrowType == .utf8 { - return ArrowArrayVariable.utf8( + let valueBufferTyped = VariableLengthBufferIPC( + buffer: buffer2) + return ArrowArrayVariable( length: length, nullBuffer: nullBuffer, - offsetsBuffer: buffer1, - valueBuffer: buffer2 + offsetsBuffer: offsetsBufferTyped, + valueBuffer: valueBufferTyped ) } else if arrowType == .binary { - return ArrowArrayVariable.binary( + let valueBufferTyped = VariableLengthBufferIPC( + buffer: buffer2) + return ArrowArrayVariable( length: length, nullBuffer: nullBuffer, - offsetsBuffer: buffer1, - valueBuffer: buffer2 + offsetsBuffer: offsetsBufferTyped, + valueBuffer: valueBufferTyped ) } else { throw ArrowError.notImplemented @@ -418,7 +426,7 @@ public struct ArrowReader { if case .fixedSizeBinary(let byteWidth) = arrowType { let valueBuffer = try nextBuffer( message: rbMessage, index: &bufferIndex, offset: offset, data: data) - let valueBufferTyped = VariableLengthBufferIPC( + let valueBufferTyped = VariableLengthBufferIPC( buffer: valueBuffer) return ArrowArrayFixedSizeBinary( length: length, diff --git a/Tests/ArrowIPCTests/ArrowReaderTests.swift b/Tests/ArrowIPCTests/ArrowReaderTests.swift index a4ff7a5..be869d4 100644 --- a/Tests/ArrowIPCTests/ArrowReaderTests.swift +++ b/Tests/ArrowIPCTests/ArrowReaderTests.swift @@ -53,7 +53,10 @@ struct ArrowReaderTests { #expect(doubleColumn[4] == 5.5) // Test the String column (index 1) - guard let stringColumn = recordBatch.arrays[1] as? ArrowArrayUtf8 else { + guard + let stringColumn = recordBatch.arrays[1] + as? ArrowArrayVariable + else { Issue.record("Failed to cast column 1 to ArrowArrayString") return } diff --git a/Tests/ArrowTests/Array/BasicArrayTests.swift b/Tests/ArrowTests/Array/BasicArrayTests.swift index 7fe77dd..76b9c81 100644 --- a/Tests/ArrowTests/Array/BasicArrayTests.swift +++ b/Tests/ArrowTests/Array/BasicArrayTests.swift @@ -76,7 +76,7 @@ struct BasicArrayTests { } @Test func stringArray() throws { - let builder: ArrayBuilderVariableLength = .init() + let builder: ArrayBuilderVariableLength = .init() builder.appendNull() builder.append("abc") builder.append("def") diff --git a/Tests/ArrowTests/Array/FuzzedArrayTests.swift b/Tests/ArrowTests/Array/FuzzedArrayTests.swift index e7fb937..ebada6b 100644 --- a/Tests/ArrowTests/Array/FuzzedArrayTests.swift +++ b/Tests/ArrowTests/Array/FuzzedArrayTests.swift @@ -56,7 +56,7 @@ struct FuzzedArrayTests { testArray[i] = nil } } - let builder: ArrayBuilderVariableLength = .init() + let builder: ArrayBuilderVariableLength = .init() for value in testArray { if let value { builder.append(value) @@ -84,7 +84,7 @@ struct FuzzedArrayTests { } @Test func binaryStringArray() throws { - let builder: ArrayBuilderVariableLength = .init() + let builder: ArrayBuilderVariableLength = .init() var byteCount: Int = 0 let count: Int = 100 var nullCount: Int = 0 @@ -143,7 +143,7 @@ struct FuzzedArrayTests { nullCount += 1 } } - let builder: ArrayBuilderVariableLength = .init() + let builder: ArrayBuilderVariableLength = .init() for value in expected { if let value { builder.append(value) @@ -223,7 +223,7 @@ struct FuzzedArrayTests { expected[i] = randomString(length: length, using: &rng) } } - let arrayBuilder: ArrayBuilderVariableLength = .init() + let arrayBuilder: ArrayBuilderVariableLength = .init() for value in expected { if let value { arrayBuilder.append(value) @@ -281,7 +281,7 @@ struct FuzzedArrayTests { byteCount += value.utf8.count } } - let builder: ArrayBuilderVariableLength = .init() + let builder: ArrayBuilderVariableLength = .init() for value in expected { if let value { builder.append(value) diff --git a/Tests/ArrowTests/Array/ReadmeExamples.swift b/Tests/ArrowTests/Array/ReadmeExamples.swift index a6d1515..41ed43e 100644 --- a/Tests/ArrowTests/Array/ReadmeExamples.swift +++ b/Tests/ArrowTests/Array/ReadmeExamples.swift @@ -43,7 +43,7 @@ struct ReadmeExamples { @Test func stringArray() throws { let swiftArray: [String?] = ["ab", nil, "c", "", "."] - let arrayBuilder: ArrayBuilderVariableLength = .init() + let arrayBuilder: ArrayBuilderVariableLength = .init() for value in swiftArray { if let value { arrayBuilder.append(value) diff --git a/Tests/ArrowTests/Array/StructArrayTests.swift b/Tests/ArrowTests/Array/StructArrayTests.swift index eefbcbf..97c4be3 100644 --- a/Tests/ArrowTests/Array/StructArrayTests.swift +++ b/Tests/ArrowTests/Array/StructArrayTests.swift @@ -22,7 +22,7 @@ struct StructArrayTests { @Test func testStructArray() { // Create builders for struct fields let idBuilder = ArrayBuilderFixedWidth() - let nameBuilder = ArrayBuilderVariableLength() + let nameBuilder = ArrayBuilderVariableLength() // Create struct builder let structBuilder = ArrayBuilderStruct(fields: [ From 7829c6ec75b854f86baa7be9dce5fa3740d0366f Mon Sep 17 00:00:00 2001 From: Will Temperley Date: Fri, 28 Nov 2025 13:02:27 +0800 Subject: [PATCH 07/10] Protocols over Binary and Utf8 added and used in json integraton testing approach. --- Sources/Arrow/Array/Array.swift | 38 +++++----- Sources/Arrow/Array/Builder.swift | 4 +- .../ArrowIPCTests/Gold/ArrowJSONEncoder.swift | 75 +++++++++++++++++-- .../ArrowIPCTests/Gold/ArrowTestingGold.swift | 6 +- .../ArrowIPCTests/Gold/ArrowTestingJSON.swift | 30 +++++++- 5 files changed, 118 insertions(+), 35 deletions(-) diff --git a/Sources/Arrow/Array/Array.swift b/Sources/Arrow/Array/Array.swift index 06cd331..fded369 100644 --- a/Sources/Arrow/Array/Array.swift +++ b/Sources/Arrow/Array/Array.swift @@ -44,12 +44,12 @@ public protocol ArrowArrayOfString { } extension ArrowArrayVariable: ArrowArrayOfString where ItemType == String {} -public protocol ArrowArrayOfData { - var length: Int { get } - subscript(index: Int) -> Data? { get } -} -extension ArrowArrayFixedSizeBinary: ArrowArrayOfData where ItemType == Data {} -extension ArrowArrayVariable: ArrowArrayOfData where ItemType == Data {} +//public protocol ArrowArrayOfData { +// var length: Int { get } +// subscript(index: Int) -> Data? { get } +//} +//extension ArrowArrayFixedSizeBinary: ArrowArrayOfData where ItemType == Data {} +//extension ArrowArrayVariable: ArrowArrayOfData where ItemType == Data {} public protocol ArrowArrayOfList { var length: Int { get } @@ -145,10 +145,7 @@ public struct ArrowArrayNumeric: } } -public struct ArrowArrayFixedSizeBinary: ArrowArrayProtocol -where - ValueBuffer: VariableLengthBufferProtocol -{ +public struct ArrowArrayFixedSizeBinary: ArrowArrayProtocol { public typealias ItemType = Data public let offset: Int public let length: Int @@ -160,14 +157,14 @@ where public var nullCount: Int { nullBuffer.nullCount } let nullBuffer: NullBuffer - let valueBuffer: ValueBuffer + let valueBuffer: any VariableLengthBufferProtocol public init( offset: Int = 0, length: Int, byteWidth: Int, nullBuffer: NullBuffer, - valueBuffer: ValueBuffer + valueBuffer: any VariableLengthBufferProtocol ) { self.offset = offset self.length = length @@ -176,7 +173,7 @@ where self.valueBuffer = valueBuffer } - public subscript(index: Int) -> ValueBuffer.ElementType? { + public subscript(index: Int) -> ItemType? { guard nullBuffer.isSet(index) else { return nil } let startIndex = index * byteWidth return valueBuffer.loadVariable(at: startIndex, arrayLength: byteWidth) @@ -193,6 +190,16 @@ where } } +protocol BinaryArrayProtocol: ArrowArrayProtocol where ItemType == Data {} +extension ArrowArrayFixedSizeBinary: BinaryArrayProtocol {} +extension ArrowArrayVariable: BinaryArrayProtocol +where ItemType == Data, OffsetType: FixedWidthInteger & SignedInteger {} + +protocol Utf8ArrayProtocol: ArrowArrayProtocol where ItemType == String {} +//extension ArrowArrayFixedSize: BinaryArrayProtocol { } +extension ArrowArrayVariable: Utf8ArrayProtocol +where ItemType == String, OffsetType: FixedWidthInteger & SignedInteger {} + /// An Arrow array of variable-length types. public struct ArrowArrayVariable< ItemType: VariableLength, @@ -258,11 +265,6 @@ public struct ArrowArrayVariable< } } -public typealias ArrowArrayVariableInt32 = - ArrowArrayVariable -public typealias ArrowArrayVariableInt64 = - ArrowArrayVariable - /// An Arrow array of `Date`s with a resolution of 1 day. public struct ArrowArrayDate32: ArrowArrayProtocol { public typealias ItemType = Date diff --git a/Sources/Arrow/Array/Builder.swift b/Sources/Arrow/Array/Builder.swift index 9cabf4c..1913f40 100644 --- a/Sources/Arrow/Array/Builder.swift +++ b/Sources/Arrow/Array/Builder.swift @@ -120,9 +120,7 @@ public class ArrayBuilderFixedWidth: public class ArrayBuilderFixedSizedBinary: AnyArrayBuilder { - public typealias ArrayType = ArrowArrayFixedSizeBinary< - VariableLengthTypeBuffer - > + public typealias ArrayType = ArrowArrayFixedSizeBinary var length: Int let byteWidth: Int diff --git a/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift index f80bdfc..8c038c9 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift @@ -86,8 +86,15 @@ func encodeColumn( data = try extractFloatData(from: array, expectedType: Float32.self) case .float64: data = try extractFloatData(from: array, expectedType: Float64.self) + case .binary: + try extractBinaryData(from: array, into: &data, offsets: &offsets) + case .fixedSizeBinary(_): + try extractBinaryData(from: array, into: &data, offsets: &offsets) + offsets = nil // Fixed-size offsets are implicit. + case .utf8: + try extractUtf8Data(from: array, into: &data, offsets: &offsets) default: - print("Unhandled type: \(field.type)") + throw ArrowError.invalid("Unhandled field type: \(field.type)") } } return .init( @@ -163,17 +170,69 @@ func extractBoolData(from array: AnyArrowArrayProtocol) throws -> [DataValue] { } } -//func extractBinaryData( -// from array: AnyArrowArrayProtocol -//) throws -> [DataValue] { -// guard let binaryArray = array as? ArrowArrayBinary else { +//func extractBinaryDataX( +// from array: AnyArrowArrayProtocol, +// into dataValues: inout [DataValue]?, +// offsets: inout [Int]? +//) throws { +// guard let binaryArray = array as? any BinaryArrayProtocol else { // throw ArrowError.invalid("Expected binary array") // } // -// return (0.. Date: Fri, 28 Nov 2025 14:51:20 +0800 Subject: [PATCH 08/10] Add generated metadata test. --- Tests/ArrowIPCTests/Gold/ArrowGold.swift | 3 ++- Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift | 13 +++++-------- Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift | 13 +++++++------ 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/Tests/ArrowIPCTests/Gold/ArrowGold.swift b/Tests/ArrowIPCTests/Gold/ArrowGold.swift index 199f7a7..69ba2a4 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowGold.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowGold.swift @@ -146,7 +146,8 @@ extension ArrowGold.Column { count: count, validity: validity, offset: offset, - data: filteredData.isEmpty ? nil : filteredData, + data: filteredData, +// data: filteredData.isEmpty ? nil : filteredData, children: children?.map { $0.withoutJunkData() } ) } diff --git a/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift index 8c038c9..947373d 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift @@ -24,16 +24,12 @@ func encodeColumn( guard let array = array as? (any ArrowArrayProtocol) else { throw ArrowError.invalid("Expected ArrowArray, got \(type(of: array))") } - - var validity: [Int] = [] - - for i in 0.. 0 { @@ -61,6 +57,7 @@ func encodeColumn( let childColumn = try encodeColumn( array: listArray.values, field: listField) children = [childColumn] + data = nil // List arrays point to child arrays, not their data. case .boolean: data = try extractBoolData(from: array) diff --git a/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift b/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift index 16d2378..842ec6c 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift @@ -48,10 +48,9 @@ struct ArrowTestingJSON { "generated_primitive_no_batches", "generated_primitive_zerolength", "generated_binary", -// "generated_binary_zerolength", - // "generated_binary_no_batches", - // "generated_custom_metadata", - // "generated_nested", +// "generated_binary_zerolength", + "generated_custom_metadata", // TODO: replicate the gold metadata tests +// "generated_nested", ] // @Test(.serialized, arguments: testCases) @@ -91,7 +90,7 @@ struct ArrowTestingJSON { // This is just useful for pin-pointing differences. if actual != expected { - print(expectedColumn.name) + print("==== \(expectedColumn.name) ====") #expect(actual.validity == expected.validity) #expect(actual.offset == expected.offset) @@ -99,7 +98,9 @@ struct ArrowTestingJSON { guard let actualData = actual.data, let expectedData = expected.data, let validity = actual.validity else { - fatalError() +// fatalError() + #expect(false) + return } for (i, isValid) in validity.enumerated() { From 08ddb6917bee1d4ac906ad8286286d5be0b6ecfe Mon Sep 17 00:00:00 2001 From: Will Temperley Date: Fri, 28 Nov 2025 15:55:52 +0800 Subject: [PATCH 09/10] Directly test offset buffer contents. --- Sources/Arrow/ArrowType.swift | 3 +- Tests/ArrowIPCTests/Gold/ArrowGold.swift | 2 +- .../ArrowIPCTests/Gold/ArrowJSONEncoder.swift | 190 +++++++++--------- .../ArrowIPCTests/Gold/ArrowTestingJSON.swift | 8 +- 4 files changed, 96 insertions(+), 107 deletions(-) diff --git a/Sources/Arrow/ArrowType.swift b/Sources/Arrow/ArrowType.swift index ddfc539..829c9de 100644 --- a/Sources/Arrow/ArrowType.swift +++ b/Sources/Arrow/ArrowType.swift @@ -518,7 +518,7 @@ extension ArrowType { /// Returns true if the type is primitive: (numeric, temporal). @inlinable - public func isPrimitive() -> Bool { + public var isPrimitive: Bool { self.isNumeric || self.isTemporal } @@ -1023,7 +1023,6 @@ extension ArrowType { } else if from == "u" { return .utf8 } - throw .notImplemented } } diff --git a/Tests/ArrowIPCTests/Gold/ArrowGold.swift b/Tests/ArrowIPCTests/Gold/ArrowGold.swift index 69ba2a4..e685093 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowGold.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowGold.swift @@ -147,7 +147,7 @@ extension ArrowGold.Column { validity: validity, offset: offset, data: filteredData, -// data: filteredData.isEmpty ? nil : filteredData, + // data: filteredData.isEmpty ? nil : filteredData, children: children?.map { $0.withoutJunkData() } ) } diff --git a/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift index 947373d..018e4ea 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift @@ -16,49 +16,68 @@ import Foundation @testable import Arrow +/// Encode an array to the gold testing JSON format. +/// - Parameters: +/// - array: The array to encode. +/// - field: The field associated with the array. +/// - Throws: An `ArrowError` if encoding fails. +/// - Returns: The column exactly as the test format expects it. +/// Note the junk values present in the test data are not replicated here therefore these need to be +/// removed from test data before comparison happens. func encodeColumn( array: AnyArrowArrayProtocol, field: ArrowField -) throws -> ArrowGold.Column { +) throws(ArrowError) -> ArrowGold.Column { guard let array = array as? (any ArrowArrayProtocol) else { - throw ArrowError.invalid("Expected ArrowArray, got \(type(of: array))") + throw .invalid("Expected ArrowArray, got \(type(of: array))") } // Validity is always present in the gold files. let validity: [Int] = (0.. 0 { - switch field.type { - // Test the actual array interface case .list(let listField): guard let listArray = array as? ArrowArrayOfList else { throw ArrowError.invalid("Expected list array") } // Build offsets by using the array interface - var computedOffsets: [Int] = [0] - var currentOffset = 0 - - for i in 0..( from array: AnyArrowArrayProtocol, expectedType: T.Type -) throws -> [DataValue] { +) throws(ArrowError) -> [DataValue] { guard let typedArray = array as? ArrowArrayNumeric else { - throw ArrowError.invalid("Expected \(T.self) array, got \(type(of: array))") + throw .invalid("Expected \(T.self) array, got \(type(of: array))") } - return try (0..( from array: AnyArrowArrayProtocol, expectedType: T.Type -) throws -> [DataValue] { +) throws(ArrowError) -> [DataValue] { guard let typedArray = array as? ArrowArrayNumeric else { throw ArrowError.invalid("Expected \(T.self) array, got \(type(of: array))") } - let encoder = JSONEncoder() let decoder = JSONDecoder() - - return try (0.. [DataValue] { +func extractBoolData(from array: AnyArrowArrayProtocol) throws(ArrowError) + -> [DataValue] +{ guard let typedArray = array as? ArrowArrayBoolean else { - throw ArrowError.invalid("Expected boolean array, got \(type(of: array))") + throw .invalid("Expected boolean array, got \(type(of: array))") } return (0.. [DataValue] { } } -//func extractBinaryDataX( -// from array: AnyArrowArrayProtocol, -// into dataValues: inout [DataValue]?, -// offsets: inout [Int]? -//) throws { -// guard let binaryArray = array as? any BinaryArrayProtocol else { -// throw ArrowError.invalid("Expected binary array") -// } -// -// -// dataValues = (0.. Date: Sat, 29 Nov 2025 12:50:01 +0800 Subject: [PATCH 10/10] Full-schema comparison tests green. --- Sources/Arrow/Array/Array.swift | 47 ++---- Sources/Arrow/ArrowSchema.swift | 1 - Sources/ArrowIPC/ArrowReader.swift | 40 +++-- Sources/ArrowIPC/ArrowType+IPC.swift | 35 +--- Tests/ArrowIPCTests/Gold/ArrowGold.swift | 105 ++++++++++-- .../ArrowIPCTests/Gold/ArrowJSONEncoder.swift | 47 +++--- .../ArrowIPCTests/Gold/ArrowTestingGold.swift | 11 +- .../ArrowIPCTests/Gold/ArrowTestingJSON.swift | 66 ++++---- .../Gold/ArrowType+validation.swift | 149 ++++++++++++++++++ Tests/ArrowIPCTests/TestSupport.swift | 2 +- Tests/ArrowTests/Array/ListArrayTests.swift | 4 +- 11 files changed, 359 insertions(+), 148 deletions(-) diff --git a/Sources/Arrow/Array/Array.swift b/Sources/Arrow/Array/Array.swift index fded369..c4d427c 100644 --- a/Sources/Arrow/Array/Array.swift +++ b/Sources/Arrow/Array/Array.swift @@ -38,25 +38,30 @@ extension ArrowArrayProtocol { // MARK: Capability protocols. -public protocol ArrowArrayOfString { +public protocol StringArrayProtocol { var length: Int { get } subscript(index: Int) -> String? { get } } -extension ArrowArrayVariable: ArrowArrayOfString where ItemType == String {} +extension ArrowArrayVariable: StringArrayProtocol where ItemType == String {} -//public protocol ArrowArrayOfData { -// var length: Int { get } -// subscript(index: Int) -> Data? { get } -//} -//extension ArrowArrayFixedSizeBinary: ArrowArrayOfData where ItemType == Data {} -//extension ArrowArrayVariable: ArrowArrayOfData where ItemType == Data {} +protocol BinaryArrayProtocol: ArrowArrayProtocol where ItemType == Data {} +extension ArrowArrayFixedSizeBinary: BinaryArrayProtocol {} +extension ArrowArrayVariable: BinaryArrayProtocol +where ItemType == Data, OffsetType: FixedWidthInteger & SignedInteger {} -public protocol ArrowArrayOfList { +protocol Utf8ArrayProtocol: ArrowArrayProtocol where ItemType == String {} +extension ArrowArrayVariable: Utf8ArrayProtocol +where ItemType == String, OffsetType: FixedWidthInteger & SignedInteger {} + +public protocol ListArrayProtocol { var length: Int { get } var values: AnyArrowArrayProtocol { get } subscript(index: Int) -> AnyArrowArrayProtocol? { get } } -extension ArrowListArray: ArrowArrayOfList {} +extension ArrowListArray: ListArrayProtocol {} +extension ArrowFixedSizeListArray: ListArrayProtocol {} + +// MARK: Array implementations. /// An Arrow array of booleans using the three-valued logical model (true / false / null). public struct ArrowArrayBoolean: ArrowArrayProtocol { @@ -190,16 +195,6 @@ public struct ArrowArrayFixedSizeBinary: ArrowArrayProtocol { } } -protocol BinaryArrayProtocol: ArrowArrayProtocol where ItemType == Data {} -extension ArrowArrayFixedSizeBinary: BinaryArrayProtocol {} -extension ArrowArrayVariable: BinaryArrayProtocol -where ItemType == Data, OffsetType: FixedWidthInteger & SignedInteger {} - -protocol Utf8ArrayProtocol: ArrowArrayProtocol where ItemType == String {} -//extension ArrowArrayFixedSize: BinaryArrayProtocol { } -extension ArrowArrayVariable: Utf8ArrayProtocol -where ItemType == String, OffsetType: FixedWidthInteger & SignedInteger {} - /// An Arrow array of variable-length types. public struct ArrowArrayVariable< ItemType: VariableLength, @@ -376,17 +371,7 @@ where } } -protocol ListArrayProtocol: ArrowArrayProtocol { - var length: Int { get } - var values: AnyArrowArrayProtocol { get } -} - -extension ArrowListArray: ListArrayProtocol { - - // No implementation needed - offsetsBuffer and values already exist - // Swift automatically satisfies the protocol requirements -} - +/// An Arrow list array with fixed size elements. public struct ArrowFixedSizeListArray: ArrowArrayProtocol { public let offset: Int public let length: Int diff --git a/Sources/Arrow/ArrowSchema.swift b/Sources/Arrow/ArrowSchema.swift index af93450..735f39d 100644 --- a/Sources/Arrow/ArrowSchema.swift +++ b/Sources/Arrow/ArrowSchema.swift @@ -25,7 +25,6 @@ public final class ArrowSchema: Sendable { for (index, field) in fields.enumerated() { fieldLookup[field.name] = index } - self.fields = fields self.fieldLookup = fieldLookup self.metadata = metadata diff --git a/Sources/ArrowIPC/ArrowReader.swift b/Sources/ArrowIPC/ArrowReader.swift index 1aac278..cb21629 100644 --- a/Sources/ArrowIPC/ArrowReader.swift +++ b/Sources/ArrowIPC/ArrowReader.swift @@ -499,23 +499,7 @@ public struct ArrowReader { guard let field = schema.fields(at: index) else { throw .invalid("Field not found at index: \(index)") } - let fieldType: ArrowType = try .type(for: field) - guard let fieldName = field.name else { - throw .invalid("Field name not found") - } - let fieldMetadata = (0.. Self { + let fieldType: ArrowType = try .type(for: field) + guard let fieldName = field.name else { + throw .invalid("Field name not found") + } + let fieldMetadata = (0.. Self { - guard let data = self.data, let validity = self.validity else { - return self + guard let validity = self.validity else { + fatalError() } - let filteredData = data.enumerated().map { index, value in + let filteredData = data?.enumerated().map { index, value in validity[index] == 1 ? value : .null } return Self( @@ -147,8 +212,22 @@ extension ArrowGold.Column { validity: validity, offset: offset, data: filteredData, - // data: filteredData.isEmpty ? nil : filteredData, children: children?.map { $0.withoutJunkData() } ) } } + +/// Decode a list of `KeyValue` to a dictionary. +/// - Parameter keyValues: The key values to convert. +/// - Throws: If decoding fails. +/// - Returns: A metadata dictionary. +private func buildDictionary( + from keyValues: inout any UnkeyedDecodingContainer +) throws -> [String: String]? { + var dict: [String: String] = [:] + while !keyValues.isAtEnd { + let pair = try keyValues.decode(KeyValue.self) + dict[pair.key] = pair.value + } + return dict.isEmpty ? nil : dict +} diff --git a/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift index 018e4ea..ed923e5 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowJSONEncoder.swift @@ -28,7 +28,6 @@ func encodeColumn( array: AnyArrowArrayProtocol, field: ArrowField ) throws(ArrowError) -> ArrowGold.Column { - guard let array = array as? (any ArrowArrayProtocol) else { throw .invalid("Expected ArrowArray, got \(type(of: array))") } @@ -58,26 +57,38 @@ func encodeColumn( if array.length > 0 { switch field.type { case .list(let listField): - guard let listArray = array as? ArrowArrayOfList else { + guard let listArray = array as? ListArrayProtocol else { throw ArrowError.invalid("Expected list array") } - // Build offsets by using the array interface - // var computedOffsets: [Int] = [0] - // var currentOffset = 0 - - // for i in 0..( } } -func extractBoolData(from array: AnyArrowArrayProtocol) throws(ArrowError) - -> [DataValue] -{ +func extractBoolData( + from array: AnyArrowArrayProtocol +) throws(ArrowError) -> [DataValue] { guard let typedArray = array as? ArrowArrayBoolean else { throw .invalid("Expected boolean array, got \(type(of: array))") } diff --git a/Tests/ArrowIPCTests/Gold/ArrowTestingGold.swift b/Tests/ArrowIPCTests/Gold/ArrowTestingGold.swift index 904096c..ab15cbf 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowTestingGold.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowTestingGold.swift @@ -96,7 +96,7 @@ struct ArrowTestingIPC { #expect(testCase.batches.count == recordBatches.count) - let expectedMetadata = testCase.schema.metadata?.asDictionary ?? [:] + let expectedMetadata = testCase.schema.metadata ?? [:] #expect(expectedMetadata == arrowSchema.metadata) for (testBatch, recordBatch) in zip(testCase.batches, recordBatches) { @@ -113,8 +113,7 @@ struct ArrowTestingIPC { #expect(arrowField.type.matches(expectedField: expectedField)) #expect(arrowArray.length == expectedColumn.count) #expect(arrowField.name == expectedColumn.name) - let expectedMetadata = expectedField.metadata?.asDictionary ?? [:] - #expect(arrowField.metadata == expectedMetadata) + // #expect(arrowField.metadata == expectedMetadata) switch arrowField.type { case .fixedSizeBinary(let byteWidth): @@ -179,10 +178,8 @@ struct ArrowTestingIPC { listSize: listSize ) break - // case .strct(let fields): - default: - // throw ArrowError.invalid( + // throw ArrowError.invalid( print( "TODO: Implement test for arrow field type: \(arrowField.type)") } @@ -454,7 +451,7 @@ struct ArrowTestingIPC { } } case .utf8: - guard let binaryArray = actual as? ArrowArrayOfString else { + guard let binaryArray = actual as? StringArrayProtocol else { Issue.record("Binary array expected.") return } diff --git a/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift b/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift index 046c4db..ac22c98 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowTestingJSON.swift @@ -1,17 +1,3 @@ -// ArrowTestingIPC.swift -// Arrow -// -// Created by Will Temperley on 26/11/2025. All rights reserved. -// Copyright 2025 Will Temperley. -// -// Copying or reproduction of this file via any medium requires prior express -// written permission from the copyright holder. -// ----------------------------------------------------------------------------- -/// -/// Implementation notes, links and internal documentation go here. -/// -// ----------------------------------------------------------------------------- - // Copyright 2025 The Columnar Swift Contributors // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -48,15 +34,14 @@ struct ArrowTestingJSON { "generated_primitive_no_batches", "generated_primitive_zerolength", "generated_binary", - // "generated_binary_zerolength", - "generated_custom_metadata", // TODO: replicate the gold metadata tests - // "generated_nested", + "generated_binary_zerolength", + "generated_custom_metadata", + "generated_nested", + "generated_recursive_nested", ] - // @Test(.serialized, arguments: testCases) @Test(arguments: testCases) func json(name: String) throws { - let resourceURL = try loadTestResource( name: name, withExtension: "json.lz4", @@ -75,34 +60,45 @@ struct ArrowTestingJSON { #expect(testCase.batches.count == recordBatches.count) - for (testBatch, recordBatch) in zip(testCase.batches, recordBatches) { + // Strip placeholder values. + let expectedBatches = testCase.batches.map { batch in + ArrowGold.Batch( + count: batch.count, + columns: batch.columns.map { $0.withoutJunkData() } + ) + } + let expectedSchema = testCase.schema + let expectedDictionaries = testCase.dictionaries + let _ = ArrowGold( + schema: expectedSchema, + batches: expectedBatches, + dictionaries: expectedDictionaries + ) + let actualSchema = encodeSchema(schema: arrowSchema) + #expect(actualSchema == expectedSchema) + for (testBatch, recordBatch) in zip(expectedBatches, recordBatches) { for ( (arrowField, arrowArray), - (expectedField, expectedColumn) + (_, expected) ) in zip( zip(arrowSchema.fields, recordBatch.arrays), zip(testCase.schema.fields, testBatch.columns) ) { let actual = try encodeColumn(array: arrowArray, field: arrowField) - let expected = expectedColumn.withoutJunkData() #expect(actual == expected) // This is just useful for pin-pointing differences. if actual != expected { - print("==== \(expectedColumn.name) ====") + print("==== \(expected.name) ====") #expect(actual.validity == expected.validity) #expect(actual.offset == expected.offset) - if actual.data != expected.data { guard let actualData = actual.data, let expectedData = expected.data, let validity = actual.validity else { - // fatalError() - #expect(false) - return + throw ArrowError.invalid("Expected and actual data both nil") } - for (i, isValid) in validity.enumerated() { if isValid == 1 { let aV = actualData[i] @@ -115,4 +111,18 @@ struct ArrowTestingJSON { } } } + +} + +private func encodeSchema(schema: ArrowSchema) -> ArrowGold.Schema { + let fields = schema.fields.map { arrowField in + arrowField.toGoldField() + } + let encodedMetadata: [String: String]? = + switch schema.metadata { + case .none: nil + case .some(let metadata): metadata.isEmpty ? nil : metadata + } + + return .init(fields: fields, metadata: encodedMetadata) } diff --git a/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift b/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift index 363ccf0..c802ee5 100644 --- a/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift +++ b/Tests/ArrowIPCTests/Gold/ArrowType+validation.swift @@ -155,3 +155,152 @@ extension TimeUnit { } } } + +extension ArrowField { + func toGoldField() -> ArrowGold.Field { + ArrowGold.Field( + name: name, + type: type.toGoldFieldType(), + nullable: isNullable, + children: type.goldChildren(), + dictionary: nil, // TODO: handle dictionary encoding if needed + metadata: self.metadata.isEmpty ? nil : self.metadata + ) + } +} + +extension ArrowType { + func toGoldFieldType() -> ArrowGold.FieldType { + let name: String + var byteWidth: Int? + var bitWidth: Int? + var isSigned: Bool? = nil + var precision: String? = nil + var scale: Int? = nil + var unit: String? = nil + var timezone: String? = nil + var listSize: Int? = nil + + switch self { + case .int8: + name = "int" + bitWidth = 8 + isSigned = true + case .int16: + name = "int" + bitWidth = 16 + isSigned = true + case .int32: + name = "int" + bitWidth = 32 + isSigned = true + case .int64: + name = "int" + bitWidth = 64 + isSigned = true + case .uint8: + name = "int" + bitWidth = 8 + isSigned = false + case .uint16: + name = "int" + bitWidth = 16 + isSigned = false + case .uint32: + name = "int" + bitWidth = 32 + isSigned = false + case .uint64: + name = "int" + bitWidth = 64 + isSigned = false + case .float16: + name = "floatingpoint" + precision = "HALF" + case .float32: + name = "floatingpoint" + precision = "SINGLE" + case .float64: + name = "floatingpoint" + precision = "DOUBLE" + case .boolean: + name = "bool" + case .utf8: + name = "utf8" + case .binary: + name = "binary" + case .fixedSizeBinary(let byteWidth_): + byteWidth = Int(byteWidth_) + name = "fixedsizebinary" + case .date32: + name = "date" + unit = "DAY" + case .date64: + name = "date" + unit = "MILLISECOND" + case .timestamp(let unit_, let timezone_): + name = "timestamp" + unit = unit_.jsonName + timezone = timezone_ + case .time32(let unit_): + name = "time" + bitWidth = 32 + unit = unit_.jsonName + case .time64(let unit_): + name = "time" + bitWidth = 64 + unit = unit_.jsonName + case .duration(let unit_): + name = "duration" + bitWidth = nil + unit = unit_.jsonName + case .decimal128(let precision_, let scale_): + name = "decimal" + bitWidth = 128 + precision = String(precision_) + scale = Int(scale_) + case .decimal256(let precision_, let scale_): + name = "decimal" + bitWidth = 256 + precision = String(precision_) + scale = Int(scale_) + case .list(let field): + name = "list" + case .largeList(let field): + name = "largelist" + case .fixedSizeList(let field, let listSize_): + name = "fixedsizelist" + listSize = Int(listSize_) + case .strct(let fields): + name = "struct" + case .map: + name = "struct" + default: + fatalError("Unhandled type: \(self)") + } + return ArrowGold.FieldType( + name: name, + byteWidth: byteWidth, + bitWidth: bitWidth, + isSigned: isSigned, + precision: precision, + scale: scale, + unit: unit, + timezone: timezone, + listSize: listSize + ) + } + + func goldChildren() -> [ArrowGold.Field]? { + switch self { + case .list(let field), .largeList(let field), .fixedSizeList(let field, _): + return [field.toGoldField()] + case .strct(let fields): + return fields.map { $0.toGoldField() } + default: + // May need to implement different nested types. + if isNested { fatalError("Not implemented for nested ArrowType") } + return [] + } + } +} diff --git a/Tests/ArrowIPCTests/TestSupport.swift b/Tests/ArrowIPCTests/TestSupport.swift index 0910884..f19c28a 100644 --- a/Tests/ArrowIPCTests/TestSupport.swift +++ b/Tests/ArrowIPCTests/TestSupport.swift @@ -64,7 +64,7 @@ func checkBoolRecordBatch(recordBatch: RecordBatch) { #expect(one[4] == true) guard - let utf8Column = recordBatch.arrays[1] as? ArrowArrayOfString + let utf8Column = recordBatch.arrays[1] as? StringArrayProtocol else { Issue.record("Failed to cast column to ArrowUtf8Array") return diff --git a/Tests/ArrowTests/Array/ListArrayTests.swift b/Tests/ArrowTests/Array/ListArrayTests.swift index ce575d8..a79e3ac 100644 --- a/Tests/ArrowTests/Array/ListArrayTests.swift +++ b/Tests/ArrowTests/Array/ListArrayTests.swift @@ -71,8 +71,8 @@ struct ListArrayTests { let listArray = builder.finish() - let list0 = listArray[0] as? ArrowArrayOfString - let list1 = listArray[2] as? ArrowArrayOfString + let list0 = listArray[0] as? StringArrayProtocol + let list1 = listArray[2] as? StringArrayProtocol #expect(list0?.length == 2) #expect(list0?[0] == "a") #expect(list0?[1] == "b")