Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
46bb6b8
Full testing of IPC reader for primitive, binary, nested and custom m…
willtemperley Nov 30, 2025
4eadd0e
Map support tests green against generated_map gold files.
willtemperley Nov 30, 2025
7600a37
Move type parsing to ArrowField+IPC.
willtemperley Nov 30, 2025
e597459
Public array protocols no longer require an associated type.
willtemperley Nov 30, 2025
0289669
Provide ArrowIPC library
willtemperley Dec 3, 2025
573ddab
Make ArrowReader read public.
willtemperley Dec 3, 2025
77e3690
Make IPC fields public.
willtemperley Dec 3, 2025
14839c1
Make FixedWidthBuffer public and add ability to copy from array.
willtemperley Dec 3, 2025
d2f2378
Make NullBufferBuilder public
willtemperley Dec 3, 2025
436fd3e
Make appendValid public
willtemperley Dec 3, 2025
1366b6c
Made AnyArrayBuilder public.
willtemperley Dec 5, 2025
b4b8e57
Gold test working for generated_datetime.
willtemperley Dec 10, 2025
6cf3405
ArrowError is now a struct.
willtemperley Dec 10, 2025
a18c70e
Gold test generated_duration passes. ArrowError is now a struct.
willtemperley Dec 10, 2025
ec2d364
Make RecordBatch Sendable using unchecked Sendable on buffers.
willtemperley Dec 23, 2025
7242126
Remove old arrow reader / writer. Breaks arrow flight.
willtemperley Dec 26, 2025
0b58d13
Delete old Arrow array implementation.
willtemperley Dec 26, 2025
6e7721c
Reinstate some table tests.
willtemperley Dec 26, 2025
1ce717f
Reinstate some table tests.
willtemperley Dec 26, 2025
5aa3979
Merge branch 'main' of https://github.com/willtemperley/swift-arrow
willtemperley Dec 26, 2025
3f783c4
List support in arrow writer. Using createVector for native structs i…
willtemperley Dec 27, 2025
737bf31
Added Arrow Gold write test support.
willtemperley Dec 28, 2025
e770ee9
Simplify List array type signature.
willtemperley Dec 29, 2025
ec77afb
Fix: Schema issue: nested lists.
willtemperley Dec 29, 2025
6bdccfc
Fix: Write list buffers. See writeBufferInfo.
willtemperley Dec 29, 2025
a03684e
Fix: Add missing bytewidth to fixed size binary in ArrowWriter.
willtemperley Dec 29, 2025
4ad445b
ArrowWriter now writes custom metadata.
willtemperley Dec 30, 2025
c3168da
Writing custom metadata passes gold test. Adding support for writing …
willtemperley Dec 30, 2025
97565a1
Multiple fixes in ArrowWriter: buffer order, buffer sizes. Writing te…
willtemperley Dec 30, 2025
adb2287
Map serialization via List array now supported.
willtemperley Dec 31, 2025
4e7d46f
Merge.
willtemperley Dec 31, 2025
77a5e2d
Fix Arrow Table. Add type-erased chunked array.
willtemperley Jan 2, 2026
0928c5e
Add chunked array tests.
willtemperley Jan 2, 2026
9ca2b9b
Add swift-subprocess to IPC testing to integrate with PyArrow.
willtemperley Jan 3, 2026
8685565
Add Binary View array and builder.
willtemperley Jan 4, 2026
3d28270
Add binary views to IPC.
willtemperley Jan 5, 2026
dbb04e8
Add binaryview to IPC writer.
willtemperley Jan 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 14 additions & 13 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,11 @@ import PackageDescription
let package = Package(
name: "Arrow",
platforms: [
.macOS(.v15)
.macOS(.v26)
],
products: [
.library(
name: "Arrow",
targets: ["Arrow"]
)
.library(name: "Arrow", targets: ["Arrow"]),
.library(name: "ArrowIPC", targets: ["ArrowIPC"]),
],
dependencies: [
.package(
Expand All @@ -46,9 +44,13 @@ let package = Package(
from: "1.29.0"
),
.package(
url: "https://github.com/apple/swift-binary-parsing",
url: "https://github.com/apple/swift-binary-parsing.git",
from: "0.0.1"
),
.package(
url: "https://github.com/swiftlang/swift-subprocess.git",
branch: "main"
),
],
targets: [
.target(
Expand Down Expand Up @@ -81,16 +83,11 @@ let package = Package(
// build: .unsafeFlags(["-warnings-as-errors"])
]
),
// .target(
// name: "ArrowC",
// swiftSettings: [
// // build: .unsafeFlags(["-warnings-as-errors"])
// ]
// ),
.target(
name: "ArrowFlight",
dependencies: [
"Arrow",
"ArrowIPC",
.product(name: "GRPC", package: "grpc-swift"),
.product(name: "SwiftProtobuf", package: "swift-protobuf"),
],
Expand All @@ -111,7 +108,11 @@ let package = Package(
),
.testTarget(
name: "ArrowIPCTests",
dependencies: ["Arrow", "ArrowIPC"],
dependencies: [
"Arrow",
"ArrowIPC",
.product(name: "Subprocess", package: "swift-subprocess"),
],
resources: [
.copy("Resources/")
],
Expand Down
57 changes: 42 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,33 +1,59 @@
# Swift Arrow

![Swift 6.2](https://img.shields.io/badge/Swift-6.2-orange?style=for-the-badge&logo=swift&logoColor=white)

A Swift implementation of Apache Arrow, the universal columnar format for fast data interchange and in-memory analytics.

This is a **work in progress**. Do not use in production. Progress is fast however, expect a beta in December.
The in-memory contiguous buffers allow constant-time random access to large, structured and strongly-typed datasets.

## Project status:

IPC serialization / deserialization has been tested against the Arrow integration testing JSON files, using the following strategy:

1. Read the [Arrow cpp21 generated files](https://github.com/apache/arrow-testing/tree/master/data/arrow-ipc-stream/integration/cpp-21.0.0) into memory.
2. Encode the results to Codable & Equatable structs that can read and write the [test data format](https://arrow.apache.org/docs/format/Integration.html#json-test-data-format).
3. Read the test JSON into the same Codable & Equatable struct and compare with the deserialized results, using Swift equality.

IPC serialization uses the same methodology, except a serialization-deserialization round-trip to/from Arrow IPC is performed prior to step 2, i.e. the results under test have been deserialized from IPC, re-serialized to IPC and deserialized again before being compared to the JSON.

The following types are fully supported:

* Primitive types: boolean, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64.
* Temporal types: timestamp, date32, date64, time32, time64, duration.
* Variable length types: binary and string, plus their fixed-width equivalents.
* Nested and recursively nested types: lists and structs, structs of lists etc.
* Maps: Represented as list of key-values, which is spec compliant, however the public API will change.
* Binary views: binaryView and utf8View.

All binary arrays (variable, fixed and view) can be accessed via BinaryArrayProtocol. The same applies to StringArrayProtocol.

## Array interface

Arrow arrays are backed by a standard memory layout:
https://arrow.apache.org/docs/format/Columnar.html

In Swift-Arrow, every array conforms to:
In Swift-Arrow, every array has the following type-erased capabilities:

```swift
public protocol ArrowArrayProtocol {
associatedtype ItemType
subscript(_ index: Int) -> ItemType? { get }
public protocol AnyArrowArrayProtocol: Sendable {
var offset: Int { get }
var length: Int { get }
var nullCount: Int { get }
func slice(offset: Int, length: Int) -> Self
func any(at index: Int) -> Any?
var bufferSizes: [Int] { get }
var buffers: [ArrowBufferProtocol] { get }
}
```

The in-memory contiguous buffers allow constant-time random access.
Every array also supports typed access:

Every Arrow array supports nullable elements. This is encoded as an optional bit-packed validity buffer aka null array aka bitfield.
In pseudocode, bitfield[index] == 0 means null or invalid, and bitfield[index] == 1 means not null or valid.
```swift
public protocol ArrowArrayProtocol<ItemType>: AnyArrowArrayProtocol {
associatedtype ItemType
subscript(_ index: Int) -> ItemType? { get }
}
```

Every Arrow array supports nullable elements. This is encoded as an optional bit-packed validity buffer.
Fixed-width types are encoded back-to-back, with placeholder values for nulls. For example the array:

```swift
Expand Down Expand Up @@ -101,19 +127,20 @@ typealias ArrowArrayUtf8 = ArrowArrayVariable<
>
```


## Relationship to Arrow-Swift

This project is based on Arrow-Swift, the official Swift implementation of Apache Arrow. The decision was made to at least temporarily operate independently of the Apache Software Foundation (ASF). Currently there are no active ASF maintainers with knowledge of Swift, and the only [Apache approved CI for Swift](https://github.com/apache/infrastructure-actions/blob/main/approved_patterns.yml) is [setup-swift which is unmaintained](https://github.com/swift-actions/setup-swift/issues), leading to intermittent CI failures. This has led to delays in much-needed fixes being implemented.
This project is based on Arrow-Swift, the official Swift implementation of Apache Arrow. The decision was made to at least temporarily operate independently of the Apache Software Foundation (ASF) to improve development velocity.

The intention is to continue contributing to the official Apache-Swift repository, however changes can be iterated on more quickly here.

Original source: https://github.com/apache/arrow-swift

Changes made since forking Arrow-Swift:
* `ArrowType` has been moved from a class hierarchy to an enum to improve usability and concurrency support.
* IPC is now fully zero-copy, whereas previously file data were copied to pointer-backed arrays.
* Gold-standard IPC tests have been added.
* CI uses the swiftlang workflows: https://github.com/swiftlang/github-workflows
* `ArrowType` has been moved from a class hierarchy to an enum to improve concurrency support.
* Tests have been migrated to Swift Testing.
* A migration from reference to value types, where appropriate, has begun.
* A Dockerfile for compiling ArrowFlight protocol buffers and grpc classes is provided.
* C export has been made Swift 6 compatible through MainActor annotations. This is a workaround.
* C import/export has been removed.

20 changes: 20 additions & 0 deletions Scripts/readArrowIPC.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Useful for debugging IPC writing issues.
#
# Usage: python readArrowIPC.py <arrow-ipc-file>
#
# Opens the given file with PyArrow's IPC file reader and prints the schema,
# the number of record batches, and the row/column counts of every batch.
# Exits with status 1 when the path is missing or the file cannot be read.
import pyarrow as pa
import sys

print(f"PyArrow version: {pa.__version__}")

# Report a missing argument explicitly rather than letting it surface as an
# opaque "list index out of range" from the catch-all handler below.
if len(sys.argv) < 2:
    print(f"✗ Error: missing file argument. Usage: {sys.argv[0]} <arrow-ipc-file>")
    sys.exit(1)

try:
    with open(sys.argv[1], 'rb') as f:
        reader = pa.ipc.open_file(f)
        print(f"Schema: {reader.schema}")
        print(f"Num batches: {reader.num_record_batches}")

        # Fetch each batch so that a malformed batch body is detected, not
        # just a malformed footer/schema.
        for i in range(reader.num_record_batches):
            batch = reader.get_batch(i)
            print(f"Batch {i}: {batch.num_rows} rows, {batch.num_columns} columns")

    print("✓ File read successfully")
except Exception as e:
    # Any failure (I/O or Arrow decoding) is reported and mapped to a non-zero
    # exit status so callers can detect it.
    print(f"✗ Error: {e}")
    sys.exit(1)
44 changes: 26 additions & 18 deletions Sources/Arrow/Array/Array.swift
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@

import Foundation

public protocol AnyArrowArrayProtocol {
/// The type-independent Arrow array capabilities.
public protocol AnyArrowArrayProtocol: Sendable {
var offset: Int { get }
var length: Int { get }
var nullCount: Int { get }
Expand All @@ -24,7 +25,10 @@ public protocol AnyArrowArrayProtocol {
var buffers: [ArrowBufferProtocol] { get }
}

internal protocol ArrowArrayProtocol: AnyArrowArrayProtocol {
/// Typed array conformance.
///
/// Public access to typed arrays is provided via concrete types or individual protocols as appropriate.
public protocol ArrowArrayProtocol<ItemType>: AnyArrowArrayProtocol {
associatedtype ItemType
subscript(_ index: Int) -> ItemType? { get }
}
Expand All @@ -38,28 +42,33 @@ extension ArrowArrayProtocol {

// MARK: Capability protocols.

public protocol StringArrayProtocol {
var length: Int { get }
/// A type which provides access to arrays of utf8 encoded `String`, with opaque offset types.
///
/// The underlying array may be `String` or `LargeString`.
public protocol StringArrayProtocol: AnyArrowArrayProtocol {
subscript(index: Int) -> String? { get }
}
extension ArrowArrayVariable: StringArrayProtocol where ItemType == String {}
extension ArrowArrayBinaryView: StringArrayProtocol where ItemType == String {}

protocol BinaryArrayProtocol: ArrowArrayProtocol where ItemType == Data {}
/// A type which provides access to arrays of `Data`, with opaque offset types.
///
/// The underlying array may have fixed or variable-length items.
protocol BinaryArrayProtocol: AnyArrowArrayProtocol {
subscript(index: Int) -> Data? { get }
}
extension ArrowArrayFixedSizeBinary: BinaryArrayProtocol {}
extension ArrowArrayVariable: BinaryArrayProtocol
where ItemType == Data, OffsetType: FixedWidthInteger & SignedInteger {}
extension ArrowArrayBinaryView: BinaryArrayProtocol where ItemType == Data {}

protocol Utf8ArrayProtocol: ArrowArrayProtocol where ItemType == String {}
extension ArrowArrayVariable: Utf8ArrayProtocol
where ItemType == String, OffsetType: FixedWidthInteger & SignedInteger {}

public protocol ListArrayProtocol {
var length: Int { get }
public protocol ListArrayProtocol: AnyArrowArrayProtocol {
var values: AnyArrowArrayProtocol { get }
subscript(index: Int) -> AnyArrowArrayProtocol? { get }
}
extension ArrowListArray: ListArrayProtocol {}
extension ArrowFixedSizeListArray: ListArrayProtocol {}
// TODO: Add large lists.

// MARK: Array implementations.

Expand Down Expand Up @@ -243,6 +252,7 @@ public struct ArrowArrayVariable<
let startOffset = offsetsBuffer[offsetIndex]
let endOffset = offsetsBuffer[offsetIndex + 1]

precondition(endOffset >= startOffset, "Corrupted Arrow data")
return valueBuffer.loadVariable(
at: Int(startOffset),
arrayLength: Int(endOffset - startOffset)
Expand Down Expand Up @@ -315,11 +325,9 @@ public struct ArrowArrayDate64: ArrowArrayProtocol {
}

/// An Arrow list array which may be nested arbitrarily.
public struct ArrowListArray<OffsetsBuffer>: ArrowArrayProtocol
where
OffsetsBuffer: FixedWidthBufferProtocol,
OffsetsBuffer.ElementType: FixedWidthInteger & SignedInteger
{
public struct ArrowListArray<
OffsetType: FixedWidthInteger & SignedInteger
>: ArrowArrayProtocol {
public let offset: Int
public let length: Int
public var bufferSizes: [Int] {
Expand All @@ -331,14 +339,14 @@ where
public var nullCount: Int { nullBuffer.nullCount }

let nullBuffer: NullBuffer
let offsetsBuffer: OffsetsBuffer
let offsetsBuffer: any FixedWidthBufferProtocol<OffsetType>
public let values: AnyArrowArrayProtocol

public init(
offset: Int = 0,
length: Int,
nullBuffer: NullBuffer,
offsetsBuffer: OffsetsBuffer,
offsetsBuffer: any FixedWidthBufferProtocol<OffsetType>,
values: AnyArrowArrayProtocol
) {
self.offset = offset
Expand Down
102 changes: 102 additions & 0 deletions Sources/Arrow/Array/ArrowArrayBinaryView.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// Copyright 2025 The Columnar Swift Contributors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/// An Arrow array whose elements are located through a buffer of `BinaryView`
/// entries.
///
/// Each view either carries its bytes inline or refers to a range inside one
/// of `dataBuffers`.
public struct ArrowArrayBinaryView<ItemType: VariableLength>: ArrowArrayProtocol
{
  /// Added to every subscript index before the buffers are consulted.
  public let offset: Int
  /// The element count this array was created with.
  public let length: Int
  /// Validity bits; an unset bit makes the subscript return `nil`.
  private let nullBuffer: NullBuffer
  /// One `BinaryView` per element.
  private let viewsBuffer: any FixedWidthBufferProtocol<BinaryView>
  /// Buffers addressed by the `bufferIndex` of non-inline views.
  let dataBuffers: [any VariableLengthBufferProtocol<ItemType>]

  /// The `length` of the null buffer, the views buffer and then each data
  /// buffer, in that order.
  public var bufferSizes: [Int] {
    var sizes: [Int] = [nullBuffer.length, viewsBuffer.length]
    sizes.append(contentsOf: dataBuffers.map { $0.length })
    return sizes
  }

  /// The null buffer, the views buffer and then each data buffer, in that
  /// order.
  public var buffers: [ArrowBufferProtocol] {
    var all: [ArrowBufferProtocol] = [nullBuffer, viewsBuffer]
    all.append(contentsOf: dataBuffers.map { $0 as ArrowBufferProtocol })
    return all
  }

  /// The null count reported by the null buffer.
  public var nullCount: Int { nullBuffer.nullCount }

  /// Creates an array over existing buffers without copying them.
  ///
  /// - Parameters:
  ///   - offset: Value added to each subscript index. Defaults to `0`.
  ///   - length: Number of elements in the array.
  ///   - nullBuffer: Validity bits for the elements.
  ///   - viewsBuffer: The `BinaryView` entries, one per element.
  ///   - dataBuffers: Buffers that non-inline views refer to by index.
  public init<Views: FixedWidthBufferProtocol<BinaryView>>(
    offset: Int = 0,
    length: Int,
    nullBuffer: NullBuffer,
    viewsBuffer: Views,
    dataBuffers: [any VariableLengthBufferProtocol<ItemType>]
  ) {
    self.offset = offset
    self.length = length
    self.nullBuffer = nullBuffer
    self.viewsBuffer = viewsBuffer
    self.dataBuffers = dataBuffers
  }

  /// Returns the element at `index`, or `nil` when its validity bit is unset.
  ///
  /// Traps with "Invalid buffer index" if a non-inline view names a data
  /// buffer that does not exist.
  public subscript(index: Int) -> ItemType? {
    let position = self.offset + index
    guard nullBuffer.isSet(position) else { return nil }

    let view = viewsBuffer[position]

    // Fast path: the bytes are stored inside the view itself.
    if view.isInline {
      return view.withInlineData { dataSpan in
        dataSpan.withUnsafeBufferPointer { buffer in
          ItemType(buffer)
        }
      }
    }

    // Otherwise the view refers to a range within one of the data buffers.
    let slot = Int(view.bufferIndex)
    let start = Int(view.offset)

    precondition(dataBuffers.indices.contains(slot), "Invalid buffer index")

    return dataBuffers[slot].loadVariable(
      at: start,
      arrayLength: Int(view.length)
    )
  }

  /// Returns an array with the given offset and length that shares every
  /// buffer with this one; nothing is copied.
  ///
  /// NOTE(review): `offset` is stored as given and is not added to
  /// `self.offset` — confirm whether callers slicing an already-sliced array
  /// are expected to pass an absolute offset.
  public func slice(offset: Int, length: Int) -> Self {
    Self(
      offset: offset,
      length: length,
      nullBuffer: nullBuffer,
      viewsBuffer: viewsBuffer,
      dataBuffers: dataBuffers
    )
  }

  /// Compact the array by copying referenced data into fewer buffers.
  ///
  /// Not implemented yet: currently returns the array unchanged.
  public func compact() -> Self {
    // TODO: Implement compaction strategy
    return self
  }

  /// Get buffer utilization statistics.
  ///
  /// Not implemented yet: currently always returns an empty list.
  public func bufferStats() -> [(bufferIndex: Int, utilization: Double)] {
    // TODO: Track which views reference which buffers
    return []
  }
}
Loading
Loading