Skip to content

Commit 66c17ef

Browse files
guangtaoguangtao
authored andcommitted
Add experimental Arrow Flight support
1 parent 10a8308 commit 66c17ef

File tree

109 files changed

+8260
-74
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

109 files changed

+8260
-74
lines changed

.github/workflows/ci.yml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,50 @@ jobs:
168168
continue-on-error: false
169169
run: >
170170
julia --color=yes --project=monorepo -e 'using Pkg; Pkg.test("Arrow")'
171+
flight_interop:
172+
name: Arrow Flight interop - Julia 1 - ubuntu-latest
173+
runs-on: ubuntu-latest
174+
timeout-minutes: 30
175+
steps:
176+
- uses: actions/checkout@v6
177+
- uses: actions/setup-python@v6
178+
with:
179+
python-version: '3.11'
180+
- name: Install Flight Python dependencies
181+
run: |
182+
python -m pip install --upgrade pip
183+
python -m pip install pyarrow grpcio grpcio-tools
184+
- uses: julia-actions/setup-julia@v2
185+
with:
186+
version: '1'
187+
- uses: actions/cache@v5
188+
env:
189+
cache-name: cache-artifacts
190+
with:
191+
path: ~/.julia/artifacts
192+
key: ${{ runner.os }}-flight-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
193+
restore-keys: |
194+
${{ runner.os }}-flight-${{ env.cache-name }}-
195+
${{ runner.os }}-flight-
196+
${{ runner.os }}-
197+
- uses: julia-actions/julia-buildpkg@v1.6
198+
with:
199+
project: .
200+
- name: Dev local ArrowTypes for Arrow.jl tests
201+
shell: julia --project=. {0}
202+
run: |
203+
using Pkg
204+
Pkg.develop(PackageSpec(path="src/ArrowTypes"))
205+
- name: Run Arrow Flight interop tests
206+
env:
207+
ARROW_FLIGHT_PYTHON: ${{ env.pythonLocation }}/bin/python
208+
run: >
209+
julia --color=yes --project=test -e 'using Pkg;
210+
Pkg.develop(PackageSpec(path="."));
211+
Pkg.develop(PackageSpec(path="src/ArrowTypes"));
212+
Pkg.instantiate();
213+
using Test, Arrow;
214+
include("test/flight.jl")'
171215
docs:
172216
name: Documentation
173217
runs-on: ubuntu-latest

.github/workflows/ci_nightly.yml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,48 @@ jobs:
106106
continue-on-error: false
107107
run: >
108108
julia --color=yes --project=monorepo -e 'using Pkg; Pkg.test("Arrow")'
109+
flight_interop:
110+
name: Arrow Flight interop - Julia nightly - ubuntu-latest
111+
runs-on: ubuntu-latest
112+
timeout-minutes: 30
113+
steps:
114+
- uses: actions/checkout@v6
115+
- uses: actions/setup-python@v6
116+
with:
117+
python-version: '3.11'
118+
- name: Install Flight Python dependencies
119+
run: |
120+
python -m pip install --upgrade pip
121+
python -m pip install pyarrow grpcio grpcio-tools
122+
- uses: julia-actions/setup-julia@v2
123+
with:
124+
version: 'nightly'
125+
arch: x64
126+
- uses: actions/cache@v5
127+
env:
128+
cache-name: cache-artifacts
129+
with:
130+
path: ~/.julia/artifacts
131+
key: ${{ runner.os }}-flight-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
132+
restore-keys: |
133+
${{ runner.os }}-flight-${{ env.cache-name }}-
134+
${{ runner.os }}-flight-
135+
${{ runner.os }}-
136+
- uses: julia-actions/julia-buildpkg@v1.6
137+
with:
138+
project: .
139+
- name: Dev local ArrowTypes for Arrow.jl tests
140+
shell: julia --project=. {0}
141+
run: |
142+
using Pkg
143+
Pkg.develop(PackageSpec(path="src/ArrowTypes"))
144+
- name: Run Arrow Flight interop tests
145+
env:
146+
ARROW_FLIGHT_PYTHON: ${{ env.pythonLocation }}/bin/python
147+
run: >
148+
julia --color=yes --project=test -e 'using Pkg;
149+
Pkg.develop(PackageSpec(path="."));
150+
Pkg.develop(PackageSpec(path="src/ArrowTypes"));
151+
Pkg.instantiate();
152+
using Test, Arrow;
153+
include("test/flight.jl")'

Project.toml

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,16 @@ version = "2.8.1"
2121

2222
[deps]
2323
ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd"
24+
Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
2425
BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"
2526
CodecLz4 = "5ba52731-8f18-5e0d-9241-30f10d1ec561"
2627
CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
2728
ConcurrentUtilities = "f0e56b4a-5159-44fe-b623-3e5288b988bb"
2829
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
2930
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
3031
EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56"
32+
ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429"
33+
gRPCClient = "aaca4a50-36af-4a1d-b878-4c443f2061ad"
3134
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
3235
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
3336
SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
@@ -37,6 +40,15 @@ TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53"
3740
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
3841
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
3942

43+
[weakdeps]
44+
gRPCServer = "608c6337-0d7d-447f-bb69-0f5674ee3959"
45+
46+
[extensions]
47+
ArrowgRPCServerExt = "gRPCServer"
48+
49+
[sources]
50+
ArrowTypes = { path = "src/ArrowTypes" }
51+
4052
[compat]
4153
ArrowTypes = "1.1,2"
4254
BitIntegers = "0.2, 0.3"
@@ -45,10 +57,13 @@ CodecZstd = "0.7, 0.8"
4557
ConcurrentUtilities = "2"
4658
DataAPI = "1"
4759
EnumX = "1"
60+
ProtoBuf = "~1.2.1"
61+
gRPCClient = "1"
62+
gRPCServer = "0.1"
4863
PooledArrays = "0.5, 1.0"
4964
SentinelArrays = "1"
5065
StringViews = "1"
5166
Tables = "1.1"
5267
TimeZones = "1"
5368
TranscodingStreams = "0.9.12, 0.10, 0.11"
54-
julia = "1.9"
69+
julia = "1.12"

README.md

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ The package can be installed by typing in the following in a Julia REPL:
4040
julia> using Pkg; Pkg.add("Arrow")
4141
```
4242

43+
Arrow.jl currently requires Julia `1.12+`.
44+
4345
## Local Development
4446

4547
When developing on Arrow.jl it is recommended that you run the following to ensure that any
@@ -49,6 +51,12 @@ changes to ArrowTypes.jl are immediately available to Arrow.jl without requiring
4951
julia --project -e 'using Pkg; Pkg.develop(path="src/ArrowTypes")'
5052
```
5153

54+
Current write-path notes:
55+
* `Arrow.tobuffer` includes a direct single-partition fast path for eligible inputs
56+
* `Arrow.tobuffer(Tables.partitioner(...))` also includes a targeted direct multi-record-batch path for single-column top-level strings and single-column non-missing binary/code-units columns
57+
* `Arrow.write(io, Tables.partitioner(...))` now reuses that same targeted direct multi-record-batch path instead of always going through the legacy `Writer` orchestration
58+
* multi-column partitions, dictionary-encoded top-level columns, map-heavy inputs, and missing-binary partitions retain the existing writer path
59+
5260
## Format Support
5361

5462
This implementation supports the 1.0 version of the specification, including support for:
@@ -60,9 +68,27 @@ This implementation supports the 1.0 version of the specification, including sup
6068

6169
It currently doesn't include support for:
6270
* Tensors or sparse tensors
63-
* Flight RPC
6471
* C data interface
6572

73+
Flight RPC status:
74+
* Experimental `Arrow.Flight` support is available in-tree
75+
* Requires Julia `1.12+`
76+
* Includes generated protocol bindings and complete client constructors for the `FlightService` RPC surface
77+
* Keeps the top-level Flight module shell thin, with exports and generated-protocol setup split out of `src/flight/Flight.jl`
78+
* Includes high-level `FlightData <-> Arrow IPC` helpers for `Arrow.Table`, `Arrow.Stream`, and DoPut payload generation
79+
* Keeps the Flight IPC conversion layer modular under `src/flight/convert/`, with `src/flight/convert.jl` retained as a thin entrypoint
80+
* Includes client helpers for request headers, binary metadata, handshake token reuse, and TLS configuration via `withheaders`, `withtoken`, and `authenticate`
81+
* Keeps the Flight client implementation modular under `src/flight/client/`, with thin entrypoints at `src/flight/client.jl` and `src/flight/client/rpc_methods.jl`
82+
* Includes a transport-agnostic server core (`Service`, `ServerCallContext`, `ServiceDescriptor`, `MethodDescriptor`) for local Flight method dispatch, path lookup, and handler testing
83+
* Keeps the transport-agnostic server core modular under `src/flight/server/`, with `src/flight/server.jl` retained as a thin entrypoint
84+
* Includes an optional `gRPCServer.jl` package extension that maps `Arrow.Flight.Service` into `gRPCServer.ServiceDescriptor` and registers Flight proto types with the external server package when it is present
85+
* Keeps the optional `gRPCServer.jl` bridge modular under `ext/arrowgrpcserverext/`, with `ext/ArrowgRPCServerExt.jl` retained as a thin entrypoint
86+
* Includes optional live interoperability coverage for `Handshake`, authenticated token propagation, `PollFlightInfo`, and TLS via dedicated Python reference servers
87+
* Includes optional live `pyarrow.flight` interoperability coverage for `ListFlights`, `GetFlightInfo`, `GetSchema`, `DoGet`, `DoPut`, `DoExchange`, `ListActions`, and `DoAction`
88+
* Keeps targeted Flight verification modular under `test/flight/`, with `test/flight.jl` retained as a thin entrypoint for local and CI invocation stability, the client-constructor/protocol-wrapper checks decomposed under `test/flight/client_surface/`, the optional `gRPCServer` extension scenarios decomposed under `test/flight/grpcserver_extension/`, the `pyarrow.flight` interop scenarios decomposed under `test/flight/pyarrow_interop/`, and the transport-agnostic server-core checks decomposed under `test/flight/server_core/`
89+
* Includes `test/flight_grpcserver.jl` as a temporary-environment runner for optional native `gRPCServer` coverage without mutating `test/Project.toml`
90+
* Dedicated CI jobs now exercise the Flight interop suite on stable and nightly Linux; native Julia server transport remains optional/experimental and is not part of the default Flight suite
91+
6692
Third-party data formats:
6793
* CSV, parquet and avro support via the existing [CSV.jl](https://github.com/JuliaData/CSV.jl), [Parquet.jl](https://github.com/JuliaIO/Parquet.jl) and [Avro.jl](https://github.com/JuliaData/Avro.jl) packages
6894
* Other Tables.jl-compatible packages automatically supported ([DataFrames.jl](https://github.com/JuliaData/DataFrames.jl), [JSONTables.jl](https://github.com/JuliaData/JSONTables.jl), [JuliaDB.jl](https://github.com/JuliaData/JuliaDB.jl), [SQLite.jl](https://github.com/JuliaDatabases/SQLite.jl), [MySQL.jl](https://github.com/JuliaDatabases/MySQL.jl), [JDBC.jl](https://github.com/JuliaDatabases/JDBC.jl), [ODBC.jl](https://github.com/JuliaDatabases/ODBC.jl), [XLSX.jl](https://github.com/felipenoris/XLSX.jl), etc.)

dev/release/rat_exclude_files.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# under the License.
1717

1818
Manifest.toml
19+
*/Manifest.toml
1920
dev/release/apache-rat-*.jar
2021
dev/release/filtered_rat.txt
2122
dev/release/rat.xml

docs/src/manual.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,20 @@ One note on performance: when writing `TimeZones.ZonedDateTime` columns to the a
9797
as the column has `ZonedDateTime` elements that all share a common timezone. This ensures the writing process can know "upfront" which timezone will be encoded and is thus much more
9898
efficient and performant.
9999

100+
Similarly, `ArrowTypes.ToArrow` avoids repeated type-promotion work for
101+
homogeneous custom columns even when `ArrowTypes.ArrowType(T)` is abstract, so
102+
write-time conversion does not pay unnecessary overhead once the serialized
103+
element type is stable.
104+
100105
#### Custom types
101106

102107
To support writing your custom Julia struct, Arrow.jl utilizes the format's mechanism for "extension types" by allowing the storing of Julia type name and metadata in the field metadata. To "hook in" to this machinery, custom types can utilize the interface methods defined in the `Arrow.ArrowTypes` submodule. For example:
103108

109+
Arrow.jl already uses this mechanism for several Base logical types, including
110+
`nothing`, `Tuple`, `VersionNumber`, and `Complex`, so those values roundtrip as
111+
their original Julia types instead of falling back to plain struct-shaped
112+
`NamedTuple`s.
113+
104114
```julia
105115
using Arrow
106116

ext/ArrowgRPCServerExt.jl

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
module ArrowgRPCServerExt
19+
20+
using Arrow
21+
using gRPCServer
22+
23+
include("arrowgrpcserverext/constants.jl")
24+
include("arrowgrpcserverext/context.jl")
25+
include("arrowgrpcserverext/streams.jl")
26+
include("arrowgrpcserverext/handlers.jl")
27+
include("arrowgrpcserverext/descriptor.jl")
28+
29+
end # module ArrowgRPCServerExt
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
const Flight = Arrow.Flight
19+
const STREAM_BUFFER_SIZE = 16
20+
const GENERATED_TYPE_PREFIX = "Arrow.Flight.Generated."

ext/arrowgrpcserverext/context.jl

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
function _method_type(method::Flight.MethodDescriptor)
19+
if method.request_streaming
20+
return method.response_streaming ? gRPCServer.MethodType.BIDI_STREAMING :
21+
gRPCServer.MethodType.CLIENT_STREAMING
22+
end
23+
return method.response_streaming ? gRPCServer.MethodType.SERVER_STREAMING :
24+
gRPCServer.MethodType.UNARY
25+
end
26+
27+
function _call_context(context::gRPCServer.ServerContext)
28+
headers = Flight.HeaderPair[
29+
String(name) => (value isa String ? value : Vector{UInt8}(value)) for
30+
(name, value) in pairs(context.metadata)
31+
]
32+
peer = string(context.peer.address, ":", context.peer.port)
33+
return Flight.ServerCallContext(
34+
headers=headers,
35+
peer=peer,
36+
secure=(context.peer.certificate !== nothing),
37+
)
38+
end
39+
40+
function _proto_type_name(T::Type)
41+
type_name = string(T)
42+
if startswith(type_name, GENERATED_TYPE_PREFIX)
43+
return type_name[(ncodeunits(GENERATED_TYPE_PREFIX) + 1):end]
44+
end
45+
return type_name
46+
end
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
function _register_proto_types!(method::Flight.MethodDescriptor)
19+
registry = gRPCServer.get_type_registry()
20+
registry[_proto_type_name(method.request_type)] = method.request_type
21+
registry[_proto_type_name(method.response_type)] = method.response_type
22+
return nothing
23+
end
24+
25+
function gRPCServer.service_descriptor(service::Flight.Service)
26+
descriptor = Flight.servicedescriptor(service)
27+
methods = Dict{String,gRPCServer.MethodDescriptor}()
28+
for method in descriptor.methods
29+
_register_proto_types!(method)
30+
methods[method.name] = gRPCServer.MethodDescriptor(
31+
method.name,
32+
_method_type(method),
33+
_proto_type_name(method.request_type),
34+
_proto_type_name(method.response_type),
35+
_handler(service, method),
36+
)
37+
end
38+
return gRPCServer.ServiceDescriptor(descriptor.name, methods, nothing)
39+
end

0 commit comments

Comments
 (0)