Skip to content

Commit 64cc61b

Browse files
guangtaoguangtao
authored andcommitted
Fix #590 view-backed variadic buffer inference
Centralize the view inline/external layout boundary, infer missing external buffers from valid view elements, and ignore null slots during buffer-count recovery for Utf8View/BinaryView readers.
1 parent 66c17ef commit 64cc61b

File tree

4 files changed

+54
-11
lines changed

4 files changed

+54
-11
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ This implementation supports the 1.0 version of the specification, including sup
6464
* All nested data types
6565
* Dictionary encodings and messages
6666
* Extension types
67+
* View-backed Utf8/Binary columns, including recovery from under-reported variadic buffer counts by inferring the required external buffers from valid view elements
6768
* Streaming, file, record batch, and replacement and isdelta dictionary messages
6869

6970
It currently doesn't include support for:

src/arraytypes/views.jl

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,17 @@ struct ViewElement
2121
offset::Int32
2222
end
2323

24+
const VIEW_ELEMENT_BYTES = sizeof(ViewElement)
25+
const VIEW_LENGTH_BYTES = sizeof(Int32)
26+
const VIEW_INLINE_BYTES = VIEW_ELEMENT_BYTES - VIEW_LENGTH_BYTES
27+
28+
@inline _viewisinline(length::Integer) = length <= VIEW_INLINE_BYTES
29+
@inline _viewinlinestart(i::Integer) =
30+
((i - 1) * VIEW_ELEMENT_BYTES) + VIEW_LENGTH_BYTES + 1
31+
@inline _viewinlineend(i::Integer, length::Integer) = _viewinlinestart(i) + length - 1
32+
@inline _viewinlineslice(inline::Vector{UInt8}, i::Integer, length::Integer) =
33+
@view inline[_viewinlinestart(i):_viewinlineend(i, length)]
34+
2435
"""
2536
Arrow.View
2637
@@ -45,12 +56,8 @@ Base.size(l::View) = (l.ℓ,)
4556
if S <: Base.CodeUnits
4657
# BinaryView
4758
return !l.validity[i] ? missing :
48-
v.length < 13 ?
49-
Base.CodeUnits(
50-
StringView(
51-
@view l.inline[(((i - 1) * 16) + 5):(((i - 1) * 16) + 5 + v.length - 1)]
52-
),
53-
) :
59+
_viewisinline(v.length) ?
60+
Base.CodeUnits(StringView(_viewinlineslice(l.inline, i, v.length))) :
5461
Base.CodeUnits(
5562
StringView(
5663
@view l.buffers[v.bufindex + 1][(v.offset + 1):(v.offset + v.length)]
@@ -59,12 +66,10 @@ Base.size(l::View) = (l.ℓ,)
5966
else
6067
# Utf8View
6168
return !l.validity[i] ? missing :
62-
v.length < 13 ?
69+
_viewisinline(v.length) ?
6370
ArrowTypes.fromarrow(
6471
T,
65-
StringView(
66-
@view l.inline[(((i - 1) * 16) + 5):(((i - 1) * 16) + 5 + v.length - 1)]
67-
),
72+
StringView(_viewinlineslice(l.inline, i, v.length)),
6873
) :
6974
ArrowTypes.fromarrow(
7075
T,

src/table.jl

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,18 @@ const ListTypes =
732732
const LargeLists = Union{Meta.LargeUtf8,Meta.LargeBinary,Meta.LargeList,Meta.LargeListView}
733733
const ViewTypes = Union{Meta.Utf8View,Meta.BinaryView,Meta.ListView,Meta.LargeListView}
734734

735+
@inline function _viewbuffercount(validity, views, declared::Integer)
736+
count = Int(declared)
737+
for i in eachindex(views)
738+
validity[i] || continue
739+
v = @inbounds views[i]
740+
if !_viewisinline(v.length)
741+
count = max(count, Int(v.bufindex) + 1)
742+
end
743+
end
744+
return count
745+
end
746+
735747
function build(field::Meta.Field, batch, rb, de, nodeidx, bufferidx, varbufferidx, convert)
736748
d = field.dictionary
737749
if d !== nothing
@@ -910,7 +922,8 @@ function build(
910922
inline = reinterpret(UInt8, views) # reuse the (possibly realigned) memory backing `views`
911923
bufferidx += 1
912924
buffers = Vector{UInt8}[]
913-
for i = 1:rb.variadicBufferCounts[varbufferidx]
925+
nvariadic = _viewbuffercount(validity, views, rb.variadicBufferCounts[varbufferidx])
926+
for i = 1:nvariadic
914927
buffer = rb.buffers[bufferidx]
915928
_, A = reinterp(UInt8, batch, buffer, rb.compression)
916929
push!(buffers, A)

test/runtests.jl

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,30 @@ end
264264
@test all(isequal.(values(t), values(tt)))
265265
end
266266

267+
@testset "View buffer count inference" begin
268+
inline_len = Int32(Arrow.VIEW_INLINE_BYTES)
269+
views = Arrow.ViewElement[
270+
Arrow.ViewElement(inline_len, Int32(0), Int32(0), Int32(0)),
271+
Arrow.ViewElement(inline_len + Int32(148), Int32(0), Int32(0), Int32(0)),
272+
Arrow.ViewElement(inline_len + Int32(207), Int32(0), Int32(1), Int32(160)),
273+
]
274+
validity = Arrow.ValidityBitmap(UInt8[], 1, length(views), 0)
275+
@test Arrow._viewisinline(inline_len)
276+
@test !Arrow._viewisinline(inline_len + Int32(1))
277+
@test Arrow._viewbuffercount(validity, views, Int32(0)) == 2
278+
@test Arrow._viewbuffercount(validity, views, Int32(1)) == 2
279+
@test Arrow._viewbuffercount(validity, views, Int32(3)) == 3
280+
281+
sparse_validity = Arrow.ValidityBitmap(UInt8[0x05], 1, 3, 1)
282+
sparse_views = Arrow.ViewElement[
283+
Arrow.ViewElement(inline_len + Int32(64), Int32(0), Int32(0), Int32(0)),
284+
Arrow.ViewElement(inline_len + Int32(64), Int32(0), Int32(99), Int32(0)),
285+
Arrow.ViewElement(inline_len, Int32(0), Int32(0), Int32(0)),
286+
]
287+
@test !sparse_validity[2]
288+
@test Arrow._viewbuffercount(sparse_validity, sparse_views, Int32(0)) == 1
289+
end
290+
267291
@testset "single-partition tobuffer byte equivalence" begin
268292
t = (col=OffsetArray(["a", "bc", "def"], 0:2),)
269293
io = IOBuffer()

0 commit comments

Comments
 (0)