Skip to content

Commit a45b3f9

Browse files
committed
Fix OffsetArray support and clean CategoricalArray support
Invalid indices were used with `OffsetArray`s as 1-based indexing was assumed. Fix this, and always wrap them in a `ToArrow` objet so that they are consistently turned into 1-based arrays. This allows dropping special code for `CategoricalArray` in favor of using the standard `DataAPI.refpool` API combined with the Arrow extension point added to CategoricalArrays.
1 parent 78e4f4b commit a45b3f9

File tree

5 files changed

+19
-27
lines changed

5 files changed

+19
-27
lines changed

src/ArrowTypes/src/ArrowTypes.jl

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -404,17 +404,15 @@ concrete_or_concreteunion(T) =
404404
function ToArrow(x::A) where {A}
405405
S = eltype(A)
406406
T = ArrowType(S)
407-
if S === T && concrete_or_concreteunion(S)
407+
fi = firstindex(x)
408+
if S === T && concrete_or_concreteunion(S) && fi == 1
408409
return x
409410
elseif !concrete_or_concreteunion(T)
410411
# arrow needs concrete types, so try to find a concrete common type, preferring unions
411412
if isempty(x)
412413
return Missing[]
413414
end
414-
T = typeof(toarrow(x[1]))
415-
for i = 2:length(x)
416-
@inbounds T = promoteunion(T, typeof(toarrow(x[i])))
417-
end
415+
T = mapreduce(typeof toarrow, promoteunion, x)
418416
if T === Missing && concrete_or_concreteunion(S)
419417
T = promoteunion(T, typeof(toarrow(default(S))))
420418
end
@@ -442,6 +440,7 @@ function _convert(::Type{T}, x) where {T}
442440
return convert(T, x)
443441
end
444442
end
445-
Base.getindex(x::ToArrow{T}, i::Int) where {T} = _convert(T, toarrow(getindex(x.data, i)))
443+
Base.getindex(x::ToArrow{T}, i::Int) where {T} =
444+
_convert(T, toarrow(getindex(x.data, i + firstindex(x.data) - 1)))
446445

447446
end # module ArrowTypes

src/ArrowTypes/test/Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# under the License.
1717

1818
[deps]
19+
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
1920
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
2021
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2122
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

src/ArrowTypes/test/runtests.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,6 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
using Test, ArrowTypes, UUIDs, Sockets
18+
using Test, ArrowTypes, UUIDs, Sockets, OffsetArrays
1919

2020
include("tests.jl")

src/ArrowTypes/test/tests.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,16 @@ end
192192
@test eltype(x) == Union{Float64,String}
193193
@test x == [1.0, 3.14, "hey"]
194194

195+
x = ArrowTypes.ToArrow(OffsetArray([1, 2, 3], -3:-1))
196+
@test x isa ArrowTypes.ToArrow{Int,OffsetVector{Int,Vector{Int}}}
197+
@test eltype(x) == Int
198+
@test x == [1, 2, 3]
199+
200+
x = ArrowTypes.ToArrow(OffsetArray(Any[1, 3.14], -3:-2))
201+
@test x isa ArrowTypes.ToArrow{Float64,OffsetVector{Any,Vector{Any}}}
202+
@test eltype(x) == Float64
203+
@test x == [1, 3.14]
204+
195205
@testset "respect non-missing concrete type" begin
196206
struct DateTimeTZ
197207
instant::Int64

src/arraytypes/dictencoding.jl

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -227,22 +227,8 @@ function arrowvector(
227227
refa = DataAPI.refarray(x)
228228
inds = copyto!(similar(Vector{signedtype(length(pool))}, length(refa)), refa)
229229
end
230-
# horrible hack? yes. better than taking CategoricalArrays dependency? also yes.
231-
if typeof(pool).name.name == :CategoricalRefPool
232-
if eltype(x) >: Missing
233-
pool = vcat(missing, DataAPI.levels(x))
234-
else
235-
pool = DataAPI.levels(x)
236-
for i = 1:length(inds)
237-
@inbounds inds[i] -= 1
238-
end
239-
end
240-
else
241-
# adjust to "offset" instead of index
242-
for i = 1:length(inds)
243-
@inbounds inds[i] -= 1
244-
end
245-
end
230+
# adjust to "offset" instead of index
231+
inds .-= firstindex(refa)
246232
data = arrowvector(
247233
pool,
248234
i,
@@ -278,11 +264,7 @@ function arrowvector(
278264
)
279265
deltas = eltype(x)[]
280266
inds = Vector{ET}(undef, len)
281-
categorical = typeof(x).name.name == :CategoricalArray
282267
for (j, val) in enumerate(x)
283-
if categorical
284-
val = get(val)
285-
end
286268
@inbounds inds[j] = get!(pool, val) do
287269
push!(deltas, val)
288270
return length(pool)

0 commit comments

Comments
 (0)