diff --git a/src/ArrowTypes/src/ArrowTypes.jl b/src/ArrowTypes/src/ArrowTypes.jl index 86183b54..09e70338 100644 --- a/src/ArrowTypes/src/ArrowTypes.jl +++ b/src/ArrowTypes/src/ArrowTypes.jl @@ -50,7 +50,7 @@ For a give type `T`, define it's "arrow type kind", or the general category of a * [`ArrowTypes.MapKind`](@ref): any `AbstractDict` * [`ArrowTypes.StructKind`](@ref): any `NamedTuple` or plain struct (mutable or otherwise) * [`ArrowTypes.UnionKind`](@ref): any `Union` - * [`ArrowTypes.DictEncodedKind`](@ref): array types that implement the `DataAPI.refarray` interface + * [`ArrowTypes.DictEncodedKind`](@ref): array types that implement the `DataAPI.refpool` interface The list of `ArrowKind`s listed above translate to different ways to physically store data as supported by the arrow data format. See the docs for each for an idea of whether they might be an appropriate fit for a custom type. @@ -404,17 +404,15 @@ concrete_or_concreteunion(T) = function ToArrow(x::A) where {A} S = eltype(A) T = ArrowType(S) - if S === T && concrete_or_concreteunion(S) + fi = firstindex(x) + if S === T && concrete_or_concreteunion(S) && fi == 1 return x elseif !concrete_or_concreteunion(T) # arrow needs concrete types, so try to find a concrete common type, preferring unions if isempty(x) return Missing[] end - T = typeof(toarrow(x[1])) - for i = 2:length(x) - @inbounds T = promoteunion(T, typeof(toarrow(x[i]))) - end + T = mapreduce(typeof ∘ toarrow, promoteunion, x) if T === Missing && concrete_or_concreteunion(S) T = promoteunion(T, typeof(toarrow(default(S)))) end @@ -442,6 +440,7 @@ function _convert(::Type{T}, x) where {T} return convert(T, x) end end -Base.getindex(x::ToArrow{T}, i::Int) where {T} = _convert(T, toarrow(getindex(x.data, i))) +Base.getindex(x::ToArrow{T}, i::Int) where {T} = + _convert(T, toarrow(getindex(x.data, i + firstindex(x.data) - 1))) end # module ArrowTypes diff --git a/src/ArrowTypes/test/Project.toml b/src/ArrowTypes/test/Project.toml index 24d0f273..d46fd5db 100644 --- a/src/ArrowTypes/test/Project.toml +++ b/src/ArrowTypes/test/Project.toml @@ -16,6 +16,7 @@ # under the License. [deps] +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Sockets = "6462fe0b-24de-5631-8697-dd941f90decc" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" diff --git a/src/ArrowTypes/test/runtests.jl b/src/ArrowTypes/test/runtests.jl index adeaf01f..71fa1513 100644 --- a/src/ArrowTypes/test/runtests.jl +++ b/src/ArrowTypes/test/runtests.jl @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -using Test, ArrowTypes, UUIDs, Sockets +using Test, ArrowTypes, UUIDs, Sockets, OffsetArrays include("tests.jl") diff --git a/src/ArrowTypes/test/tests.jl b/src/ArrowTypes/test/tests.jl index 79dd6046..22d8dd0e 100644 --- a/src/ArrowTypes/test/tests.jl +++ b/src/ArrowTypes/test/tests.jl @@ -192,6 +192,16 @@ end @test eltype(x) == Union{Float64,String} @test x == [1.0, 3.14, "hey"] + x = ArrowTypes.ToArrow(OffsetArray([1, 2, 3], -3:-1)) + @test x isa ArrowTypes.ToArrow{Int,OffsetVector{Int,Vector{Int}}} + @test eltype(x) == Int + @test x == [1, 2, 3] + + x = ArrowTypes.ToArrow(OffsetArray(Any[1, 3.14], -3:-2)) + @test x isa ArrowTypes.ToArrow{Float64,OffsetVector{Any,Vector{Any}}} + @test eltype(x) == Float64 + @test x == [1, 3.14] + @testset "respect non-missing concrete type" begin struct DateTimeTZ instant::Int64 diff --git a/src/arraytypes/dictencoding.jl b/src/arraytypes/dictencoding.jl index fe483040..e79dc6c7 100644 --- a/src/arraytypes/dictencoding.jl +++ b/src/arraytypes/dictencoding.jl @@ -220,29 +220,15 @@ function arrowvector( if DataAPI.refarray(x) === x || DataAPI.refpool(x) === nothing # need to encode ourselves x = PooledArray(x; signed=true, compress=true) - inds = DataAPI.refarray(x) + inds = refa = DataAPI.refarray(x) pool = DataAPI.refpool(x) else pool = DataAPI.refpool(x) refa = DataAPI.refarray(x) inds = copyto!(similar(Vector{signedtype(length(pool))}, length(refa)), refa) end - # horrible hack? yes. better than taking CategoricalArrays dependency? also yes. - if typeof(pool).name.name == :CategoricalRefPool - if eltype(x) >: Missing - pool = vcat(missing, DataAPI.levels(x)) - else - pool = DataAPI.levels(x) - for i = 1:length(inds) - @inbounds inds[i] -= 1 - end - end - else - # adjust to "offset" instead of index - for i = 1:length(inds) - @inbounds inds[i] -= 1 - end - end + # adjust to "offset" instead of index + inds .-= firstindex(refa) data = arrowvector( pool, i, @@ -278,11 +264,7 @@ function arrowvector( ) deltas = eltype(x)[] inds = Vector{ET}(undef, len) - categorical = typeof(x).name.name == :CategoricalArray for (j, val) in enumerate(x) - if categorical - val = get(val) - end @inbounds inds[j] = get!(pool, val) do push!(deltas, val) return length(pool) diff --git a/test/Project.toml b/test/Project.toml index 93977a9a..15891502 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -23,6 +23,7 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" Mmap = "a63ad114-7e13-5084-954f-fe012c677804" +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Sockets = "6462fe0b-24de-5631-8697-dd941f90decc" @@ -40,6 +41,7 @@ DataAPI = "1" DataFrames = "1" FilePathsBase = "0.9" JSON3 = "1" +OffsetArrays = "1" PooledArrays = "1" StructTypes = "1" SentinelArrays = "1" diff --git a/test/runtests.jl b/test/runtests.jl index 1b3418e5..b4c5441d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -27,6 +27,7 @@ using CategoricalArrays using DataAPI using FilePathsBase using DataFrames +using OffsetArrays import Random: randstring include(joinpath(dirname(pathof(ArrowTypes)), "../test/tests.jl"))