Skip to content

Commit 6ef6662

Browse files
committed
Merge branch 'nl/quantilecut' into nl/cutlabels
2 parents e1acb38 + e5d84c7 commit 6ef6662

13 files changed

+220
-85
lines changed

.github/workflows/CompatHelper.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ jobs:
1515
run: which julia
1616
continue-on-error: true
1717
- name: Install Julia, but only if it is not already available in the PATH
18-
uses: julia-actions/setup-julia@v1
18+
uses: julia-actions/setup-julia@v2
1919
with:
2020
version: '1'
2121
arch: ${{ runner.arch }}

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ jobs:
1212
fail-fast: false
1313
matrix:
1414
version:
15-
- '1.0'
15+
- '1.6'
1616
- '1' # automatically expands to the latest stable 1.x release of Julia
1717
- 'nightly'
1818
os:
@@ -31,7 +31,7 @@ jobs:
3131
with:
3232
version: ${{ matrix.version }}
3333
arch: ${{ matrix.arch }}
34-
- uses: actions/cache@v2
34+
- uses: actions/cache@v4
3535
env:
3636
cache-name: cache-artifacts
3737
with:

Project.toml

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,18 +13,21 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
1313
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
1414

1515
[weakdeps]
16+
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
1617
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
1718
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
1819
SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
1920
StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
2021

2122
[extensions]
23+
CategoricalArraysArrowExt = "Arrow"
2224
CategoricalArraysJSONExt = "JSON"
2325
CategoricalArraysRecipesBaseExt = "RecipesBase"
2426
CategoricalArraysSentinelArraysExt = "SentinelArrays"
2527
CategoricalArraysStructTypesExt = "StructTypes"
2628

2729
[compat]
30+
Arrow = "2"
2831
Compat = "3.47, 4.10"
2932
DataAPI = "1.6"
3033
JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21"
@@ -35,9 +38,10 @@ Requires = "1"
3538
SentinelArrays = "1"
3639
Statistics = "1"
3740
StructTypes = "1"
38-
julia = "1"
41+
julia = "1.6"
3942

4043
[extras]
44+
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
4145
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
4246
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
4347
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
@@ -49,4 +53,4 @@ StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
4953
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
5054

5155
[targets]
52-
test = ["Dates", "JSON", "JSON3", "Plots", "PooledArrays", "RecipesBase", "SentinelArrays", "StructTypes", "Test"]
56+
test = ["Arrow", "Dates", "JSON", "JSON3", "Plots", "PooledArrays", "RecipesBase", "SentinelArrays", "StructTypes", "Test"]

ext/CategoricalArraysArrowExt.jl

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
module CategoricalArraysArrowExt
2+
3+
using CategoricalArrays
4+
import Arrow
5+
import Arrow: ArrowTypes
6+
7+
# Symbol returned by `ArrowTypes.arrowname` for categorical value types (see the methods below).
const CATARRAY_ARROWNAME = Symbol("JuliaLang.CategoricalArrays.CategoricalArray")
8+
# Any `CategoricalValue` type reports the name above ...
ArrowTypes.arrowname(::Type{<:CategoricalValue}) = CATARRAY_ARROWNAME
9+
# ... and its metadata string is the name of the reference type `R` (`string(R)`).
ArrowTypes.arrowmetadata(::Type{CategoricalValue{T, R}}) where {T, R} = string(R)
10+
11+
# Same name and metadata for the `Missing`-allowing element type.
# NOTE(review): `<:CategoricalValue` here is bound inside the `Union{...}`; confirm this
# method is actually selected for concrete `Union{CategoricalValue{T, R}, Missing}` types.
ArrowTypes.arrowname(::Type{Union{<:CategoricalValue, Missing}}) = CATARRAY_ARROWNAME
12+
ArrowTypes.arrowmetadata(::Type{Union{CategoricalValue{T, R}, Missing}}) where {T, R} =
13+
string(R)
14+
15+
# Lookup table from an integer type's name (as produced by `string(T)`) back to the type
# itself, for the signed/unsigned 8- to 128-bit integer types listed here.
const REFTYPES = Dict(string(T) => T for T in (Int128, Int16, Int32, Int64, Int8, UInt128,
16+
UInt16, UInt32, UInt64, UInt8))
17+
# Rebuild the `CategoricalValue` type for the `CATARRAY_ARROWNAME` extension name:
# `S` becomes the level type and `meta` is looked up in `REFTYPES` to get the reference type.
# Indexing the `Dict` throws a `KeyError` when `meta` is not one of the listed type names.
function ArrowTypes.JuliaType(::Val{CATARRAY_ARROWNAME},
18+
::Type{S}, meta::String) where S
19+
R = REFTYPES[meta]
20+
return CategoricalValue{S, R}
21+
end
22+
23+
# Generate `Arrow.DictEncoding` constructor methods for two element-type variants:
#   * `MV = V`, `MT = T`                                   (no missing values)
#   * `MV = Union{V,Missing}`, `MT = Union{T,Missing}`     (missing allowed)
# where `V <: CategoricalValue{T,R}`. For each variant, one method takes `Arrow.List`
# data and one takes `Arrow.Primitive` data; both wrap the data in a `CategoricalVector`
# and return a `DictEncoding` whose third type parameter is that vector's type.
for (MV, MT) in ((:V, :T), (:(Union{V,Missing}), :(Union{T,Missing})))
24+
@eval begin
25+
# Variant for `Arrow.List` dictionary data.
function Arrow.DictEncoding{$MV,S,A}(id, data::Arrow.List{U, O, B},
26+
isOrdered, metadata) where
27+
{T, R, V<:CategoricalValue{T,R}, S, O, A, B, U}
28+
# Re-wrap the same underlying fields with element type `$MT` instead of `U`.
newdata = Arrow.List{$MT,O,B}(data.arrow, data.validity, data.offsets,
29+
data.data, data.ℓ, data.metadata)
30+
# When `$MT` allows `missing`, the levels are the non-missing entries only.
levels = Missing <: $MT ? collect(skipmissing(newdata)) : newdata
31+
catdata = CategoricalVector{$MT,R}(newdata, levels=levels)
32+
# Forward to the `DictEncoding` constructor parameterized on the categorical vector type.
return Arrow.DictEncoding{$MV,S,typeof(catdata)}(id, catdata,
33+
isOrdered, metadata)
34+
end
35+
36+
# Variant for `Arrow.Primitive` dictionary data; same steps as the `Arrow.List` method,
# except that `Arrow.Primitive` is rebuilt without an `offsets` field.
function Arrow.DictEncoding{$MV,S,A}(id, data::Arrow.Primitive{U, B},
37+
isOrdered, metadata) where
38+
{T, R, V<:CategoricalValue{T,R}, S, A, B, U}
39+
newdata = Arrow.Primitive{$MT,B}(data.arrow, data.validity, data.data,
40+
data.ℓ, data.metadata)
41+
levels = Missing <: $MT ? collect(skipmissing(newdata)) : newdata
42+
catdata = CategoricalVector{$MT,R}(newdata, levels=levels)
43+
return Arrow.DictEncoding{$MV,S,typeof(catdata)}(id, catdata,
44+
isOrdered, metadata)
45+
end
46+
end
47+
end
48+
49+
# Materialize a dictionary-encoded Arrow column of (non-missing) categorical values as a
# `CategoricalVector{T}`: the pool is built from the encoding's data and the refs from
# the column's indices.
function Base.copy(x::Arrow.DictEncoded{V}) where {T, R, V<:CategoricalValue{T, R}}
50+
pool = CategoricalPool{T,R}(x.encoding.data)
51+
inds = x.indices
52+
# Allocate refs with the shape of `inds` but element type `R`.
refs = similar(inds, R)
53+
# Shift every index up by one; presumably `x.indices` is 0-based while categorical
# refs start at 1 — TODO confirm against Arrow.jl.
refs .= inds .+ one(R)
54+
return CategoricalVector{T}(refs, pool)
55+
end
56+
57+
# Materialize a dictionary-encoded Arrow column whose element type allows `missing` as a
# `CategoricalVector{Union{T,Missing}}`. Throws an `ErrorException` unless the first
# entry of the encoding's data is `missing`.
function Base.copy(x::Arrow.DictEncoded{Union{Missing,V}}) where
58+
{T, R, V<:CategoricalValue{T, R}}
59+
# NOTE(review): `data[1]` is accessed unconditionally; confirm the encoding data can
# never be empty here, otherwise this indexing fails before the check below.
ismissing(x.encoding.data[1]) ||
60+
throw(ErrorException("`missing` must be the first value in a " *
61+
"`CategoricalArray` pool"))
62+
# The pool holds only the non-missing entries.
levels = collect(skipmissing(x.encoding.data))
63+
pool = CategoricalPool{T,R}(levels)
64+
inds = x.indices
65+
refs = similar(inds, R)
66+
# Indices are copied unshifted (unlike the non-missing method); presumably the leading
# `missing` entry then lines up with ref 0 — TODO confirm.
refs .= inds
67+
return CategoricalVector{Union{T,Missing}}(refs, pool)
68+
end
69+
70+
end

src/CategoricalArrays.jl

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@ module CategoricalArrays
4343

4444
@static if !isdefined(Base, :get_extension)
4545
function __init__()
46+
@require Arrow="69666777-d1a9-59fb-9406-91d4454c9d45" include("../ext/CategoricalArraysArrowExt.jl")
4647
@require JSON="682c06a0-de6a-54ab-a142-c8b1cf79cde6" include("../ext/CategoricalArraysJSONExt.jl")
4748
@require RecipesBase="3cdcf5f2-1ef4-517c-9805-6587b60abb01" include("../ext/CategoricalArraysRecipesBaseExt.jl")
4849
@require SentinelArrays="91c51154-3ec4-41a3-a24f-3f23e20d615c" include("../ext/CategoricalArraysSentinelArraysExt.jl")

src/array.jl

Lines changed: 27 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
## Code for CategoricalArray
22

33
import Base: Array, convert, collect, copy, getindex, setindex!, similar, size,
4-
unique, vcat, in, summary, float, complex, copyto!
4+
unique, unique!, vcat, in, summary, float, complex, copyto!
55

66
# Used for keyword argument default value
77
_isordered(x::AbstractCategoricalArray) = isordered(x)
@@ -160,9 +160,8 @@ function CategoricalArray{T, N, R}(::UndefInitializer, dims::NTuple{N,Int};
160160
U = leveltype(nonmissingtype(T))
161161
S = T >: Missing ? Union{U, Missing} : U
162162
check_supported_eltype(S, T)
163-
V = CategoricalValue{U, R}
164163
levs = levels === nothing ? U[] : collect(U, levels)
165-
CategoricalArray{S, N}(zeros(R, dims), CategoricalPool{U, R, V}(levs, ordered))
164+
CategoricalArray{S, N}(zeros(R, dims), CategoricalPool{U, R}(levs, ordered))
166165
end
167166

168167
CategoricalArray{T, N}(::UndefInitializer, dims::NTuple{N,Int};
@@ -868,31 +867,36 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector;
868867
return A
869868
end
870869

871-
function _unique(::Type{S},
872-
refs::AbstractArray{T},
873-
pool::CategoricalPool) where {S, T<:Integer}
874-
nlevels = length(levels(pool)) + 1
875-
order = fill(0, nlevels) # 0 indicates not seen
876-
# If we don't track missings, short-circuit even if none has been seen
877-
count = S >: Missing ? 0 : 1
878-
@inbounds for i in refs
879-
if order[i + 1] == 0
880-
count += 1
881-
order[i + 1] = count
882-
count == nlevels && break
870+
# return unique refs (each value is unique) in the order of appearance in `refs`
871+
# equivalent to fallback Base.unique() implementation,
872+
# but short-circuits once references to all levels are encountered
873+
function _uniquerefs(A::CatArrOrSub{T}) where T
874+
arefs = refs(A)
875+
res = similar(arefs, 0)
876+
nlevels = length(levels(A))
877+
maxunique = nlevels + (T >: Missing ? 1 : 0)
878+
seen = fill(false, nlevels + 1) # always +1 for 0 (missing ref)
879+
@inbounds for ref in arefs
880+
if !seen[ref + 1]
881+
push!(res, ref)
882+
seen[ref + 1] = true
883+
(length(res) == maxunique) && break
883884
end
884885
end
885-
S[i == 1 ? missing : levels(pool)[i - 1] for i in sortperm(order) if order[i] != 0]
886+
return res
886887
end
887888

888-
"""
889-
unique(A::CategoricalArray)
889+
# Return a new `CategoricalVector{T}` built from the unique refs of `A` (as computed by
# `_uniquerefs`) together with a copy of `A`'s pool.
unique(A::CatArrOrSub{T}) where T =
890+
CategoricalVector{T}(_uniquerefs(A), copy(pool(A)))
890891

891-
Return levels which appear in `A` in their order of appearance.
892-
This function is significantly slower than [`levels`](@ref DataAPI.levels)
893-
since it needs to check whether levels are used or not.
894-
"""
895-
unique(A::CategoricalArray{T}) where {T} = _unique(T, A.refs, A.pool)
892+
# In-place counterpart of `unique` for categorical vectors: replace the refs of `A` with
# the result of `_uniquerefs(A)` and return `A`.
function unique!(A::CategoricalVector)
893+
urefs = _uniquerefs(A)
894+
# Only touch `A.refs` when the lengths differ, i.e. `_uniquerefs` returned fewer refs than `A` holds.
if length(urefs) != length(A)
895+
# Shrink the refs vector, then overwrite it with the unique refs.
resize!(A.refs, length(urefs))
896+
copyto!(A.refs, urefs)
897+
end
898+
return A
899+
end
896900

897901
"""
898902
droplevels!(A::CategoricalArray)

src/pool.jl

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2,20 +2,18 @@ const catpool_seed = UInt === UInt32 ? 0xe3cf1386 : 0x356f2c715023f1a5
22

33
hashlevels(levs::AbstractVector) = foldl((h, x) -> hash(x, h), levs, init=catpool_seed)
44

5-
CategoricalPool{T, R, V}(ordered::Bool=false) where {T, R, V} =
6-
CategoricalPool{T, R, V}(T[], ordered)
75
CategoricalPool{T, R}(ordered::Bool=false) where {T, R} =
86
CategoricalPool{T, R}(T[], ordered)
97
CategoricalPool{T}(ordered::Bool=false) where {T} =
108
CategoricalPool{T, DefaultRefType}(T[], ordered)
119

1210
CategoricalPool{T, R}(levels::AbstractVector, ordered::Bool=false) where {T, R} =
13-
CategoricalPool{T, R, CategoricalValue{T, R}}(convert(Vector{T}, levels), ordered)
11+
CategoricalPool{T, R}(convert(Vector{T}, levels), ordered)
1412
CategoricalPool(levels::AbstractVector{T}, ordered::Bool=false) where {T} =
1513
CategoricalPool{T, DefaultRefType}(convert(Vector{T}, levels), ordered)
1614

1715
CategoricalPool(invindex::Dict{T, R}, ordered::Bool=false) where {T, R <: Integer} =
18-
CategoricalPool{T, R, CategoricalValue{T, R}}(invindex, ordered)
16+
CategoricalPool{T, R}(invindex, ordered)
1917

2018
Base.convert(::Type{T}, pool::T) where {T <: CategoricalPool} = pool
2119

@@ -29,12 +27,12 @@ function Base.convert(::Type{CategoricalPool{T, R}}, pool::CategoricalPool) wher
2927

3028
levelsT = convert(Vector{T}, pool.levels)
3129
invindexT = convert(Dict{T, R}, pool.invindex)
32-
return CategoricalPool{T, R, CategoricalValue{T, R}}(levelsT, invindexT, pool.ordered)
30+
return CategoricalPool{T, R}(levelsT, invindexT, pool.ordered)
3331
end
3432

35-
Base.copy(pool::CategoricalPool{T, R, V}) where {T, R, V} =
36-
CategoricalPool{T, R, V}(copy(pool.levels), copy(pool.invindex),
37-
pool.ordered, pool.hash)
33+
Base.copy(pool::CategoricalPool{T, R}) where {T, R} =
34+
CategoricalPool{T, R}(copy(pool.levels), copy(pool.invindex),
35+
pool.ordered, pool.hash)
3836

3937
function Base.show(io::IO, pool::CategoricalPool{T, R}) where {T, R}
4038
@static if VERSION >= v"1.6.0"

src/subarray.jl

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -5,13 +5,6 @@ isordered(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray} = isordered(paren
55
levels!(sa::SubArray{T,N,P}, newlevels::Vector) where {T,N,P<:CategoricalArray} =
66
levels!(parent(sa), newlevels)
77

8-
function unique(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray}
9-
A = parent(sa)
10-
refs = view(A.refs, sa.indices...)
11-
S = eltype(P) >: Missing ? Union{eltype(levels(A.pool)), Missing} : eltype(levels(A.pool))
12-
_unique(S, refs, A.pool)
13-
end
14-
158
refs(A::SubArray{<:Any, <:Any, <:CategoricalArray}) =
169
view(parent(A).refs, parentindices(A)...)
1710

src/typedefs.jl

Lines changed: 16 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -6,28 +6,27 @@ const SupportedTypes = Union{AbstractString, AbstractChar, Number}
66
# Type params:
77
# * `T` type of categorized values
88
# * `R` integer type for referencing category levels
9-
# * `V` categorical value type
10-
mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V}
9+
mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer}
1110
levels::Vector{T} # category levels ordered by their reference codes
1211
invindex::Dict{T, R} # map from category levels to their reference codes
1312
ordered::Bool # whether levels can be compared using <
1413
hash::Union{UInt, Nothing} # hash of levels
1514
subsetof::Ptr{Nothing} # last seen strict superset pool
1615
equalto::Ptr{Nothing} # last seen equal pool
1716

18-
function CategoricalPool{T, R, V}(levels::Vector{T},
19-
ordered::Bool) where {T, R, V}
17+
function CategoricalPool{T, R}(levels::Vector{T},
18+
ordered::Bool) where {T, R}
2019
if length(levels) > typemax(R)
2120
throw(LevelsException{T, R}(levels[Int(typemax(R))+1:end]))
2221
end
2322
invindex = Dict{T, R}(v => i for (i, v) in enumerate(levels))
2423
if length(invindex) != length(levels)
2524
throw(ArgumentError("Duplicate entries are not allowed in levels"))
2625
end
27-
CategoricalPool{T, R, V}(levels, invindex, ordered)
26+
CategoricalPool{T, R}(levels, invindex, ordered)
2827
end
29-
function CategoricalPool{T, R, V}(invindex::Dict{T, R},
30-
ordered::Bool) where {T, R, V}
28+
function CategoricalPool{T, R}(invindex::Dict{T, R},
29+
ordered::Bool) where {T, R}
3130
levels = Vector{T}(undef, length(invindex))
3231
# If invindex contains non consecutive values, a BoundsError will be thrown
3332
try
@@ -40,18 +39,12 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V}
4039
if length(invindex) > typemax(R)
4140
throw(LevelsException{T, R}(levels[typemax(R)+1:end]))
4241
end
43-
CategoricalPool{T, R, V}(levels, invindex, ordered)
42+
CategoricalPool{T, R}(levels, invindex, ordered)
4443
end
45-
function CategoricalPool{T, R, V}(levels::Vector{T},
46-
invindex::Dict{T, R},
47-
ordered::Bool,
48-
hash::Union{UInt, Nothing}=nothing) where {T, R, V}
49-
if !(V <: CategoricalValue)
50-
throw(ArgumentError("Type $V is not a categorical value type"))
51-
end
52-
if V !== CategoricalValue{T, R}
53-
throw(ArgumentError("V must be CategoricalValue{T, R}"))
54-
end
44+
function CategoricalPool{T, R}(levels::Vector{T},
45+
invindex::Dict{T, R},
46+
ordered::Bool,
47+
hash::Union{UInt, Nothing}=nothing) where {T, R}
5548
pool = new(levels, invindex, ordered, hash, C_NULL, C_NULL)
5649
return pool
5750
end
@@ -77,7 +70,7 @@ the order of the pool's [`levels`](@ref DataAPI.levels) is used rather than the
7770
ordering of values of type `T`.
7871
"""
7972
struct CategoricalValue{T <: SupportedTypes, R <: Integer}
80-
pool::CategoricalPool{T, R, CategoricalValue{T, R}}
73+
pool::CategoricalPool{T, R}
8174
ref::R
8275
end
8376

@@ -98,14 +91,14 @@ const AbstractCategoricalMatrix{T, R, V, C, U} = AbstractCategoricalArray{T, 2,
9891

9992
mutable struct CategoricalArray{T, N, R <: Integer, V, C, U} <: AbstractCategoricalArray{T, N, R, V, C, U}
10093
refs::Array{R, N}
101-
pool::CategoricalPool{V, R, C}
94+
pool::CategoricalPool{V, R}
10295

10396
function CategoricalArray{T, N}(refs::Array{R, N},
104-
pool::CategoricalPool{V, R, C}) where
105-
{T, N, R <: Integer, V, C}
97+
pool::CategoricalPool{V, R}) where
98+
{T, N, R <: Integer, V}
10699
T === V || T == Union{V, Missing} || throw(ArgumentError("T ($T) must be equal to $V or Union{$V, Missing}"))
107100
U = T >: Missing ? Missing : Union{}
108-
new{T, N, R, V, C, U}(refs, pool)
101+
new{T, N, R, V, CategoricalValue{V, R}, U}(refs, pool)
109102
end
110103
end
111104
const CategoricalVector{T, R <: Integer, V, C, U} = CategoricalArray{T, 1, R, V, C, U}

0 commit comments

Comments
 (0)