Skip to content

Commit 062efb7

Browse files
committed
Simplify default cut labels
1) The quantile number isn't needed in the label in most cases, and it is shown anyway when printing an ordered `CategoricalValue`. Only use it by default when `allowempty=true`, to avoid data-dependent errors if there are duplicate levels. 2) Round breaks by default to a number of significant digits chosen via `sigdigits`. This number is increased if necessary for breaks to remain unique. This generates labels which are not completely correct, as rounding may make the left break greater than a value which is included in the interval, but this is generally minor and expected. Taking the floor rather than rounding would be more correct, but it can generate unexpected labels due to floating point trickiness (e.g. `floor(0.0003, sigdigits=4)` gives 0.0002999). Rounding is also what R does. Add a deprecation to avoid breaking custom `labels` functions which did not accept `sigdigits`.
1 parent daaa0cc commit 062efb7

File tree

4 files changed

+261
-94
lines changed

4 files changed

+261
-94
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ CategoricalArraysSentinelArraysExt = "SentinelArrays"
2525
CategoricalArraysStructTypesExt = "StructTypes"
2626

2727
[compat]
28-
Compat = "3.37, 4"
28+
Compat = "3.47, 4.10"
2929
DataAPI = "1.6"
3030
JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21"
3131
JSON3 = "1.1.2"

src/CategoricalArrays.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,12 @@ module CategoricalArrays
1111
import DataAPI: unwrap
1212
export unwrap
1313

14+
using Compat
15+
@compat public default_formatter, numbered_formatter
16+
1417
using DataAPI
1518
using Missings
1619
using Printf
17-
import Compat
1820

1921
# JuliaLang/julia#36810
2022
if VERSION < v"1.5.2"

src/extras.jl

Lines changed: 161 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,56 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray,
2727
end
2828
end
2929

30+
const CUT_FMT = Printf.Format("%.*g")
31+
32+
"""
33+
CategoricalArrays.default_formatter(from, to, i::Integer;
34+
leftclosed::Bool, rightclosed::Bool,
35+
sigdigits::Integer)
36+
37+
Provide the default label format for the `cut(x, breaks)` method,
38+
which is `"[from, to)"` if `leftclosed` is `true` and `"(from, to)"` otherwise.
39+
40+
If they are floating point values, breaks are turned into strings using
41+
`@sprintf("%.*g", sigdigits, break)`
42+
(`to` is formatted the same way, using `@sprintf("%.*g", sigdigits, to)`, including for the last break).
3043
"""
31-
default_formatter(from, to, i; leftclosed, rightclosed)
44+
function default_formatter(from, to, i::Integer;
45+
leftclosed::Bool, rightclosed::Bool,
46+
sigdigits::Integer)
47+
from_str = from isa AbstractFloat ?
48+
Printf.format(CUT_FMT, sigdigits, from) :
49+
string(from)
50+
to_str = to isa AbstractFloat ?
51+
Printf.format(CUT_FMT, sigdigits, to) :
52+
string(to)
53+
string(leftclosed ? "[" : "(", from_str, ", ", to_str, rightclosed ? "]" : ")")
54+
end
3255

33-
Provide the default label format for the `cut(x, breaks)` method.
3456
"""
35-
default_formatter(from, to, i; leftclosed, rightclosed) =
36-
string(leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")")
57+
CategoricalArrays.numbered_formatter(from, to, i::Integer;
58+
leftclosed::Bool, rightclosed::Bool,
59+
sigdigits::Integer)
60+
61+
Provide the default label format for the `cut(x, ngroups)` method
62+
when `allowempty=true`, which is `"i: [from, to)"` if `leftclosed`
63+
is `true` and `"i: (from, to)"` otherwise.
64+
65+
If they are floating point values, breaks are turned into strings using
66+
`@sprintf("%.*g", sigdigits, break)`
67+
(`to` is formatted the same way, using `@sprintf("%.*g", sigdigits, to)`, including for the last break).
68+
"""
69+
numbered_formatter(from, to, i::Integer;
70+
leftclosed::Bool, rightclosed::Bool,
71+
sigdigits::Integer) =
72+
string(i, ": ",
73+
default_formatter(from, to, i, leftclosed=leftclosed, rightclosed=rightclosed,
74+
sigdigits=sigdigits))
3775

3876
@doc raw"""
3977
cut(x::AbstractArray, breaks::AbstractVector;
4078
labels::Union{AbstractVector,Function},
79+
sigdigits::Integer=3,
4180
extend::Union{Bool,Missing}=false, allowempty::Bool=false)
4281
4382
Cut a numeric array into intervals at values `breaks`
@@ -54,10 +93,15 @@ also accept them.
5493
in `x` fall outside of the breaks; when `true`, breaks are automatically added to include
5594
all values in `x`; when `missing`, values outside of the breaks generate `missing` entries.
5695
* `labels::Union{AbstractVector, Function}`: a vector of strings, characters
57-
or numbers giving the names to use for
58-
the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
96+
or numbers giving the names to use for the intervals; or a function
97+
`f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates
5998
the labels from the left and right interval boundaries and the group index. Defaults to
60-
`"[from, to)"` (or `"[from, to]"` for the rightmost interval if `extend == true`).
99+
[`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"`
100+
for the rightmost interval if `extend == true`).
101+
* `sigdigits::Integer=3`: the minimum number of significant digits to use in labels.
102+
This value is increased automatically if necessary so that rounded breaks are unique.
103+
Only used for floating point types and when `labels` is a function, in which case it
104+
is passed to it as a keyword argument.
61105
* `allowempty::Bool=false`: when `false`, an error is raised if some breaks other than
62106
the last one appear multiple times, generating empty intervals; when `true`,
63107
duplicate breaks are allowed and the intervals they generate are kept as
@@ -69,19 +113,19 @@ julia> using CategoricalArrays
69113
70114
julia> cut(-1:0.5:1, [0, 1], extend=true)
71115
5-element CategoricalArray{String,1,UInt32}:
72-
"[-1.0, 0.0)"
73-
"[-1.0, 0.0)"
74-
"[0.0, 1.0]"
75-
"[0.0, 1.0]"
76-
"[0.0, 1.0]"
116+
"[-1, 0)"
117+
"[-1, 0)"
118+
"[0, 1]"
119+
"[0, 1]"
120+
"[0, 1]"
77121
78122
julia> cut(-1:0.5:1, 2)
79123
5-element CategoricalArray{String,1,UInt32}:
80-
"Q1: [-1.0, 0.0)"
81-
"Q1: [-1.0, 0.0)"
82-
"Q2: [0.0, 1.0]"
83-
"Q2: [0.0, 1.0]"
84-
"Q2: [0.0, 1.0]"
124+
"[-1, 0)"
125+
"[-1, 0)"
126+
"[0, 1]"
127+
"[0, 1]"
128+
"[0, 1]"
85129
86130
julia> cut(-1:0.5:1, 2, labels=["A", "B"])
87131
5-element CategoricalArray{String,1,UInt32}:
@@ -114,6 +158,7 @@ julia> cut(-1:0.5:1, 3, labels=fmt)
114158
@inline function cut(x::AbstractArray, breaks::AbstractVector;
115159
extend::Union{Bool, Missing}=false,
116160
labels::Union{AbstractVector{<:SupportedTypes},Function}=default_formatter,
161+
sigdigits::Integer=3,
117162
allowmissing::Union{Bool, Nothing}=nothing,
118163
allow_missing::Union{Bool, Nothing}=nothing,
119164
allowempty::Bool=false)
@@ -127,14 +172,15 @@ julia> cut(-1:0.5:1, 3, labels=fmt)
127172
:cut)
128173
extend = missing
129174
end
130-
return _cut(x, breaks, extend, labels, allowempty)
175+
return _cut(x, breaks, extend, labels, sigdigits, allowempty)
131176
end
132177

133178
# Separate function for inferability (thanks to inlining of cut)
134179
function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
135180
extend::Union{Bool, Missing},
136181
labels::Union{AbstractVector{<:SupportedTypes},Function},
137-
allowempty::Bool=false) where {T, N}
182+
sigdigits::Integer,
183+
allowempty::Bool) where {T, N}
138184
if !issorted(breaks)
139185
breaks = sort(breaks)
140186
end
@@ -191,21 +237,55 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
191237
end
192238
end
193239

240+
# Find minimal number of digits so that distinct breaks remain so
241+
if eltype(breaks) <: AbstractFloat
242+
while true
243+
local i
244+
for outer i in 2:lastindex(breaks)
245+
b1 = breaks[i-1]
246+
b2 = breaks[i]
247+
isequal(b1, b2) && continue
248+
249+
b1_str = Printf.format(CUT_FMT, sigdigits, b1)
250+
b2_str = Printf.format(CUT_FMT, sigdigits, b2)
251+
if b1_str == b2_str
252+
sigdigits += 1
253+
break
254+
end
255+
end
256+
i == lastindex(breaks) && break
257+
end
258+
end
194259
n = length(breaks)
195260
n >= 2 || throw(ArgumentError("at least two breaks must be provided when extend is not true"))
196261
if labels isa Function
197262
from = breaks[1:n-1]
198263
to = breaks[2:n]
199-
firstlevel = labels(from[1], to[1], 1,
200-
leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false)
264+
local firstlevel
265+
try
266+
firstlevel = labels(from[1], to[1], 1,
267+
leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false,
268+
sigdigits=sigdigits)
269+
catch
270+
# Support functions defined before v1.0, where sigdigits did not exist
271+
Base.depwarn("`labels` function is now required to accept a `sigdigits` keyword argument",
272+
:cut)
273+
labels_orig = labels
274+
labels = (from, to, i; leftclosed, rightclosed, sigdigits) ->
275+
labels_orig(from, to, i; leftclosed, rightclosed)
276+
firstlevel = labels_orig(from[1], to[1], 1,
277+
leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false)
278+
end
201279
levs = Vector{typeof(firstlevel)}(undef, n-1)
202280
levs[1] = firstlevel
203281
for i in 2:n-2
204282
levs[i] = labels(from[i], to[i], i,
205-
leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false)
283+
leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false,
284+
sigdigits=sigdigits)
206285
end
207286
levs[end] = labels(from[end], to[end], n-1,
208-
leftclosed=true, rightclosed=true)
287+
leftclosed=true, rightclosed=true,
288+
sigdigits=sigdigits)
209289
else
210290
length(labels) == n-1 ||
211291
throw(ArgumentError("labels must be of length $(n-1), but got length $(length(labels))"))
@@ -225,40 +305,37 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
225305
CategoricalArray{S, N}(refs, pool)
226306
end
227307

228-
"""
229-
quantile_formatter(from, to, i; leftclosed, rightclosed)
230-
231-
Provide the default label format for the `cut(x, ngroups)` method.
232-
"""
233-
quantile_formatter(from, to, i; leftclosed, rightclosed) =
234-
string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")")
235-
236308
"""
237309
Find first value in (sorted) `v` which is greater than or equal to each quantile
238310
in (sorted) `qs`.
239311
"""
240312
function find_breaks(v::AbstractVector, qs::AbstractVector)
241313
n = length(qs)
242314
breaks = similar(v, n)
243-
n == 0 && return breaks
315+
breaks_prev = similar(v, n)
316+
n == 0 && return (breaks, breaks_prev)
244317

245318
i = 1
246319
q = qs[1]
247-
@inbounds for x in v
320+
@inbounds for j in eachindex(v)
321+
x = v[j]
248322
# Use isless and isequal to differentiate -0.0 from 0.0
249323
if isless(q, x) || isequal(q, x)
250324
breaks[i] = x
325+
# FIXME : handle duplicated breaks
326+
breaks_prev[i] = v[clamp(j-1, firstindex(v), lastindex(v))]
251327
i += 1
252328
i > n && break
253329
q = qs[i]
254330
end
255331
end
256-
return breaks
332+
return (breaks, breaks_prev)
257333
end
258334

259335
"""
260336
cut(x::AbstractArray, ngroups::Integer;
261337
labels::Union{AbstractVector{<:AbstractString},Function},
338+
sigdigits::Integer=3,
262339
allowempty::Bool=false)
263340
264341
Cut a numeric array into `ngroups` quantiles.
@@ -271,17 +348,25 @@ quantiles.
271348
272349
# Keyword arguments
273350
* `labels::Union{AbstractVector, Function}`: a vector of strings, characters
274-
or numbers giving the names to use for
275-
the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
351+
or numbers giving the names to use for the intervals; or a function
352+
`f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates
276353
the labels from the left and right interval boundaries and the group index. Defaults to
277-
`"Qi: [from, to)"` (or `"Qi: [from, to]"` for the rightmost interval).
354+
[`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"`
355+
for the rightmost interval) if `allowempty=false`, otherwise to
356+
[`CategoricalArrays.numbered_formatter`](@ref), which prefixes the label with the quantile
357+
number to ensure uniqueness.
358+
* `sigdigits::Integer=3`: the minimum number of significant digits to use when rounding
359+
breaks for inclusion in generated labels. This value is increased automatically if necessary
360+
so that rounded breaks are unique. Only used for floating point types and when `labels` is a
361+
function, in which case it is passed to it as a keyword argument.
278362
* `allowempty::Bool=false`: when `false`, an error is raised if some quantiles breakpoints
279363
other than the last one are equal, generating empty intervals;
280364
when `true`, duplicate breaks are allowed and the intervals they generate are kept as
281365
unused levels (but duplicate labels are not allowed).
282366
"""
283367
function cut(x::AbstractArray, ngroups::Integer;
284-
labels::Union{AbstractVector{<:SupportedTypes},Function}=quantile_formatter,
368+
labels::Union{AbstractVector{<:SupportedTypes},Function,Nothing}=nothing,
369+
sigdigits::Integer=3,
285370
allowempty::Bool=false)
286371
ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)"))
287372
sorted_x = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x)
@@ -291,12 +376,48 @@ function cut(x::AbstractArray, ngroups::Integer;
291376
throw(ArgumentError("NaN values are not allowed in input vector"))
292377
end
293378
qs = quantile!(sorted_x, (1:(ngroups-1))/ngroups, sorted=true)
294-
breaks = [min_x; find_breaks(sorted_x, qs); max_x]
379+
breaks, breaks_prev = find_breaks(sorted_x, qs)
380+
breaks = [min_x; breaks; max_x]
295381
if !allowempty && !allunique(@view breaks[1:end-1])
296382
throw(ArgumentError("cannot compute $ngroups quantiles due to " *
297383
"too many duplicated values in `x`. " *
298384
"Pass `allowempty=true` to allow empty quantiles or " *
299385
"choose a lower value for `ngroups`."))
300386
end
301-
cut(x, breaks; labels=labels, allowempty=allowempty)
387+
if labels === nothing
388+
labels = allowempty ? numbered_formatter : default_formatter
389+
390+
if eltype(breaks) <: AbstractFloat
391+
while true
392+
local i
393+
for outer i in 2:lastindex(breaks)
394+
b1 = breaks[i-1]
395+
b2 = breaks[i]
396+
isequal(b1, b2) && continue
397+
398+
# Find minimal number of digits so that rounding the break does not
399+
# return a value that is lower than or equal to the value immediately below the break
400+
# We skip the first break, which is the minimum and has no equivalent
401+
# in `breaks_prev`
402+
b1_rounded = round(b1, sigdigits=sigdigits)
403+
b2_rounded = round(b2, sigdigits=sigdigits)
404+
if i < lastindex(breaks) &&
405+
(isequal(b2_rounded, breaks_prev[i-1]) || isless(b2_rounded, breaks_prev[i-1]))
406+
sigdigits += 1
407+
break
408+
end
409+
410+
# Find minimal number of digits so that breaks are unique
411+
b1_str = Printf.format(CUT_FMT, sigdigits, b1)
412+
b2_str = Printf.format(CUT_FMT, sigdigits, b2)
413+
if b1_str == b2_str
414+
sigdigits += 1
415+
break
416+
end
417+
end
418+
i == lastindex(breaks) && break
419+
end
420+
end
421+
end
422+
return cut(x, breaks; labels=labels, sigdigits=sigdigits, allowempty=allowempty)
302423
end

0 commit comments

Comments
 (0)