Skip to content

Commit e0fe39c

Browse files
committed
Choose different quantile cutpoints in cut(x, n)
`Statistics.quantile` returns values which are not the most appropriate to generate labels. It is more intuitive to choose values from the actual data, which are likely to have fewer decimals and make more sense for users. Unfortunately, since we use intervals closed on the left, we cannot use any of the seven standard definitions of quantiles. Type 1 is the closest, but we have to take the value next to it as a cutpoint to prevent it from being included in the next quantile group. This gives group attributions essentially consistent with R's `Hmisc::cut2` or `cut(x, quantile(x, (0:n)/n, type=1), include.lowest=T)`, though with different cutpoints in labels.
1 parent 3e0d056 commit e0fe39c

File tree

2 files changed

+34
-20
lines changed

2 files changed

+34
-20
lines changed

src/extras.jl

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ default_formatter(from, to, i; leftclosed, rightclosed) =
4242
4343
Cut a numeric array into intervals at values `breaks`
4444
and return an ordered `CategoricalArray` indicating
45-
the interval into which each entry falls. Intervals are of the form `[lower, upper)`,
46-
i.e. the lower bound is included and the upper bound is excluded, except
45+
the interval into which each entry falls. Intervals are of the form `[lower, upper)`
46+
(closed on the left), i.e. the lower bound is included and the upper bound is excluded, except
4747
the last interval, which is closed on both ends, i.e. `[lower, upper]`.
4848
4949
If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will
@@ -233,12 +233,27 @@ Provide the default label format for the `cut(x, ngroups)` method.
233233
quantile_formatter(from, to, i; leftclosed, rightclosed) =
234234
string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")")
235235

236+
# Compute left-closed quantile cutpoints of `v` at the probabilities in `p`.
#
# `v` is sorted in place. For each probability `q`, the returned cutpoint is the
# element of the sorted `v` at zero-based rank `ceil(n * q)`, clamped to `0:n-1`,
# where `n = length(v)`. Cutpoints are therefore always values taken from `v`.
#
# Throws an `ArgumentError` if `v` is empty.
function _quantile!(v::AbstractVector, p::AbstractVector)
    n = length(v)
    n > 0 || throw(ArgumentError("cannot compute quantiles of empty data vector"))
    sort!(v)
    offset = firstindex(v)
    return map(p) do q
        pos = float(n * q)
        nearest = round(pos)
        # `n * q` can land a few ulps above an exact integer (for instance
        # `100 * (7/100)` may evaluate to `7.000000000000001`). Taking `ceil`
        # directly would then skip one element of `v` and could produce
        # duplicated cutpoints, so snap to the integer when the difference is
        # within rounding error.
        rank = abs(pos - nearest) <= 4 * eps(nearest) ? Int(nearest) : ceil(Int, pos)
        v[clamp(rank, 0, n - 1) + offset]
    end
end
244+
# Non-mutating front ends for `_quantile!`: each method hands `_quantile!` a
# freshly allocated vector built from `x`, so the in-place sort done there does
# not reorder the caller's data.

# Array input: flatten with `vec` and take a mutable copy.
function _quantile(x::AbstractArray, p::AbstractVector)
    buffer = Base.copymutable(vec(x))
    return _quantile!(buffer, p)
end

# Any other iterable: materialize it with `collect`.
function _quantile(x, p::AbstractVector)
    buffer = collect(x)
    return _quantile!(buffer, p)
end
247+
236248
"""
237249
cut(x::AbstractArray, ngroups::Integer;
238250
labels::Union{AbstractVector{<:AbstractString},Function},
239251
allowempty::Bool=false)
240252
241-
Cut a numeric array into `ngroups` quantiles, determined using `quantile`.
253+
Cut a numeric array into `ngroups` quantiles.
254+
255+
Cutpoints differ from those returned by `Statistics.quantile` as they are suited
256+
for intervals closed on the left and taken from actual values in `x`.
242257
243258
If `x` contains `missing` values, they are automatically skipped when computing
244259
quantiles.
@@ -265,8 +280,7 @@ function cut(x::AbstractArray, ngroups::Integer;
265280
(max_x isa Number && isnan(max_x))
266281
throw(ArgumentError("NaN values are not allowed in input vector"))
267282
end
268-
breaks = quantile(xnm, (1:ngroups-1)/ngroups)
269-
breaks = [min_x; breaks; max_x]
283+
breaks = _quantile(xnm, (0:ngroups)/ngroups)
270284
if !allowempty && !allunique(@view breaks[1:end-1])
271285
throw(ArgumentError("cannot compute $ngroups quantiles due to " *
272286
"too many duplicated values in `x`. " *

test/15_extras.jl

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -127,18 +127,18 @@ end
127127

128128
@testset "cut([5, 4, 3, 2], 2)" begin
129129
x = @inferred cut([5, 4, 3, 2], 2)
130-
@test x == ["Q2: [3.5, 5.0]", "Q2: [3.5, 5.0]", "Q1: [2.0, 3.5)", "Q1: [2.0, 3.5)"]
130+
@test x == ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", "Q1: [2, 4)"]
131131
@test isa(x, CategoricalArray)
132132
@test isordered(x)
133-
@test levels(x) == ["Q1: [2.0, 3.5)", "Q2: [3.5, 5.0]"]
133+
@test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"]
134134
end
135135

136136
@testset "cut(x, n) with missing values" begin
137137
x = @inferred cut([5, 4, 3, missing, 2], 2)
138-
@test x ≅ ["Q2: [3.5, 5.0]", "Q2: [3.5, 5.0]", "Q1: [2.0, 3.5)", missing, "Q1: [2.0, 3.5)"]
138+
@test x ≅ ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", missing, "Q1: [2, 4)"]
139139
@test isa(x, CategoricalArray)
140140
@test isordered(x)
141-
@test levels(x) == ["Q1: [2.0, 3.5)", "Q2: [3.5, 5.0]"]
141+
@test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"]
142142
end
143143

144144
@testset "cut(x, n) with invalid n" begin
@@ -257,18 +257,18 @@ end
257257
@test_throws ArgumentError cut([fill(1, 10); 4], 2)
258258
@test_throws ArgumentError cut([fill(1, 10); 4], 3)
259259
x = cut([fill(1, 10); 4], 2, allowempty=true)
260-
@test unique(x) == ["Q2: [1.0, 4.0]"]
260+
@test unique(x) == ["Q2: [1, 4]"]
261261
x = cut([fill(1, 10); 4], 3, allowempty=true)
262-
@test unique(x) == ["Q3: [1.0, 4.0]"]
263-
@test levels(x) == ["Q1: (1.0, 1.0)", "Q2: (1.0, 1.0)", "Q3: [1.0, 4.0]"]
262+
@test unique(x) == ["Q3: [1, 4]"]
263+
@test levels(x) == ["Q1: (1, 1)", "Q2: (1, 1)", "Q3: [1, 4]"]
264264

265265
x = cut([fill(1, 5); fill(4, 5)], 2)
266-
@test x == [fill("Q1: [1.0, 2.5)", 5); fill("Q2: [2.5, 4.0]", 5)]
267-
@test levels(x) == ["Q1: [1.0, 2.5)", "Q2: [2.5, 4.0]"]
266+
@test x == [fill("Q1: [1, 4)", 5); fill("Q2: [4, 4]", 5)]
267+
@test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"]
268268
@test_throws ArgumentError cut([fill(1, 5); fill(4, 5)], 3)
269269
x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true)
270-
@test x == [fill("Q2: [1.0, 4.0)", 5); fill("Q3: [4.0, 4.0]", 5)]
271-
@test levels(x) == ["Q1: (1.0, 1.0)", "Q2: [1.0, 4.0)", "Q3: [4.0, 4.0]"]
270+
@test x == [fill("Q2: [1, 4)", 5); fill("Q3: [4, 4]", 5)]
271+
@test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4)", "Q3: [4, 4]"]
272272
end
273273

274274
@testset "cut with -0.0" begin
@@ -353,12 +353,12 @@ end
353353
@test levels(x) == ["[-Inf, 2.0)", "[2.0, 5.0]"]
354354

355355
x = cut([1:5; Inf], 2)
356-
@test x ≅ [fill("Q1: [1.0, 3.5)", 3); fill("Q2: [3.5, Inf]", 3)]
357-
@test levels(x) == ["Q1: [1.0, 3.5)", "Q2: [3.5, Inf]"]
356+
@test x ≅ [fill("Q1: [1.0, 4.0)", 3); fill("Q2: [4.0, Inf]", 3)]
357+
@test levels(x) == ["Q1: [1.0, 4.0)", "Q2: [4.0, Inf]"]
358358

359359
x = cut([1:5; -Inf], 2)
360-
@test x ≅ [fill("Q1: [-Inf, 2.5)", 2); fill("Q2: [2.5, 5.0]", 3); "Q1: [-Inf, 2.5)"]
361-
@test levels(x) == ["Q1: [-Inf, 2.5)", "Q2: [2.5, 5.0]"]
360+
@test x ≅ [fill("Q1: [-Inf, 3.0)", 2); fill("Q2: [3.0, 5.0]", 3); "Q1: [-Inf, 3.0)"]
361+
@test levels(x) == ["Q1: [-Inf, 3.0)", "Q2: [3.0, 5.0]"]
362362
end
363363

364364
end

0 commit comments

Comments
 (0)