From b932029713256cb94f5aef3c3f18d060559476a1 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 22 Feb 2025 12:10:12 +0100 Subject: [PATCH 1/7] Choose different quantile cutpoints in `cut(x, n)` `Statistics.quantile` returns values which are not the most appropriate to generate labels. It is more intuitive to choose values from the actual data, which are likely to have fewer decimals and make more sense for users. Unfortunately, since we use intervals closed on the left, we cannot use any of the seven standard definitions of quantiles. Type 1 is the closest, but we have to take the value next to it as a cutpoint to prevent it from being included into the next quantile group. This gives essentially consistent group attributions to R's `Hmisc::cut2` or `cut(x, quantile(x, (0:n)/n, type=1), include.lowest=T))`, though with different cutpoints in labels. --- src/extras.jl | 24 +++++++++++++++++++----- test/15_extras.jl | 30 +++++++++++++++--------------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/extras.jl b/src/extras.jl index 2afcef38..7b7d5d0c 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -42,8 +42,8 @@ default_formatter(from, to, i; leftclosed, rightclosed) = Cut a numeric array into intervals at values `breaks` and return an ordered `CategoricalArray` indicating -the interval into which each entry falls. Intervals are of the form `[lower, upper)`, -i.e. the lower bound is included and the upper bound is excluded, except +the interval into which each entry falls. Intervals are of the form `[lower, upper)` +(closed on the left), i.e. the lower bound is included and the upper bound is excluded, except the last interval, which is closed on both ends, i.e. `[lower, upper]`. If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will @@ -233,12 +233,27 @@ Provide the default label format for the `cut(x, ngroups)` method. quantile_formatter(from, to, i; leftclosed, rightclosed) = string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")") +function _quantile!(v::AbstractVector, p::AbstractVector) + n = length(v) + n > 0 || throw(ArgumentError("cannot compute quantiles of empty data vector")) + sort!(v) + return map(p) do i + v[clamp(ceil(Int, n*i), 0, n-1) + firstindex(v)] + end +end +_quantile(x::AbstractArray, p::AbstractVector) = + _quantile!(Base.copymutable(vec(x)), p) +_quantile(x, p::AbstractVector) = _quantile!(collect(x), p) + """ cut(x::AbstractArray, ngroups::Integer; labels::Union{AbstractVector{<:AbstractString},Function}, allowempty::Bool=false) -Cut a numeric array into `ngroups` quantiles, determined using `quantile`. +Cut a numeric array into `ngroups` quantiles. + +Cutpoints differ from those returned by `Statistics.quantile` as they are suited +for intervals closed on the left and taken from actual values in `x`. If `x` contains `missing` values, they are automatically skipped when computing quantiles. @@ -265,8 +280,7 @@ function cut(x::AbstractArray, ngroups::Integer; (max_x isa Number && isnan(max_x)) throw(ArgumentError("NaN values are not allowed in input vector")) end - breaks = quantile(xnm, (1:ngroups-1)/ngroups) - breaks = [min_x; breaks; max_x] + breaks = _quantile(xnm, (0:ngroups)/ngroups) if !allowempty && !allunique(@view breaks[1:end-1]) throw(ArgumentError("cannot compute $ngroups quantiles due to " * "too many duplicated values in `x`. " * diff --git a/test/15_extras.jl b/test/15_extras.jl index 1aaf8dc7..8440141c 100644 --- a/test/15_extras.jl +++ b/test/15_extras.jl @@ -127,18 +127,18 @@ end @testset "cut([5, 4, 3, 2], 2)" begin x = @inferred cut([5, 4, 3, 2], 2) - @test x == ["Q2: [3.5, 5.0]", "Q2: [3.5, 5.0]", "Q1: [2.0, 3.5)", "Q1: [2.0, 3.5)"] + @test x == ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", "Q1: [2, 4)"] @test isa(x, CategoricalArray) @test isordered(x) - @test levels(x) == ["Q1: [2.0, 3.5)", "Q2: [3.5, 5.0]"] + @test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"] end @testset "cut(x, n) with missing values" begin x = @inferred cut([5, 4, 3, missing, 2], 2) - @test x ≅ ["Q2: [3.5, 5.0]", "Q2: [3.5, 5.0]", "Q1: [2.0, 3.5)", missing, "Q1: [2.0, 3.5)"] + @test x ≅ ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", missing, "Q1: [2, 4)"] @test isa(x, CategoricalArray) @test isordered(x) - @test levels(x) == ["Q1: [2.0, 3.5)", "Q2: [3.5, 5.0]"] + @test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"] end @testset "cut(x, n) with invalid n" begin @@ -257,18 +257,18 @@ end @test_throws ArgumentError cut([fill(1, 10); 4], 2) @test_throws ArgumentError cut([fill(1, 10); 4], 3) x = cut([fill(1, 10); 4], 2, allowempty=true) - @test unique(x) == ["Q2: [1.0, 4.0]"] + @test unique(x) == ["Q2: [1, 4]"] x = cut([fill(1, 10); 4], 3, allowempty=true) - @test unique(x) == ["Q3: [1.0, 4.0]"] - @test levels(x) == ["Q1: (1.0, 1.0)", "Q2: (1.0, 1.0)", "Q3: [1.0, 4.0]"] + @test unique(x) == ["Q3: [1, 4]"] + @test levels(x) == ["Q1: (1, 1)", "Q2: (1, 1)", "Q3: [1, 4]"] x = cut([fill(1, 5); fill(4, 5)], 2) - @test x == [fill("Q1: [1.0, 2.5)", 5); fill("Q2: [2.5, 4.0]", 5)] - @test levels(x) == ["Q1: [1.0, 2.5)", "Q2: [2.5, 4.0]"] + @test x == [fill("Q1: [1, 4)", 5); fill("Q2: [4, 4]", 5)] + @test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"] @test_throws ArgumentError cut([fill(1, 5); fill(4, 5)], 3) x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true) - @test x == [fill("Q2: [1.0, 4.0)", 5); fill("Q3: [4.0, 4.0]", 5)] - @test levels(x) == ["Q1: (1.0, 1.0)", "Q2: [1.0, 4.0)", "Q3: [4.0, 4.0]"] + @test x == [fill("Q2: [1, 4)", 5); fill("Q3: [4, 4]", 5)] + @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4)", "Q3: [4, 4]"] end @testset "cut with -0.0" begin @@ -353,12 +353,12 @@ end @test levels(x) == ["[-Inf, 2.0)", "[2.0, 5.0]"] x = cut([1:5; Inf], 2) - @test x ≅ [fill("Q1: [1.0, 3.5)", 3); fill("Q2: [3.5, Inf]", 3)] - @test levels(x) == ["Q1: [1.0, 3.5)", "Q2: [3.5, Inf]"] + @test x ≅ [fill("Q1: [1.0, 4.0)", 3); fill("Q2: [4.0, Inf]", 3)] + @test levels(x) == ["Q1: [1.0, 4.0)", "Q2: [4.0, Inf]"] x = cut([1:5; -Inf], 2) - @test x ≅ [fill("Q1: [-Inf, 2.5)", 2); fill("Q2: [2.5, 5.0]", 3); "Q1: [-Inf, 2.5)"] - @test levels(x) == ["Q1: [-Inf, 2.5)", "Q2: [2.5, 5.0]"] + @test x ≅ [fill("Q1: [-Inf, 3.0)", 2); fill("Q2: [3.0, 5.0]", 3); "Q1: [-Inf, 3.0)"] + @test levels(x) == ["Q1: [-Inf, 3.0)", "Q2: [3.0, 5.0]"] end end \ No newline at end of file From 9787d61fb7b3328cb230a3367c695ca2bf819c63 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 18 Apr 2025 12:04:43 +0200 Subject: [PATCH 2/7] Improve choice of cutpoints --- src/extras.jl | 54 ++++++++++++++++++++++++++--------------------- test/15_extras.jl | 14 ++++++------ 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/src/extras.jl b/src/extras.jl index 7b7d5d0c..3b0b7b66 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -77,17 +77,17 @@ julia> cut(-1:0.5:1, [0, 1], extend=true) julia> cut(-1:0.5:1, 2) 5-element CategoricalArray{String,1,UInt32}: - "Q1: [-1.0, 0.0)" - "Q1: [-1.0, 0.0)" - "Q2: [0.0, 1.0]" - "Q2: [0.0, 1.0]" - "Q2: [0.0, 1.0]" + "Q1: [-1.0, 0.5)" + "Q1: [-1.0, 0.5)" + "Q1: [-1.0, 0.5)" + "Q2: [0.5, 1.0]" + "Q2: [0.5, 1.0]" julia> cut(-1:0.5:1, 2, labels=["A", "B"]) 5-element CategoricalArray{String,1,UInt32}: "A" "A" - "B" + "A" "B" "B" @@ -95,7 +95,7 @@ julia> cut(-1:0.5:1, 2, labels=[-0.5, +0.5]) 5-element CategoricalArray{Float64,1,UInt32}: -0.5 -0.5 - 0.5 + -0.5 0.5 0.5 @@ -104,11 +104,11 @@ fmt (generic function with 1 method) julia> cut(-1:0.5:1, 3, labels=fmt) 5-element CategoricalArray{String,1,UInt32}: - "grp 1 (-1.0//-0.3333333333333335)" - "grp 1 (-1.0//-0.3333333333333335)" - "grp 2 (-0.3333333333333335//0.33333333333333326)" - "grp 3 (0.33333333333333326//1.0)" - "grp 3 (0.33333333333333326//1.0)" + "grp 1 (-1.0//0.0)" + "grp 1 (-1.0//0.0)" + "grp 2 (0.0//1.0)" + "grp 2 (0.0//1.0)" + "grp 3 (1.0//1.0)" ``` """ @inline function cut(x::AbstractArray, breaks::AbstractVector; @@ -233,17 +233,21 @@ Provide the default label format for the `cut(x, ngroups)` method. quantile_formatter(from, to, i; leftclosed, rightclosed) = string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")") -function _quantile!(v::AbstractVector, p::AbstractVector) +function _quantile!(v::AbstractVector, ps::AbstractVector) n = length(v) n > 0 || throw(ArgumentError("cannot compute quantiles of empty data vector")) - sort!(v) - return map(p) do i - v[clamp(ceil(Int, n*i), 0, n-1) + firstindex(v)] + return map(ps) do p + i = clamp(ceil(Int, n*p), 1, n) + firstindex(v) - 1 + q = v[i] + # Take next distinct value even if quantile falls in a series of duplicated values + @inbounds for j in (i+1):lastindex(v) + q_prev = q + q = v[j] + q_prev != q && break + end + return q end end -_quantile(x::AbstractArray, p::AbstractVector) = - _quantile!(Base.copymutable(vec(x)), p) -_quantile(x, p::AbstractVector) = _quantile!(collect(x), p) """ cut(x::AbstractArray, ngroups::Integer; @@ -253,7 +257,9 @@ _quantile(x, p::AbstractVector) = _quantile!(collect(x), p) Cut a numeric array into `ngroups` quantiles. Cutpoints differ from those returned by `Statistics.quantile` as they are suited -for intervals closed on the left and taken from actual values in `x`. +for intervals closed on the left and taken from actual values in `x`. However, +group assignments are identical to those which would be obtained with type 1 +quantiles if intervals were closed on the right. If `x` contains `missing` values, they are automatically skipped when computing quantiles. @@ -273,14 +279,14 @@ function cut(x::AbstractArray, ngroups::Integer; labels::Union{AbstractVector{<:SupportedTypes},Function}=quantile_formatter, allowempty::Bool=false) ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)")) - xnm = eltype(x) >: Missing ? skipmissing(x) : x - # Computing extrema is faster than taking 0 and 1 quantiles - min_x, max_x = extrema(xnm) + xnm = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x) + min_x, max_x = first(xnm), last(xnm) if (min_x isa Number && isnan(min_x)) || (max_x isa Number && isnan(max_x)) throw(ArgumentError("NaN values are not allowed in input vector")) end - breaks = _quantile(xnm, (0:ngroups)/ngroups) + qs = _quantile!(xnm, (1:(ngroups-1))/ngroups) + breaks = [min_x; qs; max_x] if !allowempty && !allunique(@view breaks[1:end-1]) throw(ArgumentError("cannot compute $ngroups quantiles due to " * "too many duplicated values in `x`. " * diff --git a/test/15_extras.jl b/test/15_extras.jl index 8440141c..0a686464 100644 --- a/test/15_extras.jl +++ b/test/15_extras.jl @@ -254,21 +254,21 @@ end fmt = (from, to, i; leftclosed, rightclosed) -> (i % 2 == 0 ? to : 0.0) @test_throws ArgumentError cut(1:8, 0:2:10, labels=fmt) - @test_throws ArgumentError cut([fill(1, 10); 4], 2) + x = cut([fill(1, 10); 4], 2) + @test x == [fill("Q1: [1, 4)", 10); "Q2: [4, 4]"] + @test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"] @test_throws ArgumentError cut([fill(1, 10); 4], 3) - x = cut([fill(1, 10); 4], 2, allowempty=true) - @test unique(x) == ["Q2: [1, 4]"] x = cut([fill(1, 10); 4], 3, allowempty=true) - @test unique(x) == ["Q3: [1, 4]"] - @test levels(x) == ["Q1: (1, 1)", "Q2: (1, 1)", "Q3: [1, 4]"] + @test x == [fill("Q1: [1, 4)", 10); "Q3: [4, 4]"] + @test levels(x) == ["Q1: [1, 4)", "Q2: (4, 4)", "Q3: [4, 4]"] x = cut([fill(1, 5); fill(4, 5)], 2) @test x == [fill("Q1: [1, 4)", 5); fill("Q2: [4, 4]", 5)] @test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"] @test_throws ArgumentError cut([fill(1, 5); fill(4, 5)], 3) x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true) - @test x == [fill("Q2: [1, 4)", 5); fill("Q3: [4, 4]", 5)] - @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4)", "Q3: [4, 4]"] + @test x == [fill("Q1: [1, 4)", 5); fill("Q3: [4, 4]", 5)] + @test levels(x) == ["Q1: [1, 4)", "Q2: (4, 4)", "Q3: [4, 4]"] end @testset "cut with -0.0" begin From 2efe69510ab7fbaf035dab2b5faeafff6b411ff8 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 27 Apr 2025 21:46:29 +0200 Subject: [PATCH 3/7] WIP --- src/extras.jl | 42 ++++++++++++++++++++++++++---------------- test/15_extras.jl | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 16 deletions(-) diff --git a/src/extras.jl b/src/extras.jl index 3b0b7b66..f3bc17ca 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -233,20 +233,28 @@ Provide the default label format for the `cut(x, ngroups)` method. quantile_formatter(from, to, i; leftclosed, rightclosed) = string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")") -function _quantile!(v::AbstractVector, ps::AbstractVector) - n = length(v) - n > 0 || throw(ArgumentError("cannot compute quantiles of empty data vector")) - return map(ps) do p - i = clamp(ceil(Int, n*p), 1, n) + firstindex(v) - 1 - q = v[i] - # Take next distinct value even if quantile falls in a series of duplicated values - @inbounds for j in (i+1):lastindex(v) - q_prev = q - q = v[j] - q_prev != q && break +"""Find first value in data which is greater than each quantile in ``qs``.""" +function find_breaks(v::AbstractVector, qs::AbstractVector) + n = length(qs) + breaks = similar(v, n) + n == 0 && return breaks + + i = 1 + q = qs[1] + @inbounds for x in v + if x > q + breaks[i] = x + i += 1 + i > n && break + q = qs[i] end - return q end + # if last values in x are equal to q, breaks were not initialized + for i in i:n + breaks[i] = q + end + @show breaks + return breaks end """ @@ -279,14 +287,16 @@ function cut(x::AbstractArray, ngroups::Integer; labels::Union{AbstractVector{<:SupportedTypes},Function}=quantile_formatter, allowempty::Bool=false) ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)")) - xnm = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x) - min_x, max_x = first(xnm), last(xnm) + sorted_x = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x) + min_x, max_x = first(sorted_x), last(sorted_x) if (min_x isa Number && isnan(min_x)) || (max_x isa Number && isnan(max_x)) throw(ArgumentError("NaN values are not allowed in input vector")) end - qs = _quantile!(xnm, (1:(ngroups-1))/ngroups) - breaks = [min_x; qs; max_x] + qs = quantile!(sorted_x, (1:(ngroups-1))/ngroups, sorted=true) + @show qs, min_x, max_x + breaks = [min_x; find_breaks(sorted_x, qs); max_x] + @show breaks if !allowempty && !allunique(@view breaks[1:end-1]) throw(ArgumentError("cannot compute $ngroups quantiles due to " * "too many duplicated values in `x`. " * diff --git a/test/15_extras.jl b/test/15_extras.jl index 0a686464..859cd349 100644 --- a/test/15_extras.jl +++ b/test/15_extras.jl @@ -361,4 +361,51 @@ end @test levels(x) == ["Q1: [-Inf, 3.0)", "Q2: [3.0, 5.0]"] end +@testset "cut in corner cases" begin + # In this case, cut(x, quantile(x, (0:36)/36)) in R generates + # an empty "(143,172]" level + # and qcut(x, 36) in Polars misses that level. + # Our approach uses different breaks at 143 and 182 + x = [23, 23, 60, 76, 84, 95, 101, 108, 111, 133, 137, 143, 143, 143, 182, + 206, 214, 241, 258, 262, 280, 289, 303, 312, 321, 323, 352, 353, 354, + 368, 369, 373, 374, 384, 385, 386, 387, 392, 405, 406, 410, 421, 430, + 430, 431, 442, 464, 474, 478, 479, 496, 516, 530, 534, 549, 554, 568, + 575, 589, 590, 591, 592, 595, 596, 603, 625, 632, 632, 638, 640, 640, + 645, 648, 690, 704, 748, 758, 771, 772, 803, 835, 839, 853, 869, 873, + 874, 887, 911, 920, 923, 928, 933, 943, 945, 945, 947, 951, 965, 978, 980] + + @test cut(x, 36) == + ["Q17: [442, 474)", "Q8: [280, 312)", "Q2: [76, 101)", "Q9: [312, 323)", + "Q14: [387, 406)", "Q30: [835, 869)", "Q17: [442, 474)", "Q35: [947, 965)", + "Q2: [76, 101)", "Q11: [354, 373)", "Q32: [887, 923)", "Q12: [373, 385)", + "Q24: [603, 638)", "Q29: [772, 835)", "Q24: [603, 638)", "Q15: [406, 430)", + "Q11: [354, 373)", "Q23: [592, 603)", "Q3: [101, 133)", "Q16: [430, 442)", + "Q34: [933, 947)", "Q27: [648, 748)", "Q28: [748, 772)", "Q28: [748, 772)", + "Q21: [568, 589)", "Q18: [474, 496)", "Q32: [887, 923)", "Q11: [354, 373)", + "Q7: [241, 280)", "Q3: [101, 133)", "Q19: [496, 534)", "Q13: [385, 387)", + "Q36: [965, 980]", "Q33: [923, 933)", "Q16: [430, 442)", "Q36: [965, 980]", + "Q27: [648, 748)", "Q24: [603, 638)", "Q32: [887, 923)", "Q4: [133, 182)", + "Q22: [589, 592)", "Q1: [23, 76)", "Q5: [182, 206)", "Q28: [748, 772)", + "Q30: [835, 869)", "Q31: [869, 887)", "Q22: [589, 592)", "Q15: [406, 430)", + "Q31: [869, 887)", "Q19: [496, 534)", "Q26: [640, 648)", "Q8: [280, 312)", + "Q18: [474, 496)", "Q14: [387, 406)", "Q7: [241, 280)", "Q30: [835, 869)", + "Q3: [101, 133)", "Q9: [312, 323)", "Q4: [133, 182)", "Q24: [603, 638)", + "Q20: [534, 568)", "Q25: [638, 640)", "Q20: [534, 568)", "Q23: [592, 603)", + "Q12: [373, 385)", "Q27: [648, 748)", "Q29: [772, 835)", "Q6: [206, 241)", + "Q34: [933, 947)", "Q16: [430, 442)", "Q26: [640, 648)", "Q15: [406, 430)", + "Q12: [373, 385)", "Q26: [640, 648)", "Q19: [496, 534)", "Q4: [133, 182)", + "Q34: [933, 947)", "Q31: [869, 887)", "Q10: [323, 354)", "Q21: [568, 589)", + "Q4: [133, 182)", "Q7: [241, 280)", "Q35: [947, 965)", "Q14: [387, 406)", + "Q18: [474, 496)", "Q34: [933, 947)", "Q20: [534, 568)", "Q22: [589, 592)", + "Q33: [923, 933)", "Q10: [323, 354)", "Q13: [385, 387)", "Q1: [23, 76)", + "Q36: [965, 980]", "Q2: [76, 101)", "Q23: [592, 603)", "Q6: [206, 241)", + "Q1: [23, 76)", "Q10: [323, 354)", "Q4: [133, 182)", "Q8: [280, 312)"] + + @test cut([0, 1, 1, 1, 1], 2) == + ["Q1: [0, 1)", "Q2: [1, 1]", "Q2: [1, 1]", "Q2: [1, 1]", "Q2: [1, 1]"] + + @test cut([1, 1, 1, 1, 2], 2) == + ["Q1: [1, 2)", "Q1: [1, 2)", "Q1: [1, 2)", "Q1: [1, 2)", "Q2: [2, 2]"] +end + end \ No newline at end of file From 6a913c7be061600a538586fdb7f3d8d15ba5d1e1 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 27 Apr 2025 23:06:39 +0200 Subject: [PATCH 4/7] Yet another approach --- src/extras.jl | 15 +++++----- test/15_extras.jl | 73 ++++++++++++++--------------------------------- 2 files changed, 30 insertions(+), 58 deletions(-) diff --git a/src/extras.jl b/src/extras.jl index f3bc17ca..1c5f3596 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -233,7 +233,10 @@ Provide the default label format for the `cut(x, ngroups)` method. quantile_formatter(from, to, i; leftclosed, rightclosed) = string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")") -"""Find first value in data which is greater than each quantile in ``qs``.""" +""" +Find first value in (sorted) `v` which is greater than or equal to each quantile +in (sorted) `qs`. +""" function find_breaks(v::AbstractVector, qs::AbstractVector) n = length(qs) breaks = similar(v, n) @@ -242,7 +245,8 @@ function find_breaks(v::AbstractVector, qs::AbstractVector) i = 1 q = qs[1] @inbounds for x in v - if x > q + # Use isless and isequal to differentiate -0.0 from 0.0 + if isless(q, x) || isequal(q, x) breaks[i] = x i += 1 i > n && break @@ -253,7 +257,6 @@ function find_breaks(v::AbstractVector, qs::AbstractVector) for i in i:n breaks[i] = q end - @show breaks return breaks end @@ -264,10 +267,8 @@ end Cut a numeric array into `ngroups` quantiles. -Cutpoints differ from those returned by `Statistics.quantile` as they are suited -for intervals closed on the left and taken from actual values in `x`. However, -group assignments are identical to those which would be obtained with type 1 -quantiles if intervals were closed on the right. +This is equivalent to `cut(x, quantile(x, (0:ngroups)/ngroups))`, +but breaks are taken from actual data values instead of estimated quantiles. If `x` contains `missing` values, they are automatically skipped when computing quantiles. diff --git a/test/15_extras.jl b/test/15_extras.jl index 859cd349..b995e3bf 100644 --- a/test/15_extras.jl +++ b/test/15_extras.jl @@ -254,12 +254,21 @@ end fmt = (from, to, i; leftclosed, rightclosed) -> (i % 2 == 0 ? to : 0.0) @test_throws ArgumentError cut(1:8, 0:2:10, labels=fmt) - x = cut([fill(1, 10); 4], 2) - @test x == [fill("Q1: [1, 4)", 10); "Q2: [4, 4]"] - @test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"] + @test_throws ArgumentError cut([fill(1, 10); 4], 2) + x = cut([fill(1, 10); 4], 2, allowempty=true) + @test unique(x) == ["Q2: [1, 4]"] + @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4]"] @test_throws ArgumentError cut([fill(1, 10); 4], 3) x = cut([fill(1, 10); 4], 3, allowempty=true) - @test x == [fill("Q1: [1, 4)", 10); "Q3: [4, 4]"] + @test unique(x) == ["Q3: [1, 4]"] + @test levels(x) == ["Q1: (1, 1)", "Q2: (1, 1)", "Q3: [1, 4]"] + + x = cut([fill(4, 10); 1], 2) + @test x == [fill("Q2: [4, 4]", 10); "Q1: [1, 4)"] + @test levels(x) == ["Q1: [1, 4)"; "Q2: [4, 4]"] + @test_throws ArgumentError cut([fill(4, 10); 1], 3) + x = cut([fill(4, 10); 1], 3, allowempty=true) + @test x == [fill("Q3: [4, 4]", 10); "Q1: [1, 4)"] @test levels(x) == ["Q1: [1, 4)", "Q2: (4, 4)", "Q3: [4, 4]"] x = cut([fill(1, 5); fill(4, 5)], 2) @@ -267,8 +276,8 @@ end @test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"] @test_throws ArgumentError cut([fill(1, 5); fill(4, 5)], 3) x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true) - @test x == [fill("Q1: [1, 4)", 5); fill("Q3: [4, 4]", 5)] - @test levels(x) == ["Q1: [1, 4)", "Q2: (4, 4)", "Q3: [4, 4]"] + @test x == [fill("Q2: [1, 4)", 5); fill("Q3: [4, 4]", 5)] + @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4)", "Q3: [4, 4]"] end @testset "cut with -0.0" begin @@ -361,51 +370,13 @@ end @test levels(x) == ["Q1: [-Inf, 3.0)", "Q2: [3.0, 5.0]"] end -@testset "cut in corner cases" begin - # In this case, cut(x, quantile(x, (0:36)/36)) in R generates - # an empty "(143,172]" level - # and qcut(x, 36) in Polars misses that level. - # Our approach uses different breaks at 143 and 182 - x = [23, 23, 60, 76, 84, 95, 101, 108, 111, 133, 137, 143, 143, 143, 182, - 206, 214, 241, 258, 262, 280, 289, 303, 312, 321, 323, 352, 353, 354, - 368, 369, 373, 374, 384, 385, 386, 387, 392, 405, 406, 410, 421, 430, - 430, 431, 442, 464, 474, 478, 479, 496, 516, 530, 534, 549, 554, 568, - 575, 589, 590, 591, 592, 595, 596, 603, 625, 632, 632, 638, 640, 640, - 645, 648, 690, 704, 748, 758, 771, 772, 803, 835, 839, 853, 869, 873, - 874, 887, 911, 920, 923, 928, 933, 943, 945, 945, 947, 951, 965, 978, 980] - - @test cut(x, 36) == - ["Q17: [442, 474)", "Q8: [280, 312)", "Q2: [76, 101)", "Q9: [312, 323)", - "Q14: [387, 406)", "Q30: [835, 869)", "Q17: [442, 474)", "Q35: [947, 965)", - "Q2: [76, 101)", "Q11: [354, 373)", "Q32: [887, 923)", "Q12: [373, 385)", - "Q24: [603, 638)", "Q29: [772, 835)", "Q24: [603, 638)", "Q15: [406, 430)", - "Q11: [354, 373)", "Q23: [592, 603)", "Q3: [101, 133)", "Q16: [430, 442)", - "Q34: [933, 947)", "Q27: [648, 748)", "Q28: [748, 772)", "Q28: [748, 772)", - "Q21: [568, 589)", "Q18: [474, 496)", "Q32: [887, 923)", "Q11: [354, 373)", - "Q7: [241, 280)", "Q3: [101, 133)", "Q19: [496, 534)", "Q13: [385, 387)", - "Q36: [965, 980]", "Q33: [923, 933)", "Q16: [430, 442)", "Q36: [965, 980]", - "Q27: [648, 748)", "Q24: [603, 638)", "Q32: [887, 923)", "Q4: [133, 182)", - "Q22: [589, 592)", "Q1: [23, 76)", "Q5: [182, 206)", "Q28: [748, 772)", - "Q30: [835, 869)", "Q31: [869, 887)", "Q22: [589, 592)", "Q15: [406, 430)", - "Q31: [869, 887)", "Q19: [496, 534)", "Q26: [640, 648)", "Q8: [280, 312)", - "Q18: [474, 496)", "Q14: [387, 406)", "Q7: [241, 280)", "Q30: [835, 869)", - "Q3: [101, 133)", "Q9: [312, 323)", "Q4: [133, 182)", "Q24: [603, 638)", - "Q20: [534, 568)", "Q25: [638, 640)", "Q20: [534, 568)", "Q23: [592, 603)", - "Q12: [373, 385)", "Q27: [648, 748)", "Q29: [772, 835)", "Q6: [206, 241)", - "Q34: [933, 947)", "Q16: [430, 442)", "Q26: [640, 648)", "Q15: [406, 430)", - "Q12: [373, 385)", "Q26: [640, 648)", "Q19: [496, 534)", "Q4: [133, 182)", - "Q34: [933, 947)", "Q31: [869, 887)", "Q10: [323, 354)", "Q21: [568, 589)", - "Q4: [133, 182)", "Q7: [241, 280)", "Q35: [947, 965)", "Q14: [387, 406)", - "Q18: [474, 496)", "Q34: [933, 947)", "Q20: [534, 568)", "Q22: [589, 592)", - "Q33: [923, 933)", "Q10: [323, 354)", "Q13: [385, 387)", "Q1: [23, 76)", - "Q36: [965, 980]", "Q2: [76, 101)", "Q23: [592, 603)", "Q6: [206, 241)", - "Q1: [23, 76)", "Q10: [323, 354)", "Q4: [133, 182)", "Q8: [280, 312)"] - - @test cut([0, 1, 1, 1, 1], 2) == - ["Q1: [0, 1)", "Q2: [1, 1]", "Q2: [1, 1]", "Q2: [1, 1]", "Q2: [1, 1]"] - - @test cut([1, 1, 1, 1, 2], 2) == - ["Q1: [1, 2)", "Q1: [1, 2)", "Q1: [1, 2)", "Q1: [1, 2)", "Q2: [2, 2]"] +@testset "cut when quantile falls exactly on a data value" begin + x = cut([11, 14, 43, 54, 54, 56, 73, 79, 84, 84], 3) + @test x == + ["Q1: [11, 54)", "Q1: [11, 54)", "Q1: [11, 54)", + "Q2: [54, 73)", "Q2: [54, 73)", "Q2: [54, 73)", + "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]"] + @test levels(x) == ["Q1: [11, 54)", "Q2: [54, 73)", "Q3: [73, 84]"] end end \ No newline at end of file From 844ff423dbdfd2b94cbcc811563a7e733871378e Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 27 Apr 2025 23:12:40 +0200 Subject: [PATCH 5/7] Small cleanup --- src/extras.jl | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/extras.jl b/src/extras.jl index 1c5f3596..3f901d63 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -253,10 +253,6 @@ function find_breaks(v::AbstractVector, qs::AbstractVector) q = qs[i] end end - # if last values in x are equal to q, breaks were not initialized - for i in i:n - breaks[i] = q - end return breaks end @@ -295,9 +291,7 @@ function cut(x::AbstractArray, ngroups::Integer; throw(ArgumentError("NaN values are not allowed in input vector")) end qs = quantile!(sorted_x, (1:(ngroups-1))/ngroups, sorted=true) - @show qs, min_x, max_x breaks = [min_x; find_breaks(sorted_x, qs); max_x] - @show breaks if !allowempty && !allunique(@view breaks[1:end-1]) throw(ArgumentError("cannot compute $ngroups quantiles due to " * "too many duplicated values in `x`. " * From b54c9fb7d1652c17d8e10d98b4438f3872ea5d0f Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 27 Apr 2025 23:35:42 +0200 Subject: [PATCH 6/7] Fix doctests --- src/extras.jl | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/extras.jl b/src/extras.jl index 3f901d63..11207326 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -77,25 +77,25 @@ julia> cut(-1:0.5:1, [0, 1], extend=true) julia> cut(-1:0.5:1, 2) 5-element CategoricalArray{String,1,UInt32}: - "Q1: [-1.0, 0.5)" - "Q1: [-1.0, 0.5)" - "Q1: [-1.0, 0.5)" - "Q2: [0.5, 1.0]" - "Q2: [0.5, 1.0]" + "Q1: [-1.0, 0.0)" + "Q1: [-1.0, 0.0)" + "Q2: [0.0, 1.0]" + "Q2: [0.0, 1.0]" + "Q2: [0.0, 1.0]" julia> cut(-1:0.5:1, 2, labels=["A", "B"]) 5-element CategoricalArray{String,1,UInt32}: "A" "A" - "A" "B" - "B" + "B" + "B" julia> cut(-1:0.5:1, 2, labels=[-0.5, +0.5]) 5-element CategoricalArray{Float64,1,UInt32}: -0.5 -0.5 - -0.5 + 0.5 0.5 0.5 @@ -106,9 +106,9 @@ julia> cut(-1:0.5:1, 3, labels=fmt) 5-element CategoricalArray{String,1,UInt32}: "grp 1 (-1.0//0.0)" "grp 1 (-1.0//0.0)" - "grp 2 (0.0//1.0)" - "grp 2 (0.0//1.0)" - "grp 3 (1.0//1.0)" + "grp 2 (0.0//0.5)" + "grp 3 (0.5//1.0)" + "grp 3 (0.5//1.0)" ``` """ @inline function cut(x::AbstractArray, breaks::AbstractVector; From a33a85357ad5c92f7c6d6edf99c56a6ac9e1e9da Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 27 Apr 2025 23:37:49 +0200 Subject: [PATCH 7/7] Indentation --- test/15_extras.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/15_extras.jl b/test/15_extras.jl index b995e3bf..af4f79f5 100644 --- a/test/15_extras.jl +++ b/test/15_extras.jl @@ -373,9 +373,9 @@ end @testset "cut when quantile falls exactly on a data value" begin x = cut([11, 14, 43, 54, 54, 56, 73, 79, 84, 84], 3) @test x == - ["Q1: [11, 54)", "Q1: [11, 54)", "Q1: [11, 54)", - "Q2: [54, 73)", "Q2: [54, 73)", "Q2: [54, 73)", - "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]"] + ["Q1: [11, 54)", "Q1: [11, 54)", "Q1: [11, 54)", + "Q2: [54, 73)", "Q2: [54, 73)", "Q2: [54, 73)", + "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]"] @test levels(x) == ["Q1: [11, 54)", "Q2: [54, 73)", "Q3: [73, 84]"] end