diff --git a/src/extras.jl b/src/extras.jl index 2afcef38..11207326 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -42,8 +42,8 @@ default_formatter(from, to, i; leftclosed, rightclosed) = Cut a numeric array into intervals at values `breaks` and return an ordered `CategoricalArray` indicating -the interval into which each entry falls. Intervals are of the form `[lower, upper)`, -i.e. the lower bound is included and the upper bound is excluded, except +the interval into which each entry falls. Intervals are of the form `[lower, upper)` +(closed on the left), i.e. the lower bound is included and the upper bound is excluded, except the last interval, which is closed on both ends, i.e. `[lower, upper]`. If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will @@ -81,7 +81,7 @@ julia> cut(-1:0.5:1, 2) "Q1: [-1.0, 0.0)" "Q2: [0.0, 1.0]" "Q2: [0.0, 1.0]" - "Q2: [0.0, 1.0]" + "Q2: [0.0, 1.0]" julia> cut(-1:0.5:1, 2, labels=["A", "B"]) 5-element CategoricalArray{String,1,UInt32}: @@ -89,7 +89,7 @@ julia> cut(-1:0.5:1, 2, labels=["A", "B"]) "A" "B" "B" - "B" + "B" julia> cut(-1:0.5:1, 2, labels=[-0.5, +0.5]) 5-element CategoricalArray{Float64,1,UInt32}: @@ -104,11 +104,11 @@ fmt (generic function with 1 method) julia> cut(-1:0.5:1, 3, labels=fmt) 5-element CategoricalArray{String,1,UInt32}: - "grp 1 (-1.0//-0.3333333333333335)" - "grp 1 (-1.0//-0.3333333333333335)" - "grp 2 (-0.3333333333333335//0.33333333333333326)" - "grp 3 (0.33333333333333326//1.0)" - "grp 3 (0.33333333333333326//1.0)" + "grp 1 (-1.0//0.0)" + "grp 1 (-1.0//0.0)" + "grp 2 (0.0//0.5)" + "grp 3 (0.5//1.0)" + "grp 3 (0.5//1.0)" ``` """ @inline function cut(x::AbstractArray, breaks::AbstractVector; @@ -233,12 +233,38 @@ Provide the default label format for the `cut(x, ngroups)` method. quantile_formatter(from, to, i; leftclosed, rightclosed) = string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? 
"""
    find_breaks(v::AbstractVector, qs::AbstractVector)

Find the first value in (sorted) `v` which is greater than or equal to each quantile
in (sorted) `qs`.

Return a vector created with `similar(v, length(qs))` whose `i`-th entry is the first
element `x` of `v` such that `qs[i]` is less than or equal to `x`. Comparisons use
`isless` and `isequal`, so `-0.0` is treated as smaller than `0.0`. Several consecutive
quantiles may map to the same element of `v`. `qs` is indexed from 1.

Throw an `ArgumentError` if some quantile is greater than every element of `v`
(this includes the case where `v` is empty and `qs` is not).
"""
function find_breaks(v::AbstractVector, qs::AbstractVector)
    n = length(qs)
    breaks = similar(v, n)
    n == 0 && return breaks

    i = 1
    for x in v
        # One data value may satisfy several consecutive quantiles (e.g. when `v` has
        # fewer distinct values than `qs` has entries), so consume all of them here
        # instead of at most one quantile per element of `v`.
        # Use isless and isequal to differentiate -0.0 from 0.0
        while i <= n && (isless(qs[i], x) || isequal(qs[i], x))
            breaks[i] = x
            i += 1
        end
        i > n && break
    end
    # Never hand back uninitialized entries of `breaks`
    i > n || throw(ArgumentError("quantile $(qs[i]) is greater than all values in `v`"))
    return breaks
end
# Quantile-based `cut` on a small integer vector: the expected labels use break points
# taken from the data values themselves (integers), not interpolated quantiles.
@testset "cut([5, 4, 3, 2], 2)" begin
    expected_levels = ["Q1: [2, 4)", "Q2: [4, 5]"]
    result = @inferred cut([5, 4, 3, 2], 2)
    @test result == ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", "Q1: [2, 4)"]
    @test result isa CategoricalArray
    @test isordered(result)
    @test levels(result) == expected_levels
end

# Same input with one `missing` entry: the missing value is expected to stay missing
# in the output while the labels and levels are unchanged.
@testset "cut(x, n) with missing values" begin
    expected_levels = ["Q1: [2, 4)", "Q2: [4, 5]"]
    result = @inferred cut([5, 4, 3, missing, 2], 2)
    @test result ≅ ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", missing, "Q1: [2, 4)"]
    @test result isa CategoricalArray
    @test isordered(result)
    @test levels(result) == expected_levels
end
# Three-group quantile cut where the expected break points (54 and 73) are values that
# occur in the input itself.
@testset "cut when quantile falls exactly on a data value" begin
    data = [11, 14, 43, 54, 54, 56, 73, 79, 84, 84]
    low, mid, high = "Q1: [11, 54)", "Q2: [54, 73)", "Q3: [73, 84]"
    result = cut(data, 3)
    # First three values fall in Q1, next three in Q2, last four in Q3
    @test result == [fill(low, 3); fill(mid, 3); fill(high, 4)]
    @test levels(result) == [low, mid, high]
end