diff --git a/Project.toml b/Project.toml index 4de04411..2c345ff7 100644 --- a/Project.toml +++ b/Project.toml @@ -28,7 +28,7 @@ CategoricalArraysStructTypesExt = "StructTypes" [compat] Arrow = "2" -Compat = "3.37, 4" +Compat = "3.47, 4.10" DataAPI = "1.6" JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21" JSON3 = "1.1.2" diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl index e597c344..f3383645 100644 --- a/src/CategoricalArrays.jl +++ b/src/CategoricalArrays.jl @@ -11,10 +11,12 @@ module CategoricalArrays import DataAPI: unwrap export unwrap + using Compat + @compat public default_formatter, numbered_formatter + using DataAPI using Missings using Printf - import Compat # JuliaLang/julia#36810 if VERSION < v"1.5.2" diff --git a/src/extras.jl b/src/extras.jl index 3f27aba6..60f32a64 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -27,17 +27,67 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray, end end +if VERSION >= v"1.10" + const CUT_FMT = Printf.Format("%.*g") +end + """ - default_formatter(from, to, i; leftclosed, rightclosed) + CategoricalArrays.default_formatter(from, to, i::Integer; + leftclosed::Bool, rightclosed::Bool, + sigdigits::Integer) -Provide the default label format for the `cut(x, breaks)` method. +Provide the default label format for the `cut(x, breaks)` method, +which is `"[from, to)"` if `leftclosed` is `true` and `"(from, to)"` otherwise +(with a closing `]` instead of `)` if `rightclosed` is `true`). + +If they are floating point values, `from` and `to` are turned into strings using +`@sprintf("%.*g", sigdigits, value)`; other values are converted with `string`. """ -default_formatter(from, to, i; leftclosed, rightclosed) = - string(leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")") +function default_formatter(from, to, i::Integer; + leftclosed::Bool, rightclosed::Bool, + sigdigits::Integer) + @static if VERSION >= v"1.10" + from_str = from isa AbstractFloat ? 
+ Printf.format(CUT_FMT, sigdigits, from) : + string(from) + to_str = to isa AbstractFloat ? + Printf.format(CUT_FMT, sigdigits, to) : + string(to) + else + from_str = from isa AbstractFloat ? + Printf.format(Printf.Format("%.$(sigdigits)g"), from) : + string(from) + to_str = to isa AbstractFloat ? + Printf.format(Printf.Format("%.$(sigdigits)g"), to) : + string(to) + end + string(leftclosed ? "[" : "(", from_str, ", ", to_str, rightclosed ? "]" : ")") +end + +""" + CategoricalArrays.numbered_formatter(from, to, i::Integer; + leftclosed::Bool, rightclosed::Bool, + sigdigits::Integer) + +Provide the default label format for the `cut(x, ngroups)` method +when `allowempty=true`, which is `"i: [from, to)"` if `leftclosed` +is `true` and `"i: (from, to)"` otherwise (with a closing `]` if `rightclosed`). + +If they are floating point values, `from` and `to` are turned into strings using +`@sprintf("%.*g", sigdigits, value)`; other values are converted with `string`, +as done by [`CategoricalArrays.default_formatter`](@ref), which this function calls. +""" +numbered_formatter(from, to, i::Integer; + leftclosed::Bool, rightclosed::Bool, + sigdigits::Integer) = + string(i, ": ", + default_formatter(from, to, i, leftclosed=leftclosed, rightclosed=rightclosed, + sigdigits=sigdigits)) @doc raw""" cut(x::AbstractArray, breaks::AbstractVector; labels::Union{AbstractVector,Function}, + sigdigits::Integer=3, extend::Union{Bool,Missing}=false, allowempty::Bool=false) Cut a numeric array into intervals at values `breaks` @@ -49,15 +99,25 @@ the last interval, which is closed on both ends, i.e. `[lower, upper]`. If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will also accept them. +!!! note + For floating point data, breaks may be rounded to `sigdigits` significant digits + when generating interval labels, meaning that they may not reflect exactly the cutpoints + used. 
+ # Keyword arguments * `extend::Union{Bool, Missing}=false`: when `false`, an error is raised if some values in `x` fall outside of the breaks; when `true`, breaks are automatically added to include all values in `x`; when `missing`, values outside of the breaks generate `missing` entries. * `labels::Union{AbstractVector, Function}`: a vector of strings, characters - or numbers giving the names to use for - the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates + or numbers giving the names to use for the intervals; or a function + `f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates the labels from the left and right interval boundaries and the group index. Defaults to - `"[from, to)"` (or `"[from, to]"` for the rightmost interval if `extend == true`). + [`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"` + for the rightmost interval if `extend == true`). +* `sigdigits::Integer=3`: the minimum number of significant digits to use in labels. + This value is increased automatically if necessary so that rounded breaks are unique. + Only used for floating point types and when `labels` is a function, in which case it + is passed to it as a keyword argument. 
* `allowempty::Bool=false`: when `false`, an error is raised if some breaks other than the last one appear multiple times, generating empty intervals; when `true`, duplicate breaks are allowed and the intervals they generate are kept as @@ -69,19 +129,19 @@ julia> using CategoricalArrays julia> cut(-1:0.5:1, [0, 1], extend=true) 5-element CategoricalArray{String,1,UInt32}: - "[-1.0, 0.0)" - "[-1.0, 0.0)" - "[0.0, 1.0]" - "[0.0, 1.0]" - "[0.0, 1.0]" + "[-1, 0)" + "[-1, 0)" + "[0, 1]" + "[0, 1]" + "[0, 1]" julia> cut(-1:0.5:1, 2) 5-element CategoricalArray{String,1,UInt32}: - "Q1: [-1.0, 0.0)" - "Q1: [-1.0, 0.0)" - "Q2: [0.0, 1.0]" - "Q2: [0.0, 1.0]" - "Q2: [0.0, 1.0]" + "[-1, 0)" + "[-1, 0)" + "[0, 1]" + "[0, 1]" + "[0, 1]" julia> cut(-1:0.5:1, 2, labels=["A", "B"]) 5-element CategoricalArray{String,1,UInt32}: @@ -114,15 +174,17 @@ julia> cut(-1:0.5:1, 3, labels=fmt) @inline function cut(x::AbstractArray, breaks::AbstractVector; extend::Union{Bool, Missing}=false, labels::Union{AbstractVector{<:SupportedTypes},Function}=default_formatter, + sigdigits::Integer=3, allowempty::Bool=false) - return _cut(x, breaks, extend, labels, allowempty) + return _cut(x, breaks, extend, labels, sigdigits, allowempty) end # Separate function for inferability (thanks to inlining of cut) function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, extend::Union{Bool, Missing}, labels::Union{AbstractVector{<:SupportedTypes},Function}, - allowempty::Bool=false) where {T, N} + sigdigits::Integer, + allowempty::Bool) where {T, N} if !issorted(breaks) breaks = sort(breaks) end @@ -179,21 +241,60 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, end end + # Find minimal number of digits so that distinct breaks remain so + if eltype(breaks) <: AbstractFloat + while true + collision = false + for i in 2:lastindex(breaks) + b1 = breaks[i-1] + b2 = breaks[i] + isequal(b1, b2) && continue + @static if VERSION >= v"1.10" # CUT_FMT is only defined on Julia >= 1.10 + b1_str = Printf.format(CUT_FMT, sigdigits, b1) + b2_str = 
Printf.format(CUT_FMT, sigdigits, b2) + else + b1_str = Printf.format(Printf.Format("%.$(sigdigits)g"), b1) + b2_str = Printf.format(Printf.Format("%.$(sigdigits)g"), b2) + end + if b1_str == b2_str + sigdigits += 1 + collision = true + break + end + end + collision || break # stop only after a full pass without clashing labels + end + end n = length(breaks) n >= 2 || throw(ArgumentError("at least two breaks must be provided when extend is not true")) if labels isa Function from = breaks[1:n-1] to = breaks[2:n] - firstlevel = labels(from[1], to[1], 1, - leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false) + local firstlevel + try + firstlevel = labels(from[1], to[1], 1, + leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false, + sigdigits=sigdigits) + catch err + # Support functions defined before v1.0, where sigdigits did not exist + err isa MethodError || rethrow() # anything else is a genuine formatter error + Base.depwarn("`labels` function is now required to accept a `sigdigits` keyword argument", :cut) + labels_orig = labels + labels = (from, to, i; leftclosed, rightclosed, sigdigits) -> + labels_orig(from, to, i; leftclosed, rightclosed) + firstlevel = labels_orig(from[1], to[1], 1, + leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false) + end levs = Vector{typeof(firstlevel)}(undef, n-1) levs[1] = firstlevel for i in 2:n-2 levs[i] = labels(from[i], to[i], i, - leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false) + leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false, + sigdigits=sigdigits) end levs[end] = labels(from[end], to[end], n-1, - leftclosed=true, rightclosed=true) + leftclosed=true, rightclosed=true, + sigdigits=sigdigits) else length(labels) == n-1 || throw(ArgumentError("labels must be of length $(n-1), but got length $(length(labels))")) @@ -213,14 +314,6 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, CategoricalArray{S, N}(refs, pool) end -""" - quantile_formatter(from, to, i; leftclosed, rightclosed) - -Provide the default label format for the `cut(x, ngroups)` method. 
-""" -quantile_formatter(from, to, i; leftclosed, rightclosed) = - string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")") - """ Find first value in (sorted) `v` which is greater than or equal to each quantile in (sorted) `qs`. @@ -247,6 +340,7 @@ end """ cut(x::AbstractArray, ngroups::Integer; labels::Union{AbstractVector{<:AbstractString},Function}, + sigdigits::Integer=3, allowempty::Bool=false) Cut a numeric array into `ngroups` quantiles. @@ -257,19 +351,32 @@ but breaks are taken from actual data values instead of estimated quantiles. If `x` contains `missing` values, they are automatically skipped when computing quantiles. +!!! note + For floating point data, breaks may be rounded to `sigdigits` significant digits + when generating interval labels, meaning that they may not reflect exactly the cutpoints + used. + # Keyword arguments * `labels::Union{AbstractVector, Function}`: a vector of strings, characters - or numbers giving the names to use for - the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates + or numbers giving the names to use for the intervals; or a function + `f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates the labels from the left and right interval boundaries and the group index. Defaults to - `"Qi: [from, to)"` (or `"Qi: [from, to]"` for the rightmost interval). + [`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"` + for the rightmost interval) if `allowempty=false`, otherwise to + [`CategoricalArrays.numbered_formatter`](@ref), which prefixes the label with the quantile + number to ensure uniqueness. +* `sigdigits::Integer=3`: the minimum number of significant digits to use when rounding + breaks for inclusion in generated labels. This value is increased automatically if necessary + so that rounded breaks are unique. 
Only used for floating point types and when `labels` is a + function, in which case it is passed to it as a keyword argument. * `allowempty::Bool=false`: when `false`, an error is raised if some quantiles breakpoints other than the last one are equal, generating empty intervals; when `true`, duplicate breaks are allowed and the intervals they generate are kept as unused levels (but duplicate labels are not allowed). """ function cut(x::AbstractArray, ngroups::Integer; - labels::Union{AbstractVector{<:SupportedTypes},Function}=quantile_formatter, + labels::Union{AbstractVector{<:SupportedTypes},Function,Nothing}=nothing, + sigdigits::Integer=3, allowempty::Bool=false) ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)")) sorted_x = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x) @@ -286,5 +393,8 @@ function cut(x::AbstractArray, ngroups::Integer; "Pass `allowempty=true` to allow empty quantiles or " * "choose a lower value for `ngroups`.")) end - cut(x, breaks; labels=labels, allowempty=allowempty) + if labels === nothing + labels = allowempty ? 
numbered_formatter : default_formatter + end + return cut(x, breaks; labels=labels, sigdigits=sigdigits, allowempty=allowempty) end diff --git a/test/15_extras.jl b/test/15_extras.jl index af4f79f5..5df7860b 100644 --- a/test/15_extras.jl +++ b/test/15_extras.jl @@ -93,10 +93,10 @@ const ≅ = isequal @test levels(x) == ["b", "a"] x = @inferred cut(Matrix{Union{Float64, T}}([-1.1 3.0; 1.456 10.394]), [-2.134, 3.0, 12.5]) - @test x == ["[-2.134, 3.0)" "[3.0, 12.5]"; "[-2.134, 3.0)" "[3.0, 12.5]"] + @test x == ["[-2.13, 3)" "[3, 12.5]"; "[-2.13, 3)" "[3, 12.5]"] @test isa(x, CategoricalMatrix{Union{String, T}}) @test isordered(x) - @test levels(x) == ["[-2.134, 3.0)", "[3.0, 12.5]"] + @test levels(x) == ["[-2.13, 3)", "[3, 12.5]"] labels = 0:2:8 x = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels) @@ -127,18 +127,18 @@ end @testset "cut([5, 4, 3, 2], 2)" begin x = @inferred cut([5, 4, 3, 2], 2) - @test x == ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", "Q1: [2, 4)"] + @test x == ["[4, 5]", "[4, 5]", "[2, 4)", "[2, 4)"] @test isa(x, CategoricalArray) @test isordered(x) - @test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"] + @test levels(x) == ["[2, 4)", "[4, 5]"] end @testset "cut(x, n) with missing values" begin x = @inferred cut([5, 4, 3, missing, 2], 2) - @test x ≅ ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", missing, "Q1: [2, 4)"] + @test x ≅ ["[4, 5]", "[4, 5]", "[2, 4)", missing, "[2, 4)"] @test isa(x, CategoricalArray) @test isordered(x) - @test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"] + @test levels(x) == ["[2, 4)", "[4, 5]"] end @testset "cut(x, n) with invalid n" begin @@ -147,7 +147,7 @@ end end @testset "cut with formatter function" begin - my_formatter(from, to, i; leftclosed, rightclosed) = "$i: $from -- $to" + my_formatter(from, to, i; leftclosed, rightclosed, sigdigits) = "$i: $from -- $to" x = 0.15:0.20:0.95 p = [0, 0.4, 0.8, 1.0] @@ -155,20 +155,24 @@ end a = @inferred cut(x, p, labels=my_formatter) @test a == ["1: 0.0 -- 0.4", "1: 0.0 -- 0.4", 
"2: 0.4 -- 0.8", "2: 0.4 -- 0.8", "3: 0.8 -- 1.0"] + my_old_formatter(from, to, i; leftclosed, rightclosed) = "$i: $from -- $to" + a = @test_deprecated r"`labels`.*" cut(x, p, labels=my_old_formatter) + @test a == ["1: 0.0 -- 0.4", "1: 0.0 -- 0.4", "2: 0.4 -- 0.8", "2: 0.4 -- 0.8", "3: 0.8 -- 1.0"] + # GH 274 - my_formatter_2(from, to, i; leftclosed, rightclosed) = "$i: $(from+1) -- $(to+1)" + my_formatter_2(from, to, i; leftclosed, rightclosed, sigdigits) = "$i: $(from+1) -- $(to+1)" a = @inferred cut(x, p, labels=my_formatter_2) @test a == ["1: 1.0 -- 1.4", "1: 1.0 -- 1.4", "2: 1.4 -- 1.8", "2: 1.4 -- 1.8", "3: 1.8 -- 2.0"] for T in (Union{}, Missing) - labels = (from, to, i; leftclosed, rightclosed) -> (to+from)/2 + labels = (from, to, i; leftclosed, rightclosed, sigdigits) -> (to+from)/2 a = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels) @test a == [1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0] @test isa(a, CategoricalVector{Union{Float64, T}}) @test isordered(a) @test levels(a) == [1.0, 3.0, 5.0, 7.0, 9.0] - labels = (from, to, i; leftclosed, rightclosed) -> "$((to+from)/2)" + labels = (from, to, i; leftclosed, rightclosed, sigdigits) -> "$((to+from)/2)" a = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels) @test a == string.([1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0]) @test isa(a, CategoricalVector{Union{String, T}}) @@ -188,8 +192,8 @@ end @test_throws ArgumentError cut(x, [0, 0.1, 0.1, 10]) @test_throws ArgumentError cut(x, 10) y = cut(x, [0, 0.1, 10, 10]) - @test y == [fill("[0.0, 0.1)", 10); fill("[0.1, 10.0)", 10)] - @test levels(y) == ["[0.0, 0.1)", "[0.1, 10.0)", "[10.0, 10.0]"] + @test y == [fill("[0, 0.1)", 10); fill("[0.1, 10)", 10)] + @test levels(y) == ["[0, 0.1)", "[0.1, 10)", "[10, 10]"] @test_throws ArgumentError cut(1:10, [1, 5, 5, 11]) y = cut(1:10, [1, 5, 5, 11], allowempty=true) @@ -251,55 +255,55 @@ end @test_throws ArgumentError cut(1:8, 0:2:10, labels=[0, 1, 1, 2, 3]) @test_throws ArgumentError cut(1:8, [0, 2, 2, 
6, 8, 10], labels=[0, 1, 1, 2, 3], allowempty=true) - fmt = (from, to, i; leftclosed, rightclosed) -> (i % 2 == 0 ? to : 0.0) + fmt = (from, to, i; leftclosed, rightclosed, sigdigits) -> (i % 2 == 0 ? to : 0.0) @test_throws ArgumentError cut(1:8, 0:2:10, labels=fmt) @test_throws ArgumentError cut([fill(1, 10); 4], 2) x = cut([fill(1, 10); 4], 2, allowempty=true) - @test unique(x) == ["Q2: [1, 4]"] - @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4]"] + @test unique(x) == ["2: [1, 4]"] + @test levels(x) == ["1: (1, 1)", "2: [1, 4]"] @test_throws ArgumentError cut([fill(1, 10); 4], 3) x = cut([fill(1, 10); 4], 3, allowempty=true) - @test unique(x) == ["Q3: [1, 4]"] - @test levels(x) == ["Q1: (1, 1)", "Q2: (1, 1)", "Q3: [1, 4]"] + @test unique(x) == ["3: [1, 4]"] + @test levels(x) == ["1: (1, 1)", "2: (1, 1)", "3: [1, 4]"] x = cut([fill(4, 10); 1], 2) - @test x == [fill("Q2: [4, 4]", 10); "Q1: [1, 4)"] - @test levels(x) == ["Q1: [1, 4)"; "Q2: [4, 4]"] + @test x == [fill("[4, 4]", 10); "[1, 4)"] + @test levels(x) == ["[1, 4)"; "[4, 4]"] @test_throws ArgumentError cut([fill(4, 10); 1], 3) x = cut([fill(4, 10); 1], 3, allowempty=true) - @test x == [fill("Q3: [4, 4]", 10); "Q1: [1, 4)"] - @test levels(x) == ["Q1: [1, 4)", "Q2: (4, 4)", "Q3: [4, 4]"] + @test x == [fill("3: [4, 4]", 10); "1: [1, 4)"] + @test levels(x) == ["1: [1, 4)", "2: (4, 4)", "3: [4, 4]"] x = cut([fill(1, 5); fill(4, 5)], 2) - @test x == [fill("Q1: [1, 4)", 5); fill("Q2: [4, 4]", 5)] - @test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"] + @test x == [fill("[1, 4)", 5); fill("[4, 4]", 5)] + @test levels(x) == ["[1, 4)", "[4, 4]"] @test_throws ArgumentError cut([fill(1, 5); fill(4, 5)], 3) x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true) - @test x == [fill("Q2: [1, 4)", 5); fill("Q3: [4, 4]", 5)] - @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4)", "Q3: [4, 4]"] + @test x == [fill("2: [1, 4)", 5); fill("3: [4, 4]", 5)] + @test levels(x) == ["1: (1, 1)", "2: [1, 4)", "3: [4, 4]"] end @testset "cut with -0.0" 
begin x = cut([-0.0, 0.0, 0.0, -0.0], 2) - @test x == ["Q1: [-0.0, 0.0)", "Q2: [0.0, 0.0]", "Q2: [0.0, 0.0]", "Q1: [-0.0, 0.0)"] - @test levels(x) == ["Q1: [-0.0, 0.0)", "Q2: [0.0, 0.0]"] + @test x == ["[-0, 0)", "[0, 0]", "[0, 0]", "[-0, 0)"] + @test levels(x) == ["[-0, 0)", "[0, 0]"] x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0, 0.0, 0.0]) - @test x == ["[-0.0, 0.0)", "[0.0, 0.0]", "[0.0, 0.0]", "[-0.0, 0.0)"] - @test levels(x) == ["[-0.0, 0.0)", "[0.0, 0.0]"] + @test x == ["[-0, 0)", "[0, 0]", "[0, 0]", "[-0, 0)"] + @test levels(x) == ["[-0, 0)", "[0, 0]"] x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0, 0.0]) - @test x == fill("[-0.0, 0.0]", 4) - @test levels(x) == ["[-0.0, 0.0]"] + @test x == fill("[-0, 0]", 4) + @test levels(x) == ["[-0, 0]"] x = cut([-0.0, 0.0, 0.0, -0.0], [0.0], extend=true) - @test x == fill("[-0.0, 0.0]", 4) - @test levels(x) == ["[-0.0, 0.0]"] + @test x == fill("[-0, 0]", 4) + @test levels(x) == ["[-0, 0]"] x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0], extend=true) - @test x == fill("[-0.0, 0.0]", 4) - @test levels(x) == ["[-0.0, 0.0]"] + @test x == fill("[-0, 0]", 4) + @test levels(x) == ["[-0, 0]"] x = cut([-0.0, 0.0, 0.0, -0.0], 2, labels=[-0.0, 0.0]) @test x == [-0.0, 0.0, 0.0, -0.0] @@ -336,7 +340,7 @@ end @test levels(x) == [-0.0, 0.0] x = @inferred cut(-1:0.5:1, [0, 1], extend=true) - @test x == ["[-1.0, 0.0)", "[-1.0, 0.0)", "[0.0, 1.0]", "[0.0, 1.0]", "[0.0, 1.0]"] + @test x == ["[-1, 0)", "[-1, 0)", "[0, 1]", "[0, 1]", "[0, 1]"] end @testset "cut with NaN and Inf" begin @@ -346,37 +350,77 @@ end @test_throws ArgumentError("NaN values are not allowed in breaks") cut([1, 2], [1, NaN]) x = cut([1, Inf], [1], extend=true) - @test x ≅ ["[1.0, Inf]", "[1.0, Inf]"] - @test levels(x) == ["[1.0, Inf]"] + @test x ≅ ["[1, Inf]", "[1, Inf]"] + @test levels(x) == ["[1, Inf]"] x = cut([1, -Inf], [1], extend=true) - @test x ≅ ["[-Inf, 1.0]", "[-Inf, 1.0]"] - @test levels(x) == ["[-Inf, 1.0]"] + @test x ≅ ["[-Inf, 1]", "[-Inf, 1]"] + @test levels(x) == ["[-Inf, 1]"] 
x = cut([1:5; Inf], [1, 2, Inf]) - @test x ≅ ["[1.0, 2.0)"; fill("[2.0, Inf]", 5)] - @test levels(x) == ["[1.0, 2.0)", "[2.0, Inf]"] + @test x ≅ ["[1, 2)"; fill("[2, Inf]", 5)] + @test levels(x) == ["[1, 2)", "[2, Inf]"] x = cut([1:5; -Inf], [-Inf, 2, 5]) - @test x ≅ ["[-Inf, 2.0)"; fill("[2.0, 5.0]", 4); "[-Inf, 2.0)"] - @test levels(x) == ["[-Inf, 2.0)", "[2.0, 5.0]"] + @test x ≅ ["[-Inf, 2)"; fill("[2, 5]", 4); "[-Inf, 2)"] + @test levels(x) == ["[-Inf, 2)", "[2, 5]"] x = cut([1:5; Inf], 2) - @test x ≅ [fill("Q1: [1.0, 4.0)", 3); fill("Q2: [4.0, Inf]", 3)] - @test levels(x) == ["Q1: [1.0, 4.0)", "Q2: [4.0, Inf]"] + @test x ≅ [fill("[1, 4)", 3); fill("[4, Inf]", 3)] + @test levels(x) == ["[1, 4)", "[4, Inf]"] x = cut([1:5; -Inf], 2) - @test x ≅ [fill("Q1: [-Inf, 3.0)", 2); fill("Q2: [3.0, 5.0]", 3); "Q1: [-Inf, 3.0)"] - @test levels(x) == ["Q1: [-Inf, 3.0)", "Q2: [3.0, 5.0]"] + @test x ≅ [fill("[-Inf, 3)", 2); fill("[3, 5]", 3); "[-Inf, 3)"] + @test levels(x) == ["[-Inf, 3)", "[3, 5]"] end @testset "cut when quantile falls exactly on a data value" begin x = cut([11, 14, 43, 54, 54, 56, 73, 79, 84, 84], 3) @test x == - ["Q1: [11, 54)", "Q1: [11, 54)", "Q1: [11, 54)", - "Q2: [54, 73)", "Q2: [54, 73)", "Q2: [54, 73)", - "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]"] - @test levels(x) == ["Q1: [11, 54)", "Q2: [54, 73)", "Q3: [73, 84]"] + ["[11, 54)", "[11, 54)", "[11, 54)", + "[54, 73)", "[54, 73)", "[54, 73)", + "[73, 84]", "[73, 84]", "[73, 84]", "[73, 84]"] + @test levels(x) == ["[11, 54)", "[54, 73)", "[73, 84]"] +end + +@testset "cut computation of sigdigits" begin + x = cut([1.2, 1.3, 2], 2) + @test levels(x) == ["[1.2, 1.3)", "[1.3, 2]"] + + x = cut([1.0, 2.0, 3.0], 2) + @test levels(x) == ["[1, 2)", "[2, 3]"] + + x = cut([1.00002, 1.00003, 2], 2) + @test levels(x) == ["[1.00002, 1.00003)", "[1.00003, 2]"] + + x = cut([1.00002, 1.00003, 1.00005, 2], 2) + @test levels(x) == ["[1, 1.0001)", "[1.0001, 2]"] + + x = cut([1.00001, 1.00002, 1.00002, 
2], 2) + @test levels(x) == ["[1.00001, 1.00002)", "[1.00002, 2]"] + + x = cut([1.00001, 1.00003, 1.1, 2], 2) + @test levels(x) == ["[1, 1.1)", "[1.1, 2]"] + + # @sprintf with %g uses scientific notation even in some cases + # where classic notation would be shorter + x = cut([1.0, 10.0, 100.0, 1000.0], [1.0, 10.0, 100.0, 1000.0]) + @test levels(x) == ["[1, 10)", "[10, 100)", "[100, 1e+03]"] + # But integers are rendered using plain `string` + x = cut([1, 10, 100], [1, 10, 100, 1000]) + @test levels(x) == ["[1, 10)", "[10, 100)", "[100, 1000]"] + + # Extreme case + x = cut([8.85718832925723e-7, 8.572446994052413e-7, 1.40217695121027e-7, 8.966449714804087e-7, + 3.070384341319470e-7, 3.070384341319471e-7, 1.8520709563325888e-7, 5.630461710066611e-7, + 6.781422109070843e-7, 4.776113711396994e-7, 0.2538909094146984, 0.5249665525921473, + 0.8321957380046366, 0.9648282851978118, 0.36084175275805797, 0.7851054639425253, + 0.6875195857202754, 0.614940093507575, 0.6224944997292978, 0.6055683461790675, + 5.349085340927365e11, 1.3471583229449602e11, 6.538893396835975e11, 4.826316844547661e11, + 8.803607035550856e11, 1.8174694671397316e10, 1.6709745443719125e11, 3.2050577954311835e11, + 1.6134999167460663e11, 7.396308745225059e11], 3) + @test levels(x) == ["[1.4e-07, 0.254)", "[0.254, 1.82e+10)", "[1.82e+10, 8.8e+11]"] + end end \ No newline at end of file