Skip to content

Commit cac54b2

Browse files
committed
Yet another approach
1 parent 9dd935b commit cac54b2

File tree

2 files changed

+30
-58
lines changed

2 files changed

+30
-58
lines changed

src/extras.jl

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,10 @@ Provide the default label format for the `cut(x, ngroups)` method.
233233
quantile_formatter(from, to, i; leftclosed, rightclosed) =
234234
string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")")
235235

236-
"""Find first value in data which is greater than each quantile in ``qs``."""
236+
"""
237+
Find first value in (sorted) `v` which is greater than or equal to each quantile
238+
in (sorted) `qs`.
239+
"""
237240
function find_breaks(v::AbstractVector, qs::AbstractVector)
238241
n = length(qs)
239242
breaks = similar(v, n)
@@ -242,7 +245,8 @@ function find_breaks(v::AbstractVector, qs::AbstractVector)
242245
i = 1
243246
q = qs[1]
244247
@inbounds for x in v
245-
if x > q
248+
# Use isless and isequal to differentiate -0.0 from 0.0
249+
if isless(q, x) || isequal(q, x)
246250
breaks[i] = x
247251
i += 1
248252
i > n && break
@@ -253,7 +257,6 @@ function find_breaks(v::AbstractVector, qs::AbstractVector)
253257
for i in i:n
254258
breaks[i] = q
255259
end
256-
@show breaks
257260
return breaks
258261
end
259262

@@ -264,10 +267,8 @@ end
264267
265268
Cut a numeric array into `ngroups` quantiles.
266269
267-
Cutpoints differ from those returned by `Statistics.quantile` as they are suited
268-
for intervals closed on the left and taken from actual values in `x`. However,
269-
group assignments are identical to those which would be obtained with type 1
270-
quantiles if intervals were closed on the right.
270+
This is equivalent to `cut(x, quantile(x, (0:ngroups)/ngroups))`,
271+
but breaks are taken from actual data values instead of estimated quantiles.
271272
272273
If `x` contains `missing` values, they are automatically skipped when computing
273274
quantiles.

test/15_extras.jl

Lines changed: 22 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -254,21 +254,30 @@ end
254254
fmt = (from, to, i; leftclosed, rightclosed) -> (i % 2 == 0 ? to : 0.0)
255255
@test_throws ArgumentError cut(1:8, 0:2:10, labels=fmt)
256256

257-
x = cut([fill(1, 10); 4], 2)
258-
@test x == [fill("Q1: [1, 4)", 10); "Q2: [4, 4]"]
259-
@test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"]
257+
@test_throws ArgumentError cut([fill(1, 10); 4], 2)
258+
x = cut([fill(1, 10); 4], 2, allowempty=true)
259+
@test unique(x) == ["Q2: [1, 4]"]
260+
@test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4]"]
260261
@test_throws ArgumentError cut([fill(1, 10); 4], 3)
261262
x = cut([fill(1, 10); 4], 3, allowempty=true)
262-
@test x == [fill("Q1: [1, 4)", 10); "Q3: [4, 4]"]
263+
@test unique(x) == ["Q3: [1, 4]"]
264+
@test levels(x) == ["Q1: (1, 1)", "Q2: (1, 1)", "Q3: [1, 4]"]
265+
266+
x = cut([fill(4, 10); 1], 2)
267+
@test x == [fill("Q2: [4, 4]", 10); "Q1: [1, 4)"]
268+
@test levels(x) == ["Q1: [1, 4)"; "Q2: [4, 4]"]
269+
@test_throws ArgumentError cut([fill(4, 10); 1], 3)
270+
x = cut([fill(4, 10); 1], 3, allowempty=true)
271+
@test x == [fill("Q3: [4, 4]", 10); "Q1: [1, 4)"]
263272
@test levels(x) == ["Q1: [1, 4)", "Q2: (4, 4)", "Q3: [4, 4]"]
264273

265274
x = cut([fill(1, 5); fill(4, 5)], 2)
266275
@test x == [fill("Q1: [1, 4)", 5); fill("Q2: [4, 4]", 5)]
267276
@test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"]
268277
@test_throws ArgumentError cut([fill(1, 5); fill(4, 5)], 3)
269278
x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true)
270-
@test x == [fill("Q1: [1, 4)", 5); fill("Q3: [4, 4]", 5)]
271-
@test levels(x) == ["Q1: [1, 4)", "Q2: (4, 4)", "Q3: [4, 4]"]
279+
@test x == [fill("Q2: [1, 4)", 5); fill("Q3: [4, 4]", 5)]
280+
@test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4)", "Q3: [4, 4]"]
272281
end
273282

274283
@testset "cut with -0.0" begin
@@ -361,51 +370,13 @@ end
361370
@test levels(x) == ["Q1: [-Inf, 3.0)", "Q2: [3.0, 5.0]"]
362371
end
363372

364-
@testset "cut in corner cases" begin
365-
# In this case, cut(x, quantile(x, (0:36)/36)) in R generates
366-
# an empty "(143,172]" level
367-
# and qcut(x, 36) in Polars misses that level.
368-
# Our approach uses different breaks at 143 and 182
369-
x = [23, 23, 60, 76, 84, 95, 101, 108, 111, 133, 137, 143, 143, 143, 182,
370-
206, 214, 241, 258, 262, 280, 289, 303, 312, 321, 323, 352, 353, 354,
371-
368, 369, 373, 374, 384, 385, 386, 387, 392, 405, 406, 410, 421, 430,
372-
430, 431, 442, 464, 474, 478, 479, 496, 516, 530, 534, 549, 554, 568,
373-
575, 589, 590, 591, 592, 595, 596, 603, 625, 632, 632, 638, 640, 640,
374-
645, 648, 690, 704, 748, 758, 771, 772, 803, 835, 839, 853, 869, 873,
375-
874, 887, 911, 920, 923, 928, 933, 943, 945, 945, 947, 951, 965, 978, 980]
376-
377-
@test cut(x, 36) ==
378-
["Q17: [442, 474)", "Q8: [280, 312)", "Q2: [76, 101)", "Q9: [312, 323)",
379-
"Q14: [387, 406)", "Q30: [835, 869)", "Q17: [442, 474)", "Q35: [947, 965)",
380-
"Q2: [76, 101)", "Q11: [354, 373)", "Q32: [887, 923)", "Q12: [373, 385)",
381-
"Q24: [603, 638)", "Q29: [772, 835)", "Q24: [603, 638)", "Q15: [406, 430)",
382-
"Q11: [354, 373)", "Q23: [592, 603)", "Q3: [101, 133)", "Q16: [430, 442)",
383-
"Q34: [933, 947)", "Q27: [648, 748)", "Q28: [748, 772)", "Q28: [748, 772)",
384-
"Q21: [568, 589)", "Q18: [474, 496)", "Q32: [887, 923)", "Q11: [354, 373)",
385-
"Q7: [241, 280)", "Q3: [101, 133)", "Q19: [496, 534)", "Q13: [385, 387)",
386-
"Q36: [965, 980]", "Q33: [923, 933)", "Q16: [430, 442)", "Q36: [965, 980]",
387-
"Q27: [648, 748)", "Q24: [603, 638)", "Q32: [887, 923)", "Q4: [133, 182)",
388-
"Q22: [589, 592)", "Q1: [23, 76)", "Q5: [182, 206)", "Q28: [748, 772)",
389-
"Q30: [835, 869)", "Q31: [869, 887)", "Q22: [589, 592)", "Q15: [406, 430)",
390-
"Q31: [869, 887)", "Q19: [496, 534)", "Q26: [640, 648)", "Q8: [280, 312)",
391-
"Q18: [474, 496)", "Q14: [387, 406)", "Q7: [241, 280)", "Q30: [835, 869)",
392-
"Q3: [101, 133)", "Q9: [312, 323)", "Q4: [133, 182)", "Q24: [603, 638)",
393-
"Q20: [534, 568)", "Q25: [638, 640)", "Q20: [534, 568)", "Q23: [592, 603)",
394-
"Q12: [373, 385)", "Q27: [648, 748)", "Q29: [772, 835)", "Q6: [206, 241)",
395-
"Q34: [933, 947)", "Q16: [430, 442)", "Q26: [640, 648)", "Q15: [406, 430)",
396-
"Q12: [373, 385)", "Q26: [640, 648)", "Q19: [496, 534)", "Q4: [133, 182)",
397-
"Q34: [933, 947)", "Q31: [869, 887)", "Q10: [323, 354)", "Q21: [568, 589)",
398-
"Q4: [133, 182)", "Q7: [241, 280)", "Q35: [947, 965)", "Q14: [387, 406)",
399-
"Q18: [474, 496)", "Q34: [933, 947)", "Q20: [534, 568)", "Q22: [589, 592)",
400-
"Q33: [923, 933)", "Q10: [323, 354)", "Q13: [385, 387)", "Q1: [23, 76)",
401-
"Q36: [965, 980]", "Q2: [76, 101)", "Q23: [592, 603)", "Q6: [206, 241)",
402-
"Q1: [23, 76)", "Q10: [323, 354)", "Q4: [133, 182)", "Q8: [280, 312)"]
403-
404-
@test cut([0, 1, 1, 1, 1], 2) ==
405-
["Q1: [0, 1)", "Q2: [1, 1]", "Q2: [1, 1]", "Q2: [1, 1]", "Q2: [1, 1]"]
406-
407-
@test cut([1, 1, 1, 1, 2], 2) ==
408-
["Q1: [1, 2)", "Q1: [1, 2)", "Q1: [1, 2)", "Q1: [1, 2)", "Q2: [2, 2]"]
373+
@testset "cut when quantile falls exactly on a data value" begin
374+
x = cut([11, 14, 43, 54, 54, 56, 73, 79, 84, 84], 3)
375+
@test x ==
376+
["Q1: [11, 54)", "Q1: [11, 54)", "Q1: [11, 54)",
377+
"Q2: [54, 73)", "Q2: [54, 73)", "Q2: [54, 73)",
378+
"Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]"]
379+
@test levels(x) == ["Q1: [11, 54)", "Q2: [54, 73)", "Q3: [73, 84]"]
409380
end
410381

411382
end

0 commit comments

Comments
 (0)