Skip to content

Commit 7fca6e8

Browse files
authored
Add quantilerank and percentilerank functions (#741)
1 parent 4f02efa commit 7fca6e8

File tree

4 files changed

+224
-3
lines changed

4 files changed

+224
-3
lines changed

docs/src/scalarstats.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ iqr
6969
nquantile
7070
quantile
7171
Statistics.median(v::StatsBase.RealVector, w::AbstractWeights{<:Real})
72+
quantilerank
73+
percentilerank
7274
```
7375

7476
## Mode and Modes

src/StatsBase.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,10 @@ export
8181
zscore, # compute Z-scores
8282
zscore!, # compute Z-scores inplace or to a pre-allocated array
8383

84-
percentile, # quantile using percentage (instead of fraction) as argument
85-
nquantile, # quantiles at [0:n]/n
84+
percentile, # quantile using percentage (instead of fraction) as argument
85+
nquantile, # quantiles at [0:n]/n
86+
quantilerank, # quantile-position (0-1) of a value relative to a collection
87+
percentilerank, # percentile-position (0-100) of a value relative to a collection
8688

8789
span, # The range minimum(x):maximum(x)
8890
variation, # ratio of standard deviation to mean

src/scalarstats.jl

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,166 @@ returns a vector of quantiles, respectively at `[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]`.
226226
"""
227227
function nquantile(x, n::Integer)
    # Probabilities 0/n, 1/n, ..., n/n at which the quantiles of `x` are evaluated.
    probs = (0:n) / n
    return quantile(x, probs)
end
228228

229+
"""
230+
quantilerank(itr, value; method=:inc)
231+
232+
Compute the quantile position in the [0, 1] interval of `value` relative to collection `itr`.
233+
234+
Different definitions can be chosen via the `method` keyword argument.
235+
Let `count_less` be the number of elements of `itr` that are less than `value`,
236+
`count_equal` the number of elements of `itr` that are equal to `value`, `n` the length of `itr`,
237+
`greatest_smaller` the highest value below `value` and `smallest_greater` the lowest value above `value`.
238+
Then `method` supports the following definitions:
239+
240+
- `:inc` (default): Return a value in the range 0 to 1 inclusive.
241+
Return `count_less / (n - 1)` if `value ∈ itr`, otherwise apply interpolation based on
242+
definition 7 of quantile in Hyndman and Fan (1996)
243+
(equivalent to Excel `PERCENTRANK` and `PERCENTRANK.INC`).
244+
This definition corresponds to the lower semi-continuous inverse of
245+
[`quantile`](@ref) with its default parameters.
246+
247+
- `:exc`: Return a value in the range 0 to 1 exclusive.
248+
Return `(count_less + 1) / (n + 1)` if `value ∈ itr` otherwise apply interpolation
249+
based on definition 6 of quantile in Hyndman and Fan (1996)
250+
(equivalent to Excel `PERCENTRANK.EXC`).
251+
252+
- `:compete`: Return `count_less / (n - 1)` if `value ∈ itr`, otherwise
253+
return `(count_less - 1) / (n - 1)`, without interpolation
254+
(equivalent to MariaDB `PERCENT_RANK`, dplyr `percent_rank`).
255+
256+
- `:tied`: Return `(count_less + count_equal/2) / n`, without interpolation.
257+
Based on the definition in Roscoe, J. T. (1975)
258+
(equivalent to `"mean"` kind of SciPy `percentileofscore`).
259+
260+
- `:strict`: Return `count_less / n`, without interpolation
261+
(equivalent to `"strict"` kind of SciPy `percentileofscore`).
262+
263+
- `:weak`: Return `(count_less + count_equal) / n`, without interpolation
264+
(equivalent to `"weak"` kind of SciPy `percentileofscore`).
265+
266+
!!! note
267+
An `ArgumentError` is thrown if `itr` contains `NaN` or `missing` values
268+
or if `itr` contains fewer than two elements.
269+
270+
# References
271+
Roscoe, J. T. (1975). [Fundamental Research Statistics for the Behavioral Sciences]
272+
(http://www.bryanburnham.net/wp-content/uploads/2014/07/Fundamental-Statistics-for-the-Behavioral-Sciences-v2.0.pdf#page=57),
273+
2nd ed., New York : Holt, Rinehart and Winston.
274+
275+
Hyndman, R.J and Fan, Y. (1996) "[Sample Quantiles in Statistical Packages]
276+
(https://www.amherst.edu/media/view/129116/original/Sample+Quantiles.pdf)",
277+
*The American Statistician*, Vol. 50, No. 4, pp. 361-365.
278+
279+
# Examples
280+
```julia
281+
julia> using StatsBase
282+
283+
julia> v1 = [1, 1, 1, 2, 3, 4, 8, 11, 12, 13];
284+
285+
julia> v2 = [1, 2, 3, 5, 6, missing, 8];
286+
287+
julia> v3 = [1, 2, 3, 4, 4, 5, 6, 7, 8, 9];
288+
289+
julia> quantilerank(v1, 2)
290+
0.3333333333333333
291+
292+
julia> quantilerank(v1, 2, method=:exc), quantilerank(v1, 2, method=:tied)
293+
(0.36363636363636365, 0.35)
294+
295+
# use `skipmissing` for vectors with missing entries.
296+
julia> quantilerank(skipmissing(v2), 4)
297+
0.5
298+
299+
# use broadcasting with `Ref` to compute quantile rank for multiple values
300+
julia> quantilerank.(Ref(v3), [4, 8])
301+
2-element Vector{Float64}:
302+
0.3333333333333333
303+
0.8888888888888888
304+
```
305+
"""
306+
function quantilerank(itr, value; method::Symbol=:inc)
    # Computes the rank of `value` relative to the elements of `itr` as a fraction,
    # using the definition selected by `method` (:inc, :exc, :compete, :tied,
    # :strict or :weak).
    #
    # Throws `ArgumentError` when `value` or any element of `itr` is NaN or missing,
    # when `itr` yields fewer than two elements, or when `method` is not supported.
    #
    # `itr` is traversed exactly once: validation and counting share a single loop,
    # so one-shot (stateful) iterators are supported as well.
    ((value isa Number && isnan(value)) || ismissing(value)) &&
        throw(ArgumentError("`value` cannot be NaN or missing"))

    count_less = count_equal = n = 0
    # `value` itself is used as the "not found yet" sentinel for both neighbours:
    # an element strictly below or above `value` can never compare equal to it.
    greatest_smaller = smallest_greater = value
    for x in itr
        # Validate before comparing: `missing == value` would not yield a Bool.
        (ismissing(x) || (x isa Number && isnan(x))) &&
            throw(ArgumentError("`itr` cannot contain missing or NaN entries"))

        if x == value
            count_equal += 1
        elseif x < value
            count_less += 1
            if greatest_smaller == value || greatest_smaller < x
                greatest_smaller = x
            end
        else
            if smallest_greater == value || smallest_greater > x
                smallest_greater = x
            end
        end
        n += 1
    end

    n == 0 && throw(ArgumentError("`itr` is empty. Pass a collection with at least two elements"))
    n == 1 && throw(ArgumentError("`itr` has only 1 value. Pass a collection with at least two elements"))

    if method == :inc
        if greatest_smaller == value
            # No element lies below `value`.
            return 0.0
        elseif count_equal > 0
            return count_less / (n - 1)
        elseif smallest_greater == value
            # No element lies above `value` (and none equals it).
            return 1.0
        else
            # `value` falls strictly between two elements: interpolate linearly
            # between the ranks of its two neighbours.
            lower = (count_less - 1) / (n - 1)
            upper = count_less / (n - 1)
            ratio = (value - greatest_smaller) / (smallest_greater - greatest_smaller)
            return lower + ratio * (upper - lower)
        end
    elseif method == :exc
        if count_less == 0 && count_equal == 0
            return 0.0
        elseif count_less == 0
            return 1.0 / (n + 1)
        elseif count_equal > 0
            return (count_less + 1) / (n + 1)
        elseif smallest_greater == value
            return 1.0
        else
            lower = count_less / (n + 1)
            upper = (count_less + 1) / (n + 1)
            ratio = (value - greatest_smaller) / (smallest_greater - greatest_smaller)
            return lower + ratio * (upper - lower)
        end
    elseif method == :compete
        # The counters gathered above already answer the three questions that would
        # otherwise need extra passes over `itr`:
        #   count_less == n  <=>  value > maximum(itr)
        #   count_less == 0  <=>  value ≤ minimum(itr)
        #   count_equal > 0  <=>  value ∈ itr
        if count_less == n
            return 1.0
        elseif count_less == 0
            return 0.0
        else
            rank = count_equal > 0 ? count_less : count_less - 1
            return rank / (n - 1)
        end
    elseif method == :tied
        return (count_less + count_equal/2) / n
    elseif method == :strict
        return count_less / n
    elseif method == :weak
        return (count_less + count_equal) / n
    else
        throw(ArgumentError("method=:$method is not valid. Pass :inc, :exc, :compete, :tied, :strict or :weak."))
    end
end
380+
381+
"""
382+
percentilerank(itr, value; method=:inc)
383+
384+
Return the percentile rank of `value` relative to collection `itr`, i.e. `quantilerank(itr, value; method=method) * 100`.
385+
386+
See the [`quantilerank`](@ref) docstring for more details.
387+
"""
388+
function percentilerank(itr, value; method::Symbol=:inc)
    # Delegate to `quantilerank`, forwarding `method` unchanged, and rescale the
    # resulting fraction by 100.
    rank = quantilerank(itr, value, method=method)
    return rank * 100
end
229389

230390
#############################
231391
#

test/scalarstats.jl

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,64 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.]
9898
# `percentile` on a `skipmissing` wrapper: scalar and vector percentages.
# Results are floating point, so compare with `≈` rather than `==`.
@test percentile(skipmissing([missing, 2, 5, missing]), 25) ≈ 2.75
@test percentile(skipmissing([missing, 2, 5, missing]), [25, 50, 75]) ≈ [2.75, 3.5, 4.25]
100100

101-
101+
# Tests for `quantilerank` / `percentilerank`: expected values for every supported
# `method`, followed by the invalid-input cases that must raise `ArgumentError`.
@testset "quantilerank and percentilerank" begin
    @testset "value as number and array" begin
        @testset ":inc and :exc" begin
            v1 = [1, 1, 1, 2, 3, 4, 8, 11, 12, 13]
            v2 = [1, 2, 3, 6, 6, 6, 7, 8, 9]
            v3 = [1, 2, 4, 3, 4]
            v4 = [1, 2, 1, 3, 4]
            @test quantilerank(v1, 2, method=:inc) == 1/3
            @test quantilerank(v1, 4, method=:inc) == 5/9
            @test quantilerank(v1, 8, method=:inc) == 2/3
            @test quantilerank(v1, 5, method=:inc) == 7/12
            @test quantilerank(v2, 7, method=:exc) == 0.7
            @test quantilerank(v2, 5.43, method=:exc) == 0.381
            @test quantilerank(v3, 4, method=:exc) == 6/9
            @test quantilerank(v3, 4, method=:inc) == 3/4
            @test quantilerank(v4, 1, method=:exc) == 1/6
            # Values outside the observed range clamp to the ends of the interval.
            @test quantilerank(v4, -100, method=:inc) == 0.0
            @test quantilerank(v4, 100, method=:inc) == 1.0
            @test quantilerank(v4, -100, method=:exc) == 0.0
            @test quantilerank(v4, 100, method=:exc) == 1.0
            @test percentilerank(v1, 2) == 100 * quantilerank(v1, 2)
            @test percentilerank(v2, 7, method=:exc) == 100 * quantilerank(v2, 7, method=:exc)
        end
        @testset ":compete" begin
            v = [0, 0, 1, 1, 2, 2, 2, 2, 4, 4]
            @test quantilerank(v, 1, method=:compete) == 2/9
            @test quantilerank(v, 2, method=:compete) == 4/9
            @test quantilerank(v, 4, method=:compete) == 8/9
            # 3 is not an element of `v`: (count_less - 1) / (n - 1) with count_less = 8.
            @test quantilerank(v, 3, method=:compete) == 7/9
            @test quantilerank(v, -100, method=:compete) == 0.0
            @test quantilerank(v, 100, method=:compete) == 1.0
        end
        @testset ":strict, :weak and :tied" begin
            v = [7, 8, 2, 1, 3, 4, 5, 4, 6, 9]
            for (method, expected) in [(:tied, .4), (:strict, .3), (:weak, .5)]
                @test quantilerank(v, 4, method=method) == expected
            end
        end
    end
    @testset "errors" begin
        v1 = [1, 2, 3, 5, 6, missing, 8]
        v2 = [missing, missing]
        v3 = [1, 2, 3, 5, 6, NaN, 8]
        v4 = [1, 2, 3, 3, 4]
        # Input validation is independent of `method`, so exercise every method.
        for method in (:inc, :exc, :compete, :tied, :strict, :weak)
            @test_throws ArgumentError quantilerank(v1, 4, method=method)
            @test_throws ArgumentError quantilerank(v2, 4, method=method)
            @test_throws ArgumentError quantilerank(v3, 4, method=method)
        end
        @test_throws ArgumentError quantilerank(v4, 3, method=:wrongargument)
        @test_throws ArgumentError quantilerank(v4, NaN)
        @test_throws ArgumentError quantilerank(v4, missing)
        @test_throws ArgumentError quantilerank([], 3)
        @test_throws ArgumentError quantilerank([1], 3)
    end
end
158+
102159
##### Dispersion
103160

104161
@test span([3, 4, 5, 6, 2]) == (2:6)

0 commit comments

Comments
 (0)