Add quantilerank and percentilerank functions (#741)

AugustoCL · web-flow · commit 7fca6e803f5a · 2022-02-04T17:05:06.000+01:00
diff --git a/docs/src/scalarstats.md b/docs/src/scalarstats.md
@@ -69,6 +69,8 @@ iqr
 nquantile
 quantile
 Statistics.median(v::StatsBase.RealVector, w::AbstractWeights{<:Real})
+quantilerank
+percentilerank
 ```
 
 ## Mode and Modes
diff --git a/src/StatsBase.jl b/src/StatsBase.jl
@@ -81,8 +81,10 @@ export
     zscore,      # compute Z-scores
     zscore!,     # compute Z-scores inplace or to a pre-allocated array
 
-    percentile,  # quantile using percentage (instead of fraction) as argument
-    nquantile,   # quantiles at [0:n]/n
+    percentile,     # quantile using percentage (instead of fraction) as argument
+    nquantile,      # quantiles at [0:n]/n
+    quantilerank,   # quantile-position (0-1) of a value relative to a collection
+    percentilerank, # percentile-position (0-100) of a value relative to a collection
 
     span,        # The range minimum(x):maximum(x)
     variation,   # ratio of standard deviation to mean
diff --git a/src/scalarstats.jl b/src/scalarstats.jl
@@ -226,6 +226,166 @@ returns a vector of quantiles, respectively at `[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]`.
 """
 nquantile(x, n::Integer) = quantile(x, (0:n)/n)
 
+"""
+    quantilerank(itr, value; method=:inc)
+
+Compute the quantile position in the [0, 1] interval of `value` relative to collection `itr`.
+
+Different definitions can be chosen via the `method` keyword argument.
+Let `count_less` be the number of elements of `itr` that are less than `value`, 
+`count_equal` the number of elements of `itr` that are equal to `value`, `n` the length of `itr`, 
+`greatest_smaller` the highest value below `value` and `smallest_greater` the lowest value above `value`. 
+Then `method` supports the following definitions:
+
+- `:inc` (default): Return a value in the range 0 to 1 inclusive. 
+Return `count_less / (n - 1)` if `value ∈ itr`, otherwise apply interpolation based on 
+definition 7 of quantile in Hyndman and Fan (1996)
+(equivalent to Excel `PERCENTRANK` and `PERCENTRANK.INC`).
+This definition corresponds to the lower semi-continuous inverse of
+[`quantile`](@ref) with its default parameters.
+
+- `:exc`: Return a value in the range 0 to 1 exclusive.
+Return `(count_less + 1) / (n + 1)` if `value ∈ itr` otherwise apply interpolation 
+based on definition 6 of quantile in Hyndman and Fan (1996)
+(equivalent to Excel `PERCENTRANK.EXC`).
+
+- `:compete`: Return `count_less / (n - 1)` if `value ∈ itr`, otherwise 
+return `(count_less - 1) / (n - 1)`, without interpolation
+(equivalent to MariaDB `PERCENT_RANK`, dplyr `percent_rank`).
+
+- `:tied`: Return `(count_less + count_equal/2) / n`, without interpolation.
+Based on the definition in Roscoe, J. T. (1975)
+(equivalent to `"mean"` kind of SciPy `percentileofscore`).
+
+- `:strict`: Return `count_less / n`, without interpolation
+(equivalent to `"strict"` kind of SciPy `percentileofscore`).
+
+- `:weak`: Return `(count_less + count_equal) / n`, without interpolation
+(equivalent to `"weak"` kind of SciPy `percentileofscore`).
+
+!!! note
+    An `ArgumentError` is thrown if `value` or any entry of `itr` is `NaN` or `missing`,
+    or if `itr` contains fewer than two elements.
+
+# References
+Roscoe, J. T. (1975).
+"[Fundamental Research Statistics for the Behavioral Sciences](http://www.bryanburnham.net/wp-content/uploads/2014/07/Fundamental-Statistics-for-the-Behavioral-Sciences-v2.0.pdf#page=57)",
+2nd ed., New York: Holt, Rinehart and Winston.
+
+Hyndman, R.J and Fan, Y. (1996)
+"[Sample Quantiles in Statistical Packages](https://www.amherst.edu/media/view/129116/original/Sample+Quantiles.pdf)",
+*The American Statistician*, Vol. 50, No. 4, pp. 361-365.
+
+# Examples
+```julia
+julia> using StatsBase
+
+julia> v1 = [1, 1, 1, 2, 3, 4, 8, 11, 12, 13];
+
+julia> v2 = [1, 2, 3, 5, 6, missing, 8];
+
+julia> v3 = [1, 2, 3, 4, 4, 5, 6, 7, 8, 9];
+
+julia> quantilerank(v1, 2)
+0.3333333333333333
+
+julia> quantilerank(v1, 2, method=:exc), quantilerank(v1, 2, method=:tied)
+(0.36363636363636365, 0.35)
+
+# use `skipmissing` for vectors with missing entries.
+julia> quantilerank(skipmissing(v2), 4)
+0.5
+
+# use broadcasting with `Ref` to compute quantile rank for multiple values
+julia> quantilerank.(Ref(v3), [4, 8])
+2-element Vector{Float64}:
+ 0.3333333333333333
+ 0.8888888888888888
+```
+"""
+function quantilerank(itr, value; method::Symbol=:inc)
+    ((value isa Number && isnan(value)) || ismissing(value)) &&
+        throw(ArgumentError("`value` cannot be NaN or missing"))
+
+    # Validate and classify each entry in one pass so that `itr` is traversed only once.
+    count_less = count_equal = count_greater = 0
+    greatest_smaller = smallest_greater = value  # placeholders until a neighbour is seen
+    for x in itr
+        (ismissing(x) || (x isa Number && isnan(x))) &&
+            throw(ArgumentError("`itr` cannot contain missing or NaN entries"))
+        if x == value
+            count_equal += 1
+        elseif x < value
+            if count_less == 0 || greatest_smaller < x
+                greatest_smaller = x
+            end
+            count_less += 1
+        else
+            if count_greater == 0 || smallest_greater > x
+                smallest_greater = x
+            end
+            count_greater += 1
+        end
+    end
+    n = count_less + count_equal + count_greater
+
+    n == 0 && throw(ArgumentError("`itr` is empty. Pass a collection with at least two elements"))
+    n == 1 && throw(ArgumentError("`itr` has only 1 value. Pass a collection with at least two elements"))
+
+    if method == :inc
+        if count_less == 0               # `value` is at or below the minimum
+            return 0.0
+        elseif count_equal > 0
+            return count_less / (n - 1)
+        elseif count_greater == 0        # `value` is above the maximum
+            return 1.0
+        else
+            # interpolate between the ranks of the two entries surrounding `value`
+            lower = (count_less - 1) / (n - 1)
+            upper = count_less / (n - 1)
+            ratio = (value - greatest_smaller) / (smallest_greater - greatest_smaller)
+            return lower + ratio * (upper - lower)
+        end
+    elseif method == :exc
+        if count_less == 0 && count_equal == 0   # `value` is below the minimum
+            return 0.0
+        elseif count_equal > 0
+            return (count_less + 1) / (n + 1)
+        elseif count_greater == 0                # `value` is above the maximum
+            return 1.0
+        else
+            lower = count_less / (n + 1)
+            upper = (count_less + 1) / (n + 1)
+            ratio = (value - greatest_smaller) / (smallest_greater - greatest_smaller)
+            return lower + ratio * (upper - lower)
+        end
+    elseif method == :compete
+        if count_less == n               # `value` is above the maximum
+            return 1.0
+        elseif count_less == 0           # `value` is at or below the minimum
+            return 0.0
+        else
+            rank = count_equal > 0 ? count_less : count_less - 1
+            return rank / (n - 1)
+        end
+    elseif method == :tied
+        return (count_less + count_equal/2) / n
+    elseif method == :strict
+        return count_less / n
+    elseif method == :weak
+        return (count_less + count_equal) / n
+    else
+        throw(ArgumentError("method=:$method is not valid. Pass :inc, :exc, :compete, :tied, :strict or :weak."))
+    end
+end
+
+"""
+    percentilerank(itr, value; method=:inc)
+
+Compute the percentile position in the [0, 100] interval of `value` relative to
+collection `itr`, i.e. `100 * quantilerank(itr, value; method=method)`.
+See the [`quantilerank`](@ref) docstring for details on `method` and thrown errors.
+"""
+percentilerank(itr, value; method::Symbol=:inc) = quantilerank(itr, value; method=method) * 100
 
 #############################
 #
diff --git a/test/scalarstats.jl b/test/scalarstats.jl
@@ -98,7 +98,64 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.]
 @test percentile(skipmissing([missing, 2, 5, missing]), 25) ≈ 2.75
 @test percentile(skipmissing([missing, 2, 5, missing]), [25, 50, 75]) ≈ [2.75, 3.5, 4.25]
 
-
+@testset "quantilerank and percentilerank" begin
+    @testset "value as number and array" begin
+        @testset ":inc and :exc" begin  # floating-point results are compared with ≈
+            v1 = [1, 1, 1, 2, 3, 4, 8, 11, 12, 13]
+            v2 = [1, 2, 3, 6, 6, 6, 7, 8, 9]
+            v3 = [1, 2, 4, 3, 4]
+            v4 = [1, 2, 1, 3, 4]
+            @test quantilerank(v1, 2, method=:inc)    ≈ 1/3
+            @test quantilerank(v1, 4, method=:inc)    ≈ 5/9
+            @test quantilerank(v1, 8, method=:inc)    ≈ 2/3
+            @test quantilerank(v1, 5, method=:inc)    ≈ 7/12
+            @test quantilerank(v2, 7, method=:exc)    ≈ 0.7
+            @test quantilerank(v2, 5.43, method=:exc) ≈ 0.381
+            @test quantilerank(v3, 4, method=:exc)    ≈ 6/9
+            @test quantilerank(v3, 4, method=:inc)    ≈ 3/4
+            @test quantilerank(v4, 1, method=:exc)    ≈ 1/6
+            @test quantilerank(v4, -100, method=:inc) == 0.0
+            @test quantilerank(v4,  100, method=:inc) == 1.0
+            @test quantilerank(v4, -100, method=:exc) == 0.0
+            @test quantilerank(v4,  100, method=:exc) == 1.0
+            @test percentilerank(v1, 2)               ≈ 100 * quantilerank(v1, 2)
+            @test percentilerank(v2, 7, method=:exc)  ≈ 100 * quantilerank(v2, 7, method=:exc)
+        end
+        @testset ":compete" begin
+            v = [0, 0, 1, 1, 2, 2, 2, 2, 4, 4]
+            @test quantilerank(v, 1, method=:compete)    ≈ 2/9
+            @test quantilerank(v, 2, method=:compete)    ≈ 4/9
+            @test quantilerank(v, 4, method=:compete)    ≈ 8/9
+            @test quantilerank(v, -100, method=:compete) == 0.0
+            @test quantilerank(v,  100, method=:compete) == 1.0
+        end
+        @testset ":strict, :weak and :tied" begin
+            v = [7, 8, 2, 1, 3, 4, 5, 4, 6, 9]
+            # res4 / res8: expected ranks of 4 and 8; checked as a scalar and broadcast
+            for (method, res4, res8) in ((:tied, 0.4, 0.85), (:strict, 0.3, 0.8), (:weak, 0.5, 0.9))
+                @test quantilerank(v, 4, method=method) ≈ res4
+                @test quantilerank.(Ref(v), [4, 8], method=method) ≈ [res4, res8]
+            end
+        end
+    end
+    @testset "errors" begin
+        v1 = [1, 2, 3, 5, 6, missing, 8]
+        v2 = [missing, missing]
+        v3 = [1, 2, 3, 5, 6, NaN, 8]
+        v4 = [1, 2, 3, 3, 4]
+        for method in (:tied, :strict, :weak)
+            @test_throws ArgumentError quantilerank(v1, 4, method=method)
+            @test_throws ArgumentError quantilerank(v2, 4, method=method)
+            @test_throws ArgumentError quantilerank(v3, 4, method=method)
+        end
+        @test_throws ArgumentError quantilerank(v4, 3, method=:wrongargument)
+        @test_throws ArgumentError quantilerank(v4, NaN)
+        @test_throws ArgumentError quantilerank(v4, missing)
+        @test_throws ArgumentError quantilerank([], 3)
+        @test_throws ArgumentError quantilerank([1], 3)
+    end
+end
+ 
 ##### Dispersion
 
 @test span([3, 4, 5, 6, 2]) == (2:6)