Skip to content

Commit cdf1490

Browse files
committed
Further deep dive into outliers, quantiles
* Remove the limiting of bounds with min/max * consult and get some more samples * include R samples as some authoritative examples
1 parent ad7174b commit cdf1490

File tree

3 files changed

+91
-35
lines changed

3 files changed

+91
-35
lines changed

lib/statistex.ex

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ defmodule Statistex do
130130
sample_size: 9,
131131
total: 4500,
132132
outliers: [],
133-
outlier_bounds: {200, 900.0}
133+
outlier_bounds: {100.0, 900.0}
134134
}
135135
136136
iex> Statistex.statistics([])
@@ -622,13 +622,13 @@ defmodule Statistex do
622622
## Examples
623623
624624
iex> Statistex.outlier_bounds([3, 4, 5])
625-
{3, 5}
625+
{0.0, 8.0}
626626
627627
iex> Statistex.outlier_bounds([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50])
628-
{22.5, 50}
628+
{22.5, 66.5}
629629
630630
iex> Statistex.outlier_bounds([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99])
631-
{50, 80.625}
631+
{31.625, 80.625}
632632
"""
633633
@spec outlier_bounds(samples, keyword) :: {lower :: number, upper :: number}
634634
def outlier_bounds(samples, options \\ [])
@@ -641,14 +641,11 @@ defmodule Statistex do
641641
Percentile.percentiles(samples, [@first_quartile, @third_quartile])
642642
end)
643643

644-
minimum = Keyword.get_lazy(options, :minimum, fn -> hd(samples) end)
645-
maximum = Keyword.get_lazy(options, :maximum, fn -> List.last(samples) end)
646-
647644
q1 = get_percentile(samples, @first_quartile, percentiles)
648645
q3 = get_percentile(samples, @third_quartile, percentiles)
649646
iqr = q3 - q1
650647

651-
{max(q1 - iqr * @iqr_factor, minimum), min(q3 + iqr * @iqr_factor, maximum)}
648+
{q1 - iqr * @iqr_factor, q3 + iqr * @iqr_factor}
652649
end
653650

654651
@doc """

lib/statistex/percentile.ex

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,20 @@ defmodule Statistex.Percentile do
6262
# particular sample). Of the 9 main strategies, (types 1-9), types 6, 7, and 8
6363
# are generally acceptable and give similar results.
6464
#
65+
# R uses type 7, but you can change the strategies used in R with arguments.
66+
#
67+
# > quantile(c(9, 9, 10, 10, 10, 11, 12, 36), probs = c(0.25, 0.5, 0.75), type = 6)
68+
# 25% 50% 75%
69+
# 9.25 10.00 11.75
70+
# > quantile(c(9, 9, 10, 10, 10, 11, 12, 36), probs = c(0.25, 0.5, 0.75), type = 7)
71+
# 25% 50% 75%
72+
# 9.75 10.00 11.25
73+
#
6574
# For more information on interpolation strategies, see:
6675
# - https://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html
6776
# - http://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
6877
defp interpolation_value(lower_bound, upper_bound, rank) do
69-
# in our source rank is k, and interpolation_weitgh is d
78+
# in our source rank is k, and interpolation_weight is d
7079
interpolation_weight = rank - trunc(rank)
7180
interpolation_weight * (upper_bound - lower_bound)
7281
end

test/statistex_test.exs

Lines changed: 76 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@ defmodule Statistex.StatistexTest do
1313
end
1414

1515
describe ".outlier_bounds/2" do
16+
# examples doubled up, maybe get rid of them?
1617
test "returns outlier bounds for samples without outliers" do
1718
assert Statistex.outlier_bounds([200, 400, 400, 400, 500, 500, 500, 700, 900]) ==
18-
{200, 900.0}
19+
{100.0, 900.0}
1920
end
2021

2122
test "returns outlier bounds for samples with outliers" do
@@ -30,7 +31,7 @@ defmodule Statistex.StatistexTest do
3031
%Statistex{
3132
total: 4500,
3233
average: 500.0,
33-
variance: 40000.0,
34+
variance: 40_000.0,
3435
standard_deviation: 200.0,
3536
standard_deviation_ratio: 0.4,
3637
median: 500.0,
@@ -39,7 +40,7 @@ defmodule Statistex.StatistexTest do
3940
mode: [500, 400],
4041
minimum: 200,
4142
maximum: 900,
42-
outlier_bounds: {200, 900.0},
43+
outlier_bounds: {100.0, 900.0},
4344
outliers: [],
4445
sample_size: 9
4546
}
@@ -50,7 +51,7 @@ defmodule Statistex.StatistexTest do
5051
%Statistex{
5152
total: 4450,
5253
average: 445.0,
53-
variance: 61361.11111111111,
54+
variance: 61_361.11111111111,
5455
standard_deviation: 247.71175004652304,
5556
standard_deviation_ratio: 0.5566556180820742,
5657
median: 475.0,
@@ -65,28 +66,77 @@ defmodule Statistex.StatistexTest do
6566
}
6667
end
6768

68-
test "returns Statistex struct with excluded outliers once" do
69-
assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900],
70-
exclude_outliers: true
71-
) ==
72-
%Statistex{
73-
total: 3450,
74-
average: 492.85714285714283,
75-
variance: 2857.142857142857,
76-
standard_deviation: 53.452248382484875,
77-
standard_deviation_ratio: 0.1084538372977954,
78-
median: 500.0,
79-
percentiles: %{25 => 450.0, 50 => 500.0, 75 => 500.0},
80-
frequency_distribution: %{450 => 3, 500 => 3, 600 => 1},
81-
mode: [500, 450],
82-
minimum: 450,
83-
maximum: 600,
84-
# check with other sources what is right and what isn't, I fear we may have calculated outliers twice before
85-
outlier_bounds: {450, 575.0},
86-
# Either sort them or make the test ignorant of order
87-
outliers: [600, 50, 50, 900],
88-
sample_size: 7
89-
}
69+
# https://www.youtube.com/watch?v=rZJbj2I-_Ek
70+
test "gets outliers from the sample right" do
71+
# One could argue that this is controversial; R comes up with these results (by default):
72+
# > summary(c(9, 9, 10, 10, 10, 11, 12, 36))
73+
# Min. 1st Qu. Median Mean 3rd Qu. Max.
74+
# 9.00 9.75 10.00 13.38 11.25 36.00
75+
#
76+
# R uses type 7 interpolation by default, whereas we implemented type 6 interpolation, which
77+
# R can also use:
78+
# > quantile(c(9, 9, 10, 10, 10, 11, 12, 36), probs = c(0.25, 0.5, 0.75), type = 6)
79+
# 25% 50% 75%
80+
# 9.25 10.00 11.75
81+
# Which is our result.
82+
83+
assert %Statistex{
84+
median: 10.0,
85+
percentiles: %{25 => 9.25, 50 => 10.0, 75 => 11.75},
86+
minimum: 9,
87+
maximum: 36,
88+
outlier_bounds: {5.5, 15.5},
89+
outliers: [36]
90+
} = Statistex.statistics([9, 9, 10, 10, 10, 11, 12, 36], exclude_outliers: false)
91+
end
92+
93+
# https://en.wikipedia.org/wiki/Box_plot#Example_with_outliers
94+
test "another example with outliers" do
95+
data = [
96+
52,
97+
57,
98+
57,
99+
58,
100+
63,
101+
66,
102+
66,
103+
67,
104+
67,
105+
68,
106+
69,
107+
70,
108+
70,
109+
70,
110+
70,
111+
72,
112+
73,
113+
75,
114+
75,
115+
76,
116+
76,
117+
78,
118+
79,
119+
89
120+
]
121+
122+
assert %Statistex{
123+
median: 70.0,
124+
percentiles: %{25 => 66.0, 50 => 70.0, 75 => 75.0},
125+
# report interquartile range?
126+
outlier_bounds: {52.5, 88.5},
127+
outliers: [52, 89]
128+
} = Statistex.statistics(data, exclude_outliers: false)
129+
end
130+
131+
# https://en.wikipedia.org/wiki/Interquartile_range#Data_set_in_a_table
132+
test "quartile example" do
133+
assert %Statistex{
134+
median: 87.0,
135+
percentiles: %{25 => 31.0, 50 => 87.0, 75 => 119.0}
136+
} =
137+
Statistex.statistics([7, 7, 31, 31, 47, 75, 87, 115, 116, 119, 119, 155, 177],
138+
exclude_outliers: false
139+
)
90140
end
91141
end
92142

0 commit comments

Comments
 (0)