Skip to content

Commit ad7174b

Browse files
committed
WIP: (known failure) try to remove repeatedly identifying outliers
From: #5 (comment) Need to think through it again and/or check some more samples and test against those. Getting different bounds/outliers right now, although I think they're right.
1 parent 5b5c329 commit ad7174b

File tree

2 files changed

+62
-74
lines changed

2 files changed

+62
-74
lines changed

lib/statistex.ex

Lines changed: 59 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -96,18 +96,16 @@ defmodule Statistex do
9696
9797
The statistics themselves are described in the individual samples that can be used to calculate individual values.
9898
99-
`Argumenterror` is raised if the given list is empty.
99+
`ArgumentError` is raised if the given list is empty.
100100
101101
## Options
102102
103-
In a `percentiles` options arguments for the calculation of percentiles (see `percentiles/2`) can
103+
With the `percentiles` option, arguments for the calculation of percentiles (see `percentiles/2`) can
104104
be given. The percentiles 25th, 50th (median) and 75th are always calculated.
105105
106-
The option `exclude_outliers` can be set to `:once`, `:repeatedly` or `nil`,
107-
`nil` is the default. If this option set to `:once` the outliers are excluded
108-
and the statistics are calculated with the rest of the samples. The value
109-
`:repeatedly` repeats the outlier exclusion until the samples no longer
110-
contains outliers.
106+
The option `exclude_outliers` can be set to `true` or `false`. Defaults to `false`.
107+
If this option is set to `true` the outliers are excluded
108+
and the statistics are calculated with the rest of the samples.
111109
112110
## Examples
113111
@@ -167,62 +165,72 @@ defmodule Statistex do
167165
def statistics(samples, configuration) do
168166
samples = Enum.sort(samples)
169167

170-
minimum = hd(samples)
171-
maximum = List.last(samples)
172-
173-
percentiles = calculate_percentiles(samples, configuration)
168+
# these statistics are required to do the outlier calculations
169+
%{minimum: minimum, maximum: maximum, percentiles: percentiles} =
170+
base_statistics(samples, configuration)
174171

175172
outlier_bounds =
176173
do_outlier_bounds(samples, percentiles: percentiles, minimum: minimum, maximum: maximum)
177174

175+
# make sure rest remains sorted so that it can be used again and still yield correct results
178176
{outliers, rest} = do_outliers(samples, outlier_bounds: outlier_bounds)
179177

180178
if exclude_outliers?(configuration) and Enum.any?(outliers) do
181-
configuration =
182-
configuration
183-
|> Keyword.update!(:exclude_outliers, fn
184-
:once -> :stop
185-
:repeatedly -> :repeatedly
186-
end)
187-
|> Keyword.update(:acc_outliers, outliers, fn list -> list ++ outliers end)
188-
189-
statistics(rest, configuration)
179+
# TODO: figure out how to avoid double sorting
180+
rest = Enum.sort(rest)
181+
# need to recalculate with the outliers removed
182+
%{minimum: minimum, maximum: maximum, percentiles: percentiles} =
183+
base_statistics(rest, configuration)
184+
185+
create_full_statistics(rest, minimum, maximum, percentiles, outliers, outlier_bounds)
190186
else
191-
outliers = outliers ++ Keyword.get(configuration, :acc_outliers, [])
192-
193-
total = total(samples)
194-
sample_size = length(samples)
195-
average = average(samples, total: total, sample_size: sample_size)
196-
variance = variance(samples, average: average, sample_size: sample_size)
197-
198-
frequency_distribution = frequency_distribution(samples)
199-
200-
standard_deviation = standard_deviation(samples, variance: variance)
201-
202-
standard_deviation_ratio =
203-
standard_deviation_ratio(samples, standard_deviation: standard_deviation)
204-
205-
%__MODULE__{
206-
total: total,
207-
average: average,
208-
variance: variance,
209-
standard_deviation: standard_deviation,
210-
standard_deviation_ratio: standard_deviation_ratio,
211-
median: median(samples, percentiles: percentiles),
212-
percentiles: percentiles,
213-
frequency_distribution: frequency_distribution,
214-
mode: mode(samples, frequency_distribution: frequency_distribution),
215-
minimum: minimum,
216-
maximum: maximum,
217-
outlier_bounds: outlier_bounds,
218-
outliers: outliers,
219-
sample_size: sample_size
220-
}
187+
create_full_statistics(samples, minimum, maximum, percentiles, outliers, outlier_bounds)
221188
end
222189
end
223190

191+
defp base_statistics(samples, configuration) do
192+
minimum = hd(samples)
193+
maximum = List.last(samples)
194+
195+
percentiles = calculate_percentiles(samples, configuration)
196+
197+
%{minimum: minimum, maximum: maximum, percentiles: percentiles}
198+
end
199+
224200
defp exclude_outliers?(configuration) do
225-
Keyword.get(configuration, :exclude_outliers) in [:once, :repeatedly]
201+
Access.get(configuration, :exclude_outliers) == true
202+
end
203+
204+
# maybe make argument a map
205+
defp create_full_statistics(samples, minimum, maximum, percentiles, outliers, outlier_bounds) do
206+
total = total(samples)
207+
sample_size = length(samples)
208+
average = average(samples, total: total, sample_size: sample_size)
209+
variance = variance(samples, average: average, sample_size: sample_size)
210+
211+
frequency_distribution = frequency_distribution(samples)
212+
213+
standard_deviation = standard_deviation(samples, variance: variance)
214+
215+
standard_deviation_ratio =
216+
standard_deviation_ratio(samples, standard_deviation: standard_deviation)
217+
218+
%__MODULE__{
219+
total: total,
220+
average: average,
221+
variance: variance,
222+
standard_deviation: standard_deviation,
223+
standard_deviation_ratio: standard_deviation_ratio,
224+
median: median(samples, percentiles: percentiles),
225+
percentiles: percentiles,
226+
frequency_distribution: frequency_distribution,
227+
mode: mode(samples, frequency_distribution: frequency_distribution),
228+
minimum: minimum,
229+
maximum: maximum,
230+
outlier_bounds: outlier_bounds,
231+
outliers: outliers,
232+
sample_size: sample_size
233+
}
226234
end
227235

228236
@doc """

test/statistex_test.exs

Lines changed: 3 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ defmodule Statistex.StatistexTest do
6767

6868
test "returns Statistex struct with excluded outliers once" do
6969
assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900],
70-
exclude_outliers: :once
70+
exclude_outliers: true
7171
) ==
7272
%Statistex{
7373
total: 3450,
@@ -81,33 +81,13 @@ defmodule Statistex.StatistexTest do
8181
mode: [500, 450],
8282
minimum: 450,
8383
maximum: 600,
84+
# check with other sources what is right and what isn't; I fear we may have calculated outliers twice before
8485
outlier_bounds: {450, 575.0},
86+
# Either sort them or make the test ignorant of order
8587
outliers: [600, 50, 50, 900],
8688
sample_size: 7
8789
}
8890
end
89-
90-
test "returns Statistex struct with excluded outliers repeatedly" do
91-
assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900],
92-
exclude_outliers: :repeatedly
93-
) ==
94-
%Statistex{
95-
total: 2850,
96-
average: 475.0,
97-
variance: 750.0,
98-
standard_deviation: 27.386127875258307,
99-
standard_deviation_ratio: 0.05765500605317538,
100-
median: 475.0,
101-
percentiles: %{25 => 450.0, 50 => 475.0, 75 => 500.0},
102-
frequency_distribution: %{450 => 3, 500 => 3},
103-
mode: [500, 450],
104-
minimum: 450,
105-
maximum: 500,
106-
outlier_bounds: {450, 500},
107-
outliers: [50, 50, 900, 600],
108-
sample_size: 6
109-
}
110-
end
11191
end
11292

11393
describe "property testing as we might get loads of data" do

0 commit comments

Comments
 (0)