29 changes: 15 additions & 14 deletions src/samplers.jl
@@ -14,19 +14,20 @@ Samples from the sampler and returns the result.
"""
rand(::AbstractRNG, ::Sampleable)

for fname in ["aliastable.jl",
"binomial.jl",
"poissonbinomial.jl",
"poisson.jl",
"exponential.jl",
"gamma.jl",
"multinomial.jl",
"vonmises.jl",
"vonmisesfisher.jl",
"discretenonparametric.jl",
"categorical.jl",
"productnamedtuple.jl",
]

for fname in [
"aliastable.jl"
"binomial.jl"
"poissonbinomial.jl"
"poisson.jl"
"exponential.jl"
"gamma.jl"
"multinomial.jl"
"vonmises.jl"
"vonmisesfisher.jl"
"discretenonparametric.jl"
"categorical.jl"
"productnamedtuple.jl"
"hypergeometric.jl"
]
include(joinpath("samplers", fname))
end
195 changes: 195 additions & 0 deletions src/samplers/hypergeometric.jl
@@ -0,0 +1,195 @@
"""
### Hypergeometric Sampler

***Sampling implementation reference:*** V. Kachitvichyanukul & B. Schmeiser
"Computer generation of hypergeometric random variates"
Journal of Statistical Computation and Simulation, 22(2):127-145
doi:10.1080/00949658508810839
"""
struct HypergeometricSampler{T} <: Sampleable{Univariate,Discrete}

dist::Hypergeometric
ns_opt::Int
nf_opt::Int
n_opt::Int
N::Int
cache::T

# some names are adapted between this implementation and the original paper (see docstring)
# | paper | here |
# |-------|--------|
# | n1* | ns | successes in population ─┬─ stored in the distribution struct
# | n2* | nf | failures in population ──┤
# | k* | n | sample size ─────────────╯
# | n1 | ns_opt | ─┬─ optimized parameters for the algorithms
# | n2 | nf_opt | ─┤
# | k | n_opt | ─╯
# | n | N | total population size
# | x | y | random variate being generated (specifically on HIN)

function HypergeometricSampler(dist::Hypergeometric)

# Step 0: Set-up constants
N = dist.ns + dist.nf
ns_opt, nf_opt = if dist.ns > dist.nf
dist.nf, dist.ns
else
dist.ns, dist.nf
end
n_opt = if dist.n > N / 2
N - dist.n
else
dist.n
end
M = floor(Int, (n_opt + 1) * (ns_opt + 1) / (N + 2))
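# Algorithm selection follows the paper's heuristic: when the mode M lies
# within roughly 10 of the lower end of the support, plain inversion (HIN)
# is cheap enough; otherwise the acceptance/rejection scheme H2PE is used.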
cache = if M - max(0, n_opt - nf_opt) < 10
# for the HIN algorithm
if n_opt < nf_opt
p = exp(
logabsgamma(nf_opt + 1)[1] +
logabsgamma(N - n_opt + 1)[1] -
logabsgamma(N + 1)[1] -
logabsgamma(nf_opt - n_opt + 1)[1]
)
y = 0
HINCache(p, y)
else
p = exp(
logabsgamma(ns_opt + 1)[1] +
logabsgamma(n_opt + 1)[1] -
logabsgamma(n_opt - nf_opt + 1)[1] -
logabsgamma(N + 1)[1]
)
y = n_opt - nf_opt
HINCache(p, y)
end
else
# for the H2PE algorithm
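# Rough guide to the set-up quantities (following the paper): A collects the
# log-factorial terms of the pmf at the mode M, so exp(A - <same terms at y>)
# is the pmf ratio f(y)/f(M); [xL, xR] bounds the central region of width 2D;
# kL/λL and kR/λR describe the exponential majorants of the two tails; and
# p1 < p2 < p3 are the cumulative areas of the three envelope regions.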
A = logabsgamma(M + 1)[1] +
logabsgamma(ns_opt - M + 1)[1] +
logabsgamma(n_opt - M + 1)[1] +
logabsgamma(nf_opt - n_opt + M + 1)[1]
D = 1.5 * sqrt((N - n_opt) * n_opt * ns_opt * nf_opt / ((N - 1) * N * N)) + 0.5
xL = M - D + 0.5
xR = M + D + 0.5
kL = exp(
A -
logabsgamma(xL + 1)[1] -
logabsgamma(ns_opt - xL + 1)[1] -
logabsgamma(n_opt - xL + 1)[1] -
logabsgamma(nf_opt - n_opt + xL + 1)[1]
)
kR = exp(
A -
logabsgamma(xR)[1] -
logabsgamma(ns_opt - xR + 2)[1] -
logabsgamma(n_opt - xR + 2)[1] -
logabsgamma(nf_opt - n_opt + xR)[1]
)
λL = -log(xL * (nf_opt - n_opt + xL) / ((ns_opt - xL + 1) * (n_opt - xL + 1)))
λR = -log((ns_opt - xR + 1) * (n_opt - xR + 1) / (xR * (nf_opt - n_opt + xR)))
p1 = 2D
p2 = p1 + kL / λL
p3 = p2 + kR / λR
H2PECache(A, xL, xR, λL, λR, p1, p2, p3)
end
new{typeof(cache)}(dist, ns_opt, nf_opt, n_opt, N, cache)
end

end

struct H2PECache
A::Float64
xL::Float64
xR::Float64
λL::Float64
λR::Float64
p1::Float64
p2::Float64
p3::Float64
end

struct HINCache
p::Float64
y::Int
end

function Random.rand(rng::AbstractRNG, spl::HypergeometricSampler{H2PECache})
# Steps 1, 2, 3, 4: base variate generation
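# The proposal is a three-part envelope: a uniform body over [xL, xR]
# (region 1) and exponential majorants on the left and right tails (regions 2
# and 3). The scaled uniform u selects the region; v drives the tail draw and
# the acceptance test.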
cache = spl.cache
y::Int = 0
while true
u = rand(rng) * cache.p3
v = rand(rng)
if u <= cache.p1
# Region 1: central region
y = floor(Int, cache.xL + u)
elseif u <= cache.p2
# Region 2: left exponential tail
y = floor(Int, cache.xL + log(v) / cache.λL)
if y < max(0, spl.n_opt - spl.nf_opt)
continue
end
v = v * (u - cache.p1) * cache.λL
else
# Region 3: right exponential tail
y = floor(Int, cache.xR - log(v) / cache.λR)
if y > min(spl.ns_opt, spl.n_opt)
continue
end
v = v * (u - cache.p2) * cache.λR
end
# Step 4: Acceptance/Rejection comparison
# note: the further optimizations described in the paper did not improve
# performance over this direct check with the log-gamma function, so the
# comparison is done directly for simplicity.
if log(v) > cache.A -
logabsgamma(y + 1)[1] -
logabsgamma(spl.ns_opt - y + 1)[1] -
logabsgamma(spl.n_opt - y + 1)[1] -
logabsgamma(spl.nf_opt - spl.n_opt + y + 1)[1]
continue
else
break
end
end

# Step 5: return appropriate random variate
return _hg_correct_variate(spl, y)

end

function Random.rand(rng::AbstractRNG, spl::HypergeometricSampler{HINCache})
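# HIN is sequential inversion: start at the lowest support point y with its
# probability p, then repeatedly subtract p from the uniform u and advance y
# via the pmf recurrence p(y+1) = p(y) * (ns - y) * (n - y) / ((y + 1) * (nf - n + 1 + y))
# until u is exhausted.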
cache = spl.cache
u = rand(rng)
p = cache.p
y = cache.y
while u > p
u -= p
p *= (spl.ns_opt - y) *
(spl.n_opt - y) /
(y + 1) /
(spl.nf_opt - spl.n_opt + 1 + y)
y += 1
end
return _hg_correct_variate(spl, y)
end

"***Internal:*** Addapts the sampled variable from the optimized distribution for output"
function _hg_correct_variate(spl::HypergeometricSampler, y::Integer)
dist = spl.dist
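# Undo the reductions applied in the constructor: successes and failures may
# have been swapped (when ns > nf) and/or the complementary sample of size
# N - n may have been drawn (when n > N/2), so the raw variate y is mapped
# back to a success count under the original parameterization.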
if dist.n < spl.N / 2
if dist.ns <= dist.nf
y
else
dist.n - y
end
else
if dist.ns <= dist.nf
dist.ns - y
else
dist.n - dist.nf + y
end
end
end

34 changes: 16 additions & 18 deletions src/univariate/discrete/hypergeometric.jl
@@ -1,7 +1,8 @@
"""
Hypergeometric(s, f, n)

A *Hypergeometric distribution* describes the number of successes in `n` draws without replacement from a finite population containing `s` successes and `f` failures.
A *Hypergeometric distribution* describes the number of successes in `n` draws without
replacement from a finite population containing `s` successes and `f` failures.

```math
P(X = k) = {{{s \\choose k} {f \\choose {n-k}}}\\over {s+f \\choose n}}, \\quad \\text{for } k = \\max(0, n - f), \\ldots, \\min(n, s).
@@ -17,12 +18,11 @@ params(d) # Get the parameters, i.e. (s, f, n)
External links

* [Hypergeometric distribution on Wikipedia](http://en.wikipedia.org/wiki/Hypergeometric_distribution)

"""
struct Hypergeometric <: DiscreteUnivariateDistribution
ns::Int # number of successes in population
nf::Int # number of failures in population
n::Int # sample size
ns::Int # number of successes in population
nf::Int # number of failures in population
n::Int # sample size

function Hypergeometric(ns::Real, nf::Real, n::Real; check_args::Bool=true)
@check_args(
@@ -58,22 +58,27 @@ mode(d::Hypergeometric) = floor(Int, (d.n + 1) * (d.ns + 1) / (d.ns + d.nf + 2))

function modes(d::Hypergeometric)
if (d.ns == d.nf) && mod(d.n, 2) == 1
[(d.n-1)/2, (d.n+1)/2]
[(d.n - 1) / 2, (d.n + 1) / 2]
else
[mode(d)]
end
end

skewness(d::Hypergeometric) = (d.nf-d.ns)*sqrt(d.ns+d.nf-1)*(d.ns+d.nf-2*d.n)/sqrt(d.n*d.ns*d.nf*(d.ns+d.nf-d.n))/(d.ns+d.nf-2)
skewness(d::Hypergeometric) = (d.nf - d.ns) *
sqrt(d.ns + d.nf - 1) *
(d.ns + d.nf - 2 * d.n) /
sqrt(d.n * d.ns * d.nf * (d.ns + d.nf - d.n)) /
(d.ns + d.nf - 2)

function kurtosis(d::Hypergeometric)
ns = float(d.ns)
nf = float(d.nf)
n = float(d.n)
N = ns + nf
a = (N-1) * N^2 * (N * (N+1) - 6*ns * (N-ns) - 6*n*(N-n)) + 6*n*ns*(nf)*(N-n)*(5*N-6)
b = (n*ns*(N-ns) * (N-n)*(N-2)*(N-3))
a/b
a = (N - 1) * N^2 * (N * (N + 1) - 6 * ns * (N - ns) - 6 * n * (N - n)) +
6 * n * ns * (nf) * (N - n) * (5 * N - 6)
b = (n * ns * (N - ns) * (N - n) * (N - 2) * (N - 3))
a / b
end

entropy(d::Hypergeometric) = entropy(map(Base.Fix1(pdf, d), support(d)))
@@ -84,11 +89,4 @@ entropy(d::Hypergeometric) = entropy(map(Base.Fix1(pdf, d), support(d)))

## sampling

# TODO: remove RFunctions dependency. Implement:
# V. Kachitvichyanukul & B. Schmeiser
# "Computer generation of hypergeometric random variates"
# Journal of Statistical Computation and Simulation, 22(2):127-145
# doi:10.1080/00949658508810839
@rand_rdist(Hypergeometric)
rand(d::Hypergeometric) =
convert(Int, StatsFuns.RFunctions.hyperrand(d.ns, d.nf, d.n))
sampler(dist::Hypergeometric) = HypergeometricSampler(dist)
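# Design note (a sketch of the intent, not verified against the generic
# fallbacks): with `sampler` defined, repeated draws such as `rand(rng, d, 1_000)`
# can presumably build one HypergeometricSampler and reuse its cached set-up
# constants across samples.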
1 change: 1 addition & 0 deletions test/ref/discrete_test.lst
@@ -36,6 +36,7 @@ Hypergeometric(3, 2, 0)
Hypergeometric(3, 2, 5)
Hypergeometric(4, 5, 6)
Hypergeometric(60, 80, 100)
Hypergeometric(200, 200, 10)

NegativeBinomial()
NegativeBinomial(6)