Skip to content

Commit f57d2e7

Browse files
authored
ENH: stats.dpareto_lognorm: add double Pareto lognormal distribution (scipy#21731)
* ENH: stats.dpareto_lognorm: add double Pareto lognormal distribution * Apply suggestions from code review * Update doc/source/tutorial/stats/continuous_dpareto_lognorm.rst [docs only] * MAINT: stats.dpareto_lognorm: adjustments per review * TST: stats.dpareto_lognorm: adjust fit test skips * TST: stats.dpareto_lognorm: adjust test skips * TST: stats.dpareto_lognorm: mark fit tests xslow
1 parent 82b7a32 commit f57d2e7

File tree

9 files changed

+222
-16
lines changed

9 files changed

+222
-16
lines changed

doc/source/tutorial/stats/continuous.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ Continuous Distributions in `scipy.stats`
223223
continuous_chi2
224224
continuous_cosine
225225
continuous_dgamma
226+
continuous_dpareto_lognorm
226227
continuous_dweibull
227228
continuous_erlang
228229
continuous_expon
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
2+
.. _continuous-dpareto_lognorm:
3+
4+
Double Pareto Lognormal Distribution
5+
====================================
6+
7+
For real numbers :math:`x` and :math:`\mu`, :math:`\sigma > 0`,
8+
:math:`\alpha > 0`, and :math:`\beta > 0`, the PDF of a double
9+
Pareto lognormal distribution is:
10+
11+
.. math::
12+
:nowrap:
13+
14+
\begin{eqnarray*}
15+
f(x, \mu, \sigma, \alpha, \beta) =
16+
\frac{\alpha \beta}{(\alpha + \beta) x}
17+
\phi\left( \frac{\log x - \mu}{\sigma} \right)
18+
\left( R(y_1) + R(y_2) \right)
19+
\end{eqnarray*}
20+
21+
where :math:`R(t) = \frac{1 - \Phi(t)}{\phi(t)}` is a Mills' ratio,
22+
:math:`y_1 = \alpha \sigma - \frac{\log x - \mu}{\sigma}`,
23+
and :math:`y_2 = \beta \sigma + \frac{\log x - \mu}{\sigma}`.
24+
The CDF is:
25+
26+
.. math::
27+
:nowrap:
28+
29+
\begin{eqnarray*}
30+
F(x, \mu, \sigma, \alpha, \beta) =
31+
\Phi \left(\frac{\log x - \mu}{\sigma} \right) -
32+
\phi \left(\frac{\log x - \mu}{\sigma} \right)
33+
\left(\frac{\beta R(x_1) - \alpha R(x_2)}{\alpha + \beta} \right)
34+
\end{eqnarray*}
35+
36+
Raw moment :math:`k > \alpha` is given by:
37+
38+
.. math::
39+
:nowrap:
40+
41+
\begin{eqnarray*}
42+
\mu_k' = \frac{\alpha \beta}{(\alpha - k)(\beta + k)}
43+
\exp \left(k \mu + \frac{k^2 \sigma^2}{2} \right)
44+
\end{eqnarray*}
45+
46+
Implementation: `scipy.stats.dpareto_lognorm`

doc/source/tutorial/stats/probability_distributions.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ introspection:
7878
>>> dist_discrete = [d for d in dir(stats) if
7979
... isinstance(getattr(stats, d), stats.rv_discrete)]
8080
>>> print('number of continuous distributions: %d' % len(dist_continu))
81-
number of continuous distributions: 108
81+
number of continuous distributions: 109
8282
>>> print('number of discrete distributions: %d' % len(dist_discrete))
8383
number of discrete distributions: 21
8484

scipy/stats/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
cosine -- Cosine
6464
crystalball -- Crystalball
6565
dgamma -- Double Gamma
66+
dpareto_lognorm -- Double Pareto Lognormal
6667
dweibull -- Double Weibull
6768
erlang -- Erlang
6869
expon -- Exponential

scipy/stats/_continuous_distns.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from ._distn_infrastructure import (_vectorize_rvs_over_shapes,
2828
get_distribution_names, _kurtosis, _isintegral,
2929
rv_continuous, _skew, _get_fixed_fit_value, _check_shape, _ShapeInfo)
30+
from scipy.stats._distribution_infrastructure import _log1mexp
3031
from ._ksstats import kolmogn, kolmognp, kolmogni
3132
from ._constants import (_XMIN, _LOGXMIN, _EULER, _ZETA3, _SQRT_PI,
3233
_SQRT_2_OVER_PI, _LOG_PI, _LOG_SQRT_2_OVER_PI)
@@ -1828,6 +1829,140 @@ def _stats(self, a):
18281829
dgamma = dgamma_gen(name='dgamma')
18291830

18301831

1832+
class dpareto_lognorm_gen(rv_continuous):
1833+
r"""A double Pareto lognormal continuous random variable.
1834+
1835+
%(before_notes)s
1836+
1837+
Notes
1838+
-----
1839+
The probability density function for `dpareto_lognorm` is:
1840+
1841+
.. math::
1842+
1843+
f(x, \mu, \sigma, \alpha, \beta) =
1844+
\frac{\alpha \beta}{(\alpha + \beta) x}
1845+
\phi\left( \frac{\log x - \mu}{\sigma} \right)
1846+
\left( R(y_1) + R(y_2) \right)
1847+
1848+
where :math:`R(t) = \frac{1 - \Phi(t)}{\phi(t)}`,
1849+
:math:`phi` and :math:`Phi` are the normal PDF and CDF, respectively,
1850+
:math:`y_1 = \alpha \sigma - \frac{\log x - \mu}{\sigma}`,
1851+
and :math:`y_2 = \beta \sigma + \frac{\log x - \mu}{\sigma}`
1852+
for real numbers :math:`x` and :math:`\mu`, :math:`\sigma > 0`,
1853+
:math:`\alpha > 0`, and :math:`\beta > 0` [1]_.
1854+
1855+
`dpareto_lognorm` takes
1856+
``u`` as a shape parameter for :math:`\mu`,
1857+
``s`` as a shape parameter for :math:`\sigma`,
1858+
``a`` as a shape parameter for :math:`\alpha`, and
1859+
``b`` as a shape parameter for :math:`\beta`.
1860+
1861+
A random variable :math:`X` distributed according to the PDF above
1862+
can be represented as :math:`X = U \frac{V_1}{V_2}` where :math:`U`,
1863+
:math:`V_1`, and :math:`V_2` are independent, :math:`U` is lognormally
1864+
distributed such that :math:`\log U \sim N(\mu, \sigma^2)`, and
1865+
:math:`V_1` and :math:`V_2` follow Pareto distributions with parameters
1866+
:math:`\alpha` and :math:`\beta`, respectively [2]_.
1867+
1868+
%(after_notes)s
1869+
1870+
References
1871+
----------
1872+
.. [1] Hajargasht, Gholamreza, and William E. Griffiths. "Pareto-lognormal
1873+
distributions: Inequality, poverty, and estimation from grouped income
1874+
data." Economic Modelling 33 (2013): 593-604.
1875+
.. [2] Reed, William J., and Murray Jorgensen. "The double Pareto-lognormal
1876+
distribution - a new parametric model for size distributions."
1877+
Communications in Statistics - Theory and Methods 33.8 (2004): 1733-1753.
1878+
1879+
%(example)s
1880+
1881+
"""
1882+
_logphi = norm._logpdf
1883+
_logPhi = norm._logcdf
1884+
_logPhic = norm._logsf
1885+
_phi = norm._pdf
1886+
_Phi = norm._cdf
1887+
_Phic = norm._sf
1888+
1889+
def _R(self, z):
1890+
return self._Phic(z) / self._phi(z)
1891+
1892+
def _logR(self, z):
1893+
return self._logPhic(z) - self._logphi(z)
1894+
1895+
def _shape_info(self):
1896+
return [_ShapeInfo("u", False, (-np.inf, np.inf), (False, False)),
1897+
_ShapeInfo("s", False, (0, np.inf), (False, False)),
1898+
_ShapeInfo("a", False, (0, np.inf), (False, False)),
1899+
_ShapeInfo("b", False, (0, np.inf), (False, False))]
1900+
1901+
def _argcheck(self, u, s, a, b):
1902+
return (s > 0) & (a > 0) & (b > 0)
1903+
1904+
def _rvs(self, u, s, a, b, size=None, random_state=None):
1905+
# From [1] after Equation (12): "To generate pseudo-random
1906+
# deviates from the dPlN distribution, one can exponentiate
1907+
# pseudo-random deviates from NL generated using (6)."
1908+
Z = random_state.normal(u, s, size=size)
1909+
E1 = random_state.standard_exponential(size=size)
1910+
E2 = random_state.standard_exponential(size=size)
1911+
return np.exp(Z + E1 / a - E2 / b)
1912+
1913+
def _logpdf(self, x, u, s, a, b):
1914+
with np.errstate(invalid='ignore', divide='ignore'):
1915+
log_y, m = np.log(x), u # compare against [1] Eq. 1
1916+
z = (log_y - m) / s
1917+
x1 = a * s - z
1918+
x2 = b * s + z
1919+
out = np.asarray(np.log(a) + np.log(b) - np.log(a + b) - log_y)
1920+
out += self._logphi(z)
1921+
out += np.logaddexp(self._logR(x1), self._logR(x2))
1922+
out[(x == 0) | np.isinf(x)] = -np.inf
1923+
return out[()]
1924+
1925+
def _logcdf(self, x, u, s, a, b):
1926+
log_y, m = np.log(x), u # compare against [1] Eq. 2
1927+
z = (log_y - m) / s
1928+
x1 = a * s - z
1929+
x2 = b * s + z
1930+
t1 = self._logPhi(z)
1931+
t2 = self._logphi(z)
1932+
t3 = (np.log(b) + self._logR(x1))
1933+
t4 = (np.log(a) + self._logR(x2))
1934+
t1, t2, t3, t4, one = np.broadcast_arrays(t1, t2, t3, t4, 1)
1935+
# t3 can be smaller than t4, so we have to consider log of negative number
1936+
# This would be much simpler, but `return_sign` is available, so use it?
1937+
# t5 = sc.logsumexp([t3, t4 + np.pi*1j])
1938+
t5, sign = sc.logsumexp([t3, t4], b=[one, -one], axis=0, return_sign=True)
1939+
return sc.logsumexp([t1, t2 + t5 - np.log(a + b)], b=[one, -one*sign], axis=0)
1940+
1941+
def _logsf(self, x, u, s, a, b):
1942+
return _log1mexp(self._logcdf(x, u, s, a, b))
1943+
1944+
# Infrastructure doesn't seem to do this, so...
1945+
1946+
def _pdf(self, x, u, s, a, b):
1947+
return np.exp(self._logpdf(x, u, s, a, b))
1948+
1949+
def _cdf(self, x, u, s, a, b):
1950+
return np.exp(self._logcdf(x, u, s, a, b))
1951+
1952+
def _sf(self, x, u, s, a, b):
1953+
return np.exp(self._logsf(x, u, s, a, b))
1954+
1955+
def _munp(self, n, u, s, a, b):
1956+
m, k = u, float(n) # compare against [1] Eq. 6
1957+
out = (a * b) / ((a - k) * (b + k)) * np.exp(k * m + k ** 2 * s ** 2 / 2)
1958+
out = np.asarray(out)
1959+
out[a <= k] = np.nan
1960+
return out
1961+
1962+
1963+
dpareto_lognorm = dpareto_lognorm_gen(a=0, name='dpareto_lognorm')
1964+
1965+
18311966
class dweibull_gen(rv_continuous):
18321967
r"""A double Weibull continuous random variable.
18331968

scipy/stats/_distr_params.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
['cosine', ()],
2020
['crystalball', (2.0, 3.0)],
2121
['dgamma', (1.1023326088288166,)],
22+
['dpareto_lognorm', (3, 1.2, 1.5, 2)],
2223
['dweibull', (2.0685080649914673,)],
2324
['erlang', (10,)],
2425
['expon', ()],
@@ -121,7 +122,8 @@
121122
['wald', ()],
122123
['weibull_max', (2.8687961709100187,)],
123124
['weibull_min', (1.7866166930421596,)],
124-
['wrapcauchy', (0.031071279018614728,)]]
125+
['wrapcauchy', (0.031071279018614728,)]
126+
]
125127

126128

127129
distdiscrete = [
@@ -196,6 +198,7 @@
196198
['cosine', ()],
197199
['crystalball', (-1, 2)],
198200
['dgamma', (-1, )],
201+
['dpareto_lognorm', (3, -1.2, 1.5, 2)],
199202
['dweibull', (-1, )],
200203
['erlang', (-1, )],
201204
['expon', ()],

scipy/stats/tests/test_continuous_basic.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,13 @@
6262
'johnsonsb', 'kstwobign', 'ncx2', 'norminvgauss', 'truncnorm',
6363
'truncweibull_min', 'wrapcauchy'}
6464
xfail_fit_mm = {'alpha', 'betaprime', 'bradford', 'burr', 'burr12', 'cauchy',
65-
'crystalball', 'exponweib', 'f', 'fisk', 'foldcauchy', 'genextreme',
66-
'genpareto', 'halfcauchy', 'invgamma', 'irwinhall', 'jf_skew_t',
67-
'johnsonsu', 'kappa3', 'kappa4', 'landau', 'levy', 'levy_l',
68-
'loglaplace', 'lomax', 'mielke', 'ncf', 'nct', 'pareto', 'powerlognorm',
69-
'powernorm', 'rel_breitwigner', 'skewcauchy', 't', 'trapezoid',
70-
'truncexpon', 'truncpareto', 'tukeylambda', 'vonmises', 'vonmises_line'}
65+
'crystalball', 'dpareto_lognorm', 'exponweib', 'f', 'fisk',
66+
'foldcauchy', 'genextreme', 'genpareto', 'halfcauchy', 'invgamma',
67+
'irwinhall', 'jf_skew_t', 'johnsonsu', 'kappa3', 'kappa4', 'landau',
68+
'levy', 'levy_l', 'loglaplace', 'lomax', 'mielke', 'ncf', 'nct',
69+
'pareto', 'powerlognorm', 'powernorm', 'rel_breitwigner',
70+
'skewcauchy', 't', 'trapezoid', 'truncexpon', 'truncpareto',
71+
'tukeylambda', 'vonmises', 'vonmises_line'}
7172
skip_fit_mm = {'genexpon', 'genhyperbolic', 'ksone', 'kstwo', 'levy_stable',
7273
'recipinvgauss', 'studentized_range'} # far too slow (>10min)
7374

@@ -76,8 +77,8 @@
7677
# on the implementation details of corresponding special functions.
7778
# cf https://github.com/scipy/scipy/pull/4979 for a discussion.
7879
fails_cmplx = {'argus', 'beta', 'betaprime', 'cauchy', 'chi', 'chi2', 'cosine',
79-
'dgamma', 'dweibull', 'erlang', 'f', 'foldcauchy', 'gamma',
80-
'gausshyper', 'gengamma', 'genhyperbolic',
80+
'dgamma', 'dpareto_lognorm', 'dweibull', 'erlang', 'f', 'foldcauchy',
81+
'gamma', 'gausshyper', 'gengamma', 'genhyperbolic',
8182
'geninvgauss', 'gennorm', 'genpareto',
8283
'halfcauchy', 'halfgennorm', 'invgamma', 'irwinhall', 'jf_skew_t',
8384
'ksone', 'kstwo', 'kstwobign', 'landau', 'levy_l', 'loggamma',
@@ -385,7 +386,7 @@ def test_rvs_broadcast(dist, shape_args):
385386
# implementation detail of the distribution, not a requirement. If
386387
# the implementation the rvs() method of a distribution changes, this
387388
# test might also have to be changed.
388-
shape_only = dist in ['argus', 'betaprime', 'dgamma', 'dweibull',
389+
shape_only = dist in ['argus', 'betaprime', 'dgamma', 'dpareto_lognorm', 'dweibull',
389390
'exponnorm', 'genhyperbolic', 'geninvgauss', 'landau',
390391
'levy_stable', 'nct', 'norminvgauss', 'rice',
391392
'skewnorm', 'semicircular', 'gennorm', 'loggamma']

scipy/stats/tests/test_distributions.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10305,6 +10305,24 @@ def test_sf_ih10_exact(self):
1030510305
assert_array_max_ulp(self.ih10.sf(1/10), ref, maxulp=10)
1030610306

1030710307

10308+
class TestDParetoLognorm:
10309+
def test_against_R(self):
10310+
# Test against R implementation in `distributionsrd`
10311+
# library(distributionsrd)
10312+
# options(digits=16)
10313+
# x = 1.1
10314+
# b = 2
10315+
# a = 1.5
10316+
# m = 3
10317+
# s = 1.2
10318+
# ddoubleparetolognormal(x, b, a, m, s)
10319+
# pdoubleparetolognormal(x, b, a, m, s)
10320+
x, m, s, a, b = 1.1, 3, 1.2, 1.5, 2
10321+
dist = stats.dpareto_lognorm(m, s, a, b)
10322+
np.testing.assert_allclose(dist.pdf(x), 0.02490187219085912)
10323+
np.testing.assert_allclose(dist.cdf(x), 0.01664024173822796)
10324+
10325+
1030810326
# Cases are (distribution name, log10 of smallest probability mass to test,
1030910327
# log10 of the complement of the largest probability mass to test, atol,
1031010328
# rtol). None uses default values.

scipy/stats/tests/test_fit.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
thresh_min = 0.75 # minimum difference estimate - true to fail test
2222

2323
mle_failing_fits = [
24+
'dpareto_lognorm',
2425
'gausshyper',
2526
'genexpon',
2627
'gengamma',
@@ -61,8 +62,8 @@
6162
]
6263

6364
mm_failing_fits = ['alpha', 'betaprime', 'burr', 'burr12', 'cauchy', 'chi',
64-
'chi2', 'crystalball', 'dgamma', 'dweibull', 'f',
65-
'fatiguelife', 'fisk', 'foldcauchy', 'genextreme',
65+
'chi2', 'crystalball', 'dgamma', 'dpareto_lognorm', 'dweibull',
66+
'f', 'fatiguelife', 'fisk', 'foldcauchy', 'genextreme',
6667
'gengamma', 'genhyperbolic', 'gennorm', 'genpareto',
6768
'halfcauchy', 'invgamma', 'invweibull', 'irwinhall', 'jf_skew_t',
6869
'johnsonsu', 'kappa3', 'ksone', 'kstwo', 'landau', 'levy', 'levy_l',
@@ -249,8 +250,8 @@ def cases_test_fit_mle():
249250
't', 'uniform', 'weibull_max', 'weibull_min', 'wrapcauchy'}
250251

251252
# Please keep this list in alphabetical order...
252-
xslow_basic_fit = {'betabinom', 'betanbinom', 'burr', 'exponweib',
253-
'gausshyper', 'gengamma', 'genhalflogistic',
253+
xslow_basic_fit = {'betabinom', 'betanbinom', 'burr', 'dpareto_lognorm',
254+
'exponweib', 'gausshyper', 'gengamma', 'genhalflogistic',
254255
'genhyperbolic', 'geninvgauss',
255256
'hypergeom', 'kappa4', 'loguniform',
256257
'ncf', 'nchypergeom_fisher', 'nchypergeom_wallenius',
@@ -307,7 +308,7 @@ def cases_test_fit_mse():
307308

308309
# Please keep this list in alphabetical order...
309310
xslow_basic_fit = {'argus', 'beta', 'betaprime', 'burr', 'burr12',
310-
'dgamma', 'f', 'gengamma', 'gennorm',
311+
'dgamma', 'dpareto_lognorm', 'f', 'gengamma', 'gennorm',
311312
'halfgennorm', 'invgamma', 'invgauss', 'jf_skew_t',
312313
'johnsonsb', 'kappa4', 'loguniform', 'mielke',
313314
'nakagami', 'ncf', 'nchypergeom_fisher',

0 commit comments

Comments
 (0)