ENH: stats.dpareto_lognorm: add double Pareto lognormal distribution (scipy#21731)

mdhaber · web-flow · commit f57d2e7b6f19 · 2024-11-16T08:45:24.000+01:00
* ENH: stats.dpareto_lognorm: add double Pareto lognormal distribution

* Apply suggestions from code review

* Update doc/source/tutorial/stats/continuous_dpareto_lognorm.rst

[docs only]

* MAINT: stats.dpareto_lognorm: adjustments per review

* TST: stats.dpareto_lognorm: adjust fit test skips

* TST: stats.dpareto_lognorm: adjust test skips

* TST: stats.dpareto_lognorm: mark fit tests xslow
diff --git a/doc/source/tutorial/stats/continuous.rst b/doc/source/tutorial/stats/continuous.rst
@@ -223,6 +223,7 @@ Continuous Distributions in `scipy.stats`
    continuous_chi2
    continuous_cosine
    continuous_dgamma
+   continuous_dpareto_lognorm
    continuous_dweibull
    continuous_erlang
    continuous_expon
diff --git a/doc/source/tutorial/stats/continuous_dpareto_lognorm.rst b/doc/source/tutorial/stats/continuous_dpareto_lognorm.rst
@@ -0,0 +1,46 @@
+
+.. _continuous-dpareto_lognorm:
+
+Double Pareto Lognormal Distribution
+====================================
+
+For :math:`x > 0`, real :math:`\mu`, :math:`\sigma > 0`,
+:math:`\alpha > 0`, and :math:`\beta > 0`, the PDF of a double
+Pareto lognormal distribution is:
+
+.. math::
+   :nowrap:
+
+    \begin{eqnarray*}
+        f(x, \mu, \sigma, \alpha, \beta) =
+        \frac{\alpha \beta}{(\alpha + \beta) x}
+        \phi\left( \frac{\log x - \mu}{\sigma} \right)
+        \left( R(y_1) + R(y_2) \right)
+    \end{eqnarray*}
+
+where :math:`R(t) = \frac{1 - \Phi(t)}{\phi(t)}` is a Mills' ratio,
+:math:`\phi` and :math:`\Phi` are the standard normal PDF and CDF,
+:math:`y_1 = \alpha \sigma - \frac{\log x - \mu}{\sigma}`,
+and :math:`y_2 = \beta \sigma + \frac{\log x - \mu}{\sigma}`. The CDF is:
+
+.. math::
+   :nowrap:
+
+    \begin{eqnarray*}
+        F(x, \mu, \sigma, \alpha, \beta) =
+        \Phi \left(\frac{\log x - \mu}{\sigma} \right) -
+        \phi \left(\frac{\log x - \mu}{\sigma} \right)
+        \left(\frac{\beta R(y_1) - \alpha R(y_2)}{\alpha + \beta} \right)
+    \end{eqnarray*}
+
+The raw moment of order :math:`k`, for :math:`k < \alpha`, is given by:
+
+.. math::
+   :nowrap:
+
+    \begin{eqnarray*}
+        \mu_k' = \frac{\alpha \beta}{(\alpha - k)(\beta + k)} 
+                 \exp \left(k \mu + \frac{k^2 \sigma^2}{2} \right)
+    \end{eqnarray*}
+
+Implementation: `scipy.stats.dpareto_lognorm`
diff --git a/doc/source/tutorial/stats/probability_distributions.rst b/doc/source/tutorial/stats/probability_distributions.rst
@@ -78,7 +78,7 @@ introspection:
     >>> dist_discrete = [d for d in dir(stats) if
     ...                  isinstance(getattr(stats, d), stats.rv_discrete)]
     >>> print('number of continuous distributions: %d' % len(dist_continu))
-    number of continuous distributions: 108
+    number of continuous distributions: 109
     >>> print('number of discrete distributions:   %d' % len(dist_discrete))
     number of discrete distributions:   21
 
diff --git a/scipy/stats/__init__.py b/scipy/stats/__init__.py
@@ -63,6 +63,7 @@
    cosine            -- Cosine
    crystalball       -- Crystalball
    dgamma            -- Double Gamma
+   dpareto_lognorm   -- Double Pareto Lognormal
    dweibull          -- Double Weibull
    erlang            -- Erlang
    expon             -- Exponential
diff --git a/scipy/stats/_continuous_distns.py b/scipy/stats/_continuous_distns.py
@@ -27,6 +27,7 @@
 from ._distn_infrastructure import (_vectorize_rvs_over_shapes,
     get_distribution_names, _kurtosis, _isintegral,
     rv_continuous, _skew, _get_fixed_fit_value, _check_shape, _ShapeInfo)
+from scipy.stats._distribution_infrastructure import _log1mexp
 from ._ksstats import kolmogn, kolmognp, kolmogni
 from ._constants import (_XMIN, _LOGXMIN, _EULER, _ZETA3, _SQRT_PI,
                          _SQRT_2_OVER_PI, _LOG_PI, _LOG_SQRT_2_OVER_PI)
@@ -1828,6 +1829,140 @@ def _stats(self, a):
 dgamma = dgamma_gen(name='dgamma')
 
 
+class dpareto_lognorm_gen(rv_continuous):
+    r"""A double Pareto lognormal continuous random variable.
+
+    %(before_notes)s
+
+    Notes
+    -----
+    The probability density function for `dpareto_lognorm` is:
+
+    .. math::
+
+        f(x, \mu, \sigma, \alpha, \beta) =
+        \frac{\alpha \beta}{(\alpha + \beta) x}
+        \phi\left( \frac{\log x - \mu}{\sigma} \right)
+        \left( R(y_1) + R(y_2) \right)
+
+    where :math:`R(t) = \frac{1 - \Phi(t)}{\phi(t)}`,
+    :math:`\phi` and :math:`\Phi` are the standard normal PDF and CDF, respectively,
+    :math:`y_1 = \alpha \sigma - \frac{\log x - \mu}{\sigma}`,
+    and :math:`y_2 = \beta \sigma + \frac{\log x - \mu}{\sigma}`
+    for :math:`x > 0`, real :math:`\mu`, :math:`\sigma > 0`,
+    :math:`\alpha > 0`, and :math:`\beta > 0` [1]_.
+
+    `dpareto_lognorm` takes
+    ``u`` as a shape parameter for :math:`\mu`,
+    ``s`` as a shape parameter for :math:`\sigma`,
+    ``a`` as a shape parameter for :math:`\alpha`, and
+    ``b`` as a shape parameter for :math:`\beta`.
+
+    A random variable :math:`X` distributed according to the PDF above
+    can be represented as :math:`X = U \frac{V_1}{V_2}` where :math:`U`,
+    :math:`V_1`, and :math:`V_2` are independent, :math:`U` is lognormally
+    distributed such that :math:`\log U \sim N(\mu, \sigma^2)`, and
+    :math:`V_1` and :math:`V_2` follow Pareto distributions with parameters
+    :math:`\alpha` and :math:`\beta`, respectively [2]_.
+
+    %(after_notes)s
+
+    References
+    ----------
+    .. [1] Hajargasht, Gholamreza, and William E. Griffiths. "Pareto-lognormal
+           distributions: Inequality, poverty, and estimation from grouped income
+           data." Economic Modelling 33 (2013): 593-604.
+    .. [2] Reed, William J., and Murray Jorgensen. "The double Pareto-lognormal
+           distribution - a new parametric model for size distributions."
+           Communications in Statistics - Theory and Methods 33.8 (2004): 1733-1753.
+
+    %(example)s
+
+    """
+    _logphi = norm._logpdf
+    _logPhi = norm._logcdf
+    _logPhic = norm._logsf
+    _phi = norm._pdf
+    _Phi = norm._cdf
+    _Phic = norm._sf
+
+    def _R(self, z):  # Mills' ratio of the standard normal: (1 - Phi) / phi
+        return self._Phic(z) / self._phi(z)
+
+    def _logR(self, z):  # log of the Mills' ratio, formed in log space
+        return self._logPhic(z) - self._logphi(z)
+
+    def _shape_info(self):
+        return [_ShapeInfo("u", False, (-np.inf, np.inf), (False, False)),
+                _ShapeInfo("s", False, (0, np.inf), (False, False)),
+                _ShapeInfo("a", False, (0, np.inf), (False, False)),
+                _ShapeInfo("b", False, (0, np.inf), (False, False))]
+
+    def _argcheck(self, u, s, a, b):
+        return (s > 0) & (a > 0) & (b > 0)
+
+    def _rvs(self, u, s, a, b, size=None, random_state=None):
+        # From [1] after Equation (12): "To generate pseudo-random
+        # deviates from the dPlN distribution, one can exponentiate
+        # pseudo-random deviates from NL generated using (6)."
+        Z = random_state.normal(u, s, size=size)
+        E1 = random_state.standard_exponential(size=size)
+        E2 = random_state.standard_exponential(size=size)
+        return np.exp(Z + E1 / a - E2 / b)
+
+    def _logpdf(self, x, u, s, a, b):
+        with np.errstate(invalid='ignore', divide='ignore'):
+            log_y, m = np.log(x), u  # compare against [1] Eq. 1
+            z = (log_y - m) / s
+            x1 = a * s - z  # y_1 in the class docstring
+            x2 = b * s + z  # y_2 in the class docstring
+            out = np.asarray(np.log(a) + np.log(b) - np.log(a + b) - log_y)
+            out += self._logphi(z)
+            out += np.logaddexp(self._logR(x1), self._logR(x2))
+        out[(x == 0) | np.isinf(x)] = -np.inf  # endpoints: force log-density -inf
+        return out[()]
+
+    def _logcdf(self, x, u, s, a, b):
+        log_y, m = np.log(x), u  # compare against [1] Eq. 2
+        z = (log_y - m) / s
+        x1 = a * s - z  # y_1 in the class docstring
+        x2 = b * s + z  # y_2 in the class docstring
+        t1 = self._logPhi(z)
+        t2 = self._logphi(z)
+        t3 = (np.log(b) + self._logR(x1))
+        t4 = (np.log(a) + self._logR(x2))
+        t1, t2, t3, t4, one = np.broadcast_arrays(t1, t2, t3, t4, 1)
+        # exp(t3) - exp(t4) may be negative, so take the log of its magnitude
+        # and carry its sign separately into the final signed sum.
+        t5, sign = sc.logsumexp([t3, t4], b=[one, -one], axis=0, return_sign=True)
+        return sc.logsumexp([t1, t2 + t5 - np.log(a + b)], b=[one, -one*sign], axis=0)
+
+    def _logsf(self, x, u, s, a, b):  # log(1 - CDF), from the log-CDF
+        return _log1mexp(self._logcdf(x, u, s, a, b))
+
+    # Infrastructure doesn't seem to derive these from the log-methods, so...
+    def _pdf(self, x, u, s, a, b):
+        return np.exp(self._logpdf(x, u, s, a, b))
+
+    def _cdf(self, x, u, s, a, b):
+        return np.exp(self._logcdf(x, u, s, a, b))
+
+    def _sf(self, x, u, s, a, b):
+        return np.exp(self._logsf(x, u, s, a, b))
+
+    def _munp(self, n, u, s, a, b):
+        m, k = u, float(n)  # compare against [1] Eq. 6
+        # `a - k` is zero when a == k; silence the warning, NaN is set below.
+        with np.errstate(divide='ignore', invalid='ignore'):
+            out = (a * b) / ((a - k) * (b + k)) * np.exp(k * m + k ** 2 * s ** 2 / 2)
+        out = np.asarray(out)
+        out[a <= k] = np.nan  # formula is only used for k < a; NaN otherwise
+        return out
+
+
+dpareto_lognorm = dpareto_lognorm_gen(a=0, name='dpareto_lognorm')  # support: x > 0
+
+
 class dweibull_gen(rv_continuous):
     r"""A double Weibull continuous random variable.
 
diff --git a/scipy/stats/_distr_params.py b/scipy/stats/_distr_params.py
@@ -19,6 +19,7 @@
     ['cosine', ()],
     ['crystalball', (2.0, 3.0)],
     ['dgamma', (1.1023326088288166,)],
+    ['dpareto_lognorm', (3, 1.2, 1.5, 2)],
     ['dweibull', (2.0685080649914673,)],
     ['erlang', (10,)],
     ['expon', ()],
@@ -121,7 +122,8 @@
     ['wald', ()],
     ['weibull_max', (2.8687961709100187,)],
     ['weibull_min', (1.7866166930421596,)],
-    ['wrapcauchy', (0.031071279018614728,)]]
+    ['wrapcauchy', (0.031071279018614728,)]
+]
 
 
 distdiscrete = [
@@ -196,6 +198,7 @@
     ['cosine', ()],
     ['crystalball', (-1, 2)],
     ['dgamma', (-1, )],
+    ['dpareto_lognorm', (3, -1.2, 1.5, 2)],
     ['dweibull', (-1, )],
     ['erlang', (-1, )],
     ['expon', ()],
diff --git a/scipy/stats/tests/test_continuous_basic.py b/scipy/stats/tests/test_continuous_basic.py
@@ -62,12 +62,13 @@
                 'johnsonsb', 'kstwobign', 'ncx2', 'norminvgauss', 'truncnorm',
                 'truncweibull_min', 'wrapcauchy'}
 xfail_fit_mm = {'alpha', 'betaprime', 'bradford', 'burr', 'burr12', 'cauchy',
-                'crystalball', 'exponweib', 'f', 'fisk', 'foldcauchy', 'genextreme',
-                'genpareto', 'halfcauchy', 'invgamma', 'irwinhall', 'jf_skew_t',
-                'johnsonsu', 'kappa3', 'kappa4', 'landau', 'levy', 'levy_l',
-                'loglaplace', 'lomax', 'mielke', 'ncf', 'nct', 'pareto', 'powerlognorm',
-                'powernorm', 'rel_breitwigner',  'skewcauchy', 't', 'trapezoid',
-                'truncexpon', 'truncpareto', 'tukeylambda', 'vonmises', 'vonmises_line'}
+                'crystalball', 'dpareto_lognorm', 'exponweib', 'f', 'fisk',
+                'foldcauchy', 'genextreme', 'genpareto', 'halfcauchy', 'invgamma',
+                'irwinhall', 'jf_skew_t', 'johnsonsu', 'kappa3', 'kappa4', 'landau',
+                'levy', 'levy_l', 'loglaplace', 'lomax', 'mielke', 'ncf', 'nct',
+                'pareto', 'powerlognorm', 'powernorm', 'rel_breitwigner',
+                'skewcauchy', 't', 'trapezoid', 'truncexpon', 'truncpareto',
+                'tukeylambda', 'vonmises', 'vonmises_line'}
 skip_fit_mm = {'genexpon', 'genhyperbolic', 'ksone', 'kstwo', 'levy_stable',
                'recipinvgauss', 'studentized_range'}  # far too slow (>10min)
 
@@ -76,8 +77,8 @@
 # on the implementation details of corresponding special functions.
 # cf https://github.com/scipy/scipy/pull/4979 for a discussion.
 fails_cmplx = {'argus', 'beta', 'betaprime', 'cauchy', 'chi', 'chi2', 'cosine',
-               'dgamma', 'dweibull', 'erlang', 'f', 'foldcauchy', 'gamma',
-               'gausshyper', 'gengamma', 'genhyperbolic',
+               'dgamma', 'dpareto_lognorm', 'dweibull', 'erlang', 'f', 'foldcauchy',
+               'gamma', 'gausshyper', 'gengamma', 'genhyperbolic',
                'geninvgauss', 'gennorm', 'genpareto',
                'halfcauchy', 'halfgennorm', 'invgamma', 'irwinhall', 'jf_skew_t',
                'ksone', 'kstwo', 'kstwobign', 'landau', 'levy_l', 'loggamma',
@@ -385,7 +386,7 @@ def test_rvs_broadcast(dist, shape_args):
     # implementation detail of the distribution, not a requirement.  If
     # the implementation the rvs() method of a distribution changes, this
     # test might also have to be changed.
-    shape_only = dist in ['argus', 'betaprime', 'dgamma', 'dweibull',
+    shape_only = dist in ['argus', 'betaprime', 'dgamma', 'dpareto_lognorm', 'dweibull',
                           'exponnorm', 'genhyperbolic', 'geninvgauss', 'landau',
                           'levy_stable', 'nct', 'norminvgauss', 'rice',
                           'skewnorm', 'semicircular', 'gennorm', 'loggamma']
diff --git a/scipy/stats/tests/test_distributions.py b/scipy/stats/tests/test_distributions.py
@@ -10305,6 +10305,24 @@ def test_sf_ih10_exact(self):
         assert_array_max_ulp(self.ih10.sf(1/10), ref, maxulp=10)
 
 
+class TestDParetoLognorm:
+    def test_against_R(self):
+        # Reference values come from the R package `distributionsrd`:
+        #   library(distributionsrd)
+        #   options(digits=16)
+        #   x = 1.1
+        #   b = 2; a = 1.5; m = 3; s = 1.2
+        #   ddoubleparetolognormal(x, b, a, m, s)  # -> ref_pdf
+        #   pdoubleparetolognormal(x, b, a, m, s)  # -> ref_cdf
+        ref_pdf = 0.02490187219085912
+        ref_cdf = 0.01664024173822796
+        point = 1.1
+        u, s, a, b = 3, 1.2, 1.5, 2
+        frozen = stats.dpareto_lognorm(u, s, a, b)
+        np.testing.assert_allclose(frozen.pdf(point), ref_pdf)
+        np.testing.assert_allclose(frozen.cdf(point), ref_cdf)
+
+
 # Cases are (distribution name, log10 of smallest probability mass to test,
 # log10 of the complement of the largest probability mass to test, atol,
 # rtol). None uses default values.
diff --git a/scipy/stats/tests/test_fit.py b/scipy/stats/tests/test_fit.py
@@ -21,6 +21,7 @@
 thresh_min = 0.75  # minimum difference estimate - true to fail test
 
 mle_failing_fits = [
+        'dpareto_lognorm',
         'gausshyper',
         'genexpon',
         'gengamma',
@@ -61,8 +62,8 @@
 ]
 
 mm_failing_fits = ['alpha', 'betaprime', 'burr', 'burr12', 'cauchy', 'chi',
-                   'chi2', 'crystalball', 'dgamma', 'dweibull', 'f',
-                   'fatiguelife', 'fisk', 'foldcauchy', 'genextreme',
+                   'chi2', 'crystalball', 'dgamma', 'dpareto_lognorm', 'dweibull',
+                   'f', 'fatiguelife', 'fisk', 'foldcauchy', 'genextreme',
                    'gengamma', 'genhyperbolic', 'gennorm', 'genpareto',
                    'halfcauchy', 'invgamma', 'invweibull', 'irwinhall', 'jf_skew_t',
                    'johnsonsu', 'kappa3', 'ksone', 'kstwo', 'landau', 'levy', 'levy_l',
@@ -249,8 +250,8 @@ def cases_test_fit_mle():
                       't', 'uniform', 'weibull_max', 'weibull_min', 'wrapcauchy'}
 
     # Please keep this list in alphabetical order...
-    xslow_basic_fit = {'betabinom', 'betanbinom', 'burr', 'exponweib',
-                       'gausshyper', 'gengamma', 'genhalflogistic',
+    xslow_basic_fit = {'betabinom', 'betanbinom', 'burr', 'dpareto_lognorm',
+                       'exponweib', 'gausshyper', 'gengamma', 'genhalflogistic',
                        'genhyperbolic', 'geninvgauss',
                        'hypergeom', 'kappa4', 'loguniform',
                        'ncf', 'nchypergeom_fisher', 'nchypergeom_wallenius',
@@ -307,7 +308,7 @@ def cases_test_fit_mse():
 
     # Please keep this list in alphabetical order...
     xslow_basic_fit = {'argus', 'beta', 'betaprime', 'burr', 'burr12',
-                       'dgamma', 'f', 'gengamma', 'gennorm',
+                       'dgamma', 'dpareto_lognorm', 'f', 'gengamma', 'gennorm',
                        'halfgennorm', 'invgamma', 'invgauss', 'jf_skew_t',
                        'johnsonsb', 'kappa4', 'loguniform', 'mielke',
                        'nakagami', 'ncf', 'nchypergeom_fisher',