Commit 050e65d

MAINT: stats: Add custom reprs for transformed distributions (scipy#22037)
* MAINT: sort dict of parameters instead of sorting later
* MAINT: improve reprs of transformed distributions to make them executable
* MAINT: raise if there is no __repr__ override in a transformed distribution subclass
* MAINT: remove __str__ from monotonic transform
* TST: Update __repr__ tests for continuous dists
* TST: Add tests that the reprs evaluate to the correct dist
* BUG: Set array priority to 1 to get reflected operators working
* MAINT: summarize arrays to prevent long output
1 parent ce4ae0e commit 050e65d
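To make the effect concrete: after this change a distribution's repr spells out the parameter values and, with the right names in scope, can be evaluated back into an equivalent distribution. A minimal sketch, assuming a SciPy build that includes this commit (the new random-variable infrastructure, 1.15+); the output shown follows the NumPy 2.x scalar formatting asserted in the updated tests:

    import numpy as np
    from scipy import stats
    from scipy.stats import Normal  # imported so that eval() below can resolve the name

    X = stats.Normal(mu=0, sigma=1)
    print(repr(X))   # e.g. Normal(mu=np.float64(0.0), sigma=np.float64(1.0)) on NumPy 2.x

    # The repr is intended to round-trip: evaluating it reproduces the distribution,
    # which is what the new TestReprs.test_executable checks with shared rng seeds.
    Y = eval(repr(X))
    print(np.all(X.sample(shape=5, rng=1234) == Y.sample(shape=5, rng=1234)))  # True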

2 files changed (+190 lines, -50 lines)


scipy/stats/_distribution_infrastructure.py

Lines changed: 76 additions & 39 deletions
@@ -27,7 +27,8 @@
 def _isnull(x):
     return type(x) is object or x is None

-__all__ = ['ContinuousDistribution']
+__all__ = ['make_distribution', 'Mixture', 'order_statistic',
+           'truncate', 'abs', 'exp', 'log']

 # Could add other policies for broadcasting and edge/out-of-bounds case handling
 # For instance, when edge case handling is known not to be needed, it's much
@@ -1482,6 +1483,7 @@ class ContinuousDistribution(_ProbabilityDistribution):
     text.

     """
+    __array_priority__ = 1
     _parameterizations = []  # type: ignore[var-annotated]

     ### Initialization
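The `__array_priority__ = 1` attribute is what the commit message's "reflected operators" item refers to: NumPy's binary operators defer (return NotImplemented) to an operand that advertises a higher priority than ndarray's default of 0.0 and defines the reflected method, so `array * dist` reaches `dist.__rmul__`. A standalone toy sketch of that NumPy mechanism, not scipy code:

    import numpy as np

    class Defers:
        # Higher than ndarray's default priority (0.0): ndarray.__mul__ should then
        # return NotImplemented and Python falls back to Defers.__rmul__.
        __array_priority__ = 1

        def __rmul__(self, other):
            return ("__rmul__ received", other)

    print(np.ones(2, dtype=np.float32) * Defers())
    # expected: ('__rmul__ received', array([1., 1.], dtype=float32))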
@@ -1501,7 +1503,8 @@ def __init__(self, *, tol=_null, validation_policy=None, cache_policy=None,
         # IDEs can suggest parameter names. If there are multiple parameterizations,
         # we'll need the default values of parameters to be None; this will
         # filter out the parameters that were not actually specified by the user.
-        parameters = {key: val for key, val in parameters.items() if val is not None}
+        parameters = {key: val for key, val in
+                      sorted(parameters.items()) if val is not None}
         self._update_parameters(**parameters)

     def _update_parameters(self, *, validation_policy=None, **params):
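Sorting here, at construction time, is what lets `_get_parameter_str` below (and `__repr__`) simply join the stored keys without re-sorting. A tiny illustration of the comprehension's behaviour:

    # None marks a parameter the user did not pass; it is filtered out, and the
    # remaining items come back in alphabetical order.
    parameters = {"b": 2.0, "a": 1.0, "loc": None}
    parameters = {key: val for key, val in sorted(parameters.items()) if val is not None}
    print(parameters)   # {'a': 1.0, 'b': 2.0}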
@@ -1701,9 +1704,7 @@ def _process_parameters(self, **params):

     def _get_parameter_str(self, parameters):
         # Get a string representation of the parameters like "{a, b, c}".
-        parameter_names_list = list(parameters.keys())
-        parameter_names_list.sort()
-        return f"{{{', '.join(parameter_names_list)}}}"
+        return f"{{{', '.join(parameters.keys())}}}"

     def _copy_parameterization(self):
         self._parameterizations = self._parameterizations.copy()
@@ -1786,25 +1787,17 @@ def __repr__(self):
         r""" Returns a string representation of the distribution.

         Includes the name of the distribution family, the names of the
-        parameters, and the broadcasted shape and result dtype of the
-        parameters.
+        parameters and the `repr` of each of their values.
+

         """
         class_name = self.__class__.__name__
         parameters = list(self._original_parameters.items())
         info = []
-        if parameters:
-            parameters.sort()
-            if self._size <= 3:
-                str_parameters = [f"{symbol}={value}" for symbol, value in parameters]
-                str_parameters = f"{', '.join(str_parameters)}"
-            else:
-                str_parameters = f"{', '.join([symbol for symbol, _ in parameters])}"
-            info.append(str_parameters)
-        if self._shape:
-            info.append(f"shape={self._shape}")
-        if self._dtype != np.float64:
-            info.append(f"dtype={self._dtype}")
+        with np.printoptions(threshold=10):
+            str_parameters = [f"{symbol}={repr(value)}" for symbol, value in parameters]
+            str_parameters = f"{', '.join(str_parameters)}"
+            info.append(str_parameters)
         return f"{class_name}({', '.join(info)})"

     def __add__(self, loc):
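With this rewrite the repr lists `name=value` pairs using each value's own repr, evaluated under `np.printoptions(threshold=10)` so that long parameter arrays are summarized instead of printed in full. A sketch of the intended behaviour; the first expected string is taken from the updated test below, and `Uniform` is imported from the private module exactly as the tests do:

    import numpy as np
    from scipy.stats._new_distributions import Uniform  # private import, as in the tests

    X = Uniform(a=np.zeros(4), b=1)
    print(repr(X))   # Uniform(a=array([0., 0., 0., 0.]), b=1)

    # Larger arrays get summarized by the threshold, keeping the repr short;
    # the new TestReprs.test_not_too_long asserts bounds of this kind (< 250 chars).
    Y = Uniform(a=np.zeros(1000), b=np.ones(1000))
    print(len(repr(Y)) < 250)   # True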
@@ -1825,10 +1818,13 @@ def __pow__(self, other):
                        "implemented when the argument is a positive integer.")
             raise NotImplementedError(message)

-        X = abs(self) if (other % 2 == 0) else self
+        # Fill in repr_pattern with the repr of self before taking abs.
+        # Avoids having unnecessary abs in the repr.
+        with np.printoptions(threshold=10):
+            repr_pattern = f"({repr(self)})**{repr(other)}"
+        X = abs(self) if other % 2 == 0 else self

-        # This notation for g_name is nonstandard
-        funcs = dict(g=lambda u: u**other, g_name=f'pow_{other}',
+        funcs = dict(g=lambda u: u**other, repr_pattern=repr_pattern,
                      h=lambda u: np.sign(u) * np.abs(u)**(1 / other),
                      dh=lambda u: 1/other * np.abs(u)**(1/other - 1))

@@ -1846,8 +1842,10 @@ def __rmul__(self, other):

     def __rtruediv__(self, other):
         a, b = self.support()
-        funcs = dict(g=lambda u: 1 / u, g_name='inv',
-                     h=lambda u: 1 / u, dh=lambda u: 1 / u ** 2)
+        with np.printoptions(threshold=10):
+            funcs = dict(g=lambda u: 1 / u,
+                         repr_pattern=f"{repr(other)}/({repr(self)})",
+                         h=lambda u: 1 / u, dh=lambda u: 1 / u ** 2)
         if np.all(a >= 0) or np.all(b <= 0):
             out = MonotonicTransformedDistribution(self, **funcs, increasing=False)
         else:
@@ -1860,9 +1858,11 @@ def __rtruediv__(self, other):
         return out * other

     def __rpow__(self, other):
-        funcs = dict(g=lambda u: other**u, g_name=f'{other}**',
-                     h=lambda u: np.log(u) / np.log(other),
-                     dh=lambda u: 1 / np.abs(u * np.log(other)))
+        with np.printoptions(threshold=10):
+            funcs = dict(g=lambda u: other**u,
+                         h=lambda u: np.log(u) / np.log(other),
+                         dh=lambda u: 1 / np.abs(u * np.log(other)),
+                         repr_pattern=f"{repr(other)}**({repr(self)})")

         if not np.isscalar(other) or other <= 0 or other == 1:
             message = ("Raising an argument to the power of a random variable is only "
@@ -3846,9 +3846,7 @@ def _process_parameters(self, **params):
         return self._dist._process_parameters(**params)

     def __repr__(self):
-        s = super().__repr__()
-        return s.replace("Distribution",
-                         self._dist.__class__.__name__)
+        raise NotImplementedError()


 class TruncatedDistribution(TransformedDistribution):
@@ -3926,6 +3924,11 @@ def _iccdf_dispatch(self, p, *args, lb, ub, _a, _b, logmass, **params):
         p_adjusted = cFb + p*np.exp(logmass)
         return self._dist._iccdf_dispatch(p_adjusted, *args, **params)

+    def __repr__(self):
+        with np.printoptions(threshold=10):
+            return (f"truncate({repr(self._dist)}, "
+                    f"lb={repr(self.lb)}, ub={repr(self.ub)})")
+

 def truncate(X, lb=-np.inf, ub=np.inf):
     """Truncate the support of a random variable.
@@ -4026,6 +4029,18 @@ def _support(self, loc, scale, sign, **params):
         a, b = self._itransform(a, loc, scale), self._itransform(b, loc, scale)
         return np.where(sign, a, b)[()], np.where(sign, b, a)[()]

+    def __repr__(self):
+        with np.printoptions(threshold=10):
+            result = f"{repr(self.scale)}*{repr(self._dist)}"
+            if not self.loc.ndim and self.loc < 0:
+                result += f" - {repr(-self.loc)}"
+            elif (np.any(self.loc != 0)
+                  or not np.can_cast(self.loc.dtype, self.scale.dtype)):
+                # We don't want to hide a zero array loc if it can cause
+                # a type promotion.
+                result += f" + {repr(self.loc)}"
+            return result
+
     # Here, we override all the `_dispatch` methods rather than the public
     # methods or _function methods. Why not the public methods?
     # If we were to override the public methods, then other
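The branches above control how the shift is printed: a negative scalar `loc` is rendered as a subtraction, a nonzero (or dtype-changing) `loc` as an addition, and an inert zero is dropped. The expected strings in the updated test below give the flavour (NumPy 1.x scalar formatting shown; 2.x wraps scalars in `np.float64(...)`):

    # First expected string copied from test_ContinuousDistribution__repr__ below;
    # the second is an inference from the `loc < 0` branch above, not from the tests.
    from scipy.stats._new_distributions import Uniform  # private import, as in the tests

    X = Uniform(a=0, b=1)
    repr(X*3 + 2)   # "3.0*Uniform(a=0.0, b=1.0) + 2.0" on NumPy 1.x
    repr(X*3 - 2)   # presumably "3.0*Uniform(a=0.0, b=1.0) - 2.0"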
@@ -4298,6 +4313,11 @@ def _iccdf_formula(self, p, r, n, **kwargs):
         p_ = special.betainccinv(r, n-r+1, p)
         return self._dist._icdf_dispatch(p_, **kwargs)

+    def __repr__(self):
+        with np.printoptions(threshold=10):
+            return (f"order_statistic({repr(self._dist)}, r={repr(self.r)}, "
+                    f"n={repr(self.n)})")
+

 def order_statistic(X, /, *, r, n):
     r"""Probability distribution of an order statistic
@@ -4678,6 +4698,17 @@ def sample(self, shape=(), *, rng=None, method=None):
         x = np.reshape(rng.permuted(np.concatenate(x)), shape)
         return x[()]

+    def __repr__(self):
+        result = "Mixture(\n"
+        result += " [\n"
+        with np.printoptions(threshold=10):
+            for component in self.components:
+                result += f" {repr(component)},\n"
+        result += " ],\n"
+        result += f" weights={repr(self.weights)},\n"
+        result += ")"
+        return result
+

 class MonotonicTransformedDistribution(TransformedDistribution):
     r"""Distribution underlying a strictly monotonic function of a random variable
@@ -4701,14 +4732,18 @@ class MonotonicTransformedDistribution(TransformedDistribution):
     increasing : bool, optional
         Whether the function is strictly increasing (True, default)
        or strictly decreasing (False).
-    g_name : str, optional
-        The name of the mathematical function represented by `g`,
-        used in `__repr__` and `__str__`. The default is ``g.__name__``.
+    repr_pattern : str, optional
+        A string pattern for determining the __repr__. The __repr__
+        for X will be substituted into the position where `***` appears.
+        For example:
+        ``"exp(***)"`` for the repr of an exponentially transformed
+        distribution
+        The default is ``f"{g.__name__}(***)"``.

     """

     def __init__(self, X, /, *args, g, h, dh, logdh=None,
-                 increasing=True, g_name=None, **kwargs):
+                 increasing=True, repr_pattern=None, **kwargs):
         super().__init__(X, *args, **kwargs)
         self._g = g
         self._h = h
@@ -4734,13 +4769,11 @@ def __init__(self, X, /, *args, g, h, dh, logdh=None,
             self._ilogxdf = self._dist._ilogccdf_dispatch
             self._ilogcxdf = self._dist._ilogcdf_dispatch
         self._increasing = increasing
-        self._g_name = g.__name__ if g_name is None else g_name
+        self._repr_pattern = repr_pattern or f"{g.__name__}(***)"

     def __repr__(self):
-        return f"{self._g_name}({repr(self._dist)})"
-
-    def __str__(self):
-        return f"{self._g_name}({str(self._dist)})"
+        with np.printoptions(threshold=10):
+            return self._repr_pattern.replace("***", repr(self._dist))

     def _overrides(self, method_name):
         # Do not use the generic overrides of TransformedDistribution
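The `***` placeholder documented above is plain string substitution performed at repr time; a standalone miniature of what the stored pattern does (hypothetical values, not scipy internals):

    repr_pattern = "exp(***)"                   # the default would be f"{g.__name__}(***)"
    inner = "Uniform(a=1.0, b=2.0)"             # stands in for repr(self._dist)
    print(repr_pattern.replace("***", inner))   # exp(Uniform(a=1.0, b=2.0))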
@@ -4892,6 +4925,10 @@ def _sample_dispatch(self, sample_shape, full_shape, *,
             sample_shape, full_shape, method=method, rng=rng, **params)
         return np.abs(rvs)

+    def __repr__(self):
+        with np.printoptions(threshold=10):
+            return f"abs({repr(self._dist)})"
+

 def abs(X, /):
     r"""Absolute value of a random variable

scipy/stats/tests/test_continuous.py

Lines changed: 114 additions & 11 deletions
@@ -1114,11 +1114,17 @@ def test_repr_str_docs(self):
         assert hasattr(stats, dist)

         dist = stats.make_distribution(stats.gamma)
-        assert str(dist(a=2)) == "Gamma(a=2.0)"
+        if np.__version__ < "2":
+            assert str(dist(a=2)) == "Gamma(a=2.0)"
+        else:
+            assert str(dist(a=2)) == "Gamma(a=np.float64(2.0))"
         assert 'Gamma' in dist.__doc__

         dist = stats.make_distribution(stats.halfgennorm)
-        assert str(dist(beta=2)) == "HalfGeneralizedNormal(beta=2.0)"
+        if np.__version__ < "2":
+            str(dist(beta=2)) == "HalfGeneralizedNormal(beta=2.0)"
+        else:
+            assert str(dist(beta=2)) == "HalfGeneralizedNormal(beta=np.float64(2.0))"
         assert 'HalfGeneralizedNormal' in dist.__doc__


@@ -1381,10 +1387,18 @@ def test_log(self):
     def test_monotonic_transforms(self):
         # Some tests of monotonic transforms that are better to be grouped or
         # don't fit well above
+
         X = Uniform(a=1, b=2)
-        assert repr(stats.log(X)) == str(stats.log(X)) == "log(Uniform(a=1.0, b=2.0))"
-        assert repr(1 / X) == str(1 / X) == "inv(Uniform(a=1.0, b=2.0))"
-        assert repr(stats.exp(X)) == str(stats.exp(X)) == "exp(Uniform(a=1.0, b=2.0))"
+        X_repr = (
+            "Uniform(a=1.0, b=2.0)" if np.__version__ < "2"
+            else "Uniform(a=np.float64(1.0), b=np.float64(2.0))"
+        )
+
+        assert repr(stats.log(X)) == str(stats.log(X)) == (
+            f"log({X_repr})"
+        )
+        assert repr(1 / X) == str(1 / X) == f"1/({X_repr})"
+        assert repr(stats.exp(X)) == str(stats.exp(X)) == f"exp({X_repr})"

         X = Uniform(a=-1, b=2)
         message = "Division by a random variable is only implemented when the..."
@@ -1634,17 +1648,106 @@ def test_generate_domain_support(self):
         msg = _generate_domain_support(_LogUniform)
         assert "accepts two parameterizations" in msg

-    def test_ContinuousDistribution__str__(self):
+    def test_ContinuousDistribution__repr__(self):
         X = Uniform(a=0, b=1)
-        assert str(X) == "Uniform(a=0.0, b=1.0)"
-
-        assert str(X*3 + 2) == "ShiftedScaledUniform(a=0.0, b=1.0, loc=2.0, scale=3.0)"
+        if np.__version__ < "2":
+            assert repr(X) == "Uniform(a=0.0, b=1.0)"
+        else:
+            assert repr(X) == "Uniform(a=np.float64(0.0), b=np.float64(1.0))"
+        if np.__version__ < "2":
+            assert repr(X*3 + 2) == "3.0*Uniform(a=0.0, b=1.0) + 2.0"
+        else:
+            assert repr(X*3 + 2) == (
+                "np.float64(3.0)*Uniform(a=np.float64(0.0), b=np.float64(1.0))"
+                " + np.float64(2.0)"
+            )

         X = Uniform(a=np.zeros(4), b=1)
-        assert str(X) == "Uniform(a, b, shape=(4,))"
+        assert repr(X) == "Uniform(a=array([0., 0., 0., 0.]), b=1)"

         X = Uniform(a=np.zeros(4, dtype=np.float32), b=np.ones(4, dtype=np.float32))
-        assert str(X) == "Uniform(a, b, shape=(4,), dtype=float32)"
+        assert repr(X) == (
+            "Uniform(a=array([0., 0., 0., 0.], dtype=float32),"
+            " b=array([1., 1., 1., 1.], dtype=float32))"
+        )
+
+
+class TestReprs:
+    U = Uniform(a=0, b=1)
+    V = Uniform(a=np.float32(0.0), b=np.float32(1.0))
+    X = Normal(mu=-1, sigma=1)
+    Y = Normal(mu=1, sigma=1)
+    Z = Normal(mu=np.zeros(1000), sigma=1)
+
+    @pytest.mark.parametrize(
+        "dist",
+        [
+            U,
+            U - np.array([1.0, 2.0]),
+            pytest.param(
+                V,
+                marks=pytest.mark.skipif(
+                    np.__version__ < "2",
+                    reason="numpy 1.x didn't have dtype in repr",
+                )
+            ),
+            pytest.param(
+                np.ones(2, dtype=np.float32)*V + np.zeros(2, dtype=np.float64),
+                marks=pytest.mark.skipif(
+                    np.__version__ < "2",
+                    reason="numpy 1.x didn't have dtype in repr",
+                )
+            ),
+            3*U + 2,
+            U**4,
+            (3*U + 2)**4,
+            (3*U + 2)**3,
+            2**U,
+            2**(3*U + 1),
+            1 / (1 + U),
+            stats.order_statistic(U, r=3, n=5),
+            stats.truncate(U, 0.2, 0.8),
+            stats.Mixture([X, Y], weights=[0.3, 0.7]),
+            abs(U),
+            stats.exp(U),
+            stats.log(1 + U),
+            np.array([1.0, 2.0])*U + np.array([2.0, 3.0]),
+        ]
+    )
+    def test_executable(self, dist):
+        # Test that reprs actually evaluate to proper distribution
+        # provided relevant imports are made.
+        from numpy import array  # noqa: F401
+        from numpy import float32  # noqa: F401
+        from scipy.stats import abs, exp, log, order_statistic, truncate  # noqa: F401
+        from scipy.stats import Mixture, Normal  # noqa: F401
+        from scipy.stats._new_distributions import Uniform  # noqa: F401
+        new_dist = eval(repr(dist))
+        # A basic check that the distributions are the same
+        sample1 = dist.sample(shape=10, rng=1234)
+        sample2 = new_dist.sample(shape=10, rng=1234)
+        assert_equal(sample1, sample2)
+        assert sample1.dtype is sample2.dtype
+
+    @pytest.mark.parametrize(
+        "dist",
+        [
+            Z,
+            np.full(1000, 2.0) * X + 1.0,
+            2.0 * X + np.full(1000, 1.0),
+            np.full(1000, 2.0) * X + 1.0,
+            stats.truncate(Z, -1, 1),
+            stats.truncate(Z, -np.ones(1000), np.ones(1000)),
+            stats.order_statistic(X, r=np.arange(1, 1000), n=1000),
+            Z**2,
+            1.0 / (1 + stats.exp(Z)),
+            2**Z,
+        ]
+    )
+    def test_not_too_long(self, dist):
+        # Tests that array summarization is working to ensure reprs aren't too long.
+        # None of the reprs above will be executable.
+        assert len(repr(dist)) < 250


 class MixedDist(ContinuousDistribution):
