diff --git a/pymc/backends/__init__.py b/pymc/backends/__init__.py
index 9b1af4bd4..2f58b7ed8 100644
--- a/pymc/backends/__init__.py
+++ b/pymc/backends/__init__.py
@@ -23,7 +23,7 @@
 Values can be accessed in a few ways. The easiest way is to index the
 backend object with a variable or variable name.
-    >>> trace['x'] # or trace.x or trace[x]
+    >>> trace["x"] # or trace.x or trace[x]
 The call will return the sampling values of `x`, with the values for
 all chains concatenated. (For a single call to `sample`, the number of
@@ -32,18 +32,18 @@
 To discard the first N values of each chain, slicing syntax can be used.
-    >>> trace['x', 1000:]
+    >>> trace["x", 1000:]
 The `get_values` method offers more control over which values are returned.
 The call below will discard the first 1000 iterations from each chain and
 keep the values for each chain as separate arrays.
-    >>> trace.get_values('x', burn=1000, combine=False)
+    >>> trace.get_values("x", burn=1000, combine=False)
 The `chains` parameter of `get_values` can be used to limit the chains
 that are retrieved.
-    >>> trace.get_values('x', burn=1000, chains=[0, 2])
+    >>> trace.get_values("x", burn=1000, chains=[0, 2])
 MultiTrace objects also support slicing. For example, the following
 call would return a new trace object without the first 1000 sampling
diff --git a/pymc/data.py b/pymc/data.py
index 66a78ab28..a0d6893cb 100644
--- a/pymc/data.py
+++ b/pymc/data.py
@@ -390,16 +390,16 @@ def Data(
     >>> observed_data = [mu + np.random.randn(20) for mu in true_mu]
     >>> with pm.Model() as model:
-    ...     data = pm.Data('data', observed_data[0])
-    ...     mu = pm.Normal('mu', 0, 10)
-    ...     pm.Normal('y', mu=mu, sigma=1, observed=data)
+    ...     data = pm.Data("data", observed_data[0])
+    ...     mu = pm.Normal("mu", 0, 10)
+    ...     pm.Normal("y", mu=mu, sigma=1, observed=data)
     >>> # Generate one trace for each dataset
     >>> idatas = []
     >>> for data_vals in observed_data:
     ...     with model:
     ...         # Switch out the observed dataset
-    ...         model.set_data('data', data_vals)
+    ...         model.set_data("data", data_vals)
     ...         idatas.append(pm.sample())
     """
     if coords is None:
diff --git a/pymc/distributions/continuous.py b/pymc/distributions/continuous.py
index 0af7193b3..6e68d98bf 100644
--- a/pymc/distributions/continuous.py
+++ b/pymc/distributions/continuous.py
@@ -488,10 +488,10 @@ class Normal(Continuous):
     .. code-block:: python
         with pm.Model():
-            x = pm.Normal('x', mu=0, sigma=10)
+            x = pm.Normal("x", mu=0, sigma=10)
         with pm.Model():
-            x = pm.Normal('x', mu=0, tau=1/23)
+            x = pm.Normal("x", mu=0, tau=1 / 23)
     """
     rv_op = normal
@@ -636,13 +636,13 @@ class TruncatedNormal(BoundedContinuous):
     .. code-block:: python
         with pm.Model():
-            x = pm.TruncatedNormal('x', mu=0, sigma=10, lower=0)
+            x = pm.TruncatedNormal("x", mu=0, sigma=10, lower=0)
         with pm.Model():
-            x = pm.TruncatedNormal('x', mu=0, sigma=10, upper=1)
+            x = pm.TruncatedNormal("x", mu=0, sigma=10, upper=1)
         with pm.Model():
-            x = pm.TruncatedNormal('x', mu=0, sigma=10, lower=0, upper=1)
+            x = pm.TruncatedNormal("x", mu=0, sigma=10, lower=0, upper=1)
     """
@@ -817,10 +817,10 @@ class HalfNormal(PositiveContinuous):
     .. code-block:: python
         with pm.Model():
-            x = pm.HalfNormal('x', sigma=10)
+            x = pm.HalfNormal("x", sigma=10)
         with pm.Model():
-            x = pm.HalfNormal('x', tau=1/15)
+            x = pm.HalfNormal("x", tau=1 / 15)
     """
     rv_op = halfnormal
@@ -1711,10 +1711,10 @@ class LogNormal(PositiveContinuous):
         # Example to show that we pass in only ``sigma`` or ``tau`` but not both.
         with pm.Model():
-            x = pm.LogNormal('x', mu=2, sigma=30)
+            x = pm.LogNormal("x", mu=2, sigma=30)
         with pm.Model():
-            x = pm.LogNormal('x', mu=2, tau=1/100)
+            x = pm.LogNormal("x", mu=2, tau=1 / 100)
     """
     rv_op = lognormal
@@ -1828,10 +1828,10 @@ class StudentT(Continuous):
     .. code-block:: python
         with pm.Model():
-            x = pm.StudentT('x', nu=15, mu=0, sigma=10)
+            x = pm.StudentT("x", nu=15, mu=0, sigma=10)
         with pm.Model():
-            x = pm.StudentT('x', nu=15, mu=0, lam=1/23)
+            x = pm.StudentT("x", nu=15, mu=0, lam=1 / 23)
     """
     rv_op = t
@@ -2802,10 +2802,10 @@ class HalfStudentT(PositiveContinuous):
         # Only pass in one of lam or sigma, but not both.
         with pm.Model():
-            x = pm.HalfStudentT('x', sigma=10, nu=10)
+            x = pm.HalfStudentT("x", sigma=10, nu=10)
         with pm.Model():
-            x = pm.HalfStudentT('x', lam=4, nu=10)
+            x = pm.HalfStudentT("x", lam=4, nu=10)
     """
     rv_type = HalfStudentTRV
@@ -4104,9 +4104,9 @@ class PolyaGamma(PositiveContinuous):
         rng = np.random.default_rng()
         with pm.Model():
-            x = pm.PolyaGamma('x', h=1, z=5.5)
+            x = pm.PolyaGamma("x", h=1, z=5.5)
         with pm.Model():
-            x = pm.PolyaGamma('x', h=25, z=-2.3, rng=rng, size=(100, 5))
+            x = pm.PolyaGamma("x", h=25, z=-2.3, rng=rng, size=(100, 5))
     References
     ----------
diff --git a/pymc/distributions/custom.py b/pymc/distributions/custom.py
index 9b8098ab9..ec7e30e87 100644
--- a/pymc/distributions/custom.py
+++ b/pymc/distributions/custom.py
@@ -571,13 +571,15 @@ class CustomDist:
         import pymc as pm
         from pytensor.tensor import TensorVariable
+
         def logp(value: TensorVariable, mu: TensorVariable) -> TensorVariable:
-            return -(value - mu)**2
+            return -((value - mu) ** 2)
+
         with pm.Model():
-            mu = pm.Normal('mu',0,1)
+            mu = pm.Normal("mu", 0, 1)
             pm.CustomDist(
-                'custom_dist',
+                "custom_dist",
                 mu,
                 logp=logp,
                 observed=np.random.randn(100),
@@ -596,20 +598,23 @@ def logp(value: TensorVariable, mu: TensorVariable) -> TensorVariable:
         import pymc as pm
         from pytensor.tensor import TensorVariable
+
         def logp(value: TensorVariable, mu: TensorVariable) -> TensorVariable:
-            return -(value - mu)**2
+            return -((value - mu) ** 2)
+
         def random(
             mu: np.ndarray | float,
             rng: Optional[np.random.Generator] = None,
-            size : Optional[Tuple[int]]=None,
-        ) -> np.ndarray | float :
+            size: Optional[Tuple[int]] = None,
+        ) -> np.ndarray | float:
             return rng.normal(loc=mu, scale=1, size=size)
+
         with pm.Model():
-            mu = pm.Normal('mu', 0 , 1)
+            mu = pm.Normal("mu", 0, 1)
             pm.CustomDist(
-                'custom_dist',
+                "custom_dist",
                 mu,
                 logp=logp,
                 random=random,
@@ -629,6 +634,7 @@
         import pymc as pm
         from pytensor.tensor import TensorVariable
+
         def dist(
             lam: TensorVariable,
             shift: TensorVariable,
@@ -636,6 +642,7 @@ def random(
         ) -> TensorVariable:
             return pm.Exponential.dist(lam, size=size) + shift
+
         with pm.Model() as m:
             lam = pm.HalfNormal("lam")
             shift = -1
diff --git a/pymc/distributions/mixture.py b/pymc/distributions/mixture.py
index 667ac5e69..36cd1c397 100644
--- a/pymc/distributions/mixture.py
+++ b/pymc/distributions/mixture.py
@@ -194,10 +194,10 @@ class Mixture(Distribution):
         # Mixture of 2 Poisson variables
         with pm.Model() as model:
-            w = pm.Dirichlet('w', a=np.array([1, 1])) # 2 mixture weights
+            w = pm.Dirichlet("w", a=np.array([1, 1]))  # 2 mixture weights
-            lam1 = pm.Exponential('lam1', lam=1)
-            lam2 = pm.Exponential('lam2', lam=1)
+            lam1 = pm.Exponential("lam1", lam=1)
+            lam2 = pm.Exponential("lam2", lam=1)
             # As we just need the logp, rather than add a RV to the model, we need to call `.dist()`
             # These two forms are equivalent, but the second benefits from vectorization
@@ -208,14 +208,14 @@ class Mixture(Distribution):
             # `shape=(2,)` indicates 2 mixture components
             components = pm.Poisson.dist(mu=pm.math.stack([lam1, lam2]), shape=(2,))
-            like = pm.Mixture('like', w=w, comp_dists=components, observed=data)
+            like = pm.Mixture("like", w=w, comp_dists=components, observed=data)
     .. code-block:: python
         # Mixture of Normal and StudentT variables
         with pm.Model() as model:
-            w = pm.Dirichlet('w', a=np.array([1, 1])) # 2 mixture weights
+            w = pm.Dirichlet("w", a=np.array([1, 1]))  # 2 mixture weights
             mu = pm.Normal("mu", 0, 1)
@@ -224,7 +224,7 @@ class Mixture(Distribution):
                 pm.StudentT.dist(nu=4, mu=mu, sigma=1),
             ]
-            like = pm.Mixture('like', w=w, comp_dists=components, observed=data)
+            like = pm.Mixture("like", w=w, comp_dists=components, observed=data)
     .. code-block:: python
@@ -233,10 +233,10 @@ class Mixture(Distribution):
         with pm.Model() as model:
             # w is a stack of 5 independent size 3 weight vectors
             # If shape was `(3,)`, the weights would be shared across the 5 replication dimensions
-            w = pm.Dirichlet('w', a=np.ones(3), shape=(5, 3))
+            w = pm.Dirichlet("w", a=np.ones(3), shape=(5, 3))
             # Each of the 3 mixture components has an independent mean
-            mu = pm.Normal('mu', mu=np.arange(3), sigma=1, shape=3)
+            mu = pm.Normal("mu", mu=np.arange(3), sigma=1, shape=3)
             # These two forms are equivalent, but the second benefits from vectorization
             components = [
@@ -249,14 +249,14 @@ class Mixture(Distribution):
             # The mixture is an array of 5 elements
             # Each element can be thought of as an independent scalar mixture of 3
             # components with different means
-            like = pm.Mixture('like', w=w, comp_dists=components, observed=data)
+            like = pm.Mixture("like", w=w, comp_dists=components, observed=data)
     .. code-block:: python
         # Mixture of 2 Dirichlet variables
         with pm.Model() as model:
-            w = pm.Dirichlet('w', a=np.ones(2)) # 2 mixture weights
+            w = pm.Dirichlet("w", a=np.ones(2))  # 2 mixture weights
             # These two forms are equivalent, but the second benefits from vectorization
             components = [
@@ -267,7 +267,7 @@ class Mixture(Distribution):
             # The mixture is an array of 3 elements
             # Each element comes from only one of the two core Dirichlet components
-            like = pm.Mixture('like', w=w, comp_dists=components, observed=data)
+            like = pm.Mixture("like", w=w, comp_dists=components, observed=data)
     """
     rv_type = MarginalMixtureRV
diff --git a/pymc/distributions/multivariate.py b/pymc/distributions/multivariate.py
index b30944979..5c99bc667 100644
--- a/pymc/distributions/multivariate.py
+++ b/pymc/distributions/multivariate.py
@@ -230,9 +230,9 @@ class MvNormal(Continuous):
     Define a multivariate normal variable for a given covariance matrix::
-        cov = np.array([[1., 0.5], [0.5, 2]])
+        cov = np.array([[1.0, 0.5], [0.5, 2]])
         mu = np.zeros(2)
-        vals = pm.MvNormal('vals', mu=mu, cov=cov, shape=(5, 2))
+        vals = pm.MvNormal("vals", mu=mu, cov=cov, shape=(5, 2))
     Most of the time it is preferable to specify the cholesky factor
     of the covariance instead.
     For example, we could
@@ -240,24 +240,26 @@ class MvNormal(Continuous):
     of `LKJCholeskyCov` for more information about this)::
         mu = np.zeros(3)
-        true_cov = np.array([[1.0, 0.5, 0.1],
-                             [0.5, 2.0, 0.2],
-                             [0.1, 0.2, 1.0]])
+        true_cov = np.array(
+            [
+                [1.0, 0.5, 0.1],
+                [0.5, 2.0, 0.2],
+                [0.1, 0.2, 1.0],
+            ],
+        )
         data = np.random.multivariate_normal(mu, true_cov, 10)
         sd_dist = pm.Exponential.dist(1.0, shape=3)
-        chol, corr, stds = pm.LKJCholeskyCov('chol_cov', n=3, eta=2,
-                                             sd_dist=sd_dist, compute_corr=True)
-        vals = pm.MvNormal('vals', mu=mu, chol=chol, observed=data)
+        chol, corr, stds = pm.LKJCholeskyCov("chol_cov", n=3, eta=2, sd_dist=sd_dist, compute_corr=True)
+        vals = pm.MvNormal("vals", mu=mu, chol=chol, observed=data)
     For unobserved values it can be better to use a non-centered
     parametrization::
         sd_dist = pm.Exponential.dist(1.0, shape=3)
-        chol, _, _ = pm.LKJCholeskyCov('chol_cov', n=3, eta=2,
-                                       sd_dist=sd_dist, compute_corr=True)
-        vals_raw = pm.Normal('vals_raw', mu=0, sigma=1, shape=(5, 3))
-        vals = pm.Deterministic('vals', pt.dot(chol, vals_raw.T).T)
+        chol, _, _ = pm.LKJCholeskyCov("chol_cov", n=3, eta=2, sd_dist=sd_dist, compute_corr=True)
+        vals_raw = pm.Normal("vals_raw", mu=0, sigma=1, shape=(5, 3))
+        vals = pm.Deterministic("vals", pt.dot(chol, vals_raw.T).T)
     """
     rv_op = multivariate_normal
@@ -1806,13 +1808,12 @@ class MatrixNormal(Continuous):
     Define a matrixvariate normal variable for given row and column covariance
     matrices::
-        colcov = np.array([[1., 0.5], [0.5, 2]])
+        colcov = np.array([[1.0, 0.5], [0.5, 2]])
         rowcov = np.array([[1, 0, 0], [0, 4, 0], [0, 0, 16]])
         m = rowcov.shape[0]
         n = colcov.shape[0]
         mu = np.zeros((m, n))
-        vals = pm.MatrixNormal('vals', mu=mu, colcov=colcov,
-                               rowcov=rowcov)
+        vals = pm.MatrixNormal("vals", mu=mu, colcov=colcov, rowcov=rowcov)
     Above, the ith row in vals has a variance that is scaled by 4^i.
     Alternatively, row or column cholesky matrices could be substituted for
@@ -2418,23 +2419,25 @@
         # 4x4 adjacency matrix
         # arranged in a square lattice
-        W = np.array([
-            [0,1,0,1],
-            [1,0,1,0],
-            [0,1,0,1],
-            [1,0,1,0]
-        ])
+        W = np.array(
+            [
+                [0, 1, 0, 1],
+                [1, 0, 1, 0],
+                [0, 1, 0, 1],
+                [1, 0, 1, 0],
+            ],
+        )
         # centered parameterization
         with pm.Model():
-            sigma = pm.Exponential('sigma', 1)
-            phi = pm.ICAR('phi', W=W, sigma=sigma)
+            sigma = pm.Exponential("sigma", 1)
+            phi = pm.ICAR("phi", W=W, sigma=sigma)
             mu = phi
         # non-centered parameterization
         with pm.Model():
-            sigma = pm.Exponential('sigma', 1)
-            phi = pm.ICAR('phi', W=W)
+            sigma = pm.Exponential("sigma", 1)
+            phi = pm.ICAR("phi", W=W)
             mu = sigma * phi
     References
diff --git a/pymc/distributions/simulator.py b/pymc/distributions/simulator.py
index 02c76e2c6..7bfe3d395 100644
--- a/pymc/distributions/simulator.py
+++ b/pymc/distributions/simulator.py
@@ -122,6 +122,7 @@ class Simulator(Distribution):
         def simulator_fn(rng, loc, scale, size):
             return rng.normal(loc, scale, size=size)
+
         with pm.Model() as m:
             loc = pm.Normal("loc", 0, 1)
             scale = pm.HalfNormal("scale", 1)
diff --git a/pymc/distributions/transforms.py b/pymc/distributions/transforms.py
index 0c2a43b1f..2c4e121b4 100644
--- a/pymc/distributions/transforms.py
+++ b/pymc/distributions/transforms.py
@@ -219,6 +219,7 @@ class Interval(IntervalTransform):
         def get_bounds(rng, size, mu, sigma):
             return 0, None
+
         with pm.Model():
             interval = pm.distributions.transforms.Interval(bounds_fn=get_bounds)
             x = pm.Normal("x", transform=interval)
@@ -230,6 +231,7 @@ def get_bounds(rng, size, mu, sigma):
             return mu - 1, None
+
         interval = pm.distributions.transforms.Interval(bounds_fn=get_bounds)
         with pm.Model():
diff --git a/pymc/func_utils.py b/pymc/func_utils.py
index 0b3304c34..7aaf91e9c 100644
--- a/pymc/func_utils.py
+++ b/pymc/func_utils.py
@@ -98,7 +98,7 @@ def find_constrained_prior(
         # use these parameters in a model
         with pm.Model():
-            x = pm.Gamma('x', **opt_params)
+            x = pm.Gamma("x", **opt_params)
         # specify fixed values before optimization
         opt_params = pm.find_constrained_prior(
@@ -121,7 +121,7 @@ def find_constrained_prior(
         opt_params = pm.find_constrained_prior(
             pm.Exponential,
             lower=0,
-            upper=3.,
+            upper=3.0,
             mass=0.9,
             init_guess={"lam": 1},
             mass_below_lower=0,
diff --git a/pymc/logprob/basic.py b/pymc/logprob/basic.py
index e1a5d2911..d8188f264 100644
--- a/pymc/logprob/basic.py
+++ b/pymc/logprob/basic.py
@@ -190,9 +190,11 @@ def logp(rv: TensorVariable, value: TensorLike, warn_rvs=None, **kwargs) -> Tens
         import pymc as pm
         import pytensor.tensor as pt
+
         def normal_logp(value, mu, sigma):
             return pm.logp(pm.Normal.dist(mu, sigma), value)
+
         with pm.Model() as model:
             mu = pm.Normal("mu")
             sigma = pm.HalfNormal("sigma")
@@ -289,9 +291,11 @@ def logcdf(rv: TensorVariable, value: TensorLike, warn_rvs=None, **kwargs) -> Te
         import pymc as pm
         import pytensor.tensor as pt
+
         def normal_logcdf(value, mu, sigma):
             return pm.logcdf(pm.Normal.dist(mu, sigma), value)
+
         with pm.Model() as model:
             mu = pm.Normal("mu")
             sigma = pm.HalfNormal("sigma")
diff --git a/pymc/model/core.py b/pymc/model/core.py
index 3dbd0c306..3e59e7231 100644
--- a/pymc/model/core.py
+++ b/pymc/model/core.py
@@ -438,18 +438,24 @@ class Model(WithMemoization, metaclass=ContextMeta):
         import numpy as np
         coords = {
-            "feature", ["A", "B", "C"],
-            "trial", [1, 2, 3, 4, 5],
+            "feature",
+            ["A", "B", "C"],
+            "trial",
+            [1, 2, 3, 4, 5],
         }
         with pm.Model(coords=coords) as model:
-            intercept = pm.Normal("intercept", shape=(3,)) # Variable will have default dim label `intercept__dim_0`
-            beta = pm.Normal("beta", dims=("feature",)) # Variable will have shape (3,) and dim label `feature`
+            # Variable will have default dim label `intercept__dim_0`
+            intercept = pm.Normal("intercept", shape=(3,))
+            # Variable will have shape (3,) and dim label `feature`
+            beta = pm.Normal("beta", dims=("feature",))
             # Dims below are only used for labeling, they have no effect on shape
-            idx = pm.Data("idx", np.array([0, 1, 1, 2, 2])) # Variable will have default dim label `idx__dim_0`
+            # Variable will have default dim label `idx__dim_0`
+            idx = pm.Data("idx", np.array([0, 1, 1, 2, 2]))
             x = pm.Data("x", np.random.normal(size=(5, 3)), dims=("trial", "feature"))
-            mu = pm.Deterministic("mu", intercept[idx] + beta @ x, dims="trial") # single dim can be passed as string
+            # single dim can be passed as string
+            mu = pm.Deterministic("mu", intercept[idx] + beta @ x, dims="trial")
             # Dims controls the shape of the variable
             # If not specified, it would be inferred from the shape of the observations
@@ -465,12 +471,12 @@ class Model(WithMemoization, metaclass=ContextMeta):
         with pm.Model(name="root") as root:
             x = pm.Normal("x") # Variable wil be named "root::x"
-            with pm.Model(name='first') as first:
+            with pm.Model(name="first") as first:
                 # Variable will belong to root and first
                 y = pm.Normal("y", mu=x) # Variable wil be named "root::first::y"
             # Can pass parent model explicitly
-            with pm.Model(name='second', model=root) as second:
+            with pm.Model(name="second", model=root) as second:
                 # Variable will belong to root and second
                 z = pm.Normal("z", mu=y) # Variable wil be named "root::second::z"
@@ -2013,7 +2019,6 @@ def to_graphviz(
         sigma = np.array([15, 10, 16, 11, 9, 11, 10, 18])
         with Model() as schools:
-
             eta = Normal("eta", 0, 1, shape=J)
             mu = Normal("mu", 0, sigma=1e6)
             tau = HalfCauchy("tau", 25)
@@ -2086,10 +2091,10 @@ def set_data(new_data, model=None, *, coords=None):
        import pymc as pm
        with pm.Model() as model:
-           x = pm.Data('x', [1., 2., 3.])
-           y = pm.Data('y', [1., 2., 3.])
-           beta = pm.Normal('beta', 0, 1)
-           obs = pm.Normal('obs', x * beta, 1, observed=y, shape=x.shape)
+           x = pm.Data("x", [1.0, 2.0, 3.0])
+           y = pm.Data("y", [1.0, 2.0, 3.0])
+           beta = pm.Normal("beta", 0, 1)
+           obs = pm.Normal("obs", x * beta, 1, observed=y, shape=x.shape)
           idata = pm.sample()
    Then change the value of `x` to predict on new data.
@@ -2116,9 +2121,9 @@ def set_data(new_data, model=None, *, coords=None):
        data = rng.normal(loc=1.0, scale=2.0, size=100)
        with pm.Model() as model:
-           y = pm.Data('y', data)
-           theta = pm.Normal('theta', mu=0.0, sigma=10.0)
-           obs = pm.Normal('obs', theta, 2.0, observed=y, shape=y.shape)
+           y = pm.Data("y", data)
+           theta = pm.Normal("theta", mu=0.0, sigma=10.0)
+           obs = pm.Normal("obs", theta, 2.0, observed=y, shape=y.shape)
           idata = pm.sample()
    Now update the model with a new data set.
@@ -2126,7 +2131,7 @@
    .. code-block:: python
        with model:
-           pm.set_data({'y': rng.normal(loc=1.0, scale=2.0, size=200)})
+           pm.set_data({"y": rng.normal(loc=1.0, scale=2.0, size=200)})
           idata = pm.sample()
    """
    model = modelcontext(model)
diff --git a/pymc/model/transform/conditioning.py b/pymc/model/transform/conditioning.py
index 6fc31a5d5..9bcdce7c4 100644
--- a/pymc/model/transform/conditioning.py
+++ b/pymc/model/transform/conditioning.py
@@ -256,7 +256,7 @@ def change_value_transforms(
             mean_q = pm.find_MAP()
         with change_value_transforms(transformed_p, {"p": None}) as untransformed_p:
-            new_p = untransformed_p['p']
+            new_p = untransformed_p["p"]
             std_q = ((1 / pm.find_hessian(mean_q, vars=[new_p])) ** 0.5)[0]
             print(f"  Mean, Standard deviation\np {mean_q['p']:.2}, {std_q[0]:.2}")
diff --git a/pymc/model_graph.py b/pymc/model_graph.py
index 1c230fc5a..659647726 100644
--- a/pymc/model_graph.py
+++ b/pymc/model_graph.py
@@ -622,7 +622,6 @@ def model_to_networkx(
        sigma = np.array([15, 10, 16, 11, 9, 11, 10, 18])
        with Model() as schools:
-
            eta = Normal("eta", 0, 1, shape=J)
            mu = Normal("mu", 0, sigma=1e6)
            tau = HalfCauchy("tau", 25)
@@ -726,7 +725,6 @@ def model_to_graphviz(
        sigma = np.array([15, 10, 16, 11, 9, 11, 10, 18])
        with Model() as schools:
-
            eta = Normal("eta", 0, 1, shape=J)
            mu = Normal("mu", 0, sigma=1e6)
            tau = HalfCauchy("tau", 25)
diff --git a/pymc/ode/ode.py b/pymc/ode/ode.py
index c38f6cf8b..7ecb4deba 100644
--- a/pymc/ode/ode.py
+++ b/pymc/ode/ode.py
@@ -59,9 +59,10 @@ class DifferentialEquation(Op):
    .. code-block:: python
        def odefunc(y, t, p):
-           #Logistic differential equation
+           # Logistic differential equation
            return p[0] * y[0] * (1 - y[0])
+
        times = np.arange(0.5, 5, 0.5)
        ode_model = DifferentialEquation(func=odefunc, times=times, n_states=1, n_theta=1, t0=0)
diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py
index 9d6ecd25a..f180f2794 100644
--- a/pymc/pytensorf.py
+++ b/pymc/pytensorf.py
@@ -520,20 +520,18 @@ def join_nonshared_inputs(
        y = pt.vector("y")
        # Original output
        out = x + y
-       print(out.eval({x: np.array(1), y: np.array([1, 2, 3])})) # [2, 3, 4]
+       print(out.eval({x: np.array(1), y: np.array([1, 2, 3])}))  # [2, 3, 4]
        # New output and inputs
        [new_out], joined_inputs = join_nonshared_inputs(
-           point={ # Only shapes matter
+           point={  # Only shapes matter
                "x": np.zeros(()),
                "y": np.zeros(3),
            },
            outputs=[out],
            inputs=[x, y],
        )
-       print(new_out.eval({
-           joined_inputs: np.array([1, 1, 2, 3]),
-       })) # [2, 3, 4]
+       print(new_out.eval({joined_inputs: np.array([1, 1, 2, 3])}))  # [2, 3, 4]
    Join the input value variables of a model logp.
@@ -544,15 +542,19 @@ def join_nonshared_inputs(
        with pm.Model() as model:
            mu_pop = pm.Normal("mu_pop")
            sigma_pop = pm.HalfNormal("sigma_pop")
-           mu = pm.Normal("mu", mu_pop, sigma_pop, shape=(3, ))
+           mu = pm.Normal("mu", mu_pop, sigma_pop, shape=(3,))
            y = pm.Normal("y", mu, 1.0, observed=[0, 1, 2])
-       print(model.compile_logp()({
-           "mu_pop": 0,
-           "sigma_pop_log__": 1,
-           "mu": [0, 1, 2],
-       })) # -12.691227342634292
+       print(
+           model.compile_logp()(
+               {
+                   "mu_pop": 0,
+                   "sigma_pop_log__": 1,
+                   "mu": [0, 1, 2],
+               }
+           )
+       )  # -12.691227342634292
        initial_point = model.initial_point()
        inputs = model.value_vars
@@ -563,9 +565,13 @@ def join_nonshared_inputs(
            inputs=inputs,
        )
-       print(logp.eval({
-           joined_inputs: [0, 1, 0, 1, 2],
-       })) # -12.691227342634292
+       print(
+           logp.eval(
+               {
+                   joined_inputs: [0, 1, 0, 1, 2],
+               }
+           )
+       )  # -12.691227342634292
    Same as above but with the `mu_pop` value variable being shared.
@@ -580,14 +586,16 @@ def join_nonshared_inputs(
            point=initial_point,
            outputs=[model.logp()],
            inputs=other_inputs,
-           shared_inputs={
-               mu_pop_input: shared_mu_pop_input
-           },
+           shared_inputs={mu_pop_input: shared_mu_pop_input},
        )
-       print(logp.eval({
-           other_joined_inputs: [1, 0, 1, 2],
-       })) # -12.691227342634292
+       print(
+           logp.eval(
+               {
+                   other_joined_inputs: [1, 0, 1, 2],
+               }
+           )
+       )  # -12.691227342634292
    """
    if not inputs:
        raise ValueError("Empty list of input variables.")
diff --git a/pymc/sampling/mcmc.py b/pymc/sampling/mcmc.py
index 32d2702ff..f7f181686 100644
--- a/pymc/sampling/mcmc.py
+++ b/pymc/sampling/mcmc.py
@@ -599,8 +599,10 @@ def sample(
        e.g. for a CompoundStep comprising NUTS and BinaryGibbsMetropolis,
        you could send ::
-           step=[pm.NUTS([freeRV1, freeRV2], target_accept=0.9),
-                 pm.BinaryGibbsMetropolis([freeRV3], transit_p=.7)]
+           step = [
+               pm.NUTS([freeRV1, freeRV2], target_accept=0.9),
+               pm.BinaryGibbsMetropolis([freeRV3], transit_p=0.7),
+           ]
        You can find a full list of arguments in the docstring of the step methods.
diff --git a/pymc/variational/callbacks.py b/pymc/variational/callbacks.py
index 3c911e1ba..36904db5e 100644
--- a/pymc/variational/callbacks.py
+++ b/pymc/variational/callbacks.py
@@ -59,11 +59,8 @@ class CheckParametersConvergence(Callback):
    --------
    >>> with model:
    ...     approx = pm.fit(
-   ...         n=10000, callbacks=[
-   ...             CheckParametersConvergence(
-   ...                 every=50, diff='absolute',
-   ...                 tolerance=1e-4)
-   ...         ]
+   ...         n=10000,
+   ...         callbacks=[CheckParametersConvergence(every=50, diff="absolute", tolerance=1e-4)],
    ...     )
    """
diff --git a/pymc/variational/opvi.py b/pymc/variational/opvi.py
index c59398710..96db3b269 100644
--- a/pymc/variational/opvi.py
+++ b/pymc/variational/opvi.py
@@ -604,7 +604,7 @@ class Group(WithMemoization):
    .. code:: python
-       >>> group = Group([latent1, latent2], vfam='mean_field')
+       >>> group = Group([latent1, latent2], vfam="mean_field")
    The other way to select approximation is to provide `params` dictionary that has some
    predefined well shaped parameters. Keys of the dict serve as an identifier for variational family and help
@@ -639,8 +639,8 @@ class Group(WithMemoization):
    .. code:: python
-       >>> group_1 = Group([latent1], vfam='fr') # latent1 has full rank approximation
-       >>> group_other = Group(None, vfam='mf') # other variables have mean field Q
+       >>> group_1 = Group([latent1], vfam="fr")  # latent1 has full rank approximation
+       >>> group_other = Group(None, vfam="mf")  # other variables have mean field Q
        >>> approx = Approximation([group_1, group_other])
    **Summing Up**
diff --git a/pymc/variational/updates.py b/pymc/variational/updates.py
index 224c2d249..d919a7e24 100644
--- a/pymc/variational/updates.py
+++ b/pymc/variational/updates.py
@@ -94,8 +94,8 @@
>>> from lasagne.updates import sgd, apply_momentum
>>> l_in = InputLayer((100, 20))
>>> l1 = DenseLayer(l_in, num_units=3, nonlinearity=softmax)
->>> x = pt.matrix('x') # shp: num_batch x num_features
->>> y = pt.ivector('y') # shp: num_batch
+>>> x = pt.matrix("x")  # shp: num_batch x num_features
+>>> y = pt.ivector("y")  # shp: num_batch
>>> l_out = get_output(l1, x)
>>> params = lasagne.layers.get_all_params(l1)
>>> loss = pt.mean(pt.nnet.categorical_crossentropy(l_out, y))
@@ -212,12 +212,12 @@ def sgd(loss_or_grads=None, params=None, learning_rate=1e-3):
    Examples
    --------
-   >>> a = pytensor.shared(1.)
-   >>> b = a*2
-   >>> updates = sgd(b, [a], learning_rate=.01)
+   >>> a = pytensor.shared(1.0)
+   >>> b = a * 2
+   >>> updates = sgd(b, [a], learning_rate=0.01)
    >>> isinstance(updates, dict)
    True
-   >>> optimizer = sgd(learning_rate=.01)
+   >>> optimizer = sgd(learning_rate=0.01)
    >>> callable(optimizer)
    True
    >>> updates = optimizer(b, [a])
@@ -324,12 +324,12 @@ def momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9):
    Examples
    --------
-   >>> a = pytensor.shared(1.)
-   >>> b = a*2
-   >>> updates = momentum(b, [a], learning_rate=.01)
+   >>> a = pytensor.shared(1.0)
+   >>> b = a * 2
+   >>> updates = momentum(b, [a], learning_rate=0.01)
    >>> isinstance(updates, dict)
    True
-   >>> optimizer = momentum(learning_rate=.01)
+   >>> optimizer = momentum(learning_rate=0.01)
    >>> callable(optimizer)
    True
    >>> updates = optimizer(b, [a])
@@ -442,12 +442,12 @@ def nesterov_momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momen
    Examples
    --------
-   >>> a = pytensor.shared(1.)
-   >>> b = a*2
-   >>> updates = nesterov_momentum(b, [a], learning_rate=.01)
+   >>> a = pytensor.shared(1.0)
+   >>> b = a * 2
+   >>> updates = nesterov_momentum(b, [a], learning_rate=0.01)
    >>> isinstance(updates, dict)
    True
-   >>> optimizer = nesterov_momentum(learning_rate=.01)
+   >>> optimizer = nesterov_momentum(learning_rate=0.01)
    >>> callable(optimizer)
    True
    >>> updates = optimizer(b, [a])
@@ -510,12 +510,12 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6):
    Examples
    --------
-   >>> a = pytensor.shared(1.)
-   >>> b = a*2
-   >>> updates = adagrad(b, [a], learning_rate=.01)
+   >>> a = pytensor.shared(1.0)
+   >>> b = a * 2
+   >>> updates = adagrad(b, [a], learning_rate=0.01)
    >>> isinstance(updates, dict)
    True
-   >>> optimizer = adagrad(learning_rate=.01)
+   >>> optimizer = adagrad(learning_rate=0.01)
    >>> callable(optimizer)
    True
    >>> updates = optimizer(b, [a])
@@ -813,12 +813,12 @@ def adam(
    Examples
    --------
-   >>> a = pytensor.shared(1.)
-   >>> b = a*2
-   >>> updates = adam(b, [a], learning_rate=.01)
+   >>> a = pytensor.shared(1.0)
+   >>> b = a * 2
+   >>> updates = adam(b, [a], learning_rate=0.01)
    >>> isinstance(updates, dict)
    True
-   >>> optimizer = adam(learning_rate=.01)
+   >>> optimizer = adam(learning_rate=0.01)
    >>> callable(optimizer)
    True
    >>> updates = optimizer(b, [a])
@@ -897,12 +897,12 @@ def adamax(
    Examples
    --------
-   >>> a = pytensor.shared(1.)
-   >>> b = a*2
-   >>> updates = adamax(b, [a], learning_rate=.01)
+   >>> a = pytensor.shared(1.0)
+   >>> b = a * 2
+   >>> updates = adamax(b, [a], learning_rate=0.01)
    >>> isinstance(updates, dict)
    True
-   >>> optimizer = adamax(learning_rate=.01)
+   >>> optimizer = adamax(learning_rate=0.01)
    >>> callable(optimizer)
    True
    >>> updates = optimizer(b, [a])
@@ -975,8 +975,7 @@ def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7):
    Examples
    --------
-   >>> param = pytensor.shared(
-   ...     np.random.randn(100, 200).astype(pytensor.config.floatX))
+   >>> param = pytensor.shared(np.random.randn(100, 200).astype(pytensor.config.floatX))
    >>> update = param + 100
    >>> update = norm_constraint(update, 10)
    >>> func = pytensor.function([], [], updates=[(param, update)])
diff --git a/pyproject.toml b/pyproject.toml
index 459c1742e..f59054eb1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,9 @@
 line-length = 100
 target-version = "py310"
 extend-exclude = ["_version.py"]
+[tool.ruff.format]
+docstring-code-format = true
+
 [tool.ruff.lint]
 select = ["D", "E", "F", "I", "UP", "W", "RUF"]
 ignore = [
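
For context: the `[tool.ruff.format]` table added in the last hunk is what drives every docstring rewrite in this patch. With `docstring-code-format = true`, running `ruff format` also formats code embedded in docstrings (doctests and reStructuredText code blocks) the same way it formats ordinary source: double quotes, spaces around operators, blank lines around nested `def`s. A minimal sketch of the effect follows; the module and function below are hypothetical and not part of the patch::

    # example.py -- hypothetical file, not part of this patch.
    # With docstring-code-format = true in pyproject.toml, `ruff format example.py`
    # also rewrites the code inside the docstring's code-block below, mirroring
    # the kind of changes shown throughout this diff.


    def scale(name, value):
        """Return the named value doubled.

        Examples
        --------
        .. code-block:: python

            result = scale(name='x', value=1/23)  # ruff would reformat this line
        """
        return {name: value * 2}

After formatting, the embedded example line would read `result = scale(name="x", value=1 / 23)`, matching the quote and spacing changes applied to the PyMC docstrings above.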