WIP fix C_t broadcasting

ColtAllen · ColtAllen · commit 0b05f28527ef · 2025-08-13T10:17:28.000-06:00
diff --git a/pymc_extras/distributions/discrete.py b/pymc_extras/distributions/discrete.py
@@ -412,7 +412,12 @@ class GrassiaIIGeometricRV(RandomVariable):
     @classmethod
     def rng_fn(cls, rng, r, alpha, time_covariate_vector, size):
         # Aggregate time covariates for each sample before broadcasting
-        exp_time_covar = np.exp(time_covariate_vector).sum(axis=0)
+        time_cov = np.asarray(time_covariate_vector)
+        if np.ndim(time_cov) == 0:
+            exp_time_covar = np.asarray(1.0)
+        else:
+            # Collapse all time/feature axes to a scalar multiplier for RNG
+            exp_time_covar = np.asarray(np.exp(time_cov).sum())
 
         # Determine output size
         if size is None:
@@ -500,24 +505,29 @@ def dist(cls, r, alpha, time_covariate_vector=None, *args, **kwargs):
 
         if time_covariate_vector is None:
             time_covariate_vector = pt.constant(0.0)
+        time_covariate_vector = pt.as_tensor_variable(time_covariate_vector)
+        # Normalize covariate to be 1D over time
+        if time_covariate_vector.ndim == 0:
+            time_covariate_vector = pt.reshape(time_covariate_vector, (1,))
+        elif time_covariate_vector.ndim > 1:
+            feature_axes = tuple(range(time_covariate_vector.ndim - 1))
+            time_covariate_vector = pt.sum(time_covariate_vector, axis=feature_axes)
 
         return super().dist([r, alpha, time_covariate_vector], *args, **kwargs)
 
     def logp(value, r, alpha, time_covariate_vector):
-        logp = pt.log(
-            pt.pow(alpha / (alpha + C_t(value - 1, time_covariate_vector)), r)
-            - pt.pow(alpha / (alpha + C_t(value, time_covariate_vector)), r)
-        )
-
-        # Handle invalid values
-        logp = pt.switch(
-            pt.or_(
-                value < 1,  # Value must be >= 1
-                pt.isnan(logp),  # Handle NaN cases
-            ),
-            -np.inf,
-            logp,
-        )
+        v = pt.as_tensor_variable(value)
+        ct_prev = C_t(v - 1, time_covariate_vector)
+        ct_curr = C_t(v, time_covariate_vector)
+        logS_prev = r * (pt.log(alpha) - pt.log(alpha + ct_prev))
+        logS_curr = r * (pt.log(alpha) - pt.log(alpha + ct_curr))
+        # Compute log(exp(logS_prev) - exp(logS_curr)) stably
+        max_logS = pt.maximum(logS_prev, logS_curr)
+        diff = pt.exp(logS_prev - max_logS) - pt.exp(logS_curr - max_logS)
+        logp = max_logS + pt.log(diff)
+
+        # Handle invalid / out-of-domain values
+        logp = pt.switch(value < 1, -np.inf, logp)
 
         return check_parameters(
             logp,
@@ -527,9 +537,15 @@ def logp(value, r, alpha, time_covariate_vector):
         )
 
     def logcdf(value, r, alpha, time_covariate_vector):
-        logcdf = r * (
-            pt.log(C_t(value, time_covariate_vector))
-            - pt.log(alpha + C_t(value, time_covariate_vector))
+        # Log CDF: log(1 - (alpha / (alpha + C(t)))**r)
+        t = pt.as_tensor_variable(value)
+        ct = C_t(t, time_covariate_vector)
+        logS = r * (pt.log(alpha) - pt.log(alpha + ct))
+        # Numerically stable log(1 - exp(logS))
+        logcdf = pt.switch(
+            pt.lt(logS, np.log(0.5)),
+            pt.log1p(-pt.exp(logS)),
+            pt.log(-pt.expm1(logS)),
         )
 
         return check_parameters(
@@ -561,7 +577,11 @@ def support_point(rv, size, r, alpha, time_covariate_vector):
         )
 
         # Apply time covariates if provided
-        mean = mean * pt.exp(time_covariate_vector.sum(axis=0))
+        tcv = pt.as_tensor_variable(time_covariate_vector)
+        if tcv.ndim != 0:
+            # If 1D, use as the per-time vector; otherwise sum over the leading (feature) axis
+            cov_time = tcv if tcv.ndim == 1 else tcv.sum(axis=0)
+            mean = mean * pt.exp(cov_time)
 
         # Round up to nearest integer and ensure >= 1
         mean = pt.maximum(pt.ceil(mean), 1.0)
@@ -575,14 +595,31 @@ def support_point(rv, size, r, alpha, time_covariate_vector):
 
 def C_t(t: pt.TensorVariable, time_covariate_vector: pt.TensorVariable) -> pt.TensorVariable:
     """Utility for processing time-varying covariates in GrassiaIIGeometric distribution."""
+    # A 0-d covariate is treated as "unspecified": no per-period weighting, so C(t) = t
     if time_covariate_vector.ndim == 0:
-        # Reshape time_covariate_vector to length t
-        return pt.full((t,), time_covariate_vector)
+        return t
+
+    # Per-period weight: exp(covariate), summed over all leading axes; last axis is time
+    if time_covariate_vector.ndim == 1:
+        per_time_sum = pt.exp(time_covariate_vector)
     else:
-        # Ensure t is a valid index
-        t_idx = pt.maximum(0, t - 1)  # Convert to 0-based indexing
-        # If t_idx exceeds length of time_covariate_vector, use last value
-        max_idx = pt.shape(time_covariate_vector)[0] - 1
-        safe_idx = pt.minimum(t_idx, max_idx)
-        covariate_value = time_covariate_vector[..., safe_idx]
-        return pt.exp(covariate_value).sum()
+        feature_axes = tuple(range(time_covariate_vector.ndim - 1))
+        per_time_sum = pt.sum(pt.exp(time_covariate_vector), axis=feature_axes)
+
+    # Build cumulative sum up to each t without advanced indexing
+    time_length = pt.shape(per_time_sum)[0]
+    # Work on an int64 copy of t that is at least 1D; t_int keeps the caller's rank
+    t_int = pt.cast(t, "int64")
+    t_vec = pt.shape_padleft(t_int) if t_int.ndim == 0 else t_int
+    # Create time indices [0, 1, ..., T-1]
+    time_idx = pt.arange(time_length, dtype="int64")
+    # Mask selects the first t periods: time index < t, so t <= 0 selects nothing
+    mask = pt.lt(time_idx, pt.shape_padright(t_vec, 1))
+    # Sum per-time contributions over time axis
+    base_sum = pt.sum(pt.shape_padleft(per_time_sum) * mask, axis=-1)
+    # Beyond the last observed period, each extra step adds the last per-time value
+    last_value = per_time_sum[-1]
+    excess_steps = pt.maximum(t_vec - time_length, 0)
+    carried = base_sum + excess_steps * last_value
+    # Undo only the padding added for scalar t (squeeze would drop real length-1 axes too)
+    return carried[0] if t_int.ndim == 0 else carried
diff --git a/tests/distributions/test_discrete.py b/tests/distributions/test_discrete.py
@@ -24,7 +24,7 @@
     BaseTestDistributionRandom,
     Domain,
     I,
-    NatBig,
+    PosNat,
     Rplus,
     assert_support_point_is_expected,
     check_logp,
@@ -314,7 +314,7 @@ def test_logp(self):
     def test_logcdf(self):
         # test logcdf matches log sums across parameter values
         check_selfconsistency_discrete_logcdf(
-            GrassiaIIGeometric, NatBig, {"r": Rplus, "alpha": Rplus, "time_covariate_vector": I}
+            GrassiaIIGeometric, PosNat, {"r": Rplus, "alpha": Rplus, "time_covariate_vector": I}
         )
 
     @pytest.mark.parametrize(
@@ -349,3 +349,59 @@ def test_support_point(self, r, alpha, time_covariate_vector, size, expected_sha
 
         # TODO: expected values must be provided
         # assert_support_point_is_expected(model, init_point)
+
+    def test_C_t_unspecified_returns_t(self):
+        # A 0-d covariate stands for "no covariates"; C_t must hand t back unchanged
+        from pymc_extras.distributions.discrete import C_t
+
+        periods = pt.vector("t", dtype="int64")
+        no_covariates = pt.as_tensor_variable(0.0)
+        compiled = pytensor.function([periods], C_t(periods, no_covariates))
+        sample_periods = np.array([0, 1, 2, 3, 10], dtype="int64")
+        np.testing.assert_array_equal(compiled(sample_periods), sample_periods)
+
+    def test_C_t_1d_vector_sum_up_to_t_with_saturation(self):
+        # For a 1D time_covariate_vector, C_t sums exp(cov) over the first t periods; past the
+        # end of the vector each extra step adds the last period's exp(cov) again
+        from pymc_extras.distributions.discrete import C_t
+
+        t = pt.vector("t", dtype="int64")
+        cov = pt.as_tensor_variable(np.array([0.0, 1.0, -1.0], dtype=float))  # length 3
+        fn = pytensor.function([t], C_t(t, cov))
+        test_t = np.array([0, 1, 2, 3, 4], dtype="int64")
+        per_time = np.exp(np.array([0.0, 1.0, -1.0]))
+        csum = np.cumsum(per_time)
+        expected = []
+        for tt in test_t:
+            if tt <= 0:
+                expected.append(0.0)
+            elif tt >= len(per_time):
+                expected.append(csum[-1] + (tt - len(per_time)) * per_time[-1])
+            else:
+                expected.append(csum[tt - 1])
+        expected = np.array(expected)
+        np.testing.assert_allclose(fn(test_t), expected)
+
+    def test_C_t_2d_features_by_time_sum_up_to_t_with_saturation(self):
+        # For a 2D (features x time) covariate, sum exp over features, then accumulate over
+        # time; past the end each extra step adds the last period's summed value again
+        from pymc_extras.distributions.discrete import C_t
+
+        t = pt.vector("t", dtype="int64")
+        cov = pt.as_tensor_variable(
+            np.array([[0.5, 1.0, 1.5], [0.0, 0.0, 0.0]], dtype=float)
+        )  # 2x3
+        fn = pytensor.function([t], C_t(t, cov))
+        test_t = np.array([0, 1, 2, 3, 4], dtype="int64")
+        per_time = np.sum(np.exp(np.array([[0.5, 1.0, 1.5], [0.0, 0.0, 0.0]])), axis=0)
+        csum = np.cumsum(per_time)
+        expected = []
+        for tt in test_t:
+            if tt <= 0:
+                expected.append(0.0)
+            elif tt >= len(per_time):
+                expected.append(csum[-1] + (tt - len(per_time)) * per_time[-1])
+            else:
+                expected.append(csum[tt - 1])
+        expected = np.array(expected)
+        np.testing.assert_allclose(fn(test_t), expected)