@@ -54,12 +54,11 @@ def get_domain_of_finite_discrete_rv(rv: TensorVariable) -> tuple[int, ...]:
def _reduce_batch_dependent_logps(
-    marginalized_op: MarginalRV,
-    marginalized_logp: TensorVariable,
+    dependent_dims_connections: Sequence[tuple[int | None, ...]],
    dependent_ops: Sequence[Op],
    dependent_logps: Sequence[TensorVariable],
) -> TensorVariable:
-    """Combine the logps of dependent RVs with the marginalized logp.
+    """Combine the logps of dependent RVs and align them with the marginalized logp.

    This requires reducing extra batch dims and transposing when they are not aligned.

@@ -68,13 +67,14 @@ def _reduce_batch_dependent_logps(
        pm.Normal("dep2", mu=idx * 2, shape=(7, 2, 3))

        marginalize(idx)
-        dims_connections = [(1, 0, None), (None, 0, 1)]
-    """

-    dims_connections = marginalized_op.dims_connections
+        The marginalized op will have dims_connections = [(1, 0, None), (None, 0, 1)]
+        which tells us we need to reduce the last axis of dep1 logp and the first of dep2,
+        as well as transpose the remaining axis of dep1 logp before adding the two elemwise.
+    """

-    reduced_logps = [marginalized_logp]
-    for dependent_op, dependent_logp, dims_connection in zip(dependent_ops, dependent_logps, dims_connections):
+    reduced_logps = []
+    for dependent_op, dependent_logp, dependent_dims_connection in zip(dependent_ops, dependent_logps, dependent_dims_connections):
        if dependent_logp.type.ndim > 0:
            # Find which support axis implied by the MarginalRV need to be reduced
            # Some may have already been reduced by the logp expression of the dependent RV, for non-univariate RVs
@@ -88,27 +88,27 @@ def _reduce_batch_dependent_logps(
            # Dependent RV support axes are already collapsed in the logp, so we ignore them
            supp_axes = [
                -i
-                for i, dim in enumerate(reversed(dims_connection), start=1)
+                for i, dim in enumerate(reversed(dependent_dims_connection), start=1)
                if (dim == () and -i not in dep_supp_axes)
            ]

            dependent_logp = dependent_logp.sum(supp_axes)
-            assert dependent_logp.type.ndim == marginalized_logp.type.ndim

            # Finally, we need to align the dependent logp batch dimensions with the marginalized logp
-            dims_alignment = [dim[0] for dim in dims_connection if dim != ()]
+            dims_alignment = [dim[0] for dim in dependent_dims_connection if dim != ()]
            dependent_logp = dependent_logp.transpose(*dims_alignment)

        reduced_logps.append(dependent_logp)

    reduced_logp = pt.add(*reduced_logps)
+    return reduced_logp

-    if reduced_logp.type.ndim > 0:
+def _align_logp_with_dims(dims: tuple[tuple[int, None]], logp: TensorVariable) -> TensorVariable:
+    if logp.type.ndim > 0:
        # Transpose reduced logp into the direction of the first dependent RV
-        first_dims_alignment = [dim[0] for dim in dims_connections[0] if dim != ()]
-        reduced_logp = reduced_logp.transpose(*first_dims_alignment)
-
-    return reduced_logp
+        dims_alignment = [dim[0] for dim in dims if dim != ()]
+        logp = logp.transpose(*dims_alignment)
+    return logp


dummy_zero = pt.constant(0, name="dummy_zero")
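For intuition, here is a small standalone NumPy sketch (not part of the diff) of the pattern that _reduce_batch_dependent_logps and _align_logp_with_dims implement: each dependent logp first sums over the batch axes that are not connected to the marginalized RV, is then transposed so the surviving axes follow a common order, and only then are the terms added elemwise. The shapes and hard-coded axes below are illustrative assumptions, not values read from dims_connections.

import numpy as np

rng = np.random.default_rng(0)

marg_shape = (3, 2)                      # batch shape of the marginalized RV
dep1_logp = rng.normal(size=(2, 3, 5))   # axes: (marg dim 1, marg dim 0, extra)
dep2_logp = rng.normal(size=(7, 3, 2))   # axes: (extra, marg dim 0, marg dim 1)

# 1) Reduce the batch axes that are not connected to the marginalized RV
dep1_reduced = dep1_logp.sum(axis=-1)    # drop the trailing "extra" axis -> (2, 3)
dep2_reduced = dep2_logp.sum(axis=0)     # drop the leading "extra" axis  -> (3, 2)

# 2) Transpose so the surviving axes follow the marginalized RV's axis order
dep1_aligned = dep1_reduced.transpose(1, 0)   # (marg dim 0, marg dim 1) -> (3, 2)

# 3) Only now can the per-RV terms be added elemwise
joint = dep1_aligned + dep2_reduced
assert joint.shape == marg_shape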
@@ -131,9 +131,8 @@ def finite_discrete_marginal_rv_logp(op: MarginalFiniteDiscreteRV, values, *inpu
    # Reduce logp dimensions corresponding to broadcasted variables
    marginalized_logp = logps_dict.pop(marginalized_vv)
-    joint_logp = _reduce_batch_dependent_logps(
-        marginalized_op=op,
-        marginalized_logp=marginalized_logp,
+    joint_logp = marginalized_logp + _reduce_batch_dependent_logps(
+        dependent_dims_connections=op.dims_connections,
        dependent_ops=[inner_rv.owner.op for inner_rv in inner_rvs],
        dependent_logps=[logps_dict[value] for value in values],
    )
@@ -174,8 +173,12 @@ def logp_fn(marginalized_rv_const, *non_sequences):
    joint_logp = pt.logsumexp(joint_logps, axis=0)

+    # Align logp with non-collapsed batch dimensions of first RV
+    joint_logp = _align_logp_with_dims(dims=op.dims_connections[0], logp=joint_logp)
+
    # We have to add dummy logps for the remaining value variables, otherwise PyMC will raise
-    return joint_logp, *((dummy_zero,) * (len(values) - 1))
+    dummy_logps = ((dummy_zero,) * (len(values) - 1))
+    return joint_logp, *dummy_logps


@_logprob.register(MarginalDiscreteMarkovChainRV)
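Both logp implementations return placeholder zeros for every value variable beyond the first, since PyMC expects one logp term per value variable. The trailing comma matters: (dummy_zero,) * (len(values) - 1) builds a tuple of repeated placeholders, whereas (dummy_zero) * (len(values) - 1), the form fixed in the last hunk below, just multiplies the scalar. A quick illustration with plain Python stand-ins (dummy_zero and n_values here are invented for the example, not taken from the module):

dummy_zero = 0.0   # stand-in for pt.constant(0, name="dummy_zero")
n_values = 3       # pretend the op has three value variables

print((dummy_zero,) * (n_values - 1))  # (0.0, 0.0): one placeholder per extra value variable
print((dummy_zero) * (n_values - 1))   # 0.0: plain scalar multiplication, not a tuple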
@@ -200,8 +203,10 @@ def marginal_hmm_logp(op, values, *inputs, **kwargs):
    logp_emissions_dict = conditional_logp(dict(zip(dependent_rvs, values)))

    # Reduce and add the batch dims beyond the chain dimension
-    reduced_logp_emissions = _add_reduce_batch_dependent_logps(
-        chain_rv.type, logp_emissions_dict.values()
+    reduced_logp_emissions = _reduce_batch_dependent_logps(
+        dependent_dims_connections=op.dims_connections,
+        dependent_ops=[dependent_rv.owner.op for dependent_rv in dependent_rvs],
+        dependent_logps=[logp_emissions_dict[value] for value in values],
    )

    # Add a batch dimension for the domain of the chain
@@ -240,9 +245,13 @@ def step_alpha(logp_emission, log_alpha, log_P):
    # Final logp is just the sum of the last scan state
    joint_logp = pt.logsumexp(log_alpha_seq[-1], axis=0)

-    # TODO: Transpose into shape of first emission
+    # Align logp with non-collapsed batch dimensions of first RV
+    remaining_dims_first_emission = list(op.dims_connections[0])
+    # The last dim of chain_rv was removed when computing the logp
+    remaining_dims_first_emission.remove((chain_rv.type.ndim - 1,))
+    joint_logp = _align_logp_with_dims(remaining_dims_first_emission, joint_logp)

    # If there are multiple emission streams, we have to add dummy logps for the remaining value variables. The first
    # return is the joint probability of everything together, but PyMC still expects one logp for each emission stream.
-    dummy_logps = (dummy_zero) * (len(values) - 1)
+    dummy_logps = (dummy_zero,) * (len(values) - 1)
    return joint_logp, *dummy_logps
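To make the bookkeeping in the alignment step above concrete, here is a tiny standalone illustration; the chain_rv_ndim value and the dims_connections entry are invented for the example, not taken from a real model. The connection to the chain's last dimension is dropped because that dimension has already been marginalized out by the forward recursion, and the surviving entries give the axis order for the final transpose.

chain_rv_ndim = 3                                          # hypothetical: two batch dims plus the chain (time) dim
first_emission_dims = [(1,), (0,), (chain_rv_ndim - 1,)]   # hypothetical dims_connections[0] entry

remaining = list(first_emission_dims)
remaining.remove((chain_rv_ndim - 1,))  # the chain dim no longer exists in the reduced logp
print(remaining)                        # [(1,), (0,)] -> transpose the joint logp with axes (1, 0)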