
Commit 6e6561e

Author: Alexander Ororbia (committed)

cleaned up mixtures and finished debugging EMM/works on example

1 parent 519896e commit 6e6561e

File tree

3 files changed: +157 additions, -160 deletions

ngclearn/utils/density/bernoulliMixture.py

Lines changed: 44 additions & 36 deletions
@@ -18,13 +18,13 @@ def _log_bernoulli_pdf(X, p):
     Args:
         X: a design matrix (dataset) to compute the log likelihood of
 
-        mu: a parameter mean vector
+        p: a parameter mean vector (positive case probability)
 
     Returns:
         the log likelihood (scalar) of this design matrix X
     """
     #D = X.shape[1] * 1. ## get dimensionality
-    ## x log(mu_k) + (1-x) log(1 - mu_k)
+    ## general format: x log(mu_k) + (1-x) log(1 - mu_k)
     vec_ll = X * jnp.log(p) + (1. - X) * jnp.log(1. - p) ## binary cross-entropy (log Bernoulli)
     log_ll = jnp.sum(vec_ll, axis=1, keepdims=True) ## get per-datapoint LL
     return log_ll
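
For reference, the per-datapoint log-likelihood above is a summed log-Bernoulli (binary cross-entropy) term. A quick toy check of the same computation (the values below are hypothetical, not from the repository):

from jax import numpy as jnp

X = jnp.asarray([[1., 0., 1.],
                 [0., 0., 1.]])       ## two binary data points, three dimensions
p = jnp.asarray([[0.9, 0.2, 0.7]])    ## one component's mean vector
vec_ll = X * jnp.log(p) + (1. - X) * jnp.log(1. - p)   ## elementwise log Bernoulli
log_ll = jnp.sum(vec_ll, axis=1, keepdims=True)        ## shape (2, 1): one log p(x_n) per row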
@@ -35,20 +35,27 @@ def _calc_bernoulli_pdf_vals(X, p):
     ll = jnp.exp(log_ll) ## likelihood
     return log_ll, ll
 
+@jit
+def _calc_bernoulli_mixture_stats(raw_likeli, pi):
+    likeli = raw_likeli * pi
+    gamma = likeli / jnp.sum(likeli, axis=1, keepdims=True) ## responsibilities
+    likeli = jnp.sum(likeli, axis=1, keepdims=True) ## Sum_j[ pi_j * pdf_gauss(x_n; mu_j, Sigma_j) ]
+    log_likeli = jnp.log(likeli) ## vector of individual log p(x_n) values
+    complete_log_likeli = jnp.sum(log_likeli) ## complete log-likelihood for design matrix X, i.e., log p(X)
+    return log_likeli, complete_log_likeli, gamma
+
 @jit
 def _calc_priors_and_means(X, weights, pi): ## M-step co-routine
     ## calc new means, responsibilities, and priors given current stats
     N = X.shape[0] ## get number of samples
     ## calc responsibilities
-    r = (pi * weights)
-    r = r / jnp.sum(r, axis=1, keepdims=True) ## responsibilities
-    _pi = jnp.sum(r, axis=0, keepdims=True) / N ## calc new priors
+    _pi = jnp.sum(weights, axis=0, keepdims=True) / N ## calc new priors
     ## calc weighted means (weighted by responsibilities)
-    Z = jnp.sum(r, axis=0, keepdims=True) ## partition function
+    Z = jnp.sum(weights, axis=0, keepdims=True) ## partition function
     M = (Z > 0.) * 1.
     Z = Z * M + (1. + M) ## removes div-by-0 cases
-    means = jnp.matmul(r.T, X) / Z.T
-    return means, _pi, r
+    means = jnp.matmul(weights.T, X) / Z.T
+    return _pi, means
 
 @partial(jit, static_argnums=[1])
 def _sample_prior_weights(dkey, n_samples, pi): ## samples prior weighting parameters (of mixture)
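
The two helpers above split one EM round: _calc_bernoulli_mixture_stats turns per-component likelihoods into responsibilities and log p(X) (E-step statistics), while _calc_priors_and_means turns responsibilities into new priors and means (M-step). A minimal sketch of the same arithmetic on hypothetical inputs (the repository routine additionally guards against zero-sum responsibility columns before dividing):

from jax import numpy as jnp

likeli = jnp.asarray([[0.30, 0.10],
                      [0.05, 0.20]])   ## p(x_n | mu_j), N=2 rows, K=2 components
pi = jnp.asarray([[0.5, 0.5]])         ## mixture priors
X = jnp.asarray([[1., 0.],
                 [0., 1.]])            ## toy binary design matrix

joint = likeli * pi                                          ## pi_j * p(x_n | mu_j)
gamma = joint / jnp.sum(joint, axis=1, keepdims=True)        ## responsibilities; rows sum to 1
log_px = jnp.log(jnp.sum(joint, axis=1, keepdims=True))      ## per-datapoint log p(x_n)
new_pi = jnp.sum(gamma, axis=0, keepdims=True) / X.shape[0]  ## M-step prior update
new_means = jnp.matmul(gamma.T, X) / jnp.sum(gamma, axis=0, keepdims=True).T  ## M-step mean update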
@@ -58,7 +65,7 @@ def _sample_prior_weights(dkey, n_samples, pi): ## samples prior weighting param
 
 @partial(jit, static_argnums=[1])
 def _sample_component(dkey, n_samples, mu): ## samples a component (of mixture)
-    eps = random.bernoulli(dkey, p=mu, shape=(n_samples, mu.shape[1])) ## draw Bernoulli samples
+    x_s = random.bernoulli(dkey, p=mu, shape=(n_samples, mu.shape[1])) ## draw Bernoulli samples
     return x_s
 
 ########################################################################################################################
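
The component sampler simply broadcasts a (1 x D) mean vector over the requested number of Bernoulli draws. A minimal sketch (key and values are hypothetical):

from jax import numpy as jnp, random

key = random.PRNGKey(0)
mu = jnp.asarray([[0.9, 0.1, 0.5]])
x_s = random.bernoulli(key, p=mu, shape=(4, mu.shape[1]))  ## (4, 3) boolean samples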
@@ -119,31 +126,32 @@ def calc_log_likelihood(self, X):
         Returns:
             (column) vector of individual log likelihoods, scalar for the complete log likelihood p(X)
         """
-        ll = 0.
+        likeli = []
         for j in range(self.K):
-            log_ll_j, ll_j = _calc_bernoulli_pdf_vals(X, self.mu[j])
-            ll = ll_j + ll
-        log_ll = jnp.log(ll) ## vector of individual log p(x_n) values
-        complete_ll = jnp.sum(log_ll) ## complete log-likelihood for design matrix X, i.e., log p(X)
-        return log_ll, complete_ll
+            _, likeli_j = _calc_bernoulli_pdf_vals(X, self.mu[j])
+            likeli.append(likeli_j)
+        likeli = jnp.concat(likeli, axis=1)
+        log_likeli_vec, complete_log_likeli, gamma = _calc_bernoulli_mixture_stats(likeli, self.pi)
+        return log_likeli_vec, complete_log_likeli
 
     def _E_step(self, X): ## Expectation (E) step, co-routine
-        weights = []
+        likeli = []
         for j in range(self.K):
-            log_ll_j, ll_j = _calc_bernoulli_pdf_vals(X, self.mu[j])
-            weights.append( ll_j )
-        weights = jnp.concat(weights, axis=1)
-        return weights ## data-dependent weights (intermediate responsibilities)
+            _, likeli_j = _calc_bernoulli_pdf_vals(X, self.mu[j])
+            likeli.append(likeli_j)
+        likeli = jnp.concat(likeli, axis=1)
+        log_likeli_vec, complete_log_likeli, gamma = _calc_bernoulli_mixture_stats(likeli, self.pi)
+        ## gamma => data-dependent weights (responsibilities)
+        return gamma, log_likeli_vec, complete_log_likeli
 
     def _M_step(self, X, weights): ## Maximization (M) step, co-routine
-        means, pi, r = _calc_priors_and_means(X, weights, self.pi)
-        self.pi = pi ## store new prior parameters
-        # calc weighted covariances
+        pi, means = _calc_priors_and_means(X, weights, self.pi)
+        self.pi = pi ## store new prior parameters
         for j in range(self.K):
-            #r_j = r[:, j:j + 1]
+            #r_j = weights[:, j:j + 1] ## get j-th responsibility slice
             mu_j = means[j:j + 1, :]
-            self.mu[j] = mu_j ## store new mean(j) parameter
-        return means, r
+            self.mu[j] = mu_j ## store new mean(j) parameter
+        return pi, means
 
     def fit(self, X, tol=1e-3, verbose=False):
         """
@@ -159,11 +167,11 @@ def fit(self, X, tol=1e-3, verbose=False):
         """
         means_prev = jnp.concat(self.mu, axis=0)
         for i in range(self.max_iter):
-            self.update(X) ## carry out one E-step followed by an M-step
-            means = jnp.concat(self.mu, axis=0)
+            gamma, pi, means, complete_loglikeli = self.update(X) ## carry out one E-step followed by an M-step
+            #means = jnp.concat(self.mu, axis=0)
             dom = jnp.linalg.norm(means - means_prev) ## norm of difference-of-means
             if verbose:
-                print(f"{i}: Mean-diff = {dom}")
+                print(f"{i}: Mean-diff = {dom} log(p(X)) = {complete_loglikeli} nats")
             #print(jnp.linalg.norm(means - means_prev))
             if tol >= 0. and dom < tol:
                 print(f"Converged after {i + 1} iterations.")
@@ -177,8 +185,9 @@ def update(self, X):
         Args:
             X: the dataset / design matrix to fit this BMM to
         """
-        r_w = self._E_step(X) ## carry out E-step
-        means, respon = self._M_step(X, r_w) ## carry out M-step
+        gamma, _, complete_likeli = self._E_step(X) ## carry out E-step
+        pi, means = self._M_step(X, gamma) ## carry out M-step
+        return gamma, pi, means, complete_likeli
 
     def sample(self, n_samples, mode_j=-1):
         """
@@ -193,15 +202,14 @@ def sample(self, n_samples, mode_j=-1):
         Returns:
             Design matrix of samples drawn under the distribution defined by this BMM
         """
-        ## sample prior
         self.key, *skey = random.split(self.key, 3)
-        if mode_j >= 0: ## sample from a particular mode / component
-            mu_j = self.mu[mode_j]
+        if mode_j >= 0: ## sample from a particular mode
+            mu_j = self.mu[mode_j] ## directly select a specific component
             Xs = _sample_component(skey[0], n_samples=n_samples, mu=mu_j)
         else: ## sample from full mixture distribution
-            ## sample components/latents
+            ## sample (prior) components/latents
             lats = _sample_prior_weights(skey[0], n_samples=n_samples, pi=self.pi)
-            ## then sample chosen component Bernoulli
+            ## then sample chosen component Bernoulli(s)
             Xs = []
             for j in range(self.K):
                 freq_j = int(jnp.sum((lats == j))) ## compute frequency over mode
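
After this commit, fit(X) reports log p(X) alongside the mean-difference criterion and update(X) returns (gamma, pi, means, complete log-likelihood). A usage sketch under stated assumptions -- the class name, constructor arguments, and the init(X) call below are assumptions (they are not shown in this diff), while fit/update/calc_log_likelihood/sample follow the signatures above:

from jax import numpy as jnp, random

from ngclearn.utils.density.bernoulliMixture import BernoulliMixture  ## class name assumed from the filename

key = random.PRNGKey(42)
X = (random.uniform(key, (500, 16)) < 0.25) * 1.   ## hypothetical binary design matrix

bmm = BernoulliMixture(K=3, max_iter=100)          ## constructor arguments are an assumption
bmm.init(X)                                        ## assumed initializer (mirrors the EMM init below)
bmm.fit(X, tol=1e-3, verbose=True)                 ## prints Mean-diff and log(p(X)) per iteration
gamma, pi, means, logpX = bmm.update(X)            ## new four-value return signature
log_ll_vec, logpX = bmm.calc_log_likelihood(X)
Xs = bmm.sample(n_samples=64)                      ## draw from the full mixture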

ngclearn/utils/density/exponentialMixture.py

Lines changed: 71 additions & 85 deletions
@@ -1,65 +1,49 @@
 from jax import numpy as jnp, random, jit, scipy
 from functools import partial
 import time, sys
-import numpy as np
 
 from ngclearn.utils.density.mixture import Mixture
 
 ########################################################################################################################
 ## internal routines for mixture model
 ########################################################################################################################
-
 @jit
-def _log_exponential_pdf(X, rate):
+def _log_exponential_pdf(X, lmbda):
     """
-    Calculates the multivariate exponential log likelihood of a design matrix/dataset `X`, under a given parameter
+    Calculates the multivariate exponential log likelihood of a design matrix/dataset `X`, under a given parameter
     probability `p`.
 
     Args:
         X: a design matrix (dataset) to compute the log likelihood of
 
-        rate: a parameter rate vector
+        lmbda: a parameter rate vector
 
     Returns:
        the log likelihood (scalar) of this design matrix X
     """
-    #D = X.shape[1] * 1. ## get dimensionality
-    ## pdf(x; r) = r * np.exp(-r * x), where r is "rate"
-    ## log (r exp(-r x) ) = log(r) + log(exp(-r x) = log(r) - r x
-    vec_ll = -(X * rate) + jnp.log(rate) ## log exponential
-    log_ll = jnp.sum(vec_ll, axis=1, keepdims=True) ## get per-datapoint LL
-    return log_ll
+    log_pdf = -jnp.matmul(X, lmbda.T) + jnp.sum(jnp.log(lmbda.T), axis=0)
+    return log_pdf
 
 @jit
-def _calc_exponential_pdf_vals(X, p):
-    log_ll = _log_exponential_pdf(X, p) ## get log-likelihood
-    ll = jnp.exp(log_ll) ## likelihood
-    return log_ll, ll
-
-#@jit
-def _calc_priors_and_rates(X, weights, pi): ## M-step co-routine
-    ## calc new rates, responsibilities, and priors given current stats
-    N = X.shape[0] ## get number of samples
-    ## calc responsibilities
-    r = (pi * weights)
-    r = r / jnp.sum(r, axis=1, keepdims=True) ## responsibilities
-    _pi = jnp.sum(r, axis=0, keepdims=True) / N ## calc new priors
-    ## calc weighted rates (weighted by responsibilities)
-
-    Znum = jnp.sum(r, axis=0, keepdims=True)
-    #print(Znum.shape)
-    Zden = jnp.matmul(r.T, X)
-    rates = Znum.T/Zden
-    #print(Zden.shape)
-    #exit()
-    """
-    Z = jnp.sum(r, axis=0, keepdims=True) ## calc partition function
-    Ndata = jnp.matmul(r.T, X)
-    M = (Ndata > 0.) * 1.
-    Ndata = Ndata * M + (1. - M) ## we mask out division-by-0 cases
-    rates = Z.T / Ndata
-    """
-    return rates, _pi, r
+def _calc_exponential_mixture_stats(X, lmbda, pi):
+    log_exp_pdf = _log_exponential_pdf(X, lmbda)
+    log_likeli = log_exp_pdf + jnp.log(pi) ## raw log-likelihood
+    likeli = jnp.exp(log_likeli) ## raw likelihood
+    gamma = likeli / jnp.sum(likeli, axis=1, keepdims=True) ## responsibilities
+    weighted_log_likeli = jnp.sum(log_likeli * gamma, axis=1, keepdims=True) ## get weighted EMM log-likelihood
+    complete_loglikeli = jnp.sum(weighted_log_likeli) ## complete log-likelihood for design matrix X, i.e., log p(X)
+    return log_likeli, likeli, gamma, weighted_log_likeli, complete_loglikeli
+
+@jit
+def _calc_priors_and_rates(X, weights, pi): ## M-step co-routine
+    ## compute updates to pi params
+    Zk = jnp.sum(weights, axis=0, keepdims=True) ## summed weights/responsibilities; 1 x K
+    Z = jnp.sum(Zk) ## partition function
+    pi = Zk / Z
+    ## compute updates to lmbda params
+    Z = jnp.matmul(weights.T, X)
+    lmbda = Zk.T / Z
+    return pi, lmbda
 
 @partial(jit, static_argnums=[1])
 def _sample_prior_weights(dkey, n_samples, pi): ## samples prior weighting parameters (of mixture)
@@ -70,18 +54,17 @@ def _sample_prior_weights(dkey, n_samples, pi): ## samples prior weighting param
 @partial(jit, static_argnums=[1])
 def _sample_component(dkey, n_samples, rate): ## samples a component (of mixture)
     ## sampling ~[exp(rx)] is same as r * [~exp(x)]
-    eps = jax.random.exponential(dkey, shape=(n_samples, mu.shape[1])) * rate ## draw exponential samples
+    x_s = random.exponential(dkey, shape=(n_samples, rate.shape[1])) * rate ## draw exponential samples
     return x_s
 
 ########################################################################################################################
 
 class ExponentialMixture(Mixture): ## Exponential mixture model (mixture-of-exponentials)
     """
-    Implements a exponential mixture model (EMM) -- or mixture of exponentials (MoExp).
-    Adaptation of parameters is conducted via the Expectation-Maximization (EM)
-    learning algorithm. Note that this exponential mixture assumes that each component
-    is a factorizable mutlivariate exponential distribution. (A Categorical distribution
-    is assumed over the latent variables).
+    Implements a exponential mixture model (EMM) -- or mixture of exponentials (MoExp). Adaptation of parameters is
+    conducted via the Expectation-Maximization (EM) learning algorithm. Note that this exponential mixture assumes that
+    each component is a factorizable mutlivariate exponential distribution. (A Categorical distribution is assumed over
+    the latent variables).
 
     Args:
         K: the number of components/latent variables within this EMM
@@ -110,15 +93,20 @@ def init(self, X):
 
         """
         dim = X.shape[1]
-        self.key, *skey = random.split(self.key, 3)
-        self.pi = jnp.ones((1, self.K)) / (self.K * 1.)
-        ptrs = random.permutation(skey[0], X.shape[0])
+        self.key, *skey = random.split(self.key, 4)
+        ## Computed jittered initial phi param values
+        #self.pi = jnp.ones((1, self.K)) / (self.K * 1.)
+        pi = jnp.ones((1, self.K))
+        eps = random.uniform(skey[0], minval=0.99, maxval=1.01, shape=(1, self.K))
+        pi = pi * eps
+        self.pi = pi / jnp.sum(pi)
+
+        ## Computed jittered initial rate (lmbda) param values
+        lmbda_h = 1.0/jnp.mean(X, axis=0, keepdims=True)
+        lmbda = random.uniform(skey[1], minval=0.99, maxval=1.01, shape=(self.K, dim)) * lmbda_h
         self.rate = []
-        for j in range(self.K):
-            ptr = ptrs[j]
-            self.key, *skey = random.split(self.key, 3)
-            eps = random.uniform(skey[0], minval=0.99, maxval=1.01, shape=(1, dim)) ## jitter initial rate params
-            self.rate.append(eps)
+        for j in range(self.K): ## set rates/lmbdas
+            self.rate.append(lmbda[j:j+1, :])
 
     def calc_log_likelihood(self, X):
         """
@@ -131,31 +119,26 @@ def calc_log_likelihood(self, X):
         Returns:
             (column) vector of individual log likelihoods, scalar for the complete log likelihood p(X)
         """
-        ll = 0.
-        for j in range(self.K):
-            log_ll_j, ll_j = _calc_exponential_pdf_vals(X, self.rate[j])
-            ll = ll_j + ll
-        log_ll = jnp.log(ll) ## vector of individual log p(x_n) values
-        complete_ll = jnp.sum(log_ll) ## complete log-likelihood for design matrix X, i.e., log p(X)
-        return log_ll, complete_ll
+        pi = self.pi ## get prior weight values
+        lmbda = jnp.concat(self.rate, axis=0) ## get rates as a block matrix
+        ## compute relevant log-likelihoods/likelihoods
+        log_ll, ll, gamma, weighted_loglikeli, complete_likeli = _calc_exponential_mixture_stats(X, lmbda, pi)
+        return weighted_loglikeli, complete_likeli
 
     def _E_step(self, X): ## Expectation (E) step, co-routine
-        weights = []
-        for j in range(self.K):
-            log_ll_j, ll_j = _calc_exponential_pdf_vals(X, self.rate[j])
-            weights.append( ll_j )
-        weights = jnp.concat(weights, axis=1)
-        return weights ## data-dependent weights (intermediate responsibilities)
+        pi = self.pi ## get prior weight values
+        lmbda = jnp.concat(self.rate, axis=0) ## get rates as a block matrix
+        _, _, gamma, weighted_loglikeli, complete_likeli = _calc_exponential_mixture_stats(X, lmbda, pi)
+        ## Note: responsibility weights gamma have shape => N x K
+        return gamma, weighted_loglikeli, complete_likeli
 
     def _M_step(self, X, weights): ## Maximization (M) step, co-routine
-        rates, pi, r = _calc_priors_and_rates(X, weights, self.pi)
-        self.pi = pi ## store new prior parameters
-        # calc weighted covariances
-        for j in range(self.K):
-            #r_j = r[:, j:j + 1]
-            rate_j = rates[j:j + 1, :]
-            self.rate[j] = rate_j ## store new rate(j) parameter
-        return rates, r
+        ## compute updates to pi and lmbda params
+        pi, lmbda = _calc_priors_and_rates(X, weights, self.pi)
+        self.pi = pi ## store new prior parameters
+        for j in range(self.K): ## store new rate/lmbda parameters
+            self.rate[j] = lmbda[j:j+1, :]
+        return pi, lmbda
 
     def fit(self, X, tol=1e-3, verbose=False):
         """
@@ -171,11 +154,11 @@ def fit(self, X, tol=1e-3, verbose=False):
         """
         rates_prev = jnp.concat(self.rate, axis=0)
        for i in range(self.max_iter):
-            self.update(X) ## carry out one E-step followed by an M-step
-            rates = jnp.concat(self.rate, axis=0)
+            gamma, pi, rates, complete_loglikeli = self.update(X) ## carry out one E-step followed by an M-step
+            #rates = jnp.concat(self.rate, axis=0)
             dor = jnp.linalg.norm(rates - rates_prev) ## norm of difference-of-rates
             if verbose:
-                print(f"{i}: Rate-diff = {dor}")
+                print(f"{i}: Rate-diff = {dor} log(p(X)) = {complete_loglikeli} nats")
             #print(jnp.linalg.norm(rates - rates_prev))
             if tol >= 0. and dor < tol:
                 print(f"Converged after {i + 1} iterations.")
@@ -188,9 +171,13 @@ def update(self, X):
 
         Args:
             X: the dataset / design matrix to fit this BMM to
+
+        Returns:
+            responsibilities (gamma), priors (pi), rates (lambda), EMM log-likelihood
         """
-        r_w = self._E_step(X) ## carry out E-step
-        rates, respon = self._M_step(X, r_w) ## carry out M-step
+        gamma, _, complete_log_likeli = self._E_step(X) ## carry out E-step
+        pi, rates = self._M_step(X, gamma) ## carry out M-step
+        return gamma, pi, rates, complete_log_likeli
 
     def sample(self, n_samples, mode_j=-1):
         """
@@ -205,15 +192,14 @@ def sample(self, n_samples, mode_j=-1):
         Returns:
             Design matrix of samples drawn under the distribution defined by this EMM
         """
-        ## sample prior
         self.key, *skey = random.split(self.key, 3)
-        if mode_j >= 0: ## sample from a particular mode / component
-            rate_j = self.rate[mode_j]
+        if mode_j >= 0: ## sample from a particular mode
+            rate_j = self.rate[mode_j] ## directly select a specific component
             Xs = _sample_component(skey[0], n_samples=n_samples, rate=rate_j)
         else: ## sample from full mixture distribution
-            ## sample components/latents
+            ## sample (prior) components/latents
            lats = _sample_prior_weights(skey[0], n_samples=n_samples, pi=self.pi)
-            ## then sample chosen component exponential
+            ## then sample chosen component exponential(s)
             Xs = []
             for j in range(self.K):
                 freq_j = int(jnp.sum((lats == j))) ## compute frequency over mode
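
For reference, the matmul form of the exponential log-pdf introduced above is algebraically the same as summing log(lmbda_d) - lmbda_d * x_d over dimensions, and the jittered initializer's lmbda ~= 1/mean(X) matches the maximum-likelihood rate of a single exponential. A small self-contained check on toy values:

from jax import numpy as jnp

X = jnp.asarray([[0.5, 1.0, 2.0],
                 [0.1, 0.2, 0.3]])        ## N=2 data points, D=3 dims
lmbda = jnp.asarray([[1.0, 2.0, 0.5],
                     [3.0, 1.0, 1.0]])    ## K=2 components x D=3 rates

log_pdf = -jnp.matmul(X, lmbda.T) + jnp.sum(jnp.log(lmbda.T), axis=0)         ## (N, K), matmul form
per_dim = jnp.stack([jnp.sum(jnp.log(l) - l * X, axis=1) for l in lmbda]).T   ## same quantity, dimension-wise
assert jnp.allclose(log_pdf, per_dim)

lmbda_h = 1.0 / jnp.mean(X, axis=0, keepdims=True)   ## per-dimension MLE of a single exponential's rate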
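A usage sketch for the updated ExponentialMixture under stated assumptions -- the constructor arguments below are assumptions (the diff uses self.K, self.max_iter and self.key, but the constructor itself lives on the Mixture base class and is not shown here); init, fit, update and sample follow the signatures in the diff:

from jax import numpy as jnp, random

from ngclearn.utils.density.exponentialMixture import ExponentialMixture

key = random.PRNGKey(1234)
k1, k2 = random.split(key)
## hypothetical non-negative data drawn from two exponential modes with different rates
X = jnp.concatenate([random.exponential(k1, (200, 5)) / 0.5,
                     random.exponential(k2, (200, 5)) / 4.0], axis=0)

emm = ExponentialMixture(K=2, max_iter=100)   ## constructor arguments are an assumption
emm.init(X)                                   ## jittered pi / lmbda initialization from this commit
emm.fit(X, tol=1e-3, verbose=True)            ## prints Rate-diff and log(p(X)) per iteration
gamma, pi, rates, logpX = emm.update(X)       ## update(X) now returns these four values
Xs = emm.sample(n_samples=32, mode_j=0)       ## draw from a single component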
