Fix marginal distribution and saliency calculations (mean-based p(t), p(t)-weighted p(w), p(w) instead of term frequency in saliency)

maximtrp · maximtrp · commit c3a59200f279 · 2025-03-15T22:26:55.000+01:00
diff --git a/src/tmplot/_helpers.py b/src/tmplot/_helpers.py
@@ -266,22 +266,24 @@ def calc_topics_marg_probs(
     """
     if topic_id is not None:
         if isinstance(theta, ndarray):
-            return theta[topic_id, :].sum()
+            return theta[topic_id, :].mean()
         if isinstance(theta, DataFrame):
-            return theta.iloc[topic_id, :].sum()
+            return theta.iloc[topic_id, :].mean()
 
-    return theta.sum(axis=1)
+    return theta.mean(axis=1)
 
 
 def calc_terms_marg_probs(
-    phi: Union[ndarray, DataFrame], word_id: Optional[int] = None
+    phi: Union[ndarray, DataFrame], pt: Union[ndarray, Series], word_id: Optional[int] = None
 ) -> Union[ndarray, Series]:
     """Calculate marginal terms probabilities.
 
     Parameters
     ----------
     phi : Union[numpy.ndarray, pandas.DataFrame]
         Words vs topics matrix.
+    pt: Union[numpy.ndarray, pandas.Series]
+        Topics marginal probabilities.
     word_id: Optional[int]
         Word index.
 
@@ -292,24 +294,22 @@ def calc_terms_marg_probs(
     """
     if word_id is not None:
         if isinstance(phi, ndarray):
-            return phi[word_id, :].sum()
+            return (phi[word_id, :] * pt).mean()
         if isinstance(phi, DataFrame):
-            return phi.iloc[word_id, :].sum()
+            return (phi.iloc[word_id, :] * pt).mean()
 
-    return phi.sum(axis=1)
+    return (phi * pt).mean(axis=1)
 
 
-def get_salient_terms(terms_freqs: ndarray, phi: ndarray, theta: ndarray) -> ndarray:
+def get_salient_terms(phi: ndarray, theta: ndarray) -> ndarray:
     """Get salient terms.
 
     Calculated as:
-    saliency(w) = frequency(w) * [sum_t p(t | w) * log(p(t | w)/p(t))],
+    saliency(w) = p(w) * [sum_t p(t | w) * log(p(t | w)/p(t))],
     where ``w`` is a term index, ``t`` is a topic index.
 
     Parameters
     ----------
-    terms_freqs : numpy.ndarray
-        Words frequencies.
     phi : numpy.ndarray
         Words vs topics matrix.
     theta : numpy.ndarray
@@ -328,7 +328,7 @@ def _p_tw(phi, w, t):
 
     saliency = array(
         (
-            terms_freqs[w]
+            p_w[w]
             * sum(
                 (
                     _p_tw(phi, w, t) * log(_p_tw(phi, w, t) / p_t[t])
@@ -338,7 +338,7 @@ def _p_tw(phi, w, t):
             for w in range(phi.shape[0])
         )
     )
-    # saliency(term w) = frequency(w)
+    # saliency(term w) = p(w)
     # * [sum_t p(t | w) * log(p(t | w)/p(t))] for topics t
     # p(t | w) = p(w | t) * p(t) / p(w)
     return saliency
diff --git a/tests/test_tmplot.py b/tests/test_tmplot.py
@@ -3,7 +3,7 @@
 from altair import LayerChart
 from tomotopy import LDAModel
 from src import tmplot as tm
-from numpy import random, floating
+from numpy import random, floating, array
 from ipywidgets import VBox
 from pandas import Series
 
@@ -100,15 +100,17 @@ def test_calc_topics_marg_probs(self):
         self.assertGreater(topic_marg_prob, 0)
         topics_marg_probs = tm.calc_topics_marg_probs(self.theta)
         self.assertIsInstance(topics_marg_probs, Series)
+        self.assertTrue(np.isclose(topics_marg_probs.sum(), 1))
         self.assertEqual(topics_marg_probs.size, self.tomotopy_model.k)
 
     def test_calc_terms_marg_probs(self):
-        term_marg_prob = tm.calc_terms_marg_probs(self.phi, 0)
+        term_marg_prob = tm.calc_terms_marg_probs(self.phi, tm.calc_topics_marg_probs(self.theta), 0)
         self.assertIsInstance(term_marg_prob, floating)
         self.assertGreater(term_marg_prob, 0)
-        terms_marg_probs = tm.calc_terms_marg_probs(self.phi)
+        terms_marg_probs = tm.calc_terms_marg_probs(self.phi, tm.calc_topics_marg_probs(self.theta))
         self.assertIsInstance(terms_marg_probs, Series)
         self.assertEqual(terms_marg_probs.size, self.phi.index.size)
+        self.assertTrue(np.isclose(terms_marg_probs.sum(), 1))
 
     def test_plot_scatter_topics(self):
         topics_coords = tm.prepare_coords(self.tomotopy_model)
@@ -149,6 +151,10 @@ def test_entropy(self):
         self.assertGreater(entropy, 0)
         self.assertGreater(entropy2, 0)
 
+    def test_get_salient_terms(self):
+        saliency = tm.get_salient_terms(self.phi, self.theta)
+        self.assertEqual(saliency.size, self.phi.shape[0])
+
 
 if __name__ == '__main__':
     unittest.main()