improved marginal distribution and saliency calculation

maximtrp · maximtrp · commit 71c838135ea2 · 2025-03-15T23:40:18.000+01:00
diff --git a/src/tmplot/_helpers.py b/src/tmplot/_helpers.py
@@ -249,7 +249,7 @@ def _select_docs(docs, theta, topic_id: int):
 
 def calc_topics_marg_probs(
     theta: Union[DataFrame, ndarray], topic_id: Optional[int] = None
-) -> Union[DataFrame, ndarray]:
+) -> ndarray:
     """Calculate marginal topics probabilities.
 
     Parameters
@@ -264,18 +264,18 @@ def calc_topics_marg_probs(
     Union[pandas.DataFrame, numpy.ndarray]
         Marginal topics probabilities.
     """
+    p_t = array(theta).sum(axis=1)
+    p_t /= p_t.sum()
     if topic_id is not None:
-        if isinstance(theta, ndarray):
-            return theta[topic_id, :].sum()
-        if isinstance(theta, DataFrame):
-            return theta.iloc[topic_id, :].sum()
-
-    return theta.sum(axis=1)
+        return p_t[topic_id]
+    return p_t
 
 
 def calc_terms_marg_probs(
-    phi: Union[ndarray, DataFrame], word_id: Optional[int] = None
-) -> Union[ndarray, Series]:
+    phi: Union[ndarray, DataFrame],
+    p_t: Union[ndarray, Series],
+    word_id: Optional[int] = None,
+) -> ndarray:
     """Calculate marginal terms probabilities.
 
     Parameters
@@ -290,16 +290,13 @@ def calc_terms_marg_probs(
     Union[numpy.ndarray, pandas.Series]
         Marginal terms probabilities.
     """
+    p_w = (array(phi) * array(p_t)).sum(axis=1)
     if word_id is not None:
-        if isinstance(phi, ndarray):
-            return phi[word_id, :].sum()
-        if isinstance(phi, DataFrame):
-            return phi.iloc[word_id, :].sum()
-
-    return phi.sum(axis=1)
+        return p_w[word_id]
+    return p_w
 
 
-def get_salient_terms(terms_freqs: ndarray, phi: ndarray, theta: ndarray) -> ndarray:
+def get_salient_terms(phi: ndarray, theta: ndarray) -> ndarray:
     """Get salient terms.
 
     Calculated as:
@@ -308,8 +305,6 @@ def get_salient_terms(terms_freqs: ndarray, phi: ndarray, theta: ndarray) -> nda
 
     Parameters
     ----------
-    terms_freqs : numpy.ndarray
-        Words frequencies.
     phi : numpy.ndarray
         Words vs topics matrix.
     theta : numpy.ndarray
@@ -320,15 +315,15 @@ def get_salient_terms(terms_freqs: ndarray, phi: ndarray, theta: ndarray) -> nda
     numpy.ndarray
         Terms saliency values.
     """
-    p_t = array(calc_topics_marg_probs(theta))
-    p_w = array(calc_terms_marg_probs(phi))
+    p_t = calc_topics_marg_probs(theta)
+    p_w = calc_terms_marg_probs(phi, p_t)
 
     def _p_tw(phi, w, t):
         return phi[w, t] * p_t[t] / p_w[w]
 
     saliency = array(
         (
-            terms_freqs[w]
+            p_w[w]
             * sum(
                 (
                     _p_tw(phi, w, t) * log(_p_tw(phi, w, t) / p_t[t])
diff --git a/tests/test_tmplot.py b/tests/test_tmplot.py
@@ -3,22 +3,20 @@
 from altair import LayerChart
 from tomotopy import LDAModel
 from src import tmplot as tm
-from numpy import random, floating
+from numpy import random, floating, ndarray
 from ipywidgets import VBox
-from pandas import Series
 
 
 class TestTmplot(unittest.TestCase):
-
     def setUp(self):
-        self.tomotopy_model = LDAModel.load('tests/models/tomotopyLDA.model')
-        with open('tests/models/gensimLDA.model', 'rb') as file:
+        self.tomotopy_model = LDAModel.load("tests/models/tomotopyLDA.model")
+        with open("tests/models/gensimLDA.model", "rb") as file:
             self.gensim_model = pkl.load(file)
-        with open('tests/models/gensimLDA.corpus', 'rb') as file:
+        with open("tests/models/gensimLDA.corpus", "rb") as file:
             self.gensim_corpus = pkl.load(file)
-        with open('tests/models/btm_big.pickle', 'rb') as file:
+        with open("tests/models/btm_big.pickle", "rb") as file:
             self.btm_model_big = pkl.load(file)
-        with open('tests/models/btm_small.pickle', 'rb') as file:
+        with open("tests/models/btm_small.pickle", "rb") as file:
             self.btm_model_small = pkl.load(file)
 
         self.phi = tm.get_phi(self.tomotopy_model)
@@ -69,66 +67,75 @@ def test_prepare_coords(self):
         topics_coords = tm.prepare_coords(self.btm_model_big)
         self.assertTupleEqual(topics_coords.shape, (self.btm_model_big.topics_num_, 5))
         topics_coords = tm.prepare_coords(self.btm_model_small)
-        self.assertTupleEqual(topics_coords.shape, (self.btm_model_small.topics_num_, 5))
+        self.assertTupleEqual(
+            topics_coords.shape, (self.btm_model_small.topics_num_, 5)
+        )
 
     def test_get_topics_scatter(self):
         topics_dists = tm.get_topics_dist(self.phi)
-        methods = ['tsne', 'sem', 'mds', 'lle', 'ltsa', 'isomap']
-        topics_scatters = list(map(
-            lambda method:
-                tm.get_topics_scatter(topics_dists, self.theta, method=method),
-            methods
-        ))
+        methods = ["tsne", "sem", "mds", "lle", "ltsa", "isomap"]
+        topics_scatters = list(
+            map(
+                lambda method: tm.get_topics_scatter(
+                    topics_dists, self.theta, method=method
+                ),
+                methods,
+            )
+        )
         for scatter in topics_scatters:
             self.assertTupleEqual(scatter.shape, (self.tomotopy_model.k, 4))
 
     def test_get_topics_dist(self):
         methods = ["klb", "jsd", "jef", "hel", "bhat", "tv", "jac"]
         topics_dists = list(
-            map(
-                lambda method: tm.get_topics_dist(self.phi, method=method),
-                methods)
+            map(lambda method: tm.get_topics_dist(self.phi, method=method), methods)
         )
         for dist in topics_dists:
             self.assertTupleEqual(
-                dist.shape,
-                (self.tomotopy_model.k, self.tomotopy_model.k))
+                dist.shape, (self.tomotopy_model.k, self.tomotopy_model.k)
+            )
 
     def test_calc_topics_marg_probs(self):
         topic_marg_prob = tm.calc_topics_marg_probs(self.theta, 0)
         self.assertIsInstance(topic_marg_prob, floating)
         self.assertGreater(topic_marg_prob, 0)
         topics_marg_probs = tm.calc_topics_marg_probs(self.theta)
-        self.assertIsInstance(topics_marg_probs, Series)
+        self.assertIsInstance(topics_marg_probs, ndarray)
         self.assertEqual(topics_marg_probs.size, self.tomotopy_model.k)
+        self.assertAlmostEqual(topics_marg_probs.sum(), 1)  # tolerate float rounding
 
     def test_calc_terms_marg_probs(self):
-        term_marg_prob = tm.calc_terms_marg_probs(self.phi, 0)
+        term_marg_prob = tm.calc_terms_marg_probs(self.phi, tm.calc_topics_marg_probs(self.theta), word_id=0)
         self.assertIsInstance(term_marg_prob, floating)
         self.assertGreater(term_marg_prob, 0)
-        terms_marg_probs = tm.calc_terms_marg_probs(self.phi)
-        self.assertIsInstance(terms_marg_probs, Series)
+        terms_marg_probs = tm.calc_terms_marg_probs(self.phi, tm.calc_topics_marg_probs(self.theta))
+        self.assertIsInstance(terms_marg_probs, ndarray)
         self.assertEqual(terms_marg_probs.size, self.phi.index.size)
 
     def test_plot_scatter_topics(self):
         topics_coords = tm.prepare_coords(self.tomotopy_model)
         chart = tm.plot_scatter_topics(
-            topics_coords, size_col='size', label_col='label')
+            topics_coords, size_col="size", label_col="label"
+        )
         self.assertIsInstance(chart, LayerChart)
 
     def test_get_stable_topics(self):
         models = [
-            self.tomotopy_model, self.tomotopy_model, self.tomotopy_model,
-            self.tomotopy_model]
+            self.tomotopy_model,
+            self.tomotopy_model,
+            self.tomotopy_model,
+            self.tomotopy_model,
+        ]
         closest_topics, dists = tm.get_closest_topics(models)
         dists = random.normal(0, 0.10, dists.shape).__abs__()
         stable_topics, stable_dists = tm.get_stable_topics(
-            closest_topics, dists, norm=False)
+            closest_topics, dists, norm=False
+        )
 
         self.assertTupleEqual(
-            closest_topics.shape, (self.tomotopy_model.k, len(models)))
-        self.assertTupleEqual(
-            dists.shape, (self.tomotopy_model.k, len(models)))
+            closest_topics.shape, (self.tomotopy_model.k, len(models))
+        )
+        self.assertTupleEqual(dists.shape, (self.tomotopy_model.k, len(models)))
         self.assertLessEqual(stable_topics.shape[0], self.tomotopy_model.k)
         self.assertLessEqual(stable_dists.shape[0], self.tomotopy_model.k)
         self.assertGreaterEqual(stable_topics.shape[0], 0)
@@ -138,9 +145,8 @@ def test_get_stable_topics(self):
 
     def test_report(self):
         report = tm.report(
-            self.tomotopy_model,
-            docs=tm.get_docs(self.tomotopy_model),
-            width=250)
+            self.tomotopy_model, docs=tm.get_docs(self.tomotopy_model), width=250
+        )
         self.assertIsInstance(report, VBox)
 
     def test_entropy(self):
@@ -149,6 +155,10 @@ def test_entropy(self):
         self.assertGreater(entropy, 0)
         self.assertGreater(entropy2, 0)
 
+    def test_get_salient_terms(self):
+        saliency = tm.get_salient_terms(self.phi, self.theta)
+        self.assertEqual(saliency.size, self.phi.shape[0])
+
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()