Merge pull request #87 from x-tabdeveloping/prepare_multimodal_topic_data

x-tabdeveloping · web-flow · commit 3af440b2e56e · 2025-05-05T11:45:24.000+02:00
Multimodal TopicData
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ profile = "black"
 
 [tool.poetry]
 name = "turftopic"
-version = "0.15.0"
+version = "0.16.0"
 description = "Topic modeling with contextual representations from sentence transformers."
 authors = ["Márton Kardos <power.up1163@gmail.com>"]
 license = "MIT"
diff --git a/tests/test_multimodal.py b/tests/test_multimodal.py
@@ -48,7 +48,10 @@ def multimodal_models():
 
 def test_multimodal(multimodal_models):
     for model in multimodal_models:
-        doc_topic_matrix = model.fit_transform_multimodal(texts, images=images)
-        fig = model.plot_topics_with_images()
-        assert len(model.top_images) == model.components_.shape[0]
-        assert doc_topic_matrix.shape[1] == model.components_.shape[0]
+        topic_data = model.prepare_multimodal_topic_data(texts, images=images)
+        # Smoke check: building the figure must not raise.
+        fig = topic_data.plot_topics_with_images()
+        n_topics = model.components_.shape[0]
+        # One list of top images and one matrix column per topic.
+        assert len(topic_data.top_images) == n_topics
+        assert topic_data.document_topic_matrix.shape[1] == n_topics
diff --git a/turftopic/container.py b/turftopic/container.py
@@ -1,9 +1,11 @@
 import itertools
+import warnings
 from abc import ABC
 from datetime import datetime
 from typing import Any, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
+from PIL import Image
 from rich.console import Console
 from rich.table import Table
 
@@ -76,6 +78,92 @@ def _top_terms(
                 terms.append(list(vocab[highest]))
         return terms
 
+    def get_top_words(
+        self, top_k: int = 10, positive: bool = True
+    ) -> list[list[str]]:
+        """Collects the top words of every topic.
+
+        Parameters
+        ----------
+        top_k: int, default 10
+            How many words to return for each topic.
+        positive: bool, default True
+            Pick the highest (True) or the lowest (False) scoring terms.
+        """
+        top_words = self._top_terms(top_k, positive)
+        return top_words
+
+    def get_top_documents(
+        self,
+        raw_documents=None,
+        document_topic_matrix=None,
+        top_k: int = 10,
+        positive: bool = True,
+    ) -> list[list[str]]:
+        """Returns list of top documents for each topic.
+
+        Parameters
+        ----------
+        raw_documents: sequence of str, default None
+            Corpus to search in; falls back to the stored `corpus`.
+        document_topic_matrix: array (n_documents, n_topics), default None
+            Falls back to the stored matrix, then to `transform(raw_documents)`.
+        top_k: int, default 10
+            Number of documents to return per topic.
+        positive: bool, default True
+            Whether the highest or lowest scoring documents are returned.
+        """
+        if raw_documents is None:
+            raw_documents = getattr(self, "corpus", None)
+        if raw_documents is None:
+            raise ValueError(
+                "No corpus was passed, can't search for representative documents."
+            )
+        if document_topic_matrix is None:  # truth-testing an array raises
+            document_topic_matrix = getattr(self, "document_topic_matrix", None)
+        if document_topic_matrix is None:
+            try:
+                document_topic_matrix = self.transform(raw_documents)
+            except AttributeError:
+                raise ValueError(
+                    "Transductive methods cannot "
+                    "infer topical content in documents.\n"
+                    "Please pass a document_topic_matrix."
+                )
+        return [
+            [raw_documents[i] for i in np.argsort(-v if positive else v)[:top_k]]
+            for v in document_topic_matrix.T
+        ]
+
+    def get_top_images(self, top_k: int = 10, positive: bool = True):
+        """Returns list of top images for each topic.
+
+        Raises ValueError when the requested images are not stored, and warns
+        when a topic has fewer than top_k images (all of them are returned).
+
+        Parameters
+        ----------
+        top_k: int, default 10
+            Number of images to return.
+        positive: bool, default True
+            Indicates whether the highest
+            or lowest scoring images should be returned.
+        """
+        if getattr(self, "top_images", None) is None:
+            raise ValueError(
+                "Model either has not been fit or was fit without images. top_images property missing."
+            )
+        if (not positive) and getattr(self, "negative_images", None) is None:
+            raise ValueError(
+                "Model either has not been fit or was fit without images. negative_images property missing."
+            )
+        top_images = self.top_images if positive else self.negative_images
+        if any(len(topic_images) < top_k for topic_images in top_images):
+            warnings.warn(
+                "Number of images stored in the topic model is smaller than the specified top_k, returning all that the model has."
+            )
+        return [topic_images[:top_k] for topic_images in top_images]
+
     def _rename_automatic(self, namer: TopicNamer) -> list[str]:
         self.topic_names_ = namer.name_topics(self._top_terms())
         return self.topic_names_
@@ -928,3 +1016,218 @@ def plot_topics_over_time(
         fig.update_xaxes(title="Time Slice Start")
         fig.update_yaxes(title="Topic Importance")
         return fig
+
+    @staticmethod
+    def _image_grid(
+        images: list[Image.Image],
+        final_size=(1200, 1200),
+        grid_size: tuple[int, int] = (4, 4),
+    ):
+        """Pastes images row by row onto a white canvas of final_size.
+        grid_size is (columns, rows); images beyond the grid are dropped.
+        """
+        n_cols, n_rows = grid_size
+        cell = (final_size[0] // n_cols, final_size[1] // n_rows)
+        grid_img = Image.new("RGB", final_size, (255, 255, 255))
+        for idx, img in enumerate(images[: n_rows * n_cols]):
+            # Row index comes from the column count, not the row count.
+            row, col = divmod(idx, n_cols)
+            img = img.resize(cell, resample=Image.Resampling.LANCZOS)
+            grid_img.paste(img, (col * cell[0], row * cell[1]))
+        return grid_img
+
+    def plot_topics_with_images(self, n_cols: int = 3, grid_size: int = 4):
+        """Plots the most important images for each topic, along with keywords.
+
+        Note that you will need to `pip install plotly` to use plots in Turftopic.
+
+        Parameters
+        ----------
+        n_cols: int, default 3
+            Number of columns in the grid of topics. Ignored when negative
+            images are stored: each topic then gets one row of two collages.
+        grid_size: int, default 4
+            The square root of the number of images displayed per topic,
+            e.g. grid_size==4 gives a joint image of 4 columns and 4 rows.
+
+        Returns
+        -------
+        go.Figure
+            Plotly figure containing top images and keywords for topics.
+        """
+        if getattr(self, "top_images", None) is None:
+            raise ValueError(
+                "Model either has not been fit or was fit without images. top_images property missing."
+            )
+        try:
+            import plotly.graph_objects as go
+        except (ImportError, ModuleNotFoundError) as e:
+            raise ModuleNotFoundError(
+                "Please install plotly if you intend to use plots in Turftopic."
+            ) from e
+        negative_images = getattr(self, "negative_images", None)
+        if negative_images is not None:
+            # If the model has negative images, it should display them side by side with the positive ones.
+            n_components = self.components_.shape[0]
+            fig = go.Figure()
+            width, height = 1200, 1200
+            scale_factor = 0.25
+            w, h = width * scale_factor, height * scale_factor
+            padding = 10
+            figure_height = (h + padding) * n_components
+            figure_width = (w + padding) * 2
+            fig = fig.add_trace(
+                go.Scatter(
+                    x=[0, figure_width],
+                    y=[0, figure_height],
+                    mode="markers",
+                    marker_opacity=0,
+                )
+            )
+            vocab = self.get_vocab()
+            for i, component in enumerate(self.components_):
+                positive = vocab[np.argsort(-component)[:7]]
+                negative = vocab[np.argsort(component)[:7]]
+                pos_image = self._image_grid(
+                    self.top_images[i],
+                    (width, height),
+                    grid_size=(grid_size, grid_size),
+                )
+                neg_image = self._image_grid(
+                    self.negative_images[i],
+                    (width, height),
+                    grid_size=(grid_size, grid_size),
+                )
+                x0 = 0
+                y0 = (h + padding) * (n_components - i)
+                fig = fig.add_layout_image(
+                    dict(
+                        x=x0,
+                        sizex=w,
+                        y=y0,
+                        sizey=h,
+                        xref="x",
+                        yref="y",
+                        opacity=1.0,
+                        layer="below",
+                        sizing="stretch",
+                        source=pos_image,
+                    ),
+                )
+                fig.add_annotation(
+                    x=(w / 2),
+                    y=(h + padding) * (n_components - i) - (h / 2),
+                    text="<b> " + "<br> ".join(positive),
+                    font=dict(
+                        size=16,
+                        family="Times New Roman",
+                        color="white",
+                    ),
+                    bgcolor="rgba(0,0,255, 0.5)",
+                )
+                x0 = (w + padding) * 1
+                fig = fig.add_layout_image(
+                    dict(
+                        x=x0,
+                        sizex=w,
+                        y=y0,
+                        sizey=h,
+                        xref="x",
+                        yref="y",
+                        opacity=1.0,
+                        layer="below",
+                        sizing="stretch",
+                        source=neg_image,
+                    ),
+                )
+                fig.add_annotation(
+                    x=(w + padding) + (w / 2),
+                    y=(h + padding) * (n_components - i) - (h / 2),
+                    text="<b> " + "<br> ".join(negative),
+                    font=dict(
+                        size=16,
+                        family="Times New Roman",
+                        color="white",
+                    ),
+                    bgcolor="rgba(255,0,0, 0.5)",
+                )
+            fig = fig.update_xaxes(visible=False, range=[0, figure_width])
+            fig = fig.update_yaxes(
+                visible=False,
+                range=[0, figure_height],
+                # the scaleanchor attribute ensures that the aspect ratio stays constant
+                scaleanchor="x",
+            )
+            fig = fig.update_layout(
+                width=figure_width,
+                height=figure_height,
+                margin={"l": 0, "r": 0, "t": 0, "b": 0},
+            )
+            return fig
+        else:
+            fig = go.Figure()
+            width, height = 1200, 1200
+            scale_factor = 0.25
+            w, h = width * scale_factor, height * scale_factor
+            padding = 10
+            n_components = self.components_.shape[0]
+            n_rows = n_components // n_cols + int(bool(n_components % n_cols))
+            figure_height = (h + padding) * n_rows
+            figure_width = (w + padding) * n_cols
+            fig = fig.add_trace(
+                go.Scatter(
+                    x=[0, figure_width],
+                    y=[0, figure_height],
+                    mode="markers",
+                    marker_opacity=0,
+                )
+            )
+            vocab = self.get_vocab()
+            for i, component in enumerate(self.components_):
+                col = i % n_cols
+                row = i // n_cols
+                top_7 = vocab[np.argsort(-component)[:7]]
+                images = self.top_images[i]
+                image = self._image_grid(
+                    images, (width, height), grid_size=(grid_size, grid_size)
+                )
+                x0 = (w + padding) * col
+                y0 = (h + padding) * (n_rows - row)
+                fig = fig.add_layout_image(
+                    dict(
+                        x=x0,
+                        sizex=w,
+                        y=y0,
+                        sizey=h,
+                        xref="x",
+                        yref="y",
+                        opacity=1.0,
+                        layer="below",
+                        sizing="stretch",
+                        source=image,
+                    ),
+                )
+                fig.add_annotation(
+                    x=(w + padding) * col + (w / 2),
+                    y=(h + padding) * (n_rows - row) - (h / 2),
+                    text="<b> " + "<br> ".join(top_7),
+                    font=dict(
+                        size=16,
+                        family="Times New Roman",
+                        color="white",
+                    ),
+                    bgcolor="rgba(0,0,0, 0.5)",
+                )
+            fig = fig.update_xaxes(visible=False, range=[0, figure_width])
+            fig = fig.update_yaxes(
+                visible=False,
+                range=[0, figure_height],
+                # the scaleanchor attribute ensures that the aspect ratio stays constant
+                scaleanchor="x",
+            )
+            fig = fig.update_layout(
+                width=figure_width,
+                height=figure_height,
+                margin={"l": 0, "r": 0, "t": 0, "b": 0},
+            )
+            return fig
diff --git a/turftopic/data.py b/turftopic/data.py
@@ -9,6 +9,7 @@
 
 import joblib
 import numpy as np
+from PIL import Image
 from rich.console import Console
 from rich.tree import Tree
 
@@ -63,6 +64,13 @@ class TopicData(Mapping, TopicContainer):
         This is in contrast to KeyNMF for instance, where only positive word importance should be considered.
     hierarchy: TopicNode, default None
         Optional topic hierarchy for models that support hierarchical topic modeling.
+    images: list[ImageRepr], default None
+        Images the model has been fit on
+    top_images: list[list[Image]], default None
+        Top images discovered by the topic model.
+    negative_images: list[list[Image]], default None
+        Lowest ranking images discovered by the topic model.
+        (Only relevant with models like S^3)
     """
 
     def __init__(
@@ -82,6 +90,9 @@ def __init__(
         temporal_importance: Optional[np.ndarray] = None,
         has_negative_side: bool = False,
         hierarchy: Optional[TopicNode] = None,
+        images: Optional[list[str | Image.Image]] = None,
+        top_images: Optional[list[list[Image.Image]]] = None,
+        negative_images: Optional[list[list[Image.Image]]] = None,
         **kwargs,
     ):
         self.corpus = corpus
@@ -98,6 +109,9 @@ def __init__(
         self.temporal_importance = temporal_importance
         self.hierarchy = hierarchy
         self._has_negative_side = has_negative_side
+        self.top_images = top_images
+        self.negative_images = negative_images
+        self.images = images
         for key, value in kwargs:
             setattr(self, key, value)
         self._attributes = [
diff --git a/turftopic/multimodal.py b/turftopic/multimodal.py