
Commit 39b986d

Merge pull request #84 from x-tabdeveloping/multimodal
Beta feature: Multimodal topic modelling
2 parents dc42d80 + fd05205 commit 39b986d

File tree: 17 files changed, +1111 −44 lines changed


.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -29,7 +29,7 @@ jobs:
       run: python3 -c "import sys; print(sys.version)"

     - name: Install dependencies
-      run: python3 -m pip install --upgrade turftopic[pyro-ppl] pandas pytest plotly igraph datasets
+      run: python3 -m pip install --upgrade turftopic[pyro-ppl] pandas pytest plotly igraph datasets pillow
     - name: Run tests
       run: python3 -m pytest tests/
```

docs/images/multimodal.html

Lines changed: 14 additions & 0 deletions
Large diffs are not rendered by default.

docs/multimodal.md

Lines changed: 129 additions & 0 deletions
# Multimodal Topic Modelling ***(BETA)***

!!! note
    Multimodal modeling is still a BETA feature in Turftopic; we are likely to add more features and change the interface in the near future.

Some corpora are spread across multiple modalities.
A good example of this would be news articles with images attached.
Turftopic now supports multimodal modelling with a number of models.

## Multimodal Encoders

For images to be usable in Turftopic, you will need an embedding model that can encode both texts and images.
You can use either models that are supported in SentenceTransformers, or models that support the MTEB multimodal encoder interface.

!!! quote "Use a multimodal encoder model"
    === "SentenceTransformers"

        ```python
        from turftopic import KeyNMF

        multimodal_keynmf = KeyNMF(10, encoder="clip-ViT-B-32")
        ```

    === "MTEB/MIEB"
        !!! tip
            You can find current state-of-the-art embedding models and their capabilities on the [Massive Image Embedding Benchmark leaderboard](http://mteb-leaderboard.hf.space/?benchmark_name=MIEB%28Multilingual%29).

        ```bash
        pip install "mteb<2.0.0"
        ```

        ```python
        from turftopic import KeyNMF
        import mteb

        encoder = mteb.get_model("kakaobrain/align-base")

        multimodal_keynmf = KeyNMF(10, encoder=encoder)
        ```

## Corpus Structure

Currently, every document **has to have** exactly one image attached to it.
This is a limitation, and we will address it in the future.
Images can be represented either as file paths or as `PIL.Image` objects.

```python
from PIL import Image

images: list[Image.Image] = [Image.open("file_path/something.jpeg"), ...]
texts: list[str] = [...]

len(images) == len(texts)
```
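
For illustration, here is a minimal sketch of how such a paired corpus might be assembled from files on disk; the `data/` directory layout and the caption file naming scheme are hypothetical, not something Turftopic prescribes.

```python
from pathlib import Path

from PIL import Image

# Hypothetical layout: data/ holds one .jpg per document and a .txt caption with the same stem
image_paths = sorted(Path("data").glob("*.jpg"))

images: list[Image.Image] = [Image.open(path) for path in image_paths]
texts: list[str] = [path.with_suffix(".txt").read_text() for path in image_paths]

assert len(images) == len(texts)
```
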
## Basic Usage

All multimodal models have a `fit_multimodal()`/`fit_transform_multimodal()` method that you can use to discover topics in multimodal corpora.

!!! quote "Fit a multimodal model on a corpus"
    === "KeyNMF"

        ```python
        from turftopic import KeyNMF

        model = KeyNMF(12, encoder="clip-ViT-B-32")
        model.fit_multimodal(texts, images=images)
        model.plot_topics_with_images()
        ```

    === "SemanticSignalSeparation"

        ```python
        from turftopic import SemanticSignalSeparation

        model = SemanticSignalSeparation(12, encoder="clip-ViT-B-32")
        model.fit_multimodal(texts, images=images)
        model.plot_topics_with_images()
        ```

    === "Clustering Models"

        ```python
        from turftopic import ClusteringTopicModel

        # BERTopic-style
        model = ClusteringTopicModel(encoder="clip-ViT-B-32", feature_importance="c-tf-idf")
        # Top2Vec-style
        model = ClusteringTopicModel(encoder="clip-ViT-B-32", feature_importance="centroid")
        model.fit_multimodal(texts, images=images)
        model.plot_topics_with_images()
        ```

    === "GMM"

        ```python
        from turftopic import GMM

        model = GMM(12, encoder="clip-ViT-B-32")
        model.fit_multimodal(texts, images=images)
        model.plot_topics_with_images()
        ```

    === "AutoEncodingTopicModel"

        ```python
        from turftopic import AutoEncodingTopicModel

        # CombinedTM
        model = AutoEncodingTopicModel(12, combined=True, encoder="clip-ViT-B-32")
        # ZeroShotTM
        model = AutoEncodingTopicModel(12, combined=False, encoder="clip-ViT-B-32")
        model.fit_multimodal(texts, images=images)
        model.plot_topics_with_images()
        ```

<iframe src="../images/multimodal.html" title="Multimodal KeyNMF on IKEA catalogue" style="height:350px;width:100%;padding:0px;border:none;"></iframe>

## API reference

::: turftopic.multimodal.MultimodalModel

::: turftopic.encoders.multimodal.MultimodalEncoder

mkdocs.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -11,6 +11,7 @@ nav:
   - Online Topic Modeling: online.md
   - Hierarchical Topic Modeling: hierarchical.md
   - Cross-Lingual Topic Modeling: cross_lingual.md
+  - Multimodal Modeling (BETA): multimodal.md
   - Modifying and Finetuning Models: finetuning.md
   - Saving and Loading: persistence.md
   - Using TopicData: topic_data.md
```

pyproject.toml

Lines changed: 2 additions & 1 deletion
```diff
@@ -9,7 +9,7 @@ profile = "black"

 [tool.poetry]
 name = "turftopic"
-version = "0.14.1"
+version = "0.15.0"
 description = "Topic modeling with contextual representations from sentence transformers."
 authors = ["Márton Kardos <power.up1163@gmail.com>"]
 license = "MIT"
@@ -26,6 +26,7 @@ rich = "^13.6.0"
 huggingface-hub = ">=0.23.2"
 joblib = "^1.2.0"
 igraph = "~0.11.6"
+pillow = "~10.4.0"
 snowballstemmer = {version=">=2.0.0", optional=true}
 spacy = {version=">=3.6.0", optional=true}
 jieba = {version=">=0.40.0", optional=true}
```

tests/test_multimodal.py

Lines changed: 54 additions & 0 deletions
```python
import pytest
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

from turftopic import (
    GMM,
    AutoEncodingTopicModel,
    ClusteringTopicModel,
    KeyNMF,
    SemanticSignalSeparation,
)


@pytest.fixture
def multimodal_models():
    encoder = SentenceTransformer("sentence-transformers/clip-ViT-B-16")
    return [
        AutoEncodingTopicModel(
            2, combined=True, encoder=encoder, vectorizer=CountVectorizer()
        ),
        GMM(2, encoder=encoder, vectorizer=CountVectorizer()),
        KeyNMF(2, encoder=encoder, vectorizer=CountVectorizer()),
        SemanticSignalSeparation(
            2, encoder=encoder, vectorizer=CountVectorizer()
        ),
        ClusteringTopicModel(
            dimensionality_reduction=PCA(10),
            clustering=KMeans(3),
            feature_importance="c-tf-idf",
            encoder=encoder,
        ),
        ClusteringTopicModel(
            dimensionality_reduction=PCA(10),
            clustering=KMeans(3),
            feature_importance="centroid",
            encoder=encoder,
        ),
    ]


flowers = load_dataset("kardosdrur/flowers_multimodal_test", split="train")
texts = flowers["blip_caption"]
images = flowers["image"]


def test_multimodal(multimodal_models):
    for model in multimodal_models:
        doc_topic_matrix = model.fit_transform_multimodal(texts, images=images)
        fig = model.plot_topics_with_images()
        assert len(model.top_images) == model.components_.shape[0]
        assert doc_topic_matrix.shape[1] == model.components_.shape[0]
```

turftopic/base.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -39,6 +39,8 @@ def encode_documents(self, raw_documents: Iterable[str]) -> np.ndarray:
         ndarray of shape (n_documents, n_dimensions)
             Matrix of document embeddings.
         """
+        if not hasattr(self.encoder_, "encode"):
+            return self.encoder.get_text_embeddings(list(raw_documents))
         return self.encoder_.encode(raw_documents)

     @abstractmethod
```

turftopic/encoders/multimodal.py

Lines changed: 31 additions & 0 deletions
```python
from typing import Protocol

from PIL import Image


class MultimodalEncoder(Protocol):
    """Base class for external encoder models."""

    def get_text_embeddings(
        self,
        texts: list[str],
        *,
        batch_size: int = 8,
        **kwargs,
    ): ...

    def get_image_embeddings(
        self,
        images: list[Image.Image],
        *,
        batch_size: int = 8,
        **kwargs,
    ): ...

    def get_fused_embeddings(
        self,
        texts: list[str] = None,
        images: list[Image.Image] = None,
        batch_size: int = 8,
        **kwargs,
    ): ...
```
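
As a rough illustration of how this protocol can be satisfied, the sketch below wraps a CLIP SentenceTransformer model (which can embed both texts and PIL images). The `ClipMultimodalEncoder` class and its mean-pooling fusion are illustrative assumptions, not part of the commit.

```python
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer


class ClipMultimodalEncoder:
    """Illustrative encoder satisfying the MultimodalEncoder protocol above."""

    def __init__(self, model_name: str = "clip-ViT-B-32"):
        # CLIP SentenceTransformer models can encode both texts and PIL images
        self.model = SentenceTransformer(model_name)

    def get_text_embeddings(self, texts: list[str], *, batch_size: int = 8, **kwargs):
        return self.model.encode(texts, batch_size=batch_size)

    def get_image_embeddings(self, images: list[Image.Image], *, batch_size: int = 8, **kwargs):
        return self.model.encode(images, batch_size=batch_size)

    def get_fused_embeddings(self, texts=None, images=None, batch_size: int = 8, **kwargs):
        # Simple fusion strategy for illustration: average text and image embeddings
        text_emb = np.asarray(self.get_text_embeddings(texts, batch_size=batch_size))
        image_emb = np.asarray(self.get_image_embeddings(images, batch_size=batch_size))
        return (text_emb + image_emb) / 2
```

Since the protocol is structural (`typing.Protocol`), no explicit inheritance is needed; any object exposing these three methods should be accepted wherever a `MultimodalEncoder` is expected.
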

turftopic/feature_importance.py

Lines changed: 32 additions & 4 deletions
```diff
@@ -1,6 +1,9 @@
 import numpy as np
 import scipy.sparse as spr
+from sklearn.feature_extraction.text import TfidfTransformer
 from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.preprocessing import normalize
+from sklearn.utils import check_array


 def cluster_centroid_distance(
@@ -34,7 +37,9 @@ def cluster_centroid_distance(


 def soft_ctf_idf(
-    doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
+    doc_topic_matrix: np.ndarray,
+    doc_term_matrix: spr.csr_matrix,
+    return_idf: bool = False,
 ) -> np.ndarray:
     """Computes feature importances using Soft C-TF-IDF

@@ -57,11 +62,23 @@ def soft_ctf_idf(
     tf = (term_importance.T / (overall_in_topic + eps)).T
     idf = np.log(n_docs / (np.abs(term_importance).sum(axis=0) + eps))
     ctf_idf = tf * idf
-    return ctf_idf
+    idf_diag = spr.diags(
+        idf,
+        offsets=0,
+        shape=(doc_term_matrix.shape[1], doc_term_matrix.shape[1]),
+        format="csr",
+        dtype=tf.dtype,
+    )
+    if not return_idf:
+        return ctf_idf
+    else:
+        return ctf_idf, idf_diag


 def ctf_idf(
-    doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
+    doc_topic_matrix: np.ndarray,
+    doc_term_matrix: spr.csr_matrix,
+    return_idf: bool = False,
 ) -> np.ndarray:
     """Computes feature importances using standard C-TF-IDF

@@ -89,7 +106,18 @@
         )
         component = freq * np.log(1 + average / overall_freq)
         components.append(component)
-    return np.stack(components)
+    idf = np.log((average / overall_freq) + 1)
+    idf_diag = spr.diags(
+        idf,
+        offsets=0,
+        shape=(doc_term_matrix.shape[1], doc_term_matrix.shape[1]),
+        format="csr",
+        dtype=doc_term_matrix.dtype,
+    )
+    if not return_idf:
+        return np.stack(components)
+    else:
+        return np.stack(components), idf_diag


 def bayes_rule(
```
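
To make the Soft C-TF-IDF formulas visible in the hunk above concrete, here is a small self-contained sketch on dense toy matrices. The toy values and the way `term_importance` is derived (topic-weighted term counts) are assumptions about the unchanged part of `soft_ctf_idf`, shown here only to illustrate the tf/idf computation.

```python
import numpy as np

# Toy corpus: 3 documents, 2 topics, 4 vocabulary terms (made-up values)
doc_topic_matrix = np.array([[0.9, 0.1],
                             [0.2, 0.8],
                             [0.5, 0.5]])
doc_term_matrix = np.array([[2, 0, 1, 0],
                            [0, 3, 0, 1],
                            [1, 1, 1, 1]])

eps = np.finfo(float).eps
n_docs = doc_term_matrix.shape[0]

# Assumed: per-topic term importance as topic-weighted term counts
term_importance = doc_topic_matrix.T @ doc_term_matrix
overall_in_topic = np.abs(term_importance).sum(axis=1)

# Same tf / idf / ctf_idf expressions as in the context lines of the diff
tf = (term_importance.T / (overall_in_topic + eps)).T
idf = np.log(n_docs / (np.abs(term_importance).sum(axis=0) + eps))
ctf_idf = tf * idf

print(ctf_idf.shape)  # (n_topics, n_terms) == (2, 4)
```
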

turftopic/models/_hierarchical_clusters.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -187,7 +187,7 @@ def _estimate_children_components(self) -> dict[int, np.ndarray]:
                 ) # type: ignore
             elif self.model.feature_importance == "centroid":
                 if not hasattr(self.model, "vocab_embeddings"):
-                    self.model.vocab_embeddings = self.model.encoder_.encode(
+                    self.model.vocab_embeddings = self.model.encode_documents(
                         self.model.vectorizer.get_feature_names_out()
                     ) # type: ignore
                 if (
```

0 commit comments
