diff --git a/docs/Topeax.md b/docs/Topeax.md new file mode 100644 index 0000000..f7c8a4d --- /dev/null +++ b/docs/Topeax.md @@ -0,0 +1,18 @@ +# Topeax + +Topeax is a probabilistic topic model based on the Peax clustering model, which finds topics based on peaks in point density in the embedding space. +It can recover the number of topics automatically. + +
+<figure>
+    <img src="images/peax.png" alt="Schematic overview of the Peax clustering algorithm">
+    <figcaption>Schematic overview of the steps of the Peax clustering algorithm</figcaption>
+</figure>
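+
+A minimal usage sketch (placeholder corpus; the interface mirrors the other Turftopic models, and no number of topics has to be specified up front):
+
+```python
+from turftopic import Topeax
+
+corpus: list[str] = ["some text", "more text", ...]
+
+model = Topeax().fit(corpus)
+model.print_topics()
+```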
+ +:warning: **This part of the documentation is still under construction, as more details and a paper are on their way.** :warning: + +## API Reference + +::: turftopic.models.topeax.Topeax + +::: turftopic.models.topeax.Peax diff --git a/docs/images/peax.png b/docs/images/peax.png new file mode 100644 index 0000000..fa2a7bc Binary files /dev/null and b/docs/images/peax.png differ diff --git a/pyproject.toml b/pyproject.toml index f284a70..c91abef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ profile = "black" [project] name = "turftopic" -version = "0.19.1" +version = "0.20.0" description = "Topic modeling with contextual representations from sentence transformers." authors = [ { name = "Márton Kardos ", email = "martonkardos@cas.au.dk" } diff --git a/turftopic/__init__.py b/turftopic/__init__.py index cc3ab74..99250de 100644 --- a/turftopic/__init__.py +++ b/turftopic/__init__.py @@ -7,6 +7,7 @@ from turftopic.models.fastopic import FASTopic from turftopic.models.gmm import GMM from turftopic.models.keynmf import KeyNMF +from turftopic.models.topeax import Topeax from turftopic.serialization import load_model try: @@ -20,6 +21,7 @@ "ClusteringTopicModel", "SemanticSignalSeparation", "GMM", + "Topeax", "KeyNMF", "AutoEncodingTopicModel", "ContextualModel", diff --git a/turftopic/encoders/utils.py b/turftopic/encoders/utils.py index 4a527f4..ba3a749 100644 --- a/turftopic/encoders/utils.py +++ b/turftopic/encoders/utils.py @@ -1,6 +1,10 @@ import itertools from typing import Iterable, List +import numpy as np +import torch +from tqdm import trange + def batched(iterable, n: int) -> Iterable[List[str]]: "Batch data into tuples of length n. The last batch may be shorter." @@ -10,3 +14,47 @@ def batched(iterable, n: int) -> Iterable[List[str]]: it = iter(iterable) while batch := list(itertools.islice(it, n)): yield batch + + +def encode_chunks( + encoder, + sentences, + batch_size=64, + window_size=50, + step_size=40, + return_chunks=False, + show_progress_bar=False, +): + chunks = [] + chunk_embeddings = [] + for start_index in trange( + 0, + len(sentences), + batch_size, + desc="Encoding batches...", + disable=not show_progress_bar, + ): + batch = sentences[start_index : start_index + batch_size] + features = encoder.tokenize(batch) + with torch.no_grad(): + output_features = encoder.forward(features) + n_tokens = output_features["attention_mask"].sum(axis=1) + for i_doc in range(len(batch)): + for chunk_start in range(0, n_tokens[i_doc], step_size): + chunk_end = min(chunk_start + window_size, n_tokens[i_doc]) + _emb = output_features["token_embeddings"][ + i_doc, chunk_start:chunk_end, : + ].mean(axis=0) + chunk_embeddings.append(_emb) + if return_chunks: + chunks.append( + encoder.tokenizer.decode( + features["input_ids"][i_doc, chunk_start:chunk_end] + ) + .replace("[CLS]", "") + .replace("[SEP]", "") + ) + if not return_chunks: + chunks = None + chunk_embeddings = np.stack(chunk_embeddings) + return chunk_embeddings, chunks diff --git a/turftopic/models/_keynmf.py b/turftopic/models/_keynmf.py index 3a27628..5e135e8 100644 --- a/turftopic/models/_keynmf.py +++ b/turftopic/models/_keynmf.py @@ -2,7 +2,8 @@ import warnings from collections import defaultdict from datetime import datetime -from typing import Iterable, Literal, Optional +from functools import partial +from typing import Iterable, Literal, Optional, Union import igraph as ig import numpy as np @@ -21,6 +22,10 @@ from sklearn.utils.validation import check_non_negative from turftopic.base import Encoder 
+from turftopic.optimization import ( + decomposition_gaussian_bic, + optimize_n_components, +) NOT_MATCHING_ERROR = ( "Document embedding dimensionality ({n_dims}) doesn't match term embedding dimensionality ({n_word_dims}). " @@ -242,7 +247,7 @@ def batch_extract_keywords( class KeywordNMF: def __init__( self, - n_components: int, + n_components: Union[int, Literal["auto"]], seed: Optional[int] = None, top_n: Optional[int] = None, ): @@ -318,6 +323,15 @@ def vectorize( def fit_transform(self, keywords: list[dict[str, float]]) -> np.ndarray: X = self.vectorize(keywords, fitting=True) + if self.n_components == "auto": + # Finding N components with BIC + bic_fn = partial( + decomposition_gaussian_bic, + decomp_class=NMF, + X=X, + ) + n_components = optimize_n_components(bic_fn, min_n=1, verbose=True) + self.n_components = n_components check_non_negative(X, "NMF (input X)") W, H = _initialize_nmf(X, self.n_components, random_state=self.seed) W, H, self.n_iter = NMF( @@ -339,6 +353,10 @@ def transform(self, keywords: list[dict[str, float]]): return W.astype(X.dtype) def partial_fit(self, keyword_batch: list[dict[str, float]]): + if self.n_components == "auto": + raise ValueError( + "Cannot infer number of components with BIC when online fitting the model." + ) X = self.vectorize(keyword_batch, fitting=True) try: check_non_negative(X, "NMF (input X)") @@ -365,6 +383,15 @@ def fit_transform_dynamic( n_bins = len(time_bin_edges) - 1 document_term_matrix = self.vectorize(keywords, fitting=True) check_non_negative(document_term_matrix, "NMF (input X)") + if self.n_components == "auto": + # Finding N components with BIC + bic_fn = partial( + decomposition_gaussian_bic, + decomp_class=NMF, + X=X, + ) + n_components = optimize_n_components(bic_fn, verbose=True) + self.n_components = n_components document_topic_matrix, H = _initialize_nmf( document_term_matrix, self.n_components, diff --git a/turftopic/models/gmm.py b/turftopic/models/gmm.py index 9760ed4..69ebad0 100644 --- a/turftopic/models/gmm.py +++ b/turftopic/models/gmm.py @@ -1,4 +1,6 @@ +import warnings from datetime import datetime +from functools import partial from typing import Literal, Optional, Union import numpy as np @@ -7,77 +9,103 @@ from sklearn.base import TransformerMixin from sklearn.feature_extraction.text import CountVectorizer from sklearn.mixture import BayesianGaussianMixture, GaussianMixture -from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.pipeline import Pipeline +from turftopic._datamapplot import build_datamapplot from turftopic.base import ContextualModel, Encoder from turftopic.dynamic import DynamicTopicModel from turftopic.encoders.multimodal import MultimodalEncoder -from turftopic.feature_importance import soft_ctf_idf +from turftopic.feature_importance import ( + ctf_idf, + fighting_words, + npmi, + soft_ctf_idf, +) from turftopic.multimodal import ( Image, ImageRepr, MultimodalEmbeddings, MultimodalModel, ) +from turftopic.optimization import optimize_n_components from turftopic.vectorizers.default import default_vectorizer +FEATURE_IMPORTANCE_METHODS = { + "soft-c-tf-idf": soft_ctf_idf, + "c-tf-idf": ctf_idf, + "fighting-words": fighting_words, + "npmi": partial(npmi, smoothing=2), +} +LexicalWordImportance = Literal[ + "soft-c-tf-idf", + "c-tf-idf", + "npmi", + "fighting-words", +] + class GMM(ContextualModel, DynamicTopicModel, MultimodalModel): """Multivariate Gaussian Mixture Model over document embeddings. - Models topics as mixture components. 
- - ```python - from turftopic import GMM + Models topics as mixture components. + ```python + from turftopic import GMM corpus: list[str] = ["some text", "more text", ...] - model = GMM(10, weight_prior="dirichlet_process").fit(corpus) - model.print_topics() - ``` + model = GMM(10, weight_prior="dirichlet_process").fit(corpus) + model.print_topics() + ``` - Parameters - ---------- - n_components: int - Number of topics. If you're using priors on the weight, - feel free to overshoot with this value. - encoder: str or SentenceTransformer - Model to encode documents/terms, all-MiniLM-L6-v2 is the default. - vectorizer: CountVectorizer, default None - Vectorizer used for term extraction. - Can be used to prune or filter the vocabulary. - weight_prior: 'dirichlet', 'dirichlet_process' or None, default 'dirichlet' - Prior to impose on component weights, if None, - maximum likelihood is optimized with expectation maximization, - otherwise variational inference is used. - gamma: float, default None - Concentration parameter of the symmetric prior. - By default 1/n_components is used. - Ignored when weight_prior is None. - dimensionality_reduction: TransformerMixin, default None - Optional dimensionality reduction step before GMM is run. - This is recommended for very large datasets with high dimensionality, - as the number of parameters grows vast in the model otherwise. - We recommend using PCA, as it is a linear solution, and will likely - result in Gaussian components. - For even larger datasets you can use IncrementalPCA to reduce - memory load. - random_state: int, default None - Random state to use so that results are exactly reproducible. + Parameters + ---------- + n_components: int or "auto" + Number of topics. + If "auto", the Bayesian Information criterion + will be used to estimate this quantity. + *Note that "auto" can only be used when no priors as specified*. + encoder: str or SentenceTransformer + Model to encode documents/terms, all-MiniLM-L6-v2 is the default. + vectorizer: CountVectorizer, default None + Vectorizer used for term extraction. + Can be used to prune or filter the vocabulary. + weight_prior: 'dirichlet', 'dirichlet_process' or None, default 'dirichlet' + Prior to impose on component weights, if None, + maximum likelihood is optimized with expectation maximization, + otherwise variational inference is used. + gamma: float, default None + Concentration parameter of the symmetric prior. + By default 1/n_components is used. + Ignored when weight_prior is None. + dimensionality_reduction: TransformerMixin, default None + Optional dimensionality reduction step before GMM is run. + This is recommended for very large datasets with high dimensionality, + as the number of parameters grows vast in the model otherwise. + We recommend using PCA, as it is a linear solution, and will likely + result in Gaussian components. + For even larger datasets you can use IncrementalPCA to reduce + memory load. + feature_importance: LexicalWordImportance, default 'soft-c-tf-idf' + Feature importance method to use. + *Note that only lexical methods can be used with GMM, + not embedding-based ones.* + random_state: int, default None + Random state to use so that results are exactly reproducible. - Attributes - ---------- - weights_: ndarray of shape (n_components) - Weights of the different mixture components. + Attributes + ---------- + weights_: ndarray of shape (n_components) + Weights of the different mixture components. 
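+
+    For example, letting BIC choose the number of topics
+    (only possible when `weight_prior` is None):
+
+    ```python
+    from turftopic import GMM
+
+    corpus: list[str] = ["some text", "more text", ...]
+
+    model = GMM("auto").fit(corpus)
+    model.print_topics()
+    ```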
""" def __init__( self, - n_components: int, + n_components: Union[int, Literal["auto"]], encoder: Union[ Encoder, str, MultimodalEncoder ] = "sentence-transformers/all-MiniLM-L6-v2", vectorizer: Optional[CountVectorizer] = None, dimensionality_reduction: Optional[TransformerMixin] = None, + feature_importance: LexicalWordImportance = "soft-c-tf-idf", weight_prior: Literal["dirichlet", "dirichlet_process", None] = None, gamma: Optional[float] = None, random_state: Optional[int] = None, @@ -96,7 +124,63 @@ def __init__( self.vectorizer = default_vectorizer() else: self.vectorizer = vectorizer + if feature_importance not in FEATURE_IMPORTANCE_METHODS: + valid = list(FEATURE_IMPORTANCE_METHODS.keys()) + raise ValueError( + f"{feature_importance} not in list of valid feature importance methods: {valid}" + ) + self.feature_importance = feature_importance self.dimensionality_reduction = dimensionality_reduction + if (self.n_components == "auto") and (self.weight_prior is not None): + raise ValueError( + "You cannot use N='auto' with a prior. Try setting weight_prior=None." + ) + + def estimate_components( + self, + feature_importance: Optional[LexicalWordImportance] = None, + doc_topic_matrix=None, + doc_term_matrix=None, + ) -> np.ndarray: + feature_importance = feature_importance or self.feature_importance + imp_fn = FEATURE_IMPORTANCE_METHODS[feature_importance] + doc_topic_matrix = ( + doc_topic_matrix + if doc_topic_matrix is not None + else self.doc_topic_matrix + ) + doc_term_matrix = ( + doc_term_matrix + if doc_term_matrix is not None + else self.doc_term_matrix + ) + self.components_ = imp_fn(doc_topic_matrix, doc_term_matrix) + return self.components_ + + def _create_bic(self, embeddings: np.ndarray): + def f_bic(n_components: int): + random_state = 42 + success = False + n_tries = 1 + while not success and (n_tries <= 5): + try: + # This can sometimes run into problems especially + # with covariance estimation + model = GaussianMixture( + n_components, random_state=self.random_state + ) + model.fit(embeddings) + success = True + except Exception: + random_state += 1 + n_tries += 1 + if n_tries > 5: + return 0 + return model.bic(embeddings) + + return f_bic + + def _init_model(self, n_components: int): if self.weight_prior is not None: mixture = BayesianGaussianMixture( n_components=n_components, @@ -105,17 +189,14 @@ def __init__( if self.weight_prior == "dirichlet" else "dirichlet_process" ), - weight_concentration_prior=gamma, + weight_concentration_prior=self.gamma, random_state=self.random_state, ) else: mixture = GaussianMixture( n_components, random_state=self.random_state ) - if dimensionality_reduction is not None: - self.gmm_ = make_pipeline(dimensionality_reduction, mixture) - else: - self.gmm_ = mixture + return mixture def fit_transform( self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None @@ -126,22 +207,34 @@ def fit_transform( status.update("Encoding documents") embeddings = self.encoder_.encode(raw_documents) console.log("Documents encoded.") + self.embeddings = embeddings status.update("Extracting terms.") - document_term_matrix = self.vectorizer.fit_transform(raw_documents) + self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents) console.log("Term extraction done.") + X = embeddings + if self.dimensionality_reduction is not None: + status.update("Reducing embedding dimensionality.") + X = self.dimensionality_reduction.fit_transform(embeddings) + console.log("Dimensionality reduction complete.") + self.reduced_embeddings = X + 
n_components = self.n_components + if self.n_components == "auto": + status.update("Finding optimal value of N") + f_bic = self._create_bic(X) + n_components = optimize_n_components(f_bic, verbose=True) + console.log(f"Found optimal N={n_components}.") status.update("Fitting mixture model.") - self.gmm_.fit(embeddings) + self.gmm_ = self._init_model(n_components) + self.gmm_.fit(X) console.log("Mixture model fitted.") status.update("Estimating term importances.") - document_topic_matrix = self.gmm_.predict_proba(embeddings) - self.components_ = soft_ctf_idf( - document_topic_matrix, document_term_matrix - ) + self.doc_topic_matrix = self.gmm_.predict_proba(X) + self.components_ = self.estimate_components() console.log("Model fitting done.") self.top_documents = self.get_top_documents( - raw_documents, document_topic_matrix=document_topic_matrix + raw_documents, document_topic_matrix=self.doc_topic_matrix ) - return document_topic_matrix + return self.doc_topic_matrix def fit_transform_multimodal( self, @@ -161,31 +254,49 @@ def fit_transform_multimodal( ) console.log("Documents encoded.") status.update("Extracting terms.") - document_term_matrix = self.vectorizer.fit_transform(raw_documents) + self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents) console.log("Term extraction done.") + X = self.multimodal_embeddings["document_embeddings"] + if self.dimensionality_reduction is not None: + status.update("Reducing embedding dimensionality.") + X = self.dimensionality_reduction.fit_transform(embeddings) + console.log("Dimensionality reduction complete.") + n_components = self.n_components + if self.n_components == "auto": + status.update("Finding optimal value of N") + f_bic = self._create_bic(X) + n_components = optimize_n_components(f_bic, verbose=True) + console.log(f"Found optimal N={n_components}.") status.update("Fitting mixture model.") - self.gmm_.fit(self.multimodal_embeddings["document_embeddings"]) + self.gmm_ = self._init_model(n_components) + self.gmm_.fit(X) console.log("Mixture model fitted.") status.update("Estimating term importances.") - document_topic_matrix = self.gmm_.predict_proba( - self.multimodal_embeddings["document_embeddings"] - ) - self.components_ = soft_ctf_idf( - document_topic_matrix, document_term_matrix - ) + self.doc_topic_matrix = self.gmm_.predict_proba(X) + self.components_ = self.estimate_components() console.log("Model fitting done.") - self.image_topic_matrix = self.transform( - raw_documents, - embeddings=self.multimodal_embeddings["document_embeddings"], - ) + try: + self.image_topic_matrix = self.transform( + raw_documents, + embeddings=self.multimodal_embeddings["image_embeddings"], + ) + except Exception as e: + warnings.warn( + f"Couldn't produce image topic matrix due to exception: {e}, using doc-topic matrix." 
+ ) + self.image_topic_matrix = self.doc_topic_matrix self.top_images: list[list[Image.Image]] = self.collect_top_images( images, self.image_topic_matrix ) self.top_documents = self.get_top_documents( - raw_documents, document_topic_matrix=document_topic_matrix + raw_documents, document_topic_matrix=self.doc_topic_matrix ) console.log("Transformation done.") - return document_topic_matrix + return self.doc_topic_matrix + + @property + def labels_(self): + return np.argmax(self.doc_topic_matrix, axis=1) @property def weights_(self) -> np.ndarray: @@ -214,6 +325,8 @@ def transform( """ if embeddings is None: embeddings = self.encoder_.encode(raw_documents) + if self.dimensionality_reduction is not None: + embeddings = self.dimensionality_reduction.transform(embeddings) return self.gmm_.predict_proba(embeddings) def fit_transform_dynamic( @@ -234,11 +347,11 @@ def fit_transform_dynamic( doc_topic_matrix = self.fit_transform( raw_documents, embeddings=embeddings ) - document_term_matrix = self.vectorizer.transform(raw_documents) + self.doc_term_matrix = self.vectorizer.transform(raw_documents) n_comp, n_vocab = self.components_.shape n_bins = len(self.time_bin_edges) - 1 self.temporal_components_ = np.zeros( - (n_bins, n_comp, n_vocab), dtype=document_term_matrix.dtype + (n_bins, n_comp, n_vocab), dtype=self.doc_term_matrix.dtype ) self.temporal_importance_ = np.zeros((n_bins, n_comp)) for i_timebin in np.unique(time_labels): @@ -247,10 +360,153 @@ def fit_transform_dynamic( ) # Normalizing topic_importances = topic_importances / topic_importances.sum() - components = soft_ctf_idf( - doc_topic_matrix[time_labels == i_timebin], - document_term_matrix[time_labels == i_timebin], # type: ignore + components = self.estimate_components( + doc_topic_matrix=doc_topic_matrix[time_labels == i_timebin], + doc_term_matrix=self.doc_term_matrix[time_labels == i_timebin], # type: ignore ) self.temporal_components_[i_timebin] = components self.temporal_importance_[i_timebin] = topic_importances return doc_topic_matrix + + def plot_components_datamapplot( + self, + coordinates: Optional[np.ndarray] = None, + hover_text: Optional[list[str]] = None, + **kwargs, + ): + """Creates an interactive browser plot of the topics in your data using datamapplot. + + Parameters + ---------- + coordinates: np.ndarray, default None + Lower dimensional projection of the embeddings. + If None, will try to use the projections from the + dimensionality_reduction method of the model. + hover_text: list of str, optional + Text to show when hovering over a document. + + Returns + ------- + plot + Interactive datamap plot, you can call the `.show()` method to + display it in your default browser or save it as static HTML using `.write_html()`. + """ + if coordinates is None: + if not hasattr(self, "reduced_embeddings"): + raise ValueError( + "Coordinates not specified, but the model does not contain reduced embeddings." 
+ ) + coordinates = self.reduced_embeddings[:, (0, 1)] + labels = np.argmax(self.doc_topic_matrix, axis=1) + plot = build_datamapplot( + coordinates=coordinates, + topic_names=self.topic_names, + labels=labels, + classes=np.arange(self.gmm_.n_components), + top_words=self.get_top_words(), + hover_text=hover_text, + topic_descriptions=getattr(self, "topic_descriptions", None), + **kwargs, + ) + return plot + + def plot_density( + self, hover_text: list[str] = None, show_points=False, light_mode=False + ): + try: + import plotly.graph_objects as go + except (ImportError, ModuleNotFoundError) as e: + raise ModuleNotFoundError( + "Please install plotly if you intend to use plots in Turftopic." + ) from e + + if not hasattr(self, "reduced_embeddings"): + raise ValueError( + "No reduced embeddings found, can't display in 2d space." + ) + if self.reduced_embeddings.shape[1] != 2: + warnings.warn( + "Embeddings are not in 2d space, only using first 2 dimensions" + ) + + coord_min, coord_max = np.min(self.reduced_embeddings), np.max( + self.reduced_embeddings + ) + coord_spread = coord_max - coord_min + coord_min = coord_min - coord_spread * 0.05 + coord_max = coord_max + coord_spread * 0.05 + coord = np.linspace(coord_min, coord_max, num=100) + z = [] + for yval in coord: + points = np.stack([coord, np.full(coord.shape, yval)]).T + prob = np.exp(self.gmm_.score_samples(points)) + z.append(prob) + z = np.stack(z) + color_grid = [0.0, 0.25, 0.5, 0.75, 1.0] + colorscale = [ + "#01014B", + "#000080", + "#5D5DEF", + "#B7B7FF", + "#ffffff", + ] + if light_mode: + colorscale = colorscale[::-1] + traces = [ + go.Contour( + z=z, + colorscale=list(zip(color_grid, colorscale)), + showscale=False, + x=coord, + y=coord, + hoverinfo="skip", + ), + ] + if show_points: + scatter = go.Scatter( + x=self.reduced_embeddings[:, 0], + y=self.reduced_embeddings[:, 1], + mode="markers", + showlegend=False, + text=hover_text, + marker=dict( + symbol="circle", + opacity=0.5, + color="white", + size=8, + line=dict(width=1), + ), + ) + traces.append(scatter) + fig = go.Figure(data=traces) + fig = fig.update_layout( + showlegend=False, margin=dict(r=0, l=0, t=0, b=0) + ) + fig = fig.update_xaxes(showticklabels=False) + fig = fig.update_yaxes(showticklabels=False) + for mean, name, keywords in zip( + self.gmm_.means_, self.topic_names, self.get_top_words() + ): + _keys = "" + for i, key in enumerate(keywords): + if (i % 5) == 0: + _keys += "
" + _keys += key + if i < (len(keywords) - 1): + _keys += "," + _keys += " " + text = f"{name} {_keys} " + fig.add_annotation( + text=text, + x=mean[0], + y=mean[1], + align="left", + showarrow=False, + xshift=0, + yshift=50, + font=dict(family="Roboto Mono", size=18, color="black"), + bgcolor="rgba(255,255,255,0.9)", + bordercolor="black", + borderwidth=2, + ) + return fig diff --git a/turftopic/models/keynmf.py b/turftopic/models/keynmf.py index 3d26b9b..c9618a7 100644 --- a/turftopic/models/keynmf.py +++ b/turftopic/models/keynmf.py @@ -43,8 +43,10 @@ class KeyNMF(ContextualModel, DynamicTopicModel, MultimodalModel): Parameters ---------- - n_components: int + n_components: int or "auto" Number of topics. + Auto selects the number of topics using + the Bayesian Information Criterion. encoder: str or SentenceTransformer Model to encode documents/terms, all-MiniLM-L6-v2 is the default. vectorizer: CountVectorizer, default None @@ -71,7 +73,7 @@ class KeyNMF(ContextualModel, DynamicTopicModel, MultimodalModel): def __init__( self, - n_components: int, + n_components: Union[int, Literal["auto"]], encoder: Union[ Encoder, str, MultimodalEncoder ] = "sentence-transformers/all-MiniLM-L6-v2", diff --git a/turftopic/models/topeax.py b/turftopic/models/topeax.py new file mode 100644 index 0000000..7f17608 --- /dev/null +++ b/turftopic/models/topeax.py @@ -0,0 +1,211 @@ +from typing import Optional, Union + +import numpy as np +from scipy.ndimage import ( + binary_erosion, + generate_binary_structure, + maximum_filter, +) +from scipy.stats import gaussian_kde +from sklearn.base import BaseEstimator, ClusterMixin +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.manifold import TSNE +from sklearn.metrics.pairwise import cosine_similarity +from sklearn.mixture import GaussianMixture +from sklearn.mixture._gaussian_mixture import ( + _compute_precision_cholesky, + _estimate_gaussian_parameters, +) + +from turftopic.base import Encoder +from turftopic.encoders.multimodal import MultimodalEncoder +from turftopic.models.gmm import GMM, LexicalWordImportance + + +def detect_peaks(image): + # define an 8-connected neighborhood + neighborhood = generate_binary_structure(2, 25) + local_max = maximum_filter(image, footprint=neighborhood) == image + # local_max is a mask that contains the peaks we are + # looking for, but also the background. + # In order to isolate the peaks we must remove the background from the mask. + # we create the mask of the background + background = image == 0 + # a little technicality: we must erode the background in order to + # successfully subtract it form local_max, otherwise a line will + # appear along the background border (artifact of the local maximum filter) + eroded_background = binary_erosion( + background, structure=neighborhood, border_value=1 + ) + # we obtain the final mask, containing only peaks, + # by removing the background from the local_max mask (xor operation) + detected_peaks = local_max ^ eroded_background + return detected_peaks + + +class FixedMeanGaussianMixture(GaussianMixture): + def _m_step(self, X, log_resp): + # Skipping mean update + self.weights_, _, self.covariances_ = _estimate_gaussian_parameters( + X, np.exp(log_resp), self.reg_covar, self.covariance_type + ) + self.weights_ /= self.weights_.sum() + self.precisions_cholesky_ = _compute_precision_cholesky( + self.covariances_, self.covariance_type + ) + + +class Peax(ClusterMixin, BaseEstimator): + """Clustering model based on density peaks. 
+ + Parameters + ---------- + random_state: int, default None + Random seed to use for fitting gaussian mixture to peaks. + """ + + def __init__(self, random_state: Optional[int] = None): + self.random_state = random_state + + def fit(self, X, y=None): + self.X_range = np.min(X), np.max(X) + self.density = gaussian_kde(X.T, "scott") + coord = np.linspace(*self.X_range, num=100) + z = [] + for yval in coord: + points = np.stack([coord, np.full(coord.shape, yval)]).T + prob = np.exp(self.density.logpdf(points.T)) + z.append(prob) + z = np.stack(z) + peaks = detect_peaks(z.T) + peak_ind = np.nonzero(peaks) + peak_pos = np.stack([coord[peak_ind[0]], coord[peak_ind[1]]]).T + weights = self.density.pdf(peak_pos.T) + weights = weights / weights.sum() + self.gmm_ = FixedMeanGaussianMixture( + peak_pos.shape[0], + means_init=peak_pos, + weights_init=weights, + random_state=self.random_state, + ) + self.labels_ = self.gmm_.fit_predict(X) + # Checking whether there are close to zero components + is_zero = np.isclose(self.gmm_.weights_, 0) + n_zero = np.sum(is_zero) + if n_zero > 0: + print( + f"{n_zero} components have zero weight, removing them and refitting." + ) + peak_pos = peak_pos[~is_zero] + weights = self.gmm_.weights_[~is_zero] + weights = weights / weights.sum() + self.gmm_ = FixedMeanGaussianMixture( + peak_pos.shape[0], + means_init=peak_pos, + weights_init=weights, + random_state=self.random_state, + ) + self.labels_ = self.gmm_.fit_predict(X) + self.classes_ = np.sort(np.unique(self.labels_)) + self.means_ = self.gmm_.means_ + self.weights_ = self.gmm_.weights_ + return self.labels_ + + @property + def n_components(self) -> int: + return self.gmm_.n_components + + def predict_proba(self, X): + return self.gmm_.predict_proba(X) + + def score_samples(self, X): + return self.density.logpdf(X.T) + + def score(self, X): + return np.mean(self.score_samples(X)) + + +class Topeax(GMM): + """Topic model based on the Peax clustering algorithm. + The algorithm discovers the number of topics automatically, and is based on GMM. + + Parameters + ---------- + encoder: str or SentenceTransformer + Model to encode documents/terms, all-MiniLM-L6-v2 is the default. + vectorizer: CountVectorizer, default None + Vectorizer used for term extraction. + Can be used to prune or filter the vocabulary. + perplexity: int, default 50 + Number of neighbours to take into account when running TSNE. + random_state: int, default None + Random state to use so that results are exactly reproducible. 
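+
+    A quick sketch of typical usage (placeholder corpus; `plot_density` requires
+    plotly and shows the estimated density surface together with the detected peaks):
+
+    ```python
+    from turftopic import Topeax
+
+    corpus: list[str] = ["some text", "more text", ...]
+
+    model = Topeax().fit(corpus)
+    model.print_topics()
+    model.plot_density(show_points=True).show()
+    ```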
+ + """ + + def __init__( + self, + encoder: Union[ + Encoder, str, MultimodalEncoder + ] = "sentence-transformers/all-MiniLM-L6-v2", + vectorizer: Optional[CountVectorizer] = None, + perplexity: int = 50, + random_state: Optional[int] = None, + ): + dimensionality_reduction = TSNE( + 2, + metric="cosine", + perplexity=perplexity, + random_state=random_state, + ) + super().__init__( + n_components=0, + encoder=encoder, + vectorizer=vectorizer, + dimensionality_reduction=dimensionality_reduction, + random_state=random_state, + ) + + def estimate_components( + self, + feature_importance: Optional[LexicalWordImportance] = None, + doc_topic_matrix=None, + doc_term_matrix=None, + ) -> np.ndarray: + doc_topic_matrix = ( + doc_topic_matrix + if doc_topic_matrix is not None + else self.doc_topic_matrix + ) + doc_term_matrix = ( + doc_term_matrix + if doc_term_matrix is not None + else self.doc_term_matrix + ) + lexical_components = super().estimate_components( + "npmi", doc_topic_matrix, doc_term_matrix + ) + vocab = self.get_vocab() + if getattr(self, "vocab_embeddings", None) is None or ( + self.vocab_embeddings.shape[0] != vocab.shape[0] + ): + self.vocab_embeddings = self.encode_documents(vocab) + topic_embeddings = [] + for weight in doc_topic_matrix.T: + topic_embeddings.append( + np.average(self.embeddings, axis=0, weights=weight) + ) + self.topic_embeddings = np.stack(topic_embeddings) + semantic_components = cosine_similarity( + self.topic_embeddings, self.vocab_embeddings + ) + # Transforming to positive values from 0 to 1 + # Then taking geometric average of the two values + self.components_ = np.sqrt( + ((1 + lexical_components) / 2) * ((1 + semantic_components) / 2) + ) + return self.components_ + + def _init_model(self, n_components: int): + mixture = Peax() + return mixture diff --git a/turftopic/optimization.py b/turftopic/optimization.py new file mode 100644 index 0000000..76763a0 --- /dev/null +++ b/turftopic/optimization.py @@ -0,0 +1,114 @@ +from functools import cache +from typing import Callable + +import numpy as np +from scipy import optimize +from scipy.sparse import issparse + + +def decomposition_gaussian_bic( + n_components: int, decomp_class, X, random_state: int = 42 +) -> float: + """Computes Bayesian information criterion for + a decomposition model with assumed Gaussian noise. + + Parameters + ---------- + n_components: int + Number of components in the model. + decomp_class + Class of the decomposition model, e.g. `decomp_class=FastICA`. + X: array + Training data to decompose. + random_state: int, default 42 + Seed to pass to the decomposition algorithm. + """ + decomp = decomp_class(n_components=n_components, random_state=42) + doc_topic = decomp.fit_transform(X) + if not hasattr(decomp, "reconstruction_err_"): + X_reconstruction = decomp.inverse_transform(doc_topic) + # Computing residual sum of squares + if issparse(X): + X = np.asarray(X.todense()) + rss = np.sum(np.square(X - X_reconstruction)) + else: + rss = np.square(decomp.reconstruction_err_) + n_params = decomp.components_.size + np.prod(doc_topic.shape) + n_datapoints = np.prod(X.shape) + # Computing Bayesian information criterion assuming IID + bic = n_params * np.log(n_datapoints) + n_datapoints * np.log( + rss / n_datapoints + ) + return bic + + +def optimize_n_components( + f_ic: Callable[int, float], min_n: int = 2, max_n: int = 250, verbose=False +) -> int: + """Optimizes the nuber of components using the Brent minimum finding + algorithm given an information criterion. 
+ The bracket is found by exponentially incrementing n_components. + + Parameters + ---------- + f_ic: Callable[int, float] + Information criterion given N components. + min_n: int + Minimum value of N. + max_n: int + Maximum value of N, in case the algorithm doesn't converge. + """ + if verbose: + print("Optimizing N based on an information criterion...") + + # Caching and adding debugging statements + @cache + def _f_ic(n_components) -> float: + # Making sure n_components is an integer + # The optimization algorithm definitely give it a float + n_components = int(n_components) + val = f_ic(n_components) + if verbose: + print(f" - IC(N={n_components})={val:.2f}") + return val + + # Finding bracket + if verbose: + print(" - Finding bracket...") + low = min_n + n_comp = 2 + while not _f_ic(n_comp) < _f_ic(min_n): + n_comp += 1 + if n_comp >= 10: + if verbose: + print( + " - Couldn't find lower value than n=1 up to n=10, stopping." + ) + return 1 + middle = n_comp + current = _f_ic(middle) + inc = 5 + while not current > _f_ic(middle): + n_comp += inc + if n_comp >= max_n: + if verbose: + print(f" - Bracket didn't converge, returning max N: {max_n}") + return max_n + current = _f_ic(n_comp) + if current < _f_ic(middle): + low = n_comp - inc + middle = n_comp + inc *= 2 + bracket = low, middle, n_comp + if verbose: + print(f" - Running optimization with bracket: {bracket}") + # Optimizing + res = optimize.minimize_scalar( + _f_ic, + method="brent", + bracket=(low, middle, n_comp), + options=dict(xtol=0.2), + ) + if verbose: + print(f" - Converged after {res.nit} iterations.") + return int(res.x)