Merge pull request #86 from haesleinhuepf/wordclouds

haesleinhuepf · web-flow · commit 08a048d35037 · 2024-12-05T10:13:02.000+01:00
Wordclouds
diff --git a/README.md b/README.md
@@ -305,6 +305,13 @@ stackview.sliceplot(df, images, column_x="UMAP0", column_y="UMAP1")
 
 ![](https://raw.githubusercontent.com/haesleinhuepf/stackview/main/docs/images/sliceplot.gif)
 
+
+### Wordcloudplot
+
+If you have a pandas DataFrame with a text column and numeric columns related to that text (e.g. a two-dimensional embedding), you can use the `wordcloudplot` function to select data points in a scatter plot and visualize the selected texts as a word cloud.
+
+![img.png](https://raw.githubusercontent.com/haesleinhuepf/stackview/main/docs/images/wordcloudplot.png)
+
 ### Interact
 
 Exploration of the parameter space of image processing functions is available using `interact`:
diff --git a/docs/data/sentence_embeddings.csv b/docs/data/sentence_embeddings.csv
diff --git a/docs/data/sentence_embeddings_source.txt b/docs/data/sentence_embeddings_source.txt
@@ -0,0 +1,4 @@
+Extracted from: https://arxiv.org/abs/2204.07547 licensed CC-BY 4.0 by:
+Robert Haase, Elnaz Fazeli, David Legland, Michael Doube, Siân Culley, Ilya Belevich, Eija Jokitalo, Martin Schorb, Anna Klemm, Christian Tischer
+
+Used embedding: https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
diff --git a/docs/images/wordcloudplot.png b/docs/images/wordcloudplot.png
diff --git a/docs/wordcloudplots.ipynb b/docs/wordcloudplots.ipynb
@@ -0,0 +1,194 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "20a04b0c-06cb-4f29-8381-a6a0d4a20ccd",
+   "metadata": {},
+   "source": [
+    "# Wordcloud plots\n",
+    "For text exploration, it might make sense to visualize texts as data points and interact with them."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "8d301701-368f-4365-b555-dae6f06d8bea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import stackview\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a991df41-86ab-4188-af47-e6e0cf6d7b32",
+   "metadata": {},
+   "source": [
+    "Here we reuse a list of sentences and a [UMAP](https://umap-learn.readthedocs.io/en/latest/) produced from their text-embeddings. The sentences are taken from [Haase et al. 2022](https://arxiv.org/abs/2204.07547) which is licensed [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "bb75ed7e-aa83-4015-a74b-8dfb9405ecf1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>sentence</th>\n",
+       "      <th>UMAP0</th>\n",
+       "      <th>UMAP1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>A Hitchhiker’s Guide through the Bio-image Ana...</td>\n",
+       "      <td>-2.863276</td>\n",
+       "      <td>8.680281</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Modern research in the life sciences is unthin...</td>\n",
+       "      <td>-3.731295</td>\n",
+       "      <td>7.875060</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>In the past decade, we observed a dramatic inc...</td>\n",
+       "      <td>-4.748690</td>\n",
+       "      <td>6.128065</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>As it is increasingly difficult to keep track ...</td>\n",
+       "      <td>-4.183692</td>\n",
+       "      <td>6.847530</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>We give guidance on which aspects to consider ...</td>\n",
+       "      <td>-4.912832</td>\n",
+       "      <td>6.691180</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Unnamed: 0                                           sentence     UMAP0  \\\n",
+       "0           0  A Hitchhiker’s Guide through the Bio-image Ana... -2.863276   \n",
+       "1           1  Modern research in the life sciences is unthin... -3.731295   \n",
+       "2           2  In the past decade, we observed a dramatic inc... -4.748690   \n",
+       "3           3  As it is increasingly difficult to keep track ... -4.183692   \n",
+       "4           4  We give guidance on which aspects to consider ... -4.912832   \n",
+       "\n",
+       "      UMAP1  \n",
+       "0  8.680281  \n",
+       "1  7.875060  \n",
+       "2  6.128065  \n",
+       "3  6.847530  \n",
+       "4  6.691180  "
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(\"data/sentence_embeddings.csv\")\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b5a69daa-2282-47d6-a020-cf3f8a5539fe",
+   "metadata": {},
+   "source": [
+    "A word cloud plot is an interactive plot: you select data points in a scatter plot, and a word cloud is generated from the texts of your selection."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "99720165-0a6c-4c0a-8922-e6350b5a70f3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b30463a9357f4846827c31acb06fc0bc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "VBox(children=(HBox(children=(HBox(children=(VBox(children=(VBox(children=(HBox(children=(VBox(children=(Image…"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "stackview.wordcloudplot(df, column_text=\"sentence\", column_x=\"UMAP0\", column_y=\"UMAP1\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "39c71ab5-43f3-4768-9e99-973905082950",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="stackview",
-    version="0.12.0",
+    version="0.12.1",
     author="Robert Haase",
     author_email="robert.haase@uni-leipzig.de",
     description="Interactive image stack viewing in jupyter notebooks",
@@ -14,7 +14,7 @@
     url="https://github.com/haesleinhuepf/stackview/",
     packages=setuptools.find_packages(),
     include_package_data=True,
-    install_requires=["numpy!=1.19.4", "ipycanvas", "ipywidgets", "scikit-image", "ipyevents", "toolz", "matplotlib", "ipykernel", "imageio", "ipympl"],
+    install_requires=["numpy!=1.19.4", "ipycanvas", "ipywidgets", "scikit-image", "ipyevents", "toolz", "matplotlib", "ipykernel", "imageio", "ipympl", "wordcloud"],
     python_requires='>=3.6',
     classifiers=[
         "Programming Language :: Python :: 3",
diff --git a/stackview/__init__.py b/stackview/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.12.0"
+__version__ = "0.12.1"
 
 from ._static_view import jupyter_displayable_output, insight
 from ._utilities import merge_rgb
@@ -22,5 +22,6 @@
 from ._grid import grid
 from ._clusterplot import clusterplot
 from ._sliceplot import sliceplot
+from ._wordcloudplot import wordcloudplot
 
 
diff --git a/stackview/_wordcloudplot.py b/stackview/_wordcloudplot.py
@@ -0,0 +1,82 @@
+def wordcloudplot(df, column_x: str = "x", column_y: str = "y", column_text: str = "text",
+                  column_selection: str = "selection",
+                  figsize=(4, 4), markersize=4, width=400, height=400):
+    """
+    Visualizes a scatter plot of columns in a given dataframe next to a word cloud.
+    Per default, the dataframe should contain a column "text".
+
+    Parameters
+    ----------
+    df: pandas.DataFrame
+        The dataframe to plot
+    column_x: str, optional
+        The column to use for the x-axis
+    column_y: str, optional
+        The column to use for the y-axis
+    column_text: str, optional
+        The column to use for the text that make the word cloud
+    column_selection: str, optional
+        The column to use for the selection
+    figsize: tuple, optional
+        The size of the scatter plot figure
+    markersize: int
+        The size of the markers
+    width: int
+        The width of the word cloud
+    height: int
+        The height of the word cloud
+
+    Returns
+    -------
+    An ipywidgets widget
+    """
+    import numpy as np
+    from ._grid import grid
+    from ._curtain import curtain
+    from ._slice import slice
+    from ._scatterplot import scatterplot
+    import functools
+    from wordcloud import WordCloud
+
+    if column_selection in df.columns:
+        selected_texts = df[df['selection'] == 1][column_text]
+        text = "\n".join(selected_texts)
+    else:
+        selected_texts = df[column_text]
+        text = "\n".join(selected_texts)
+
+    wordcloud = WordCloud(colormap="twilight", background_color="white", width=width, height=height).generate(text)
+    image = wordcloud.to_image()
+    selected_image = np.array(image)
+
+    image_display = slice(selected_image)
+
+    def update(selection, df, column_text, selected_image, widget):
+        selected_texts = df[column_text][list(selection)]
+        text = "\n".join(selected_texts)
+
+        if len(text) == 0:
+            text = "empty wordcloud"
+
+        wordcloud = WordCloud(colormap="twilight", background_color="white", width=width, height=height).generate(text)
+        image = wordcloud.to_image()
+        temp = np.array(image)
+
+        # overwrite the pixels in the given image
+        np.copyto(selected_image, temp.astype(selected_image.dtype))
+
+        # redraw the visualization
+        widget.update()
+
+    update_selection = functools.partial(update, df=df, column_text=column_text, selected_image=selected_image,
+                                         widget=image_display)
+
+    scatterplot = scatterplot(df, column_x, column_y, column_selection, figsize=figsize,
+                              selection_changed_callback=update_selection, markersize=markersize)
+
+    return grid([[
+        image_display,
+        scatterplot,
+
+    ]])
+