
Commit bc3ddfb

Merge pull request #110 from AnFreTh/main
Releasing version 0.2.0
2 parents dc2d4db + 50da9a3 commit bc3ddfb

File tree

10 files changed: +119, -79 lines changed


README.md

Lines changed: 15 additions & 9 deletions
@@ -73,22 +73,28 @@ You can install STREAM directly from PyPI or from the GitHub repository:
 
 1. **PyPI (Recommended)**:
    ```bash
-   pip install stream_topic
+   pip install stream-topic
    ```
 
 2. **GitHub**:
    ```bash
    pip install git+https://github.com/AnFreTh/STREAM.git
    ```
 
-3. **Download NLTK Resources**:
-   Ensure you have the necessary NLTK resources installed:
-   ```python
-   import nltk
-   nltk.download('stopwords')
-   nltk.download('punkt')
-   nltk.download('wordnet')
-   nltk.download('averaged_perceptron_tagger')
+3. **Install requirements for add-ons**:
+   To use STREAMS visualizations, simply run:
+   ```bash
+   pip install stream-topic[plotting]
+   ```
+
+   For BERTopic, run:
+   ```bash
+   pip install stream-topic[hdbscan]
+   ```
+
+   For DCTE:
+   ```bash
+   pip install stream-topic[dcte]
    ```
 
 # 📦 Available Models
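A note beyond this diff: pip can install several extras in one command, so the add-ons introduced above can be pulled in together. A minimal example, assuming the extra names defined in the extras_require block added to setup.py below:

```bash
# Quoting the requirement avoids shell expansion of the square brackets.
pip install "stream-topic[plotting,bertopic,dcte]"
```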

requirements.txt

Lines changed: 14 additions & 28 deletions
@@ -1,42 +1,28 @@
 # basics
-numpy<=1.26.4
+numpy
 pandas
-pyarrow
-scikit-learn==1.1.0
-scipy==1.10.1
+scikit-learn
+nltk
+datasets
 
 # dl
-lightning==2.3.3
-torch==2.4.0
+lightning
+torch
+sentence_transformers
 
 # nlp
-transformers==4.40.2
-setfit==1.0.3
-gensim==4.2.0
-umap-learn==0.5.6
-wordcloud==1.9.3
+transformers
+gensim
+umap-learn
+
 
 community
-networkx==3.3
+networkx
 python_louvain
 langdetect
-hdbscan==0.8.37
-huggingface_hub==0.23.5
-# nltk
-# datasets==2.20.0
-# sentence-transformers==3.0.1
-
 
-# plotting
-dash
-plotly
-matplotlib
 
 # misc
 loguru
-ipywidgets
-ipykernel<6.22.0
-# tqdm
-# pre-commit
-optuna==3.6.1
-optuna-integration==3.6.0
+optuna
+optuna-integration

setup.py

Lines changed: 33 additions & 3 deletions
@@ -2,8 +2,8 @@
 # -*- coding: utf-8 -*-
 import os
 from pathlib import Path
-
 from setuptools import find_packages, setup
+from setuptools.command.install import install
 
 # Package meta-data.
 NAME = "stream_topic"
@@ -12,7 +12,27 @@
 DOCS = "https://stream.readthedocs.io/en/"
 
 AUTHOR = "Anton Thielmann"
-REQUIRES_PYTHON = ">=3.6, <=3.11"
+REQUIRES_PYTHON = ">=3.6"
+
+
+class PostInstallCommand(install):
+    """Post-installation for downloading NLTK resources."""
+
+    def run(self):
+        install.run(self)
+        try:
+            import nltk
+
+            nltk.download("stopwords")
+            nltk.download("wordnet")
+            nltk.download("punkt_tab")
+            nltk.download("brown")
+            nltk.download("averaged_perceptron_tagger_eng")
+        except ImportError:
+            print(
+                "NLTK not installed. Ensure it is listed in install_requires or installed separately."
+            )
+
 
 # Load the package's verison file and its content.
 ROOT_DIR = Path(__file__).resolve().parent
@@ -30,6 +50,13 @@
     if not line.startswith("#") and not line.startswith("git+")
 ]
 
+extras_require = {
+    "plotting": ["dash", "plotly", "matplotlib", "wordcloud"],
+    "bertopic": ["hdbscan"],
+    "dcte": ["pyarrow", "setfit"],
+}
+
+
 # get long description from readme file
 with open(os.path.join(ROOT_DIR, "README.md")) as f:
     LONG_DESCRIPTION = f.read()
@@ -45,7 +72,7 @@
     author_email=EMAIL,
     python_requires=REQUIRES_PYTHON,
     install_requires=install_reqs,
-    # extras_require=extras_reqs,
+    extras_require=extras_require,
     license="MIT",  # adapt based on your needs
     packages=find_packages(exclude=["examples", "examples.*", "tests", "tests.*"]),
     include_package_data=True,
@@ -65,4 +92,7 @@
     ],
     project_urls={"Documentation": DOCS},
     url=HOMEPAGE,
+    cmdclass={
+        "install": PostInstallCommand,
+    },
 )
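A caveat worth keeping in mind with this approach: a cmdclass install hook such as PostInstallCommand only runs when setup.py is actually executed (for example an install from source or an sdist); installs from a prebuilt wheel typically skip it. If the NLTK resources turn out to be missing at runtime, they can be fetched manually; a minimal sketch mirroring the resource list above:

```python
import nltk

# Same resources PostInstallCommand downloads; run once per environment.
for resource in [
    "stopwords",
    "wordnet",
    "punkt_tab",
    "brown",
    "averaged_perceptron_tagger_eng",
]:
    nltk.download(resource)
```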

stream_topic/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.1.9"
+__version__ = "0.2.0"

stream_topic/metrics/coherence_metrics.py

Lines changed: 20 additions & 13 deletions
@@ -355,6 +355,26 @@ def __init__(
 
         self.n_words = n_words
 
+    def get_info(self):
+        """
+        Get information about the metric.
+
+        Returns
+        -------
+        dict
+            Dictionary containing model information including metric name,
+            number of top words, number of intruders, embedding model name,
+            metric range and metric description.
+        """
+
+        info = {
+            "metric_name": "Embedding Coherence",
+            "n_words": self.n_words,
+            "description": "Embedding Coherence coherence",
+        }
+
+        return info
+
     def score_per_topic(self, topics):
         """
         Calculates coherence scores for each topic individually based on embedding similarities.
@@ -414,16 +434,3 @@ def score(self, topics):
         """
         res = self.score_per_topic(topics).values()
         return sum(res) / len(res)
-
-
-def _load_default_texts():
-    """
-    Loads default general texts
-
-    Returns
-    -------
-    result : default 20newsgroup texts
-    """
-    dataset = Dataset()
-    dataset.fetch_dataset("20NewsGroup")
-    return dataset.get_corpus()

stream_topic/models/DCTE.py

Lines changed: 19 additions & 5 deletions
@@ -4,12 +4,8 @@
 import pyarrow as pa
 from datasets import Dataset
 from loguru import logger
-from sentence_transformers.losses import CosineSimilarityLoss
-from setfit import SetFitModel, TrainingArguments
-from setfit import Trainer as SetfitTrainer
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import OneHotEncoder
-
 from ..commons.check_steps import check_dataset_steps
 from ..preprocessor._tf_idf import c_tf_idf, extract_tfidf_topics
 from ..utils.dataset import TMDataset
@@ -21,6 +17,19 @@
 # logger.add(f"{MODEL_NAME}_{time}.log", backtrace=True, diagnose=True)
 
 
+def import_setfit():
+    try:
+        from setfit import SetFitModel, TrainingArguments
+        from setfit import Trainer as SetfitTrainer
+        from sentence_transformers.losses import CosineSimilarityLoss
+
+        return SetFitModel, TrainingArguments, SetfitTrainer, CosineSimilarityLoss
+    except ImportError as e:
+        raise ImportError(
+            "Setfit is not installed. Please install it by running 'pip install setfit'."
+        ) from e
+
+
 class DCTE(BaseModel):
     """
     A document classification and topic extraction class that utilizes the SetFitModel for
@@ -62,6 +71,9 @@ def __init__(
         )
         self.n_topics = None
 
+        # Lazy import SetFit components
+        SetFitModel, _, _, _ = import_setfit()
+
         self.model = SetFitModel.from_pretrained(f"sentence-transformers/{model}")
         self._status = TrainingStatus.NOT_STARTED
         self.n_topics = None
@@ -122,7 +134,7 @@ def _get_topic_representation(self, predict_df: pd.DataFrame, top_words: int):
             n=top_words,
         )
 
-        one_hot_encoder = OneHotEncoder(sparse=False)
+        one_hot_encoder = OneHotEncoder(sparse_output=False)
         predictions_one_hot = one_hot_encoder.fit_transform(predict_df[["predictions"]])
 
         beta = tfidf
@@ -154,6 +166,8 @@
         dict: A dictionary containing the extracted topics and the topic-word matrix.
         """
 
+        _, TrainingArguments, SetfitTrainer, CosineSimilarityLoss = import_setfit()
+
         assert isinstance(
             dataset, TMDataset
         ), "The dataset must be an instance of TMDataset."
stream_topic/models/KmeansTM.py

Lines changed: 1 addition & 1 deletion
@@ -220,7 +220,7 @@ def fit(
         )
         self.topic_dict = extract_tfidf_topics(tfidf, count, docs_per_topic, n=100)
 
-        one_hot_encoder = OneHotEncoder(sparse=False)
+        one_hot_encoder = OneHotEncoder(sparse_output=False)
         predictions_one_hot = one_hot_encoder.fit_transform(
             self.dataframe[["predictions"]]
         )
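The sparse=False to sparse_output=False change here (and in DCTE.py, bertopicTM.py, and cbc.py) tracks scikit-learn's API: the keyword was renamed to sparse_output in scikit-learn 1.2 and the old spelling was removed in a later release, which the now-unpinned scikit-learn requirement above can resolve to. A minimal standalone sketch of the call as it is now written:

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Toy cluster labels standing in for the model's "predictions" column.
predictions = np.array([[0], [2], [1], [2]])

# sparse_output=False returns a dense ndarray rather than a scipy sparse matrix,
# matching how the one-hot matrix is consumed downstream.
encoder = OneHotEncoder(sparse_output=False)
one_hot = encoder.fit_transform(predictions)
print(one_hot.shape)  # (4, 3)
```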

stream_topic/models/abstract_helper_models/base.py

Lines changed: 4 additions & 3 deletions
@@ -3,12 +3,9 @@
 import pickle
 from abc import ABC, abstractmethod
 from enum import Enum
-
-import optuna
 import torch.nn as nn
 import umap.umap_ as umap
 from loguru import logger
-from optuna.integration import PyTorchLightningPruningCallback
 
 
 class BaseModel(ABC):
@@ -340,6 +337,10 @@ def optimize_hyperparameters(
         dict
             Dictionary containing the best parameters and the optimal number of topics.
         """
+        import importlib
+
+        optuna = importlib.import_module("optuna")
+
         assert criterion in [
             "aic",
             "bic",

stream_topic/models/bertopicTM.py

Lines changed: 5 additions & 3 deletions
@@ -1,6 +1,4 @@
 from datetime import datetime
-
-import hdbscan
 import numpy as np
 from loguru import logger
 from sklearn.preprocessing import OneHotEncoder
@@ -121,6 +119,10 @@ def _clustering(self):
         Applies K-Means clustering to the reduced embeddings.
         """
 
+        import importlib
+
+        hdbscan = importlib.import_module("hdbscan")
+
         assert (
             hasattr(self, "reduced_embeddings") and self.reduced_embeddings is not None
         ), "Reduced embeddings must be generated before clustering."
@@ -192,7 +194,7 @@ def fit(self, dataset, n_topics=None):
 
         self.topic_dict = extract_tfidf_topics(tfidf, count, docs_per_topic, n=100)
 
-        one_hot_encoder = OneHotEncoder(sparse=False)
+        one_hot_encoder = OneHotEncoder(sparse_output=False)
         predictions_one_hot = one_hot_encoder.fit_transform(
             self.dataframe[["predictions"]]
         )
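The importlib-based deferral above (like the optuna import in base.py) keeps an optional, heavyweight dependency out of module import time: hdbscan is only needed once _clustering actually runs. A small hypothetical helper sketching the same pattern with a more actionable error message (the lazy_import name and the hint text are illustrative, not part of this commit):

```python
import importlib


def lazy_import(module_name: str, hint: str):
    """Import an optional dependency on first use, failing with an actionable message."""
    try:
        return importlib.import_module(module_name)
    except ImportError as e:
        raise ImportError(f"'{module_name}' is required for this feature. {hint}") from e


# Example: defer hdbscan until clustering is actually requested.
# hdbscan = lazy_import("hdbscan", "Install the optional clustering add-on first.")
```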

stream_topic/models/cbc.py

Lines changed: 7 additions & 13 deletions
@@ -9,8 +9,7 @@
 
 from ..commons.check_steps import check_dataset_steps
 from ..preprocessor import c_tf_idf, extract_tfidf_topics
-from ..utils.cbc_utils import (DocumentCoherence,
-                               get_top_tfidf_words_per_document)
+from ..utils.cbc_utils import DocumentCoherence, get_top_tfidf_words_per_document
 from ..utils.dataset import TMDataset
 from .abstract_helper_models.base import BaseModel, TrainingStatus
 
@@ -189,12 +188,10 @@ def fit(
             clusters = self.cluster_documents()
 
             num_clusters = len(clusters)
-            print(
-                f"Iteration {iteration}: {num_clusters} clusters formed.")
+            print(f"Iteration {iteration}: {num_clusters} clusters formed.")
 
             # Prepare for the next iteration
-            combined_documents = self.combine_documents(
-                current_documents, clusters)
+            combined_documents = self.combine_documents(current_documents, clusters)
             current_documents = combined_documents
             iteration += 1
 
@@ -247,8 +244,7 @@
         self.labels += 1
 
         # Update the 'predictions' column in the dataframe with -1 where NaN was present
-        self.dataframe["predictions"] = self.dataframe["predictions"].fillna(
-            -1)
+        self.dataframe["predictions"] = self.dataframe["predictions"].fillna(-1)
         self.dataframe["predictions"] += 1
         print("--- replaced NaN values with 0 in topics ---")
         print(
@@ -259,13 +255,11 @@
             {"text": " ".join}
         )
         logger.info("--- Extract topics ---")
-        tfidf, count = c_tf_idf(
-            docs_per_topic["text"].values, m=len(self.dataframe))
-        self.topic_dict = extract_tfidf_topics(
-            tfidf, count, docs_per_topic, n=10)
+        tfidf, count = c_tf_idf(docs_per_topic["text"].values, m=len(self.dataframe))
+        self.topic_dict = extract_tfidf_topics(tfidf, count, docs_per_topic, n=10)
 
         one_hot_encoder = OneHotEncoder(
-            sparse=False
+            sparse_output=False
         )  # Use sparse=False to get a dense array
         predictions_one_hot = one_hot_encoder.fit_transform(
             self.dataframe[["predictions"]]
