Skip to content

Commit 61a2cd2

Browse files
authored
v0.16 (#1572)
* Zeroshot Topic Modeling
* Seed (domain-specific) words
* More LLM documentation, including Zephyr example
* Add support for Cohere's Embed v3
* Added llama.cpp
* Added HUGE changelog and up version for upcoming release
1 parent bcb3ca2 commit 61a2cd2

File tree

30 files changed

+1541
-202
lines changed

30 files changed

+1541
-202
lines changed

README.md

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,12 @@ BERTopic supports all kinds of topic modeling techniques:
3333
<tr>
3434
<td><a href="https://maartengr.github.io/BERTopic/getting_started/multimodal/multimodal.html">Multimodal</a></td>
3535
<td><a href="https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html">Multi-aspect</a></td>
36-
<td><a href="https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#text-generation-prompts">Text Generation/LLM</a></td>
36+
<td><a href="https://maartengr.github.io/BERTopic/getting_started/representation/llm.html">Text Generation/LLM</a></td>
3737
</tr>
3838
<tr>
39-
<td><a href="https://maartengr.github.io/BERTopic/getting_started/merge/merge.html">Merge Models</a></td>
40-
<td></td>
41-
<td></td>
39+
<td><a href="https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html">Zero-shot <b>(new!)</b></a></td>
40+
<td><a href="https://maartengr.github.io/BERTopic/getting_started/merge/merge.html">Merge Models <b>(new!)</b></a></td>
41+
<td><a href="https://maartengr.github.io/BERTopic/getting_started/seed_words/seed_words.html">Seed Words <b>(new!)</b></a></td>
4242
</tr>
4343
</table>
4444

@@ -159,8 +159,8 @@ import openai
159159
from bertopic.representation import OpenAI
160160

161161
# Fine-tune topic representations with GPT
162-
openai.api_key = "sk-..."
163-
representation_model = OpenAI(model="gpt-3.5-turbo", chat=True)
162+
client = openai.OpenAI(api_key="sk-...")
163+
representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True)
164164
topic_model = BERTopic(representation_model=representation_model)
165165
```
166166

@@ -259,6 +259,7 @@ There are many different use cases in which topic modeling can be used. As such,
259259
| [Dynamic Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html) | `.topics_over_time(docs, timestamps)` |
260260
| [Hierarchical Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html) | `.hierarchical_topics(docs)` |
261261
| [Guided Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/guided/guided.html) | `BERTopic(seed_topic_list=seed_topic_list)` |
262+
| [Zero-shot Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html) | `BERTopic(zeroshot_topic_list=zeroshot_topic_list)` |
262263
| [Merge Multiple Models](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html) | `BERTopic.merge_models([topic_model_1, topic_model_2])` |
263264

264265

bertopic/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from bertopic._bertopic import BERTopic
22

3-
__version__ = "0.15.0"
3+
__version__ = "0.16.0"
44

55
__all__ = [
66
"BERTopic",

bertopic/_bertopic.py

Lines changed: 335 additions & 100 deletions
Large diffs are not rendered by default.

bertopic/_utils.py

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import numpy as np
2+
import pandas as pd
23
import logging
34
from collections.abc import Iterable
45
from scipy.sparse import csr_matrix
@@ -13,7 +14,10 @@ def __init__(self, level):
1314
self.logger.propagate = False
1415

1516
def info(self, message):
16-
self.logger.info("{}".format(message))
17+
self.logger.info(f"{message}")
18+
19+
def warning(self, message):
20+
self.logger.warning(f"WARNING: {message}")
1721

1822
def set_level(self, level):
1923
levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
@@ -32,10 +36,11 @@ def _add_handler(self):
3236

3337
def check_documents_type(documents):
3438
""" Check whether the input documents are indeed a list of strings """
35-
if isinstance(documents, Iterable) and not isinstance(documents, str):
39+
if isinstance(documents, pd.DataFrame):
40+
raise TypeError("Make sure to supply a list of strings, not a dataframe.")
41+
elif isinstance(documents, Iterable) and not isinstance(documents, str):
3642
if not any([isinstance(doc, str) for doc in documents]):
3743
raise TypeError("Make sure that the iterable only contains strings.")
38-
3944
else:
4045
raise TypeError("Make sure that the documents variable is an iterable containing strings only.")
4146

@@ -94,15 +99,16 @@ def __getattr__(self, *args, **kwargs):
9499
def __call__(self, *args, **kwargs):
95100
raise ModuleNotFoundError(self.msg)
96101

102+
97103
def validate_distance_matrix(X, n_samples):
98104
""" Validate the distance matrix and convert it to a condensed distance matrix
99105
if necessary.
100106
101-
A valid distance matrix is either a square matrix of shape (n_samples, n_samples)
102-
with zeros on the diagonal and non-negative values or condensed distance matrix
103-
of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the
107+
A valid distance matrix is either a square matrix of shape (n_samples, n_samples)
108+
with zeros on the diagonal and non-negative values or condensed distance matrix
109+
of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the
104110
distance matrix.
105-
111+
106112
Arguments:
107113
X: Distance matrix to validate.
108114
n_samples: Number of samples in the dataset.
@@ -118,26 +124,26 @@ def validate_distance_matrix(X, n_samples):
118124
if len(s) == 1:
119125
# check it has correct size
120126
n = s[0]
121-
if n != (n_samples * (n_samples -1) / 2):
127+
if n != (n_samples * (n_samples - 1) / 2):
122128
raise ValueError("The condensed distance matrix must have "
123-
"shape (n*(n-1)/2,).")
129+
"shape (n*(n-1)/2,).")
124130
elif len(s) == 2:
125131
# check it has correct size
126132
if (s[0] != n_samples) or (s[1] != n_samples):
127133
raise ValueError("The distance matrix must be of shape "
128-
"(n, n) where n is the number of samples.")
134+
"(n, n) where n is the number of samples.")
129135
# force zero diagonal and convert to condensed
130136
np.fill_diagonal(X, 0)
131137
X = squareform(X)
132138
else:
133139
raise ValueError("The distance matrix must be either a 1-D condensed "
134-
"distance matrix of shape (n*(n-1)/2,) or a "
135-
"2-D square distance matrix of shape (n, n)."
136-
"where n is the number of documents."
137-
"Got a distance matrix of shape %s" % str(s))
140+
"distance matrix of shape (n*(n-1)/2,) or a "
141+
"2-D square distance matrix of shape (n, n)."
142+
"where n is the number of documents."
143+
"Got a distance matrix of shape %s" % str(s))
138144

139145
# Make sure its entries are non-negative
140146
if np.any(X < 0):
141147
raise ValueError("Distance matrix cannot contain negative values.")
142148

143-
return X
149+
return X

bertopic/backend/_cohere.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
import time
2-
import cohere
32
import numpy as np
43
from tqdm import tqdm
5-
from typing import List
4+
from typing import Any, List, Mapping
65
from bertopic.backend import BaseEmbedder
76

87

98
class CohereBackend(BaseEmbedder):
109
""" Cohere Embedding Model
11-
10+
1211
Arguments:
1312
client: A `cohere` client.
1413
embedding_model: A Cohere model. Default is "large".
@@ -17,6 +16,9 @@ class CohereBackend(BaseEmbedder):
1716
delay_in_seconds: If a `batch_size` is given, use this to set
1817
the delay in seconds between batches.
1918
batch_size: The size of each batch.
19+
embed_kwargs: Kwargs passed to `cohere.Client.embed`.
20+
Can be used to define additional parameters
21+
such as `input_type`
2022
2123
Examples:
2224
@@ -27,17 +29,34 @@ class CohereBackend(BaseEmbedder):
2729
client = cohere.Client("APIKEY")
2830
cohere_model = CohereBackend(client)
2931
```
32+
33+
If you want to specify `input_type`:
34+
35+
```python
36+
cohere_model = CohereBackend(
37+
client,
38+
embedding_model="embed-english-v3.0",
39+
embed_kwargs={"input_type": "clustering"}
40+
)
41+
```
3042
"""
31-
def __init__(self,
43+
def __init__(self,
3244
client,
3345
embedding_model: str = "large",
3446
delay_in_seconds: float = None,
35-
batch_size: int = None):
47+
batch_size: int = None,
48+
embed_kwargs: Mapping[str, Any] = {}):
3649
super().__init__()
3750
self.client = client
3851
self.embedding_model = embedding_model
3952
self.delay_in_seconds = delay_in_seconds
4053
self.batch_size = batch_size
54+
self.embed_kwargs = embed_kwargs
55+
56+
if self.embed_kwargs.get("model"):
57+
self.embedding_model = embed_kwargs.get("model")
58+
else:
59+
self.embed_kwargs["model"] = self.embedding_model
4160

4261
def embed(self,
4362
documents: List[str],
@@ -57,19 +76,19 @@ def embed(self,
5776
if self.batch_size is not None:
5877
embeddings = []
5978
for batch in tqdm(self._chunks(documents), disable=not verbose):
60-
response = self.client.embed(batch, model=self.embedding_model)
79+
response = self.client.embed(batch, **self.embed_kwargs)
6180
embeddings.extend(response.embeddings)
62-
81+
6382
# Delay subsequent calls
6483
if self.delay_in_seconds:
6584
time.sleep(self.delay_in_seconds)
6685

6786
# Extract embeddings all at once
6887
else:
69-
response = self.client.embed(documents, model=self.embedding_model)
88+
response = self.client.embed(documents, **self.embed_kwargs)
7089
embeddings = response.embeddings
7190
return np.array(embeddings)
72-
73-
def _chunks(self, documents):
91+
92+
def _chunks(self, documents):
7493
for i in range(0, len(documents), self.batch_size):
75-
yield documents[i:i + self.batch_size]
94+
yield documents[i:i + self.batch_size]

bertopic/backend/_openai.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,39 +2,50 @@
22
import openai
33
import numpy as np
44
from tqdm import tqdm
5-
from typing import List
5+
from typing import List, Mapping, Any
66
from bertopic.backend import BaseEmbedder
77

88

99
class OpenAIBackend(BaseEmbedder):
1010
""" OpenAI Embedding Model
11-
11+
1212
Arguments:
13-
embedding_model: An OpenAI model. Default is
13+
client: A `openai.OpenAI` client.
14+
embedding_model: An OpenAI model. Default is "text-embedding-ada-002".
1415
For an overview of models see:
1516
https://platform.openai.com/docs/models/embeddings
1617
delay_in_seconds: If a `batch_size` is given, use this to set
1718
the delay in seconds between batches.
1819
batch_size: The size of each batch.
20+
generator_kwargs: Kwargs passed to `openai.Embedding.create`.
21+
Can be used to define custom engines or
22+
deployment_ids.
1923
2024
Examples:
2125
2226
```python
2327
import openai
2428
from bertopic.backend import OpenAIBackend
2529
26-
openai.api_key = MY_API_KEY
27-
openai_embedder = OpenAIBackend("text-embedding-ada-002")
30+
client = openai.OpenAI(api_key="sk-...")
31+
openai_embedder = OpenAIBackend(client, "text-embedding-ada-002")
2832
```
2933
"""
30-
def __init__(self,
34+
def __init__(self,
3135
embedding_model: str = "text-embedding-ada-002",
3236
delay_in_seconds: float = None,
33-
batch_size: int = None):
37+
batch_size: int = None,
38+
generator_kwargs: Mapping[str, Any] = {}):
3439
super().__init__()
3540
self.embedding_model = embedding_model
3641
self.delay_in_seconds = delay_in_seconds
3742
self.batch_size = batch_size
43+
self.generator_kwargs = generator_kwargs
44+
45+
if self.generator_kwargs.get("model"):
46+
self.embedding_model = generator_kwargs.get("model")
47+
elif not self.generator_kwargs.get("engine"):
48+
self.generator_kwargs["model"] = self.embedding_model
3849

3950
def embed(self,
4051
documents: List[str],
@@ -54,7 +65,7 @@ def embed(self,
5465
if self.batch_size is not None:
5566
embeddings = []
5667
for batch in tqdm(self._chunks(documents), disable=not verbose):
57-
response = openai.Embedding.create(input=batch, model=self.embedding_model)
68+
response = openai.Embedding.create(input=batch, **self.generator_kwargs)
5869
embeddings.extend([r["embedding"] for r in response["data"]])
5970

6071
# Delay subsequent calls
@@ -63,10 +74,10 @@ def embed(self,
6374

6475
# Extract embeddings all at once
6576
else:
66-
response = openai.Embedding.create(input=documents, model=self.embedding_model)
77+
response = openai.Embedding.create(input=documents, **self.generator_kwargs)
6778
embeddings = [r["embedding"] for r in response["data"]]
6879
return np.array(embeddings)
69-
70-
def _chunks(self, documents):
80+
81+
def _chunks(self, documents):
7182
for i in range(0, len(documents), self.batch_size):
7283
yield documents[i:i + self.batch_size]

bertopic/representation/__init__.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,14 @@
44
from bertopic.representation._keybert import KeyBERTInspired
55
from bertopic.representation._mmr import MaximalMarginalRelevance
66

7+
8+
# Llama CPP Generator
9+
try:
10+
from bertopic.representation._llamacpp import LlamaCPP
11+
except ModuleNotFoundError:
12+
msg = "`pip install llama-cpp-python` \n\n"
13+
LlamaCPP = NotInstalled("llama.cpp", "llama-cpp-python", custom_msg=msg)
14+
715
# Text Generation using transformers
816
try:
917
from bertopic.representation._textgeneration import TextGeneration
@@ -25,7 +33,7 @@
2533
msg = "`pip install openai` \n\n"
2634
OpenAI = NotInstalled("OpenAI", "openai", custom_msg=msg)
2735

28-
# OpenAI Generator
36+
# LangChain Generator
2937
try:
3038
from bertopic.representation._langchain import LangChain
3139
except ModuleNotFoundError:
@@ -45,7 +53,6 @@
4553
VisualRepresentation = NotInstalled("a visual representation model", "vision")
4654

4755

48-
4956
__all__ = [
5057
"BaseRepresentation",
5158
"TextGeneration",
@@ -56,5 +63,6 @@
5663
"Cohere",
5764
"OpenAI",
5865
"LangChain",
66+
"LlamaCPP",
5967
"VisualRepresentation"
6068
]

bertopic/representation/_cohere.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class Cohere(BaseRepresentation):
4444
https://docs.cohere.ai/docs
4545
4646
Arguments:
47-
client: A cohere.Client
47+
client: A `cohere.Client`
4848
model: Model to use within Cohere, defaults to `"xlarge"`.
4949
prompt: The prompt to be used in the model. If no prompt is given,
5050
`self.default_prompt_` is used instead.

0 commit comments

Comments
 (0)