17 changes: 12 additions & 5 deletions bertopic/representation/_cohere.py
@@ -35,6 +35,8 @@
Keywords: [KEYWORDS]
Topic name:"""

+DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."


class Cohere(BaseRepresentation):
"""Use the Cohere API to generate topic labels based on their
@@ -51,6 +53,8 @@ class Cohere(BaseRepresentation):
NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
to decide where the keywords and documents need to be
inserted.
+system_prompt: The system prompt to be used with the model. If no system prompt is given,
+    `self.default_system_prompt_` is used instead.
delay_in_seconds: The delay in seconds between consecutive prompts
in order to prevent RateLimitErrors.
nr_docs: The number of documents to pass to Cohere if a prompt
@@ -107,8 +111,9 @@ class Cohere(BaseRepresentation):
def __init__(
self,
client,
-model: str = "xlarge",
+model: str = "command-r",
prompt: str = None,
+system_prompt: str = None,
delay_in_seconds: float = None,
nr_docs: int = 4,
diversity: float = None,
@@ -118,7 +123,9 @@ def __init__(
self.client = client
self.model = model
self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
+self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT
self.default_prompt_ = DEFAULT_PROMPT
+self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
self.delay_in_seconds = delay_in_seconds
self.nr_docs = nr_docs
self.diversity = diversity
@@ -160,14 +167,14 @@ def extract_topics(
if self.delay_in_seconds:
time.sleep(self.delay_in_seconds)

-request = self.client.generate(
+request = self.client.chat(
model=self.model,
-prompt=prompt,
+preamble=self.system_prompt,
+message=prompt,
max_tokens=50,
-num_generations=1,
-stop_sequences=["\n"],
)
-label = request.generations[0].text.strip()
+label = request.text.strip()
updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]

return updated_topics
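For context, a minimal usage sketch of the updated Cohere representation (not part of the diff). The API key and the delay value are placeholder assumptions; only `client`, `model`, `system_prompt`, and `delay_in_seconds` come from the code above:

import cohere
from bertopic import BERTopic
from bertopic.representation import Cohere

# Placeholder key; the chat endpoint and "command-r" default come from this PR.
co = cohere.Client("MY_COHERE_API_KEY")

representation_model = Cohere(
    co,
    model="command-r",
    system_prompt="You are an assistant that extracts high-level topics from texts.",
    delay_in_seconds=2,  # optional throttle between consecutive prompts
)
topic_model = BERTopic(representation_model=representation_model)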
78 changes: 55 additions & 23 deletions bertopic/representation/_llamacpp.py
@@ -8,13 +8,34 @@


DEFAULT_PROMPT = """
-Q: I have a topic that contains the following documents:
+This is a list of texts where each collection of texts describes a topic. After each collection of texts, the name of the topic they represent is mentioned as a short, highly descriptive title.
+---
+Topic:
+Sample texts from this topic:
+- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
+- Meat, but especially beef, is the worst food in terms of emissions.
+- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
+
+Keywords: meat beef eat eating emissions steak food health processed chicken
+Topic name: Environmental impacts of eating meat
+---
+Topic:
+Sample texts from this topic:
+- I have ordered the product weeks ago but it still has not arrived!
+- The website mentions that it only takes a couple of days to deliver but I still have not received mine.
+- I got a message stating that I received the monitor but that is not true!
+- It took a month longer to deliver than was advised...
+
+Keywords: deliver weeks product shipping long delivery received arrived arrive week
+Topic name: Shipping and delivery issues
+---
+Topic:
+Sample texts from this topic:
[DOCUMENTS]
+Keywords: [KEYWORDS]
+Topic name:"""

-The topic is described by the following keywords: '[KEYWORDS]'.
-
-Based on the above information, can you give a short label of the topic?
-A: """

+DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."


class LlamaCPP(BaseRepresentation):
@@ -28,6 +49,8 @@ class LlamaCPP(BaseRepresentation):
NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
to decide where the keywords and documents need to be
inserted.
+system_prompt: The system prompt to be used with the model. If no system prompt is given,
+    `self.default_system_prompt_` is used instead.
pipeline_kwargs: Kwargs that you can pass to the `llama_cpp.Llama`
when it is called such as `max_tokens` to be generated.
nr_docs: The number of documents to pass to the model if a prompt
@@ -93,14 +116,15 @@ def __init__(
self,
model: Union[str, Llama],
prompt: str = None,
+system_prompt: str = None,
pipeline_kwargs: Mapping[str, Any] = {},
nr_docs: int = 4,
diversity: float = None,
doc_length: int = None,
tokenizer: Union[str, Callable] = None,
):
if isinstance(model, str):
-self.model = Llama(model_path=model, n_gpu_layers=-1, stop="Q:")
+self.model = Llama(model_path=model, n_gpu_layers=-1, stop="\n", chat_format="ChatML")
elif isinstance(model, Llama):
self.model = model
else:
@@ -110,7 +134,9 @@ def __init__(
"local LLM or a ` llama_cpp.Llama` object."
)
self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
+self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT
self.default_prompt_ = DEFAULT_PROMPT
+self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
self.pipeline_kwargs = pipeline_kwargs
self.nr_docs = nr_docs
self.diversity = diversity
@@ -150,33 +176,39 @@ def extract_topics(
self.prompts_.append(prompt)

# Extract result from generator and use that as label
-topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"]
-topic_description = [(description["text"].replace(prompt, ""), 1) for description in topic_description]
-
-if len(topic_description) < 10:
-    topic_description += [("", 0) for _ in range(10 - len(topic_description))]
-
-updated_topics[topic] = topic_description
+topic_description = self.model.create_chat_completion(
+    messages=[{"role": "system", "content": self.system_prompt}, {"role": "user", "content": prompt}],
+    **self.pipeline_kwargs,
+)
+label = topic_description["choices"][0]["message"]["content"].strip()
+updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]

return updated_topics

def _create_prompt(self, docs, topic, topics):
keywords = ", ".join(list(zip(*topics[topic]))[0])
keywords = list(zip(*topics[topic]))[0]

-# Use the default prompt and replace keywords
+# Use the default chat prompt
if self.prompt == DEFAULT_PROMPT:
-prompt = self.prompt.replace("[KEYWORDS]", keywords)
+prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords))
prompt = self._replace_documents(prompt, docs)

-# Use a prompt that leverages either keywords or documents in
-# a custom location
+# Use a custom prompt that leverages keywords, documents or both using
+# custom tags, namely [KEYWORDS] and [DOCUMENTS] respectively
else:
prompt = self.prompt
if "[KEYWORDS]" in prompt:
-prompt = prompt.replace("[KEYWORDS]", keywords)
+prompt = prompt.replace("[KEYWORDS]", ", ".join(keywords))
if "[DOCUMENTS]" in prompt:
-to_replace = ""
-for doc in docs:
-    to_replace += f"- {doc}\n"
-prompt = prompt.replace("[DOCUMENTS]", to_replace)
+prompt = self._replace_documents(prompt, docs)

return prompt

+@staticmethod
+def _replace_documents(prompt, docs):
+    to_replace = ""
+    for doc in docs:
+        to_replace += f"- {doc}\n"
+    prompt = prompt.replace("[DOCUMENTS]", to_replace)
+    return prompt
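A similar hedged sketch for the chat-based LlamaCPP path (not part of the diff). The GGUF path is a placeholder; constructing `llama_cpp.Llama` explicitly avoids relying on the string shortcut above, and `pipeline_kwargs` are forwarded to `create_chat_completion` per the diff:

from llama_cpp import Llama
from bertopic import BERTopic
from bertopic.representation import LlamaCPP

# Placeholder model path; any chat-tuned GGUF model should work.
# "chatml" is the lowercase format name registered by llama-cpp-python.
llm = Llama(model_path="path/to/model.gguf", n_gpu_layers=-1, chat_format="chatml")

representation_model = LlamaCPP(
    llm,
    system_prompt="You are an assistant that extracts high-level topics from texts.",
    pipeline_kwargs={"max_tokens": 50, "stop": ["\n"]},  # forwarded to create_chat_completion
)
topic_model = BERTopic(representation_model=representation_model)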
13 changes: 12 additions & 1 deletion bertopic/representation/_openai.py
@@ -48,6 +48,8 @@
topic: <topic label>
"""

+DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."


class OpenAI(BaseRepresentation):
r"""Using the OpenAI API to generate topic labels based
@@ -73,6 +75,8 @@ class OpenAI(BaseRepresentation):
NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
to decide where the keywords and documents need to be
inserted.
+system_prompt: The system prompt to be used with the model. If no system prompt is given,
+    `self.default_system_prompt_` is used instead.
delay_in_seconds: The delay in seconds between consecutive prompts
in order to prevent RateLimitErrors.
exponential_backoff: Retry requests with a random exponential backoff.
@@ -144,6 +148,7 @@ def __init__(
client,
model: str = "text-embedding-3-small",
prompt: str = None,
+system_prompt: str = None,
generator_kwargs: Mapping[str, Any] = {},
delay_in_seconds: float = None,
exponential_backoff: bool = False,
@@ -161,7 +166,13 @@
else:
self.prompt = prompt

+if chat and system_prompt is None:
+    self.system_prompt = DEFAULT_SYSTEM_PROMPT
+else:
+    self.system_prompt = system_prompt

self.default_prompt_ = DEFAULT_CHAT_PROMPT if chat else DEFAULT_PROMPT
+self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
self.delay_in_seconds = delay_in_seconds
self.exponential_backoff = exponential_backoff
self.chat = chat
@@ -216,7 +227,7 @@ def extract_topics(

if self.chat:
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": prompt},
]
kwargs = {
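Finally, a sketch for the OpenAI path (not part of the diff). The model name and key are placeholder assumptions; note that, per the `__init__` logic above, the default system prompt is only applied when `chat=True`:

import openai
from bertopic import BERTopic
from bertopic.representation import OpenAI

client = openai.OpenAI(api_key="MY_OPENAI_API_KEY")  # placeholder key

representation_model = OpenAI(
    client,
    model="gpt-4o-mini",  # assumption: any chat-capable model, paired with chat=True
    chat=True,
    system_prompt="You are an assistant that extracts high-level topics from texts.",
)
topic_model = BERTopic(representation_model=representation_model)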