File tree Expand file tree Collapse file tree 3 files changed +67
-0
lines changed
docs/getting_started/embeddings Expand file tree Collapse file tree 3 files changed +67
-0
lines changed Original file line number Diff line number Diff line change 3838 msg = "`pip install fastembed` \n \n "
3939 FastEmbedBackend = NotInstalled ("FastEmbed" , "FastEmbed" , custom_msg = msg )
4040
41+
42+ # Langchain Embedddings
43+ try :
44+ from bertopic .backend ._langchain import LangChainBackend
45+ except ModuleNotFoundError :
46+ msg = "`pip install langchain` \n \n "
47+ LangChainBackend = NotInstalled ("LangChain" , "LangChain" , custom_msg = msg )
48+
49+
4150__all__ = [
4251 "BaseEmbedder" ,
4352 "WordDocEmbedder" ,
4655 "Model2VecBackend" ,
4756 "MultiModalBackend" ,
4857 "FastEmbedBackend" ,
58+ "LangChainBackend" ,
4959 "languages" ,
5060]
Original file line number Diff line number Diff line change 1+ from typing import List
2+
3+ import numpy as np
4+ from bertopic .backend import BaseEmbedder
5+ from langchain_core .embeddings import Embeddings
6+
7+
8+ class LangChainBackend (BaseEmbedder ):
9+ """LangChain Embedding Model.
10+
11+ This class uses the LangChain Embedding class to embed the documents.
12+ Argument:
13+ embedding_model: A LangChain Embedding Instance.
14+
15+ Examples:
16+ ```python
17+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
18+ from bertopic.backend import LangChainBackend
19+
20+ hf_embedding = HuggingFaceInstructEmbeddings()
21+ langchain_embedder = LangChainBackend(hf_embedding)
22+ ```
23+ """
24+
25+ def __init__ (self , embedding_model : Embeddings ):
26+ self .embedding_model = embedding_model
27+
28+ def embed (self , documents : List [str ], verbose : bool = False ) -> np .ndarray :
29+ """Embed a list of n documents/words into an n-dimensional
30+ matrix of embeddings.
31+
32+ Arguments:
33+ documents: A list of documents or words to be embedded
34+ verbose: Controls the verbosity of the process
35+
36+ Returns:
37+ Document/words embeddings with shape (n, m) with `n` documents/words
38+ that each have an embeddings size of `m`
39+ """
40+ # Prepare documents, replacing empty strings with a single space
41+ prepared_documents = [" " if doc == "" else doc for doc in documents ]
42+ response = self .embedding_model .embed_documents (prepared_documents )
43+ return np .array (response )
Original file line number Diff line number Diff line change @@ -124,6 +124,20 @@ topic_model = BERTopic(embedding_model=embedding_model)
124124!!! tip "Tip!"
125125 These transformers also work quite well using ` sentence-transformers ` which has great optimizations tricks that make using it a bit faster.
126126
127+ ** Langchain**
128+ [ Langchain] ( https://python.langchain.com/docs/introduction ) allows you to use different embedding models supported by various cloud providers. On top of that, it supports various integrations to open source models. To get started:
129+
130+ ``` python
131+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
132+ from bertopic.backend import LangChainBackend
133+
134+ hf_embedding = HuggingFaceInstructEmbeddings()
135+ langchain_embedder = LangChainBackend(hf_embedding)
136+ ```
137+
138+ To see what providers are being supported by Langchain, you can check the list [ here] ( https://python.langchain.com/docs/integrations/providers/ ) .
139+ For more information, you can have a look on [ Langchain's Embedding Models] ( https://python.langchain.com/docs/integrations/text_embedding/ ) .
140+
127141## ** Flair**
128142[ Flair] ( https://github.com/flairNLP/flair ) allows you to choose almost any embedding model that
129143is publicly available. Flair can be used as follows:
You can’t perform that action at this time.
0 commit comments