
Commit 6fa15cc

add ingestion script for Chroma
1 parent f7bfc18 commit 6fa15cc

File tree

4 files changed, +1293 -11 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -166,3 +166,4 @@ generated/
 *.pt
 *.jpg
 *.jpeg
+.chroma

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+import logging
+from os import getenv
+
+from dotenv import load_dotenv
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_chroma import Chroma
+from langchain_community.document_loaders import WebBaseLoader
+from langchain_openai import AzureOpenAIEmbeddings
+
+load_dotenv()
+
+urls = [
+    "https://lilianweng.github.io/posts/2023-06-23-agent/",
+    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
+    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
+]
+
+docs = [WebBaseLoader(url).load() for url in urls]
+docs_list = [item for sublist in docs for item in sublist]
+
+text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+    chunk_size=250,
+    chunk_overlap=0,
+)
+doc_splits = text_splitter.split_documents(docs_list)
+
+embedding = AzureOpenAIEmbeddings(
+    api_key=getenv("AZURE_OPENAI_API_KEY"),
+    api_version=getenv("AZURE_OPENAI_API_VERSION"),
+    azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
+    model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
+)
+
+collection_name = "rag-chroma"
+persist_directory = "./.chroma"
+
+
+logging.basicConfig(level=logging.DEBUG)
+vectorstore = Chroma.from_documents(
+    documents=doc_splits,
+    collection_name=collection_name,
+    embedding=embedding,
+    persist_directory=persist_directory,
+)
+
+retriever = Chroma(
+    collection_name=collection_name,
+    persist_directory=persist_directory,
+    embedding_function=embedding,
+).as_retriever()
+
+response = retriever.invoke("What is LLM Powered autonomous agent?")
+print(response)
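
Because the ingestion script persists the collection to ./.chroma, the index can be reopened in a later run without re-ingesting the pages. A minimal sketch of such a follow-up query, assuming the same collection name, persist directory, and Azure embedding settings as above (the query text and k are illustrative only):

from os import getenv

from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_openai import AzureOpenAIEmbeddings

load_dotenv()

# Reuse the same embedding configuration the ingestion script reads from .env.
embedding = AzureOpenAIEmbeddings(
    api_key=getenv("AZURE_OPENAI_API_KEY"),
    api_version=getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
    model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
)

# Reopen the persisted "rag-chroma" collection created by the ingestion run.
db = Chroma(
    collection_name="rag-chroma",
    persist_directory="./.chroma",
    embedding_function=embedding,
)

# similarity_search_with_score returns (Document, score) pairs.
for doc, score in db.similarity_search_with_score("How do LLM agents plan tasks?", k=4):
    print(f"{score:.4f}  {doc.metadata.get('source')}  {doc.page_content[:80]!r}")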

0 commit comments
