import logging
from os import getenv

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import AzureOpenAIEmbeddings

# Load Azure OpenAI credentials and settings from a local .env file.
load_dotenv()

# Blog posts to index into the vector store.
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

# Fetch each page and flatten the per-URL document lists into one list.
docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

# Split the documents into ~250-token chunks with no overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250,
    chunk_overlap=0,
)
doc_splits = text_splitter.split_documents(docs_list)

# Azure OpenAI embedding client configured from environment variables.
embedding = AzureOpenAIEmbeddings(
    api_key=getenv("AZURE_OPENAI_API_KEY"),
    api_version=getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
    model=getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
)

collection_name = "rag-chroma"
persist_directory = "./.chroma"

logging.basicConfig(level=logging.DEBUG)

# Embed the chunks and persist them to a local Chroma collection.
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name=collection_name,
    embedding=embedding,
    persist_directory=persist_directory,
)

# Reopen the persisted collection and expose it as a retriever.
retriever = Chroma(
    collection_name=collection_name,
    persist_directory=persist_directory,
    embedding_function=embedding,
).as_retriever()

response = retriever.invoke("What is LLM Powered autonomous agent?")
print(response)