Skip to content

Commit e7acc09

Browse files
committed
Renamed app to rag; created .md file
1 parent 928596b commit e7acc09

File tree

5 files changed

+176
-11
lines changed

5 files changed

+176
-11
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
.venv/
2-
vector_stores/
2+
vector_stores/
3+
__pycache__/

app.py renamed to rag.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@
1414
from langchain_core.runnables import RunnablePassthrough
1515
from langchain_core.output_parsers import StrOutputParser
1616

17-
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
18-
from transformers import AutoTokenizer, pipeline
1917
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
2018

2119
# for streaming in Streamlit without LECL
@@ -61,9 +59,9 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:
6159
def format_docs(docs):
6260
return "\n\n".join([doc.page_content for doc in docs])
6361

64-
####################### RAG #################################
65-
62+
############################################## RAG ########################################################
6663

64+
########## Creating prompt ##########
6765
prompt_template = """Use the following pieces of context regarding titanic ship to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
6866
6967
{context}
@@ -74,7 +72,7 @@ def format_docs(docs):
7472

7573
prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
7674

77-
#VectorDB creation and saving to disk
75+
########## VectorDB creation and saving to disk ##########
7876
client = chromadb.Client()
7977

8078
persist_directory="/Users/raunakanand/Documents/Work_R/llm0/vector_stores"
@@ -86,7 +84,7 @@ def format_docs(docs):
8684
)
8785
vectordb.persist()
8886

89-
#VectorDB -loading from disk
87+
########## VectorDB -loading from disk ##########
9088
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings, collection_name='chroma1')
9189
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
9290

@@ -107,12 +105,14 @@ def format_docs(docs):
107105
# callbacks=[StreamingStdOutCallbackHandler()]
108106
)
109107

108+
########## When using RetrievalQA chain from llm's chain ##########
110109
qa = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff',
111110
retriever=retriever,
112111
# return_source_documents=True,
113112
chain_type_kwargs={'prompt': prompt},
114113
verbose=False)
115114

115+
########## RAG's chain in langchain's LECL format ##########
116116
rag_chain = ({"context": retriever | format_docs, "question": RunnablePassthrough()} |
117117
prompt | llm | StrOutputParser())
118118

@@ -121,7 +121,6 @@ def inference(query: str):
121121
# return qa.run(query)
122122
return rag_chain.stream(query)
123123

124-
print('final')
125124

126125

127126

readme.md

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,35 @@ transformers
44
chromadb
55
streamlit
66
sentence-transformers
7-
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
87
# Example: METAL
98
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.83 --no-cache-dir
10-
llama_cpp_python
9+
10+
11+
12+
# Environment Setup
13+
14+
1. Clone the repo using git:
15+
```shell
16+
git clone https://github.com/rauni-iitr/langchain_chromaDB_opensourceLLM_streamlit.git
17+
```
18+
19+
2. Create a virtual environment, with 'venv' or with 'conda', and activate it.
20+
```shell
21+
python3 -m venv .venv
22+
source .venv/bin/activate
23+
```
24+
25+
3. Now this RAG application is built using a few dependencies:
26+
- pypdf -- for reading pdf documents
27+
- chromadb -- vectorDB for creating a vector store
28+
- transformers -- dependency for sentence-transformers, at least in this repository
29+
- sentence-transformers -- for embedding models to convert pdf documents into vectors
30+
- streamlit -- to make UI for the LLM PDF's Q&A
31+
- llama-cpp-python -- to load gguf files for CPU inference of LLMs
32+
33+
You can install all of these with pip:
34+
```shell
35+
pip install pypdf chromadb transformers sentence-transformers streamlit
36+
```
37+
38+

requirements.txt

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
aiohttp==3.9.3
2+
aiosignal==1.3.1
3+
altair==5.2.0
4+
annotated-types==0.6.0
5+
anyio==4.3.0
6+
asgiref==3.8.1
7+
attrs==23.2.0
8+
backoff==2.2.1
9+
bcrypt==4.1.2
10+
blinker==1.7.0
11+
build==1.2.1
12+
cachetools==5.3.3
13+
certifi==2024.2.2
14+
charset-normalizer==3.3.2
15+
chroma-hnswlib==0.7.3
16+
chromadb==0.4.24
17+
click==8.1.7
18+
coloredlogs==15.0.1
19+
dataclasses-json==0.6.4
20+
Deprecated==1.2.14
21+
diskcache==5.6.3
22+
fastapi==0.110.0
23+
filelock==3.13.3
24+
flatbuffers==24.3.25
25+
frozenlist==1.4.1
26+
fsspec==2024.3.1
27+
gitdb==4.0.11
28+
GitPython==3.1.42
29+
google-auth==2.29.0
30+
googleapis-common-protos==1.63.0
31+
grpcio==1.62.1
32+
h11==0.14.0
33+
httptools==0.6.1
34+
huggingface-hub==0.22.1
35+
humanfriendly==10.0
36+
idna==3.6
37+
importlib-metadata==6.11.0
38+
importlib_resources==6.4.0
39+
Jinja2==3.1.3
40+
joblib==1.3.2
41+
jsonpatch==1.33
42+
jsonpointer==2.4
43+
jsonschema==4.21.1
44+
jsonschema-specifications==2023.12.1
45+
kubernetes==29.0.0
46+
langchain==0.1.13
47+
langchain-community==0.0.29
48+
langchain-core==0.1.36
49+
langchain-text-splitters==0.0.1
50+
langsmith==0.1.36
51+
llama_cpp_python==0.1.83
52+
markdown-it-py==3.0.0
53+
MarkupSafe==2.1.5
54+
marshmallow==3.21.1
55+
mdurl==0.1.2
56+
mmh3==4.1.0
57+
monotonic==1.6
58+
mpmath==1.3.0
59+
multidict==6.0.5
60+
mypy-extensions==1.0.0
61+
networkx==3.2.1
62+
numpy==1.26.4
63+
oauthlib==3.2.2
64+
onnxruntime==1.17.1
65+
opentelemetry-api==1.23.0
66+
opentelemetry-exporter-otlp-proto-common==1.23.0
67+
opentelemetry-exporter-otlp-proto-grpc==1.23.0
68+
opentelemetry-instrumentation==0.44b0
69+
opentelemetry-instrumentation-asgi==0.44b0
70+
opentelemetry-instrumentation-fastapi==0.44b0
71+
opentelemetry-proto==1.23.0
72+
opentelemetry-sdk==1.23.0
73+
opentelemetry-semantic-conventions==0.44b0
74+
opentelemetry-util-http==0.44b0
75+
orjson==3.10.0
76+
overrides==7.7.0
77+
packaging==23.2
78+
pandas==2.2.1
79+
pillow==10.2.0
80+
posthog==3.5.0
81+
protobuf==4.25.3
82+
pulsar-client==3.4.0
83+
pyarrow==15.0.2
84+
pyasn1==0.6.0
85+
pyasn1_modules==0.4.0
86+
pydantic==2.6.4
87+
pydantic_core==2.16.3
88+
pydeck==0.8.1b0
89+
Pygments==2.17.2
90+
pypdf==4.1.0
91+
PyPika==0.48.9
92+
pyproject_hooks==1.0.0
93+
python-dateutil==2.9.0.post0
94+
python-dotenv==1.0.1
95+
pytz==2024.1
96+
PyYAML==6.0.1
97+
referencing==0.34.0
98+
regex==2023.12.25
99+
requests==2.31.0
100+
requests-oauthlib==2.0.0
101+
rich==13.7.1
102+
rpds-py==0.18.0
103+
rsa==4.9
104+
safetensors==0.4.2
105+
scikit-learn==1.4.1.post1
106+
scipy==1.12.0
107+
sentence-transformers==2.6.1
108+
setuptools==69.2.0
109+
six==1.16.0
110+
smmap==5.0.1
111+
sniffio==1.3.1
112+
SQLAlchemy==2.0.29
113+
starlette==0.36.3
114+
streamlit==1.32.2
115+
sympy==1.12
116+
tenacity==8.2.3
117+
threadpoolctl==3.4.0
118+
tokenizers==0.15.2
119+
toml==0.10.2
120+
toolz==0.12.1
121+
torch==2.2.2
122+
tornado==6.4
123+
tqdm==4.66.2
124+
transformers==4.39.2
125+
typer==0.11.0
126+
typing-inspect==0.9.0
127+
typing_extensions==4.10.0
128+
tzdata==2024.1
129+
urllib3==2.2.1
130+
uvicorn==0.29.0
131+
uvloop==0.19.0
132+
watchfiles==0.21.0
133+
websocket-client==1.7.0
134+
websockets==12.0
135+
wrapt==1.16.0
136+
yarl==1.9.4
137+
zipp==3.18.1

st_app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import streamlit as st
2-
from app import *
2+
from rag import *
33

44
st.set_page_config(page_title="LLM Search Titaninc", page_icon=':robot:')
55
# st.header("Query PDF")

0 commit comments

Comments
 (0)