Commit 9297bf0

committed
added indexing support for text, images, and excel/.csv
1 parent 0c11652 commit 9297bf0

File tree

4 files changed, +236 -33 lines changed


requirements.txt

Lines changed: 4 additions & 1 deletion
@@ -7,4 +7,7 @@ pypdf
 python-dotenv
 pinecone
 langgraph
-unstructured[pdf,docx,pptx,md,image]
+unstructured[pdf,docx,pptx,md,image]
+duckdb
+duckdb-engine
+openpyxl
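
The three new dependencies cover the tabular path: duckdb stores the ingested tables, openpyxl is the engine pandas uses to read .xlsx files, and duckdb-engine exposes the same database over SQLAlchemy (presumably for a later SQL tool). A minimal sketch of the round-trip these packages enable, mirroring what indexing.py does per sheet; the DataFrame and file name here are hypothetical, not part of the commit:

    import duckdb
    import pandas as pd

    # register a DataFrame as a temporary view, then materialize it as a table
    df = pd.DataFrame({"city": ["Oslo", "Lima"], "pop": [709_000, 10_000_000]})
    con = duckdb.connect("smoke_test.duckdb")  # hypothetical path
    con.register("_tmp_df", df)
    con.execute("CREATE OR REPLACE TABLE cities AS SELECT * FROM _tmp_df")
    con.unregister("_tmp_df")
    print(con.execute("DESCRIBE cities").fetchall())  # column names and types
    con.close()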

src/any_chatbot/agent.py

Lines changed: 10 additions & 4 deletions
@@ -8,7 +8,7 @@
 from langgraph.checkpoint.memory import MemorySaver
 from langchain.chat_models import init_chat_model

-from any_chatbot.indexing import index_text_docs
+from any_chatbot.indexing import embed_and_index_all_docs
 from any_chatbot.tools import initialize_retrieve_tool

 load_dotenv()
@@ -18,7 +18,7 @@
 OUTPUTS = BASE / "outputs"

 # INDEXING
-embeddings, vector_store = index_text_docs(DATA)
+embeddings, vector_store = embed_and_index_all_docs(DATA)

 # BUILD LLM
 if not os.environ.get("GOOGLE_API_KEY"):
@@ -38,15 +38,21 @@
 # save to file
 with open(OUTPUTS / "graph.png", "wb") as f:
     f.write(png_bytes)
-print("Wrote graph.png")
+print("Created graph.png")

 # PROMPT
 # specify an ID for the thread
 # config = {"configurable": {"thread_id": "abc123"}}
 config = {"configurable": {"thread_id": random.random()}}

+# input_message = (
+#     "What is the content of the image?\n\n"
+#     "When you don't know which files the user is talking about, use the function call to retrieve what data is available with a general prompt.\n\n"
+#     "Base your answers only on the retrieved information through the function call you have. You can retrieve MULTIPLE TIMES"
+# )
+
 input_message = (
-    "What is the content of the image?\n\n"
+    "What columns does the excel file have? Once you find the answer, tell me their types too.\n\n"
     "When you don't know which files the user is talking about, use the function call to retrieve what data is available with a general prompt.\n\n"
     "Base your answers only on the retrieved information through the function call you have. You can retrieve MULTIPLE TIMES"
 )
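
For context, a minimal sketch of sending this prompt through the compiled agent. The name `graph` and the streaming call are assumptions based on the usual LangGraph pattern; they are not shown in this hunk:

    # sketch, assuming `graph` is the compiled LangGraph agent built in agent.py
    for step in graph.stream(
        {"messages": [{"role": "user", "content": input_message}]},
        config=config,  # the {"configurable": {"thread_id": ...}} dict above
        stream_mode="values",
    ):
        step["messages"][-1].pretty_print()

One design note: seeding thread_id with random.random() gives the MemorySaver checkpointer a fresh conversation on every run, while the commented-out fixed ID ("abc123") would carry history across prompts within one process.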

src/any_chatbot/indexing.py

Lines changed: 222 additions & 28 deletions
@@ -1,56 +1,250 @@
+import os
+import re
+import pandas as pd
+import duckdb
 from dotenv import load_dotenv
 from pathlib import Path

 from langchain_core.vectorstores import InMemoryVectorStore
 from langchain_google_genai import GoogleGenerativeAIEmbeddings
 from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document

 load_dotenv()

 BASE = Path(__file__).parent.parent.parent
 DATA = BASE / "data"


-def index_text_docs(
-    data_pth: Path = DATA,
-):
-    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-    vector_store = InMemoryVectorStore(embeddings)
+def load_and_split_text_docs(data_dir):
+    text_chunks = []
+    globs = [
+        "**/*.pdf",
+        "**/*.docx",
+        "**/*.pptx",
+        "**/*.md",
+        "**/*.html",
+        "**/*.txt",
+    ]
+    # guardrail if no files matched
+    if not any(next(data_dir.rglob(p), None) for p in globs):
+        print(f"No text files found under {data_dir}; skipping.")
+        return text_chunks

-    # Load the text documents
+    print(f"Detected text files under {data_dir}")
     loader = DirectoryLoader(
-        str(data_pth),
-        glob=[
-            "**/*.pdf",
-            "**/*.docx",
-            "**/*.pptx",
-            "**/*.md",
-            "**/*.html",
-            "**/*.txt",
-            "**/*.png",
-            "**/*.jpg",
-            "**/*.jpeg",
-            "**/*.tiff",
-        ],
+        str(data_dir),
+        glob=globs,
         loader_cls=UnstructuredFileLoader,
     )
-    print(f"Loading files from {data_pth}")
+    print(f"Loading files from {data_dir}")
     docs = loader.load()
-    print(f"Loaded {len(docs)} files")
-
-    # Split the texts
+    print(f"Loaded {len(docs)} text files")
+    # split
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,
         chunk_overlap=200,
         add_start_index=True,
         separators=["\n\n", "\n", " ", ""],
     )
-    all_splits = text_splitter.split_documents(docs)
-    print(len(all_splits))
+    text_chunks = text_splitter.split_documents(docs)
+    print(f"Split text chunks: {len(text_chunks)}")
+    # tag
+    for chunk in text_chunks:
+        chunk.metadata["source_type"] = "text"
+
+    return text_chunks
+
+
+def load_image_docs_as_text(data_dir):
+    image_text_docs = []
+    globs = [
+        "**/*.png",
+        "**/*.jpg",
+        "**/*.jpeg",
+        "**/*.tiff",
+    ]
+    # guardrail if no files matched
+    if not any(next(data_dir.rglob(p), None) for p in globs):
+        print(f"No images found under {data_dir}; skipping.")
+        return image_text_docs
+
+    print(f"Detected images under {data_dir}")
+    loader = DirectoryLoader(
+        str(data_dir),
+        glob=globs,
+        loader_cls=UnstructuredFileLoader,
+    )
+    print(f"Loading images from {data_dir}")
+    image_text_docs = loader.load()
+    print(f"Loaded {len(image_text_docs)} image files")
+    # tag
+    for img in image_text_docs:
+        img.metadata["source_type"] = "image_text"
+
+    return image_text_docs
+
+
+def _tbl(name: str) -> str:
+    """Make a safe SQL table name."""
+    name = re.sub(r"[^0-9a-zA-Z_]+", "_", name).strip("_")
+    if not name or name[0].isdigit():
+        name = f"t_{name}"
+    return name.lower()
+
+
+def build_duckdb_and_summary_cards(
+    data_dir: Path,
+    db_path: Path,
+) -> list[Document]:
+    summary_cards = []
+    # skip if there are no .csv/.xlsx/.xls files
+    patterns = ("*.csv", "*.xlsx", "*.xls")
+    if not any(next(data_dir.rglob(p), None) for p in patterns):
+        print(f"No CSV or Excel files found under {data_dir}; skipping.")
+        return summary_cards
+    print(f"Detected CSV or Excel files under {data_dir}")
+    # ensure the DB folder exists
+    os.makedirs(db_path.parent, exist_ok=True)
+    # delete any existing DB file
+    if db_path.exists():
+        db_path.unlink()
+    # start from a fresh, empty DB
+    with duckdb.connect(str(db_path)) as con:
+        # ingest .csv files into DuckDB (overwrite on rerun)
+        for fp in data_dir.rglob("*.csv"):
+            table = _tbl(fp.stem)
+            fp_sql = fp.as_posix().replace("'", "''")  # escape single quotes
+            con.execute(
+                f"""
+                CREATE OR REPLACE TABLE {table} AS
+                SELECT * FROM read_csv_auto('{fp_sql}', header=true)
+                """
+            )
+
+        # XLSX ingestion via pandas
+        for fp in data_dir.rglob("*.xlsx"):
+            try:
+                xls = pd.ExcelFile(fp)  # lists sheet names
+            except Exception as e:
+                print(f"Skip {fp.name}: {e}")
+                continue
+
+            # one table per sheet
+            for sheet in xls.sheet_names:
+                try:
+                    df = pd.read_excel(fp, sheet_name=sheet)
+                except Exception as e:
+                    print(f"Skip {fp.name}:{sheet}: {e}")
+                    continue
+
+                tmp_name = f"_tmp_{_tbl(fp.stem)}_{_tbl(sheet)}"
+                con.register(tmp_name, df)

-    # index the docs
-    ids = vector_store.add_documents(documents=all_splits)
-    print(len(ids))
+                table = _tbl(f"{fp.stem}__{sheet}")
+                con.execute(
+                    f"""
+                    CREATE OR REPLACE TABLE {table} AS
+                    SELECT * FROM {tmp_name}"""
+                )
+                con.unregister(tmp_name)

+        for fp in data_dir.rglob("*.xls"):
+            # .xls not supported by DuckDB
+            print(f"Skip {fp.name}: .xls not supported by DuckDB.")
+
+        # build summary cards from DuckDB
+        tables = [r[0] for r in con.execute("SHOW TABLES").fetchall()]
+        for tbl in tables:
+            # DESCRIBE to get columns & types
+            schema_rows = con.execute(f"DESCRIBE {tbl}").fetchall()
+            col_names = [r[0] for r in schema_rows]
+            col_types = [r[1] for r in schema_rows]
+            nrows = con.execute(f"SELECT COUNT(*) FROM {tbl}").fetchone()[0]
+            preview_df = con.execute(f"SELECT * FROM {tbl} LIMIT 5").df()
+
+            col_str = ", ".join(f"{n}:{t}" for n, t in zip(col_names, col_types))
+            preview_txt = preview_df.to_string(index=False)
+
+            text = (
+                f"TABLE CARD — {tbl}\n"
+                f"Columns (Length: {len(col_names)}; Format: 'column_name:data_type'): {col_str}\n"
+                f"Rows: {nrows}\n\n"
+                f"Sample rows (up to 5):\n{preview_txt}\n"
+            )
+
+            summary_cards.append(
+                Document(
+                    page_content=text,
+                    metadata={
+                        "source_type": "table_summary",
+                        "table": tbl,
+                        "db_path": str(db_path),
+                    },
+                )
+            )
+
+    return summary_cards
+
+
+def embed_and_index_all_docs(
+    data_dir: Path = DATA, db_path: Path = DATA / "csv_excel_to_db" / "my_data.duckdb"
+):
+    # load embeddings and vector store
+    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+    vector_store = InMemoryVectorStore(embeddings)
+
+    # LOAD AND SPLIT TEXT DOCS
+    text_chunks = load_and_split_text_docs(data_dir)
+    # LOAD IMAGES (OCR converts image -> text)
+    image_text_docs = load_image_docs_as_text(data_dir)
+    # LOAD AND SPLIT CSV/EXCEL DOCS
+    summary_cards = build_duckdb_and_summary_cards(data_dir, db_path)
+
+    vector_store.add_documents(text_chunks + image_text_docs + summary_cards)
     return embeddings, vector_store
+
+
+# def index_text_docs(
+#     data_pth: Path = DATA,
+# ):
+#     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+#     vector_store = InMemoryVectorStore(embeddings)
+
+#     # Load the text documents
+#     loader = DirectoryLoader(
+#         str(data_pth),
+#         glob=[
+#             "**/*.pdf",
+#             "**/*.docx",
+#             "**/*.pptx",
+#             "**/*.md",
+#             "**/*.html",
+#             "**/*.txt",
+#             "**/*.png",
+#             "**/*.jpg",
+#             "**/*.jpeg",
+#             "**/*.tiff",
+#         ],
+#         loader_cls=UnstructuredFileLoader,
+#     )
+#     print(f"Loading files from {data_pth}")
+#     docs = loader.load()
+#     print(f"Loaded {len(docs)} files")

+#     # Split the texts
+#     text_splitter = RecursiveCharacterTextSplitter(
+#         chunk_size=1000,
+#         chunk_overlap=200,
+#         add_start_index=True,
+#         separators=["\n\n", "\n", " ", ""],
+#     )
+#     all_splits = text_splitter.split_documents(docs)
+#     print(len(all_splits))

+#     # index the docs
+#     ids = vector_store.add_documents(documents=all_splits)
+#     print(len(ids))

+#     return embeddings, vector_store
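
Taken together, the new pipeline indexes three kinds of sources into one vector store: split text chunks, OCR'd image text, and one "table card" per DuckDB table (named via _tbl, so a file stem like "2024 Sales" becomes t_2024_sales). A minimal usage sketch under that assumption, with hypothetical CSV/Excel files under data/:

    from any_chatbot.indexing import embed_and_index_all_docs, DATA

    embeddings, vector_store = embed_and_index_all_docs(DATA)

    # a schema question should surface a table card, whose metadata points
    # at the exact DuckDB table to query for precise answers
    hits = vector_store.similarity_search("what columns does the excel have?", k=3)
    for doc in hits:
        if doc.metadata.get("source_type") == "table_summary":
            print(doc.metadata["table"], "->", doc.metadata["db_path"])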

src/any_chatbot/retrievers.py

Whitespace-only changes.
