Skip to content

Commit f2d5307

Browse files
committed
Added an argparse flag (--load_data) to load new data; if it is not set, the existing FAISS index and DuckDB database are used
1 parent 46789b4 commit f2d5307

File tree

2 files changed

+38
-26
lines changed

2 files changed

+38
-26
lines changed

src/any_chatbot/agent.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,17 +37,22 @@ def parse_args() -> argparse.Namespace:
3737
help="Your input to agent",
3838
)
3939
p.add_argument(
40-
"--data_dir",
41-
type=Path,
42-
default=BASE / "data",
43-
help="Path to data dir where your files are uploaded",
40+
"--load_data",
41+
action="store_true",
42+
help="If set, (re)load and process all data files, rebuilding FAISS and DuckDB. If not set, just use existing data.",
4443
)
4544
p.add_argument(
4645
"--thread_id",
4746
type=str,
4847
default=str(random.random()),
4948
help="Your conversation history ID. Different IDs save different chat histories with agent",
5049
)
50+
p.add_argument(
51+
"--data_dir",
52+
type=Path,
53+
default=BASE / "data",
54+
help="Path to data dir where your files are uploaded",
55+
)
5156
p.add_argument(
5257
"--outputs_dir",
5358
type=Path,
@@ -67,7 +72,9 @@ def main() -> None:
6772
cfg = parse_args()
6873
load_environ_vars()
6974
# INDEXING
70-
_, vector_store = embed_and_index_all_docs(cfg.data_dir, cfg.database_dir)
75+
_, vector_store = embed_and_index_all_docs(
76+
cfg.data_dir, cfg.database_dir, load_data=cfg.load_data
77+
)
7178

7279
# BUILD LLM
7380
llm = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

src/any_chatbot/indexing.py

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -189,34 +189,39 @@ def build_duckdb_and_summary_cards(
189189
return summary_cards
190190

191191

192-
def reset_faiss_index(index_path: Path):
193-
if index_path.exists():
194-
print("Reseting previous index...")
195-
shutil.rmtree(index_path)
196-
197-
198192
def embed_and_index_all_docs(
199193
data_dir: Path = DATA,
200194
db_path: Path = DATA / "generated_db" / "csv_excel_to_db.duckdb",
201195
index_path: Path = DATA / "generated_db" / "faiss_index",
196+
load_data: bool = False,
202197
):
203-
# delete old FAISS index if it exists
204-
reset_faiss_index(index_path)
205-
206198
# load embeddings and vector store
207199
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
208200

209-
# LOAD AND SPLIT TEXT DOCS
210-
text_chunks = load_and_split_text_docs(data_dir)
211-
# LOAD IMAGES (OCR converts image -> text)
212-
image_text_docs = load_image_docs_as_text(data_dir)
213-
# LOAD AND SPLIT CSV/EXCEL DOCS
214-
summary_cards = build_duckdb_and_summary_cards(data_dir, db_path)
215-
216-
# vector_store.add_documents(text_chunks + image_text_docs + summary_cards)
217-
vector_store = FAISS.from_documents(
218-
text_chunks + image_text_docs + summary_cards, embeddings
219-
)
220-
vector_store.save_local(index_path)
201+
if not load_data and index_path.exists():
202+
# load existing FAISS index
203+
vector_store = FAISS.load_local(
204+
index_path, embeddings, allow_dangerous_deserialization=True
205+
)
206+
print("Loaded existing FAISS index and database.")
207+
else:
208+
# delete old FAISS index if it exists
209+
if index_path.exists():
210+
print("Reseting previous index...")
211+
shutil.rmtree(index_path)
212+
213+
# LOAD AND SPLIT TEXT DOCS
214+
text_chunks = load_and_split_text_docs(data_dir)
215+
# LOAD IMAGES (OCR converts image -> text)
216+
image_text_docs = load_image_docs_as_text(data_dir)
217+
# LOAD AND SPLIT CSV/EXCEL DOCS
218+
summary_cards = build_duckdb_and_summary_cards(data_dir, db_path)
219+
220+
# vector_store.add_documents(text_chunks + image_text_docs + summary_cards)
221+
vector_store = FAISS.from_documents(
222+
text_chunks + image_text_docs + summary_cards, embeddings
223+
)
224+
vector_store.save_local(index_path)
225+
print("Built and saved new FAISS index.")
221226

222227
return embeddings, vector_store

0 commit comments

Comments
 (0)