
Commit 46789b4: added persistent memory and parse args
Parent: 07f0717

6 files changed, 126 insertions(+), 56 deletions(-)


requirements.txt

Lines changed: 3 additions & 1 deletion
@@ -10,4 +10,6 @@ langgraph
 unstructured[pdf,docx,pptx,md,image]
 duckdb
 duckdb-engine
-openpyxl
+openpyxl
+faiss-cpu
+langgraph-checkpoint-sqlite
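
The two new pins line up with the rest of the commit: faiss-cpu backs the on-disk FAISS index that replaces the in-memory vector store in indexing.py, and langgraph-checkpoint-sqlite provides the SqliteSaver checkpointer behind the new persistent memory. A minimal import check after installing, not part of the commit itself:

# Quick smoke test for the new dependencies (assumes `pip install -r requirements.txt` has run).
from langchain_community.vectorstores import FAISS       # uses faiss-cpu once an index is built
from langgraph.checkpoint.sqlite import SqliteSaver      # shipped by langgraph-checkpoint-sqlite

print(FAISS.__name__, SqliteSaver.__name__)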

scripts/run_agent.sh

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+python -m any_chatbot.agent "$@"

src/any_chatbot/agent.py

Lines changed: 88 additions & 50 deletions
@@ -1,73 +1,111 @@
-import getpass
 import os
+import argparse
 import random
+import sqlite3
 from dotenv import load_dotenv
 from pathlib import Path
 
 from langgraph.prebuilt import create_react_agent
-from langgraph.checkpoint.memory import MemorySaver
+from langgraph.checkpoint.sqlite import SqliteSaver
 from langchain.chat_models import init_chat_model
 
 from any_chatbot.indexing import embed_and_index_all_docs
 from any_chatbot.tools import initialize_retrieve_tool, initialize_sql_toolkit
 from any_chatbot.prompts import system_message
+from any_chatbot.utils import load_environ_vars
 
 load_dotenv()
 
 BASE = Path(__file__).parent.parent.parent
-DATA = BASE / "data"
-OUTPUTS = BASE / "outputs"
-DATABASE = DATA / "csv_excel_to_db" / "my_data.duckdb"
 
-# INDEXING
-embeddings, vector_store = embed_and_index_all_docs(DATA, DATABASE)
 
-# BUILD LLM
-if not os.environ.get("GOOGLE_API_KEY"):
-    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")
-llm = init_chat_model("gemini-2.0-flash", model_provider="google_genai")
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments for the agent run."""
+    p = argparse.ArgumentParser()
 
-# LOAD TOOLS
-retrieve_tool = initialize_retrieve_tool(vector_store)
-sql_tools = initialize_sql_toolkit(llm, DATABASE)
+    p.add_argument(
+        "--ask",
+        type=str,
+        default=(
+            "What kinds (images, text docs, or excel sheets) are available in the documents I have provided to you? Use the function call to retrieve information for each type first.\n\n"
+            # "What columns does the excel have? Once you have found the answer, tell me their types too.\n\n"
+            # "Once you have that answer, I want you to calculate the median for each column.\n\n"
+            "When you don't know which files the user is talking about, use the function call to retrieve what data is available with a general prompt.\n\n"
+            "You can refine your semantic search queries and try multiple times with different queries until you reasonably determine the data is not available in the given documents.\n\n"
+            "Base your answers only on the information retrieved through the function call you have. You can retrieve MULTIPLE TIMES."
+        ),
+        help="Your input to the agent",
+    )
+    p.add_argument(
+        "--data_dir",
+        type=Path,
+        default=BASE / "data",
+        help="Path to the data dir where your files are uploaded",
+    )
+    p.add_argument(
+        "--thread_id",
+        type=str,
+        default=str(random.random()),
+        help="Conversation history ID. Different IDs keep separate chat histories with the agent",
+    )
+    p.add_argument(
+        "--outputs_dir",
+        type=Path,
+        default=BASE / "outputs",
+        help="Path to the output dir where the agent architecture image is saved",
+    )
+    p.add_argument(
+        "--database_dir",
+        type=Path,
+        default=BASE / "data" / "generated_db" / "csv_excel_to_db.duckdb",
+        help="Path to the database file where the SQL version of CSV/Excel files is stored",
+    )
+    return p.parse_args()
 
-# BUILD AGENT
-# build checkpointer
-memory = MemorySaver()
-# build agent
-agent_executor = create_react_agent(
-    llm, [retrieve_tool, *sql_tools], prompt=system_message, checkpointer=memory
-)
-# save architecture graph image
-png_bytes = agent_executor.get_graph().draw_mermaid_png()
-# save to file
-with open(OUTPUTS / "graph.png", "wb") as f:
-    f.write(png_bytes)
-print("Created graph.png")
 
-# PROMPT
-# specify an ID for the thread
-# config = {"configurable": {"thread_id": "abc123"}}
-config = {"configurable": {"thread_id": random.random()}}
+def main() -> None:
+    cfg = parse_args()
+    load_environ_vars()
+    # INDEXING
+    _, vector_store = embed_and_index_all_docs(cfg.data_dir, cfg.database_dir)
 
-# input_message = (
-#     "What is the content of the image?\n\n"
-#     "When you don't know while files the user is talking about, use the functional call to retrieve what data is available with a general prompt.\n\n"
-#     "Base your answers only on the retrieved information thorugh the functional call you have. You can retreive MULTIPLE TIMES"
-# )
+    # BUILD LLM
+    llm = init_chat_model("gemini-2.0-flash", model_provider="google_genai")
 
-input_message = (
-    "What kinds (images, text docs, or excel sheets) are available in the documents I have provided to you? Use the functional call to retrieve information for each type first.\n\n"
-    # "What colums does the excel have? once you found the answer, tell me there types too.\n\n"
-    # "Once you have that answer, I want you to calculate the median for each column.\n\n"
-    "When you don't know while files the user is talking about, use the functional call to retrieve what data is available with a general prompt.\n\n"
-    "You can refine your semantic search queries and try multiple times with different queries until you resonably determine the data is not available on the given documents.\n\n"
-    "Base your answers only on the retrieved information thorugh the functional call you have. You can retreive MULTIPLE TIMES"
-)
+    # LOAD TOOLS
+    retrieve_tool = initialize_retrieve_tool(vector_store)
+    sql_tools = initialize_sql_toolkit(llm, cfg.database_dir)
 
-for event in agent_executor.stream(
-    {"messages": [{"role": "user", "content": input_message}]},
-    stream_mode="values",
-    config=config,
-):
-    event["messages"][-1].pretty_print()
+    # BUILD AGENT
+    # build persistent checkpointer
+    con = sqlite3.connect(
+        cfg.data_dir / "generated_db" / "agent_history.db", check_same_thread=False
+    )
+    memory = SqliteSaver(con)
+    # build agent
+    agent_executor = create_react_agent(
+        llm, [retrieve_tool, *sql_tools], prompt=system_message, checkpointer=memory
+    )
+    # save architecture graph image
+    png_bytes = agent_executor.get_graph().draw_mermaid_png()
+    # ensure the output folder exists
+    os.makedirs(cfg.outputs_dir, exist_ok=True)
+    # save to file
+    with open(cfg.outputs_dir / "graph.png", "wb") as f:
+        f.write(png_bytes)
+    print("Created graph.png")
+
+    # PROMPT
+    # specify an ID for the thread
+    config = {"configurable": {"thread_id": cfg.thread_id}}
+    # stream conversation
+    for event in agent_executor.stream(
+        {"messages": [{"role": "user", "content": cfg.ask}]},
+        stream_mode="values",
+        config=config,
+    ):
+        event["messages"][-1].pretty_print()
+
+
+if __name__ == "__main__":
+    main()
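
The MemorySaver-to-SqliteSaver switch is what makes conversations survive between runs: checkpoints now land in data/generated_db/agent_history.db, and rerunning the agent with the same --thread_id resumes that thread instead of starting fresh. A minimal sketch of the same pattern outside the agent (the path and thread ID here are illustrative, not taken from the commit):

import sqlite3

from langgraph.checkpoint.sqlite import SqliteSaver

# Open (or create) the checkpoint database. check_same_thread=False lets
# LangGraph reuse this connection from its own worker threads.
con = sqlite3.connect("agent_history.db", check_same_thread=False)
memory = SqliteSaver(con)

# Any graph compiled with this checkpointer keeps state per thread_id, so a
# later process that passes the same ID continues the earlier conversation.
config = {"configurable": {"thread_id": "my-chat"}}
# agent_executor = create_react_agent(llm, tools, checkpointer=memory)
# agent_executor.invoke({"messages": [{"role": "user", "content": "hi"}]}, config=config)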

src/any_chatbot/indexing.py

Lines changed: 20 additions & 4 deletions
@@ -2,10 +2,11 @@
 import re
 import pandas as pd
 import duckdb
+import shutil
 from dotenv import load_dotenv
 from pathlib import Path
 
-from langchain_core.vectorstores import InMemoryVectorStore
+from langchain_community.vectorstores import FAISS
 from langchain_google_genai import GoogleGenerativeAIEmbeddings
 from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -188,12 +189,22 @@ def build_duckdb_and_summary_cards(
     return summary_cards
 
 
+def reset_faiss_index(index_path: Path):
+    if index_path.exists():
+        print("Resetting previous index...")
+        shutil.rmtree(index_path)
+
+
 def embed_and_index_all_docs(
-    data_dir: Path = DATA, db_path: Path = DATA / "csv_excel_to_db" / "my_data.duckdb"
+    data_dir: Path = DATA,
+    db_path: Path = DATA / "generated_db" / "csv_excel_to_db.duckdb",
+    index_path: Path = DATA / "generated_db" / "faiss_index",
 ):
+    # delete old FAISS index if it exists
+    reset_faiss_index(index_path)
+
     # load embeedings and vector store
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-    vector_store = InMemoryVectorStore(embeddings)
 
     # LOAD AND SPLIT TEXT DOCS
     text_chunks = load_and_split_text_docs(data_dir)
@@ -202,5 +213,10 @@ def embed_and_index_all_docs(
     # LOAD AND SPLIT CSV/EXCEL DOCS
     summary_cards = build_duckdb_and_summary_cards(data_dir, db_path)
 
-    vector_store.add_documents(text_chunks + image_text_docs + summary_cards)
+    # vector_store.add_documents(text_chunks + image_text_docs + summary_cards)
+    vector_store = FAISS.from_documents(
+        text_chunks + image_text_docs + summary_cards, embeddings
+    )
+    vector_store.save_local(index_path)
+
     return embeddings, vector_store
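
Because the index is now written to disk with save_local, a later process can reload it instead of re-embedding every document. A rough sketch of the reload path, assuming the same embedding model, the default index location, and GOOGLE_API_KEY already set (FAISS pickles its docstore, hence the explicit opt-in flag):

from pathlib import Path

from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Must match the model used when the index was built, or the vectors won't line up.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
index_path = Path("data") / "generated_db" / "faiss_index"

# load_local deserializes a pickled docstore, so loading is an explicit opt-in.
vector_store = FAISS.load_local(
    str(index_path), embeddings, allow_dangerous_deserialization=True
)
docs = vector_store.similarity_search("what data is available?", k=2)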

src/any_chatbot/tools.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ def retrieve(
     retrieved_docs = vector_store.similarity_search(
         query,
         k=2,
-        filter=lambda doc: doc.metadata.get("source_type") == tag,
+        filter={"source_type": tag},
     )
     serialized = "\n\n".join(
         (f"Source: {doc.metadata}\nContent: {doc.page_content}")

src/any_chatbot/utils.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+import getpass
+import os
+
+
+def load_environ_vars() -> None:
+    """Set basic environment variables needed for a run."""
+    if not os.environ.get("GOOGLE_API_KEY"):
+        os.environ["GOOGLE_API_KEY"] = getpass.getpass(
+            "Enter API key for Google Gemini: "
+        )
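
load_environ_vars only prompts when GOOGLE_API_KEY is missing, so exporting the key beforehand (or keeping it in the .env file that agent.py already loads) skips the interactive getpass prompt entirely. For example (the key value is a placeholder):

import os

from any_chatbot.utils import load_environ_vars

os.environ.setdefault("GOOGLE_API_KEY", "<your-api-key>")  # placeholder, normally set via .env
load_environ_vars()  # no prompt: the key is already present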
