Skip to content

Commit ef5dae3

Browse files
committed
added sql query toolkit for agent
1 parent 9297bf0 commit ef5dae3

File tree

4 files changed

+55
-49
lines changed

4 files changed

+55
-49
lines changed

src/any_chatbot/agent.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,18 @@
99
from langchain.chat_models import init_chat_model
1010

1111
from any_chatbot.indexing import embed_and_index_all_docs
12-
from any_chatbot.tools import initialize_retrieve_tool
12+
from any_chatbot.tools import initialize_retrieve_tool, initialize_sql_toolkit
13+
from any_chatbot.prompts import system_message
1314

1415
load_dotenv()
1516

1617
BASE = Path(__file__).parent.parent.parent
1718
DATA = BASE / "data"
1819
OUTPUTS = BASE / "outputs"
20+
DATABASE = DATA / "csv_excel_to_db" / "my_data.duckdb"
1921

2022
# INDEXING
21-
embeddings, vector_store = embed_and_index_all_docs(DATA)
23+
embeddings, vector_store = embed_and_index_all_docs(DATA, DATABASE)
2224

2325
# BUILD LLM
2426
if not os.environ.get("GOOGLE_API_KEY"):
@@ -27,12 +29,15 @@
2729

2830
# LOAD TOOLS
2931
retrieve_tool = initialize_retrieve_tool(vector_store)
32+
sql_tools = initialize_sql_toolkit(llm, DATABASE)
3033

3134
# BUILD AGENT
3235
# build checkpointer
3336
memory = MemorySaver()
3437
# build agent
35-
agent_executor = create_react_agent(llm, [retrieve_tool], checkpointer=memory)
38+
agent_executor = create_react_agent(
39+
llm, [retrieve_tool, *sql_tools], prompt=system_message, checkpointer=memory
40+
)
3641
# save architecture graph image
3742
png_bytes = agent_executor.get_graph().draw_mermaid_png()
3843
# save to file
@@ -52,8 +57,11 @@
5257
# )
5358

5459
input_message = (
55-
"What colums does the excel have? once you found the answer, tell me there types too.\n\n"
60+
"How many employees were working for Nike? The information is in the pdf.\n\n"
61+
# "What columns does the excel have? once you found the answer, tell me their types too.\n\n"
62+
# "Once you have that answer, I want you to calculate the median for each column.\n\n"
5663
"When you don't know which files the user is talking about, use the functional call to retrieve what data is available with a general prompt.\n\n"
64+
"You can refine your semantic search queries and try multiple times with different queries until you reasonably determine the data is not available on the given documents.\n\n"
5765
"Base your answers only on the retrieved information through the functional call you have. You can retrieve MULTIPLE TIMES"
5866
)
5967

src/any_chatbot/indexing.py

Lines changed: 1 addition & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def load_and_split_text_docs(data_dir):
3838
glob=globs,
3939
loader_cls=UnstructuredFileLoader,
4040
)
41-
print(f"Loading files from {data_dir}")
41+
print(f"Loading text files from {data_dir}")
4242
docs = loader.load()
4343
print(f"Loaded {len(docs)} text files")
4444
# split
@@ -204,47 +204,3 @@ def embed_and_index_all_docs(
204204

205205
vector_store.add_documents(text_chunks + image_text_docs + summary_cards)
206206
return embeddings, vector_store
207-
208-
209-
# def index_text_docs(
210-
# data_pth: Path = DATA,
211-
# ):
212-
# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
213-
# vector_store = InMemoryVectorStore(embeddings)
214-
215-
# # Load the text documents
216-
# loader = DirectoryLoader(
217-
# str(data_pth),
218-
# glob=[
219-
# "**/*.pdf",
220-
# "**/*.docx",
221-
# "**/*.pptx",
222-
# "**/*.md",
223-
# "**/*.html",
224-
# "**/*.txt",
225-
# "**/*.png",
226-
# "**/*.jpg",
227-
# "**/*.jpeg",
228-
# "**/*.tiff",
229-
# ],
230-
# loader_cls=UnstructuredFileLoader,
231-
# )
232-
# print(f"Loading files from {data_pth}")
233-
# docs = loader.load()
234-
# print(f"Loaded {len(docs)} files")
235-
236-
# # Split the texts
237-
# text_splitter = RecursiveCharacterTextSplitter(
238-
# chunk_size=1000,
239-
# chunk_overlap=200,
240-
# add_start_index=True,
241-
# separators=["\n\n", "\n", " ", ""],
242-
# )
243-
# all_splits = text_splitter.split_documents(docs)
244-
# print(len(all_splits))
245-
246-
# # index the docs
247-
# ids = vector_store.add_documents(documents=all_splits)
248-
# print(len(ids))
249-
250-
# return embeddings, vector_store

src/any_chatbot/prompts.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
system_message = """
2+
You are an agent designed to interact with a SQL database.
3+
Given an input question, create a syntactically correct {dialect} query to run,
4+
then look at the results of the query and return the answer. Unless the user
5+
specifies a specific number of examples they wish to obtain, always limit your
6+
query to at most {top_k} results.
7+
8+
You can order the results by a relevant column to return the most interesting
9+
examples in the database. Never query for all the columns from a specific table,
10+
only ask for the relevant columns given the question.
11+
12+
You MUST double check your query before executing it. If you get an error while
13+
executing a query, rewrite the query and try again.
14+
15+
DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the
16+
database.
17+
18+
To start you should ALWAYS look at the tables in the database to see what you
19+
can query. Do NOT skip this step.
20+
21+
Then you should query the schema of the most relevant tables.
22+
""".format(
23+
dialect="DuckDB",
24+
top_k=5,
25+
)

src/any_chatbot/tools.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
11
from typing import Tuple, List
2+
from pathlib import Path
3+
24
from langchain_core.tools import tool
35
from langchain.vectorstores.base import VectorStore
46
from langchain.schema import Document
7+
from langchain_community.utilities.sql_database import SQLDatabase
8+
from langchain_community.agent_toolkits import SQLDatabaseToolkit
9+
10+
BASE = Path(__file__).parent.parent.parent
11+
DATA = BASE / "data"
512

613

714
def initialize_retrieve_tool(vector_store: VectorStore):
@@ -18,3 +25,13 @@ def retrieve(query: str) -> Tuple[str, List[Document]]:
1825
return serialized, retrieved_docs
1926

2027
return retrieve
28+
29+
30+
def initialize_sql_toolkit(
31+
llm,
32+
db_path: Path = DATA / "csv_excel_to_db" / "my_data.duckdb",
33+
):
34+
db = SQLDatabase.from_uri(f"duckdb:///{db_path}")
35+
toolkit = SQLDatabaseToolkit(db=db, llm=llm)
36+
tools = toolkit.get_tools()
37+
return tools

0 commit comments

Comments
 (0)