
Commit ecf3781

transferred notebook to Python scripts with DRY and fixed dir indexing errors
1 parent c170fe0 commit ecf3781

5 files changed: +138 −2 lines changed

notebooks/experiments/00_semantic_search.ipynb

Lines changed: 0 additions & 1 deletion
@@ -191,7 +191,6 @@
 " TextLoader,\n",
 " UnstructuredWordDocumentLoader,\n",
 " Unsc\n",
-" \n",
 ")\n",
 "from langchain.document_loaders import UnstructuredFileLoader\n",
 "\n",

requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -6,4 +6,5 @@ langchain-pinecone
 pypdf
 python-dotenv
 pinecone
-langgraph
+langgraph
+unstructured[pdf,docx,pptx,md]
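
The new unstructured[pdf,docx,pptx,md] extra supplies the file-type parsers that UnstructuredFileLoader (see indexing.py below) relies on. A quick sanity check that the extras installed, as a sketch; data/example.pdf is a hypothetical file:

# sketch: partition() is the generic entry point the Unstructured loaders build on
from unstructured.partition.auto import partition

elements = partition(filename="data/example.pdf")  # hypothetical sample file
print(f"Parsed {len(elements)} elements")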

src/any_chatbot/agent.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+import getpass
+import os
+import random
+from dotenv import load_dotenv
+from pathlib import Path
+
+from langgraph.prebuilt import create_react_agent
+from langgraph.checkpoint.memory import MemorySaver
+from langchain.chat_models import init_chat_model
+
+from any_chatbot.indexing import index_text_docs
+from any_chatbot.tools import initialize_retrieve_tool
+
+load_dotenv()
+
+BASE = Path(__file__).parent.parent.parent
+DATA = BASE / "data"
+OUTPUTS = BASE / "outputs"
+
+# INDEXING
+embeddings, vector_store = index_text_docs(DATA)
+
+# BUILD LLM
+if not os.environ.get("GOOGLE_API_KEY"):
+    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")
+llm = init_chat_model("gemini-2.0-flash", model_provider="google_genai")
+
+# LOAD TOOLS
+retrieve_tool = initialize_retrieve_tool(vector_store)
+
+# BUILD AGENT
+# build checkpointer so the agent keeps per-thread conversation state
+memory = MemorySaver()
+agent_executor = create_react_agent(llm, [retrieve_tool], checkpointer=memory)
+
+# save the architecture graph image
+png_bytes = agent_executor.get_graph().draw_mermaid_png()
+with open(OUTPUTS / "graph.png", "wb") as f:
+    f.write(png_bytes)
+print("Wrote graph.png")
+
+# PROMPT
+# use a fresh thread ID per run; a fixed ID (e.g. "abc123") would resume memory
+config = {"configurable": {"thread_id": random.random()}}
+
+input_message = (
+    "First retrieve what the revenue for Nike in 2023 was using the function call.\n\n"
+    "Once you get the answer, do a second retrieve to tell me which distribution centers Nike has.\n\n"
+    "Once you get the second answer, tell me how many employees Nike has. You can retrieve MULTIPLE TIMES.\n\n"
+    "Base your answers only on the information retrieved through the function call you have."
+)
+
+for event in agent_executor.stream(
+    {"messages": [{"role": "user", "content": input_message}]},
+    stream_mode="values",
+    config=config,
+):
+    event["messages"][-1].pretty_print()
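
Because thread_id is a fresh random value on every run, each run starts a new conversation. A minimal sketch (not part of the commit) of how a fixed thread ID would let MemorySaver resume state across calls; "nike-session-1" is a hypothetical ID:

config = {"configurable": {"thread_id": "nike-session-1"}}  # hypothetical stable ID

agent_executor.invoke(
    {"messages": [{"role": "user", "content": "What was Nike's revenue in 2023?"}]},
    config=config,
)
# same thread_id: the checkpointer restores the earlier messages, so a
# follow-up can refer back to the previous answer
result = agent_executor.invoke(
    {"messages": [{"role": "user", "content": "And how many employees does it have?"}]},
    config=config,
)
print(result["messages"][-1].content)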

src/any_chatbot/indexing.py

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+from dotenv import load_dotenv
+from pathlib import Path
+
+from langchain_core.vectorstores import InMemoryVectorStore
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+load_dotenv()
+
+BASE = Path(__file__).parent.parent.parent
+DATA = BASE / "data"
+
+
+def index_text_docs(
+    data_pth: Path = DATA,
+):
+    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+    vector_store = InMemoryVectorStore(embeddings)
+
+    # load every supported document type under data_pth
+    loader = DirectoryLoader(
+        str(data_pth),
+        glob=[
+            "**/*.pdf",
+            "**/*.docx",
+            "**/*.pptx",
+            "**/*.md",
+            "**/*.html",
+            "**/*.txt",
+        ],
+        loader_cls=UnstructuredFileLoader,
+    )
+    print(f"Loading docs from {data_pth}")
+    docs = loader.load()
+    print(f"Loaded {len(docs)} docs")
+
+    # split the texts into overlapping chunks
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        add_start_index=True,
+        separators=["\n\n", "\n", " ", ""],
+    )
+    all_splits = text_splitter.split_documents(docs)
+    print(f"Split into {len(all_splits)} chunks")
+
+    # embed and index the chunks
+    ids = vector_store.add_documents(documents=all_splits)
+    print(f"Indexed {len(ids)} chunks")
+
+    return embeddings, vector_store
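
For reference, a usage sketch (not in the commit) of building the index and querying it directly; the query string is only an example:

from any_chatbot.indexing import index_text_docs

embeddings, vector_store = index_text_docs()  # defaults to <repo>/data
hits = vector_store.similarity_search("Nike revenue in 2023", k=3)
for doc in hits:
    print(doc.metadata.get("source"), "->", doc.page_content[:80])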

src/any_chatbot/tools.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+from typing import Tuple, List
+
+from langchain_core.tools import tool
+from langchain.vectorstores.base import VectorStore
+from langchain.schema import Document
+
+
+def initialize_retrieve_tool(vector_store: VectorStore):
+    @tool(
+        description="Retrieve information related to a query",
+        response_format="content_and_artifact",
+    )
+    def retrieve(
+        query: str,
+    ) -> Tuple[str, List[Document]]:
+        # top-3 nearest chunks for the query
+        retrieved_docs = vector_store.similarity_search(query, k=3)
+        serialized = "\n\n".join(
+            f"Source: {doc.metadata}\nContent: {doc.page_content}"
+            for doc in retrieved_docs
+        )
+        return serialized, retrieved_docs
+
+    return retrieve
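
A usage sketch (not in the commit): invoked directly with plain arguments, a content_and_artifact tool returns only the serialized content string; the Document artifacts are attached only when it is called with a ToolCall, as the agent does:

retrieve = initialize_retrieve_tool(vector_store)  # vector_store from index_text_docs
print(retrieve.invoke({"query": "How many employees does Nike have?"}))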
