| 1 | +import os |
| 2 | +import re |
| 3 | +import pandas as pd |
| 4 | +import duckdb |
1 | 5 | from dotenv import load_dotenv |
2 | 6 | from pathlib import Path |
3 | 7 |
4 | 8 | from langchain_core.vectorstores import InMemoryVectorStore |
5 | 9 | from langchain_google_genai import GoogleGenerativeAIEmbeddings |
6 | 10 | from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader |
7 | 11 | from langchain_text_splitters import RecursiveCharacterTextSplitter |
| 12 | +from langchain_core.documents import Document |
8 | 13 |
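|  | +# NOTE: a sketch of assumed dependencies (not pinned by this module): |
|  | +# langchain-core, langchain-community, langchain-google-genai, |
|  | +# langchain-text-splitters, unstructured, duckdb, pandas, python-dotenv. |
|  | +# The Google embeddings client reads its API key from the environment |
|  | +# (typically GOOGLE_API_KEY), loaded from .env via load_dotenv() below. |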
9 | 14 | load_dotenv() |
10 | 15 |
11 | 16 | BASE = Path(__file__).parent.parent.parent |
12 | 17 | DATA = BASE / "data" |
13 | 18 |
14 | 19 |
15 | | -def index_text_docs( |
16 | | - data_pth: Path = DATA, |
17 | | -): |
18 | | - embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001") |
19 | | - vector_store = InMemoryVectorStore(embeddings) |
| 20 | +def load_and_split_text_docs(data_dir: Path) -> list[Document]: |
| 21 | + text_chunks = [] |
| 22 | + globs = [ |
| 23 | + "**/*.pdf", |
| 24 | + "**/*.docx", |
| 25 | + "**/*.pptx", |
| 26 | + "**/*.md", |
| 27 | + "**/*.html", |
| 28 | + "**/*.txt", |
| 29 | + ] |
| 30 | +    # guardrail if no files matched |
| 31 | + if not any(next(data_dir.rglob(p), None) for p in globs): |
| 32 | + print(f"No text files found under {data_dir}; skipping.") |
| 33 | + return text_chunks |
20 | 34 |
21 | | - # Load the text documents |
| 35 | + print(f"Detected text files under {data_dir}") |
22 | 36 | loader = DirectoryLoader( |
23 | | - str(data_pth), |
24 | | - glob=[ |
25 | | - "**/*.pdf", |
26 | | - "**/*.docx", |
27 | | - "**/*.pptx", |
28 | | - "**/*.md", |
29 | | - "**/*.html", |
30 | | - "**/*.txt", |
31 | | - "**/*.png", |
32 | | - "**/*.jpg", |
33 | | - "**/*.jpeg", |
34 | | - "**/*.tiff", |
35 | | - ], |
| 37 | + str(data_dir), |
| 38 | + glob=globs, |
36 | 39 | loader_cls=UnstructuredFileLoader, |
37 | 40 | ) |
38 | | - print(f"Loading files from {data_pth}") |
| 41 | + print(f"Loading files from {data_dir}") |
39 | 42 | docs = loader.load() |
40 | | - print(f"Loaded {len(docs)} files") |
41 | | - |
42 | | - # Split the texts |
| 43 | + print(f"Loaded {len(docs)} text files") |
| 44 | +    # split into overlapping chunks |
43 | 45 | text_splitter = RecursiveCharacterTextSplitter( |
44 | 46 | chunk_size=1000, |
45 | 47 | chunk_overlap=200, |
46 | 48 | add_start_index=True, |
47 | 49 | separators=["\n\n", "\n", " ", ""], |
48 | 50 | ) |
49 | | - all_splits = text_splitter.split_documents(docs) |
50 | | - print(len(all_splits)) |
| 51 | + text_chunks = text_splitter.split_documents(docs) |
| 52 | + print(f"Split text chunks: {len(text_chunks)}") |
| 53 | +    # tag chunks so retrieval can distinguish text sources |
| 54 | + for chunk in text_chunks: |
| 55 | + chunk.metadata["source_type"] = "text" |
| 56 | + |
| 57 | + return text_chunks |
| 58 | + |
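|  | +# Usage sketch (illustrative; assumes DATA holds at least one supported file): |
|  | +#   chunks = load_and_split_text_docs(DATA) |
|  | +#   # -> list of ~1000-char Documents, each tagged source_type == "text" |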
| 59 | + |
| 60 | +def load_image_docs_as_text(data_dir: Path) -> list[Document]: |
| 61 | + image_text_docs = [] |
| 62 | + globs = [ |
| 63 | + "**/*.png", |
| 64 | + "**/*.jpg", |
| 65 | + "**/*.jpeg", |
| 66 | + "**/*.tiff", |
| 67 | + ] |
| 68 | +    # guardrail if no files matched |
| 69 | + if not any(next(data_dir.rglob(p), None) for p in globs): |
| 70 | + print(f"No images found under {data_dir}; skipping.") |
| 71 | + return image_text_docs |
| 72 | + |
| 73 | + print(f"Detected images under {data_dir}") |
| 74 | + loader = DirectoryLoader( |
| 75 | + str(data_dir), |
| 76 | + glob=globs, |
| 77 | + loader_cls=UnstructuredFileLoader, |
| 78 | + ) |
| 79 | + print(f"Loading images from {data_dir}") |
| 80 | + image_text_docs = loader.load() |
| 81 | + print(f"Loaded {len(image_text_docs)} image files") |
| 82 | +    # tag docs so retrieval can distinguish OCR-derived text |
| 83 | + for img in image_text_docs: |
| 84 | + img.metadata["source_type"] = "image_text" |
| 85 | + |
| 86 | + return image_text_docs |
| 87 | + |
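|  | +# NOTE: for images, UnstructuredFileLoader extracts text via OCR, which |
|  | +# typically requires a local Tesseract install; that is an environment |
|  | +# assumption this module does not verify. Usage sketch: |
|  | +#   image_docs = load_image_docs_as_text(DATA) |
|  | +#   # -> one Document per image, tagged source_type == "image_text" |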
| 88 | + |
| 89 | +def _tbl(name: str) -> str: |
| 90 | +    """Sanitize a string into a safe SQL (DuckDB) table name.""" |
| 91 | + name = re.sub(r"[^0-9a-zA-Z_]+", "_", name).strip("_") |
| 92 | + if not name or name[0].isdigit(): |
| 93 | + name = f"t_{name}" |
| 94 | + return name.lower() |
| 95 | + |
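|  | +# Illustrative behaviour of _tbl, derived from the regex above: |
|  | +#   _tbl("Sales Report.v2")  -> "sales_report_v2" |
|  | +#   _tbl("2024 results")     -> "t_2024_results" |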
| 96 | + |
| 97 | +def build_duckdb_and_summary_cards( |
| 98 | + data_dir: Path, |
| 99 | + db_path: Path, |
| 100 | +) -> list[Document]: |
| 101 | + summary_cards = [] |
| 102 | + # skip if there are no .csv/.xlsx/.xls files |
| 103 | + patterns = ("*.csv", "*.xlsx", "*.xls") |
| 104 | + if not any(next(data_dir.rglob(p), None) for p in patterns): |
| 105 | + print(f"No CSV or Excel files found under {data_dir}; skipping.") |
| 106 | + return summary_cards |
| 107 | + print(f"Detected CSV or Excel files under {data_dir}") |
| 108 | + # ensure the DB folder exists |
| 109 | + os.makedirs(db_path.parent, exist_ok=True) |
| 110 | +    # delete any existing DB file so each run rebuilds from scratch |
| 111 | +    if db_path.exists(): |
| 112 | +        db_path.unlink() |
| 113 | +    # connect, creating a fresh, empty DB |
| 114 | + with duckdb.connect(str(db_path)) as con: |
| 115 | + # ingest .csv files into DuckDB (overwrite on rerun) |
| 116 | + for fp in data_dir.rglob("*.csv"): |
| 117 | + table = _tbl(fp.stem) |
| 118 | + fp_sql = fp.as_posix().replace("'", "''") # escape single quotes |
| 119 | + con.execute( |
| 120 | + f""" |
| 121 | + CREATE OR REPLACE TABLE {table} AS |
| 122 | + SELECT * FROM read_csv_auto('{fp_sql}', header=true) |
| 123 | + """ |
| 124 | + ) |
| 125 | + |
| 126 | + # XLSX ingestion via pandas |
| 127 | + for fp in data_dir.rglob("*.xlsx"): |
| 128 | + try: |
| 129 | + xls = pd.ExcelFile(fp) # lists sheet names |
| 130 | + except Exception as e: |
| 131 | + print(f"Skip {fp.name}: {e}") |
| 132 | + continue |
| 133 | + |
| 134 | + # One table per sheet |
| 135 | + for sheet in xls.sheet_names: |
| 136 | + try: |
| 137 | + df = pd.read_excel(fp, sheet_name=sheet) |
| 138 | + except Exception as e: |
| 139 | + print(f"Skip {fp.name}:{sheet}: {e}") |
| 140 | + continue |
| 141 | + |
| 142 | + tmp_name = f"_tmp_{_tbl(fp.stem)}_{_tbl(sheet)}" |
| 143 | + con.register(tmp_name, df) |
51 | 144 |
52 | | - # index the docs |
53 | | - ids = vector_store.add_documents(documents=all_splits) |
54 | | - print(len(ids)) |
| 145 | + table = _tbl(f"{fp.stem}__{sheet}") |
| 146 | + con.execute( |
| 147 | + f""" |
| 148 | + CREATE OR REPLACE TABLE {table} AS |
| 149 | + SELECT * FROM {tmp_name}""" |
| 150 | + ) |
| 151 | + con.unregister(tmp_name) |
55 | 152 |
| 153 | + for fp in data_dir.rglob("*.xls"): |
| 154 | +        # legacy .xls isn't ingested (pandas would need the optional xlrd engine) |
| 155 | +        print(f"Skip {fp.name}: legacy .xls not handled; convert to .xlsx.") |
| 156 | + |
| 157 | + # build summary cards from DuckDB |
| 158 | + tables = [r[0] for r in con.execute("SHOW TABLES").fetchall()] |
| 159 | + for tbl in tables: |
| 160 | +            # DESCRIBE to get column names & types |
| 161 | + schema_rows = con.execute(f"DESCRIBE {tbl}").fetchall() |
| 162 | + col_names = [r[0] for r in schema_rows] |
| 163 | + col_types = [r[1] for r in schema_rows] |
| 164 | + nrows = con.execute(f"SELECT COUNT(*) FROM {tbl}").fetchone()[0] |
| 165 | + preview_df = con.execute(f"SELECT * FROM {tbl} LIMIT 5").df() |
| 166 | + |
| 167 | + col_str = ", ".join(f"{n}:{t}" for n, t in zip(col_names, col_types)) |
| 168 | + preview_txt = preview_df.to_string(index=False) |
| 169 | + |
| 170 | + text = ( |
| 171 | + f"TABLE CARD — {tbl}\n" |
| 172 | +                f"Columns (count={len(col_names)}; format 'column_name:data_type'): {col_str}\n" |
| 173 | + f"Rows: {nrows}\n\n" |
| 174 | + f"Sample rows (up to 5):\n{preview_txt}\n" |
| 175 | + ) |
| 176 | + |
| 177 | + summary_cards.append( |
| 178 | + Document( |
| 179 | + page_content=text, |
| 180 | + metadata={ |
| 181 | + "source_type": "table_summary", |
| 182 | + "table": tbl, |
| 183 | + "db_path": str(db_path), |
| 184 | + }, |
| 185 | + ) |
| 186 | + ) |
| 187 | + |
| 188 | + return summary_cards |
| 189 | + |
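|  | +# Sketch of downstream use: a retrieved card's metadata carries the real |
|  | +# `table` and `db_path`, so a caller could run SQL against it ("sales" is |
|  | +# a hypothetical table name): |
|  | +#   with duckdb.connect(str(db_path)) as con: |
|  | +#       preview = con.execute("SELECT * FROM sales LIMIT 10").df() |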
| 190 | + |
| 191 | +def embed_and_index_all_docs( |
| 192 | + data_dir: Path = DATA, db_path: Path = DATA / "csv_excel_to_db" / "my_data.duckdb" |
| 193 | +): |
| 194 | +    # load embeddings model and in-memory vector store |
| 195 | + embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001") |
| 196 | + vector_store = InMemoryVectorStore(embeddings) |
| 197 | + |
| 198 | + # LOAD AND SPLIT TEXT DOCS |
| 199 | + text_chunks = load_and_split_text_docs(data_dir) |
| 200 | + # LOAD IMAGES (OCR converts image -> text) |
| 201 | + image_text_docs = load_image_docs_as_text(data_dir) |
| 202 | + # LOAD AND SPLIT CSV/EXCEL DOCS |
| 203 | + summary_cards = build_duckdb_and_summary_cards(data_dir, db_path) |
| 204 | + |
| 205 | + vector_store.add_documents(text_chunks + image_text_docs + summary_cards) |
56 | 206 | return embeddings, vector_store |
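|  | + |
|  | + |
|  | +# End-to-end usage sketch (illustrative; assumes the API key and data dir |
|  | +# above are in place): |
|  | +#   embeddings, vector_store = embed_and_index_all_docs() |
|  | +#   hits = vector_store.similarity_search("quarterly revenue", k=4) |
|  | +#   for doc in hits: |
|  | +#       print(doc.metadata.get("source_type"), doc.page_content[:80]) |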