1- import os , re , glob , pathlib , yaml
1+ """Utilities for turning a Logseq graph into a retriever-friendly index.
2+
3+ This script walks through each stage of a Retrieval Augmented Generation (RAG)
4+ workflow. The ``main`` function mirrors the typical steps:
5+
6+ 1. Collect source files from Logseq.
7+ 2. Clean and tag the raw markdown.
8+ 3. Split the content into small, retrievable "nodes".
9+ 4. Persist the resulting embeddings into a vector database (Chroma).
10+
11+ Running ``python ingest.py`` demonstrates how raw notes are transformed into
12+ something a chatbot can search. The helper functions below keep the individual
13+ tasks digestible and ready for experimentation.
14+ """
15+
16+ import glob
17+ import os
18+ import pathlib
19+ import re
220from typing import List
3- from llama_index .core import VectorStoreIndex , StorageContext , Document , Settings
21+
22+ import chromadb
23+ import yaml
24+ from llama_index .core import Document , Settings , StorageContext , VectorStoreIndex
425from llama_index .core .node_parser import SimpleNodeParser
526from llama_index .embeddings .ollama import OllamaEmbedding
627from llama_index .llms .ollama import Ollama
728from llama_index .vector_stores .chroma import ChromaVectorStore
8- import chromadb
929
10- CONFIG = yaml .safe_load (open ("config.yaml" , "r" ))
# Parse config.yaml exactly once at import time; every pipeline stage below
# reads its settings from this module-level mapping.
with open("config.yaml", "r", encoding="utf-8") as _config_file:
    CONFIG = yaml.safe_load(_config_file)
1132
# Pre-compiled patterns for the Logseq markup this module recognizes.
PAGE_LINK = re.compile(r"\[\[([^\]]+)\]\]")              # [[Page]] wiki links
BLOCK_REF = re.compile(r"\(\(([a-zA-Z0-9_-]{6,})\)\)")   # ((block-id)) references
TAG_HASH = re.compile(r"(?<!\w)#([A-Za-z0-9/_-]+)")      # inline #tag (not mid-word)
TAG_PROP = re.compile(r"^tags::\s*(.+)$", re.MULTILINE)  # "tags:: a, b" property lines
1637
def normalize_logseq_links(text: str) -> str:
    """Replace Logseq-specific link syntax with plain text.

    ``[[Page]]`` wiki links collapse to the bare page name, and
    ``((block-id))`` references become a readable ``[ref:block-id]`` marker,
    so the downstream embedding model sees natural language instead of
    wiki-style tokens.

    Args:
        text: Raw Logseq markdown.

    Returns:
        The text with link markup rewritten.
    """
    # Replacement templates are clearer (and cheaper) than lambdas here;
    # re.sub inserts group contents literally, so no re-escaping is needed.
    text = PAGE_LINK.sub(r"\1", text)
    text = BLOCK_REF.sub(r"[ref:\1]", text)
    return text
2149
2250def parse_tags (text : str ) -> List [str ]:
51+ """Extract Logseq tags from both inline ``#hashtags`` and ``tags::`` fields.
52+
53+ The returned list is sorted to keep results predictable when you display or
54+ filter by tags later in the workflow.
55+ """
56+
2357 tags = set ()
2458 for m in TAG_HASH .finditer (text ):
2559 tags .add (m .group (1 ))
@@ -31,10 +65,18 @@ def parse_tags(text: str) -> List[str]:
3165 return sorted (tags )
3266
def page_title_from_path(path: str) -> str:
    """Derive a human-friendly Logseq page title from a file path.

    Uses the file's stem (name without extension) and swaps underscores
    for hyphens to match Logseq's page-naming convention.
    """
    return pathlib.Path(path).stem.replace("_", "-")
3672
3773def collect_files (root : str , include_dirs : List [str ], file_exts : List [str ], exclude_globs : List [str ]) -> List [str ]:
74+ """Locate Logseq files to ingest based on the config settings.
75+
76+ This function connects the configuration knobs to the actual file system. It
77+ assembles the "raw corpus" that feeds the remaining steps of the pipeline.
78+ """
79+
3880 files = []
3981 for rel in include_dirs :
4082 base = os .path .join (root , rel )
@@ -46,10 +88,18 @@ def collect_files(root: str, include_dirs: List[str], file_exts: List[str], excl
4688 return [f for f in files if f not in excluded and os .path .isfile (f )]
4789
4890def load_documents (paths : List [str ]) -> List [Document ]:
91+ """Read markdown files and create ``Document`` objects with helpful metadata.
92+
93+ Each ``Document`` becomes a single unit of knowledge for LlamaIndex. We add
94+ metadata like the title, directory, and tags so the chat interface can show
95+ meaningful references instead of opaque file names.
96+ """
97+
4998 docs = []
5099 for p in paths :
51100 try :
52- txt = open (p , "r" , encoding = "utf-8" ).read ()
101+ with open (p , "r" , encoding = "utf-8" ) as f :
102+ txt = f .read ()
53103 except Exception :
54104 continue
55105
@@ -71,6 +121,12 @@ def load_documents(paths: List[str]) -> List[Document]:
71121 return docs
72122
73123def main ():
124+ """Run the full ingestion workflow using settings from ``config.yaml``.
125+
126+ Running this function end-to-end shows how data collection, cleaning,
127+ chunking, and indexing fit together in a practical RAG pipeline.
128+ """
129+
74130 root = CONFIG ["logseq_root" ]
75131 include_dirs = CONFIG ["include_dirs" ]
76132 file_exts = CONFIG ["file_exts" ]
0 commit comments