Skip to content

Commit b129ba0

Browse files
authored
Merge pull request #1 from crd/codex/improve-documentation-for-ingest.py-and-chat.py
Add illustrative docstrings
2 parents 0ef191d + 97265c8 commit b129ba0

File tree

2 files changed

+92
-10
lines changed

2 files changed

+92
-10
lines changed

chat.py

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,35 @@
1-
import os
2-
import yaml
3-
import chromadb
1+
"""Simple command-line chat client for exploring the indexed Logseq graph.
2+
3+
This module keeps the runtime experience intentionally transparent: it shows how
4+
to rebuild a query engine from the stored embeddings and how to send natural
5+
language questions to it. The print statements highlight how answers relate to
6+
the original notes.
7+
"""
48

9+
import chromadb
10+
import yaml
511
from llama_index.core import Settings, VectorStoreIndex
6-
from llama_index.llms.ollama import Ollama
712
from llama_index.embeddings.ollama import OllamaEmbedding
13+
from llama_index.llms.ollama import Ollama
814
from llama_index.vector_stores.chroma import ChromaVectorStore
915

10-
CONFIG = yaml.safe_load(open("config.yaml", "r"))
16+
with open("config.yaml", "r", encoding="utf-8") as f:
17+
CONFIG = yaml.safe_load(f)
1118

1219
def build_query_engine():
20+
"""Create a ``QueryEngine`` that can answer questions over the Logseq index.
21+
22+
The steps here mirror the high-level components of a RAG system: choose an
23+
LLM, choose an embedding model, open the vector store, then ask LlamaIndex
24+
for a query interface. Reading through the code reinforces the mental model
25+
introduced in ``ingest.py``.
26+
27+
Returns
28+
-------
29+
BaseQueryEngine
30+
The object that exposes ``query("...")`` for the interactive loop.
31+
"""
32+
1333
# Models (local via Ollama)
1434
Settings.llm = Ollama(
1535
model=CONFIG["models"]["llm"],
@@ -35,6 +55,12 @@ def build_query_engine():
3555
return query_engine
3656

3757
def main():
58+
"""Start an interactive chat loop backed by the previously ingested notes.
59+
60+
Type questions in plain English to see how the retriever surfaces relevant
61+
pages. Use ``:q`` to exit when you are done experimenting.
62+
"""
63+
3864
print("Loading query engine...")
3965
qe = build_query_engine()
4066
print("Ready. Type your question (or :q to quit).")

ingest.py

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,59 @@
1-
import os, re, glob, pathlib, yaml
1+
"""Utilities for turning a Logseq graph into a retriever-friendly index.
2+
3+
This script walks through each stage of a Retrieval Augmented Generation (RAG)
4+
workflow. The ``main`` function mirrors the typical steps:
5+
6+
1. Collect source files from Logseq.
7+
2. Clean and tag the raw markdown.
8+
3. Split the content into small, retrievable "nodes".
9+
4. Persist the resulting embeddings into a vector database (Chroma).
10+
11+
Running ``python ingest.py`` demonstrates how raw notes are transformed into
12+
something a chatbot can search. The helper functions below keep the individual
13+
tasks digestible and ready for experimentation.
14+
"""
15+
16+
import glob
17+
import os
18+
import pathlib
19+
import re
220
from typing import List
3-
from llama_index.core import VectorStoreIndex, StorageContext, Document, Settings
21+
22+
import chromadb
23+
import yaml
24+
from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
425
from llama_index.core.node_parser import SimpleNodeParser
526
from llama_index.embeddings.ollama import OllamaEmbedding
627
from llama_index.llms.ollama import Ollama
728
from llama_index.vector_stores.chroma import ChromaVectorStore
8-
import chromadb
929

10-
CONFIG = yaml.safe_load(open("config.yaml", "r"))
30+
with open("config.yaml", "r", encoding="utf-8") as f:
31+
CONFIG = yaml.safe_load(f)
1132

1233
PAGE_LINK = re.compile(r"\[\[([^\]]+)\]\]") # [[Page]]
1334
BLOCK_REF = re.compile(r"\(\(([a-zA-Z0-9_-]{6,})\)\)") # ((block-id))
1435
TAG_HASH = re.compile(r"(?<!\w)#([A-Za-z0-9/_-]+)") # #tag
1536
TAG_PROP = re.compile(r"^tags::\s*(.+)$", re.MULTILINE) # tags:: a, b
1637

1738
def normalize_logseq_links(text: str) -> str:
    """Strip Logseq wiki-style markup so the text reads as plain language.

    Wiki tokens such as ``[[Page]]`` and ``((block-id))`` add noise that the
    embedding model has never seen in natural prose.  Page links collapse to
    their bare title, and block references are rewritten into a bracketed
    ``[ref:...]`` marker that survives chunking.
    """
    without_page_links = PAGE_LINK.sub(r"\1", text)
    without_block_refs = BLOCK_REF.sub(
        lambda match: "[ref:" + match.group(1) + "]", without_page_links
    )
    return without_block_refs
2149

2250
def parse_tags(text: str) -> List[str]:
51+
"""Extract Logseq tags from both inline ``#hashtags`` and ``tags::`` fields.
52+
53+
The returned list is sorted to keep results predictable when you display or
54+
filter by tags later in the workflow.
55+
"""
56+
2357
tags = set()
2458
for m in TAG_HASH.finditer(text):
2559
tags.add(m.group(1))
@@ -31,10 +65,18 @@ def parse_tags(text: str) -> List[str]:
3165
return sorted(tags)
3266

3367
def page_title_from_path(path: str) -> str:
    """Derive a human-friendly Logseq page title from a file path.

    The title is the file's stem (no directory, no extension) with every
    underscore swapped for a hyphen, matching how Logseq displays page names.
    """
    return pathlib.Path(path).stem.replace("_", "-")
3672

3773
def collect_files(root: str, include_dirs: List[str], file_exts: List[str], exclude_globs: List[str]) -> List[str]:
74+
"""Locate Logseq files to ingest based on the config settings.
75+
76+
This function connects the configuration knobs to the actual file system. It
77+
assembles the "raw corpus" that feeds the remaining steps of the pipeline.
78+
"""
79+
3880
files = []
3981
for rel in include_dirs:
4082
base = os.path.join(root, rel)
@@ -46,10 +88,18 @@ def collect_files(root: str, include_dirs: List[str], file_exts: List[str], excl
4688
return [f for f in files if f not in excluded and os.path.isfile(f)]
4789

4890
def load_documents(paths: List[str]) -> List[Document]:
91+
"""Read markdown files and create ``Document`` objects with helpful metadata.
92+
93+
Each ``Document`` becomes a single unit of knowledge for LlamaIndex. We add
94+
metadata like the title, directory, and tags so the chat interface can show
95+
meaningful references instead of opaque file names.
96+
"""
97+
4998
docs = []
5099
for p in paths:
51100
try:
52-
txt = open(p, "r", encoding="utf-8").read()
101+
with open(p, "r", encoding="utf-8") as f:
102+
txt = f.read()
53103
except Exception:
54104
continue
55105

@@ -71,6 +121,12 @@ def load_documents(paths: List[str]) -> List[Document]:
71121
return docs
72122

73123
def main():
124+
"""Run the full ingestion workflow using settings from ``config.yaml``.
125+
126+
Running this function end-to-end shows how data collection, cleaning,
127+
chunking, and indexing fit together in a practical RAG pipeline.
128+
"""
129+
74130
root = CONFIG["logseq_root"]
75131
include_dirs = CONFIG["include_dirs"]
76132
file_exts = CONFIG["file_exts"]

0 commit comments

Comments
 (0)