import os, re, glob, pathlib, yaml
from typing import List
from llama_index.core import VectorStoreIndex, StorageContext, Document, Settings
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

with open("config.yaml", "r") as f:
    CONFIG = yaml.safe_load(f)

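# config.yaml keys assumed by this script (inferred from the CONFIG lookups
# below; the values shown are illustrative, not from the repo):
#
#   logseq_root: ~/notes
#   include_dirs: [pages, journals]
#   file_exts: [".md"]
#   exclude_globs: ["logseq/**"]
#   models:
#     llm: llama3.1
#     embedding: nomic-embed-text
#   chunk:
#     chunk_size: 1024
#     chunk_overlap: 128
#   storage:
#     chroma_path: ./chroma_db
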
PAGE_LINK = re.compile(r"\[\[([^\]]+)\]\]")              # [[Page]]
BLOCK_REF = re.compile(r"\(\(([a-zA-Z0-9_-]{6,})\)\)")   # ((block-id))
TAG_HASH = re.compile(r"(?<!\w)#([A-Za-z0-9/_-]+)")      # #tag
TAG_PROP = re.compile(r"^tags::\s*(.+)$", re.MULTILINE)  # tags:: a, b

def normalize_logseq_links(text: str) -> str:
    # Strip Logseq link syntax so the embedded text reads as plain prose.
    text = PAGE_LINK.sub(lambda m: m.group(1), text)
    text = BLOCK_REF.sub(lambda m: f"[ref:{m.group(1)}]", text)
    return text

def parse_tags(text: str) -> List[str]:
    tags = set()
    for m in TAG_HASH.finditer(text):
        tags.add(m.group(1))
    for m in TAG_PROP.finditer(text):
        raw = [t.strip(" ,#") for t in m.group(1).split(",")]
        for t in raw:
            if t:
                tags.add(t)
    return sorted(tags)

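# Quick sanity check (hypothetical inputs, not from the repo): both tag
# sources are merged and deduplicated, and links collapse to plain text:
#   parse_tags("tags:: project, idea\n#inbox")            -> ["idea", "inbox", "project"]
#   normalize_logseq_links("See [[My Page]] ((abcdef))")  -> "See My Page [ref:abcdef]"
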
def page_title_from_path(path: str) -> str:
    # Logseq journal files are named like 2024_01_15.md; swap underscores
    # back to hyphens to recover the displayed title.
    name = pathlib.Path(path).stem
    return name.replace("_", "-")

def collect_files(root: str, include_dirs: List[str], file_exts: List[str],
                  exclude_globs: List[str]) -> List[str]:
    files = []
    for rel in include_dirs:
        base = os.path.join(root, rel)
        for ext in file_exts:
            files.extend(glob.glob(os.path.join(base, f"**/*{ext}"), recursive=True))
    excluded = set()
    for pat in exclude_globs:
        excluded.update(glob.glob(os.path.join(root, pat), recursive=True))
    return [f for f in files if f not in excluded and os.path.isfile(f)]

def load_documents(paths: List[str]) -> List[Document]:
    docs = []
    for p in paths:
        try:
            with open(p, "r", encoding="utf-8") as f:
                txt = f.read()
        except (OSError, UnicodeDecodeError):
            continue

        clean = normalize_logseq_links(txt)

        # Compute tags from the raw text so tags_csv is in scope below.
        tags_list = parse_tags(txt)
        # Chroma metadata values must be str/int/float/bool, so store an
        # empty string (rather than None) when a page has no tags.
        tags_csv = ", ".join(tags_list)

        title = page_title_from_path(p)
        meta = {
            "source": p,
            "title": title,
            "tags": tags_csv,  # scalar (str), not a list
            "basename": os.path.basename(p),
            "dir": os.path.basename(os.path.dirname(p)),
        }
        docs.append(Document(text=clean, metadata=meta))
    return docs

def main():
    root = CONFIG["logseq_root"]
    include_dirs = CONFIG["include_dirs"]
    file_exts = CONFIG["file_exts"]
    exclude = CONFIG["exclude_globs"]

    if not os.path.isdir(root):
        raise SystemExit(f"Logseq root does not exist: {root}\nEdit config.yaml to set logseq_root.")

    paths = collect_files(root, include_dirs, file_exts, exclude)
    print(f"Found {len(paths)} markdown files.")

    docs = load_documents(paths)
    print(f"Loaded {len(docs)} documents.")

    # Local models via Ollama. Only the embedding model is exercised while
    # indexing; the LLM is registered here for query time.
    Settings.llm = Ollama(model=CONFIG["models"]["llm"], request_timeout=180)
    Settings.embed_model = OllamaEmbedding(model_name=CONFIG["models"]["embedding"])

    parser = SimpleNodeParser.from_defaults(
        include_metadata=True,
        chunk_size=CONFIG["chunk"]["chunk_size"],
        chunk_overlap=CONFIG["chunk"]["chunk_overlap"],
    )
    nodes = parser.get_nodes_from_documents(docs)
    print(f"Parsed into {len(nodes)} nodes.")

    # PersistentClient writes to disk as it goes, so no explicit persist
    # call is needed once the index is built.
    chroma_path = CONFIG["storage"]["chroma_path"]
    os.makedirs(chroma_path, exist_ok=True)
    client = chromadb.PersistentClient(path=chroma_path)
    collection = client.get_or_create_collection("logseq_rag")

    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_ctx = StorageContext.from_defaults(vector_store=vector_store)

    _ = VectorStoreIndex(nodes, storage_context=storage_ctx)
    print("Index built and persisted to Chroma.")

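# A minimal query-side sketch (not part of this commit; example_query is a
# hypothetical helper, assuming the index above was already built with the
# same config.yaml). It reopens the persisted Chroma collection instead of
# re-embedding anything.
def example_query(question: str):
    Settings.llm = Ollama(model=CONFIG["models"]["llm"], request_timeout=180)
    Settings.embed_model = OllamaEmbedding(model_name=CONFIG["models"]["embedding"])
    client = chromadb.PersistentClient(path=CONFIG["storage"]["chroma_path"])
    collection = client.get_or_create_collection("logseq_rag")
    index = VectorStoreIndex.from_vector_store(ChromaVectorStore(chroma_collection=collection))
    return index.as_query_engine().query(question)
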
if __name__ == "__main__":
    main()