
Commit 0ef191d

Initial commit (0 parents)

File tree: 8 files changed, +3287 −0 lines

.gitignore

Lines changed: 16 additions & 0 deletions

```
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv

# RAG location
.rag

# Config file
config.yaml
```

Makefile

Lines changed: 23 additions & 0 deletions

```makefile
.PHONY: venv install ingest chat lock clean

# Create/refresh a local .venv and install deps from pyproject/uv.lock
install:
	uv sync

# Optional: create venv explicitly (uv sync will also create one if missing)
venv:
	uv venv -q

# Run scripts using the project env without manual activation
ingest:
	uv run ingest.py

chat:
	uv run chat.py

# Create/update a lockfile explicitly (optional; uv sync also updates it)
lock:
	uv lock

clean:
	rm -rf .rag
```

README.md

Lines changed: 46 additions & 0 deletions

# logseq-chat (local RAG over Logseq)

A fully local RAG pipeline using LlamaIndex + Ollama + Chroma to query your Logseq notes.

## Prereqs
- Python 3.13+ (as pinned by `requires-python` in `pyproject.toml`)
- Ollama running (https://ollama.com)
- Pull a chat model and an embedding model:

```bash
ollama pull llama3.1
ollama pull nomic-embed-text
```

or (lighter weight):

```bash
ollama pull llama3.1
ollama pull all-minilm
```

## Setup
```bash
cd logseq-chat
make install
```

Copy `config.yaml.sample` to `config.yaml`, then at a minimum set `logseq_root` to your Logseq graph directory.

## Build index
```bash
make ingest
```

## Chat
```bash
make chat
```

### Example questions
- Summarize tasks tagged #home in October 2025.
- Find notes referencing [[Team Topologies]] and list my pros/cons.

## Notes
- Skips `assets/` by default. Enable OCR later if needed.
- Chunks Markdown by size with overlap (see `chunk` in `config.yaml`); tags from `#tag` and `tags::` are stored in metadata.
- For faster machines, try bigger models; for CPU-only, consider `llama3.2` or `qwen2.5:7b` and smaller chunks.
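
Before running `make ingest`, it can help to confirm Ollama is actually serving the pulled models. A minimal sketch, assuming Ollama's default HTTP endpoint at `localhost:11434` (its `/api/tags` route lists locally available models):

```python
# Quick sanity check: is Ollama running, and are the configured models pulled?
# Assumes the default endpoint at localhost:11434; GET /api/tags lists local models.
import json
import urllib.request

REQUIRED = {"llama3.1", "all-minilm"}  # adjust to match config.yaml

with urllib.request.urlopen("http://localhost:11434/api/tags") as resp:
    data = json.load(resp)

# Model names come back as e.g. "llama3.1:latest"; compare on the base name.
available = {m["name"].split(":")[0] for m in data.get("models", [])}
missing = REQUIRED - available
if missing:
    print("Missing models, run `ollama pull` for:", ", ".join(sorted(missing)))
else:
    print("All required models are available.")
```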

chat.py

Lines changed: 69 additions & 0 deletions

```python
import yaml
import chromadb

from llama_index.core import Settings, VectorStoreIndex
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

with open("config.yaml", "r") as f:
    CONFIG = yaml.safe_load(f)


def build_query_engine():
    # Models (local via Ollama)
    Settings.llm = Ollama(
        model=CONFIG["models"]["llm"],
        request_timeout=180,
    )
    Settings.embed_model = OllamaEmbedding(
        model_name=CONFIG["models"]["embedding"],
    )

    # Vector store
    client = chromadb.PersistentClient(path=CONFIG["storage"]["chroma_path"])
    collection = client.get_or_create_collection("logseq_rag")
    vector_store = ChromaVectorStore(chroma_collection=collection)

    # Index from existing Chroma collection
    index = VectorStoreIndex.from_vector_store(vector_store)

    # Let LlamaIndex create the retriever internally; pass our knobs only.
    # MMR is requested via the vector-store query mode (`use_mmr` is not a
    # recognized as_query_engine argument); note that not every vector store
    # backend supports MMR.
    query_engine = index.as_query_engine(
        similarity_top_k=CONFIG["retrieval"]["top_k"],
        vector_store_query_mode="mmr" if CONFIG["retrieval"]["mmr"] else "default",
    )
    return query_engine


def main():
    print("Loading query engine...")
    qe = build_query_engine()
    print("Ready. Type your question (or :q to quit).")
    while True:
        try:
            q = input("> ").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            break
        if q == ":q":
            break

        resp = qe.query(q)

        print("\n--- Answer ---")
        print(resp.response)

        print("\n--- Top refs ---")
        for s in resp.source_nodes[:5]:
            meta = s.node.metadata or {}
            title = meta.get("title", "(untitled)")
            d = meta.get("dir")
            src = meta.get("source")
            tags_csv = meta.get("tags")  # CSV string or None
            if tags_csv:
                print(f"{title} [{d}] tags: {tags_csv} -> {src}")
            else:
                print(f"{title} [{d}] -> {src}")
        print()


if __name__ == "__main__":
    main()
```
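
For scripted use, the same engine can answer a single question without the REPL. A minimal sketch reusing `build_query_engine` from `chat.py`; the `ask.py` filename is hypothetical:

```python
# ask.py (hypothetical helper): one-shot query instead of the REPL.
# Run as e.g. `uv run ask.py "What did I write about RAG?"`.
import sys

from chat import build_query_engine


def ask(question: str) -> None:
    qe = build_query_engine()
    resp = qe.query(question)
    print(resp.response)
    # Cited source files, deduplicated, in retrieval order
    seen = set()
    for s in resp.source_nodes:
        src = (s.node.metadata or {}).get("source")
        if src and src not in seen:
            seen.add(src)
            print(f"  - {src}")


if __name__ == "__main__":
    ask(" ".join(sys.argv[1:]))
```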

config.yaml.sample

Lines changed: 33 additions & 0 deletions

```yaml
# Update this to the absolute path of your Logseq graph directory
# (the folder that contains assets/, journals/, pages/)
logseq_root: "/Users/jdoe/logseq"

include_dirs:
  - "journals"
  - "pages"

exclude_globs:
  - "**/.git/**"
  - "**/.DS_Store"
  - "**/assets/**"  # Skip heavy assets by default

file_exts: [".md"]

chunk:
  by_headers: true
  chunk_size: 1200
  chunk_overlap: 200

retrieval:
  top_k: 6
  mmr: true

# On smaller machines use all-minilm; it's relatively lightweight
models:
  llm: "llama3.1"
  embedding: "all-minilm"

storage:
  chroma_path: ".rag/chroma"
  index_store: ".rag/index_store"
  docstore: ".rag/docstore"
```
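
Both scripts read this file at import time and only fail once they reach a missing key. A small pre-flight check can fail fast instead (a sketch; the `check_config.py` name and the required-key list are illustrative):

```python
# check_config.py (hypothetical): fail fast on a missing or incomplete config.
import os
import sys

import yaml

REQUIRED_KEYS = ["logseq_root", "include_dirs", "file_exts",
                 "exclude_globs", "chunk", "retrieval", "models", "storage"]


def check(path: str = "config.yaml") -> None:
    if not os.path.exists(path):
        sys.exit(f"{path} not found; copy config.yaml.sample to {path} first.")
    with open(path, "r") as f:
        cfg = yaml.safe_load(f) or {}
    missing = [k for k in REQUIRED_KEYS if k not in cfg]
    if missing:
        sys.exit(f"{path} is missing keys: {', '.join(missing)}")
    if not os.path.isdir(cfg["logseq_root"]):
        sys.exit(f"logseq_root does not exist: {cfg['logseq_root']}")
    print(f"{path} looks OK.")


if __name__ == "__main__":
    check()
```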

ingest.py

Lines changed: 111 additions & 0 deletions

```python
import glob
import os
import pathlib
import re
from typing import List

import chromadb
import yaml
from llama_index.core import VectorStoreIndex, StorageContext, Document, Settings
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.vector_stores.chroma import ChromaVectorStore

with open("config.yaml", "r") as f:
    CONFIG = yaml.safe_load(f)

PAGE_LINK = re.compile(r"\[\[([^\]]+)\]\]")              # [[Page]]
BLOCK_REF = re.compile(r"\(\(([a-zA-Z0-9_-]{6,})\)\)")   # ((block-id))
TAG_HASH = re.compile(r"(?<!\w)#([A-Za-z0-9/_-]+)")      # #tag
TAG_PROP = re.compile(r"^tags::\s*(.+)$", re.MULTILINE)  # tags:: a, b


def normalize_logseq_links(text: str) -> str:
    text = PAGE_LINK.sub(lambda m: m.group(1), text)
    text = BLOCK_REF.sub(lambda m: f"[ref:{m.group(1)}]", text)
    return text


def parse_tags(text: str) -> List[str]:
    tags = set()
    for m in TAG_HASH.finditer(text):
        tags.add(m.group(1))
    for m in TAG_PROP.finditer(text):
        raw = [t.strip(" ,#") for t in m.group(1).split(",")]
        for t in raw:
            if t:
                tags.add(t)
    return sorted(tags)


def page_title_from_path(path: str) -> str:
    name = pathlib.Path(path).stem
    return name.replace("_", "-")


def collect_files(root: str, include_dirs: List[str], file_exts: List[str], exclude_globs: List[str]) -> List[str]:
    files = []
    for rel in include_dirs:
        base = os.path.join(root, rel)
        for ext in file_exts:
            files.extend(glob.glob(os.path.join(base, f"**/*{ext}"), recursive=True))
    excluded = set()
    for pat in exclude_globs:
        excluded.update(glob.glob(os.path.join(root, pat), recursive=True))
    return [f for f in files if f not in excluded and os.path.isfile(f)]


def load_documents(paths: List[str]) -> List[Document]:
    docs = []
    for p in paths:
        try:
            with open(p, "r", encoding="utf-8") as fh:
                txt = fh.read()
        except (OSError, UnicodeDecodeError):
            continue

        clean = normalize_logseq_links(txt)

        # Compute tags from the raw text so tags_csv is in scope below
        tags_list = parse_tags(txt)
        tags_csv = ", ".join(tags_list) if tags_list else None

        title = page_title_from_path(p)
        meta = {
            "source": p,
            "title": title,
            "tags": tags_csv,  # scalar (str/None), not a list
            "basename": os.path.basename(p),
            "dir": os.path.basename(os.path.dirname(p)),
        }
        docs.append(Document(text=clean, metadata=meta))
    return docs


def main():
    root = CONFIG["logseq_root"]
    include_dirs = CONFIG["include_dirs"]
    file_exts = CONFIG["file_exts"]
    exclude = CONFIG["exclude_globs"]

    if not os.path.isdir(root):
        raise SystemExit(f"Logseq root does not exist: {root}\nEdit config.yaml to set logseq_root.")

    paths = collect_files(root, include_dirs, file_exts, exclude)
    print(f"Found {len(paths)} markdown files.")

    docs = load_documents(paths)
    print(f"Loaded {len(docs)} documents.")

    Settings.llm = Ollama(model=CONFIG["models"]["llm"], request_timeout=180)
    Settings.embed_model = OllamaEmbedding(model_name=CONFIG["models"]["embedding"])

    # NB: chunk.by_headers from config.yaml is not consumed here;
    # SimpleNodeParser splits by chunk_size/overlap, not markdown headers.
    parser = SimpleNodeParser.from_defaults(
        include_metadata=True,
        chunk_size=CONFIG["chunk"]["chunk_size"],
        chunk_overlap=CONFIG["chunk"]["chunk_overlap"],
    )
    nodes = parser.get_nodes_from_documents(docs)
    print(f"Parsed into {len(nodes)} nodes.")

    chroma_path = CONFIG["storage"]["chroma_path"]
    os.makedirs(chroma_path, exist_ok=True)
    client = chromadb.PersistentClient(path=chroma_path)
    collection = client.get_or_create_collection("logseq_rag")

    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_ctx = StorageContext.from_defaults(vector_store=vector_store)

    _ = VectorStoreIndex(nodes, storage_context=storage_ctx)
    print("Index built and persisted to Chroma.")


if __name__ == "__main__":
    main()
```
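
To see what the link normalization and tag extraction above actually produce, here is a quick illustration (the sample block is made up; note `tags::` lines only match at the start of a line):

```python
# Illustration only: behavior of normalize_logseq_links and parse_tags
# on a typical Logseq block.
from ingest import normalize_logseq_links, parse_tags

sample = """tags:: architecture, org-design
- Met with platform team about [[Team Topologies]] #work/meetings
- follow-up in ((67a1b2c3-d4e5)) tomorrow #todo
"""

print(normalize_logseq_links(sample))
# "[[Team Topologies]]" becomes "Team Topologies";
# "((67a1b2c3-d4e5))" becomes "[ref:67a1b2c3-d4e5]"

print(parse_tags(sample))
# ['architecture', 'org-design', 'todo', 'work/meetings']
```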

pyproject.toml

Lines changed: 14 additions & 0 deletions

```toml
[project]
name = "logseq-chat"
version = "0.1.0"
description = "Local RAG over Logseq with LlamaIndex + Ollama + Chroma"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "chromadb>=1.3.2",
    "llama-index>=0.14.7",
    "llama-index-embeddings-ollama>=0.8.3",
    "llama-index-llms-ollama>=0.9.0",
    "llama-index-vector-stores-chroma>=0.5.3",
    "pyyaml>=6.0.3",
]
```
