1- import os , re , glob , pathlib , yaml
1+ """Utilities for turning a Logseq graph into a retriever-friendly index.
2+
3+ This script walks through each stage of a Retrieval Augmented Generation (RAG)
4+ workflow. The ``main`` function mirrors the typical steps:
5+
6+ 1. Collect source files from Logseq.
7+ 2. Clean and tag the raw markdown.
8+ 3. Split the content into small, retrievable "nodes".
9+ 4. Persist the resulting embeddings into a vector database (Chroma).
10+
11+ Running ``python ingest.py`` demonstrates how raw notes are transformed into
12+ something a chatbot can search. The helper functions below keep the individual
13+ tasks digestible and ready for experimentation.
14+ """
15+
16+ import glob
17+ import os
18+ import pathlib
19+ import re
220from typing import List
3- from llama_index .core import VectorStoreIndex , StorageContext , Document , Settings
21+
22+ import chromadb
23+ import yaml
24+ from llama_index .core import Document , Settings , StorageContext , VectorStoreIndex
425from llama_index .core .node_parser import SimpleNodeParser
526from llama_index .embeddings .ollama import OllamaEmbedding
627from llama_index .llms .ollama import Ollama
728from llama_index .vector_stores .chroma import ChromaVectorStore
8- import chromadb
929
10- CONFIG = yaml .safe_load (open ("config.yaml" , "r" ))
# Parse config.yaml exactly once at import time; every pipeline stage below
# reads its settings from this module-level mapping.
with open("config.yaml", "r", encoding="utf-8") as _config_file:
    CONFIG = yaml.safe_load(_config_file)
1132
# Pre-compiled patterns for the Logseq markup this module recognizes.
PAGE_LINK = re.compile(r"\[\[([^\]]+)\]\]")              # [[Page]] wiki links
BLOCK_REF = re.compile(r"\(\(([a-zA-Z0-9_-]{6,})\)\)")   # ((block-id)) references
TAG_HASH = re.compile(r"(?<!\w)#([A-Za-z0-9/_-]+)")      # inline #tag (not mid-word)
TAG_PROP = re.compile(r"^tags::\s*(.+)$", re.MULTILINE)  # "tags:: a, b" property lines
1637
def normalize_logseq_links(text: str) -> str:
    """Replace Logseq-specific link syntax with plain text.

    ``[[Page]]`` wiki links collapse to the bare page name, and
    ``((block-id))`` references become a readable ``[ref:block-id]`` marker,
    so the downstream embedding model sees natural language instead of
    wiki-style tokens.

    Args:
        text: Raw Logseq markdown.

    Returns:
        The text with link markup rewritten.
    """
    # Replacement templates are clearer (and cheaper) than lambdas here;
    # re.sub inserts group contents literally, so no re-escaping is needed.
    text = PAGE_LINK.sub(r"\1", text)
    text = BLOCK_REF.sub(r"[ref:\1]", text)
    return text
2149
2250def parse_tags (text : str ) -> List [str ]:
51+ """Extract Logseq tags from both inline ``#hashtags`` and ``tags::`` fields.
52+
53+ The returned list is sorted to keep results predictable when you display or
54+ filter by tags later in the workflow.
55+ """
56+
2357 tags = set ()
2458 for m in TAG_HASH .finditer (text ):
2559 tags .add (m .group (1 ))
@@ -31,10 +65,18 @@ def parse_tags(text: str) -> List[str]:
3165 return sorted (tags )
3266
def page_title_from_path(path: str) -> str:
    """Derive a human-friendly Logseq page title from a file path.

    Uses the file's stem (name without extension) and swaps underscores
    for hyphens to match Logseq's page-naming convention.
    """
    return pathlib.Path(path).stem.replace("_", "-")
3672
3773def collect_files (root : str , include_dirs : List [str ], file_exts : List [str ], exclude_globs : List [str ]) -> List [str ]:
74+ """Locate Logseq files to ingest based on the config settings.
75+
76+ This function connects the configuration knobs to the actual file system. It
77+ assembles the "raw corpus" that feeds the remaining steps of the pipeline.
78+ """
79+
3880 files = []
3981 for rel in include_dirs :
4082 base = os .path .join (root , rel )
@@ -46,10 +88,18 @@ def collect_files(root: str, include_dirs: List[str], file_exts: List[str], excl
4688 return [f for f in files if f not in excluded and os .path .isfile (f )]
4789
4890def load_documents (paths : List [str ]) -> List [Document ]:
91+ """Read markdown files and create ``Document`` objects with helpful metadata.
92+
93+ Each ``Document`` becomes a single unit of knowledge for LlamaIndex. We add
94+ metadata like the title, directory, and tags so the chat interface can show
95+ meaningful references instead of opaque file names.
96+ """
97+
4998 docs = []
5099 for p in paths :
51100 try :
52- txt = open (p , "r" , encoding = "utf-8" ).read ()
101+ with open (p , "r" , encoding = "utf-8" ) as f :
102+ txt = f .read ()
53103 except Exception :
54104 continue
55105
@@ -71,6 +121,12 @@ def load_documents(paths: List[str]) -> List[Document]:
71121 return docs
72122
73123def main ():
124+ """Run the full ingestion workflow using settings from ``config.yaml``.
125+
126+ Running this function end-to-end shows how data collection, cleaning,
127+ chunking, and indexing fit together in a practical RAG pipeline.
128+ """
129+
74130 root = CONFIG ["logseq_root" ]
75131 include_dirs = CONFIG ["include_dirs" ]
76132 file_exts = CONFIG ["file_exts" ]
0 commit comments