SAAB/ingest.py at main · txavier/SAAB · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""
Ingest SAAB markdown docs into a local ChromaDB vector store.
Uses sentence-transformers for embeddings — fully offline, no API keys.
"""

import os
import glob
import chromadb
from sentence_transformers import SentenceTransformer

# Directory containing this script; the markdown documents are read from here.
DOCS_DIR = os.path.dirname(os.path.abspath(__file__))
# On-disk location handed to chromadb.PersistentClient for the vector store.
DB_DIR = os.path.join(DOCS_DIR, ".vectordb")
# Name of the ChromaDB collection that is dropped and rebuilt on every ingest.
COLLECTION = "saab_knowledge"
# Flush threshold in characters (each line counts len(line) + 1 for its
# newline). Chunks are cut on line boundaries once this is reached, so a chunk
# can be longer than this value.
CHUNK_SIZE = 800
# Minimum number of characters of trailing lines repeated at the start of the
# next chunk so neighbouring chunks share context.
CHUNK_OVERLAP = 100


def load_markdown_files(directory: str) -> list[dict]:
    """Load the markdown documents found directly inside *directory*.

    Only entries matching ``*.md`` at the top level are considered (no
    recursion). ``README.md`` and ``Template.md`` are skipped, compared
    case-insensitively. Files are decoded as UTF-8.

    Args:
        directory: Folder to scan for markdown files.

    Returns:
        One ``{"source": <file name>, "text": <file contents>}`` dict per
        document, ordered by path so repeated runs yield the same sequence.
    """
    skipped_names = ("readme.md", "template.md")
    docs = []
    # glob returns matches in arbitrary, filesystem-dependent order. Sorting
    # keeps the document order (and anything derived from it, such as
    # positional chunk ids) reproducible between runs.
    for path in sorted(glob.glob(os.path.join(directory, "*.md"))):
        basename = os.path.basename(path)
        if basename.lower() in skipped_names:
            continue
        if not os.path.isfile(path):
            # A directory whose name ends in ".md" matches the pattern but
            # cannot be opened as a document.
            continue
        with open(path, "r", encoding="utf-8") as f:
            docs.append({"source": basename, "text": f.read()})
    return docs


def chunk_text(
    text: str,
    source: str,
    chunk_size: "int | None" = None,
    chunk_overlap: "int | None" = None,
) -> list[dict]:
    """Split *text* into overlapping, line-aligned chunks.

    Lines are accumulated until their combined length (each line counted as
    ``len(line) + 1`` for its newline) reaches ``chunk_size``; the buffer is
    then emitted as one chunk. Chunks are only cut on line boundaries, so a
    chunk may be longer than ``chunk_size``. After each cut, trailing lines
    totalling at least ``chunk_overlap`` characters are carried into the next
    chunk as shared context.

    Each chunk is tagged with the most recent markdown header seen up to the
    end of that chunk (the file name until the first header appears).
    NOTE: any line whose stripped form starts with ``#`` counts as a header,
    which includes ``#`` comments inside fenced code blocks.

    Args:
        text: Full document text.
        source: Document name; stored on every chunk and used as the initial
            section label.
        chunk_size: Flush threshold in characters. Defaults to the module
            constant ``CHUNK_SIZE``.
        chunk_overlap: Minimum carried-over characters. ``0`` disables the
            overlap. Defaults to the module constant ``CHUNK_OVERLAP``.

    Returns:
        A list of ``{"text", "source", "section"}`` dicts in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            negative.
    """
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
    if size <= 0:
        raise ValueError("chunk_size must be a positive number of characters")
    if overlap < 0:
        raise ValueError("chunk_overlap must not be negative")

    chunks = []
    current_header = source  # default context is the filename
    current_chunk = []
    current_len = 0
    # True once the buffer holds a non-blank line that no emitted chunk
    # contains yet; carried-over overlap lines alone do not count.
    has_fresh_content = False

    for line in text.split("\n"):
        # Track the most recent markdown header for context
        stripped = line.strip()
        if stripped.startswith("#"):
            current_header = stripped.lstrip("#").strip()

        current_chunk.append(line)
        current_len += len(line) + 1  # +1 for newline
        if stripped:
            has_fresh_content = True

        if current_len < size:
            continue

        chunks.append({
            "text": "\n".join(current_chunk),
            "source": source,
            "section": current_header,
        })

        # Seed the next chunk with the trailing lines of this one.
        carried = []
        carried_len = 0
        if overlap > 0:
            for prev_line in reversed(current_chunk):
                line_len = len(prev_line) + 1
                # Never carry a full chunk's worth of text: the next line
                # would immediately trigger another flush and re-emit the
                # same content over and over.
                if carried_len + line_len >= size:
                    break
                carried.append(prev_line)
                carried_len += line_len
                if carried_len >= overlap:
                    break
            carried.reverse()  # collected newest-first; restore file order
        current_chunk = carried
        current_len = carried_len
        has_fresh_content = False

    # Emit the remainder only if it adds something new; a buffer made up of
    # overlap (and blank lines) would just duplicate the previous chunk.
    if has_fresh_content:
        chunks.append({
            "text": "\n".join(current_chunk),
            "source": source,
            "section": current_header,
        })

    return chunks


def main():
    """Rebuild the vector store from the markdown files next to this script.

    Steps: load the documents from ``DOCS_DIR``, split them with
    ``chunk_text``, embed every chunk with the ``all-MiniLM-L6-v2``
    sentence-transformers model, then drop and recreate the ``COLLECTION``
    collection (cosine space) in the persistent ChromaDB store at ``DB_DIR``
    and add the chunks under ids ``chunk_<index>``.

    If no chunks are produced the function returns before loading the model
    or touching the store, so an existing collection is not wiped.
    """
    print("Loading SAAB documents...")
    docs = load_markdown_files(DOCS_DIR)
    print(f"  Found {len(docs)} document(s): {[d['source'] for d in docs]}")

    print("Chunking documents...")
    all_chunks = []
    for doc in docs:
        all_chunks.extend(chunk_text(doc["text"], doc["source"]))
    print(f"  Created {len(all_chunks)} chunks")

    if not all_chunks:
        # Without this guard the existing collection would be deleted and the
        # add() call below would then be handed empty lists.
        print("Nothing to ingest; leaving any existing vector store untouched.")
        return

    print("Loading embedding model (first run downloads ~90MB)...")
    model = SentenceTransformer("all-MiniLM-L6-v2")

    print("Embedding chunks...")
    texts = [c["text"] for c in all_chunks]
    embeddings = model.encode(texts, show_progress_bar=True).tolist()

    print("Storing in ChromaDB...")
    client = chromadb.PersistentClient(path=DB_DIR)
    # Delete existing collection if re-ingesting
    try:
        client.delete_collection(COLLECTION)
    except Exception:
        # Best effort: on a first run there is nothing to delete.
        # NOTE(review): the exception type raised for a missing collection
        # appears to differ between chromadb releases -- confirm before
        # narrowing this. A genuine delete failure should resurface when the
        # collection is created below.
        pass
    collection = client.create_collection(
        name=COLLECTION,
        metadata={"hnsw:space": "cosine"},
    )

    # Ids are positional, so they follow the order of all_chunks.
    ids = [f"chunk_{i}" for i in range(len(all_chunks))]
    metadatas = [{"source": c["source"], "section": c["section"]} for c in all_chunks]

    collection.add(
        ids=ids,
        embeddings=embeddings,
        documents=texts,
        metadatas=metadatas,
    )

    print(f"\nDone! {len(all_chunks)} chunks stored in {DB_DIR}")
    print("You can now run: python chat.py")


# Run the ingestion pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()