Skip to content

Commit 2daf009

Browse files
committed
feat(rag): incremental indexing, lower threshold, query logging
- Replace full TRUNCATE+INSERT with incremental content-hash (SHA-256) sync
- Add content_hash column to knowledge_chunks (migration 005)
- Generate embeddings only for changed chunks, saving API costs
- Lower RAG_SIMILARITY_THRESHOLD from 0.3 to 0.1 in config.py
- Log user query text in [RAG] Found log line for debugging
- Fix latent bug: deduplicate markdown headers with ordinal suffix
- Update schema.sql to reflect content_hash column
- Update README.md with RAG and incremental indexing description
- Add tests: get_existing_hashes, sync_chunks, compute_content_hash, chunk_markdown dedup
1 parent b8fe93e commit 2daf009

File tree

7 files changed

+329
-65
lines changed

7 files changed

+329
-65
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -160,12 +160,12 @@ The dashboard runs on the same port as the bot (`$PORT`, default `8080`). If `DA
160160

161161
## Knowledge Base (RAG)
162162

163-
When you chat with the bot, it uses **Retrieval-Augmented Generation** to answer questions about its own features, settings, and code. The bot's codebase is indexed into a vector database (Supabase pgvector) and relevant documentation is automatically retrieved for each question.
163+
The bot can answer questions about its own functionality, settings, and source code. It uses **Retrieval-Augmented Generation** — the codebase is indexed into a vector database (Supabase pgvector) and relevant fragments are automatically retrieved for each question.
164164

165165
**How it works:**
166-
1. On each deploy, `scripts/index_knowledge.py` parses all source files using Python AST and indexes them as embeddings in Supabase.
167-
2. When you ask a question, the bot finds the most relevant code/documentation chunks via vector similarity search.
168-
3. Retrieved chunks are injected into the AI prompt as context.
166+
1. On each deploy, `scripts/index_knowledge.py` parses source files (Python AST, Markdown headers, SQL) and stores embeddings in Supabase.
167+
2. Indexing is **incremental**: content is hashed (SHA-256) and only changed chunks are re-embedded, saving API costs.
168+
3. When you ask a question, the bot finds the most relevant chunks via vector similarity search and injects them into the AI prompt.
169169

170170
**Manual re-indexing:**
171171

config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def style_display_name(style: str) -> str:
161161
# ====== RAG ======
162162
RAG_EMBEDDING_MODEL = "openai/text-embedding-3-small" # 1536 dims
163163
RAG_TOP_K = 5 # Макс. кол-во чанков в контексте (реально может быть 0..5)
164-
RAG_SIMILARITY_THRESHOLD = 0.3 # Мин. cosine similarity (0..1); чанки ниже порога отбрасываются
164+
RAG_SIMILARITY_THRESHOLD = 0.1 # Мин. cosine similarity (0..1); чанки ниже порога отбрасываются
165165
INDEX_BATCH_SIZE = 100 # Размер батча для embedding-запросов и INSERT в Supabase
166166

167167
# ====== ЧАСОВОЙ ПОЯС ======

database/knowledge.py

Lines changed: 81 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,91 @@ async def match_knowledge_chunks(
3232
return result.data if result.data else []
3333

3434

35-
async def replace_all_chunks(rows: list[dict]) -> None:
36-
"""Полностью заменяет содержимое knowledge_chunks (TRUNCATE + INSERT).
35+
async def get_existing_hashes() -> dict[tuple[str, str | None], str]:
    """Fetch the content hashes of chunks currently stored in the DB.

    Returns:
        Mapping {(source, section): content_hash}.
    """
    # Single SELECT over the whole table; only the three key columns are pulled.
    result = await run_supabase(
        lambda: supabase.table("knowledge_chunks")
        .select("source, section, content_hash")
        .execute()
    )
    hashes: dict[tuple[str, str | None], str] = {}
    # result.data may be None/empty when the table has no rows yet.
    for row in result.data or []:
        # section is nullable — .get() keeps None for section-less chunks.
        hashes[(row["source"], row.get("section"))] = row["content_hash"]
    return hashes
48+
49+
50+
async def _delete_chunk(source: str, section: str | None) -> None:
    """Delete a single chunk row identified by (source, section).

    A NULL section must be matched with ``is_("section", "null")`` —
    ``eq`` does not match SQL NULL in PostgREST filters.
    """
    if section is None:
        await run_supabase(
            lambda: supabase.table("knowledge_chunks")
            .delete()
            .eq("source", source)
            .is_("section", "null")
            .execute()
        )
    else:
        await run_supabase(
            lambda: supabase.table("knowledge_chunks")
            .delete()
            .eq("source", source)
            .eq("section", section)
            .execute()
        )


async def sync_chunks(
    new_rows: list[dict],
    all_keys: set[tuple[str, str | None]],
) -> tuple[int, int, int]:
    """Incrementally sync chunks: INSERT new/changed rows, DELETE stale ones.

    Args:
        new_rows: New/changed chunk rows to INSERT, each with fields
            (source, section, content, content_hash, embedding).
        all_keys: Set of (source, section) keys for ALL chunks in the
            current codebase (used to detect rows that must be deleted).

    Returns:
        Tuple (added, deleted, unchanged).
    """
    # 1. Load the keys that already exist in the DB
    existing_hashes = await get_existing_hashes()
    existing_keys = set(existing_hashes.keys())

    # 2. Stale chunks: present in the DB but gone from the current codebase
    stale_keys = existing_keys - all_keys
    deleted = len(stale_keys)

    # 3. Delete stale rows one by one (source + section pair)
    for source, section in stale_keys:
        await _delete_chunk(source, section)

    # 4. Delete rows about to be replaced (changed chunks), so the INSERT
    #    below does not create duplicate (source, section) pairs
    for row in new_rows:
        await _delete_chunk(row["source"], row["section"])

    # 5. INSERT new/changed rows in batches
    added = len(new_rows)
    for i in range(0, len(new_rows), INDEX_BATCH_SIZE):
        batch = new_rows[i:i + INDEX_BATCH_SIZE]
        await run_supabase(
            lambda b=batch: supabase.table("knowledge_chunks").insert(b).execute()
        )

    # Unchanged = everything in the current codebase that was not re-inserted
    unchanged = len(all_keys) - added
    return added, deleted, unchanged

logic/rag.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,12 @@ async def retrieve_context(question: str) -> str:
5151

5252
context = "\n\n".join(formatted_parts)
5353

54+
# Логируем query для отладки: позволяет сопоставить вопрос с найденными чанками
5455
if DEBUG_PRINT:
5556
print(
5657
f"{get_timestamp()} [RAG] Found {len(chunks)} chunk(s), "
57-
f"best similarity: {chunks[0].get('similarity', 0):.2f}"
58+
f"best similarity: {chunks[0].get('similarity', 0):.2f} "
59+
f"| query: {question!r}"
5860
)
5961

6062
return context

schema.sql

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,13 @@ alter table public.users enable row level security;
3030
create extension if not exists vector;
3131

3232
create table if not exists public.knowledge_chunks (
    id bigint generated always as identity primary key,
    source text not null,             -- Source file (README.md, config.py, ...)
    section text,                     -- Function/class/section (nullable)
    content text not null,            -- Chunk text
    content_hash text not null,       -- SHA-256 hash of content (incremental sync)
    embedding vector(1536) not null,  -- Embedding (text-embedding-3-small = 1536 dims)
    created_at timestamptz default now()
);
4041

4142
create index if not exists idx_knowledge_embedding

scripts/index_knowledge.py

Lines changed: 73 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,16 @@
55
по естественным границам (функции, классы, секции) и загружает embeddings
66
в таблицу knowledge_chunks.
77
8-
Идемпотентный: при каждом запуске полностью пересоздаёт базу знаний
9-
(TRUNCATE + INSERT).
8+
Инкрементальный: сравнивает SHA-256 хэши контента с БД и пересчитывает
9+
embeddings только для изменённых чанков.
1010
1111
Запуск:
1212
python scripts/index_knowledge.py
1313
"""
1414

1515
import ast
1616
import asyncio
17+
import hashlib
1718
import os
1819
import re
1920
import sys
@@ -26,7 +27,7 @@
2627

2728
from clients.x402gate.openrouter_embeddings import get_embeddings # noqa: E402 — import after sys.path
2829
from config import INDEX_BATCH_SIZE # noqa: E402 — import after sys.path
29-
from database.knowledge import replace_all_chunks # noqa: E402 — import after sys.path
30+
from database.knowledge import get_existing_hashes, sync_chunks # noqa: E402 — import after sys.path
3031
from utils.utils import get_timestamp # noqa: E402 — import after sys.path
3132

3233
# Директории и файлы для индексации (относительно PROJECT_ROOT)
@@ -184,6 +185,8 @@ def chunk_markdown(filepath: str) -> list[dict]:
184185
chunks: list[dict] = []
185186
current_section = None
186187
current_lines: list[str] = []
188+
# Счётчик дубликатов заголовков: гарантирует уникальность ключа (source, section)
189+
section_counts: dict[str, int] = {}
187190

188191
for line in content.splitlines():
189192
# Проверяем на заголовок H1/H2/H3
@@ -195,7 +198,12 @@ def chunk_markdown(filepath: str) -> list[dict]:
195198
if text:
196199
chunks.append({"source": rel_path, "section": current_section, "content": text})
197200

198-
current_section = header_match.group(2).strip()
201+
section_name = header_match.group(2).strip()
202+
section_counts[section_name] = section_counts.get(section_name, 0) + 1
203+
if section_counts[section_name] > 1:
204+
current_section = f"{section_name} ({section_counts[section_name]})"
205+
else:
206+
current_section = section_name
199207
current_lines = [line]
200208
else:
201209
current_lines.append(line)
@@ -243,18 +251,25 @@ def chunk_file(filepath: str) -> list[dict]:
243251
return []
244252

245253

254+
def compute_content_hash(content: str) -> str:
    """Return the SHA-256 hex digest of a chunk's content (UTF-8 encoded)."""
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8"))
    return hasher.hexdigest()
257+
258+
246259
async def main() -> None:
247-
"""Главная функция индексации."""
260+
"""Главная функция индексации (инкрементальная)."""
248261
start_time = time.time()
249262

250263
# 1. Собираем файлы
251264
files = collect_files()
252265
print(f"{get_timestamp()} [INDEX] Found {len(files)} files to process")
253266

254-
# 2. Нарезаем на чанки
267+
# 2. Нарезаем на чанки и считаем хэши
255268
all_chunks: list[dict] = []
256269
for filepath in files:
257270
chunks = chunk_file(filepath)
271+
for chunk in chunks:
272+
chunk["content_hash"] = compute_content_hash(chunk["content"])
258273
all_chunks.extend(chunks)
259274

260275
print(f"{get_timestamp()} [INDEX] Created {len(all_chunks)} chunks")
@@ -263,36 +278,61 @@ async def main() -> None:
263278
print(f"{get_timestamp()} [INDEX] No chunks to index, exiting")
264279
return
265280

266-
# 3. Генерируем embeddings батчами
267-
texts = [c["content"] for c in all_chunks]
268-
all_embeddings: list[list[float]] = []
269-
270-
batches = [texts[i:i + INDEX_BATCH_SIZE] for i in range(0, len(texts), INDEX_BATCH_SIZE)]
271-
print(f"{get_timestamp()} [INDEX] Generating embeddings ({len(batches)} batch(es))...")
272-
273-
for i, batch in enumerate(batches):
274-
embeddings = await get_embeddings(batch)
275-
all_embeddings.extend(embeddings)
276-
if len(batches) > 1:
277-
print(f"{get_timestamp()} [INDEX] Batch {i + 1}/{len(batches)}: {len(batch)} embeddings")
278-
279-
# 4. Очищаем таблицу и загружаем новые данные
280-
print(f"{get_timestamp()} [INDEX] Uploading to Supabase...")
281-
282-
rows = [
283-
{
284-
"source": chunk["source"],
285-
"section": chunk["section"],
286-
"content": chunk["content"],
287-
"embedding": embedding,
288-
}
289-
for chunk, embedding in zip(all_chunks, all_embeddings)
290-
]
281+
# 3. Загружаем существующие хэши из БД и вычисляем дельту
282+
existing_hashes = await get_existing_hashes()
283+
changed_chunks: list[dict] = []
284+
285+
for chunk in all_chunks:
286+
key = (chunk["source"], chunk["section"])
287+
old_hash = existing_hashes.get(key)
288+
if old_hash != chunk["content_hash"]:
289+
changed_chunks.append(chunk)
290+
291+
all_keys = {(c["source"], c["section"]) for c in all_chunks}
292+
stale_count = len(set(existing_hashes.keys()) - all_keys)
293+
294+
print(
295+
f"{get_timestamp()} [INDEX] Delta: "
296+
f"{len(changed_chunks)} changed, "
297+
f"{len(all_chunks) - len(changed_chunks)} unchanged, "
298+
f"{stale_count} stale"
299+
)
300+
301+
# 4. Генерируем embeddings ТОЛЬКО для изменённых чанков
302+
if changed_chunks:
303+
texts = [c["content"] for c in changed_chunks]
304+
all_embeddings: list[list[float]] = []
305+
306+
batches = [texts[i:i + INDEX_BATCH_SIZE] for i in range(0, len(texts), INDEX_BATCH_SIZE)]
307+
print(f"{get_timestamp()} [INDEX] Generating embeddings ({len(batches)} batch(es))...")
308+
309+
for i, batch in enumerate(batches):
310+
embeddings = await get_embeddings(batch)
311+
all_embeddings.extend(embeddings)
312+
if len(batches) > 1:
313+
print(f"{get_timestamp()} [INDEX] Batch {i + 1}/{len(batches)}: {len(batch)} embeddings")
314+
315+
new_rows = [
316+
{
317+
"source": chunk["source"],
318+
"section": chunk["section"],
319+
"content": chunk["content"],
320+
"content_hash": chunk["content_hash"],
321+
"embedding": embedding,
322+
}
323+
for chunk, embedding in zip(changed_chunks, all_embeddings)
324+
]
325+
else:
326+
new_rows = []
291327

292-
await replace_all_chunks(rows)
328+
# 5. Синхронизируем с БД (INSERT изменённых, DELETE устаревших)
329+
added, deleted, unchanged = await sync_chunks(new_rows, all_keys)
293330

294331
duration = time.time() - start_time
295-
print(f"{get_timestamp()} [INDEX] ✅ Done: {len(all_chunks)} chunks indexed in {duration:.1f}s")
332+
print(
333+
f"{get_timestamp()} [INDEX] ✅ Done in {duration:.1f}s: "
334+
f"{added} added, {deleted} deleted, {unchanged} unchanged"
335+
)
296336

297337

298338
if __name__ == "__main__":

0 commit comments

Comments
 (0)