Skip to content

Commit 2daf009

Browse files
committed
feat(rag): incremental indexing, lower threshold, query logging
- Replace full TRUNCATE+INSERT with incremental content-hash (SHA-256) sync
- Add content_hash column to knowledge_chunks (migration 005)
- Generate embeddings only for changed chunks, saving API costs
- Lower RAG_SIMILARITY_THRESHOLD from 0.3 to 0.1 in config.py
- Log user query text in [RAG] Found log line for debugging
- Fix latent bug: deduplicate markdown headers with ordinal suffix
- Update schema.sql to reflect content_hash column
- Update README.md with RAG and incremental indexing description
- Add tests: get_existing_hashes, sync_chunks, compute_content_hash, chunk_markdown dedup
1 parent b8fe93e commit 2daf009

File tree

7 files changed

+329
-65
lines changed

7 files changed

+329
-65
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -160,12 +160,12 @@ The dashboard runs on the same port as the bot (`$PORT`, default `8080`). If `DA
160160

161161
## Knowledge Base (RAG)
162162

163-
When you chat with the bot, it uses **Retrieval-Augmented Generation** to answer questions about its own features, settings, and code. The bot's codebase is indexed into a vector database (Supabase pgvector) and relevant documentation is automatically retrieved for each question.
163+
The bot can answer questions about its own functionality, settings, and source code. It uses **Retrieval-Augmented Generation** — the codebase is indexed into a vector database (Supabase pgvector) and relevant fragments are automatically retrieved for each question.
164164

165165
**How it works:**
166-
1. On each deploy, `scripts/index_knowledge.py` parses all source files using Python AST and indexes them as embeddings in Supabase.
167-
2. When you ask a question, the bot finds the most relevant code/documentation chunks via vector similarity search.
168-
3. Retrieved chunks are injected into the AI prompt as context.
166+
1. On each deploy, `scripts/index_knowledge.py` parses source files (Python AST, Markdown headers, SQL) and stores embeddings in Supabase.
167+
2. Indexing is **incremental**: content is hashed (SHA-256) and only changed chunks are re-embedded, saving API costs.
168+
3. When you ask a question, the bot finds the most relevant chunks via vector similarity search and injects them into the AI prompt.
169169

170170
**Manual re-indexing:**
171171

config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def style_display_name(style: str) -> str:
161161
# ====== RAG ======
162162
RAG_EMBEDDING_MODEL = "openai/text-embedding-3-small" # 1536 dims
163163
RAG_TOP_K = 5 # Макс. кол-во чанков в контексте (реально может быть 0..5)
164-
RAG_SIMILARITY_THRESHOLD = 0.3 # Мин. cosine similarity (0..1); чанки ниже порога отбрасываются
164+
RAG_SIMILARITY_THRESHOLD = 0.1 # Мин. cosine similarity (0..1); чанки ниже порога отбрасываются
165165
INDEX_BATCH_SIZE = 100 # Размер батча для embedding-запросов и INSERT в Supabase
166166

167167
# ====== ЧАСОВОЙ ПОЯС ======

database/knowledge.py

Lines changed: 81 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,91 @@ async def match_knowledge_chunks(
3232
return result.data if result.data else []
3333

3434

35-
async def replace_all_chunks(rows: list[dict]) -> None:
36-
"""Полностью заменяет содержимое knowledge_chunks (TRUNCATE + INSERT).
35+
async def get_existing_hashes() -> dict[tuple[str, str | None], str]:
    """Fetch the content hashes of chunks currently stored in the DB.

    Returns:
        Mapping {(source, section): content_hash}.
    """
    # Single SELECT over the whole table; only the three key columns are pulled.
    result = await run_supabase(
        lambda: supabase.table("knowledge_chunks")
        .select("source, section, content_hash")
        .execute()
    )
    hashes: dict[tuple[str, str | None], str] = {}
    # result.data may be None/empty when the table has no rows yet.
    for row in result.data or []:
        # section is nullable — .get() keeps None for section-less chunks.
        hashes[(row["source"], row.get("section"))] = row["content_hash"]
    return hashes
48+
49+
50+
async def _delete_chunk(source: str, section: str | None) -> None:
    """Delete a single chunk row identified by (source, section).

    A NULL section must be matched with ``is_("section", "null")`` —
    ``eq`` does not match SQL NULL in PostgREST filters.
    """
    if section is None:
        await run_supabase(
            lambda: supabase.table("knowledge_chunks")
            .delete()
            .eq("source", source)
            .is_("section", "null")
            .execute()
        )
    else:
        await run_supabase(
            lambda: supabase.table("knowledge_chunks")
            .delete()
            .eq("source", source)
            .eq("section", section)
            .execute()
        )


async def sync_chunks(
    new_rows: list[dict],
    all_keys: set[tuple[str, str | None]],
) -> tuple[int, int, int]:
    """Incrementally sync chunks: INSERT new/changed rows, DELETE stale ones.

    Args:
        new_rows: New/changed chunk rows to INSERT, each with fields
            (source, section, content, content_hash, embedding).
        all_keys: Set of (source, section) keys for ALL chunks in the
            current codebase (used to detect rows that must be deleted).

    Returns:
        Tuple (added, deleted, unchanged).
    """
    # 1. Load the keys that already exist in the DB
    existing_hashes = await get_existing_hashes()
    existing_keys = set(existing_hashes.keys())

    # 2. Stale chunks: present in the DB but gone from the current codebase
    stale_keys = existing_keys - all_keys
    deleted = len(stale_keys)

    # 3. Delete stale rows one by one (source + section pair)
    for source, section in stale_keys:
        await _delete_chunk(source, section)

    # 4. Delete rows about to be replaced (changed chunks), so the INSERT
    #    below does not create duplicate (source, section) pairs
    for row in new_rows:
        await _delete_chunk(row["source"], row["section"])

    # 5. INSERT new/changed rows in batches
    added = len(new_rows)
    for i in range(0, len(new_rows), INDEX_BATCH_SIZE):
        batch = new_rows[i:i + INDEX_BATCH_SIZE]
        await run_supabase(
            lambda b=batch: supabase.table("knowledge_chunks").insert(b).execute()
        )

    # Unchanged = everything in the current codebase that was not re-inserted
    unchanged = len(all_keys) - added
    return added, deleted, unchanged

logic/rag.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,12 @@ async def retrieve_context(question: str) -> str:
5151

5252
context = "\n\n".join(formatted_parts)
5353

54+
# Логируем query для отладки: позволяет сопоставить вопрос с найденными чанками
5455
if DEBUG_PRINT:
5556
print(
5657
f"{get_timestamp()} [RAG] Found {len(chunks)} chunk(s), "
57-
f"best similarity: {chunks[0].get('similarity', 0):.2f}"
58+
f"best similarity: {chunks[0].get('similarity', 0):.2f} "
59+
f"| query: {question!r}"
5860
)
5961

6062
return context

schema.sql

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,13 @@ alter table public.users enable row level security;
3030
create extension if not exists vector;
3131

3232
create table if not exists public.knowledge_chunks (
    id bigint generated always as identity primary key,
    source text not null,             -- Source file (README.md, config.py, ...)
    section text,                     -- Function/class/section (nullable)
    content text not null,            -- Chunk text
    content_hash text not null,       -- SHA-256 hash of content (incremental sync)
    embedding vector(1536) not null,  -- Embedding (text-embedding-3-small = 1536 dims)
    created_at timestamptz default now()
);
4041

4142
create index if not exists idx_knowledge_embedding

scripts/index_knowledge.py

Lines changed: 73 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,16 @@
55
по естественным границам (функции, классы, секции) и загружает embeddings
66
в таблицу knowledge_chunks.
77
8-
Идемпотентный: при каждом запуске полностью пересоздаёт базу знаний
9-
(TRUNCATE + INSERT).
8+
Инкрементальный: сравнивает SHA-256 хэши контента с БД и пересчитывает
9+
embeddings только для изменённых чанков.
1010
1111
Запуск:
1212
python scripts/index_knowledge.py
1313
"""
1414

1515
import ast
1616
import asyncio
17+
import hashlib
1718
import os
1819
import re
1920
import sys
@@ -26,7 +27,7 @@
2627

2728
from clients.x402gate.openrouter_embeddings import get_embeddings # noqa: E402 — import after sys.path
2829
from config import INDEX_BATCH_SIZE # noqa: E402 — import after sys.path
29-
from database.knowledge import replace_all_chunks # noqa: E402 — import after sys.path
30+
from database.knowledge import get_existing_hashes, sync_chunks # noqa: E402 — import after sys.path
3031
from utils.utils import get_timestamp # noqa: E402 — import after sys.path
3132

3233
# Директории и файлы для индексации (относительно PROJECT_ROOT)
@@ -184,6 +185,8 @@ def chunk_markdown(filepath: str) -> list[dict]:
184185
chunks: list[dict] = []
185186
current_section = None
186187
current_lines: list[str] = []
188+
# Счётчик дубликатов заголовков: гарантирует уникальность ключа (source, section)
189+
section_counts: dict[str, int] = {}
187190

188191
for line in content.splitlines():
189192
# Проверяем на заголовок H1/H2/H3
@@ -195,7 +198,12 @@ def chunk_markdown(filepath: str) -> list[dict]:
195198
if text:
196199
chunks.append({"source": rel_path, "section": current_section, "content": text})
197200

198-
current_section = header_match.group(2).strip()
201+
section_name = header_match.group(2).strip()
202+
section_counts[section_name] = section_counts.get(section_name, 0) + 1
203+
if section_counts[section_name] > 1:
204+
current_section = f"{section_name} ({section_counts[section_name]})"
205+
else:
206+
current_section = section_name
199207
current_lines = [line]
200208
else:
201209
current_lines.append(line)
@@ -243,18 +251,25 @@ def chunk_file(filepath: str) -> list[dict]:
243251
return []
244252

245253

254+
def compute_content_hash(content: str) -> str:
    """Return the SHA-256 hex digest of a chunk's content (UTF-8 encoded)."""
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8"))
    return hasher.hexdigest()
257+
258+
246259
async def main() -> None:
247-
"""Главная функция индексации."""
260+
"""Главная функция индексации (инкрементальная)."""
248261
start_time = time.time()
249262

250263
# 1. Собираем файлы
251264
files = collect_files()
252265
print(f"{get_timestamp()} [INDEX] Found {len(files)} files to process")
253266

254-
# 2. Нарезаем на чанки
267+
# 2. Нарезаем на чанки и считаем хэши
255268
all_chunks: list[dict] = []
256269
for filepath in files:
257270
chunks = chunk_file(filepath)
271+
for chunk in chunks:
272+
chunk["content_hash"] = compute_content_hash(chunk["content"])
258273
all_chunks.extend(chunks)
259274

260275
print(f"{get_timestamp()} [INDEX] Created {len(all_chunks)} chunks")
@@ -263,36 +278,61 @@ async def main() -> None:
263278
print(f"{get_timestamp()} [INDEX] No chunks to index, exiting")
264279
return
265280

266-
# 3. Генерируем embeddings батчами
267-
texts = [c["content"] for c in all_chunks]
268-
all_embeddings: list[list[float]] = []
269-
270-
batches = [texts[i:i + INDEX_BATCH_SIZE] for i in range(0, len(texts), INDEX_BATCH_SIZE)]
271-
print(f"{get_timestamp()} [INDEX] Generating embeddings ({len(batches)} batch(es))...")
272-
273-
for i, batch in enumerate(batches):
274-
embeddings = await get_embeddings(batch)
275-
all_embeddings.extend(embeddings)
276-
if len(batches) > 1:
277-
print(f"{get_timestamp()} [INDEX] Batch {i + 1}/{len(batches)}: {len(batch)} embeddings")
278-
279-
# 4. Очищаем таблицу и загружаем новые данные
280-
print(f"{get_timestamp()} [INDEX] Uploading to Supabase...")
281-
282-
rows = [
283-
{
284-
"source": chunk["source"],
285-
"section": chunk["section"],
286-
"content": chunk["content"],
287-
"embedding": embedding,
288-
}
289-
for chunk, embedding in zip(all_chunks, all_embeddings)
290-
]
281+
# 3. Загружаем существующие хэши из БД и вычисляем дельту
282+
existing_hashes = await get_existing_hashes()
283+
changed_chunks: list[dict] = []
284+
285+
for chunk in all_chunks:
286+
key = (chunk["source"], chunk["section"])
287+
old_hash = existing_hashes.get(key)
288+
if old_hash != chunk["content_hash"]:
289+
changed_chunks.append(chunk)
290+
291+
all_keys = {(c["source"], c["section"]) for c in all_chunks}
292+
stale_count = len(set(existing_hashes.keys()) - all_keys)
293+
294+
print(
295+
f"{get_timestamp()} [INDEX] Delta: "
296+
f"{len(changed_chunks)} changed, "
297+
f"{len(all_chunks) - len(changed_chunks)} unchanged, "
298+
f"{stale_count} stale"
299+
)
300+
301+
# 4. Генерируем embeddings ТОЛЬКО для изменённых чанков
302+
if changed_chunks:
303+
texts = [c["content"] for c in changed_chunks]
304+
all_embeddings: list[list[float]] = []
305+
306+
batches = [texts[i:i + INDEX_BATCH_SIZE] for i in range(0, len(texts), INDEX_BATCH_SIZE)]
307+
print(f"{get_timestamp()} [INDEX] Generating embeddings ({len(batches)} batch(es))...")
308+
309+
for i, batch in enumerate(batches):
310+
embeddings = await get_embeddings(batch)
311+
all_embeddings.extend(embeddings)
312+
if len(batches) > 1:
313+
print(f"{get_timestamp()} [INDEX] Batch {i + 1}/{len(batches)}: {len(batch)} embeddings")
314+
315+
new_rows = [
316+
{
317+
"source": chunk["source"],
318+
"section": chunk["section"],
319+
"content": chunk["content"],
320+
"content_hash": chunk["content_hash"],
321+
"embedding": embedding,
322+
}
323+
for chunk, embedding in zip(changed_chunks, all_embeddings)
324+
]
325+
else:
326+
new_rows = []
291327

292-
await replace_all_chunks(rows)
328+
# 5. Синхронизируем с БД (INSERT изменённых, DELETE устаревших)
329+
added, deleted, unchanged = await sync_chunks(new_rows, all_keys)
293330

294331
duration = time.time() - start_time
295-
print(f"{get_timestamp()} [INDEX] ✅ Done: {len(all_chunks)} chunks indexed in {duration:.1f}s")
332+
print(
333+
f"{get_timestamp()} [INDEX] ✅ Done in {duration:.1f}s: "
334+
f"{added} added, {deleted} deleted, {unchanged} unchanged"
335+
)
296336

297337

298338
if __name__ == "__main__":

0 commit comments

Comments
 (0)