Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
b2b12da
Build summary index
LordDarkula Sep 5, 2025
69c0037
Replace answer call with run_llama_cpp
LordDarkula Sep 5, 2025
3138104
Add default values for model_path and pdf_dir params in build_summar…
LordDarkula Sep 5, 2025
93b20fc
Add main function to summarizer
LordDarkula Sep 6, 2025
0a90fff
Fix summarizer path to pdf
LordDarkula Sep 11, 2025
07170d7
Move text_cleaning function to separate utils module
LordDarkula Sep 11, 2025
0b2471d
Raise ValueError in DocumentChunker for unrecognized mode
LordDarkula Sep 11, 2025
537eed7
Remove llama debug prints from summary index
LordDarkula Sep 11, 2025
a2f5228
Remove Summary title lines from summary
LordDarkula Sep 11, 2025
2f91bde
Add generated summary index
LordDarkula Sep 11, 2025
b9c55ea
Merge branch 'main' into aubhro
LordDarkula Sep 19, 2025
52b7681
Remove nonexistent mode validation in DocumentChunker
LordDarkula Sep 19, 2025
2eedeff
Merge branch 'main' into aubhro
LordDarkula Sep 26, 2025
364abfe
Merge branch 'main' into aubhro
LordDarkula Sep 26, 2025
5cbf1d1
Merge branch 'main' into aubhro
LordDarkula Oct 3, 2025
a9162a9
Merge branch 'main' into aubhro
LordDarkula Oct 13, 2025
2d126c4
Update summary index building with new DocumentChunker API
LordDarkula Oct 13, 2025
6c328a0
Add summary indexes
LordDarkula Oct 17, 2025
c823df3
Merge branch 'main' into aubhro
LordDarkula Oct 17, 2025
1ebf8e6
Update import paths in summarizer to reflect new project structure
LordDarkula Oct 17, 2025
9b64b50
Add option in main to build index from summary
LordDarkula Oct 17, 2025
21831cd
Update load_artifacts param to take prefix directly
LordDarkula Oct 17, 2025
711ae3e
Update test benchmark retriever to use BM25
LordDarkula Oct 17, 2025
d403952
Update summary index for qwen 3.1
LordDarkula Oct 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/generator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os, subprocess, textwrap, re, shutil, pathlib

from src.utils import text_cleaning

ANSWER_START = "<<<ANSWER>>>"
ANSWER_END = "<<<END>>>"

Expand Down
19 changes: 12 additions & 7 deletions src/index_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,20 @@ def build_index(
- {prefix}_sources.pkl
- {prefix}_meta.pkl
"""
all_chunks: List[str] = []
sources: List[str] = []
metadata: List[Dict] = []


# Extract sections from markdown
sections = extract_sections_from_markdown(markdown_file)
build_index_from_sections(sections=sections, cfg=cfg, filename=markdown_file, keep_tables=keep_tables, do_visualize=do_visualize)


def build_index_from_sections(sections, cfg: QueryPlanConfig, filename: str, keep_tables: bool = True, do_visualize: bool = False, index_prefix: os.PathLike = None):
index_prefix = index_prefix or cfg.get_index_prefix()

all_chunks: List[str] = []
sources: List[str] = []
metadata: List[Dict] = []

# Create strategy and chunker
strategy = cfg.make_strategy()
chunker = DocumentChunker(strategy=strategy, keep_tables=keep_tables)
Expand All @@ -65,7 +72,7 @@ def build_index(
for i, c in enumerate(sections):
has_table = bool(TABLE_RE.search(c['content']))
meta = {
"filename": markdown_file,
"filename": filename,
"chunk_id": i,
"mode": cfg.chunk_config.to_string(),
"keep_tables": keep_tables,
Expand All @@ -80,11 +87,9 @@ def build_index(
sub_chunks = chunker.chunk(c['content'])
for sub_c in sub_chunks:
all_chunks.append(sub_c)
sources.append(markdown_file)
sources.append(filename)
metadata.append(meta)

index_prefix = cfg.get_index_prefix()

# Step 2: Create embeddings for FAISS index
print(f"Embedding {len(all_chunks):,} chunks with {cfg.embed_model} ...")
embedder = SentenceTransformer(cfg.embed_model)
Expand Down
15 changes: 12 additions & 3 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from src.config import QueryPlanConfig
from src.generator import answer
from src.index_builder import build_index
from src.index_builder import build_index, build_index_from_sections
from src.instrumentation.logging import init_logger, get_logger
from src.ranking.ranker import EnsembleRanker
from src.ranking.reranker import rerank
Expand All @@ -21,7 +21,7 @@ def parse_args() -> argparse.Namespace:
# Required arguments
parser.add_argument(
"mode",
choices=["index", "chat"],
choices=["index", "chat", "summary"],
help="operation mode: 'index' to build index, 'chat' to query"
)

Expand Down Expand Up @@ -119,7 +119,7 @@ def run_chat_session(args: argparse.Namespace, cfg: QueryPlanConfig):
try:
# Disabled till we fix the core pipeline
# cfg = planner.plan(q)
faiss_index, bm25_index, chunks, sources = load_artifacts(cfg)
faiss_index, bm25_index, chunks, sources = load_artifacts(cfg.get_index_prefix())

retrievers = [
FAISSRetriever(faiss_index, cfg.embed_model),
Expand Down Expand Up @@ -208,6 +208,15 @@ def main():
run_index_mode(args, cfg)
elif args.mode == "chat":
run_chat_session(args, cfg)
elif args.mode == "summary":
with open("summary_index.txt") as f:
summary_section = {
"heading": "Summary",
"content": f.read(),
}
summary_index_path = pathlib.Path("index", "summary")
summary_index_path.mkdir(parents=True, exist_ok=True)
build_index_from_sections(sections=[summary_section], filename="summary_index.txt", cfg=cfg, index_prefix=summary_index_path / "summary_index")


if __name__ == "__main__":
Expand Down
3 changes: 2 additions & 1 deletion src/preprocessing/extraction.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import re
import json
import os

def extract_sections_from_markdown(file_path):
def extract_sections_from_markdown(file_path: os.PathLike) -> list[dict[str, str]]:
"""
Chunks a markdown file into sections based on '##' headings.

Expand Down
5 changes: 2 additions & 3 deletions src/retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from __future__ import annotations

import os
import pickle
from abc import ABC, abstractmethod
from typing import List, Tuple, Optional, Dict
Expand All @@ -31,15 +32,13 @@ def _get_embedder(model_name: str) -> SentenceTransformer:

# -------------------------- Read artifacts -------------------------------

def load_artifacts(cfg: QueryPlanConfig) -> Tuple[faiss.Index, List[str], List[str]]:
def load_artifacts(index_prefix: os.PathLike) -> Tuple[faiss.Index, List[str], List[str]]:
"""
Loads:
- FAISS index: {index_prefix}.faiss
- chunks: {index_prefix}_chunks.pkl
- sources: {index_prefix}_sources.pkl
"""
index_prefix = cfg.get_index_prefix()

faiss_index = faiss.read_index(f"{index_prefix}.faiss")
bm25_index = pickle.load(open(f"{index_prefix}_bm25.pkl", "rb"))
chunks = pickle.load(open(f"{index_prefix}_chunks.pkl", "rb"))
Expand Down
111 changes: 111 additions & 0 deletions src/summarizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import textwrap
from typing import Optional
import fitz # PyMuPDF
from tqdm import tqdm
import sys
import os
import pathlib

from src.utils import text_cleaning

src_module = pathlib.Path(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(str(src_module))
sys.path.append(str(src_module.parent))

from src.preprocessing.chunking import DocumentChunker
from src.preprocessing.chunking import SectionRecursiveStrategy, SectionRecursiveConfig
from src.generator import run_llama_cpp

ANSWER_START = "<<<ANSWER>>>"
ANSWER_END = "<<<END>>>"


def summary_prompt(section: str) -> str:
    """Build a chat-formatted (ChatML) summarization prompt for one section.

    The section text is sanitized with ``text_cleaning`` before being placed
    in the user turn. The assistant turn is pre-seeded with ``ANSWER_START``
    and the system turn instructs the model to finish with ``ANSWER_END`` so
    the answer can be extracted from raw llama.cpp output.
    """
    cleaned = text_cleaning(section)
    # NOTE: the template lines start at column 0 on purpose — textwrap.dedent
    # computes the common prefix over *all* lines, including those coming from
    # the interpolated section text, so indenting the template here could
    # change the rendered prompt.
    prompt = textwrap.dedent(
        f"""\
<|im_start|>system
You are a textbook summarizer. Your job is to summarize the following section of a Databases textbook in a couple sentences
while retaining conceptual information
and important definitions. \
The summary must be shorter than the original section.
End your reply with {ANSWER_END}.
<|im_end|>
<|im_start|>user

Textbook Section:
{cleaned}

<|im_end|>
<|im_start|>assistant
{ANSWER_START}
"""
    )
    return prompt


def build_summary_index(
    model_path: os.PathLike = "build/llama.cpp/models/qwen2.5-0.5b-instruct-q5_k_m.gguf",
    pdf_dir: str = "data/chapters/",
):
    """Summarize every chunk of the textbook PDF into a summary-index file.

    Reads ``{pdf_dir}/silberschatz.pdf`` with PyMuPDF, splits the full text
    into chunks via DocumentChunker, asks the llama.cpp model at *model_path*
    to summarize each chunk, strips llama.cpp diagnostic lines from the raw
    output, and writes the surviving summary lines to
    ``summary_index-{model stem}.txt`` in the current directory.

    Args:
        model_path: Path to the GGUF model file handed to ``run_llama_cpp``.
        pdf_dir: Directory containing ``silberschatz.pdf``.
    """
    model_path = pathlib.Path(model_path)
    print(f"Building summary index using model: {model_path}")
    chunker = DocumentChunker(SectionRecursiveStrategy(SectionRecursiveConfig()), keep_tables=True)

    # Concatenate the plain text of every page into one string.
    with fitz.open(pathlib.Path(pdf_dir, "silberschatz.pdf")) as doc:
        full_text = "".join(page.get_text() for page in doc)

    chunks = chunker.chunk(full_text)
    print(f"Number of chunks: {len(chunks)}")

    # llama.cpp interleaves its own diagnostics with the model's answer; any
    # line starting with one of these prefixes is noise, not summary text.
    # Kept as a tuple so str.startswith can test all prefixes in one call.
    llama_debug_line_prefixes = (
        "llama_perf_sampler_print:",
        "llama_perf_context_print:",
        "llama_model_loader:",
        "llama_model_load_from_file_impl:",
        "ggml_cuda_init:",
        "Device 0:",
        "Device 1:",
        "build:",
        "main:",
        "load:",
        "print_info:",
        "load_tensors:",
        "llama_context:",
        "llama_kv_cache:",
        "common_init_from_params:",
        ".........",
        "<think>",
        "</think>",
    )

    def is_debug_line(line: str) -> bool:
        """True for llama.cpp diagnostics or the model's bare "Summary:" header."""
        stripped_line = line.strip()
        return stripped_line == "Summary:" or stripped_line.startswith(llama_debug_line_prefixes)

    # Explicit utf-8 so non-ASCII characters in summaries survive regardless
    # of the platform's default encoding.
    with open(f"summary_index-{model_path.stem}.txt", "w", encoding="utf-8") as f:
        for chunk in tqdm(chunks):
            query = summary_prompt(chunk)
            response = run_llama_cpp(query, model_path)
            # Keep only non-empty, non-debug lines of the model output.
            answer_lines = [
                f"{r_line}\n"
                for r_line in response.split("\n")
                if r_line and not is_debug_line(r_line)
            ]
            f.writelines(answer_lines)

def main():
    """Entry point: build the summary index with the Qwen3 1.7B GGUF model."""
    qwen_model = pathlib.Path("build") / "llama.cpp" / "models" / "Qwen3-1.7B-Q8_0.gguf"
    build_summary_index(model_path=qwen_model)


if __name__ == "__main__":
    main()
15 changes: 15 additions & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import re

def text_cleaning(prompt: str) -> str:
    """Sanitize text before it is embedded in an LLM prompt.

    Three passes, in order:
      1. strip C0/C1 control characters (this includes tabs and newlines,
         so multi-line input is joined into a single line),
      2. collapse runs of whitespace into single spaces and trim the ends,
      3. replace known prompt-injection phrases with "[FILTERED]"
         (case-insensitive).
    """
    control_chars = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
    injection_patterns = (
        r"ignore\s+(all\s+)?previous\s+instructions?",
        r"you\s+are\s+now\s+(in\s+)?developer\s+mode",
        r"system\s+override",
        r"reveal\s+prompt",
    )

    cleaned = control_chars.sub("", prompt)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    for pattern in injection_patterns:
        cleaned = re.sub(pattern, "[FILTERED]", cleaned, flags=re.IGNORECASE)
    return cleaned
Loading