Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
b2b12da
Build summary index
LordDarkula Sep 5, 2025
69c0037
Replace answer call with run_llama_cpp
LordDarkula Sep 5, 2025
3138104
Add default values for model_path and pdf_dir params in build_summar…
LordDarkula Sep 5, 2025
93b20fc
Add main function to summarizer
LordDarkula Sep 6, 2025
0a90fff
Fix summarizer path to pdf
LordDarkula Sep 11, 2025
07170d7
Move text_cleaning function to separate utils module
LordDarkula Sep 11, 2025
0b2471d
Raise ValueError in DocumentChunker for unrecognized mode
LordDarkula Sep 11, 2025
537eed7
Remove llama debug prints from summary index
LordDarkula Sep 11, 2025
a2f5228
Remove Summary title lines from summary
LordDarkula Sep 11, 2025
2f91bde
Add generated summary index
LordDarkula Sep 11, 2025
b9c55ea
Merge branch 'main' into aubhro
LordDarkula Sep 19, 2025
52b7681
Remove nonexistent mode validation in DocumentChunker
LordDarkula Sep 19, 2025
2eedeff
Merge branch 'main' into aubhro
LordDarkula Sep 26, 2025
364abfe
Merge branch 'main' into aubhro
LordDarkula Sep 26, 2025
5cbf1d1
Merge branch 'main' into aubhro
LordDarkula Oct 3, 2025
a9162a9
Merge branch 'main' into aubhro
LordDarkula Oct 13, 2025
2d126c4
Update summary index building with new DocumentChunker API
LordDarkula Oct 13, 2025
6c328a0
Add summary indexes
LordDarkula Oct 17, 2025
c823df3
Merge branch 'main' into aubhro
LordDarkula Oct 17, 2025
1ebf8e6
Update import paths in summarizer to reflect new project structure
LordDarkula Oct 17, 2025
9b64b50
Add option in main to build index from summary
LordDarkula Oct 17, 2025
21831cd
Update load_artifacts param to take prefix directly
LordDarkula Oct 17, 2025
711ae3e
Update test benchmark retriever to use BM25
LordDarkula Oct 17, 2025
d403952
Update summary index for qwen 3.1
LordDarkula Oct 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/generator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os, subprocess, textwrap, re, shutil, pathlib

from src.utils import text_cleaning

ANSWER_START = "<<<ANSWER>>>"
ANSWER_END = "<<<END>>>"

Expand Down
19 changes: 12 additions & 7 deletions src/index_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,20 @@ def build_index(
- {prefix}_sources.pkl
- {prefix}_meta.pkl
"""
all_chunks: List[str] = []
sources: List[str] = []
metadata: List[Dict] = []


# Extract sections from markdown
sections = extract_sections_from_markdown(markdown_file)
build_index_from_sections(sections=sections, cfg=cfg, filename=markdown_file, keep_tables=keep_tables, do_visualize=do_visualize)


def build_index_from_sections(sections, cfg: QueryPlanConfig, filename: str, keep_tables: bool = True, do_visualize: bool = False, index_prefix: os.PathLike = None):
index_prefix = index_prefix or cfg.get_index_prefix()

all_chunks: List[str] = []
sources: List[str] = []
metadata: List[Dict] = []

# Create strategy and chunker
strategy = cfg.make_strategy()
chunker = DocumentChunker(strategy=strategy, keep_tables=keep_tables)
Expand All @@ -65,7 +72,7 @@ def build_index(
for i, c in enumerate(sections):
has_table = bool(TABLE_RE.search(c['content']))
meta = {
"filename": markdown_file,
"filename": filename,
"chunk_id": i,
"mode": cfg.chunk_config.to_string(),
"keep_tables": keep_tables,
Expand All @@ -80,11 +87,9 @@ def build_index(
sub_chunks = chunker.chunk(c['content'])
for sub_c in sub_chunks:
all_chunks.append(sub_c)
sources.append(markdown_file)
sources.append(filename)
metadata.append(meta)

index_prefix = cfg.get_index_prefix()

# Step 2: Create embeddings for FAISS index
print(f"Embedding {len(all_chunks):,} chunks with {cfg.embed_model} ...")
embedder = SentenceTransformer(cfg.embed_model)
Expand Down
15 changes: 12 additions & 3 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from src.config import QueryPlanConfig
from src.generator import answer
from src.index_builder import build_index
from src.index_builder import build_index, build_index_from_sections
from src.instrumentation.logging import init_logger, get_logger
from src.ranking.ranker import EnsembleRanker
from src.ranking.reranker import rerank
Expand All @@ -21,7 +21,7 @@ def parse_args() -> argparse.Namespace:
# Required arguments
parser.add_argument(
"mode",
choices=["index", "chat"],
choices=["index", "chat", "summary"],
help="operation mode: 'index' to build index, 'chat' to query"
)

Expand Down Expand Up @@ -119,7 +119,7 @@ def run_chat_session(args: argparse.Namespace, cfg: QueryPlanConfig):
try:
# Disabled till we fix the core pipeline
# cfg = planner.plan(q)
faiss_index, bm25_index, chunks, sources = load_artifacts(cfg)
faiss_index, bm25_index, chunks, sources = load_artifacts(cfg.get_index_prefix())

retrievers = [
FAISSRetriever(faiss_index, cfg.embed_model),
Expand Down Expand Up @@ -208,6 +208,15 @@ def main():
run_index_mode(args, cfg)
elif args.mode == "chat":
run_chat_session(args, cfg)
elif args.mode == "summary":
with open("summary_index.txt") as f:
summary_section = {
"heading": "Summary",
"content": f.read(),
}
summary_index_path = pathlib.Path("index", "summary")
summary_index_path.mkdir(parents=True, exist_ok=True)
build_index_from_sections(sections=[summary_section], filename="summary_index.txt", cfg=cfg, index_prefix=summary_index_path / "summary_index")


if __name__ == "__main__":
Expand Down
3 changes: 2 additions & 1 deletion src/preprocessing/extraction.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import re
import json
import os

def extract_sections_from_markdown(file_path):
def extract_sections_from_markdown(file_path: os.PathLike) -> list[dict[str, str]]:
"""
Chunks a markdown file into sections based on '##' headings.

Expand Down
5 changes: 2 additions & 3 deletions src/retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from __future__ import annotations

import os
import pickle
from abc import ABC, abstractmethod
from typing import List, Tuple, Optional, Dict
Expand All @@ -31,15 +32,13 @@ def _get_embedder(model_name: str) -> SentenceTransformer:

# -------------------------- Read artifacts -------------------------------

def load_artifacts(cfg: QueryPlanConfig) -> Tuple[faiss.Index, List[str], List[str]]:
def load_artifacts(index_prefix: os.PathLike) -> Tuple[faiss.Index, List[str], List[str]]:
"""
Loads:
- FAISS index: {index_prefix}.faiss
- chunks: {index_prefix}_chunks.pkl
- sources: {index_prefix}_sources.pkl
"""
index_prefix = cfg.get_index_prefix()

faiss_index = faiss.read_index(f"{index_prefix}.faiss")
bm25_index = pickle.load(open(f"{index_prefix}_bm25.pkl", "rb"))
chunks = pickle.load(open(f"{index_prefix}_chunks.pkl", "rb"))
Expand Down
111 changes: 111 additions & 0 deletions src/summarizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import textwrap
from typing import Optional
import fitz # PyMuPDF
from tqdm import tqdm
import sys
import os
import pathlib

from src.utils import text_cleaning

src_module = pathlib.Path(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(str(src_module))
sys.path.append(str(src_module.parent))

from src.preprocessing.chunking import DocumentChunker
from src.preprocessing.chunking import SectionRecursiveStrategy, SectionRecursiveConfig
from src.generator import run_llama_cpp

ANSWER_START = "<<<ANSWER>>>"
ANSWER_END = "<<<END>>>"


def summary_prompt(section: str) -> str:
    """Build a chat-formatted (ChatML) summarization prompt for one section.

    The section text is sanitized with ``text_cleaning`` before being placed
    in the user turn. The assistant turn is pre-seeded with ``ANSWER_START``
    and the system turn instructs the model to finish with ``ANSWER_END`` so
    the answer can be extracted from raw llama.cpp output.
    """
    cleaned = text_cleaning(section)
    # NOTE: the template lines start at column 0 on purpose — textwrap.dedent
    # computes the common prefix over *all* lines, including those coming from
    # the interpolated section text, so indenting the template here could
    # change the rendered prompt.
    prompt = textwrap.dedent(
        f"""\
<|im_start|>system
You are a textbook summarizer. Your job is to summarize the following section of a Databases textbook in a couple sentences
while retaining conceptual information
and important definitions. \
The summary must be shorter than the original section.
End your reply with {ANSWER_END}.
<|im_end|>
<|im_start|>user

Textbook Section:
{cleaned}

<|im_end|>
<|im_start|>assistant
{ANSWER_START}
"""
    )
    return prompt


def build_summary_index(
    model_path: os.PathLike = "build/llama.cpp/models/qwen2.5-0.5b-instruct-q5_k_m.gguf",
    pdf_dir: str = "data/chapters/",
):
    """Summarize every chunk of the textbook PDF into a summary-index file.

    Reads ``{pdf_dir}/silberschatz.pdf`` with PyMuPDF, splits the full text
    into chunks via DocumentChunker, asks the llama.cpp model at *model_path*
    to summarize each chunk, strips llama.cpp diagnostic lines from the raw
    output, and writes the surviving summary lines to
    ``summary_index-{model stem}.txt`` in the current directory.

    Args:
        model_path: Path to the GGUF model file handed to ``run_llama_cpp``.
        pdf_dir: Directory containing ``silberschatz.pdf``.
    """
    model_path = pathlib.Path(model_path)
    print(f"Building summary index using model: {model_path}")
    chunker = DocumentChunker(SectionRecursiveStrategy(SectionRecursiveConfig()), keep_tables=True)

    # Concatenate the plain text of every page into one string.
    with fitz.open(pathlib.Path(pdf_dir, "silberschatz.pdf")) as doc:
        full_text = "".join(page.get_text() for page in doc)

    chunks = chunker.chunk(full_text)
    print(f"Number of chunks: {len(chunks)}")

    # llama.cpp interleaves its own diagnostics with the model's answer; any
    # line starting with one of these prefixes is noise, not summary text.
    # Kept as a tuple so str.startswith can test all prefixes in one call.
    llama_debug_line_prefixes = (
        "llama_perf_sampler_print:",
        "llama_perf_context_print:",
        "llama_model_loader:",
        "llama_model_load_from_file_impl:",
        "ggml_cuda_init:",
        "Device 0:",
        "Device 1:",
        "build:",
        "main:",
        "load:",
        "print_info:",
        "load_tensors:",
        "llama_context:",
        "llama_kv_cache:",
        "common_init_from_params:",
        ".........",
        "<think>",
        "</think>",
    )

    def is_debug_line(line: str) -> bool:
        """True for llama.cpp diagnostics or the model's bare "Summary:" header."""
        stripped_line = line.strip()
        return stripped_line == "Summary:" or stripped_line.startswith(llama_debug_line_prefixes)

    # Explicit utf-8 so non-ASCII characters in summaries survive regardless
    # of the platform's default encoding.
    with open(f"summary_index-{model_path.stem}.txt", "w", encoding="utf-8") as f:
        for chunk in tqdm(chunks):
            query = summary_prompt(chunk)
            response = run_llama_cpp(query, model_path)
            # Keep only non-empty, non-debug lines of the model output.
            answer_lines = [
                f"{r_line}\n"
                for r_line in response.split("\n")
                if r_line and not is_debug_line(r_line)
            ]
            f.writelines(answer_lines)

def main():
    """Entry point: build the summary index with the Qwen3 1.7B GGUF model."""
    qwen_model = pathlib.Path("build") / "llama.cpp" / "models" / "Qwen3-1.7B-Q8_0.gguf"
    build_summary_index(model_path=qwen_model)


if __name__ == "__main__":
    main()
15 changes: 15 additions & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import re

def text_cleaning(prompt: str) -> str:
    """Sanitize text before it is embedded in an LLM prompt.

    Three passes, in order:
      1. strip C0/C1 control characters (this includes tabs and newlines,
         so multi-line input is joined into a single line),
      2. collapse runs of whitespace into single spaces and trim the ends,
      3. replace known prompt-injection phrases with "[FILTERED]"
         (case-insensitive).
    """
    control_chars = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
    injection_patterns = (
        r"ignore\s+(all\s+)?previous\s+instructions?",
        r"you\s+are\s+now\s+(in\s+)?developer\s+mode",
        r"system\s+override",
        r"reveal\s+prompt",
    )

    cleaned = control_chars.sub("", prompt)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    for pattern in injection_patterns:
        cleaned = re.sub(pattern, "[FILTERED]", cleaned, flags=re.IGNORECASE)
    return cleaned
Loading