15 changes: 8 additions & 7 deletions src/api_server.py
@@ -139,9 +139,9 @@ async def lifespan(app: FastAPI):
rrf_k=int(_config.rrf_k),
)

print("TokenSmith API initialized successfully")
print("TokenSmith API initialized successfully")
except Exception as exc:
print(f"⚠️ Warning: Could not load artifacts: {exc}")
print(f"Warning: Could not load artifacts: {exc}")
Contributor: Thanks for removing these emojis. Hate when LLMs do that :)

print(" Run indexing first or check your configuration")

yield
@@ -185,7 +185,7 @@ async def health_check():
@app.post("/api/test-chat")
async def test_chat(request: ChatRequest):
"""Test chat endpoint that bypasses generation to isolate issues."""
print(f"🔍 Test chat request: {request.query}")
print(f"Test chat request: {request.query}")

try:
_ensure_initialized()
@@ -265,9 +265,10 @@ async def event_generator():
chunks_by_page: Dict[int, List[str]] = {}
for i in topk_idxs[:max_chunks]:
source_text = sources[i]
page = page_nums[i] if i in page_nums else 1
sources_used.add(SourceItem(page=page, text=source_text))
chunks_by_page.setdefault(page, []).append(chunks[i])
pages = page_nums.get(i, [1])
for page in pages:
chunks_by_page.setdefault(page, []).append(chunks[i])
sources_used.add(SourceItem(page=page, text=source_text))

# Remove duplicates by converting to set of tuples, then back to SourceItem
yield f"data: {json.dumps({'type': 'sources', 'content': [s.dict() for s in sources_used]})}\n\n"
@@ -291,7 +292,7 @@ async def event_generator():
@app.post("/api/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
"""Main chat endpoint."""
print(f"🔍 Received chat request: {request.query}") # Debug logging
print(f"Received chat request: {request.query}") # Debug logging

_ensure_initialized()

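For readers skimming the hunk above: the behavioural change in the streaming handler is that a retrieved chunk is now credited to every page it spans instead of a single page. Below is a minimal sketch of that grouping with the FastAPI plumbing stripped out; the helper name, the tuple stand-in for `SourceItem`, and the `max_chunks` default are illustrative, not part of the PR.

```python
from typing import Dict, List, Set, Tuple


def group_chunks_by_page(
    topk_idxs: List[int],
    chunks: List[str],
    sources: List[str],
    page_nums: Dict[int, List[int]],
    max_chunks: int = 5,
) -> Tuple[Dict[int, List[str]], Set[Tuple[int, str]]]:
    """Attribute each retrieved chunk to every page it touches."""
    chunks_by_page: Dict[int, List[str]] = {}
    sources_used: Set[Tuple[int, str]] = set()
    for i in topk_idxs[:max_chunks]:
        # Fall back to page 1 when no page metadata exists for this chunk,
        # mirroring page_nums.get(i, [1]) in the endpoint.
        for page in page_nums.get(i, [1]):
            chunks_by_page.setdefault(page, []).append(chunks[i])
            sources_used.add((page, sources[i]))
    return chunks_by_page, sources_used
```

Registering the chunk under each of its pages keeps the sources payload consistent with the multi-page `page_numbers` metadata introduced in `index_builder.py` below.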
7 changes: 6 additions & 1 deletion src/index_builder.py
@@ -99,6 +99,9 @@ def build_index(

# Iterate through each chunk produced from this section
for sub_chunk_id, sub_chunk in enumerate(sub_chunks):
# Track all pages this specific chunk touches
chunk_pages = set()

# Split the sub_chunk by page markers to see if it
# spans multiple pages.
fragments = page_pattern.split(sub_chunk)
@@ -107,6 +110,7 @@
# it belongs to the current_page.
if fragments[0].strip():
page_to_chunk_ids.setdefault(current_page, set()).add(total_chunks+sub_chunk_id)
chunk_pages.add(current_page)

# Process the new pages found within this sub_chunk. Step by 2
# where each pair represents (page number, text after it)
@@ -118,6 +122,7 @@
# If there is text after this marker, it belongs to the new_page.
if fragments[i+1].strip():
page_to_chunk_ids.setdefault(new_page, set()).add(total_chunks + sub_chunk_id)
chunk_pages.add(new_page)

current_page = new_page

@@ -140,7 +145,7 @@
"section": c['heading'],
"section_path": full_section_path,
"text_preview": clean_chunk[:100],
"page_number": current_page,
"page_numbers": sorted(list(chunk_pages)),
"chunk_id": total_chunks + sub_chunk_id
}

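To make the new `page_numbers` metadata easier to follow, here is a self-contained sketch of how the pages for one chunk can be collected from inline page markers. The regex and the convention that text after a marker belongs to that marker's page are assumptions for illustration; the real `build_index` carries its own `page_pattern` and a running `current_page` across chunks.

```python
import re
from typing import List, Tuple

# Assumed marker format; build_index defines its own page_pattern.
page_pattern = re.compile(r"--- Page (\d+) ---")


def pages_for_chunk(sub_chunk: str, current_page: int) -> Tuple[List[int], int]:
    """Return the sorted pages a chunk touches and the updated current page."""
    chunk_pages = set()
    fragments = page_pattern.split(sub_chunk)

    # Text before the first marker still belongs to the current page.
    if fragments[0].strip():
        chunk_pages.add(current_page)

    # With a capturing group, split() alternates (page number, text after it).
    for i in range(1, len(fragments), 2):
        new_page = int(fragments[i])
        if fragments[i + 1].strip():
            chunk_pages.add(new_page)
        current_page = new_page

    return sorted(chunk_pages), current_page


# A chunk that starts on page 3 and spills onto page 4:
pages, current = pages_for_chunk("end of p3 --- Page 4 --- start of p4", 3)
# pages == [3, 4], current == 4
```

The sorted list is what ends up under the chunk's `page_numbers` key, replacing the old single `page_number` value.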
19 changes: 10 additions & 9 deletions src/preprocessing/extraction.py
@@ -203,11 +203,11 @@ def convert_and_save_with_page_numbers(input_file_path, output_file_path):

doc = result.document

# 1. Define a unique placeholder that won't appear in the text.
# Define a unique placeholder that won't appear in the text.
# Using "\n" ensures it's on its own line.
UNIQUE_PLACEHOLDER = "\n%%%__DOCLING_PAGE_BREAK__%%%\n"

# 2. Export the *entire* document at once, using our placeholder.
# Export the entire document at once, using our placeholder.
# This avoids the fragile doc.filter() method.
try:
full_markdown = doc.export_to_markdown(page_break_placeholder=UNIQUE_PLACEHOLDER)
@@ -223,24 +223,24 @@ def convert_and_save_with_page_numbers(input_file_path, output_file_path):
print(f"Error writing fallback file: {e_io}", file=sys.stderr)
return

# 3. Split the full markdown by our unique placeholder.
# Split the full markdown by our unique placeholder.
# This gives us a list where each item is one page's content.
markdown_pages = full_markdown.split(UNIQUE_PLACEHOLDER)

final_output_chunks = []

# 4. Iterate through the pages, adding our custom footer.
# Iterate through the pages, adding our custom footer.
# We use enumerate to get a 1-based page number.
num_pages = len(markdown_pages)
for i, page_content in enumerate(markdown_pages, 1):
# Add the content for the current page
final_output_chunks.append(page_content)

# Add our custom footer, but *not* after the very last page
# Add our custom footer, but not after the very last page
if i < num_pages:
final_output_chunks.append(f"\n\n--- Page {i} ---\n\n")

# 5. Write the combined markdown string to the output file
# Write the combined markdown string to the output file
try:
with open(output_file_path, "w", encoding="utf-8") as f:
f.write("".join(final_output_chunks))
@@ -275,8 +275,9 @@ def preprocess_extracted_section(text: str) -> str:


if __name__ == '__main__':
input_pdf = "data/chapters/silberschatz.pdf"
output_md = 'data/silberschatz.md'
project_root = Path(__file__).resolve().parent.parent.parent
input_pdf = project_root / "data/chapters/textbook.pdf"
output_md = project_root / "data/testing.md"

print(f"Converting '{input_pdf}' to '{output_md}'...")
convert_and_save_with_page_numbers(input_pdf, output_md)
@@ -285,7 +286,7 @@ def preprocess_extracted_section(text: str) -> str:

if extracted_sections:
print(f"Successfully extracted {len(extracted_sections)} sections.")
output_filename = 'data/extracted_sections.json'
output_filename = project_root / "data/extracted_sections.json"
with open(output_filename, 'w', encoding='utf-8') as f:
json.dump(extracted_sections, f, indent=4, ensure_ascii=False)
print(f"\nFull extracted content saved to '{output_filename}'")
8 changes: 3 additions & 5 deletions src/retriever.py
@@ -55,7 +55,7 @@ def load_artifacts(artifacts_dir: os.PathLike, index_prefix: str) -> Tuple[faiss

# -------------------------- Helper to get page nums for chunks -------------------------------

def get_page_numbers(chunk_indices: list[int], metadata: list[dict]) -> dict[int, int]:
def get_page_numbers(chunk_indices: list[int], metadata: list[dict]) -> dict[int, List[int]]:
"""
Retrieves page numbers for the provided chunk indices.
"""
@@ -68,10 +68,8 @@ def get_page_numbers(chunk_indices: list[int], metadata: list[dict]) -> dict[int
idx = int(idx)
# Ensure index is within bounds
if 0 <= idx < len(metadata):
# Access the 'page_number' key we saved in index_builder.py
p_num = metadata[idx].get("page_number")
if p_num is not None:
page_numbers[idx] = p_num
# Access the 'page_numbers' key we saved in index_builder.py
page_numbers[idx] = metadata[idx].get("page_numbers")

return page_numbers

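Finally, the retriever change is essentially a key rename plus a type change: each chunk index now maps to the list stored under `page_numbers` rather than a single `page_number`. A toy usage sketch with made-up metadata:

```python
from typing import Dict, List


def get_page_numbers(chunk_indices: List[int], metadata: List[dict]) -> Dict[int, List[int]]:
    page_numbers: Dict[int, List[int]] = {}
    for idx in chunk_indices:
        idx = int(idx)
        if 0 <= idx < len(metadata):
            # Entries missing the "page_numbers" key map to None here.
            page_numbers[idx] = metadata[idx].get("page_numbers")
    return page_numbers


meta = [{"page_numbers": [3, 4]}, {"page_numbers": [5]}]
print(get_page_numbers([0, 1], meta))  # {0: [3, 4], 1: [5]}
```

Callers such as the streaming endpoint then iterate the list per chunk, as in the `api_server.py` hunk at the top of this diff.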