15 changes: 8 additions & 7 deletions src/api_server.py
@@ -139,9 +139,9 @@ async def lifespan(app: FastAPI):
rrf_k=int(_config.rrf_k),
)

print("TokenSmith API initialized successfully")
print("TokenSmith API initialized successfully")
except Exception as exc:
print(f"⚠️ Warning: Could not load artifacts: {exc}")
print(f"Warning: Could not load artifacts: {exc}")
Contributor: Thanks for removing these emojis. Hate when LLMs do that :)

print(" Run indexing first or check your configuration")

yield
@@ -185,7 +185,7 @@ async def health_check():
@app.post("/api/test-chat")
async def test_chat(request: ChatRequest):
"""Test chat endpoint that bypasses generation to isolate issues."""
print(f"🔍 Test chat request: {request.query}")
print(f"Test chat request: {request.query}")

try:
_ensure_initialized()
@@ -265,9 +265,10 @@ async def event_generator():
chunks_by_page: Dict[int, List[str]] = {}
for i in topk_idxs[:max_chunks]:
source_text = sources[i]
page = page_nums[i] if i in page_nums else 1
sources_used.add(SourceItem(page=page, text=source_text))
chunks_by_page.setdefault(page, []).append(chunks[i])
pages = page_nums.get(i, [1])
for page in pages:
chunks_by_page.setdefault(page, []).append(chunks[i])
sources_used.add(SourceItem(page=page, text=source_text))

# Remove duplicates by converting to set of tuples, then back to SourceItem
yield f"data: {json.dumps({'type': 'sources', 'content': [s.dict() for s in sources_used]})}\n\n"
@@ -291,7 +292,7 @@ async def event_generator():
@app.post("/api/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
"""Main chat endpoint."""
print(f"🔍 Received chat request: {request.query}") # Debug logging
print(f"Received chat request: {request.query}") # Debug logging

_ensure_initialized()

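For readers skimming the hunk above: the behavioural change in the streaming handler is that a retrieved chunk is now credited to every page it spans instead of a single page. Below is a minimal sketch of that grouping with the FastAPI plumbing stripped out; the helper name, the tuple stand-in for `SourceItem`, and the `max_chunks` default are illustrative, not part of the PR.

```python
from typing import Dict, List, Set, Tuple


def group_chunks_by_page(
    topk_idxs: List[int],
    chunks: List[str],
    sources: List[str],
    page_nums: Dict[int, List[int]],
    max_chunks: int = 5,
) -> Tuple[Dict[int, List[str]], Set[Tuple[int, str]]]:
    """Attribute each retrieved chunk to every page it touches."""
    chunks_by_page: Dict[int, List[str]] = {}
    sources_used: Set[Tuple[int, str]] = set()
    for i in topk_idxs[:max_chunks]:
        # Fall back to page 1 when no page metadata exists for this chunk,
        # mirroring page_nums.get(i, [1]) in the endpoint.
        for page in page_nums.get(i, [1]):
            chunks_by_page.setdefault(page, []).append(chunks[i])
            sources_used.add((page, sources[i]))
    return chunks_by_page, sources_used
```

Registering the chunk under each of its pages keeps the sources payload consistent with the multi-page `page_numbers` metadata introduced in `index_builder.py` below.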
7 changes: 6 additions & 1 deletion src/index_builder.py
@@ -99,6 +99,9 @@ def build_index(

# Iterate through each chunk produced from this section
for sub_chunk_id, sub_chunk in enumerate(sub_chunks):
# Track all pages this specific chunk touches
chunk_pages = set()

# Split the sub_chunk by page markers to see if it
# spans multiple pages.
fragments = page_pattern.split(sub_chunk)
@@ -107,6 +110,7 @@
# it belongs to the current_page.
if fragments[0].strip():
page_to_chunk_ids.setdefault(current_page, set()).add(total_chunks+sub_chunk_id)
chunk_pages.add(current_page)

# Process the new pages found within this sub_chunk. Step by 2
# where each pair represents (page number, text after it)
@@ -118,6 +122,7 @@
# If there is text after this marker, it belongs to the new_page.
if fragments[i+1].strip():
page_to_chunk_ids.setdefault(new_page, set()).add(total_chunks + sub_chunk_id)
chunk_pages.add(new_page)

current_page = new_page

@@ -140,7 +145,7 @@
"section": c['heading'],
"section_path": full_section_path,
"text_preview": clean_chunk[:100],
"page_number": current_page,
"page_numbers": sorted(list(chunk_pages)),
"chunk_id": total_chunks + sub_chunk_id
}

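To make the new `page_numbers` metadata easier to follow, here is a self-contained sketch of how the pages for one chunk can be collected from inline page markers. The regex and the convention that text after a marker belongs to that marker's page are assumptions for illustration; the real `build_index` carries its own `page_pattern` and a running `current_page` across chunks.

```python
import re
from typing import List, Tuple

# Assumed marker format; build_index defines its own page_pattern.
page_pattern = re.compile(r"--- Page (\d+) ---")


def pages_for_chunk(sub_chunk: str, current_page: int) -> Tuple[List[int], int]:
    """Return the sorted pages a chunk touches and the updated current page."""
    chunk_pages = set()
    fragments = page_pattern.split(sub_chunk)

    # Text before the first marker still belongs to the current page.
    if fragments[0].strip():
        chunk_pages.add(current_page)

    # With a capturing group, split() alternates (page number, text after it).
    for i in range(1, len(fragments), 2):
        new_page = int(fragments[i])
        if fragments[i + 1].strip():
            chunk_pages.add(new_page)
        current_page = new_page

    return sorted(chunk_pages), current_page


# A chunk that starts on page 3 and spills onto page 4:
pages, current = pages_for_chunk("end of p3 --- Page 4 --- start of p4", 3)
# pages == [3, 4], current == 4
```

The sorted list is what ends up under the chunk's `page_numbers` key, replacing the old single `page_number` value.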
19 changes: 10 additions & 9 deletions src/preprocessing/extraction.py
@@ -203,11 +203,11 @@ def convert_and_save_with_page_numbers(input_file_path, output_file_path):

doc = result.document

# 1. Define a unique placeholder that won't appear in the text.
# Define a unique placeholder that won't appear in the text.
# Using "\n" ensures it's on its own line.
UNIQUE_PLACEHOLDER = "\n%%%__DOCLING_PAGE_BREAK__%%%\n"

# 2. Export the *entire* document at once, using our placeholder.
# Export the entire document at once, using our placeholder.
# This avoids the fragile doc.filter() method.
try:
full_markdown = doc.export_to_markdown(page_break_placeholder=UNIQUE_PLACEHOLDER)
@@ -223,24 +223,24 @@ def convert_and_save_with_page_numbers(input_file_path, output_file_path):
print(f"Error writing fallback file: {e_io}", file=sys.stderr)
return

# 3. Split the full markdown by our unique placeholder.
# Split the full markdown by our unique placeholder.
# This gives us a list where each item is one page's content.
markdown_pages = full_markdown.split(UNIQUE_PLACEHOLDER)

final_output_chunks = []

# 4. Iterate through the pages, adding our custom footer.
# Iterate through the pages, adding our custom footer.
# We use enumerate to get a 1-based page number.
num_pages = len(markdown_pages)
for i, page_content in enumerate(markdown_pages, 1):
# Add the content for the current page
final_output_chunks.append(page_content)

# Add our custom footer, but *not* after the very last page
# Add our custom footer, but not after the very last page
if i < num_pages:
final_output_chunks.append(f"\n\n--- Page {i} ---\n\n")

# 5. Write the combined markdown string to the output file
# Write the combined markdown string to the output file
try:
with open(output_file_path, "w", encoding="utf-8") as f:
f.write("".join(final_output_chunks))
@@ -275,8 +275,9 @@ def preprocess_extracted_section(text: str) -> str:


if __name__ == '__main__':
input_pdf = "data/chapters/silberschatz.pdf"
output_md = 'data/silberschatz.md'
project_root = Path(__file__).resolve().parent.parent.parent
input_pdf = project_root / "data/chapters/textbook.pdf"
output_md = project_root / "data/testing.md"

print(f"Converting '{input_pdf}' to '{output_md}'...")
convert_and_save_with_page_numbers(input_pdf, output_md)
@@ -285,7 +286,7 @@ def preprocess_extracted_section(text: str) -> str:

if extracted_sections:
print(f"Successfully extracted {len(extracted_sections)} sections.")
output_filename = 'data/extracted_sections.json'
output_filename = project_root / "data/extracted_sections.json"
with open(output_filename, 'w', encoding='utf-8') as f:
json.dump(extracted_sections, f, indent=4, ensure_ascii=False)
print(f"\nFull extracted content saved to '{output_filename}'")
8 changes: 3 additions & 5 deletions src/retriever.py
@@ -55,7 +55,7 @@ def load_artifacts(artifacts_dir: os.PathLike, index_prefix: str) -> Tuple[faiss

# -------------------------- Helper to get page nums for chunks -------------------------------

def get_page_numbers(chunk_indices: list[int], metadata: list[dict]) -> dict[int, int]:
def get_page_numbers(chunk_indices: list[int], metadata: list[dict]) -> dict[int, List[int]]:
"""
Retrieves page numbers for the provided chunk indices.
"""
@@ -68,10 +68,8 @@ def get_page_numbers(chunk_indices: list[int], metadata: list[dict]) -> dict[int
idx = int(idx)
# Ensure index is within bounds
if 0 <= idx < len(metadata):
# Access the 'page_number' key we saved in index_builder.py
p_num = metadata[idx].get("page_number")
if p_num is not None:
page_numbers[idx] = p_num
# Access the 'page_numbers' key we saved in index_builder.py
page_numbers[idx] = metadata[idx].get("page_numbers")

return page_numbers

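Finally, the retriever change is essentially a key rename plus a type change: each chunk index now maps to the list stored under `page_numbers` rather than a single `page_number`. A toy usage sketch with made-up metadata:

```python
from typing import Dict, List


def get_page_numbers(chunk_indices: List[int], metadata: List[dict]) -> Dict[int, List[int]]:
    page_numbers: Dict[int, List[int]] = {}
    for idx in chunk_indices:
        idx = int(idx)
        if 0 <= idx < len(metadata):
            # Entries missing the "page_numbers" key map to None here.
            page_numbers[idx] = metadata[idx].get("page_numbers")
    return page_numbers


meta = [{"page_numbers": [3, 4]}, {"page_numbers": [5]}]
print(get_page_numbers([0, 1], meta))  # {0: [3, 4], 1: [5]}
```

Callers such as the streaming endpoint then iterate the list per chunk, as in the `api_server.py` hunk at the top of this diff.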