5 changes: 5 additions & 0 deletions Makefile
@@ -58,6 +58,11 @@ clean:
find . -type d -name __pycache__ -exec rm -rf {} +
find . -type f -name "*.pyc" -delete

# PDF to Markdown extraction
run-extract:
@echo "Extracting PDF to markdown (data/chapters/*.pdf -> data/book_with_pages.md)"
Contributor:
Adjust the "-> data/book_with_pages.md" part of this echo statement to just say MD files in ./data.

conda run --no-capture-output -n tokensmith python -m src.preprocessing.extraction

# Run modes
run-index:
@echo "Running TokenSmith index mode with additional CLI args: $(ARGS)"
13 changes: 10 additions & 3 deletions README.md
@@ -84,7 +84,14 @@ mkdir -p data/chapters
cp your-documents.pdf data/chapters/
```

### 5) Index documents
### 5) Extract PDF to markdown

```shell
make run-extract
```
This generates a `book_with_pages.md` file under `TOKENSMITH/data/`.
Contributor:
Adjust according to the previous comments.


### 6) Index documents

```shell
make run-index
@@ -96,15 +103,15 @@ With custom parameters:
make run-index ARGS="--pdf_range 1-10 --chunk_mode chars --visualize"
```

### 6) Chat
### 7) Chat

```shell
python -m src.main chat
```

> If you see a missing-model error, download `qwen2.5-0.5b-instruct-q5_k_m.gguf` into `llama.cpp/models`.

### 7) Deactivate
### 8) Deactivate

```shell
conda deactivate
2 changes: 1 addition & 1 deletion src/main.py
@@ -87,7 +87,7 @@ def run_index_mode(args: argparse.Namespace, cfg: RAGConfig):
artifacts_dir = cfg.get_artifacts_directory()

build_index(
markdown_file="data/silberschatz.md",
markdown_file="data/book_with_pages.md",
Contributor:
Similarly here. Look over all the .md files in ./data and just build the index over the first .md file you find. I will fix this behavior later myself; this is just so the code doesn't break after your PR is merged. (A minimal sketch of what I mean follows this hunk.)

chunker=chunker,
chunk_config=cfg.chunk_config,
embedding_model_path=cfg.embed_model,
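
A minimal sketch of that interim behavior, assuming `build_index` keeps its current signature; the glob logic is illustrative, not the final fix:

```python
from pathlib import Path

# Illustrative only: index the first markdown file found under data/.
md_files = sorted(Path("data").glob("*.md"))
if not md_files:
    raise FileNotFoundError("No .md files found in data/; run `make run-extract` first.")

build_index(
    markdown_file=str(md_files[0]),  # first .md found, per the comment above
    chunker=chunker,
    chunk_config=cfg.chunk_config,
    embedding_model_path=cfg.embed_model,
    # ...remaining arguments unchanged
)
```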
18 changes: 16 additions & 2 deletions src/preprocessing/extraction.py
@@ -275,8 +275,22 @@ def preprocess_extracted_section(text: str) -> str:


if __name__ == '__main__':
input_pdf = "data/chapters/silberschatz.pdf"
output_md = 'data/silberschatz.md'
# Collect all PDF files under data/chapters/
chapters_dir = Path("data/chapters")
pdfs = sorted(chapters_dir.glob("*.pdf"))

# Ensure exactly one PDF is found
if len(pdfs) == 0:
print("ERROR: No PDFs found in data/chapters/. Please copy a PDF there first.", file=sys.stderr)
Contributor:
Please change this to accept multiple PDFs. I know this may break other things in the pipeline, but I'll fix those afterwards. Basically, you will do extraction for each PDF you find and store the results in the "data" folder with this naming convention: "<input_file_name_without_the_.pdf>--extracted_markdown.md".

So if you have 2 files, "chapter1.pdf" and "blah2.pdf", you will have 2 .md files in ./data named "chapter1--extracted_markdown.md" and "blah2--extracted_markdown.md". (A minimal sketch follows the hunk below.)

sys.exit(1)
if len(pdfs) > 1:
print("ERROR: Multiple PDFs found in data/chapters/. Keep only one for now:", file=sys.stderr)
for p in pdfs:
print(f" - {p}", file=sys.stderr)
sys.exit(1)

input_pdf = str(pdfs[0])
output_md = "data/book_with_pages.md"

print(f"Converting '{input_pdf}' to '{output_md}'...")
convert_and_save_with_page_numbers(input_pdf, output_md)
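
A minimal sketch of the requested loop, assuming `convert_and_save_with_page_numbers(input_pdf, output_md)` keeps its current signature and the output names follow the convention in the comment above:

```python
import sys
from pathlib import Path

# Illustrative sketch: extract every PDF under data/chapters/ and write
# one markdown file per PDF into data/ named "<stem>--extracted_markdown.md".
chapters_dir = Path("data/chapters")
pdfs = sorted(chapters_dir.glob("*.pdf"))
if not pdfs:
    print("ERROR: No PDFs found in data/chapters/. Please copy a PDF there first.", file=sys.stderr)
    sys.exit(1)

for pdf in pdfs:
    # e.g. data/chapters/chapter1.pdf -> data/chapter1--extracted_markdown.md
    output_md = Path("data") / f"{pdf.stem}--extracted_markdown.md"
    print(f"Converting '{pdf}' to '{output_md}'...")
    convert_and_save_with_page_numbers(str(pdf), str(output_md))
```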