Commits (showing changes from 15 of 22)
7e6c1c0
feat(pdfx): setup and recorded dependencies and python module
evansun06 Nov 4, 2025
ade88f4
feat(pdfx): created basic cli setup including health_check and extrac…
evansun06 Nov 4, 2025
ba1f768
feat(pdfx): added unique file hashing
evansun06 Nov 5, 2025
5e742af
feat(pdfx): added test pdfs (1 simple, 1 complex)
evansun06 Nov 5, 2025
1726621
feat(pdfx): Added write logic, and cli output path handling
evansun06 Nov 5, 2025
b7c072b
added page variability to cli options, additionally most basic text p…
evansun06 Nov 7, 2025
d4dd103
feat(pdfx): able to extract text, need to implement structure heurist…
evansun06 Nov 7, 2025
3e45845
feat(pdfx): added block metrics and added hyphenation fixing + list i…
evansun06 Nov 14, 2025
6225504
Merge branch 'main' of https://github.com/ubclaunchpad/Piazza-AI-Plug…
evansun06 Nov 25, 2025
ed63238
refactor(pdfx): refactored pdf extraction script to use PyMuPDF and OCR
evansun06 Nov 25, 2025
6b23a6c
feat(pdfx): included response codes + CLI integration to script
evansun06 Nov 25, 2025
2c8caa5
feat(pdfx): implemented correct ranged extraction
evansun06 Nov 26, 2025
e61a875
test(pdfx): added tests for json structure, return path checking, and…
evansun06 Nov 26, 2025
4069ada
chore(pdfx): fixed linting + styling
evansun06 Nov 26, 2025
4ec9719
chore(pdfx): applied ruff formatting
evansun06 Nov 26, 2025
cb44967
Merge branch 'main' of https://github.com/ubclaunchpad/Piazza-AI-Plug…
evansun06 Jan 27, 2026
ee7af49
feat(pdfx): integrate easyocr
evansun06 Jan 27, 2026
f2b6e58
feat(ingest): added tests for ocr
evansun06 Jan 27, 2026
e0f69c6
chore(ingest): update dependency lockfile
evansun06 Jan 27, 2026
062d38b
feat(pdfx): create optional ocr flag
evansun06 Jan 27, 2026
88d7f5c
chore(pdfx): apply styling and linting
evansun06 Jan 27, 2026
ef40cd9
Merge branch 'main' of https://github.com/ubclaunchpad/Piazza-AI-Plug…
evansun06 Jan 31, 2026
3 changes: 3 additions & 0 deletions backend/.gitignore
@@ -169,6 +169,9 @@ logs/
*.bak
*.backup

# PDF I/O
pdf_output/
pdf_input/
# Project specific
# Uncomment if using these in the future
# uploads/
246 changes: 246 additions & 0 deletions backend/app/pdfx/pdfx.py
@@ -0,0 +1,246 @@
"""
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pdfx module is missing an init.py file. All other modules in the app directory (api, core, models, textGeneration, threadIngestion) have init.py files, following the established codebase convention for Python packages.

Copilot uses AI. Check for mistakes.
ThreadSense PDF extraction script.
"""

import argparse
import hashlib
import json
import os
import re
import sys
import unicodedata
import uuid
from datetime import datetime, timezone

import fitz

TOOL_VERSION = "0.1.0"


def sha256_file(path: str) -> str:
"""File hashing and unique identifier"""

h = hashlib.sha256()
with open(path, "rb") as f:
for chunk in iter(lambda: f.read(8192), b""):
h.update(chunk)
return h.hexdigest()


def normalize_text(text: str) -> str:
    """Normalize Unicode, quotes, and whitespace in extracted text."""

    text = unicodedata.normalize("NFKC", text)
    text = text.replace("“", '"').replace("”", '"')
    text = text.replace("’", "'")
    text = re.sub(r"[ \t]+", " ", text)  # collapse runs of spaces/tabs
    text = re.sub(
        r"\n{3,}", "\n\n", text
    )  # collapse three or more newlines into "\n\n"
    return text.strip()


def fix_hyphens(a: str, b: str) -> str:
"""Hyphen patch"""

if a.endswith("-") and b[:1].islower():
return a[:-1] + b
return a + " " + b


def extract_pdf_pymupdf(path: str, ocr_threshold=40, page_range=None):
"""
Extract and process text using PyMuPDF.
    Only pages in `page_range` (1-indexed, inclusive) are processed.
    If page_range is None, all pages are processed.
"""

# Prepare metadata
doc_hash = sha256_file(path)
doc_uuid = str(uuid.uuid5(uuid.NAMESPACE_URL, doc_hash))
doc = fitz.open(path)
page_count = doc.page_count

# Determine which pages to process
if page_range:
start, end = page_range
# Ensure it stays within document bounds
start = max(1, start)
end = min(page_count, end)
pages_to_process = range(start, end + 1)
else:
pages_to_process = range(1, page_count + 1)

total_word_count = 0
skipped = False
skipped_pages = []
pages_output = []
full_text_parts = []
prev_last_line = None

for page_num in pages_to_process:
page = doc[page_num - 1] # PyMuPDF is 0-indexed
used_ocr = False

# 1. Normal extraction
text = page.get_text("text")
page_wc = len(page.get_text("words"))
total_word_count += page_wc

        # 2. OCR fallback via PyMuPDF's Tesseract-backed text pages (requires
        #    a local Tesseract install; "ocr" is not a valid get_text option)
        if len(text.strip()) < ocr_threshold:
            print(f"[PyMuPDF OCR] Page {page_num}: low text, using OCR…")
            used_ocr = True
            try:
                ocr_tp = page.get_textpage_ocr(language="eng", full=True)
                text = page.get_text("text", textpage=ocr_tp)
            except Exception:
                print(f"[PyMuPDF OCR] Page {page_num}: OCR failed, page skipped")
                text = ""

# 2.5 Skip page if final text is empty
if len(text.strip()) == 0:
skipped = True
skipped_pages.append(page_num)
continue

# 3. Normalize + split
text = normalize_text(text)
lines = text.split("\n")

# 4. Cross-page stitching
if prev_last_line is not None and lines:
first_line = lines[0]

if prev_last_line.endswith("-"):
stitched = fix_hyphens(prev_last_line, first_line)
full_text_parts[-1] = stitched
lines = lines[1:]
            else:
                if (
                    prev_last_line.strip()
                    and first_line.strip()
                    and prev_last_line[-1].isalnum()
                ):
                    # Sentence continues across the page break: merge the first
                    # line into the previous part (parts are later joined by "\n")
                    full_text_parts[-1] = prev_last_line + " " + first_line
                    lines = lines[1:]
                else:
                    # Paragraph break: an empty part joins into a blank line
                    full_text_parts.append("")

# 5. Append lines
for line in lines:
full_text_parts.append(line)

prev_last_line = full_text_parts[-1]

pages_output.append(
{
"page_num": page_num,
"word_count": page_wc,
"used_ocr": used_ocr,
"text": "\n".join(lines),
}
)

# Final stitched full text
final_text = normalize_text("\n".join(full_text_parts))

return {
"doc_uuid": doc_uuid,
"page_count": page_count,
"processed_page_range": page_range,
"processed_pages": list(pages_to_process),
"total_word_count": total_word_count,
"created_at": datetime.now(timezone.utc).isoformat(),
"tool_version": TOOL_VERSION,
"skipped": skipped,
"skipped_pages": skipped_pages,
"text": final_text,
"pages": pages_output,
}


def run_extraction(input_path, out_dir, page_range=None):
    """CLI wrapper: extract, then write <doc_uuid>.json into out_dir."""

    result = extract_pdf_pymupdf(path=input_path, page_range=page_range)

    # Output filename = doc_uuid.json
    out_path = os.path.join(out_dir, f"{result['doc_uuid']}.json")

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    print(f"✔ Saved: {out_path}")
return result


def parse_page_range(r):
    """Convert '5-10' into (5, 10)."""
    try:
        a, b = r.split("-")
        return int(a), int(b)
    except ValueError:
        raise argparse.ArgumentTypeError("Page range must be like 5-12") from None


def main():
"""Script Input"""
parser = argparse.ArgumentParser(
prog="pdfx", description="ThreadSense PDF Extractor"
)

sub = parser.add_subparsers(dest="command", required=True)

# pdfx extract ...
p_extract = sub.add_parser("extract", help="Extract text from a PDF")
p_extract.add_argument("input", help="Input PDF path")
p_extract.add_argument("--out", required=True, help="Output directory")
p_extract.add_argument("--page-range", type=parse_page_range, help="e.g. 3-10")

args = parser.parse_args()

if args.command == "extract":
        os.makedirs(args.out, exist_ok=True)

result = None
return_code = 0

try:
result = run_extraction(
input_path=args.input, out_dir=args.out, page_range=args.page_range
)

# Decide return code
if not result or "text" not in result:
return_code = 2
elif result.get("skipped", False):
return_code = 1
else:
return_code = 0

except Exception as e:
print(f"Fatal error: {e}", file=sys.stderr)
return_code = 2

# Print result only if it exists
if result is not None:
print(json.dumps(result, indent=2))
else:
print(
json.dumps(
{"error": "Fatal error before producing any output"}, indent=2
),
file=sys.stderr,
)

sys.exit(return_code)


if __name__ == "__main__":
main()
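
For reference, a minimal usage sketch of the extractor outside the CLI; the input filename and page range here are illustrative, not part of this diff:

    # CLI equivalent, run from backend/:
    #   python app/pdfx/pdfx.py extract notes.pdf --out pdf_output --page-range 3-5
    from app.pdfx.pdfx import extract_pdf_pymupdf

    # Hypothetical input file; page_range is 1-indexed and inclusive
    result = extract_pdf_pymupdf("notes.pdf", page_range=(3, 5))
    print(result["doc_uuid"], result["total_word_count"])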
7 changes: 7 additions & 0 deletions backend/requirements-dev.txt
@@ -0,0 +1,7 @@
# Testing
pytest

# Format
black
ruff
mypy
5 changes: 4 additions & 1 deletion backend/requirements.txt
@@ -19,4 +19,7 @@ psycopg2-binary==2.9.9
# Uncomment when adding database functionality:
# asyncpg==0.29.0
# sqlalchemy[asyncio]==2.0.23
# supabase==2.0.3

# PDF
PyMuPDF
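# Note (assumption): the OCR fallback in app/pdfx/pdfx.py uses PyMuPDF's
# Tesseract-backed text pages, which need a local Tesseract install;
# Tesseract itself is not a pip dependency.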
Binary file added backend/tests/pdfx_docs/bourdain.pdf
47 changes: 47 additions & 0 deletions backend/tests/test_pdfx.py
@@ -0,0 +1,47 @@
import json

from app.pdfx.pdfx import extract_pdf_pymupdf, run_extraction


def test_extract_basic_pdf():
result = extract_pdf_pymupdf("tests/pdfx_docs/bourdain.pdf")

assert not result["skipped"]
assert "text" in result
assert len(result["pages"]) == result["page_count"]
assert result["total_word_count"] >= 0


def test_output_integrity(tmp_path):
    out_dir = tmp_path
result = run_extraction("tests/pdfx_docs/bourdain.pdf", out_dir)

saved_file = out_dir / f"{result['doc_uuid']}.json"
assert saved_file.exists()

data = json.loads(saved_file.read_text())
assert "text" in data
assert "pages" in data
assert "page_count" in data
assert "doc_uuid" in data
assert "created_at" in data


def test_ranged_extraction(tmp_path):
    out_dir = tmp_path
result = run_extraction("tests/pdfx_docs/bourdain.pdf", out_dir, (3, 5))

saved_file = out_dir / f"{result['doc_uuid']}.json"
assert saved_file.exists()

data = json.loads(saved_file.read_text())
assert "text" in data
assert "pages" in data
assert "page_count" in data
assert len(data["pages"]) == 3
assert data["pages"][0]["page_num"] == 3
assert data["pages"][2]["page_num"] == 5
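
The pure text helpers could be exercised directly as well; a minimal sketch, with expected values derived from the implementations in pdfx.py:

    from app.pdfx.pdfx import fix_hyphens, normalize_text


    def test_fix_hyphens_joins_wrapped_word():
        # A lowercase continuation re-joins the hyphen-split word...
        assert fix_hyphens("exam-", "ple") == "example"
        # ...while a capitalized continuation keeps the hyphen and adds a space
        assert fix_hyphens("Smith-", "Jones") == "Smith- Jones"


    def test_normalize_text_collapses_whitespace():
        # Runs of spaces/tabs collapse; 3+ newlines become one blank line
        assert normalize_text("a \t b\n\n\n\nc") == "a b\n\nc"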