Skip to content

Commit 325e0ee

Browse files
committed
Added an optional pdf handler.
1 parent 2e20512 commit 325e0ee

File tree

5 files changed

+190
-5
lines changed

5 files changed

+190
-5
lines changed

.github/workflows/ci-cd.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ name: CI/CD Pipeline
22

33
on:
44
push:
5-
branches: [ main, develop, 'bugfix/**' ]
5+
branches: [ main, develop, 'bugfix/**', 'feature/**' ]
66
pull_request:
7-
branches: [ main, develop, 'bugfix/**' ]
7+
branches: [ main, develop, 'bugfix/**', 'feature/**' ]
88
release:
99
types: [ published ]
1010

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Changelog
22

3+
## Unreleased
4+
- Added optional PDF extraction via `pypdf`. Install with `pip install talkpipe[pypdf]` or
5+
`talkpipe[all]` to enable reading PDF files with readFile and the new readpdf segment.
6+
Without the optional dependency, if no other pdf hander is registered, PDF extraction
7+
fails with a clear ImportError directing users to install the pypdf extra.
8+
39
## 0.11.2
410
- Fix for prompt segment error that would display an error message if history_file is none.
511

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,11 @@ openai = [
6767
anthropic = [
6868
'anthropic'
6969
]
70+
pypdf = [
71+
'pypdf'
72+
]
7073
all = [
71-
'talkpipe[openai,ollama,anthropic]'
74+
'talkpipe[openai,ollama,anthropic,pypdf]'
7275
]
7376

7477
[project.scripts]
@@ -169,6 +172,7 @@ readFile = "talkpipe.data.extraction:ReadFile"
169172
fileToText = "talkpipe.data.extraction:ReadFile"
170173
readJsonl = "talkpipe.pipe.io:readJsonl"
171174
readdocx = "talkpipe.data.extraction:readdocx"
175+
readpdf = "talkpipe.data.extraction:readpdf"
172176
readcsv = "talkpipe.data.extraction:readcsv"
173177
readjsonl = "talkpipe.data.extraction:readjsonl"
174178
readtxt = "talkpipe.data.extraction:readtxt"

src/talkpipe/data/extraction.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,54 @@ def extract_jsonl(file_path: Union[str, Path]) -> Iterator[ExtractionResult]:
339339
yield ExtractionResult(**result_fields, **extra_fields)
340340

341341

342+
def extract_pdf(file_path: Union[str, Path]) -> Iterator[ExtractionResult]:
343+
"""
344+
Extract text from a PDF file.
345+
346+
Requires the pypdf package. Install with: pip install talkpipe[pypdf]
347+
348+
Args:
349+
file_path: Path to the PDF file.
350+
351+
Yields:
352+
ExtractionResult with the text content of the document.
353+
354+
Raises:
355+
FileNotFoundError: If the file does not exist.
356+
ImportError: If pypdf is not installed.
357+
"""
358+
try:
359+
from pypdf import PdfReader
360+
except ImportError:
361+
raise ImportError(
362+
"PDF extraction requires pypdf. Install it with: pip install talkpipe[pypdf]"
363+
) from None
364+
365+
p = Path(file_path)
366+
if not p.exists():
367+
logger.error(f"Path does not exist: {file_path}")
368+
raise FileNotFoundError(f"Path does not exist: {file_path}")
369+
if not p.is_file():
370+
logger.error(f"Unsupported path type: {file_path}")
371+
raise FileNotFoundError(f"Unsupported path type: {file_path}")
372+
373+
logger.info(f"Reading PDF file: {p}")
374+
source_str = str(p.resolve())
375+
reader = PdfReader(p)
376+
text_parts = []
377+
for page in reader.pages:
378+
page_text = page.extract_text()
379+
if page_text:
380+
text_parts.append(page_text)
381+
content = "\n\n".join(text_parts) if text_parts else ""
382+
yield ExtractionResult(
383+
content=content,
384+
source=source_str,
385+
id=source_str,
386+
title=p.name
387+
)
388+
389+
342390
def skip_file(file_path: Union[str, Path]) -> Iterator[ExtractionResult]:
343391
"""Default extractor that skips files by yielding nothing."""
344392
logger.debug(f"Skipping unsupported file: {file_path}")
@@ -358,6 +406,7 @@ def get_default_registry() -> ExtractorRegistry:
358406
registry.register("txt", extract_text)
359407
registry.register("md", extract_text)
360408
registry.register("docx", extract_docx)
409+
registry.register("pdf", extract_pdf)
361410
registry.register("csv", extract_csv)
362411
registry.register("jsonl", extract_jsonl)
363412
registry.register_default(skip_file)
@@ -401,6 +450,23 @@ def readdocx(file_path: Annotated[str, "Path to the .docx file to read"]):
401450
yield from extract_docx(file_path)
402451

403452

453+
@register_segment("readpdf")
454+
@field_segment(multi_emit=True)
455+
def readpdf(file_path: Annotated[str, "Path to the PDF file to read"]):
456+
"""Read and extract text from PDF files.
457+
458+
Requires the pypdf package. Install with: pip install talkpipe[pypdf]
459+
460+
Yields:
461+
ExtractionResult: Result containing content, source path, id, and title.
462+
463+
Raises:
464+
FileNotFoundError: If a path does not exist.
465+
ImportError: If pypdf is not installed.
466+
"""
467+
yield from extract_pdf(file_path)
468+
469+
404470
@register_segment("readcsv")
405471
@field_segment(multi_emit=True)
406472
def readcsv(file_path: Annotated[str, "Path to the CSV file to read"]):

tests/talkpipe/data/test_extraction.py

Lines changed: 111 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import pytest
2+
from pathlib import Path
3+
from unittest.mock import patch
24
from talkpipe.data.extraction import (
3-
ReadFile, readtxt, readdocx, readcsv, readjsonl, listFiles,
4-
ExtractorRegistry, extract_text, extract_docx, extract_csv, extract_jsonl, skip_file, get_default_registry,
5+
ReadFile, readtxt, readdocx, readpdf, readcsv, readjsonl, listFiles,
6+
ExtractorRegistry, extract_text, extract_docx, extract_pdf, extract_csv, extract_jsonl, skip_file, get_default_registry,
57
global_extractor_registry, ExtractionResult
68
)
79

@@ -600,6 +602,113 @@ def test_readjsonl(tmp_path):
600602
assert "products.jsonl:2" in results[1].title
601603

602604

605+
def _create_pdf_with_text(path, text: str = "Hello PDF") -> None:
606+
"""Create a minimal PDF file with the given text content."""
607+
content = f"""BT
608+
/F1 12 Tf
609+
100 700 Td
610+
({text}) Tj
611+
ET
612+
""".encode()
613+
obj1 = b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"
614+
obj2 = b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n"
615+
obj3 = b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R /Resources 5 0 R >>\nendobj\n"
616+
obj4 = (
617+
b"4 0 obj\n<< /Length " + str(len(content)).encode("ascii") + b" >>\nstream\n"
618+
+ content + b"\nendstream\nendobj\n"
619+
)
620+
obj5 = b"5 0 obj\n<< /Font << /F1 6 0 R >> >>\nendobj\n"
621+
obj6 = b"6 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n"
622+
body = obj1 + obj2 + obj3 + obj4 + obj5 + obj6
623+
startxref = 9 + len(body)
624+
offsets = [9]
625+
for obj in [obj1, obj2, obj3, obj4, obj5]:
626+
offsets.append(offsets[-1] + len(obj))
627+
xref = b"xref\n0 7\n0000000000 65535 f \n"
628+
for i in range(1, 7):
629+
xref += f"{offsets[i - 1]:010d} 00000 n \n".encode()
630+
trailer = f"trailer\n<< /Size 7 /Root 1 0 R >>\nstartxref\n{startxref}\n%%EOF\n".encode()
631+
Path(path).write_bytes(b"%PDF-1.4\n" + body + xref + trailer)
632+
633+
634+
def test_extract_pdf_requires_pypdf(tmp_path):
635+
"""Test that extract_pdf raises helpful ImportError when pypdf is not installed."""
636+
pdf_path = tmp_path / "test.pdf"
637+
pdf_path.write_bytes(b"%PDF-1.4 minimal\n")
638+
639+
import builtins
640+
real_import = builtins.__import__
641+
642+
def mock_import(name, *args, **kwargs):
643+
if name == "pypdf":
644+
raise ImportError("No module named 'pypdf'")
645+
return real_import(name, *args, **kwargs)
646+
647+
with patch.object(builtins, "__import__", side_effect=mock_import):
648+
with pytest.raises(ImportError) as exc_info:
649+
list(extract_pdf(pdf_path))
650+
assert "pypdf" in str(exc_info.value)
651+
assert "pip install talkpipe[pypdf]" in str(exc_info.value)
652+
653+
654+
def test_extract_pdf_file_not_found():
655+
"""Test extract_pdf raises FileNotFoundError for missing file."""
656+
pytest.importorskip("pypdf")
657+
with pytest.raises(FileNotFoundError, match="Path does not exist"):
658+
list(extract_pdf("/nonexistent/path.pdf"))
659+
660+
661+
def test_extract_pdf_with_pypdf(tmp_path):
662+
"""Test PDF extraction when pypdf is installed."""
663+
pytest.importorskip("pypdf")
664+
665+
pdf_path = tmp_path / "test.pdf"
666+
_create_pdf_with_text(pdf_path, "Hello PDF")
667+
668+
results = list(extract_pdf(pdf_path))
669+
assert len(results) == 1
670+
assert isinstance(results[0], ExtractionResult)
671+
assert "test.pdf" in results[0].source
672+
assert results[0].id == results[0].source
673+
assert results[0].title == "test.pdf"
674+
assert "Hello PDF" in results[0].content
675+
676+
677+
def test_readpdf_segment(tmp_path):
678+
"""Test readpdf segment when pypdf is installed."""
679+
pytest.importorskip("pypdf")
680+
681+
pdf_path = tmp_path / "segment_test.pdf"
682+
_create_pdf_with_text(pdf_path, "Segment test content")
683+
684+
results = list(readpdf()([str(pdf_path)]))
685+
assert len(results) == 1
686+
assert isinstance(results[0], ExtractionResult)
687+
assert "segment_test.pdf" in results[0].source
688+
assert "Segment test content" in results[0].content
689+
690+
691+
def test_pdf_in_default_registry(tmp_path):
692+
"""Test that PDF extractor is registered in default registry."""
693+
registry = get_default_registry()
694+
assert "pdf" in registry.registered_extensions
695+
696+
697+
def test_ReadFile_with_pdf(tmp_path):
698+
"""Test ReadFile extracts PDF when pypdf is installed."""
699+
pytest.importorskip("pypdf")
700+
701+
pdf_path = tmp_path / "readfile_test.pdf"
702+
_create_pdf_with_text(pdf_path, "ReadFile PDF content")
703+
704+
fe = ReadFile()
705+
results = list(fe([str(pdf_path)]))
706+
assert len(results) == 1
707+
assert isinstance(results[0], ExtractionResult)
708+
assert "readfile_test.pdf" in results[0].source
709+
assert "ReadFile PDF content" in results[0].content
710+
711+
603712
def test_jsonl_in_default_registry(tmp_path):
604713
"""Test that JSONL extractor is registered in default registry."""
605714
registry = get_default_registry()

0 commit comments

Comments
 (0)