Added an optional pdf handler.

travis-bauer · travis-bauer · commit 325e0eefef81 · 2026-02-05T17:52:49.000-07:00
diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml
@@ -2,9 +2,9 @@ name: CI/CD Pipeline
 
 on:
   push:
-    branches: [ main, develop, 'bugfix/**' ]
+    branches: [ main, develop, 'bugfix/**', 'feature/**' ]
   pull_request:
-    branches: [ main, develop, 'bugfix/**' ]
+    branches: [ main, develop, 'bugfix/**', 'feature/**' ]
   release:
     types: [ published ]
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## Unreleased
+- Added optional PDF extraction via `pypdf`. Install with `pip install talkpipe[pypdf]` or 
+  `talkpipe[all]` to enable reading PDF files with readFile and the new readpdf segment. 
+  Without the optional dependency, if no other PDF handler is registered, PDF extraction
+  fails with a clear ImportError directing users to install the pypdf extra.
+
 ## 0.11.2
 - Fix for prompt segment error that would display an error message if history_file is none.
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -67,8 +67,11 @@ openai = [
 anthropic = [
     'anthropic'
 ]
+pypdf = [
+    'pypdf'
+]
 all = [
-    'talkpipe[openai,ollama,anthropic]'
+    'talkpipe[openai,ollama,anthropic,pypdf]'
 ]
 
 [project.scripts]
@@ -169,6 +172,7 @@ readFile = "talkpipe.data.extraction:ReadFile"
 fileToText = "talkpipe.data.extraction:ReadFile"
 readJsonl = "talkpipe.pipe.io:readJsonl"
 readdocx = "talkpipe.data.extraction:readdocx"
+readpdf = "talkpipe.data.extraction:readpdf"
 readcsv = "talkpipe.data.extraction:readcsv"
 readjsonl = "talkpipe.data.extraction:readjsonl"
 readtxt = "talkpipe.data.extraction:readtxt"
diff --git a/src/talkpipe/data/extraction.py b/src/talkpipe/data/extraction.py
@@ -339,6 +339,54 @@ def extract_jsonl(file_path: Union[str, Path]) -> Iterator[ExtractionResult]:
             yield ExtractionResult(**result_fields, **extra_fields)
 
 
+def extract_pdf(file_path: Union[str, Path]) -> Iterator[ExtractionResult]:
+    """
+    Extract text from a PDF file.
+
+    Requires the pypdf package. Install with: pip install talkpipe[pypdf]
+
+    Args:
+        file_path: Path to the PDF file.
+
+    Yields:
+        ExtractionResult with the text content of the document.
+
+    Raises:
+        FileNotFoundError: If the path does not exist or is not a regular file.
+        ImportError: If pypdf is not installed.
+    """
+    try:
+        from pypdf import PdfReader
+    except ImportError:
+        raise ImportError(
+            "PDF extraction requires pypdf. Install it with: pip install talkpipe[pypdf]"
+        ) from None
+
+    p = Path(file_path)
+    if not p.exists():
+        logger.error(f"Path does not exist: {file_path}")
+        raise FileNotFoundError(f"Path does not exist: {file_path}")
+    if not p.is_file():
+        logger.error(f"Unsupported path type: {file_path}")
+        raise FileNotFoundError(f"Unsupported path type: {file_path}")
+
+    logger.info(f"Reading PDF file: {p}")
+    source_str = str(p.resolve())
+    reader = PdfReader(p)
+    text_parts = []
+    for page in reader.pages:
+        page_text = page.extract_text()
+        if page_text:
+            text_parts.append(page_text)
+    content = "\n\n".join(text_parts) if text_parts else ""
+    yield ExtractionResult(
+        content=content,
+        source=source_str,
+        id=source_str,
+        title=p.name
+    )
+
+
 def skip_file(file_path: Union[str, Path]) -> Iterator[ExtractionResult]:
     """Default extractor that skips files by yielding nothing."""
     logger.debug(f"Skipping unsupported file: {file_path}")
@@ -358,6 +406,7 @@ def get_default_registry() -> ExtractorRegistry:
     registry.register("txt", extract_text)
     registry.register("md", extract_text)
     registry.register("docx", extract_docx)
+    registry.register("pdf", extract_pdf)
     registry.register("csv", extract_csv)
     registry.register("jsonl", extract_jsonl)
     registry.register_default(skip_file)
@@ -401,6 +450,23 @@ def readdocx(file_path: Annotated[str, "Path to the .docx file to read"]):
     yield from extract_docx(file_path)
 
 
+@register_segment("readpdf")
+@field_segment(multi_emit=True)
+def readpdf(file_path: Annotated[str, "Path to the PDF file to read"]):
+    """Read and extract text from PDF files.
+
+    Requires the pypdf package. Install with: pip install talkpipe[pypdf]
+
+    Yields:
+        ExtractionResult: Result containing content, source path, id, and title.
+
+    Raises:
+        FileNotFoundError: If a path does not exist.
+        ImportError: If pypdf is not installed.
+    """
+    yield from extract_pdf(file_path)
+
+
 @register_segment("readcsv")
 @field_segment(multi_emit=True)
 def readcsv(file_path: Annotated[str, "Path to the CSV file to read"]):
diff --git a/tests/talkpipe/data/test_extraction.py b/tests/talkpipe/data/test_extraction.py
@@ -1,7 +1,9 @@
 import pytest
+from pathlib import Path
+from unittest.mock import patch
 from talkpipe.data.extraction import (
-    ReadFile, readtxt, readdocx, readcsv, readjsonl, listFiles,
-    ExtractorRegistry, extract_text, extract_docx, extract_csv, extract_jsonl, skip_file, get_default_registry,
+    ReadFile, readtxt, readdocx, readpdf, readcsv, readjsonl, listFiles,
+    ExtractorRegistry, extract_text, extract_docx, extract_pdf, extract_csv, extract_jsonl, skip_file, get_default_registry,
     global_extractor_registry, ExtractionResult
 )
 
@@ -600,6 +602,113 @@ def test_readjsonl(tmp_path):
     assert "products.jsonl:2" in results[1].title
 
 
+def _create_pdf_with_text(path, text: str = "Hello PDF") -> None:
+    """Create a minimal PDF file with the given text content."""
+    content = f"""BT
+/F1 12 Tf
+100 700 Td
+({text}) Tj
+ET
+""".encode()
+    obj1 = b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"
+    obj2 = b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n"
+    obj3 = b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R /Resources 5 0 R >>\nendobj\n"
+    obj4 = (
+        b"4 0 obj\n<< /Length " + str(len(content)).encode("ascii") + b" >>\nstream\n"
+        + content + b"\nendstream\nendobj\n"
+    )
+    obj5 = b"5 0 obj\n<< /Font << /F1 6 0 R >> >>\nendobj\n"
+    obj6 = b"6 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n"
+    body = obj1 + obj2 + obj3 + obj4 + obj5 + obj6
+    startxref = 9 + len(body)
+    offsets = [9]
+    for obj in [obj1, obj2, obj3, obj4, obj5]:
+        offsets.append(offsets[-1] + len(obj))
+    xref = b"xref\n0 7\n0000000000 65535 f \n"
+    for i in range(1, 7):
+        xref += f"{offsets[i - 1]:010d} 00000 n \n".encode()
+    trailer = f"trailer\n<< /Size 7 /Root 1 0 R >>\nstartxref\n{startxref}\n%%EOF\n".encode()
+    Path(path).write_bytes(b"%PDF-1.4\n" + body + xref + trailer)
+
+
+def test_extract_pdf_requires_pypdf(tmp_path):
+    """Test that extract_pdf raises helpful ImportError when pypdf is not installed."""
+    pdf_path = tmp_path / "test.pdf"
+    pdf_path.write_bytes(b"%PDF-1.4 minimal\n")
+
+    import builtins
+    real_import = builtins.__import__
+
+    def mock_import(name, *args, **kwargs):
+        if name == "pypdf":
+            raise ImportError("No module named 'pypdf'")
+        return real_import(name, *args, **kwargs)
+
+    with patch.object(builtins, "__import__", side_effect=mock_import):
+        with pytest.raises(ImportError) as exc_info:
+            list(extract_pdf(pdf_path))
+        assert "pypdf" in str(exc_info.value)
+        assert "pip install talkpipe[pypdf]" in str(exc_info.value)
+
+
+def test_extract_pdf_file_not_found():
+    """Test extract_pdf raises FileNotFoundError for missing file."""
+    pytest.importorskip("pypdf")
+    with pytest.raises(FileNotFoundError, match="Path does not exist"):
+        list(extract_pdf("/nonexistent/path.pdf"))
+
+
+def test_extract_pdf_with_pypdf(tmp_path):
+    """Test PDF extraction when pypdf is installed."""
+    pytest.importorskip("pypdf")
+
+    pdf_path = tmp_path / "test.pdf"
+    _create_pdf_with_text(pdf_path, "Hello PDF")
+
+    results = list(extract_pdf(pdf_path))
+    assert len(results) == 1
+    assert isinstance(results[0], ExtractionResult)
+    assert "test.pdf" in results[0].source
+    assert results[0].id == results[0].source
+    assert results[0].title == "test.pdf"
+    assert "Hello PDF" in results[0].content
+
+
+def test_readpdf_segment(tmp_path):
+    """Test readpdf segment when pypdf is installed."""
+    pytest.importorskip("pypdf")
+
+    pdf_path = tmp_path / "segment_test.pdf"
+    _create_pdf_with_text(pdf_path, "Segment test content")
+
+    results = list(readpdf()([str(pdf_path)]))
+    assert len(results) == 1
+    assert isinstance(results[0], ExtractionResult)
+    assert "segment_test.pdf" in results[0].source
+    assert "Segment test content" in results[0].content
+
+
+def test_pdf_in_default_registry(tmp_path):
+    """Test that PDF extractor is registered in default registry."""
+    registry = get_default_registry()
+    assert "pdf" in registry.registered_extensions
+
+
+def test_ReadFile_with_pdf(tmp_path):
+    """Test ReadFile extracts PDF when pypdf is installed."""
+    pytest.importorskip("pypdf")
+
+    pdf_path = tmp_path / "readfile_test.pdf"
+    _create_pdf_with_text(pdf_path, "ReadFile PDF content")
+
+    fe = ReadFile()
+    results = list(fe([str(pdf_path)]))
+    assert len(results) == 1
+    assert isinstance(results[0], ExtractionResult)
+    assert "readfile_test.pdf" in results[0].source
+    assert "ReadFile PDF content" in results[0].content
+
+
 def test_jsonl_in_default_registry(tmp_path):
     """Test that JSONL extractor is registered in default registry."""
     registry = get_default_registry()