Merge branch 'main' into develop

travis-bauer · travis-bauer · commit a9922b949b58 · 2026-02-10T21:21:50.000-07:00
diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml
@@ -2,9 +2,9 @@ name: CI/CD Pipeline
 
 on:
   push:
-    branches: [ main, develop, 'bugfix/**' ]
+    branches: [ main, develop, 'bugfix/**', 'feature/**' ]
   pull_request:
-    branches: [ main, develop, 'bugfix/**' ]
+    branches: [ main, develop, 'bugfix/**', 'feature/**' ]
   release:
     types: [ published ]
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,17 @@
   the compiler now resolves such a string from `const_store` so the adapter receives the MCP
   server instance instead of the string, eliminating "Could not extract tools from FastMCP instance".
 
+## 0.11.3
+- Fixed chatterlang_serve stream interface so search results display reliably. The UI now
+  shows results from the `/process` response directly instead of relying solely on
+  Server-Sent Events, avoiding a race where SSE sometimes did not deliver all items before
+  the next interaction. SSE events received during a request are buffered and discarded
+  when the response arrives, preventing duplicate display of results.
+- Added optional PDF extraction via `pypdf`. Install with `pip install talkpipe[pypdf]` or
+  `talkpipe[all]` to enable reading PDF files with `readFile` and the new `readpdf` segment.
+  Without the optional dependency, if no other PDF handler is registered, PDF extraction
+  fails with a clear ImportError directing users to install the `pypdf` extra.
+
 ## 0.11.2
 - Fix for prompt segment error that would display an error message if history_file is none.
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -70,8 +70,11 @@ anthropic = [
 mcp = [
     'fastmcp'
 ]
+pypdf = [
+    'pypdf'
+]
 all = [
-    'talkpipe[openai,ollama,anthropic,mcp]'
+    'talkpipe[openai,ollama,anthropic,mcp,pypdf]'
 ]
 
 [project.scripts]
@@ -172,6 +175,7 @@ readFile = "talkpipe.data.extraction:ReadFile"
 fileToText = "talkpipe.data.extraction:ReadFile"
 readJsonl = "talkpipe.pipe.io:readJsonl"
 readdocx = "talkpipe.data.extraction:readdocx"
+readpdf = "talkpipe.data.extraction:readpdf"
 readcsv = "talkpipe.data.extraction:readcsv"
 readjsonl = "talkpipe.data.extraction:readjsonl"
 readtxt = "talkpipe.data.extraction:readtxt"
diff --git a/src/talkpipe/app/chatterlang_serve.py b/src/talkpipe/app/chatterlang_serve.py
@@ -1025,6 +1025,11 @@ def _get_stream_interface(self) -> str:
                             if (data.type === 'user' && data.output === lastUserMessage) {{
                                 return;
                             }}
+                            // Buffer SSE during /process request - we'll display from response to avoid duplicates
+                            if (pendingRequest && data.type === 'response') {{
+                                sseBuffer.push(data);
+                                return;
+                            }}
                             addMessage(data.output, data.type || 'response', data.timestamp);
                         }} catch (e) {{
                             console.error('Error parsing SSE data:', e);
@@ -1140,6 +1145,8 @@ def _get_stream_interface(self) -> str:
                 }}
                 
                 let lastUserMessage = null; // Track last user message to avoid duplicates
+                let pendingRequest = false;  // True while /process request is in flight
+                let sseBuffer = [];  // Buffer SSE events during request to avoid duplicate display
                 
                 async function submitForm(event) {{
                     event.preventDefault();
@@ -1174,6 +1181,8 @@ def _get_stream_interface(self) -> str:
                     
                     submitBtn.disabled = true;
                     submitBtn.textContent = 'Sending...';
+                    pendingRequest = true;
+                    sseBuffer = [];
                     
                     try {{
                         const headers = {{'Content-Type': 'application/json'}};
@@ -1188,14 +1197,28 @@ def _get_stream_interface(self) -> str:
                             body: JSON.stringify(data)
                         }});
                         
+                        const result = await response.json();
                         if (!response.ok) {{
-                            throw new Error(`HTTP ${{response.status}}: ${{response.statusText}}`);
+                            const detail = result.detail;
+                            const msg = typeof detail === 'string' ? detail : (detail ? JSON.stringify(detail) : `HTTP ${{response.status}}: ${{response.statusText}}`);
+                            throw new Error(msg);
                         }}
                         
                         status.textContent = 'Message sent successfully!';
                         status.className = 'status success';
                         status.style.display = 'block';
                         
+                        // Display results from response - more reliable than SSE for batch results
+                        // (avoids race where SSE may not deliver all items before next interaction)
+                        if (result.data && result.data.output && Array.isArray(result.data.output)) {{
+                            const timestamp = result.timestamp || new Date().toISOString();
+                            for (const item of result.data.output) {{
+                                const content = typeof item === 'object' ? JSON.stringify(item, null, 2) : String(item);
+                                addMessage(content, 'response', timestamp);
+                            }}
+                        }}
+                        sseBuffer = [];  // Discard buffered SSE - we displayed from response
+                        
                         setTimeout(() => {{
                             status.style.display = 'none';
                             lastUserMessage = null; // Clear after a delay
@@ -1208,7 +1231,9 @@ def _get_stream_interface(self) -> str:
                         
                         addMessage(`Error: ${{error.message}}`, 'error', new Date().toISOString());
                         lastUserMessage = null; // Clear on error
+                        sseBuffer = [];
                     }} finally {{
+                        pendingRequest = false;
                         submitBtn.disabled = false;
                         submitBtn.textContent = 'Send Message';
                     }}
diff --git a/src/talkpipe/data/extraction.py b/src/talkpipe/data/extraction.py
@@ -339,6 +339,54 @@ def extract_jsonl(file_path: Union[str, Path]) -> Iterator[ExtractionResult]:
             yield ExtractionResult(**result_fields, **extra_fields)
 
 
+def extract_pdf(file_path: Union[str, Path]) -> Iterator[ExtractionResult]:
+    """
+    Extract text from a PDF file.
+
+    Requires the pypdf package. Install with: pip install talkpipe[pypdf]
+
+    Args:
+        file_path: Path to the PDF file.
+
+    Yields:
+        ExtractionResult with the text content of the document.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+        ImportError: If pypdf is not installed.
+    """
+    try:
+        from pypdf import PdfReader
+    except ImportError:
+        raise ImportError(
+            "PDF extraction requires pypdf. Install it with: pip install talkpipe[pypdf]"
+        ) from None
+
+    p = Path(file_path)
+    if not p.exists():
+        logger.error(f"Path does not exist: {file_path}")
+        raise FileNotFoundError(f"Path does not exist: {file_path}")
+    if not p.is_file():
+        logger.error(f"Unsupported path type: {file_path}")
+        raise FileNotFoundError(f"Unsupported path type: {file_path}")
+
+    logger.info(f"Reading PDF file: {p}")
+    source_str = str(p.resolve())
+    reader = PdfReader(p)
+    text_parts = []
+    for page in reader.pages:
+        page_text = page.extract_text()
+        if page_text:
+            text_parts.append(page_text)
+    content = "\n\n".join(text_parts) if text_parts else ""
+    yield ExtractionResult(
+        content=content,
+        source=source_str,
+        id=source_str,
+        title=p.name
+    )
+
+
 def skip_file(file_path: Union[str, Path]) -> Iterator[ExtractionResult]:
     """Default extractor that skips files by yielding nothing."""
     logger.debug(f"Skipping unsupported file: {file_path}")
@@ -358,6 +406,7 @@ def get_default_registry() -> ExtractorRegistry:
     registry.register("txt", extract_text)
     registry.register("md", extract_text)
     registry.register("docx", extract_docx)
+    registry.register("pdf", extract_pdf)
     registry.register("csv", extract_csv)
     registry.register("jsonl", extract_jsonl)
     registry.register_default(skip_file)
@@ -401,6 +450,23 @@ def readdocx(file_path: Annotated[str, "Path to the .docx file to read"]):
     yield from extract_docx(file_path)
 
 
+@register_segment("readpdf")
+@field_segment(multi_emit=True)
+def readpdf(file_path: Annotated[str, "Path to the PDF file to read"]):
+    """Read and extract text from PDF files.
+
+    Requires the pypdf package. Install with: pip install talkpipe[pypdf]
+
+    Yields:
+        ExtractionResult: Result containing content, source path, id, and title.
+
+    Raises:
+        FileNotFoundError: If a path does not exist.
+        ImportError: If pypdf is not installed.
+    """
+    yield from extract_pdf(file_path)
+
+
 @register_segment("readcsv")
 @field_segment(multi_emit=True)
 def readcsv(file_path: Annotated[str, "Path to the CSV file to read"]):
diff --git a/tests/talkpipe/app/test_chatterlang_serve.py b/tests/talkpipe/app/test_chatterlang_serve.py
@@ -403,6 +403,17 @@ def test_stream_endpoint(self, client):
         html_content = response.text
         assert "ChatterLang Server - Stream" in html_content
         assert "chat-messages" in html_content
+
+    def test_stream_displays_results_from_process_response(self, client):
+        """Stream UI should display results from /process response for reliable display."""
+        response = client.get("/stream")
+        html_content = response.text
+        # Must use response data for display (fixes race where SSE may not deliver all items)
+        assert "result.data.output" in html_content
+        assert "Array.isArray(result.data.output)" in html_content
+        # Buffer SSE during request to avoid duplicate display
+        assert "pendingRequest" in html_content
+        assert "sseBuffer" in html_content
     
     def test_favicon_endpoint(self, client):
         """Test favicon endpoint."""
diff --git a/tests/talkpipe/data/test_extraction.py b/tests/talkpipe/data/test_extraction.py
@@ -1,7 +1,9 @@
 import pytest
+from pathlib import Path
+from unittest.mock import patch
 from talkpipe.data.extraction import (
-    ReadFile, readtxt, readdocx, readcsv, readjsonl, listFiles,
-    ExtractorRegistry, extract_text, extract_docx, extract_csv, extract_jsonl, skip_file, get_default_registry,
+    ReadFile, readtxt, readdocx, readpdf, readcsv, readjsonl, listFiles,
+    ExtractorRegistry, extract_text, extract_docx, extract_pdf, extract_csv, extract_jsonl, skip_file, get_default_registry,
     global_extractor_registry, ExtractionResult
 )
 
@@ -600,6 +602,113 @@ def test_readjsonl(tmp_path):
     assert "products.jsonl:2" in results[1].title
 
 
+def _create_pdf_with_text(path, text: str = "Hello PDF") -> None:
+    """Create a minimal PDF file with the given text content."""
+    content = f"""BT
+/F1 12 Tf
+100 700 Td
+({text}) Tj
+ET
+""".encode()
+    obj1 = b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"
+    obj2 = b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n"
+    obj3 = b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R /Resources 5 0 R >>\nendobj\n"
+    obj4 = (
+        b"4 0 obj\n<< /Length " + str(len(content)).encode("ascii") + b" >>\nstream\n"
+        + content + b"\nendstream\nendobj\n"
+    )
+    obj5 = b"5 0 obj\n<< /Font << /F1 6 0 R >> >>\nendobj\n"
+    obj6 = b"6 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n"
+    body = obj1 + obj2 + obj3 + obj4 + obj5 + obj6
+    startxref = 9 + len(body)
+    offsets = [9]
+    for obj in [obj1, obj2, obj3, obj4, obj5]:
+        offsets.append(offsets[-1] + len(obj))
+    xref = b"xref\n0 7\n0000000000 65535 f \n"
+    for i in range(1, 7):
+        xref += f"{offsets[i - 1]:010d} 00000 n \n".encode()
+    trailer = f"trailer\n<< /Size 7 /Root 1 0 R >>\nstartxref\n{startxref}\n%%EOF\n".encode()
+    Path(path).write_bytes(b"%PDF-1.4\n" + body + xref + trailer)
+
+
+def test_extract_pdf_requires_pypdf(tmp_path):
+    """Test that extract_pdf raises helpful ImportError when pypdf is not installed."""
+    pdf_path = tmp_path / "test.pdf"
+    pdf_path.write_bytes(b"%PDF-1.4 minimal\n")
+
+    import builtins
+    real_import = builtins.__import__
+
+    def mock_import(name, *args, **kwargs):
+        if name == "pypdf":
+            raise ImportError("No module named 'pypdf'")
+        return real_import(name, *args, **kwargs)
+
+    with patch.object(builtins, "__import__", side_effect=mock_import):
+        with pytest.raises(ImportError) as exc_info:
+            list(extract_pdf(pdf_path))
+        assert "pypdf" in str(exc_info.value)
+        assert "pip install talkpipe[pypdf]" in str(exc_info.value)
+
+
+def test_extract_pdf_file_not_found():
+    """Test extract_pdf raises FileNotFoundError for missing file."""
+    pytest.importorskip("pypdf")
+    with pytest.raises(FileNotFoundError, match="Path does not exist"):
+        list(extract_pdf("/nonexistent/path.pdf"))
+
+
+def test_extract_pdf_with_pypdf(tmp_path):
+    """Test PDF extraction when pypdf is installed."""
+    pytest.importorskip("pypdf")
+
+    pdf_path = tmp_path / "test.pdf"
+    _create_pdf_with_text(pdf_path, "Hello PDF")
+
+    results = list(extract_pdf(pdf_path))
+    assert len(results) == 1
+    assert isinstance(results[0], ExtractionResult)
+    assert "test.pdf" in results[0].source
+    assert results[0].id == results[0].source
+    assert results[0].title == "test.pdf"
+    assert "Hello PDF" in results[0].content
+
+
+def test_readpdf_segment(tmp_path):
+    """Test readpdf segment when pypdf is installed."""
+    pytest.importorskip("pypdf")
+
+    pdf_path = tmp_path / "segment_test.pdf"
+    _create_pdf_with_text(pdf_path, "Segment test content")
+
+    results = list(readpdf()([str(pdf_path)]))
+    assert len(results) == 1
+    assert isinstance(results[0], ExtractionResult)
+    assert "segment_test.pdf" in results[0].source
+    assert "Segment test content" in results[0].content
+
+
+def test_pdf_in_default_registry(tmp_path):
+    """Test that PDF extractor is registered in default registry."""
+    registry = get_default_registry()
+    assert "pdf" in registry.registered_extensions
+
+
+def test_ReadFile_with_pdf(tmp_path):
+    """Test ReadFile extracts PDF when pypdf is installed."""
+    pytest.importorskip("pypdf")
+
+    pdf_path = tmp_path / "readfile_test.pdf"
+    _create_pdf_with_text(pdf_path, "ReadFile PDF content")
+
+    fe = ReadFile()
+    results = list(fe([str(pdf_path)]))
+    assert len(results) == 1
+    assert isinstance(results[0], ExtractionResult)
+    assert "readfile_test.pdf" in results[0].source
+    assert "ReadFile PDF content" in results[0].content
+
+
 def test_jsonl_in_default_registry(tmp_path):
     """Test that JSONL extractor is registered in default registry."""
     registry = get_default_registry()