Skip to content

Commit a9922b9

Browse files
committed
Merge branch 'main' into develop
2 parents 46ffc75 + 377b2d9 commit a9922b9

File tree

7 files changed

+232
-6
lines changed

7 files changed

+232
-6
lines changed

.github/workflows/ci-cd.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ name: CI/CD Pipeline
22

33
on:
44
push:
5-
branches: [ main, develop, 'bugfix/**' ]
5+
branches: [ main, develop, 'bugfix/**', 'feature/**' ]
66
pull_request:
7-
branches: [ main, develop, 'bugfix/**' ]
7+
branches: [ main, develop, 'bugfix/**', 'feature/**' ]
88
release:
99
types: [ published ]
1010

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,17 @@
1717
the compiler now resolves such a string from `const_store` so the adapter receives the MCP
1818
server instance instead of the string, eliminating "Could not extract tools from FastMCP instance".
1919

20+
## 0.11.3
21+
- Fixed chatterlang_serve stream interface so search results display reliably. The UI now
22+
shows results from the `/process` response directly instead of relying solely on
23+
Server-Sent Events, avoiding a race where SSE sometimes did not deliver all items before
24+
the next interaction. SSE events received during a request are buffered and discarded
25+
when the response arrives, preventing duplicate display of results.
26+
- Added optional PDF extraction via `pypdf`. Install with `pip install talkpipe[pypdf]` or
27+
`talkpipe[all]` to enable reading PDF files with readFile and the new readpdf segment.
28+
Without the optional dependency, if no other pdf hander is registered, PDF extraction
29+
fails with a clear ImportError directing users to install the pypdf extra.
30+
2031
## 0.11.2
2132
- Fix for prompt segment error that would display an error message if history_file is none.
2233

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,11 @@ anthropic = [
7070
mcp = [
7171
'fastmcp'
7272
]
73+
pypdf = [
74+
'pypdf'
75+
]
7376
all = [
74-
'talkpipe[openai,ollama,anthropic,mcp]'
77+
'talkpipe[openai,ollama,anthropic,mcp,pypdf]'
7578
]
7679

7780
[project.scripts]
@@ -172,6 +175,7 @@ readFile = "talkpipe.data.extraction:ReadFile"
172175
fileToText = "talkpipe.data.extraction:ReadFile"
173176
readJsonl = "talkpipe.pipe.io:readJsonl"
174177
readdocx = "talkpipe.data.extraction:readdocx"
178+
readpdf = "talkpipe.data.extraction:readpdf"
175179
readcsv = "talkpipe.data.extraction:readcsv"
176180
readjsonl = "talkpipe.data.extraction:readjsonl"
177181
readtxt = "talkpipe.data.extraction:readtxt"

src/talkpipe/app/chatterlang_serve.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1025,6 +1025,11 @@ def _get_stream_interface(self) -> str:
10251025
if (data.type === 'user' && data.output === lastUserMessage) {{
10261026
return;
10271027
}}
1028+
// Buffer SSE during /process request - we'll display from response to avoid duplicates
1029+
if (pendingRequest && data.type === 'response') {{
1030+
sseBuffer.push(data);
1031+
return;
1032+
}}
10281033
addMessage(data.output, data.type || 'response', data.timestamp);
10291034
}} catch (e) {{
10301035
console.error('Error parsing SSE data:', e);
@@ -1140,6 +1145,8 @@ def _get_stream_interface(self) -> str:
11401145
}}
11411146
11421147
let lastUserMessage = null; // Track last user message to avoid duplicates
1148+
let pendingRequest = false; // True while /process request is in flight
1149+
let sseBuffer = []; // Buffer SSE events during request to avoid duplicate display
11431150
11441151
async function submitForm(event) {{
11451152
event.preventDefault();
@@ -1174,6 +1181,8 @@ def _get_stream_interface(self) -> str:
11741181
11751182
submitBtn.disabled = true;
11761183
submitBtn.textContent = 'Sending...';
1184+
pendingRequest = true;
1185+
sseBuffer = [];
11771186
11781187
try {{
11791188
const headers = {{'Content-Type': 'application/json'}};
@@ -1188,14 +1197,28 @@ def _get_stream_interface(self) -> str:
11881197
body: JSON.stringify(data)
11891198
}});
11901199
1200+
const result = await response.json();
11911201
if (!response.ok) {{
1192-
throw new Error(`HTTP ${{response.status}}: ${{response.statusText}}`);
1202+
const detail = result.detail;
1203+
const msg = typeof detail === 'string' ? detail : (detail ? JSON.stringify(detail) : `HTTP ${{response.status}}: ${{response.statusText}}`);
1204+
throw new Error(msg);
11931205
}}
11941206
11951207
status.textContent = 'Message sent successfully!';
11961208
status.className = 'status success';
11971209
status.style.display = 'block';
11981210
1211+
// Display results from response - more reliable than SSE for batch results
1212+
// (avoids race where SSE may not deliver all items before next interaction)
1213+
if (result.data && result.data.output && Array.isArray(result.data.output)) {{
1214+
const timestamp = result.timestamp || new Date().toISOString();
1215+
for (const item of result.data.output) {{
1216+
const content = typeof item === 'object' ? JSON.stringify(item, null, 2) : String(item);
1217+
addMessage(content, 'response', timestamp);
1218+
}}
1219+
}}
1220+
sseBuffer = []; // Discard buffered SSE - we displayed from response
1221+
11991222
setTimeout(() => {{
12001223
status.style.display = 'none';
12011224
lastUserMessage = null; // Clear after a delay
@@ -1208,7 +1231,9 @@ def _get_stream_interface(self) -> str:
12081231
12091232
addMessage(`Error: ${{error.message}}`, 'error', new Date().toISOString());
12101233
lastUserMessage = null; // Clear on error
1234+
sseBuffer = [];
12111235
}} finally {{
1236+
pendingRequest = false;
12121237
submitBtn.disabled = false;
12131238
submitBtn.textContent = 'Send Message';
12141239
}}

src/talkpipe/data/extraction.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,54 @@ def extract_jsonl(file_path: Union[str, Path]) -> Iterator[ExtractionResult]:
339339
yield ExtractionResult(**result_fields, **extra_fields)
340340

341341

342+
def extract_pdf(file_path: Union[str, Path]) -> Iterator[ExtractionResult]:
343+
"""
344+
Extract text from a PDF file.
345+
346+
Requires the pypdf package. Install with: pip install talkpipe[pypdf]
347+
348+
Args:
349+
file_path: Path to the PDF file.
350+
351+
Yields:
352+
ExtractionResult with the text content of the document.
353+
354+
Raises:
355+
FileNotFoundError: If the file does not exist.
356+
ImportError: If pypdf is not installed.
357+
"""
358+
try:
359+
from pypdf import PdfReader
360+
except ImportError:
361+
raise ImportError(
362+
"PDF extraction requires pypdf. Install it with: pip install talkpipe[pypdf]"
363+
) from None
364+
365+
p = Path(file_path)
366+
if not p.exists():
367+
logger.error(f"Path does not exist: {file_path}")
368+
raise FileNotFoundError(f"Path does not exist: {file_path}")
369+
if not p.is_file():
370+
logger.error(f"Unsupported path type: {file_path}")
371+
raise FileNotFoundError(f"Unsupported path type: {file_path}")
372+
373+
logger.info(f"Reading PDF file: {p}")
374+
source_str = str(p.resolve())
375+
reader = PdfReader(p)
376+
text_parts = []
377+
for page in reader.pages:
378+
page_text = page.extract_text()
379+
if page_text:
380+
text_parts.append(page_text)
381+
content = "\n\n".join(text_parts) if text_parts else ""
382+
yield ExtractionResult(
383+
content=content,
384+
source=source_str,
385+
id=source_str,
386+
title=p.name
387+
)
388+
389+
342390
def skip_file(file_path: Union[str, Path]) -> Iterator[ExtractionResult]:
343391
"""Default extractor that skips files by yielding nothing."""
344392
logger.debug(f"Skipping unsupported file: {file_path}")
@@ -358,6 +406,7 @@ def get_default_registry() -> ExtractorRegistry:
358406
registry.register("txt", extract_text)
359407
registry.register("md", extract_text)
360408
registry.register("docx", extract_docx)
409+
registry.register("pdf", extract_pdf)
361410
registry.register("csv", extract_csv)
362411
registry.register("jsonl", extract_jsonl)
363412
registry.register_default(skip_file)
@@ -401,6 +450,23 @@ def readdocx(file_path: Annotated[str, "Path to the .docx file to read"]):
401450
yield from extract_docx(file_path)
402451

403452

453+
@register_segment("readpdf")
454+
@field_segment(multi_emit=True)
455+
def readpdf(file_path: Annotated[str, "Path to the PDF file to read"]):
456+
"""Read and extract text from PDF files.
457+
458+
Requires the pypdf package. Install with: pip install talkpipe[pypdf]
459+
460+
Yields:
461+
ExtractionResult: Result containing content, source path, id, and title.
462+
463+
Raises:
464+
FileNotFoundError: If a path does not exist.
465+
ImportError: If pypdf is not installed.
466+
"""
467+
yield from extract_pdf(file_path)
468+
469+
404470
@register_segment("readcsv")
405471
@field_segment(multi_emit=True)
406472
def readcsv(file_path: Annotated[str, "Path to the CSV file to read"]):

tests/talkpipe/app/test_chatterlang_serve.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,17 @@ def test_stream_endpoint(self, client):
403403
html_content = response.text
404404
assert "ChatterLang Server - Stream" in html_content
405405
assert "chat-messages" in html_content
406+
407+
def test_stream_displays_results_from_process_response(self, client):
408+
"""Stream UI should display results from /process response for reliable display."""
409+
response = client.get("/stream")
410+
html_content = response.text
411+
# Must use response data for display (fixes race where SSE may not deliver all items)
412+
assert "result.data.output" in html_content
413+
assert "Array.isArray(result.data.output)" in html_content
414+
# Buffer SSE during request to avoid duplicate display
415+
assert "pendingRequest" in html_content
416+
assert "sseBuffer" in html_content
406417

407418
def test_favicon_endpoint(self, client):
408419
"""Test favicon endpoint."""

tests/talkpipe/data/test_extraction.py

Lines changed: 111 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import pytest
2+
from pathlib import Path
3+
from unittest.mock import patch
24
from talkpipe.data.extraction import (
3-
ReadFile, readtxt, readdocx, readcsv, readjsonl, listFiles,
4-
ExtractorRegistry, extract_text, extract_docx, extract_csv, extract_jsonl, skip_file, get_default_registry,
5+
ReadFile, readtxt, readdocx, readpdf, readcsv, readjsonl, listFiles,
6+
ExtractorRegistry, extract_text, extract_docx, extract_pdf, extract_csv, extract_jsonl, skip_file, get_default_registry,
57
global_extractor_registry, ExtractionResult
68
)
79

@@ -600,6 +602,113 @@ def test_readjsonl(tmp_path):
600602
assert "products.jsonl:2" in results[1].title
601603

602604

605+
def _create_pdf_with_text(path, text: str = "Hello PDF") -> None:
606+
"""Create a minimal PDF file with the given text content."""
607+
content = f"""BT
608+
/F1 12 Tf
609+
100 700 Td
610+
({text}) Tj
611+
ET
612+
""".encode()
613+
obj1 = b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"
614+
obj2 = b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n"
615+
obj3 = b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R /Resources 5 0 R >>\nendobj\n"
616+
obj4 = (
617+
b"4 0 obj\n<< /Length " + str(len(content)).encode("ascii") + b" >>\nstream\n"
618+
+ content + b"\nendstream\nendobj\n"
619+
)
620+
obj5 = b"5 0 obj\n<< /Font << /F1 6 0 R >> >>\nendobj\n"
621+
obj6 = b"6 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n"
622+
body = obj1 + obj2 + obj3 + obj4 + obj5 + obj6
623+
startxref = 9 + len(body)
624+
offsets = [9]
625+
for obj in [obj1, obj2, obj3, obj4, obj5]:
626+
offsets.append(offsets[-1] + len(obj))
627+
xref = b"xref\n0 7\n0000000000 65535 f \n"
628+
for i in range(1, 7):
629+
xref += f"{offsets[i - 1]:010d} 00000 n \n".encode()
630+
trailer = f"trailer\n<< /Size 7 /Root 1 0 R >>\nstartxref\n{startxref}\n%%EOF\n".encode()
631+
Path(path).write_bytes(b"%PDF-1.4\n" + body + xref + trailer)
632+
633+
634+
def test_extract_pdf_requires_pypdf(tmp_path):
635+
"""Test that extract_pdf raises helpful ImportError when pypdf is not installed."""
636+
pdf_path = tmp_path / "test.pdf"
637+
pdf_path.write_bytes(b"%PDF-1.4 minimal\n")
638+
639+
import builtins
640+
real_import = builtins.__import__
641+
642+
def mock_import(name, *args, **kwargs):
643+
if name == "pypdf":
644+
raise ImportError("No module named 'pypdf'")
645+
return real_import(name, *args, **kwargs)
646+
647+
with patch.object(builtins, "__import__", side_effect=mock_import):
648+
with pytest.raises(ImportError) as exc_info:
649+
list(extract_pdf(pdf_path))
650+
assert "pypdf" in str(exc_info.value)
651+
assert "pip install talkpipe[pypdf]" in str(exc_info.value)
652+
653+
654+
def test_extract_pdf_file_not_found():
655+
"""Test extract_pdf raises FileNotFoundError for missing file."""
656+
pytest.importorskip("pypdf")
657+
with pytest.raises(FileNotFoundError, match="Path does not exist"):
658+
list(extract_pdf("/nonexistent/path.pdf"))
659+
660+
661+
def test_extract_pdf_with_pypdf(tmp_path):
662+
"""Test PDF extraction when pypdf is installed."""
663+
pytest.importorskip("pypdf")
664+
665+
pdf_path = tmp_path / "test.pdf"
666+
_create_pdf_with_text(pdf_path, "Hello PDF")
667+
668+
results = list(extract_pdf(pdf_path))
669+
assert len(results) == 1
670+
assert isinstance(results[0], ExtractionResult)
671+
assert "test.pdf" in results[0].source
672+
assert results[0].id == results[0].source
673+
assert results[0].title == "test.pdf"
674+
assert "Hello PDF" in results[0].content
675+
676+
677+
def test_readpdf_segment(tmp_path):
678+
"""Test readpdf segment when pypdf is installed."""
679+
pytest.importorskip("pypdf")
680+
681+
pdf_path = tmp_path / "segment_test.pdf"
682+
_create_pdf_with_text(pdf_path, "Segment test content")
683+
684+
results = list(readpdf()([str(pdf_path)]))
685+
assert len(results) == 1
686+
assert isinstance(results[0], ExtractionResult)
687+
assert "segment_test.pdf" in results[0].source
688+
assert "Segment test content" in results[0].content
689+
690+
691+
def test_pdf_in_default_registry(tmp_path):
692+
"""Test that PDF extractor is registered in default registry."""
693+
registry = get_default_registry()
694+
assert "pdf" in registry.registered_extensions
695+
696+
697+
def test_ReadFile_with_pdf(tmp_path):
698+
"""Test ReadFile extracts PDF when pypdf is installed."""
699+
pytest.importorskip("pypdf")
700+
701+
pdf_path = tmp_path / "readfile_test.pdf"
702+
_create_pdf_with_text(pdf_path, "ReadFile PDF content")
703+
704+
fe = ReadFile()
705+
results = list(fe([str(pdf_path)]))
706+
assert len(results) == 1
707+
assert isinstance(results[0], ExtractionResult)
708+
assert "readfile_test.pdf" in results[0].source
709+
assert "ReadFile PDF content" in results[0].content
710+
711+
603712
def test_jsonl_in_default_registry(tmp_path):
604713
"""Test that JSONL extractor is registered in default registry."""
605714
registry = get_default_registry()

0 commit comments

Comments
 (0)