Skip to content

Commit d9a8762

Browse files
committed
✨(back) add ODT parsing support and improve document routing
- Add odfdo dependency for ODT-to-markdown conversion
- Refactor BaseParser to route by content type (PDF, ODT, other)
- Extract OdtParserMixin and AdaptivePdfParserMixin for composability
- Add OdtParsingError for corrupt/empty ODT files
- Accept ODT in RAG upload formats, add pandoc to Docker image
- Add tests for PDF/ODT routing, adaptive method selection, and ODT errors

Signed-off-by: Laurent Paoletti <lp@providenz.fr>
1 parent 6dd41e8 commit d9a8762

File tree

11 files changed

+576
-44
lines changed

11 files changed

+576
-44
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ and this project adheres to
1212

1313
- ✨(back) add projects with custom LLM instructions
1414
- ✨(front) projects management UI
15+
- ✨(back) add ODT parsing support
1516

1617
## [0.0.14] - 2026-03-11
1718

src/backend/chat/agent_rag/document_converter/markitdown.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,5 +39,4 @@ def _convert(self, document: BytesIO, file_extension: str) -> str:
3939
conversion = self.converter.convert_stream(
4040
document, file_extension=file_extension or ".txt"
4141
)
42-
document_markdown = conversion.text_content
43-
return document_markdown
42+
return conversion.text_content
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import logging
2+
from io import BytesIO
3+
4+
from django.utils.translation import gettext_lazy as _
5+
6+
from odfdo import Document
7+
8+
logger = logging.getLogger(__name__)
9+
10+
11+
class OdtParsingError(Exception):
    """Signal that an ODT payload could not be converted to Markdown.

    Raised by the ODT converter in this module when the underlying parser
    rejects the input; the triggering exception is chained as the cause.
    """
13+
14+
15+
class OtdToMd:
    """Convert an ODT file to Markdown using odfdo.

    NOTE(review): the class name transposes the letters of "Odt". It is kept
    because other modules import it under this spelling; ``OdtToMd`` (defined
    right after the class) is the correctly spelled alias for new code.
    """

    def extract(self, content: bytes, **kwargs) -> str:
        """Return the Markdown rendering of an ODT document.

        Args:
            content: Raw bytes of the ``.odt`` file.
            **kwargs: Accepted for call-site compatibility and ignored.

        Returns:
            The Markdown text produced by ``odfdo.Document.to_markdown()``.

        Raises:
            OdtParsingError: If the bytes cannot be opened as an ODT package.
                The original exception is chained as ``__cause__``.
        """
        # Function-scope import keeps this block self-contained.
        from zipfile import BadZipFile  # pylint: disable=import-outside-toplevel

        try:
            doc = Document(BytesIO(content))
            return doc.to_markdown()
        # BadZipFile: ``zipfile.is_zipfile`` only checks for the
        # end-of-central-directory record, so a truncated or corrupt archive
        # can pass that probe and still fail when the archive is actually read.
        # NOTE(review): assumes odfdo reads the package through ``zipfile`` —
        # confirm against the pinned odfdo version.
        except (TypeError, FileNotFoundError, BadZipFile) as e:
            logger.error("Failed to parse ODT document: %s", e)
            raise OdtParsingError(
                _("Failed to parse ODT document: %(error)s") % {"error": e}
            ) from e


# Correctly spelled alias; the original (misspelled) name stays importable.
OdtToMd = OtdToMd

src/backend/chat/agent_rag/document_converter/parser.py

Lines changed: 40 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -13,31 +13,52 @@
1313

1414
from chat.agent_rag.document_converter.markitdown import DocumentConverter
1515

16+
from .odt import OtdToMd
17+
1618
logger = logging.getLogger(__name__)
1719

20+
CT_PDF = "application/pdf"
21+
CT_ODT = "application/vnd.oasis.opendocument.text"
22+
1823

1924
class BaseParser:
    """Common entry point for document parsers.

    ``parse_document`` dispatches on the MIME type:
    - PDF   -> ``self.parse_pdf_document()`` (supplied by a subclass or mixin)
    - ODT   -> ``self.parse_odt_document()`` (supplied by a subclass or mixin)
    - other -> ``DocumentConverter`` (markitdown)
    """

    def parse_document(self, name: str, content_type: str, content: bytes) -> str:
        """Convert a document to Markdown with the parser matching its type.

        Args:
            name: The name of the document.
            content_type: The MIME type of the document (e.g. "application/pdf").
            content: The raw bytes of the document.

        Returns:
            The document content in Markdown format.
        """
        if content_type == CT_PDF:
            markdown = self.parse_pdf_document(
                name=name, content_type=content_type, content=content
            )
        elif content_type == CT_ODT:
            markdown = self.parse_odt_document(name=name, content=content)
        else:
            # Anything that is neither PDF nor ODT goes through markitdown.
            fallback = DocumentConverter()
            markdown = fallback.convert_raw(
                name=name, content_type=content_type, content=content
            )
        return markdown

    def parse_pdf_document(self, name: str, content_type: str, content: bytes) -> str:
        """PDF hook; a subclass or mixin has to override it."""
        raise NotImplementedError("Must be implemented in subclass.")

    def parse_odt_document(self, name: str, content: bytes) -> str:
        """ODT hook; a subclass or mixin has to override it."""
        raise NotImplementedError("Must be implemented in subclass.")
3751

3852

39-
class AlbertParser(BaseParser):
40-
"""Document parser using Albert API for PDFs and DocumentConverter for other formats."""
53+
class OdtParserMixin:
    """Mixin providing ``parse_odt_document`` backed by odfdo."""

    def parse_odt_document(self, name: str, content: bytes) -> str:
        """Convert ODT bytes to Markdown.

        ``name`` is part of the routing signature but is not used here.
        """
        converter = OtdToMd()
        markdown = converter.extract(content)
        return markdown
58+
59+
60+
class AlbertParser(OdtParserMixin, BaseParser):
61+
"""Document parser using Albert API for PDFs."""
4162

4263
endpoint = urljoin(settings.ALBERT_API_URL, "/v1/parse-beta")
4364

@@ -60,23 +81,13 @@ def parse_pdf_document(self, name: str, content_type: str, content: bytes) -> st
6081
document_page["content"] for document_page in response.json().get("data", [])
6182
)
6283

63-
def parse_document(self, name: str, content_type: str, content: bytes) -> str:
64-
"""Parse document based on content type."""
65-
if content_type == "application/pdf":
66-
return self.parse_pdf_document(name=name, content_type=content_type, content=content)
67-
return DocumentConverter().convert_raw(
68-
name=name, content_type=content_type, content=content
69-
)
70-
7184

7285
METHOD_TEXT_EXTRACTION = "text_extraction"
7386
METHOD_OCR = "ocr"
7487

7588

7689
def analyze_pdf(pdf_data: bytes) -> dict:
77-
"""
78-
Analyze a PDF to determine if it needs OCR or can use direct text extraction.
79-
"""
90+
"""Analyze a PDF to determine if it needs OCR or can use direct text extraction."""
8091
reader = PdfReader(BytesIO(pdf_data))
8192
total_pages = len(reader.pages)
8293
if total_pages == 0:
@@ -95,20 +106,17 @@ def analyze_pdf(pdf_data: bytes) -> dict:
95106
text = (page.extract_text() or "").strip()
96107
char_count = len(text)
97108
total_chars += char_count
98-
99109
if char_count > 50:
100110
pages_with_text += 1
101111

102112
avg_chars = total_chars / total_pages
103113
text_coverage = pages_with_text / total_pages
104114

105-
# Decision logic
106115
if (
107116
avg_chars > settings.MIN_AVG_CHARS_FOR_TEXT_EXTRACTION
108117
and text_coverage > settings.MIN_TEXT_COVERAGE_FOR_TEXT_EXTRACTION
109118
):
110119
method = METHOD_TEXT_EXTRACTION
111-
112120
else:
113121
method = METHOD_OCR
114122

@@ -121,7 +129,7 @@ def analyze_pdf(pdf_data: bytes) -> dict:
121129
}
122130

123131

124-
class AdaptiveParserMixin:
132+
class AdaptivePdfParserMixin:
125133
"""
126134
Mixin that adds adaptive PDF parsing behavior.
127135
@@ -159,7 +167,7 @@ def parse_pdf_document_with_ocr(self, name: str, content: bytes) -> str:
159167
raise NotImplementedError("Subclass must implement parse_pdf_document_with_ocr")
160168

161169

162-
class AdaptivePdfParser(AdaptiveParserMixin, BaseParser):
170+
class AdaptivePdfParser(AdaptivePdfParserMixin, OdtParserMixin, BaseParser):
163171
"""
164172
PDF parser with adaptive text extraction / OCR routing.
165173
@@ -265,16 +273,6 @@ def parse_pdf_document_with_ocr(self, name: str, content: bytes) -> str:
265273
)
266274
except Exception as e: # pylint: disable=broad-except #noqa: BLE001
267275
logger.error("Failed to OCR pages %d-%d: %s", start_index + 1, end_index, str(e))
268-
# Preserve page count with empty placeholders to maintain correct ordering
269276
results.extend([""] * (end_index - start_index))
270277

271278
return "\n\n".join(results)
272-
273-
def parse_document(self, name: str, content_type: str, content: bytes) -> str:
274-
"""Route to PDF parser or DocumentConverter based on content type."""
275-
if content_type == "application/pdf":
276-
return self.parse_pdf_document(name=name, content_type=content_type, content=content)
277-
278-
return DocumentConverter().convert_raw(
279-
name=name, content_type=content_type, content=content
280-
)
Binary file not shown.

src/backend/chat/tests/agent_rag/document_converter/test_adaptive_pdf_parser.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import requests
99
from pypdf import PdfReader
1010

11+
from chat.agent_rag.document_converter.odt import OdtParsingError
1112
from chat.agent_rag.document_converter.parser import (
1213
METHOD_OCR,
1314
METHOD_TEXT_EXTRACTION,
@@ -36,6 +37,12 @@ def provide_mixed_pdf_10_pages():
3637
return (FIXTURES_DIR / "mixed_10_pages.pdf").read_bytes()
3738

3839

40+
@pytest.fixture(name="sample_odt")
def provide_sample_odt():
    """Return the raw bytes of the ``sample.odt`` fixture file."""
    odt_path = FIXTURES_DIR / "sample.odt"
    return odt_path.read_bytes()
44+
45+
3946
MIN_AVG_CHARS_FOR_TEXT_EXTRACTION = 200
4047
OCR_RETRY_DELAY = 1
4148
OCR_MAX_RETRIES = 3
@@ -297,6 +304,63 @@ def test_parse_document_pdf_routed_correctly(text_pdf_1_page):
297304
)
298305

299306

307+
def test_text_pdf_routed_to_text_extraction(text_pdf_10_pages):
    """A text-rich PDF goes through extract_text_from_pdf and never reaches OCR."""
    parser = AdaptivePdfParser()
    # The same keyword arguments are passed in and expected on the mock.
    call_kwargs = {
        "name": "test.pdf",
        "content_type": "application/pdf",
        "content": text_pdf_10_pages,
    }

    with (
        patch.object(parser, "extract_text_from_pdf", return_value="extracted") as extract_mock,
        patch.object(parser, "parse_pdf_document_with_ocr") as ocr_mock,
    ):
        outcome = parser.parse_pdf_document(**call_kwargs)

    assert outcome == "extracted"
    extract_mock.assert_called_once_with(**call_kwargs)
    ocr_mock.assert_not_called()
324+
325+
326+
def test_mixed_pdf_routed_to_ocr(mixed_pdf_10_pages):
    """A PDF with low text coverage goes to OCR and skips text extraction."""
    parser = AdaptivePdfParser()

    with (
        patch.object(parser, "extract_text_from_pdf") as extract_mock,
        patch.object(parser, "parse_pdf_document_with_ocr", return_value="ocr result") as ocr_mock,
    ):
        outcome = parser.parse_pdf_document(
            name="test.pdf",
            content_type="application/pdf",
            content=mixed_pdf_10_pages,
        )

    assert outcome == "ocr result"
    # The OCR path is called without content_type.
    ocr_mock.assert_called_once_with(name="test.pdf", content=mixed_pdf_10_pages)
    extract_mock.assert_not_called()
341+
342+
343+
def test_parse_document_pdf(text_pdf_1_page):
    """parse_document sends the PDF content type through the PDF parser."""
    expected = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor "
        "incididunt ut\nlabore et dolore magna aliqua. Ut enim ad minim veniam, "
        "quis nostrud exercitation ullamco\nlaboris nisi ut aliquip ex ea commodo consequat. "
        "Duis aute irure dolor in reprehenderit in\nvoluptate velit esse cillum dolore eu fugiat "
        "nulla pariatur. Excepteur sint occaecat cupidatat non\nproident, sunt in culpa qui "
        "officia deserunt mollit anim id est laborum.\n\nLorem ipsum dolor sit amet, consectetur "
        "adipiscing elit, sed do eiusmod tempor incididunt ut\nlabore et dolore magna aliqua. "
        "Ut enim ad minim veniam, quis nostrud exercitation ullamco\nlaboris nisi ut aliquip "
        "ex ea commodo consequat. Duis aute irure dolor in reprehenderit in\nvoluptate velit "
        "esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non"
        "\nproident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n\n"
    )
    parser = AdaptivePdfParser()

    markdown = parser.parse_document("test.pdf", "application/pdf", text_pdf_1_page)

    assert markdown == expected
362+
363+
300364
def test_parse_document_non_pdf_uses_document_converter():
301365
"""Should route non-PDF content to DocumentConverter."""
302366
parser = AdaptivePdfParser()
@@ -308,3 +372,63 @@ def test_parse_document_non_pdf_uses_document_converter():
308372

309373
assert result == "docx content"
310374
mock_converter.return_value.convert_raw.assert_called_once()
375+
376+
377+
expected_md_from_odt = (
378+
"# Document Title\n\n## Introduction\n\nThis is a normal paragraph with "
379+
"**bold text**, \\\n_italic text_, and \\\n***bold italic text***."
380+
"\\\n\n\nThis has ~~strikethrough~~ and \\\n`inline code`.\\\n\n\n"
381+
"Visit [Example Site](https://example.com) for more info.\\\n\n\n"
382+
"## Features\n\n - Fast parsing\n - Clean output\n - "
383+
"Django integration\n - LLM\\-ready markdown\n\n"
384+
"### Nested List\n\n - Parent item\n - Child A"
385+
"\n - Child B\n - Another parent\n\n## Data Table\n\n"
386+
"| Name | Age | City |\n|-------|-----|--------|\n"
387+
"| Alice | 30 | Paris |\n| Bob | 25 | London |"
388+
"\n\n\n## Conclusion\n\nThis document tests "
389+
"the ODT to Markdown conversion pipeline.\n"
390+
)
391+
392+
393+
def test_parse_odt(sample_odt):
    """Should convert the sample ODT document to the expected Markdown."""
    parser = AdaptivePdfParser()

    result = parser.parse_document(
        "sample.odt", "application/vnd.oasis.opendocument.text", sample_odt
    )

    assert result == expected_md_from_odt
402+
403+
404+
def test_parse_document_odt_routed_correctly(sample_odt):
    """parse_document hands the ODT content type to parse_odt_document."""
    parser = AdaptivePdfParser()
    odt_mime = "application/vnd.oasis.opendocument.text"

    with patch.object(parser, "parse_odt_document", return_value="odt content") as odt_mock:
        outcome = parser.parse_document("sample.odt", odt_mime, sample_odt)

    assert outcome == "odt content"
    odt_mock.assert_called_once_with(name="sample.odt", content=sample_odt)
415+
416+
417+
def test_parse_odt_corrupt_input():
    """Bytes that are not an ODT package make parse_document raise OdtParsingError."""
    parser = AdaptivePdfParser()
    odt_mime = "application/vnd.oasis.opendocument.text"
    payload = b"garbage"

    with pytest.raises(OdtParsingError, match="Failed to parse ODT document"):
        parser.parse_document("corrupt.odt", odt_mime, payload)
425+
426+
427+
def test_parse_odt_empty_input():
    """An empty payload makes parse_document raise OdtParsingError."""
    parser = AdaptivePdfParser()
    odt_mime = "application/vnd.oasis.opendocument.text"
    payload = b""

    with pytest.raises(OdtParsingError, match="Failed to parse ODT document"):
        parser.parse_document("empty.odt", odt_mime, payload)

0 commit comments

Comments
 (0)