suitenumerique
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 1 addition & 0 deletions
diff --git a/src/backend/chat/agent_rag/document_converter/markitdown.py b/src/backend/chat/agent_rag/document_converter/markitdown.py
Lines changed: 1 addition & 2 deletions
diff --git a/src/backend/chat/agent_rag/document_converter/odt.py b/src/backend/chat/agent_rag/document_converter/odt.py
Lines changed: 26 additions & 0 deletions
diff --git a/src/backend/chat/agent_rag/document_converter/parser.py b/src/backend/chat/agent_rag/document_converter/parser.py
Lines changed: 40 additions & 42 deletions
diff --git a/src/backend/chat/tests/agent_rag/document_converter/fixtures/sample.odt b/src/backend/chat/tests/agent_rag/document_converter/fixtures/sample.odt
9.68 KB
diff --git a/src/backend/chat/tests/agent_rag/document_converter/test_adaptive_pdf_parser.py b/src/backend/chat/tests/agent_rag/document_converter/test_adaptive_pdf_parser.py
Lines changed: 124 additions & 0 deletions
@@ -12,6 +12,7 @@ and this project adheres to
 
 - ✨(back) add projects with custom LLM instructions
 - ✨(front) projects management UI
+- ✨(back) add ODT parsing support
 
 ## [0.0.14] - 2026-03-11
 
 
@@ -39,5 +39,4 @@ def _convert(self, document: BytesIO, file_extension: str) -> str:
         conversion = self.converter.convert_stream(
             document, file_extension=file_extension or ".txt"
         )
-        document_markdown = conversion.text_content
-        return document_markdown
+        return conversion.text_content
@@ -0,0 +1,26 @@
+import logging
+from io import BytesIO
+
+from django.utils.translation import gettext_lazy as _
+
+from odfdo import Document
+
+logger = logging.getLogger(__name__)
+
+
+class OdtParsingError(Exception):
+    """Raised when an ODT file cannot be parsed."""
+
+
+class OtdToMd:
+    """Convert an ODT file to Markdown using odfdo."""
+
+    def extract(self, content: bytes, **kwargs) -> str:
+        try:
+            doc = Document(BytesIO(content))
+            return doc.to_markdown()
+        except (TypeError, FileNotFoundError) as e:
+            logger.error("Failed to parse ODT document: %s", e)
+            raise OdtParsingError(
+                _("Failed to parse ODT document: %(error)s") % {"error": e}
+            ) from e
@@ -13,31 +13,52 @@
 
 from chat.agent_rag.document_converter.markitdown import DocumentConverter
 
+from .odt import OtdToMd
+
 logger = logging.getLogger(__name__)
 
+CT_PDF = "application/pdf"
+CT_ODT = "application/vnd.oasis.opendocument.text"
+
 
 class BaseParser:
-    """Base class for document parsers."""
+    """Base class for document parsers.
+
+    Routes documents by content type:
+    - PDF -> self.parse_pdf_document() (must be provided by subclass or mixin)
+    - ODT -> self.parse_odt_document() (must be provided by subclass or mixin)
+    - Other -> DocumentConverter (markitdown)
+    """
 
     def parse_document(self, name: str, content_type: str, content: bytes) -> str:
-        """
-        Parse the document and prepare it for the search operation.
-        This method should handle the logic to convert the document
-        into a format suitable for storage.
-
-        Args:
-            name (str): The name of the document.
-            content_type (str): The MIME type of the document (e.g., "application/pdf").
-            content (bytes): The content of the document as a bytes stream.
-
-        Returns:
-            str: The document content in Markdown format.
-        """
+        """Route to the appropriate parser based on content type."""
+
+        if content_type == CT_PDF:
+            return self.parse_pdf_document(name=name, content_type=content_type, content=content)
+        if content_type == CT_ODT:
+            return self.parse_odt_document(name=name, content=content)
+        return DocumentConverter().convert_raw(
+            name=name, content_type=content_type, content=content
+        )
+
+    def parse_pdf_document(self, name: str, content_type: str, content: bytes) -> str:
+        """Parse PDF document. Must be implemented by subclass or mixin."""
+        raise NotImplementedError("Must be implemented in subclass.")
+
+    def parse_odt_document(self, name: str, content: bytes) -> str:
+        """Parse ODT document. Must be implemented by subclass or mixin."""
         raise NotImplementedError("Must be implemented in subclass.")
 
 
-class AlbertParser(BaseParser):
-    """Document parser using Albert API for PDFs and DocumentConverter for other formats."""
+class OdtParserMixin:
+    """Mixin that adds ODT parsing using odfdo."""
+
+    def parse_odt_document(self, name: str, content: bytes) -> str:
+        return OtdToMd().extract(content)
+
+
+class AlbertParser(OdtParserMixin, BaseParser):
+    """Document parser using Albert API for PDFs."""
 
     endpoint = urljoin(settings.ALBERT_API_URL, "/v1/parse-beta")
 
@@ -60,23 +81,13 @@ def parse_pdf_document(self, name: str, content_type: str, content: bytes) -> st
             document_page["content"] for document_page in response.json().get("data", [])
         )
 
-    def parse_document(self, name: str, content_type: str, content: bytes) -> str:
-        """Parse document based on content type."""
-        if content_type == "application/pdf":
-            return self.parse_pdf_document(name=name, content_type=content_type, content=content)
-        return DocumentConverter().convert_raw(
-            name=name, content_type=content_type, content=content
-        )
-
 
 METHOD_TEXT_EXTRACTION = "text_extraction"
 METHOD_OCR = "ocr"
 
 
 def analyze_pdf(pdf_data: bytes) -> dict:
-    """
-    Analyze a PDF to determine if it needs OCR or can use direct text extraction.
-    """
+    """Analyze a PDF to determine if it needs OCR or can use direct text extraction."""
     reader = PdfReader(BytesIO(pdf_data))
     total_pages = len(reader.pages)
     if total_pages == 0:
@@ -95,20 +106,17 @@ def analyze_pdf(pdf_data: bytes) -> dict:
         text = (page.extract_text() or "").strip()
         char_count = len(text)
         total_chars += char_count
-
         if char_count > 50:
             pages_with_text += 1
 
     avg_chars = total_chars / total_pages
     text_coverage = pages_with_text / total_pages
 
-    # Decision logic
     if (
         avg_chars > settings.MIN_AVG_CHARS_FOR_TEXT_EXTRACTION
         and text_coverage > settings.MIN_TEXT_COVERAGE_FOR_TEXT_EXTRACTION
     ):
         method = METHOD_TEXT_EXTRACTION
-
     else:
         method = METHOD_OCR
 
@@ -121,7 +129,7 @@ def analyze_pdf(pdf_data: bytes) -> dict:
     }
 
 
-class AdaptiveParserMixin:
+class AdaptivePdfParserMixin:
     """
     Mixin that adds adaptive PDF parsing behavior.
 
@@ -159,7 +167,7 @@ def parse_pdf_document_with_ocr(self, name: str, content: bytes) -> str:
         raise NotImplementedError("Subclass must implement parse_pdf_document_with_ocr")
 
 
-class AdaptivePdfParser(AdaptiveParserMixin, BaseParser):
+class AdaptivePdfParser(AdaptivePdfParserMixin, OdtParserMixin, BaseParser):
     """
     PDF parser with adaptive text extraction / OCR routing.
 
@@ -265,16 +273,6 @@ def parse_pdf_document_with_ocr(self, name: str, content: bytes) -> str:
                 )
             except Exception as e:  # pylint: disable=broad-except #noqa: BLE001
                 logger.error("Failed to OCR pages %d-%d: %s", start_index + 1, end_index, str(e))
-                # Preserve page count with empty placeholders to maintain correct ordering
                 results.extend([""] * (end_index - start_index))
 
         return "\n\n".join(results)
-
-    def parse_document(self, name: str, content_type: str, content: bytes) -> str:
-        """Route to PDF parser or DocumentConverter based on content type."""
-        if content_type == "application/pdf":
-            return self.parse_pdf_document(name=name, content_type=content_type, content=content)
-
-        return DocumentConverter().convert_raw(
-            name=name, content_type=content_type, content=content
-        )
@@ -8,6 +8,7 @@
 import requests
 from pypdf import PdfReader
 
+from chat.agent_rag.document_converter.odt import OdtParsingError
 from chat.agent_rag.document_converter.parser import (
     METHOD_OCR,
     METHOD_TEXT_EXTRACTION,
@@ -36,6 +37,12 @@ def provide_mixed_pdf_10_pages():
     return (FIXTURES_DIR / "mixed_10_pages.pdf").read_bytes()
 
 
+@pytest.fixture(name="sample_odt")
+def provide_sample_odt():
+    """Load an ODT document."""
+    return (FIXTURES_DIR / "sample.odt").read_bytes()
+
+
 MIN_AVG_CHARS_FOR_TEXT_EXTRACTION = 200
 OCR_RETRY_DELAY = 1
 OCR_MAX_RETRIES = 3
@@ -297,6 +304,63 @@ def test_parse_document_pdf_routed_correctly(text_pdf_1_page):
         )
 
 
+def test_text_pdf_routed_to_text_extraction(text_pdf_10_pages):
+    """Text-rich PDF should be routed to extract_text_from_pdf, not OCR."""
+    parser = AdaptivePdfParser()
+
+    with (
+        patch.object(parser, "extract_text_from_pdf", return_value="extracted") as mock_extract,
+        patch.object(parser, "parse_pdf_document_with_ocr") as mock_ocr,
+    ):
+        result = parser.parse_pdf_document(
+            name="test.pdf", content_type="application/pdf", content=text_pdf_10_pages
+        )
+
+        assert result == "extracted"
+        mock_extract.assert_called_once_with(
+            name="test.pdf", content_type="application/pdf", content=text_pdf_10_pages
+        )
+        mock_ocr.assert_not_called()
+
+
+def test_mixed_pdf_routed_to_ocr(mixed_pdf_10_pages):
+    """PDF with low text coverage should be routed to OCR, not text extraction."""
+    parser = AdaptivePdfParser()
+
+    with (
+        patch.object(parser, "extract_text_from_pdf") as mock_extract,
+        patch.object(parser, "parse_pdf_document_with_ocr", return_value="ocr result") as mock_ocr,
+    ):
+        result = parser.parse_pdf_document(
+            name="test.pdf", content_type="application/pdf", content=mixed_pdf_10_pages
+        )
+
+        assert result == "ocr result"
+        mock_ocr.assert_called_once_with(name="test.pdf", content=mixed_pdf_10_pages)
+        mock_extract.assert_not_called()
+
+
+def test_parse_document_pdf(text_pdf_1_page):
+    """Should route PDF content type to PDF parser."""
+    parser = AdaptivePdfParser()
+
+    result = parser.parse_document("test.pdf", "application/pdf", text_pdf_1_page)
+
+    assert result == (
+        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor "
+        "incididunt ut\nlabore et dolore magna aliqua. Ut enim ad minim veniam, "
+        "quis nostrud exercitation ullamco\nlaboris nisi ut aliquip ex ea commodo consequat. "
+        "Duis aute irure dolor in reprehenderit in\nvoluptate velit esse cillum dolore eu fugiat "
+        "nulla pariatur. Excepteur sint occaecat cupidatat non\nproident, sunt in culpa qui "
+        "oﬃcia deserunt mollit anim id est laborum.\n\nLorem ipsum dolor sit amet, consectetur "
+        "adipiscing elit, sed do eiusmod tempor incididunt ut\nlabore et dolore magna aliqua. "
+        "Ut enim ad minim veniam, quis nostrud exercitation ullamco\nlaboris nisi ut aliquip "
+        "ex ea commodo consequat. Duis aute irure dolor in reprehenderit in\nvoluptate velit "
+        "esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non"
+        "\nproident, sunt in culpa qui oﬃcia deserunt mollit anim id est laborum.\n\n"
+    )
+
+
 def test_parse_document_non_pdf_uses_document_converter():
     """Should route non-PDF content to DocumentConverter."""
     parser = AdaptivePdfParser()
@@ -308,3 +372,63 @@ def test_parse_document_non_pdf_uses_document_converter():
 
         assert result == "docx content"
         mock_converter.return_value.convert_raw.assert_called_once()
+
+
+expected_md_from_odt = (
+    "# Document Title\n\n## Introduction\n\nThis is a normal paragraph with "
+    "**bold text**, \\\n_italic text_, and \\\n***bold italic text***."
+    "\\\n\n\nThis has ~~strikethrough~~ and \\\n`inline code`.\\\n\n\n"
+    "Visit [Example Site](https://example.com) for more info.\\\n\n\n"
+    "## Features\n\n -  Fast parsing\n -  Clean output\n -  "
+    "Django integration\n -  LLM\\-ready markdown\n\n"
+    "### Nested List\n\n -  Parent item\n    -  Child A"
+    "\n    -  Child B\n -  Another parent\n\n## Data Table\n\n"
+    "| Name  | Age | City   |\n|-------|-----|--------|\n"
+    "| Alice | 30  | Paris  |\n| Bob   | 25  | London |"
+    "\n\n\n## Conclusion\n\nThis document tests "
+    "the ODT to Markdown conversion pipeline.\n"
+)
+
+
+def test_parse_odt(sample_odt):
+    """Should convert the sample ODT document to the expected Markdown."""
+    parser = AdaptivePdfParser()
+
+    result = parser.parse_document(
+        "sample.odt", "application/vnd.oasis.opendocument.text", sample_odt
+    )
+
+    assert result == expected_md_from_odt
+
+
+def test_parse_document_odt_routed_correctly(sample_odt):
+    """Should route ODT content type to ODT parser."""
+    parser = AdaptivePdfParser()
+
+    with patch.object(parser, "parse_odt_document", return_value="odt content") as mock_parse:
+        result = parser.parse_document(
+            "sample.odt", "application/vnd.oasis.opendocument.text", sample_odt
+        )
+
+        assert result == "odt content"
+        mock_parse.assert_called_once_with(name="sample.odt", content=sample_odt)
+
+
+def test_parse_odt_corrupt_input():
+    """Should raise OdtParsingError on corrupt input."""
+    parser = AdaptivePdfParser()
+
+    with pytest.raises(OdtParsingError, match="Failed to parse ODT document"):
+        parser.parse_document(
+            "corrupt.odt", "application/vnd.oasis.opendocument.text", b"garbage"
+        )
+
+
+def test_parse_odt_empty_input():
+    """Should raise OdtParsingError on empty input."""
+    parser = AdaptivePdfParser()
+
+    with pytest.raises(OdtParsingError, match="Failed to parse ODT document"):
+        parser.parse_document(
+            "empty.odt", "application/vnd.oasis.opendocument.text", b""
+        )
Original file line number	Diff line number	Diff line change
`@@ -39,5 +39,4 @@ def _convert(self, document: BytesIO, file_extension: str) -> str:`
`39`	`39`	`conversion = self.converter.convert_stream(`
`40`	`40`	`document, file_extension=file_extension or ".txt"`
`41`	`41`	`)`
`42`		`- document_markdown = conversion.text_content`
`43`		`- return document_markdown`
	`42`	`+ return conversion.text_content`