diff --git a/Dockerfile b/Dockerfile index 5674782..ef2e7a4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,12 +13,14 @@ ENV BGE_EMBEDDINGS_MODEL_PATH=/embeddings_model/bge-small-en RUN apt-get update && \ apt-get install --no-install-recommends -y \ ca-certificates \ - # Libreoffice is required for MS office documents - libreoffice=4:24.2.7-0ubuntu0.24.04.4 \ libmagic1 \ # Dependency for opencv library libgl1 \ && \ + apt-get install --no-install-recommends -y -t noble-backports \ + # LibreOffice is required for MS Office documents + libreoffice=4:25.2.7-0ubuntu0.25.04.1~bpo24.04.1 \ + && \ # Cleanup apt cache in the same command to reduce size apt-get clean && rm -rf /var/lib/apt/lists/* diff --git a/Makefile b/Makefile index 90b04e1..0c0f7dd 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ IMAGE_NAME ?= ai-dial-rag PLATFORM ?= linux/amd64 POETRY ?= poetry DOCKER ?= docker -LIBREOFFICE_UBUNTU_VERSION ?= 4:24.2.7-0ubuntu0.24.04.4 +LIBREOFFICE_UBUNTU_VERSION ?= 4:25.2.7-0ubuntu0.25.04.1~bpo24.04.1 ARGS ?= # Check for CI environment @@ -43,7 +43,7 @@ format: install_nox install_libreoffice: @echo "Installing LibreOffice..."
sudo apt-get update - sudo apt-get install --no-install-recommends -y libreoffice=$(LIBREOFFICE_UBUNTU_VERSION) + sudo apt-get install --no-install-recommends -y -t noble-backports libreoffice=$(LIBREOFFICE_UBUNTU_VERSION) test: install_nox $(if $(CI), install_libreoffice) $(POETRY) run nox -- -s test -- $(ARGS) diff --git a/tests/data/test_long_table.docx b/tests/data/test_long_table.docx new file mode 100644 index 0000000..9abe3a7 Binary files /dev/null and b/tests/data/test_long_table.docx differ diff --git a/tests/test_load_documents.py b/tests/test_load_documents.py index a1bbffd..64f9101 100644 --- a/tests/test_load_documents.py +++ b/tests/test_load_documents.py @@ -3,6 +3,7 @@ import pytest from aidial_rag.attachment_link import AttachmentLink +from aidial_rag.converter import convert_document_if_needed from aidial_rag.document_loaders import load_attachment, parse_document from tests.utils.local_http_server import start_local_server @@ -19,8 +20,16 @@ async def load_document(name): display_name=name, ) - file_metadata, buffer = await load_attachment(attachment_link, {}) - mime_type = file_metadata.mime_type + file_metadata, original_file_bytes = await load_attachment( + attachment_link, {} + ) + + mime_type, buffer = await convert_document_if_needed( + mime_type=file_metadata.mime_type, + doc_bytes=original_file_bytes, + io_stream=sys.stderr, + ) + chunks = await parse_document( sys.stderr, buffer, mime_type, attachment_link, mime_type ) @@ -69,3 +78,18 @@ async def test_load_single_line_text(local_server): assert chunks[0].metadata["filetype"] == "text/plain" assert "page_number" not in chunks[0].metadata assert "orig_elements" not in chunks[0].metadata + + +@pytest.mark.asyncio +async def test_load_docx_with_long_table(local_server): + """Test that a docx with a long table can be loaded successfully. + + Older versions of LibreOffice may hang when converting this docx to PDF.
+ See https://ask.libreoffice.org/t/file-can-be-converted-to-pdf-using-gui-but-not-with-cmd/125926/3 + """ + + chunks = await load_document("test_long_table.docx") + assert len(chunks) == 22 + assert chunks[0].page_content.startswith("X") + assert chunks[0].metadata["filetype"] == "application/pdf" + assert chunks[0].metadata["page_number"] == 1