Skip to content

Commit 6cc6e01

Browse files
committed
fix: update libreoffice to 25.2.7 (#122)
1 parent 3aa13b6 commit 6cc6e01

File tree

4 files changed

+32
-6
lines changed

4 files changed

+32
-6
lines changed

Dockerfile

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ ENV BGE_EMBEDDINGS_MODEL_PATH=/embeddings_model/bge-small-en
1313
RUN apt-get update && \
1414
apt-get install --no-install-recommends -y \
1515
ca-certificates \
16-
# Libreoffice is required for MS office documents
17-
libreoffice=4:24.2.7-0ubuntu0.24.04.4 \
1816
libmagic1 \
1917
# Dependency for opencv library
2018
libgl1 \
2119
&& \
20+
apt-get install --no-install-recommends -y -t noble-backports \
21+
# LibreOffice is required for MS Office documents
22+
libreoffice=4:25.2.7-0ubuntu0.25.04.1~bpo24.04.1 \
23+
&& \
2224
# Cleanup apt cache in the same command to reduce size
2325
apt-get clean && rm -rf /var/lib/apt/lists/*
2426

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ IMAGE_NAME ?= ai-dial-rag
33
PLATFORM ?= linux/amd64
44
POETRY ?= poetry
55
DOCKER ?= docker
6-
LIBREOFFICE_UBUNTU_VERSION ?= 4:24.2.7-0ubuntu0.24.04.4
6+
LIBREOFFICE_UBUNTU_VERSION ?= 4:25.2.7-0ubuntu0.25.04.1~bpo24.04.1
77
ARGS ?=
88

99
# Check for CI environment
@@ -43,7 +43,7 @@ format: install_nox
4343
install_libreoffice:
4444
@echo "Installing LibreOffice..."
4545
sudo apt-get update
46-
sudo apt-get install --no-install-recommends -y libreoffice=$(LIBREOFFICE_UBUNTU_VERSION)
46+
sudo apt-get install --no-install-recommends -y -t noble-backports libreoffice=$(LIBREOFFICE_UBUNTU_VERSION)
4747

4848
test: install_nox $(if $(CI), install_libreoffice)
4949
$(POETRY) run nox -- -s test -- $(ARGS)

tests/data/test_long_table.docx

23 KB
Binary file not shown.

tests/test_load_documents.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pytest
44

55
from aidial_rag.attachment_link import AttachmentLink
6+
from aidial_rag.converter import convert_document_if_needed
67
from aidial_rag.document_loaders import load_attachment, parse_document
78
from tests.utils.local_http_server import start_local_server
89

@@ -19,8 +20,16 @@ async def load_document(name):
1920
display_name=name,
2021
)
2122

22-
file_metadata, buffer = await load_attachment(attachment_link, {})
23-
mime_type = file_metadata.mime_type
23+
file_metadata, original_file_bytes = await load_attachment(
24+
attachment_link, {}
25+
)
26+
27+
mime_type, buffer = await convert_document_if_needed(
28+
mime_type=file_metadata.mime_type,
29+
doc_bytes=original_file_bytes,
30+
io_stream=sys.stderr,
31+
)
32+
2433
chunks = await parse_document(
2534
sys.stderr, buffer, mime_type, attachment_link, mime_type
2635
)
@@ -69,3 +78,18 @@ async def test_load_single_line_text(local_server):
6978
assert chunks[0].metadata["filetype"] == "text/plain"
7079
assert "page_number" not in chunks[0].metadata
7180
assert "orig_elements" not in chunks[0].metadata
81+
82+
83+
@pytest.mark.asyncio
84+
async def test_load_docx_with_long_table(local_server):
85+
"""Test that a docx with a long table can be loaded successfully.
86+
87+
Older versions of LibreOffice may hang when converting this docx to PDF.
88+
See https://ask.libreoffice.org/t/file-can-be-converted-to-pdf-using-gui-but-not-with-cmd/125926/3
89+
"""
90+
91+
chunks = await load_document("test_long_table.docx")
92+
assert len(chunks) == 22
93+
assert chunks[0].page_content.startswith("X")
94+
assert chunks[0].metadata["filetype"] == "application/pdf"
95+
assert chunks[0].metadata["page_number"] == 1

0 commit comments

Comments
 (0)