Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@
RUN apt-get update && \
apt-get install --no-install-recommends -y \
ca-certificates \
# Libreoffice is required for MS office documents
libreoffice=4:24.2.7-0ubuntu0.24.04.4 \
libmagic1 \
# Dependency for opencv library
libgl1 \
&& \
apt-get install --no-install-recommends -y -t noble-backports \
# Libreoffice is required for MS office documents
libreoffice=4:25.2.7-0ubuntu0.25.04.1~bpo24.04.1 \
&& \
# Cleanup apt cache in the same command to reduce size
apt-get clean && rm -rf /var/lib/apt/lists/*

Expand Down Expand Up @@ -73,7 +75,7 @@
apt-get install --no-install-recommends -y git

# Copy the whole repository
COPY . /opt/aidial_rag_repo

Check warning on line 78 in Dockerfile

View workflow job for this annotation

GitHub Actions / run_tests / docker_build

Attempting to Copy file that is excluded by .dockerignore

CopyIgnoredFile: Attempting to Copy file "." that is excluded by .dockerignore More info: https://docs.docker.com/go/dockerfile/rule/copy-ignored-file/
WORKDIR /opt/aidial_rag_repo

RUN python collect_repository_digest.py /opt/repository-digest.json
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ IMAGE_NAME ?= ai-dial-rag
PLATFORM ?= linux/amd64
POETRY ?= poetry
DOCKER ?= docker
LIBREOFFICE_UBUNTU_VERSION ?= 4:24.2.7-0ubuntu0.24.04.4
LIBREOFFICE_UBUNTU_VERSION ?= 4:25.2.7-0ubuntu0.25.04.1~bpo24.04.1
ARGS ?=

# Check for CI environment
Expand Down Expand Up @@ -43,7 +43,7 @@ format: install_nox
install_libreoffice:
@echo "Installing LibreOffice..."
sudo apt-get update
sudo apt-get install --no-install-recommends -y libreoffice=$(LIBREOFFICE_UBUNTU_VERSION)
sudo apt-get install --no-install-recommends -y -t noble-backports libreoffice=$(LIBREOFFICE_UBUNTU_VERSION)

test: install_nox $(if $(CI), install_libreoffice)
$(POETRY) run nox -- -s test -- $(ARGS)
Expand Down
Binary file added tests/data/test_long_table.docx
Binary file not shown.
28 changes: 26 additions & 2 deletions tests/test_load_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pytest

from aidial_rag.attachment_link import AttachmentLink
from aidial_rag.converter import convert_document_if_needed
from aidial_rag.document_loaders import load_attachment, parse_document
from tests.utils.local_http_server import start_local_server

Expand All @@ -19,8 +20,16 @@ async def load_document(name):
display_name=name,
)

file_metadata, buffer = await load_attachment(attachment_link, {})
mime_type = file_metadata.mime_type
file_metadata, original_file_bytes = await load_attachment(
attachment_link, {}
)

mime_type, buffer = await convert_document_if_needed(
mime_type=file_metadata.mime_type,
doc_bytes=original_file_bytes,
io_stream=sys.stderr,
)

chunks = await parse_document(
sys.stderr, buffer, mime_type, attachment_link, mime_type
)
Expand Down Expand Up @@ -69,3 +78,18 @@ async def test_load_single_line_text(local_server):
assert chunks[0].metadata["filetype"] == "text/plain"
assert "page_number" not in chunks[0].metadata
assert "orig_elements" not in chunks[0].metadata


@pytest.mark.asyncio
async def test_load_docx_with_long_table(local_server):
    """Check that a docx file containing a long table loads to completion.

    Older versions of libreoffice may hang when converting this docx to pdf.
    See https://ask.libreoffice.org/t/file-can-be-converted-to-pdf-using-gui-but-not-with-cmd/125926/3
    """
    parsed_chunks = await load_document("test_long_table.docx")

    # Verify the chunk count before indexing into the result.
    assert len(parsed_chunks) == 22

    first_chunk = parsed_chunks[0]
    assert first_chunk.page_content.startswith("X")

    # The expected filetype is PDF rather than docx, consistent with the
    # docx-to-pdf conversion described in the docstring.
    first_metadata = first_chunk.metadata
    assert first_metadata["filetype"] == "application/pdf"
    assert first_metadata["page_number"] == 1
Loading