33import pytest
44
55from aidial_rag .attachment_link import AttachmentLink
6+ from aidial_rag .converter import convert_document_if_needed
67from aidial_rag .document_loaders import load_attachment , parse_document
78from tests .utils .local_http_server import start_local_server
89
@@ -19,8 +20,14 @@ async def load_document(name):
1920 display_name = name ,
2021 )
2122
22- file_metadata , buffer = await load_attachment (attachment_link , {})
23- mime_type = file_metadata .mime_type
23+ file_metadata , original_file_bytes = await load_attachment (attachment_link , {})
24+
25+ mime_type , buffer = await convert_document_if_needed (
26+ mime_type = file_metadata .mime_type ,
27+ doc_bytes = original_file_bytes ,
28+ io_stream = sys .stderr ,
29+ )
30+
2431 chunks = await parse_document (
2532 sys .stderr , buffer , mime_type , attachment_link , mime_type
2633 )
@@ -69,3 +76,18 @@ async def test_load_single_line_text(local_server):
6976 assert chunks [0 ].metadata ["filetype" ] == "text/plain"
7077 assert "page_number" not in chunks [0 ].metadata
7178 assert "orig_elements" not in chunks [0 ].metadata
79+
80+
@pytest.mark.asyncio
async def test_load_docx_with_long_table(local_server):
    """Check that a docx file containing a long table loads successfully.

    Older libreoffice versions may hang while converting this docx to pdf.
    See https://ask.libreoffice.org/t/file-can-be-converted-to-pdf-using-gui-but-not-with-cmd/125926/3
    """
    parsed_chunks = await load_document("test_long_table.docx")

    # Check the chunk count before indexing, so an empty result fails on
    # the length assertion rather than on the lookup below.
    assert len(parsed_chunks) == 22

    first_chunk = parsed_chunks[0]
    assert first_chunk.page_content.startswith("X")

    # The loaded docx is expected to report the pdf filetype and to carry
    # a page number in its metadata.
    first_metadata = first_chunk.metadata
    assert first_metadata["filetype"] == "application/pdf"
    assert first_metadata["page_number"] == 1
0 commit comments