33import pytest
44
55from aidial_rag .attachment_link import AttachmentLink
6+ from aidial_rag .converter import convert_document_if_needed
67from aidial_rag .document_loaders import load_attachment , parse_document
78from tests .utils .local_http_server import start_local_server
89
@@ -19,8 +20,16 @@ async def load_document(name):
1920 display_name = name ,
2021 )
2122
22- file_metadata , buffer = await load_attachment (attachment_link , {})
23- mime_type = file_metadata .mime_type
23+ file_metadata , original_file_bytes = await load_attachment (
24+ attachment_link , {}
25+ )
26+
27+ mime_type , buffer = await convert_document_if_needed (
28+ mime_type = file_metadata .mime_type ,
29+ doc_bytes = original_file_bytes ,
30+ io_stream = sys .stderr ,
31+ )
32+
2433 chunks = await parse_document (
2534 sys .stderr , buffer , mime_type , attachment_link , mime_type
2635 )
@@ -69,3 +78,18 @@ async def test_load_single_line_text(local_server):
6978 assert chunks [0 ].metadata ["filetype" ] == "text/plain"
7079 assert "page_number" not in chunks [0 ].metadata
7180 assert "orig_elements" not in chunks [0 ].metadata
81+
82+
83+ @pytest .mark .asyncio
84+ async def test_load_docx_with_long_table (local_server ):
85+ """Test that docx with long table can be loaded successfully.
86+
87+ Older versions of libreoffice may hang when converting this docx to pdf.
88+ See https://ask.libreoffice.org/t/file-can-be-converted-to-pdf-using-gui-but-not-with-cmd/125926/3
89+ """
90+
91+ chunks = await load_document ("test_long_table.docx" )
92+ assert len (chunks ) == 22
93+ assert chunks [0 ].page_content .startswith ("X" )
94+ assert chunks [0 ].metadata ["filetype" ] == "application/pdf"
95+ assert chunks [0 ].metadata ["page_number" ] == 1
0 commit comments