@@ -137,11 +137,11 @@ def split_chunks(
137137 for d in text_splitter .split_documents ([Document (page_content = chunk .text )])
138138 ]
139139 logger .info (
140- f"{ _id } - { len (request .chunks ):,d} chunks split into { len (chunks ):,d} chunks." ,
140+ f"{ len (request .chunks ):,d} chunks split into { len (chunks ):,d} chunks. ( { _id } ) " ,
141141 )
142142 return chunks
143143 except Exception as e :
144- logger .exception (f"{ _id } - Failed to split chunks." )
144+ logger .exception (f"Failed to split chunks. ( { _id } ) " )
145145 raise BadInputError ("Failed to split chunks." ) from e
146146
147147
@@ -216,7 +216,7 @@ async def load_document(
216216 ext = splitext (file_name )[1 ].lower ()
217217 if ext in [".pdf" , ".docx" , ".pptx" , ".xlsx" , ".html" ]:
218218 doc_loader = DoclingLoader (self .request_id )
219- md = await doc_loader .convert_document_to_markdown (
219+ md = await doc_loader .document_to_markdown (
220220 file_name = file_name , content = content
221221 )
222222 elif ext in [".md" , ".txt" ]:
@@ -320,7 +320,7 @@ async def load_document_chunks(
320320 )
321321 else :
322322 doc_loader = DoclingLoader (self .request_id , page_break_placeholder = None )
323- chunks = await doc_loader .convert_document_to_chunks (
323+ chunks = await doc_loader .document_to_chunks (
324324 file_name = file_name ,
325325 content = content ,
326326 chunk_size = chunk_size ,
@@ -438,13 +438,13 @@ def __init__(
438438 )
439439 self .page_break_placeholder = page_break_placeholder
440440
441- async def retrieve_document_content (
441+ async def _parse_document (
442442 self ,
443443 file_name : str ,
444444 content : bytes ,
445- ) -> dict : # Expecting JSON response from docling-serve
445+ ) -> dict :
446446 """
447- Retrieves the content of a document file using Docling-Serve API (async pattern).
447+ Parse the document using Docling-Serve API (async pattern).
448448
449449 Args:
450450 file_path (str): Path to the document file to be parsed (local temp path).
@@ -458,7 +458,10 @@ async def retrieve_document_content(
458458 Raises:
459459 HTTPException: If the document conversion fails via docling-serve.
460460 """
461- logger .info (f'{ self .request_id } - Calling Docling-Serve for file "{ file_name } ".' )
461+ size_mb = get_bytes_size_mb (content )
462+ logger .info (
463+ f'Calling Docling-Serve for file "{ file_name } " with size { size_mb :.3f} MiB. ({ self .request_id } )'
464+ )
462465
463466 files = {"files" : (file_name , content , "application/octet-stream" )}
464467 data = {
@@ -507,7 +510,10 @@ async def retrieve_document_content(
507510 elif task_status in ("failure" , "revoked" ):
508511 error_info = status_data .get ("task_result" , {}).get ("error" , "Unknown error" )
509512 logger .error (
510- f'Docling-Serve task "{ task_id } " for document "{ file_name } " failed: { error_info } '
513+ (
514+ f'Docling-Serve task "{ task_id } " for document "{ file_name } " '
515+ f"with size { size_mb :.3f} MiB failed: { error_info } . ({ self .request_id } )"
516+ )
511517 )
512518 raise BadInputError (f'Your document "{ file_name } " cannot be parsed.' )
513519 # If not success, failure, or revoked, it's still processing or in another state
@@ -516,7 +522,7 @@ async def retrieve_document_content(
516522 else : # Executed if the while loop completes without a 'break'
517523 logger .error (
518524 (
519- f'Docling-Serve task "{ task_id } " for document "{ file_name } " '
525+ f'Docling-Serve task "{ task_id } " for document "{ file_name } " with size { size_mb :.3f } MiB '
520526 f"timed out after polling for { time_slept } seconds. ({ self .request_id } )"
521527 )
522528 )
@@ -537,24 +543,20 @@ async def retrieve_document_content(
537543 except Exception as e :
538544 raise UnexpectedError (f"Docling-Serve API error: { e } " ) from e
539545
async def document_to_markdown(self, file_name: str, content: bytes) -> str:
    """
    Convert a document to Markdown using Docling-Serve.

    Args:
        file_name (str): Name of the document, forwarded to `_parse_document`.
        content (bytes): Raw bytes of the document, forwarded to `_parse_document`.

    Returns:
        str: The `document.md_content` field of the parsed response, or an
            empty string when that field (or `document` itself) is missing or null.

    Raises:
        Any exception raised by `_parse_document`; nothing is caught here.
    """
    docling_response = await self._parse_document(file_name, content)
    # `dict.get(key, default)` only uses the default when the key is absent.
    # A JSON null decodes to None, so coalesce explicitly: this avoids an
    # AttributeError on a null `document` and keeps the `-> str` contract
    # when `md_content` is null.
    document = docling_response.get("document") or {}
    return document.get("md_content") or ""
550552
551- async def convert_document_to_chunks (
553+ async def document_to_chunks (
552554 self , file_name : str , content : bytes , chunk_size : int , chunk_overlap : int
553555 ) -> list [Chunk ]:
554556 """
555557 Converts a document to chunks, respecting page and table boundaries, using Docling-Serve.
556558 """
557- docling_response = await self .retrieve_document_content (file_name , content )
559+ docling_response = await self ._parse_document (file_name , content )
558560 md_content = docling_response .get ("document" , {}).get ("md_content" , "" )
559561
560562 documents = [Document (page_content = md_content , metadata = {"page" : 1 })]
0 commit comments