1616from xml .sax .saxutils import unescape
1717
1818import latex2mathml .converter
19+ from PIL .Image import Image
1920from pydantic import AnyUrl , BaseModel
2021from typing_extensions import override
2122
4041 _get_css_for_single_column ,
4142 _get_css_for_split_page ,
4243)
44+ from docling_core .transforms .visualizer .base import BaseVisualizer
4345from docling_core .types .doc .base import ImageRefMode
4446from docling_core .types .doc .document import (
4547 CodeItem ,
@@ -821,9 +823,22 @@ def serialize_hyperlink(
821823 def serialize_doc (
822824 self ,
823825 parts : list [SerializationResult ],
826+ visualizer : Optional [BaseVisualizer ] = None ,
824827 ** kwargs : Any ,
825828 ) -> SerializationResult :
826829 """Serialize a document out of its pages."""
830+
831+ def _serialize_page_img (page_img : Image ):
832+ buffered = BytesIO ()
833+ page_img .save (buffered , format = "PNG" ) # Save the image to the byte stream
834+ img_bytes = buffered .getvalue () # Get the byte data
835+
836+ # Encode to Base64 and decode to string
837+ img_base64 = base64 .b64encode (img_bytes ).decode ("utf-8" )
838+ img_text = f'<img src="data:image/png;base64,{ img_base64 } ">'
839+
840+ return f"<figure>{ img_text } </figure>"
841+
827842 # Create HTML structure
828843 html_parts = [
829844 "<!DOCTYPE html>" ,
@@ -853,19 +868,26 @@ def serialize_doc(
853868 html_parts .append ("<table>" )
854869 html_parts .append ("<tbody>" )
855870
871+ vized_pages_dict : dict [Optional [int ], Image ] = {}
872+ if visualizer :
873+ vized_pages_dict = visualizer .get_visualization (doc = self .doc )
874+
856875 for page_no , page in pages .items ():
857876
858877 if isinstance (page_no , int ):
859878 if applicable_pages is not None and page_no not in applicable_pages :
860879 continue
861880 page_img = self .doc .pages [page_no ].image
881+ vized_page = vized_pages_dict .get (page_no )
862882
863883 html_parts .append ("<tr>" )
864884
865885 html_parts .append ("<td>" )
866886
887+ if vized_page :
888+ html_parts .append (_serialize_page_img (page_img = vized_page ))
867889 # short-cut: we already have the image in base64
868- if (
890+ elif (
869891 (page_img is not None )
870892 and isinstance (page_img , ImageRef )
871893 and isinstance (page_img .uri , AnyUrl )
@@ -875,18 +897,7 @@ def serialize_doc(
875897 html_parts .append (f"<figure>{ img_text } </figure>" )
876898
877899 elif (page_img is not None ) and (page_img ._pil is not None ):
878-
879- buffered = BytesIO ()
880- page_img ._pil .save (
881- buffered , format = "PNG"
882- ) # Save the image to the byte stream
883- img_bytes = buffered .getvalue () # Get the byte data
884-
885- # Encode to Base64 and decode to string
886- img_base64 = base64 .b64encode (img_bytes ).decode ("utf-8" )
887- img_text = f'<img src="data:image/png;base64,{ img_base64 } ">'
888-
889- html_parts .append (f"<figure>{ img_text } </figure>" )
900+ html_parts .append (_serialize_page_img (page_img = page_img ._pil ))
890901 else :
891902 html_parts .append ("<figure>no page-image found</figure>" )
892903
0 commit comments