Skip to content

Commit 6a7eb53

Browse files
authored
feat: provide visualizer option in HTML split view (#294)
* feat: provide visualizer option in HTML split view

  Signed-off-by: Panos Vagenas <[email protected]>

* loosen test

  Signed-off-by: Panos Vagenas <[email protected]>

---------

Signed-off-by: Panos Vagenas <[email protected]>
1 parent 56f70de commit 6a7eb53

File tree

3 files changed

+56
-18
lines changed

3 files changed

+56
-18
lines changed

docling_core/transforms/serializer/common.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ class CommonParams(BaseModel):
169169

170170
def merge_with_patch(self, patch: dict[str, Any]) -> Self:
171171
"""Create an instance by merging the provided patch dict on top of self."""
172-
res = self.model_validate({**self.model_dump(), **patch})
172+
res = self.model_copy(update=patch)
173173
return res
174174

175175

@@ -260,10 +260,10 @@ def serialize_doc(
260260
"""Serialize a document out of its pages."""
261261
...
262262

263-
def _serialize_body(self) -> SerializationResult:
263+
def _serialize_body(self, **kwargs) -> SerializationResult:
264264
"""Serialize the document body."""
265265
subparts = self.get_parts()
266-
res = self.serialize_doc(parts=subparts)
266+
res = self.serialize_doc(parts=subparts, **kwargs)
267267
return res
268268

269269
@override
@@ -278,12 +278,12 @@ def serialize(
278278
) -> SerializationResult:
279279
"""Serialize a given node."""
280280
my_visited: set[str] = visited if visited is not None else set()
281-
my_kwargs = self.params.merge_with_patch(patch=kwargs).model_dump()
281+
my_kwargs = {**self.params.model_dump(), **kwargs}
282282
empty_res = create_ser_result()
283283
if item is None or item == self.doc.body:
284284
if self.doc.body.self_ref not in my_visited:
285285
my_visited.add(self.doc.body.self_ref)
286-
return self._serialize_body()
286+
return self._serialize_body(**my_kwargs)
287287
else:
288288
return empty_res
289289

docling_core/transforms/serializer/html.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from xml.sax.saxutils import unescape
1717

1818
import latex2mathml.converter
19+
from PIL.Image import Image
1920
from pydantic import AnyUrl, BaseModel
2021
from typing_extensions import override
2122

@@ -40,6 +41,7 @@
4041
_get_css_for_single_column,
4142
_get_css_for_split_page,
4243
)
44+
from docling_core.transforms.visualizer.base import BaseVisualizer
4345
from docling_core.types.doc.base import ImageRefMode
4446
from docling_core.types.doc.document import (
4547
CodeItem,
@@ -821,9 +823,22 @@ def serialize_hyperlink(
821823
def serialize_doc(
822824
self,
823825
parts: list[SerializationResult],
826+
visualizer: Optional[BaseVisualizer] = None,
824827
**kwargs: Any,
825828
) -> SerializationResult:
826829
"""Serialize a document out of its pages."""
830+
831+
def _serialize_page_img(page_img: Image):
832+
buffered = BytesIO()
833+
page_img.save(buffered, format="PNG") # Save the image to the byte stream
834+
img_bytes = buffered.getvalue() # Get the byte data
835+
836+
# Encode to Base64 and decode to string
837+
img_base64 = base64.b64encode(img_bytes).decode("utf-8")
838+
img_text = f'<img src="data:image/png;base64,{img_base64}">'
839+
840+
return f"<figure>{img_text}</figure>"
841+
827842
# Create HTML structure
828843
html_parts = [
829844
"<!DOCTYPE html>",
@@ -853,19 +868,26 @@ def serialize_doc(
853868
html_parts.append("<table>")
854869
html_parts.append("<tbody>")
855870

871+
vized_pages_dict: dict[Optional[int], Image] = {}
872+
if visualizer:
873+
vized_pages_dict = visualizer.get_visualization(doc=self.doc)
874+
856875
for page_no, page in pages.items():
857876

858877
if isinstance(page_no, int):
859878
if applicable_pages is not None and page_no not in applicable_pages:
860879
continue
861880
page_img = self.doc.pages[page_no].image
881+
vized_page = vized_pages_dict.get(page_no)
862882

863883
html_parts.append("<tr>")
864884

865885
html_parts.append("<td>")
866886

887+
if vized_page:
888+
html_parts.append(_serialize_page_img(page_img=vized_page))
867889
# short-cut: we already have the image in base64
868-
if (
890+
elif (
869891
(page_img is not None)
870892
and isinstance(page_img, ImageRef)
871893
and isinstance(page_img.uri, AnyUrl)
@@ -875,18 +897,7 @@ def serialize_doc(
875897
html_parts.append(f"<figure>{img_text}</figure>")
876898

877899
elif (page_img is not None) and (page_img._pil is not None):
878-
879-
buffered = BytesIO()
880-
page_img._pil.save(
881-
buffered, format="PNG"
882-
) # Save the image to the byte stream
883-
img_bytes = buffered.getvalue() # Get the byte data
884-
885-
# Encode to Base64 and decode to string
886-
img_base64 = base64.b64encode(img_bytes).decode("utf-8")
887-
img_text = f'<img src="data:image/png;base64,{img_base64}">'
888-
889-
html_parts.append(f"<figure>{img_text}</figure>")
900+
html_parts.append(_serialize_page_img(page_img=page_img._pil))
890901
else:
891902
html_parts.append("<figure>no page-image found</figure>")
892903

test/test_serialization.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
MarkdownDocSerializer,
1313
MarkdownParams,
1414
)
15+
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
1516
from docling_core.types.doc.base import ImageRefMode
1617
from docling_core.types.doc.document import DoclingDocument
1718
from docling_core.types.doc.labels import DocItemLabel
@@ -183,6 +184,32 @@ def test_html_split_page_p2():
183184
verify(exp_file=src.parent / f"{src.stem}_split_p2.gt.html", actual=actual)
184185

185186

187+
def test_html_split_page_p2_with_visualizer():
188+
src = Path("./test/data/doc/2408.09869v3_enriched.json")
189+
doc = DoclingDocument.load_from_json(src)
190+
191+
ser = HTMLDocSerializer(
192+
doc=doc,
193+
params=HTMLParams(
194+
image_mode=ImageRefMode.EMBEDDED,
195+
output_style=HTMLOutputStyle.SPLIT_PAGE,
196+
pages={2},
197+
),
198+
)
199+
ser_res = ser.serialize(
200+
visualizer=LayoutVisualizer(),
201+
)
202+
actual = ser_res.text
203+
204+
# pinning the result with visualizer appeared flaky, so at least ensure it contains
205+
# a figure (for the page) and that it is different than without visualizer:
206+
assert '<figure><img src="data:image/png;base64' in actual
207+
file_without_viz = src.parent / f"{src.stem}_split_p2.gt.html"
208+
with open(file_without_viz) as f:
209+
data_without_viz = f.read()
210+
assert actual.strip() != data_without_viz.strip()
211+
212+
186213
def test_html_split_page_no_page_breaks():
187214
src = Path("./test/data/doc/2408.09869_p1.json")
188215
doc = DoclingDocument.load_from_json(src)

0 commit comments

Comments
 (0)