Skip to content

Commit a30e6a7

Browse files
Leg0shii and ceberam authored
feat(backend): add generic options support and HTML image handling modes (#2011)
* feat: add backend options support to document backends Co-authored-by: Cesar Berrospi Ramis <[email protected]> Signed-off-by: Leg0shii <[email protected]> Signed-off-by: Cesar Berrospi Ramis <[email protected]> * feat: enhance document backends with generic backend options and improve HTML image handling Co-authored-by: Cesar Berrospi Ramis <[email protected]> Signed-off-by: Leg0shii <[email protected]> Signed-off-by: Cesar Berrospi Ramis <[email protected]> * Refactor tests for declarativebackend Co-authored-by: Cesar Berrospi Ramis <[email protected]> Signed-off-by: Leg0shii <[email protected]> Signed-off-by: Cesar Berrospi Ramis <[email protected]> * fix(HTML): improve image caption handling and ensure backend options are set correctly Co-authored-by: Cesar Berrospi Ramis <[email protected]> Signed-off-by: Leg0shii <[email protected]> Signed-off-by: Cesar Berrospi Ramis <[email protected]> * fix: enhance HTML backend image handling and add support for local file paths Co-authored-by: Cesar Berrospi Ramis <[email protected]> Signed-off-by: Leg0shii <[email protected]> Signed-off-by: Cesar Berrospi Ramis <[email protected]> * chore: Add ground truth data for test data Co-authored-by: Cesar Berrospi Ramis <[email protected]> Signed-off-by: Leg0shii <[email protected]> Signed-off-by: Cesar Berrospi Ramis <[email protected]> * fix(HTML): skip loading SVG files in image data handling Co-authored-by: Cesar Berrospi Ramis <[email protected]> Signed-off-by: Leg0shii <[email protected]> Signed-off-by: Cesar Berrospi Ramis <[email protected]> * refactor(html): simplify backend options and address gaps Backend options for DeclarativeDocumentBackend classes and only when necessary. Refactor caption parsing in 'img' elements and remove dummy text. Replace deprecated annotations from Typing library with native types. Replace typing annotations according to pydantic guidelines. Some documentation with pydantic annotations. Fix diff issue with test files. 
Signed-off-by: Cesar Berrospi Ramis <[email protected]> * tests(html): add tests and fix bugs Signed-off-by: Cesar Berrospi Ramis <[email protected]> * refactor(html): refactor backend options Move backend option classes to its own module within datamodel package. Rename 'source_location' with 'source_uri' in HTMLBackendOptions. Rename 'image_fetch' with 'fetch_images' in HTMLBackendOptions. Signed-off-by: Cesar Berrospi Ramis <[email protected]> * refactor(markdown): create a class for the markdown backend options Signed-off-by: Cesar Berrospi Ramis <[email protected]> --------- Signed-off-by: Leg0shii <[email protected]> Signed-off-by: Cesar Berrospi Ramis <[email protected]> Co-authored-by: Cesar Berrospi Ramis <[email protected]>
1 parent b66624b commit a30e6a7

31 files changed

+7050
-6550
lines changed

docling/backend/abstract_backend.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
from abc import ABC, abstractmethod
22
from io import BytesIO
33
from pathlib import Path
4-
from typing import TYPE_CHECKING, Set, Union
4+
from typing import TYPE_CHECKING, Union
55

66
from docling_core.types.doc import DoclingDocument
77

8+
from docling.datamodel.backend_options import BackendOptions, DeclarativeBackendOptions
9+
810
if TYPE_CHECKING:
911
from docling.datamodel.base_models import InputFormat
1012
from docling.datamodel.document import InputDocument
@@ -35,7 +37,7 @@ def unload(self):
3537

3638
@classmethod
3739
@abstractmethod
38-
def supported_formats(cls) -> Set["InputFormat"]:
40+
def supported_formats(cls) -> set["InputFormat"]:
3941
pass
4042

4143

@@ -58,6 +60,20 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
5860
straight without a recognition pipeline.
5961
"""
6062

63+
@abstractmethod
def __init__(
    self,
    in_doc: "InputDocument",
    path_or_stream: Union[BytesIO, Path],
    options: Union[BackendOptions, None] = None,
) -> None:
    """Initialize the declarative backend and store its options.

    Args:
        in_doc: The input document, forwarded to the parent initializer.
        path_or_stream: The document source, forwarded to the parent
            initializer.
        options: Backend options. When omitted (or ``None``), a new
            ``DeclarativeBackendOptions`` instance is created for this
            backend.
    """
    super().__init__(in_doc, path_or_stream)
    # Build the default per call instead of in the signature: a default
    # evaluated at definition time would be one object shared by every
    # backend created without explicit options.
    if options is None:
        options = DeclarativeBackendOptions()
    self.options: BackendOptions = options
72+
6173
@abstractmethod
6274
def convert(self) -> DoclingDocument:
6375
pass
76+
77+
@classmethod
def get_default_options(cls) -> BackendOptions:
    """Return a newly created ``DeclarativeBackendOptions`` instance."""
    default_options = DeclarativeBackendOptions()
    return default_options

docling/backend/asciidoc_backend.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import re
33
from io import BytesIO
44
from pathlib import Path
5-
from typing import Final, Set, Union
5+
from typing import Final, Union
66

77
from docling_core.types.doc import (
88
DocItemLabel,
@@ -27,7 +27,7 @@
2727

2828

2929
class AsciiDocBackend(DeclarativeDocumentBackend):
30-
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
30+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
3131
super().__init__(in_doc, path_or_stream)
3232

3333
self.path_or_stream = path_or_stream
@@ -58,7 +58,7 @@ def unload(self):
5858
return
5959

6060
@classmethod
61-
def supported_formats(cls) -> Set[InputFormat]:
61+
def supported_formats(cls) -> set[InputFormat]:
6262
return {InputFormat.ASCIIDOC}
6363

6464
def convert(self) -> DoclingDocument:

docling/backend/html_backend.py

Lines changed: 135 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
1+
import base64
12
import logging
3+
import os
24
import re
3-
import traceback
5+
import warnings
46
from contextlib import contextmanager
57
from copy import deepcopy
68
from io import BytesIO
79
from pathlib import Path
810
from typing import Final, Optional, Union, cast
9-
from urllib.parse import urljoin
11+
from urllib.parse import urljoin, urlparse
1012

13+
import requests
1114
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
1215
from bs4.element import PreformattedString
1316
from docling_core.types.doc import (
@@ -17,20 +20,26 @@
1720
DocumentOrigin,
1821
GroupItem,
1922
GroupLabel,
23+
PictureItem,
2024
RefItem,
2125
RichTableCell,
2226
TableCell,
2327
TableData,
2428
TableItem,
2529
TextItem,
2630
)
27-
from docling_core.types.doc.document import ContentLayer, Formatting, Script
31+
from docling_core.types.doc.document import ContentLayer, Formatting, ImageRef, Script
32+
from PIL import Image, UnidentifiedImageError
2833
from pydantic import AnyUrl, BaseModel, ValidationError
2934
from typing_extensions import override
3035

31-
from docling.backend.abstract_backend import DeclarativeDocumentBackend
36+
from docling.backend.abstract_backend import (
37+
DeclarativeDocumentBackend,
38+
)
39+
from docling.datamodel.backend_options import HTMLBackendOptions
3240
from docling.datamodel.base_models import InputFormat
3341
from docling.datamodel.document import InputDocument
42+
from docling.exceptions import OperationNotAllowed
3443

3544
_log = logging.getLogger(__name__)
3645

@@ -43,6 +52,7 @@
4352
"details",
4453
"figure",
4554
"footer",
55+
"img",
4656
"h1",
4757
"h2",
4858
"h3",
@@ -186,11 +196,12 @@ def __init__(
186196
self,
187197
in_doc: InputDocument,
188198
path_or_stream: Union[BytesIO, Path],
189-
original_url: Optional[AnyUrl] = None,
199+
options: HTMLBackendOptions = HTMLBackendOptions(),
190200
):
191-
super().__init__(in_doc, path_or_stream)
201+
super().__init__(in_doc, path_or_stream, options)
192202
self.soup: Optional[Tag] = None
193-
self.path_or_stream = path_or_stream
203+
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
204+
self.base_path: Optional[str] = str(options.source_uri)
194205

195206
# Initialize the parents for the hierarchy
196207
self.max_levels = 10
@@ -200,7 +211,6 @@ def __init__(
200211
for i in range(self.max_levels):
201212
self.parents[i] = None
202213
self.hyperlink: Union[AnyUrl, Path, None] = None
203-
self.original_url = original_url
204214
self.format_tags: list[str] = []
205215

206216
try:
@@ -236,6 +246,11 @@ def unload(self):
236246
def supported_formats(cls) -> set[InputFormat]:
237247
return {InputFormat.HTML}
238248

249+
@classmethod
@override
def get_default_options(cls) -> HTMLBackendOptions:
    """Return a newly created ``HTMLBackendOptions`` instance."""
    default_options = HTMLBackendOptions()
    return default_options
253+
239254
@override
240255
def convert(self) -> DoclingDocument:
241256
_log.debug("Starting HTML conversion...")
@@ -261,7 +276,7 @@ def convert(self) -> DoclingDocument:
261276
content_layer=ContentLayer.FURNITURE,
262277
)
263278
# remove script and style tags
264-
for tag in self.soup(["script", "style"]):
279+
for tag in self.soup(["script", "noscript", "style"]):
265280
tag.decompose()
266281
# remove any hidden tag
267282
for tag in self.soup(hidden=True):
@@ -291,6 +306,28 @@ def convert(self) -> DoclingDocument:
291306
self._walk(content, doc)
292307
return doc
293308

309+
@staticmethod
310+
def _is_remote_url(value: str) -> bool:
311+
parsed = urlparse(value)
312+
return parsed.scheme in {"http", "https", "ftp", "s3", "gs"}
313+
314+
def _resolve_relative_path(self, loc: str) -> str:
    """Resolve a location found in the HTML against ``self.base_path``.

    Args:
        loc: The raw location (for instance an ``href`` or ``src`` value).

    Returns:
        ``loc`` unchanged when no base path is set or when ``loc`` already
        carries a URI scheme (``http:``, ``data:``, ``mailto:``, ...).
        Protocol-relative locations (``//host/...``) are prefixed with
        ``https:``. Any other location is joined with the base URL when the
        base is remote, or with the parent directory of the base path
        otherwise.
    """
    abs_loc = loc

    if self.base_path:
        if loc.startswith("//"):
            # Protocol-relative URL - default to https
            abs_loc = "https:" + loc
        elif not re.match(r"[A-Za-z][A-Za-z0-9+.\-]+:", loc):
            # No URI scheme, so the location is relative. The pattern
            # requires at least two scheme characters so that a Windows
            # drive letter ("C:\...") is still handled as a file path.
            # Detecting any scheme (case-insensitively) keeps links such
            # as "mailto:" or "tel:" from being glued onto a local
            # directory.
            if HTMLDocumentBackend._is_remote_url(self.base_path):  # remote fetch
                abs_loc = urljoin(self.base_path, loc)
            else:  # local fetch
                # For local files, resolve relative to the HTML file location
                abs_loc = str(Path(self.base_path).parent / loc)

    _log.debug(f"Resolved location {loc} to {abs_loc}")
    return abs_loc
330+
294331
@staticmethod
295332
def group_cell_elements(
296333
group_name: str,
@@ -520,7 +557,8 @@ def flush_buffer():
520557
if name == "img":
521558
flush_buffer()
522559
im_ref3 = self._emit_image(node, doc)
523-
added_refs.append(im_ref3)
560+
if im_ref3:
561+
added_refs.append(im_ref3)
524562
elif name in _FORMAT_TAG_MAP:
525563
with self._use_format([name]):
526564
wk = self._walk(node, doc)
@@ -669,8 +707,7 @@ def _use_hyperlink(self, tag: Tag):
669707
else:
670708
if isinstance(this_href, str) and this_href:
671709
old_hyperlink = self.hyperlink
672-
if self.original_url is not None:
673-
this_href = urljoin(str(self.original_url), str(this_href))
710+
this_href = self._resolve_relative_path(this_href)
674711
# ugly fix for relative links since pydantic does not support them.
675712
try:
676713
new_hyperlink = AnyUrl(this_href)
@@ -837,7 +874,8 @@ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
837874
for img_tag in tag("img"):
838875
if isinstance(img_tag, Tag):
839876
im_ref = self._emit_image(img_tag, doc)
840-
added_ref.append(im_ref)
877+
if im_ref:
878+
added_ref.append(im_ref)
841879
return added_ref
842880

843881
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
@@ -1003,7 +1041,8 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
10031041
img_tag = tag.find("img")
10041042
if isinstance(img_tag, Tag):
10051043
im_ref = self._emit_image(img_tag, doc)
1006-
added_refs.append(im_ref)
1044+
if im_ref is not None:
1045+
added_refs.append(im_ref)
10071046

10081047
elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
10091048
heading_refs = self._handle_heading(tag, doc)
@@ -1061,7 +1100,8 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
10611100
for img_tag in tag("img"):
10621101
if isinstance(img_tag, Tag):
10631102
im_ref2 = self._emit_image(tag, doc)
1064-
added_refs.append(im_ref2)
1103+
if im_ref2 is not None:
1104+
added_refs.append(im_ref2)
10651105

10661106
elif tag_name in {"pre"}:
10671107
# handle monospace code snippets (pre).
@@ -1092,10 +1132,12 @@ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
10921132
self._walk(tag, doc)
10931133
return added_refs
10941134

1095-
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
1135+
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]:
10961136
figure = img_tag.find_parent("figure")
10971137
caption: AnnotatedTextList = AnnotatedTextList()
10981138

1139+
parent = self.parents[self.level]
1140+
10991141
# check if the figure has a link - this is HACK:
11001142
def get_img_hyperlink(img_tag):
11011143
this_parent = img_tag.parent
@@ -1106,9 +1148,8 @@ def get_img_hyperlink(img_tag):
11061148
return None
11071149

11081150
if img_hyperlink := get_img_hyperlink(img_tag):
1109-
caption.append(
1110-
AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
1111-
)
1151+
img_text = img_tag.get("alt") or ""
1152+
caption.append(AnnotatedText(text=img_text, hyperlink=img_hyperlink))
11121153

11131154
if isinstance(figure, Tag):
11141155
caption_tag = figure.find("figcaption", recursive=False)
@@ -1135,13 +1176,78 @@ def get_img_hyperlink(img_tag):
11351176
hyperlink=caption_anno_text.hyperlink,
11361177
)
11371178

1179+
src_loc: str = self._get_attr_as_string(img_tag, "src")
1180+
if not cast(HTMLBackendOptions, self.options).fetch_images or not src_loc:
1181+
# Do not fetch the image, just add a placeholder
1182+
placeholder: PictureItem = doc.add_picture(
1183+
caption=caption_item,
1184+
parent=parent,
1185+
content_layer=self.content_layer,
1186+
)
1187+
return placeholder.get_ref()
1188+
1189+
src_loc = self._resolve_relative_path(src_loc)
1190+
img_ref = self._create_image_ref(src_loc)
1191+
11381192
docling_pic = doc.add_picture(
1193+
image=img_ref,
11391194
caption=caption_item,
1140-
parent=self.parents[self.level],
1195+
parent=parent,
11411196
content_layer=self.content_layer,
11421197
)
11431198
return docling_pic.get_ref()
11441199

1200+
def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
    """Load an image and wrap it in an ``ImageRef``, on a best-effort basis.

    Args:
        src_url: The already resolved image location, passed to
            ``self._load_image_data``.

    Returns:
        An ``ImageRef`` built from the decoded image, or ``None`` when no
        data was loaded or the image could not be processed. Failures are
        reported with ``warnings.warn`` instead of being raised.
    """
    try:
        img_data = self._load_image_data(src_url)
        if img_data:
            img = Image.open(BytesIO(img_data))
            # Use the first entry of the "dpi" metadata, defaulting to 72.
            return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
    except (
        # RequestException is the base class of HTTPError and also covers
        # connection errors and timeouts, so an unreachable host results in
        # a warning rather than aborting the whole conversion.
        requests.RequestException,
        ValidationError,
        UnidentifiedImageError,
        OperationNotAllowed,
        # Errors while reading a local file or decoding the image data.
        OSError,
        TypeError,
        ValueError,
    ) as e:
        warnings.warn(f"Could not process an image from {src_url}: {e}")

    return None
1217+
1218+
def _load_image_data(self, src_loc: str) -> Optional[bytes]:
    """Return the raw bytes of the image referenced by ``src_loc``.

    Args:
        src_loc: A ``data:`` URI, a remote URL, a ``file://`` URL or a
            local file path.

    Returns:
        The image bytes, or ``None`` for SVG images, which are skipped.

    Raises:
        OperationNotAllowed: If the location is remote or local and the
            matching ``enable_remote_fetch`` / ``enable_local_fetch``
            option is not set.
        requests.RequestException: If the remote request fails, times out
            or returns an error status.
        ValueError: If a data URI is malformed or a local file does not
            exist or is not readable.
    """
    # Function-scope import: only this method needs it.
    from urllib.parse import unquote_to_bytes

    # Upper bound in seconds for a remote request; without a timeout an
    # unresponsive server would block the conversion indefinitely.
    remote_timeout = 30

    if src_loc.startswith("data:"):
        header, sep, payload = src_loc.partition(",")
        if not sep:
            raise ValueError("Malformed data URI: missing ',' separator.")
        media_type = header[len("data:") :].lower()
        if media_type.startswith("image/svg"):
            _log.debug("Skipping SVG data URI")
            return None
        # The payload may be percent-encoded, also in the base64 case
        # (e.g. "%3D" for the padding character).
        raw = unquote_to_bytes(payload)
        if media_type.endswith(";base64"):
            return base64.b64decode(raw)
        return raw

    # Also ignore a query string or fragment when checking the extension,
    # e.g. "logo.svg?v=2".
    bare_loc = src_loc.split("?", 1)[0].split("#", 1)[0]
    if src_loc.lower().endswith(".svg") or bare_loc.lower().endswith(".svg"):
        _log.debug(f"Skipping SVG file: {src_loc}")
        return None

    if HTMLDocumentBackend._is_remote_url(src_loc):
        if not self.options.enable_remote_fetch:
            raise OperationNotAllowed(
                "Fetching remote resources is only allowed when set explicitly. "
                "Set options.enable_remote_fetch=True."
            )
        response = requests.get(src_loc, stream=True, timeout=remote_timeout)
        try:
            response.raise_for_status()
            return response.content
        finally:
            # With stream=True the connection is only released once the
            # body is consumed; close explicitly so that an error status
            # does not leave it open.
            response.close()

    if src_loc.startswith("file://"):
        src_loc = src_loc[7:]

    if not self.options.enable_local_fetch:
        raise OperationNotAllowed(
            "Fetching local resources is only allowed when set explicitly. "
            "Set options.enable_local_fetch=True."
        )
    # add check that file exists and can read
    if not (os.path.isfile(src_loc) and os.access(src_loc, os.R_OK)):
        raise ValueError("File does not exist or it is not readable.")
    with open(src_loc, "rb") as f:
        return f.read()
1250+
11451251
@staticmethod
11461252
def get_text(item: PageElement) -> str:
11471253
"""Concatenate all child strings of a PageElement.
@@ -1238,3 +1344,12 @@ def _extract_num(s: str) -> int:
12381344
)
12391345

12401346
return int_spans
1347+
1348+
@staticmethod
1349+
def _get_attr_as_string(tag: Tag, attr: str, default: str = "") -> str:
1350+
"""Get attribute value as string, handling list values."""
1351+
value = tag.get(attr)
1352+
if not value:
1353+
return default
1354+
1355+
return value[0] if isinstance(value, list) else value

0 commit comments

Comments
 (0)