Skip to content

Commit 4295e5f

Browse files
authored
Merge pull request #6 from krrome/bug-support-DocumentStream-as-input
implement possibility to pass source to ResultPostprocessor for processing with pymupdf + add error handling so that ResultPostprocessor falls back to style based inference in case pymupdf can't read the file.
2 parents ca33194 + 97264a9 commit 4295e5f

File tree

5 files changed

+240
-65
lines changed

5 files changed

+240
-65
lines changed

README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,31 @@ result.document.export_to_markdown()
122122
# or use a chunker on it...
123123
```
124124

125+
## FAQ
126+
127+
### Working with DocumentStream sources / PDFFileNotFoundException:
128+
129+
If you run into a `PDFFileNotFoundException`, the `source` argument you passed to `DocumentConverter().convert(source=source)` was either of type `str` or of type `DocumentStream`, so the Docling conversion result unfortunately does *not* hold a valid reference to the source file anymore. Hence the postprocessor needs your help: if `source` was a string, pass `source=source` when instantiating `ResultPostprocessor`. Full example:
130+
131+
```python
132+
from docling.document_converter import DocumentConverter
133+
from hierarchical.postprocessor import ResultPostprocessor
134+
135+
source = "my_file.pdf" # document per local path or URL
136+
converter = DocumentConverter()
137+
result = converter.convert(source)
138+
# the postprocessor modifies the result.document in place.
139+
ResultPostprocessor(result, source=source).process()
140+
# ...
141+
```
142+
143+
If you used a `DocumentStream` object as the source, you unfortunately have to pass either a valid `Path` to the PDF, or a new, open `BytesIO` stream or `DocumentStream` object, as the `source` argument to `ResultPostprocessor`. The reason is that Docling *closes* the source stream when it is finished, so no further reading from that stream is possible.
144+
145+
### Exception handling for ToC extraction from metadata:
146+
147+
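If you want to handle exceptions regarding file I/O or streams yourself, set `raise_on_error` to `True` when instantiating `ResultPostprocessor`. With the default (`raise_on_error=False`) these errors are only logged as warnings and no ToC is extracted from the PDF metadata.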
148+
149+
125150
## Citation
126151

127152
If you use this software for your project please cite Docling as well as the following:

hierarchical/hierarchy_builder_metadata.py

Lines changed: 123 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
11
import re
2+
from collections.abc import Generator
3+
from contextlib import contextmanager
24
from functools import cached_property
5+
from io import BytesIO
36
from logging import Logger
7+
from pathlib import Path, PurePath
8+
from typing import Optional, Union
49

10+
from docling.datamodel.base_models import DocumentStream
511
from docling.datamodel.document import ConversionResult
612
from docling_core.types.doc import BoundingBox, ListItem, TextItem
713
from pymupdf import Document as FitzDocument
@@ -21,79 +27,135 @@ def __init__(self) -> None:
2127
super().__init__("Hierarchy demands equal level heading, but no common parent was found!")
2228

2329

30+
class PDFFileNotFoundException(Exception):
    """Raised when the PDF path used for ToC extraction does not exist."""

    def __init__(self, path: PurePath) -> None:
        message = f"PDF file {path} does not exist!"
        super().__init__(message)
33+
34+
35+
class PDFFileStreamClosed(Exception):
    """Raised when the PDF byte stream is already closed and can no longer be read."""

    def __init__(self) -> None:
        message = "The (byte)stream of the PDF was closed. Can't process this input for ToC extraction."
        super().__init__(message)
38+
39+
40+
class InvalidSourceTypeException(Exception):
    """Raised when the source is neither a path, a string, a DocumentStream nor a BytesIO."""
42+
43+
2444
class HierarchyBuilderMetadata:
25-
def __init__(self, conv_res: ConversionResult, raise_on_error: bool = False):
45+
def __init__(
    self,
    conv_res: ConversionResult,
    source: Optional[Union[PurePath, str, DocumentStream, BytesIO]] = None,
    raise_on_error: bool = False,
):
    """Remember the conversion result and the options used for ToC extraction.

    Args:
        conv_res: Docling conversion result; its ``input.file`` is used as the
            PDF location when no ``source`` is given.
        source: Optional path, string, ``DocumentStream`` or ``BytesIO`` that
            takes precedence over ``conv_res.input.file``.
        raise_on_error: When true, source-related errors raised during ToC
            extraction are re-raised; otherwise they are logged as warnings.
    """
    self.raise_on_error: bool = raise_on_error
    self.source: Optional[Union[PurePath, str, DocumentStream, BytesIO]] = source
    self.conv_res: ConversionResult = conv_res
2854

2955
@cached_property
3056
def toc(self) -> list[tuple]:
3157
return self._extract_toc()
3258

59+
@contextmanager
def _get_source_kwargs(self) -> Generator[dict, None, None]:
    """Yield the keyword arguments used to open the PDF with pymupdf.

    The explicitly supplied ``self.source`` is preferred; when it is ``None``
    the file recorded in the conversion result (``conv_res.input.file``) is
    used instead.

    Yields:
        ``{"filename": ...}`` for path-like sources, or
        ``{"filetype": ..., "stream": ...}`` for stream sources. Streams are
        rewound to position 0 before being handed out.

    Raises:
        PDFFileNotFoundException: The path does not exist on disk.
        PDFFileStreamClosed: The stream has already been closed.
        InvalidSourceTypeException: The source is of an unsupported type.
    """
    source = self.source if self.source is not None else self.conv_res.input.file
    if isinstance(source, str):
        source = Path(source)

    if isinstance(source, PurePath):
        if not Path(source).exists():
            raise PDFFileNotFoundException(source)
        yield {"filename": str(source)}
        return

    # DocumentStream and BytesIO only differ in where the raw stream lives.
    if isinstance(source, DocumentStream):
        stream = source.stream
    elif isinstance(source, BytesIO):
        stream = source
    else:
        # Carry a message so the logged warning is not an empty string.
        raise InvalidSourceTypeException(
            f"Unsupported source type '{type(source).__name__}' for ToC extraction."
        )

    if stream.closed:
        raise PDFFileStreamClosed()
    stream.seek(0)
    # The file name is passed as ``filetype`` so the document type can be derived from it.
    yield {"filetype": str(self.conv_res.input.file), "stream": stream}
87+
3388
def _extract_toc(self) -> list[tuple]:  # noqa: C901
    """Read the PDF's embedded table of contents and locate each heading on its page.

    Returns:
        A list of ``(level, title, page, add_info)`` tuples. ``add_info`` is the
        dict delivered by pymupdf, extended with ``"coords"`` (a ``BoundingBox``)
        when the heading text was found, and with ``"actual_title"`` when it was
        matched via the page's text blocks. The list is empty when the source
        could not be opened and ``raise_on_error`` is false.

    Raises:
        InvalidSourceTypeException, PDFFileStreamClosed, PDFFileNotFoundException:
            Only when ``raise_on_error`` is true; otherwise they are logged as warnings.
    """
    toc_output = []
    try:
        # Open the document as a context manager so it is closed again, even on errors.
        with self._get_source_kwargs() as kwargs, FitzDocument(**kwargs) as doc:
            toc = doc.get_toc(  # type: ignore[attr-defined]
                simple=False
            )  # gives a list of lists [<hierarchy level>, <Header name>, <pdf-page number>, <dict of additional information including position of the bookmark>]
            for level, title, page, add_info in toc:
                # First attempt: search the literal title text on the bookmarked page
                # and merge all hits into one bounding box.
                rects = doc[page - 1].search_for(title)
                this_bbox = None
                for b in rects:
                    if this_bbox is None:
                        this_bbox = BoundingBox(l=b.x0, t=b.y0, r=b.x1, b=b.y1)
                    else:
                        this_bbox = BoundingBox(
                            l=min(b.x0, this_bbox.l),
                            t=min(b.y0, this_bbox.t),
                            r=max(b.x1, this_bbox.r),
                            b=max(b.y1, this_bbox.b),
                        )
                if this_bbox:
                    add_info["coords"] = this_bbox
                # sometimes the bookmark still points to the previous page, but the header is at the top of the current page
                # future todo - instead of this try to use the offset of the bookmark pointer!
                for page_here in [page, page + 1]:
                    if page_here > len(doc):
                        # The bookmark is on the last page; there is no following page to probe.
                        break
                    if "coords" not in add_info:
                        # Second attempt: compare alphanumeric-only text, allowing the
                        # title to be spread over several consecutive text blocks.
                        title_ref = re.sub(r"[^A-Za-z0-9]", "", title)
                        actual_title = ""
                        accum_blocks: list[tuple] = []
                        for block in doc[page_here - 1].get_textpage().extractBLOCKS():
                            potential_title = re.sub(r"[^A-Za-z0-9]", "", block[4])
                            if potential_title == title_ref and not accum_blocks:
                                actual_title += potential_title
                                add_info["coords"] = BoundingBox(l=block[0], t=block[1], r=block[2], b=block[3])
                                add_info["actual_title"] = actual_title
                                page = page_here
                                break
                            elif potential_title and title_ref.startswith(potential_title):
                                accum_blocks.append(block)
                                actual_title += potential_title
                                title_ref = title_ref[len(potential_title) :]
                                if len(title_ref) == 0:
                                    # The whole title has been consumed: merge the blocks' boxes.
                                    this_bbox = None
                                    for b in accum_blocks:
                                        if this_bbox is None:
                                            this_bbox = BoundingBox(l=b[0], t=b[1], r=b[2], b=b[3])
                                        else:
                                            this_bbox = BoundingBox(
                                                l=min(b[0], this_bbox.l),
                                                t=min(b[1], this_bbox.t),
                                                r=max(b[2], this_bbox.r),
                                                b=max(b[3], this_bbox.b),
                                            )
                                    add_info["coords"] = this_bbox
                                    add_info["actual_title"] = actual_title
                                    page = page_here
                                    break
                    if "coords" in add_info:
                        break
                if "coords" not in add_info:
                    logger.warning(f"WARNING: Could not find title '{title}', which was mentioned in TOC. ")
                toc_output.append((level, title, page, add_info))
    except (InvalidSourceTypeException, PDFFileStreamClosed, PDFFileNotFoundException) as e:
        if self.raise_on_error:
            raise
        logger.warning(e)
    return toc_output
98160

99161
def infer(self) -> HierarchicalHeader:

hierarchical/postprocessor.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
from functools import cached_property
2+
from io import BytesIO
3+
from pathlib import PurePath
4+
from typing import Optional, Union
25

6+
from docling.datamodel.base_models import DocumentStream
37
from docling.datamodel.document import ConversionResult
48
from docling_core.types.doc.document import (
59
DocItem,
@@ -48,8 +52,15 @@ def set_item_in_doc(doc: DoclingDocument, item: DocItem) -> None:
4852

4953

5054
class ResultPostprocessor:
51-
def __init__(self, result: ConversionResult):
55+
def __init__(
    self,
    result: ConversionResult,
    source: Optional[Union[PurePath, str, DocumentStream, BytesIO]] = None,
    raise_on_error: bool = False,
):
    """Keep the conversion result and the options forwarded to ``HierarchyBuilderMetadata``.

    Args:
        result: Docling conversion result to post-process.
        source: Optional path, string, ``DocumentStream`` or ``BytesIO`` of the PDF.
        raise_on_error: Error-handling flag handed on to ``HierarchyBuilderMetadata``.
    """
    self.raise_on_error = raise_on_error
    self.source = source
    self.result = result
5364

5465
@cached_property
5566
def has_hierarchy_levels(self) -> bool:
@@ -117,7 +128,7 @@ def get_headers(self) -> list[dict]:
117128
return items
118129

119130
def process(self) -> None: # noqa: C901
120-
hbm = HierarchyBuilderMetadata(self.result)
131+
hbm = HierarchyBuilderMetadata(self.result, self.source, self.raise_on_error)
121132
header_correction = False
122133
if len(hbm.toc) > 0:
123134
root = hbm.infer()

tests/test_metadata_toc.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def test_convert():
2525
source = sample_path / "sample_document.pdf" # document per local path or URL
2626
converter = DocumentConverter()
2727
result = converter.convert(source)
28-
hbm = HierarchyBuilderMetadata(result, [])
28+
hbm = HierarchyBuilderMetadata(result)
2929
root = hbm.infer()
3030
assert str(root) == ref_output
3131

@@ -126,6 +126,6 @@ def test_convert_r10():
126126
source = sample_path / "R-10-00.pdf" # document per local path or URL
127127
converter = DocumentConverter()
128128
result = converter.convert(source)
129-
hbm = HierarchyBuilderMetadata(result, [])
129+
hbm = HierarchyBuilderMetadata(result)
130130
root = hbm.infer()
131131
assert str(root) == ref_output

tests/test_postprocessing.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1+
from io import BytesIO
12
from pathlib import Path
23

34
import pytest
5+
from docling.datamodel.base_models import DocumentStream
46
from docling.document_converter import DocumentConverter
57

8+
from hierarchical.hierarchy_builder_metadata import PDFFileNotFoundException, PDFFileStreamClosed
69
from hierarchical.postprocessor import ResultPostprocessor
710

811
results_path = Path(__file__).parent / "results"
@@ -72,6 +75,80 @@ def test_result_postprocessor_textpdf():
7275
assert item.text in allowed_headers
7376

7477

78+
def test_result_postprocessor_textpdf_stream():
    """Post-process a PDF that was converted from a ``DocumentStream``.

    Covers the two expected failures (no usable source, already closed stream)
    and the successful run with a freshly opened stream.
    """
    source_path = sample_path / "sample_document.pdf"  # document per local path or URL
    source = DocumentStream(name=source_path.name, stream=BytesIO(source_path.read_bytes()))
    converter = DocumentConverter()
    result = converter.convert(source)

    # Without an explicit source there is no readable file behind the conversion result.
    with pytest.raises(PDFFileNotFoundException):
        ResultPostprocessor(result, raise_on_error=True).process()

    # The stream handed to the converter is closed once the conversion is done.
    with pytest.raises(PDFFileStreamClosed):
        ResultPostprocessor(result, source=source, raise_on_error=True).process()

    # A fresh, open stream must work.
    source = DocumentStream(name=source_path.name, stream=BytesIO(source_path.read_bytes()))
    ResultPostprocessor(result, source=source, raise_on_error=True).process()

    compare(result.document.export_to_markdown(), "sample_document.md")

    allowed_headers_res = [item_ref.resolve(result.document).text for item_ref in result.document.body.children]
    print(allowed_headers_res)

    allowed_headers = [
        "Some kind of text document",
        "1. Introduction",
        "1.1 Background",
        "1.2 Purpose",
        "2. Main Content",
        "2.1 Section One",
        "2.1.1 Subsection",
        "2.1.2 Another Subsection",
        "2.2 Section Two",
        "3. Conclusion",
    ]

    for item_ref in result.document.body.children:
        item = item_ref.resolve(result.document)
        assert item.text in allowed_headers
120+
121+
122+
def test_result_postprocessor_textpdf_string():
    """Post-process a PDF converted from a ``str`` path, passing that string as ``source``."""
    pdf_location = str(sample_path / "sample_document.pdf")  # document per local path or URL
    conversion = DocumentConverter().convert(pdf_location)
    ResultPostprocessor(conversion, source=pdf_location).process()

    compare(conversion.document.export_to_markdown(), "sample_document.md")

    top_level_texts = [ref.resolve(conversion.document).text for ref in conversion.document.body.children]
    print(top_level_texts)

    expected_headers = [
        "Some kind of text document",
        "1. Introduction",
        "1.1 Background",
        "1.2 Purpose",
        "2. Main Content",
        "2.1 Section One",
        "2.1.1 Subsection",
        "2.1.2 Another Subsection",
        "2.2 Section Two",
        "3. Conclusion",
    ]

    # Every top-level child of the body must be one of the known headers.
    for ref in conversion.document.body.children:
        resolved = ref.resolve(conversion.document)
        assert resolved.text in expected_headers
150+
151+
75152
@pytest.mark.skip(
76153
reason="just another example like test_result_postprocessor_textpdf. Not necessary for automated tests."
77154
)

0 commit comments

Comments
 (0)