Merge pull request #6 from krrome/bug-support-DocumentStream-as-input

krrome · web-flow · commit 4295e5f8e8eb · 2025-10-13T21:02:26.000+02:00
implement possibility to pass source to ResultPostprocessor for processing with pymupdf + add error handling so that ResultPostprocessor falls back to style based inference in case pymupdf can't read the file.
diff --git a/README.md b/README.md
@@ -122,6 +122,31 @@ result.document.export_to_markdown()
 # or use a chunker on it...
 ```
 
+## FAQ
+
+### Working with DocumentStream sources / PDFFileNotFoundException:
+
+If you run into a `PDFFileNotFoundException`, the `source` argument you passed to `DocumentConverter().convert(source=source)` was either of type `str` or of type `DocumentStream`, so the Docling conversion result unfortunately does *not* hold a valid reference to the source file anymore. Hence the postprocessor needs your help: if `source` was a string, pass `source=source` when instantiating `ResultPostprocessor`. Full example:
+
+```python
+from docling.document_converter import DocumentConverter
+from hierarchical.postprocessor import ResultPostprocessor
+
+source = "my_file.pdf"  # local path to the PDF (ResultPostprocessor checks that this path exists on disk)
+converter = DocumentConverter()
+result = converter.convert(source)
+# the postprocessor modifies the result.document in place.
+ResultPostprocessor(result, source=source).process()
+# ...
+```
+
+If you used a `DocumentStream` object as the source, you will unfortunately have to pass either a valid path to the PDF or a new, open `BytesIO` stream or `DocumentStream` object as the `source` argument to `ResultPostprocessor`. The reason is that Docling *closes* the source stream when it is finished, so no further reading from that stream is possible.
+
+### Exception handling for ToC extraction from metadata:
+
+If you want to handle file I/O and stream exceptions yourself, set `raise_on_error` to `True` when instantiating `ResultPostprocessor`. With the default (`False`), these exceptions are logged as warnings and the ToC extraction from metadata is skipped.
+
+
 ## Citation
 
 If you use this software for your project please cite Docling as well as the following:
diff --git a/hierarchical/hierarchy_builder_metadata.py b/hierarchical/hierarchy_builder_metadata.py
@@ -1,7 +1,13 @@
 import re
+from collections.abc import Generator
+from contextlib import contextmanager
 from functools import cached_property
+from io import BytesIO
 from logging import Logger
+from pathlib import Path, PurePath
+from typing import Optional, Union
 
+from docling.datamodel.base_models import DocumentStream
 from docling.datamodel.document import ConversionResult
 from docling_core.types.doc import BoundingBox, ListItem, TextItem
 from pymupdf import Document as FitzDocument
@@ -21,79 +27,135 @@ def __init__(self) -> None:
         super().__init__("Hierarchy demands equal level heading, but no common parent was found!")
 
 
+class PDFFileNotFoundException(Exception):
+    def __init__(self, path: PurePath) -> None:
+        super().__init__(f"PDF file {path} does not exist!")
+
+
+class PDFFileStreamClosed(Exception):
+    def __init__(self) -> None:
+        super().__init__("The (byte)stream of the PDF was closed. Can't process this input for ToC extraction.")
+
+
+class InvalidSourceTypeException(Exception):
+    pass
+
+
 class HierarchyBuilderMetadata:
-    def __init__(self, conv_res: ConversionResult, raise_on_error: bool = False):
+    def __init__(
+        self,
+        conv_res: ConversionResult,
+        source: Optional[Union[PurePath, str, DocumentStream, BytesIO]] = None,
+        raise_on_error: bool = False,
+    ):
         self.conv_res: ConversionResult = conv_res
+        self.source: Optional[Union[PurePath, str, DocumentStream, BytesIO]] = source
         self.raise_on_error: bool = raise_on_error
 
     @cached_property
     def toc(self) -> list[tuple]:
         return self._extract_toc()
 
+    @contextmanager
+    def _get_source_kwargs(self) -> Generator[dict]:
+        source = self.source
+        if source is None:
+            source = self.conv_res.input.file
+        if isinstance(source, str):
+            source = Path(source)
+        if isinstance(source, PurePath):
+            if not Path(source).exists():
+                raise PDFFileNotFoundException(source)
+            else:
+                yield {"filename": str(source)}
+        elif isinstance(source, DocumentStream):
+            stream = source.stream
+            if stream.closed:
+                raise PDFFileStreamClosed()
+            else:
+                stream.seek(0)
+                yield {"filetype": str(self.conv_res.input.file), "stream": stream}
+        elif isinstance(source, BytesIO):
+            stream = source
+            if stream.closed:
+                raise PDFFileStreamClosed()
+            else:
+                stream.seek(0)
+                yield {"filetype": str(self.conv_res.input.file), "stream": stream}
+        else:
+            raise InvalidSourceTypeException()
+
     def _extract_toc(self) -> list[tuple]:  # noqa: C901
-        with FitzDocument(self.conv_res.input.file) as doc:
-            toc = doc.get_toc(
-                simple=False
-            )  # gives a list of lists [<hierarchy level>, <Header name>, <pdf-page number>, <dict of additional information including position of the bookmark>]
-            # pages_dicts = {}
-            toc_output = []
-            for level, title, page, add_info in toc:
-                # alternative
-                rects = doc[page - 1].search_for(title)
-                # doc[page - 1].get_pixmap(clip=rects[0]).save("rect_x.png")
-                this_bbox = None
-                for b in rects:
-                    if this_bbox is None:
-                        this_bbox = BoundingBox(l=b.x0, t=b.y0, r=b.x1, b=b.y1)
-                    else:
-                        this_bbox = BoundingBox(
-                            l=min(b.x0, this_bbox.l),
-                            t=min(b.y0, this_bbox.t),
-                            r=max(b.x1, this_bbox.r),
-                            b=max(b.y1, this_bbox.b),
-                        )
-                if this_bbox:
-                    add_info["coords"] = this_bbox
-                # sometimes the bookmark still points to the previous page, but the header is at the top of the current page
-                # future todo - instead of this try to use the offset of the bookmark pointer!
-                for page_here in [page, page + 1]:
-                    if "coords" not in add_info:
-                        title_ref = re.sub(r"[^A-Za-z0-9]", "", title)
-                        actual_title = ""
-                        accum_blocks: list[tuple] = []
-                        for block in doc[page_here - 1].get_textpage().extractBLOCKS():
-                            potential_title = re.sub(r"[^A-Za-z0-9]", "", block[4])
-                            if potential_title == title_ref and not accum_blocks:
-                                actual_title += potential_title
-                                add_info["coords"] = BoundingBox(l=block[0], t=block[1], r=block[2], b=block[3])
-                                add_info["actual_title"] = actual_title
-                                page = page_here
-                                break
-                            elif potential_title and title_ref.startswith(potential_title):
-                                accum_blocks.append(block)
-                                actual_title += potential_title
-                                title_ref = title_ref[len(potential_title) :]
-                                if len(title_ref) == 0:
-                                    this_bbox = None
-                                    for b in accum_blocks:
-                                        if this_bbox is None:
-                                            this_bbox = BoundingBox(l=b[0], t=b[1], r=b[2], b=b[3])
-                                        else:
-                                            this_bbox = BoundingBox(
-                                                l=min(b[0], this_bbox.l),
-                                                t=min(b[1], this_bbox.t),
-                                                r=max(b[2], this_bbox.r),
-                                                b=max(b[3], this_bbox.b),
-                                            )
-                                    add_info["coords"] = this_bbox
+        toc_output = []
+        try:
+            with self._get_source_kwargs() as kwargs:
+                doc = FitzDocument(**kwargs)
+                toc = doc.get_toc(  # type: ignore[attr-defined]
+                    simple=False
+                )  # gives a list of lists [<hierarchy level>, <Header name>, <pdf-page number>, <dict of additional information including position of the bookmark>]
+                # pages_dicts = {}
+                for level, title, page, add_info in toc:
+                    # alternative
+                    rects = doc[page - 1].search_for(title)
+                    # doc[page - 1].get_pixmap(clip=rects[0]).save("rect_x.png")
+                    this_bbox = None
+                    for b in rects:
+                        if this_bbox is None:
+                            this_bbox = BoundingBox(l=b.x0, t=b.y0, r=b.x1, b=b.y1)
+                        else:
+                            this_bbox = BoundingBox(
+                                l=min(b.x0, this_bbox.l),
+                                t=min(b.y0, this_bbox.t),
+                                r=max(b.x1, this_bbox.r),
+                                b=max(b.y1, this_bbox.b),
+                            )
+                    if this_bbox:
+                        add_info["coords"] = this_bbox
+                    # sometimes the bookmark still points to the previous page, but the header is at the top of the current page
+                    # future todo - instead of this try to use the offset of the bookmark pointer!
+                    for page_here in [page, page + 1]:
+                        if "coords" not in add_info:
+                            title_ref = re.sub(r"[^A-Za-z0-9]", "", title)
+                            actual_title = ""
+                            accum_blocks: list[tuple] = []
+                            for block in doc[page_here - 1].get_textpage().extractBLOCKS():
+                                potential_title = re.sub(r"[^A-Za-z0-9]", "", block[4])
+                                if potential_title == title_ref and not accum_blocks:
+                                    actual_title += potential_title
+                                    add_info["coords"] = BoundingBox(l=block[0], t=block[1], r=block[2], b=block[3])
                                     add_info["actual_title"] = actual_title
                                     page = page_here
                                     break
-                    if "coords" in add_info:
-                        break
-                if "coords" not in add_info:
-                    logger.warning(f"WARNING: Could not find title '{title}', which was mentioned in TOC. ")
-                toc_output.append((level, title, page, add_info))
+                                elif potential_title and title_ref.startswith(potential_title):
+                                    accum_blocks.append(block)
+                                    actual_title += potential_title
+                                    title_ref = title_ref[len(potential_title) :]
+                                    if len(title_ref) == 0:
+                                        this_bbox = None
+                                        for b in accum_blocks:
+                                            if this_bbox is None:
+                                                this_bbox = BoundingBox(l=b[0], t=b[1], r=b[2], b=b[3])
+                                            else:
+                                                this_bbox = BoundingBox(
+                                                    l=min(b[0], this_bbox.l),
+                                                    t=min(b[1], this_bbox.t),
+                                                    r=max(b[2], this_bbox.r),
+                                                    b=max(b[3], this_bbox.b),
+                                                )
+                                        add_info["coords"] = this_bbox
+                                        add_info["actual_title"] = actual_title
+                                        page = page_here
+                                        break
+                        if "coords" in add_info:
+                            break
+                    if "coords" not in add_info:
+                        logger.warning(f"WARNING: Could not find title '{title}', which was mentioned in TOC. ")
+                    toc_output.append((level, title, page, add_info))
+        except (InvalidSourceTypeException, PDFFileStreamClosed, PDFFileNotFoundException) as e:
+            if self.raise_on_error:
+                raise
+            else:
+                logger.warning(e)
         return toc_output
 
     def infer(self) -> HierarchicalHeader:
diff --git a/hierarchical/postprocessor.py b/hierarchical/postprocessor.py
@@ -1,5 +1,9 @@
 from functools import cached_property
+from io import BytesIO
+from pathlib import PurePath
+from typing import Optional, Union
 
+from docling.datamodel.base_models import DocumentStream
 from docling.datamodel.document import ConversionResult
 from docling_core.types.doc.document import (
     DocItem,
@@ -48,8 +52,15 @@ def set_item_in_doc(doc: DoclingDocument, item: DocItem) -> None:
 
 
 class ResultPostprocessor:
-    def __init__(self, result: ConversionResult):
+    def __init__(
+        self,
+        result: ConversionResult,
+        source: Optional[Union[PurePath, str, DocumentStream, BytesIO]] = None,
+        raise_on_error: bool = False,
+    ):
         self.result = result
+        self.source = source
+        self.raise_on_error = raise_on_error
 
     @cached_property
     def has_hierarchy_levels(self) -> bool:
@@ -117,7 +128,7 @@ def get_headers(self) -> list[dict]:
         return items
 
     def process(self) -> None:  # noqa: C901
-        hbm = HierarchyBuilderMetadata(self.result)
+        hbm = HierarchyBuilderMetadata(self.result, self.source, self.raise_on_error)
         header_correction = False
         if len(hbm.toc) > 0:
             root = hbm.infer()
diff --git a/tests/test_metadata_toc.py b/tests/test_metadata_toc.py
@@ -25,7 +25,7 @@ def test_convert():
     source = sample_path / "sample_document.pdf"  # document per local path or URL
     converter = DocumentConverter()
     result = converter.convert(source)
-    hbm = HierarchyBuilderMetadata(result, [])
+    hbm = HierarchyBuilderMetadata(result)
     root = hbm.infer()
     assert str(root) == ref_output
 
@@ -126,6 +126,6 @@ def test_convert_r10():
     source = sample_path / "R-10-00.pdf"  # document per local path or URL
     converter = DocumentConverter()
     result = converter.convert(source)
-    hbm = HierarchyBuilderMetadata(result, [])
+    hbm = HierarchyBuilderMetadata(result)
     root = hbm.infer()
     assert str(root) == ref_output
diff --git a/tests/test_postprocessing.py b/tests/test_postprocessing.py
@@ -1,8 +1,11 @@
+from io import BytesIO
 from pathlib import Path
 
 import pytest
+from docling.datamodel.base_models import DocumentStream
 from docling.document_converter import DocumentConverter
 
+from hierarchical.hierarchy_builder_metadata import PDFFileNotFoundException, PDFFileStreamClosed
 from hierarchical.postprocessor import ResultPostprocessor
 
 results_path = Path(__file__).parent / "results"
@@ -72,6 +75,80 @@ def test_result_postprocessor_textpdf():
         assert item.text in allowed_headers
 
 
+def test_result_postprocessor_textpdf_stream():
+    source_path = sample_path / "sample_document.pdf"  # document per local path or URL
+    with source_path.open("rb") as fh:
+        source = DocumentStream(name=source_path.name, stream=BytesIO(fh.read()))
+    converter = DocumentConverter()
+    result = converter.convert(source)
+    try:
+        ResultPostprocessor(result, raise_on_error=True).process()
+        raise Exception("FAIL NO STREAM!")  # noqa: TRY002 TRY003
+    except PDFFileNotFoundException:
+        pass
+    try:
+        ResultPostprocessor(result, source=source, raise_on_error=True).process()
+        raise Exception("FAIL STREAM CLOSED!")  # noqa: TRY002 TRY003
+    except PDFFileStreamClosed:
+        pass
+
+    with source_path.open("rb") as fh:
+        source = DocumentStream(name=source_path.name, stream=BytesIO(fh.read()))
+    ResultPostprocessor(result, source=source, raise_on_error=True).process()
+
+    compare(result.document.export_to_markdown(), "sample_document.md")
+
+    allowed_headers_res = [item_ref.resolve(result.document).text for item_ref in result.document.body.children]
+    print(allowed_headers_res)
+
+    allowed_headers = [
+        "Some kind of text document",
+        "1. Introduction",
+        "1.1 Background",
+        "1.2 Purpose",
+        "2. Main Content",
+        "2.1 Section One",
+        "2.1.1 Subsection",
+        "2.1.2 Another Subsection",
+        "2.2 Section Two",
+        "3. Conclusion",
+    ]
+
+    for item_ref in result.document.body.children:
+        item = item_ref.resolve(result.document)
+        assert item.text in allowed_headers
+
+
+def test_result_postprocessor_textpdf_string():
+    source_path = sample_path / "sample_document.pdf"  # document per local path or URL
+    source = str(source_path)
+    converter = DocumentConverter()
+    result = converter.convert(source)
+    ResultPostprocessor(result, source=source).process()
+
+    compare(result.document.export_to_markdown(), "sample_document.md")
+
+    allowed_headers_res = [item_ref.resolve(result.document).text for item_ref in result.document.body.children]
+    print(allowed_headers_res)
+
+    allowed_headers = [
+        "Some kind of text document",
+        "1. Introduction",
+        "1.1 Background",
+        "1.2 Purpose",
+        "2. Main Content",
+        "2.1 Section One",
+        "2.1.1 Subsection",
+        "2.1.2 Another Subsection",
+        "2.2 Section Two",
+        "3. Conclusion",
+    ]
+
+    for item_ref in result.document.body.children:
+        item = item_ref.resolve(result.document)
+        assert item.text in allowed_headers
+
+
 @pytest.mark.skip(
     reason="just another example like test_result_postprocessor_textpdf. Not necessary for automated tests."
 )