From 3287487351dd09ac455dafcb907448d7e2f834d4 Mon Sep 17 00:00:00 2001
From: basavarm
Date: Mon, 2 Jun 2025 12:27:15 +0000
Subject: [PATCH] fix : add content layer options to layout.

Add a `content_layer_options` argument to the layout parser and pass it to
`DoclingDocument.iterate_items` as `included_content_layers`, so callers can
choose which content layers are iterated. When the argument is omitted, only
`ContentLayer.BODY` is used.

The default is `None` and is resolved inside `__init__`, so the default value
matches the `set[ContentLayer]` annotation and no mutable object is shared
between instances.
---
 spacy_layout/layout.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py
index a699230..15dc85c 100644
--- a/spacy_layout/layout.py
+++ b/spacy_layout/layout.py
@@ -14,7 +14,7 @@
 import srsly
 from docling.datamodel.base_models import DocumentStream
 from docling.document_converter import DocumentConverter
-from docling_core.types.doc.document import DoclingDocument
+from docling_core.types.doc.document import ContentLayer, DoclingDocument
 from docling_core.types.doc.labels import DocItemLabel
 from spacy.tokens import Doc, Span, SpanGroup
 
@@ -53,6 +53,7 @@ def __init__(
         ],
         display_table: Callable[["DataFrame"], str] | str = TABLE_PLACEHOLDER,
         docling_options: dict["InputFormat", "FormatOption"] | None = None,
+        content_layer_options: set[ContentLayer] | None = None,
     ) -> None:
         """Initialize the layout parser and Docling converter."""
         self.nlp = nlp
@@ -70,6 +71,11 @@ def __init__(
         self.headings = headings
         self.display_table = display_table
         self.converter = DocumentConverter(format_options=docling_options)
+        # Resolve the default here rather than in the signature, so that no
+        # mutable default object is shared between instances
+        if content_layer_options is None:
+            content_layer_options = {ContentLayer.BODY}
+        self.content_layer_options = content_layer_options
         # Set spaCy extension attributes for custom data
         Doc.set_extension(self.attrs.doc_layout, default=None, force=True)
         Doc.set_extension(self.attrs.doc_pages, getter=self.get_pages, force=True)
@@ -142,7 +148,9 @@ def _result_to_doc(self, document: DoclingDocument) -> Doc:
         text_items = {item.self_ref: item for item in document.texts}
         table_items = {item.self_ref: item for item in document.tables}
         # We want to iterate over the tree to get different elements in order
-        for node, _ in document.iterate_items():
+        for node, _ in document.iterate_items(
+            included_content_layers=self.content_layer_options
+        ):
             if node.self_ref in text_items:
                 item = text_items[node.self_ref]
                 if item.text == "":