Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions spacy_layout/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import srsly
from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.doc.document import DoclingDocument, ContentLayer
from docling_core.types.doc.labels import DocItemLabel
from spacy.tokens import Doc, Span, SpanGroup

Expand Down Expand Up @@ -53,6 +53,7 @@ def __init__(
],
display_table: Callable[["DataFrame"], str] | str = TABLE_PLACEHOLDER,
docling_options: dict["InputFormat", "FormatOption"] | None = None,
content_layer_options: set[ContentLayer] = [ContentLayer.BODY],
) -> None:
"""Initialize the layout parser and Docling converter."""
self.nlp = nlp
Expand All @@ -70,6 +71,7 @@ def __init__(
self.headings = headings
self.display_table = display_table
self.converter = DocumentConverter(format_options=docling_options)
self.content_layer_options = content_layer_options
# Set spaCy extension attributes for custom data
Doc.set_extension(self.attrs.doc_layout, default=None, force=True)
Doc.set_extension(self.attrs.doc_pages, getter=self.get_pages, force=True)
Expand Down Expand Up @@ -142,7 +144,9 @@ def _result_to_doc(self, document: DoclingDocument) -> Doc:
text_items = {item.self_ref: item for item in document.texts}
table_items = {item.self_ref: item for item in document.tables}
# We want to iterate over the tree to get different elements in order
for node, _ in document.iterate_items():
for node, _ in document.iterate_items(
included_content_layers=self.content_layer_options
):
if node.self_ref in text_items:
item = text_items[node.self_ref]
if item.text == "":
Expand Down