Skip to content

Commit ad88ecf

Browse files
authored
fix: Add unit flags to SegmentedPage (#286)
Add unit flags to SegmentedPage Signed-off-by: Christoph Auer <[email protected]>
1 parent 7f83f1c commit ad88ecf

File tree

2 files changed

+20
-1
lines changed

2 files changed

+20
-1
lines changed

.flake8

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[flake8]
22
per-file-ignores = __init__.py:F401
3-
max-line-length = 88
3+
max-line-length = 120
44
exclude = test/*
55
max-complexity = 25
66
docstring-convention = google

docling_core/types/doc/page.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,8 +472,27 @@ class SegmentedPage(BaseModel):
472472
word_cells: List[TextCell] = []
473473
textline_cells: List[TextCell] = []
474474

475+
# These flags are set to differentiate whether the above lists of this SegmentedPage
476+
# are empty (page had no content) or have not been computed (e.g. textline_cells may be present
477+
# but word_cells are not).
478+
has_chars: bool = False
479+
has_words: bool = False
480+
has_lines: bool = False
481+
475482
image: Optional[ImageRef] = None
476483

484+
@model_validator(mode="after")
485+
def validate_page(self) -> "SegmentedPage":
486+
"""Set has_lines/has_words/has_chars to True when the matching cell list is non-empty; never reset them to False."""
487+
if len(self.textline_cells) > 0:
488+
self.has_lines = True
489+
if len(self.word_cells) > 0:
490+
self.has_words = True
491+
if len(self.char_cells) > 0:
492+
self.has_chars = True
493+
494+
return self
495+
477496
def iterate_cells(self, unit_type: TextCellUnit) -> Iterator[TextCell]:
478497
"""Iterate through text cells of the specified unit type.
479498

0 commit comments

Comments
 (0)