Skip to content

Commit 0197102

Browse files
authored
fix: Set flags for SegmentedPage correctly (#127)
Signed-off-by: Christoph Auer <[email protected]>
1 parent f93d9c8 commit 0197102

File tree

3 files changed

+181
-32
lines changed

3 files changed

+181
-32
lines changed

docling_parse/pdf_parser.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -318,11 +318,13 @@ def _to_segmented_page(
318318
self, page: dict, create_words: bool, create_textlines: bool
319319
) -> SegmentedPdfPage:
320320

321+
char_cells = self._to_cells(page["cells"])
321322
segmented_page = SegmentedPdfPage(
322323
dimension=self._to_page_geometry(page["dimension"]),
323-
char_cells=self._to_cells(page["cells"]),
324+
char_cells=char_cells,
324325
word_cells=[],
325326
textline_cells=[],
327+
has_chars=len(char_cells) > 0,
326328
bitmap_resources=self._to_bitmap_resources(page["images"]),
327329
lines=self._to_lines(page["lines"]),
328330
)
@@ -360,6 +362,8 @@ def _create_word_cells(
360362
cell = PdfTextCell.model_validate(item)
361363
segmented_page.word_cells.append(cell)
362364

365+
segmented_page.has_words = len(segmented_page.word_cells) > 0
366+
363367
def _create_textline_cells(
364368
self, segmented_page: SegmentedPdfPage, _loglevel: str = "fatal"
365369
):
@@ -390,6 +394,8 @@ def _create_textline_cells(
390394
cell = PdfTextCell.model_validate(item)
391395
segmented_page.textline_cells.append(cell)
392396

397+
segmented_page.has_lines = len(segmented_page.textline_cells) > 0
398+
393399
def _to_parsed_document(
394400
self,
395401
doc_dict: dict,

0 commit comments

Comments
 (0)