Skip to content

Commit 59191a2

Browse files
committed
fix: standardization of page_no to 1-based indexing (#2654)
Signed-off-by: ryyhan <dayel.rehan@gmail.com>
1 parent 2fe9def commit 59191a2

File tree

5 files changed

+17
-17
lines changed

5 files changed

+17
-17
lines changed

docling/models/base_model.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -213,7 +213,7 @@ def prepare_element(
213213
coord_origin=bbox.coord_origin,
214214
)
215215

216-
page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
216+
page_ix = element_prov.page_no - 1
217217
cropped_image = conv_res.pages[page_ix].get_image(
218218
scale=self.images_scale, cropbox=expanded_bbox
219219
)

docling/models/stages/reading_order/readingorder_model.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def _add_child_elements(
8181
for child in element.cluster.children:
8282
c_label = child.label
8383
c_bbox = child.bbox.to_bottom_left_origin(
84-
doc.pages[element.page_no + 1].size.height
84+
doc.pages[element.page_no].size.height
8585
)
8686
c_text = " ".join(
8787
[
@@ -92,7 +92,7 @@ def _add_child_elements(
9292
)
9393

9494
c_prov = ProvenanceItem(
95-
page_no=element.page_no + 1, charspan=(0, len(c_text)), bbox=c_bbox
95+
page_no=element.page_no, charspan=(0, len(c_text)), bbox=c_bbox
9696
)
9797
if c_label == DocItemLabel.LIST_ITEM:
9898
# TODO: Infer if this is a numbered or a bullet list item
@@ -142,7 +142,7 @@ def _readingorder_elements_to_docling_doc(
142142
out_doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
143143

144144
for page in conv_res.pages:
145-
page_no = page.page_no + 1
145+
page_no = page.page_no
146146
size = page.size
147147

148148
assert size is not None, "Page size is not initialized."
@@ -174,7 +174,7 @@ def _readingorder_elements_to_docling_doc(
174174
if element.label == DocItemLabel.CODE:
175175
cap_text = element.text
176176
prov = ProvenanceItem(
177-
page_no=element.page_no + 1,
177+
page_no=element.page_no,
178178
charspan=(0, len(cap_text)),
179179
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
180180
)
@@ -230,7 +230,7 @@ def _readingorder_elements_to_docling_doc(
230230
)
231231

232232
prov = ProvenanceItem(
233-
page_no=element.page_no + 1,
233+
page_no=element.page_no,
234234
charspan=(0, 0),
235235
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
236236
)
@@ -286,7 +286,7 @@ def _readingorder_elements_to_docling_doc(
286286
elif isinstance(element, FigureElement):
287287
cap_text = ""
288288
prov = ProvenanceItem(
289-
page_no=element.page_no + 1,
289+
page_no=element.page_no,
290290
charspan=(0, len(cap_text)),
291291
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
292292
)
@@ -330,7 +330,7 @@ def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
330330
assert isinstance(elem, TextElement)
331331
text = elem.text
332332
prov = ProvenanceItem(
333-
page_no=elem.page_no + 1,
333+
page_no=elem.page_no,
334334
charspan=(0, len(text)),
335335
bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
336336
)
@@ -343,7 +343,7 @@ def _handle_text_element(self, element, out_doc, current_list, page_height):
343343
cap_text = element.text
344344

345345
prov = ProvenanceItem(
346-
page_no=element.page_no + 1,
346+
page_no=element.page_no,
347347
charspan=(0, len(cap_text)),
348348
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
349349
)
@@ -391,7 +391,7 @@ def _merge_elements(self, element, merged_elem, new_item, page_height):
391391
"Labels of merged elements must match."
392392
)
393393
prov = ProvenanceItem(
394-
page_no=merged_elem.page_no + 1,
394+
page_no=merged_elem.page_no,
395395
charspan=(
396396
len(new_item.text) + 1,
397397
len(new_item.text) + 1 + len(merged_elem.text),

docling/pipeline/base_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
216216
for i in range(conv_res.input.page_count):
217217
start_page, end_page = conv_res.input.limits.page_range
218218
if (start_page - 1) <= i <= (end_page - 1):
219-
conv_res.pages.append(Page(page_no=i))
219+
conv_res.pages.append(Page(page_no=i + 1))
220220

221221
try:
222222
total_pages_processed = 0

docling/pipeline/legacy_standard_pdf_pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ def get_ocr_model(self, artifacts_path: Optional[Path] = None) -> BaseOcrModel:
145145

146146
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
147147
with TimeRecorder(conv_res, "page_init"):
148-
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
148+
page._backend = conv_res.input._backend.load_page(page.page_no - 1) # type: ignore
149149
if page._backend is not None and page._backend.is_valid():
150150
page.size = page._backend.get_size()
151151

@@ -176,7 +176,7 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
176176
if self.pipeline_options.generate_page_images:
177177
for page in conv_res.pages:
178178
assert page.image is not None
179-
page_no = page.page_no + 1
179+
page_no = page.page_no
180180
conv_res.document.pages[page_no].image = ImageRef.from_pil(
181181
page.image, dpi=int(72 * self.pipeline_options.images_scale)
182182
)

docling/pipeline/standard_pdf_pipeline.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,7 @@ def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
372372
assert isinstance(backend, PdfDocumentBackend), (
373373
"Threaded pipeline only supports PdfDocumentBackend."
374374
)
375-
page_backend = backend.load_page(page.page_no)
375+
page_backend = backend.load_page(page.page_no - 1)
376376
page._backend = page_backend
377377
if page_backend.is_valid():
378378
page.size = page_backend.get_size()
@@ -603,7 +603,7 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
603603
pages: list[Page] = []
604604
for i in range(conv_res.input.page_count):
605605
if start_page - 1 <= i <= end_page - 1:
606-
page = Page(page_no=i)
606+
page = Page(page_no=i + 1)
607607
conv_res.pages.append(page)
608608
pages.append(page)
609609

@@ -717,7 +717,7 @@ def _integrate_results(
717717
]
718718
# Add error details from failed pages
719719
for page_no, error in proc.failed_pages:
720-
page_label = f"Page {page_no + 1}" if page_no >= 0 else "Unknown page"
720+
page_label = f"Page {page_no}" if page_no > 0 else "Unknown page"
721721
error_msg = str(error) if error else ""
722722
error_item = ErrorItem(
723723
component_type=DoclingComponentType.PIPELINE,
@@ -762,7 +762,7 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
762762
if self.pipeline_options.generate_page_images:
763763
for page in conv_res.pages:
764764
assert page.image is not None
765-
page_no = page.page_no + 1
765+
page_no = page.page_no
766766
conv_res.document.pages[page_no].image = ImageRef.from_pil(
767767
page.image, dpi=int(72 * self.pipeline_options.images_scale)
768768
)

0 commit comments

Comments
 (0)