|
4 | 4 | # |
5 | 5 | import copy |
6 | 6 | import logging |
7 | | -import os |
8 | 7 | import re |
9 | | -from collections.abc import Iterable |
10 | 8 | from typing import Dict, List, Set, Tuple |
11 | 9 |
|
12 | 10 | from docling_core.types.doc.base import BoundingBox, Size |
13 | 11 | from docling_core.types.doc.document import RefItem |
14 | 12 | from docling_core.types.doc.labels import DocItemLabel |
15 | 13 | from pydantic import BaseModel |
| 14 | +from rtree import index as rtree_index |
16 | 15 |
|
17 | 16 |
|
18 | 17 | class PageElement(BoundingBox): |
@@ -306,59 +305,97 @@ def _init_l2r_map(self, page_elems: List[PageElement]): |
306 | 305 | self.l2r_map[i] = j |
307 | 306 | self.r2l_map[j] = i |
308 | 307 |
|
309 | | - def _init_ud_maps(self, page_elems: List[PageElement]): |
| 308 | + def _init_ud_maps(self, page_elems: List[PageElement]) -> None: |
| 309 | + """ |
| 310 | + Initialize up/down maps for reading order prediction using R-tree spatial indexing. |
| 311 | +
|
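| 312 | + Queries the R-tree for nearby candidates instead of comparing every pair of elements. |
| 313 | + For each element j, records in up_map/dn_map the elements strictly above it that overlap it horizontally with no element in between. |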
| 314 | + """ |
310 | 315 | self.up_map = {} |
311 | 316 | self.dn_map = {} |
312 | 317 |
|
313 | 318 | for i, pelem_i in enumerate(page_elems): |
314 | 319 | self.up_map[i] = [] |
315 | 320 | self.dn_map[i] = [] |
316 | 321 |
|
317 | | - for j, pelem_j in enumerate(page_elems): |
| 322 | + # Build R-tree spatial index |
| 323 | + spatial_idx = rtree_index.Index() |
| 324 | + for i, pelem in enumerate(page_elems): |
| 325 | + spatial_idx.insert(i, (pelem.l, pelem.b, pelem.r, pelem.t)) |
318 | 326 |
|
| 327 | + for j, pelem_j in enumerate(page_elems): |
319 | 328 | if j in self.r2l_map: |
320 | 329 | i = self.r2l_map[j] |
321 | | - |
322 | 330 | self.dn_map[i] = [j] |
323 | 331 | self.up_map[j] = [i] |
324 | | - |
325 | 332 | continue |
326 | 333 |
|
327 | | - for i, pelem_i in enumerate(page_elems): |
| 334 | + # Find elements above current that might precede it in reading order |
| 335 | + query_bbox = (pelem_j.l - 0.1, pelem_j.t, pelem_j.r + 0.1, float("inf")) |
| 336 | + candidates = list(spatial_idx.intersection(query_bbox)) |
328 | 337 |
|
| 338 | + for i in candidates: |
329 | 339 | if i == j: |
330 | 340 | continue |
331 | 341 |
|
332 | | - is_horizontally_connected: bool = False |
333 | | - is_i_just_above_j: bool = pelem_i.overlaps_horizontally( |
334 | | - pelem_j |
335 | | - ) and pelem_i.is_strictly_above(pelem_j) |
336 | | - |
337 | | - for w, pelem_w in enumerate(page_elems): |
338 | | - |
339 | | - if not is_horizontally_connected: |
340 | | - is_horizontally_connected = pelem_w.is_horizontally_connected( |
341 | | - pelem_i, pelem_j |
342 | | - ) |
| 342 | + pelem_i = page_elems[i] |
343 | 343 |
|
344 | | - # ensure there is no other element that is between i and j vertically |
345 | | - if is_i_just_above_j and ( |
346 | | - pelem_i.overlaps_horizontally(pelem_w) |
347 | | - or pelem_j.overlaps_horizontally(pelem_w) |
348 | | - ): |
349 | | - i_above_w: bool = pelem_i.is_strictly_above(pelem_w) |
350 | | - w_above_j: bool = pelem_w.is_strictly_above(pelem_j) |
351 | | - |
352 | | - is_i_just_above_j = not (i_above_w and w_above_j) |
353 | | - |
354 | | - if is_i_just_above_j: |
| 344 | + # Check spatial relationship |
| 345 | + if not ( |
| 346 | + pelem_i.is_strictly_above(pelem_j) |
| 347 | + and pelem_i.overlaps_horizontally(pelem_j) |
| 348 | + ): |
| 349 | + continue |
355 | 350 |
|
| 351 | + # Check for interrupting elements |
| 352 | + if not self._has_sequence_interruption( |
| 353 | + spatial_idx, page_elems, i, j, pelem_i, pelem_j |
| 354 | + ): |
| 355 | + # Follow left-to-right mapping |
356 | 356 | while i in self.l2r_map: |
357 | 357 | i = self.l2r_map[i] |
358 | 358 |
|
359 | 359 | self.dn_map[i].append(j) |
360 | 360 | self.up_map[j].append(i) |
361 | 361 |
|
| 362 | + def _has_sequence_interruption( |
| 363 | + self, |
| 364 | + spatial_idx: rtree_index.Index, |
| 365 | + page_elems: List[PageElement], |
| 366 | + i: int, |
| 367 | + j: int, |
| 368 | + pelem_i: PageElement, |
| 369 | + pelem_j: PageElement, |
| 370 | + ) -> bool: |
| 371 | + """Return True if some other element is strictly below i, strictly above j, and horizontally overlaps i or j.""" |
| 372 | + # Query R-tree for elements between i and j |
| 373 | + x_min = min(pelem_i.l, pelem_j.l) - 1.0 |
| 374 | + x_max = max(pelem_i.r, pelem_j.r) + 1.0 |
| 375 | + y_min = pelem_j.t |
| 376 | + y_max = pelem_i.b |
| 377 | + |
| 378 | + candidates = list(spatial_idx.intersection((x_min, y_min, x_max, y_max))) |
| 379 | + |
| 380 | + for w in candidates: |
| 381 | + if w in (i, j): |
| 382 | + continue |
| 383 | + |
| 384 | + pelem_w = page_elems[w] |
| 385 | + |
| 386 | + # Check if w interrupts the i->j sequence |
| 387 | + if ( |
| 388 | + ( |
| 389 | + pelem_i.overlaps_horizontally(pelem_w) |
| 390 | + or pelem_j.overlaps_horizontally(pelem_w) |
| 391 | + ) |
| 392 | + and pelem_i.is_strictly_above(pelem_w) |
| 393 | + and pelem_w.is_strictly_above(pelem_j) |
| 394 | + ): |
| 395 | + return True |
| 396 | + |
| 397 | + return False |
| 398 | + |
362 | 399 | def _do_horizontal_dilation(self, page_elems, dilated_page_elems): |
363 | 400 |
|
364 | 401 | for i, pelem_i in enumerate(dilated_page_elems): |
|
0 commit comments