11import logging
22from io import BytesIO
33from pathlib import Path
4- from typing import Any , Optional , Union , cast
4+ from typing import Annotated , Any , Optional , Union , cast
55
66from docling_core .types .doc import (
77 BoundingBox ,
2323from openpyxl .drawing .spreadsheet_drawing import TwoCellAnchor
2424from openpyxl .worksheet .worksheet import Worksheet
2525from PIL import Image as PILImage
26- from pydantic import BaseModel , NonNegativeInt , PositiveInt
26+ from pydantic import BaseModel , Field , NonNegativeInt , PositiveInt
27+ from pydantic .dataclasses import dataclass
2728from typing_extensions import override
2829
2930from docling .backend .abstract_backend import (
3637_log = logging .getLogger (__name__ )
3738
3839
@dataclass
class DataRegion:
    """Rectangular area of a worksheet that encloses its non-empty cells.

    The rectangle is described by its four edge indices. The field order
    (min_row, max_row, min_col, max_col) is also the positional constructor
    order.
    """

    min_row: Annotated[
        PositiveInt,
        Field(description="Smallest row index (1-based index)."),
    ]
    max_row: Annotated[
        PositiveInt,
        Field(description="Largest row index (1-based index)."),
    ]
    min_col: Annotated[
        PositiveInt,
        Field(description="Smallest column index (1-based index)."),
    ]
    max_col: Annotated[
        PositiveInt,
        Field(description="Largest column index (1-based index)."),
    ]

    def width(self) -> PositiveInt:
        """Return the column count of the region, both edge columns included."""
        column_span = self.max_col - self.min_col
        return column_span + 1

    def height(self) -> PositiveInt:
        """Return the row count of the region, both edge rows included."""
        row_span = self.max_row - self.min_row
        return row_span + 1
64+
65+
3966class ExcelCell (BaseModel ):
4067 """Represents an Excel cell.
4168
@@ -294,6 +321,48 @@ def _find_tables_in_sheet(
294321
295322 return doc
296323
def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
    """Compute the tightest rectangle enclosing all content of a worksheet.

    Every stored cell whose value is not None and every merged cell range
    contributes to the rectangle. The result carries the smallest and largest
    row and column indices seen across both sources.

    Args:
        sheet: The worksheet to analyze.

    Returns:
        A data region covering all valued cells and merged ranges. When the
        sheet has neither, the region (1, 1, 1, 1) is returned.
    """

    def _extents():
        # A valued cell is treated as a 1x1 rectangle.
        for cell in sheet._cells.values():
            if cell.value is None:
                continue
            row, col = cell.row, cell.column
            yield row, row, col, col
        # Merged ranges are included without checking the values they hold.
        for span in sheet.merged_cells.ranges:
            yield span.min_row, span.max_row, span.min_col, span.max_col

    # Lower edges start unknown; upper edges start at 0 and only grow.
    top = left = None
    bottom = right = 0
    for lo_row, hi_row, lo_col, hi_col in _extents():
        top = lo_row if top is None else min(top, lo_row)
        left = lo_col if left is None else min(left, lo_col)
        bottom = max(bottom, hi_row)
        right = max(right, hi_col)

    # Nothing contributed a rectangle: fall back to the single cell (1, 1).
    if top is None or left is None:
        return DataRegion(1, 1, 1, 1)

    return DataRegion(top, bottom, left, right)
365+
297366 def _find_data_tables (self , sheet : Worksheet ) -> list [ExcelTable ]:
298367 """Find all compact rectangular data tables in an Excel worksheet.
299368
@@ -303,18 +372,31 @@ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
303372 Returns:
304373 A list of ExcelTable objects representing the data tables.
305374 """
375+ bounds : DataRegion = self ._find_true_data_bounds (
376+ sheet
377+ ) # The true data boundaries
306378 tables : list [ExcelTable ] = [] # List to store found tables
307379 visited : set [tuple [int , int ]] = set () # Track already visited cells
308380
309- # Iterate over all cells in the sheet
310- for ri , row in enumerate (sheet .iter_rows (values_only = False )):
311- for rj , cell in enumerate (row ):
312- # Skip empty or already visited cells
381+ # Limit scan to actual data bounds
382+ for ri , row in enumerate (
383+ sheet .iter_rows (
384+ min_row = bounds .min_row ,
385+ max_row = bounds .max_row ,
386+ min_col = bounds .min_col ,
387+ max_col = bounds .max_col ,
388+ values_only = False ,
389+ ),
390+ start = bounds .min_row - 1 ,
391+ ):
392+ for rj , cell in enumerate (row , start = bounds .min_col - 1 ):
313393 if cell .value is None or (ri , rj ) in visited :
314394 continue
315395
316396 # If the cell starts a new table, find its bounds
317- table_bounds , visited_cells = self ._find_table_bounds (sheet , ri , rj )
397+ table_bounds , visited_cells = self ._find_table_bounds (
398+ sheet , ri , rj , bounds .max_row , bounds .max_col
399+ )
318400
319401 visited .update (visited_cells ) # Mark these cells as visited
320402 tables .append (table_bounds )
@@ -326,31 +408,35 @@ def _find_table_bounds(
326408 sheet : Worksheet ,
327409 start_row : int ,
328410 start_col : int ,
411+ max_row : int ,
412+ max_col : int ,
329413 ) -> tuple [ExcelTable , set [tuple [int , int ]]]:
330414 """Determine the bounds of a compact rectangular table.
331415
332416 Args:
333417 sheet: The Excel worksheet to be parsed.
334418 start_row: The row number of the starting cell.
335419 start_col: The column number of the starting cell.
420+ max_row: Maximum row boundary from true data bounds.
421+ max_col: Maximum column boundary from true data bounds.
336422
337423 Returns:
338424 A tuple with an Excel table and a set of cell coordinates.
339425 """
340426 _log .debug ("find_table_bounds" )
341427
342- max_row = self ._find_table_bottom (sheet , start_row , start_col )
343- max_col = self ._find_table_right (sheet , start_row , start_col )
428+ table_max_row = self ._find_table_bottom (sheet , start_row , start_col , max_row )
429+ table_max_col = self ._find_table_right (sheet , start_row , start_col , max_col )
344430
345431 # Collect the data within the bounds
346432 data = []
347433 visited_cells : set [tuple [int , int ]] = set ()
348434 for ri , row in enumerate (
349435 sheet .iter_rows (
350436 min_row = start_row + 1 , # start_row is 0-based but iter_rows is 1-based
351- max_row = max_row + 1 ,
437+ max_row = table_max_row + 1 ,
352438 min_col = start_col + 1 ,
353- max_col = max_col + 1 ,
439+ max_col = table_max_col + 1 ,
354440 values_only = False ,
355441 ),
356442 start_row ,
@@ -390,32 +476,33 @@ def _find_table_bounds(
390476 return (
391477 ExcelTable (
392478 anchor = (start_col , start_row ),
393- num_rows = max_row + 1 - start_row ,
394- num_cols = max_col + 1 - start_col ,
479+ num_rows = table_max_row + 1 - start_row ,
480+ num_cols = table_max_col + 1 - start_col ,
395481 data = data ,
396482 ),
397483 visited_cells ,
398484 )
399485
400486 def _find_table_bottom (
401- self , sheet : Worksheet , start_row : int , start_col : int
487+ self , sheet : Worksheet , start_row : int , start_col : int , max_row : int
402488 ) -> int :
403489 """Find the bottom boundary of a table.
404490
405491 Args:
406492 sheet: The Excel worksheet to be parsed.
407493 start_row: The starting row of the table.
408494 start_col: The starting column of the table.
495+ max_row: Maximum row boundary from true data bounds.
409496
410497 Returns:
411498 The row index representing the bottom boundary of the table.
412499 """
413- max_row : int = start_row
500+ table_max_row : int = start_row
414501
415502 for ri , (cell ,) in enumerate (
416503 sheet .iter_rows (
417504 min_row = start_row + 2 ,
418- max_row = sheet . max_row ,
505+ max_row = max_row ,
419506 min_col = start_col + 1 ,
420507 max_col = start_col + 1 ,
421508 values_only = False ,
@@ -431,35 +518,36 @@ def _find_table_bottom(
431518 if cell .value is None and not merged_range :
432519 break # Stop if the cell is empty and not merged
433520
434- # Expand max_row to include the merged range if applicable
521+ # Expand table_max_row to include the merged range if applicable
435522 if merged_range :
436- max_row = max (max_row , merged_range .max_row - 1 )
523+ table_max_row = max (table_max_row , merged_range .max_row - 1 )
437524 else :
438- max_row = ri
525+ table_max_row = ri
439526
440- return max_row
527+ return table_max_row
441528
442529 def _find_table_right (
443- self , sheet : Worksheet , start_row : int , start_col : int
530+ self , sheet : Worksheet , start_row : int , start_col : int , max_col : int
444531 ) -> int :
445532 """Find the right boundary of a table.
446533
447534 Args:
448535 sheet: The Excel worksheet to be parsed.
449536 start_row: The starting row of the table.
450537 start_col: The starting column of the table.
538+ max_col: Maximum column boundary from true data bounds.
451539
452540 Returns:
453541 The column index representing the right boundary of the table.
454542 """
455- max_col : int = start_col
543+ table_max_col : int = start_col
456544
457545 for rj , (cell ,) in enumerate (
458546 sheet .iter_cols (
459547 min_row = start_row + 1 ,
460548 max_row = start_row + 1 ,
461549 min_col = start_col + 2 ,
462- max_col = sheet . max_column ,
550+ max_col = max_col ,
463551 values_only = False ,
464552 ),
465553 start_col + 1 ,
@@ -473,13 +561,13 @@ def _find_table_right(
473561 if cell .value is None and not merged_range :
474562 break # Stop if the cell is empty and not merged
475563
476- # Expand max_col to include the merged range if applicable
564+ # Expand table_max_col to include the merged range if applicable
477565 if merged_range :
478- max_col = max (max_col , merged_range .max_col - 1 )
566+ table_max_col = max (table_max_col , merged_range .max_col - 1 )
479567 else :
480- max_col = rj
568+ table_max_col = rj
481569
482- return max_col
570+ return table_max_col
483571
484572 def _find_images_in_sheet (
485573 self , doc : DoclingDocument , sheet : Worksheet
0 commit comments