@@ -275,47 +275,6 @@ def _to_page_geometry(self, dimension: dict) -> PdfPageGeometry:
275275 bleed_bbox = bleed_bbox ,
276276 )
277277
278- """
279- def _to_cells(self, cells: dict) -> List[Union[PdfTextCell, TextCell]]:
280-
281- assert "data" in cells, '"data" in cells'
282- assert "header" in cells, '"header" in cells'
283-
284- data = cells["data"]
285- header = cells["header"]
286-
287- result: List[Union[PdfTextCell, TextCell]] = []
288- for ind, row in enumerate(data):
289- rect = BoundingRectangle(
290- r_x0=row[header.index(f"r_x0")],
291- r_y0=row[header.index(f"r_y0")],
292- r_x1=row[header.index(f"r_x1")],
293- r_y1=row[header.index(f"r_y1")],
294- r_x2=row[header.index(f"r_x2")],
295- r_y2=row[header.index(f"r_y2")],
296- r_x3=row[header.index(f"r_x3")],
297- r_y3=row[header.index(f"r_y3")],
298- )
299- cell = PdfTextCell(
300- rect=rect,
301- text=row[header.index(f"text")],
302- orig=row[header.index(f"text")],
303- font_key=row[header.index(f"font-key")],
304- font_name=row[header.index(f"font-name")],
305- widget=row[header.index(f"widget")],
306- text_direction=(
307- TextDirection.LEFT_TO_RIGHT
308- if row[header.index(f"left_to_right")]
309- else TextDirection.RIGHT_TO_LEFT
310- ),
311- index=ind,
312- rendering_mode=row[header.index(f"rendering-mode")],
313- )
314- result.append(cell)
315-
316- return result
317- """
318-
319278 def _to_cells (self , cells : dict ) -> List [Union [PdfTextCell , TextCell ]]:
320279 assert "data" in cells , '"data" in cells'
321280 assert "header" in cells , '"header" in cells'
@@ -481,14 +440,22 @@ def _to_segmented_page(
481440
482441 if create_words and ("word_cells" in page ):
483442 segmented_page .word_cells = self ._to_cells (page ["word_cells" ])
484- elif create_words :
443+ segmented_page .has_words = len (segmented_page .word_cells ) > 0
444+ elif keep_chars :
445+ logging .warning (
446+ "`words` will be created for segmented_page in an inefficient way!"
447+ )
485448 self ._create_word_cells (segmented_page , enforce_same_font = enforce_same_font )
486449 else :
487450 logging .warning ("No `words` will be created for segmented_page" )
488451
489- if create_textlines and ("word_cells " in page ):
452+ if create_textlines and ("line_cells " in page ):
490453 segmented_page .textline_cells = self ._to_cells (page ["line_cells" ])
491- elif create_textlines :
454+ segmented_page .has_lines = len (segmented_page .textline_cells ) > 0
455+ elif keep_chars :
456+ logging .warning (
457+ "`text_lines` will be created for segmented_page in an inefficient way!"
458+ )
492459 self ._create_textline_cells (
493460 segmented_page , enforce_same_font = enforce_same_font
494461 )
@@ -501,11 +468,11 @@ def _create_word_cells(
501468 self ,
502469 segmented_page : SegmentedPdfPage ,
503470 * ,
471+ horizontal_cell_tolerance : float = 1.0 ,
504472 space_width_factor_for_merge : float = 0.33 ,
505473 enforce_same_font : bool = True ,
506474 _loglevel : str = "fatal" ,
507475 ):
508-
509476 if len (segmented_page .word_cells ) > 0 :
510477 return
511478
@@ -523,6 +490,7 @@ def _create_word_cells(
523490
524491 # data = sanitizer.create_word_cells(space_width_factor_for_merge=0.33)
525492 data = sanitizer .create_word_cells (
493+ horizontal_cell_tolerance = horizontal_cell_tolerance ,
526494 space_width_factor_for_merge = space_width_factor_for_merge ,
527495 enforce_same_font = enforce_same_font ,
528496 )
@@ -538,6 +506,7 @@ def _create_textline_cells(
538506 self ,
539507 segmented_page : SegmentedPdfPage ,
540508 * ,
509+ horizontal_cell_tolerance : float = 1.0 ,
541510 space_width_factor_for_merge : float = 1.0 ,
542511 space_width_factor_for_merge_with_space : float = 0.33 ,
543512 enforce_same_font : bool = True ,
@@ -564,6 +533,7 @@ def _create_textline_cells(
564533
565534 # data = sanitizer.create_line_cells()
566535 data = sanitizer .create_line_cells (
536+ horizontal_cell_tolerance = horizontal_cell_tolerance ,
567537 space_width_factor_for_merge = space_width_factor_for_merge ,
568538 space_width_factor_for_merge_with_space = space_width_factor_for_merge_with_space ,
569539 enforce_same_font = enforce_same_font ,
0 commit comments