@@ -115,7 +115,12 @@ def _to_table_of_contents(self, toc: dict) -> List[PdfTableOfContents]:
115115 return result
116116
117117 def get_page (
118- self , page_no : int , create_words : bool = True , create_textlines : bool = True
118+ self ,
119+ page_no : int ,
120+ * ,
121+ create_words : bool = True ,
122+ create_textlines : bool = True ,
123+ enforce_same_font : bool = True ,
119124 ) -> SegmentedPdfPage :
120125 if page_no in self ._pages .keys ():
121126 return self ._pages [page_no ]
@@ -134,6 +139,7 @@ def get_page(
134139 page = page ["original" ],
135140 create_words = create_words ,
136141 create_textlines = create_textlines ,
142+ enforce_same_font = enforce_same_font ,
137143 ) # put on cache
138144 return self ._pages [page_no ]
139145
@@ -315,7 +321,12 @@ def _to_lines(self, data: dict) -> List[PdfLine]:
315321 return result
316322
317323 def _to_segmented_page (
318- self , page : dict , create_words : bool , create_textlines : bool
324+ self ,
325+ page : dict ,
326+ * ,
327+ create_words : bool ,
328+ create_textlines : bool ,
329+ enforce_same_font : bool = True ,
319330 ) -> SegmentedPdfPage :
320331
321332 char_cells = self ._to_cells (page ["cells" ])
@@ -330,14 +341,22 @@ def _to_segmented_page(
330341 )
331342
332343 if create_words :
333- self ._create_word_cells (segmented_page )
344+ self ._create_word_cells (segmented_page , enforce_same_font = enforce_same_font )
334345
335346 if create_textlines :
336- self ._create_textline_cells (segmented_page )
347+ self ._create_textline_cells (
348+ segmented_page , enforce_same_font = enforce_same_font
349+ )
350+
337351 return segmented_page
338352
339353 def _create_word_cells (
340- self , segmented_page : SegmentedPdfPage , _loglevel : str = "fatal"
354+ self ,
355+ segmented_page : SegmentedPdfPage ,
356+ * ,
357+ space_width_factor_for_merge : float = 0.33 ,
358+ enforce_same_font : bool = True ,
359+ _loglevel : str = "fatal" ,
341360 ):
342361
343362 if len (segmented_page .word_cells ) > 0 :
@@ -355,7 +374,11 @@ def _create_word_cells(
355374
356375 sanitizer .set_char_cells (data = char_data )
357376
358- data = sanitizer .create_word_cells (space_width_factor_for_merge = 0.33 )
377+ # data = sanitizer.create_word_cells(space_width_factor_for_merge=0.33)
378+ data = sanitizer .create_word_cells (
379+ space_width_factor_for_merge = space_width_factor_for_merge ,
380+ enforce_same_font = enforce_same_font ,
381+ )
359382
360383 segmented_page .word_cells = []
361384 for item in data :
@@ -365,9 +388,14 @@ def _create_word_cells(
365388 segmented_page .has_words = len (segmented_page .word_cells ) > 0
366389
367390 def _create_textline_cells (
368- self , segmented_page : SegmentedPdfPage , _loglevel : str = "fatal"
391+ self ,
392+ segmented_page : SegmentedPdfPage ,
393+ * ,
394+ space_width_factor_for_merge : float = 1.0 ,
395+ space_width_factor_for_merge_with_space : float = 0.33 ,
396+ enforce_same_font : bool = True ,
397+ _loglevel : str = "fatal" ,
369398 ):
370-
371399 if len (segmented_page .textline_cells ) > 0 :
372400 return
373401
@@ -387,7 +415,12 @@ def _create_textline_cells(
387415
388416 sanitizer .set_char_cells (data = char_data )
389417
390- data = sanitizer .create_line_cells ()
418+ # data = sanitizer.create_line_cells()
419+ data = sanitizer .create_line_cells (
420+ space_width_factor_for_merge = space_width_factor_for_merge ,
421+ space_width_factor_for_merge_with_space = space_width_factor_for_merge_with_space ,
422+ enforce_same_font = enforce_same_font ,
423+ )
391424
392425 segmented_page .textline_cells = []
393426 for item in data :
0 commit comments