@@ -505,12 +505,33 @@ def text_strip(text, strip=""):
505505 return stripped
506506
507507
def text_replace(text, replace=None):
    """Replaces each key of `replace` found in `text` with its value.

    Replacements are applied one after another, in the iteration order
    of `replace`, each via ``str.replace`` on the result of the previous
    one. A later pair therefore also acts on text produced by an earlier
    pair (e.g. ``{"a": "b", "b": "c"}`` turns ``"ab"`` into ``"cc"``).

    Parameters
    ----------
    text : str
        Text to process and modify.
    replace : dict, optional (default: None)
        key value pairs, where keys are swapped for the values in `text`.
        ``None`` or an empty dict leaves `text` unchanged.

    Returns
    -------
    text : str
        `text` with all replacements applied.

    """
    # Truthiness covers both None and an empty dict. An identity test
    # against a ``{}`` literal can never succeed, because the literal
    # builds a brand-new dict object each time it is evaluated.
    if not replace:
        return text

    for key, value in replace.items():
        text = text.replace(key, value)

    return text
527+
528+
508529# TODO: combine the following functions into a TextProcessor class which
509530# applies corresponding transformations sequentially
510531# (inspired by sklearn.pipeline.Pipeline)
511532
512533
513- def flag_font_size (textline , direction , strip_text = "" ):
534+ def flag_font_size (textline , direction , strip_text = "" , replace_text = {} ):
514535 """Flags super/subscripts in text by enclosing them with <s></s>.
515536 May give false positives.
516537
@@ -523,6 +544,9 @@ def flag_font_size(textline, direction, strip_text=""):
523544 strip_text : str, optional (default: '')
524545 Characters that should be stripped from a string before
525546 assigning it to a cell.
547+ replace_text : dict, optional (default: {})
548+ Characters that should be replaced from a string before
549+ assigning it to a cell.
526550
527551 Returns
528552 -------
@@ -559,10 +583,13 @@ def flag_font_size(textline, direction, strip_text=""):
559583 fstring = "" .join (flist )
560584 else :
561585 fstring = "" .join ([t .get_text () for t in textline ])
586+ fstring = text_replace (fstring , replace_text )
562587 return text_strip (fstring , strip_text )
563588
564589
565- def split_textline (table , textline , direction , flag_size = False , strip_text = "" ):
590+ def split_textline (
591+ table , textline , direction , flag_size = False , strip_text = "" , replace_text = {}
592+ ):
566593 """Splits PDFMiner LTTextLine into substrings if it spans across
567594 multiple rows/columns.
568595
@@ -580,6 +607,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
580607 strip_text : str, optional (default: '')
581608 Characters that should be stripped from a string before
582609 assigning it to a cell.
610+ replace_text : dict, optional (default: {})
611+ Characters that should be replaced from a string before
612+ assigning it to a cell.
583613
584614 Returns
585615 -------
@@ -668,20 +698,28 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
668698 key [0 ],
669699 key [1 ],
670700 flag_font_size (
671- [t [2 ] for t in chars ], direction , strip_text = strip_text
701+ [t [2 ] for t in chars ],
702+ direction ,
703+ strip_text = strip_text ,
704+ replace_text = replace_text ,
672705 ),
673706 )
674707 )
675708 else :
676- gchars = [t [2 ].get_text () for t in chars ]
677- grouped_chars .append (
678- (key [0 ], key [1 ], text_strip ("" .join (gchars ), strip_text ))
679- )
709+ gchars = "" .join ([t [2 ].get_text () for t in chars ])
710+ gchars = text_replace (gchars , replace_text )
711+ grouped_chars .append ((key [0 ], key [1 ], text_strip (gchars , strip_text )))
680712 return grouped_chars
681713
682714
683715def get_table_index (
684- table , t , direction , split_text = False , flag_size = False , strip_text = ""
716+ table ,
717+ t ,
718+ direction ,
719+ split_text = False ,
720+ flag_size = False ,
721+ strip_text = "" ,
722+ replace_text = {},
685723):
686724 """Gets indices of the table cell where given text object lies by
687725 comparing their y and x-coordinates.
@@ -703,6 +741,9 @@ def get_table_index(
703741 strip_text : str, optional (default: '')
704742 Characters that should be stripped from a string before
705743 assigning it to a cell.
744+ replace_text : dict, optional (default: {})
745+ Characters that should be replaced from a string before
746+ assigning it to a cell.
706747
707748 Returns
708749 -------
@@ -761,7 +802,12 @@ def get_table_index(
761802 if split_text :
762803 return (
763804 split_textline (
764- table , t , direction , flag_size = flag_size , strip_text = strip_text
805+ table ,
806+ t ,
807+ direction ,
808+ flag_size = flag_size ,
809+ strip_text = strip_text ,
810+ replace_text = replace_text ,
765811 ),
766812 error ,
767813 )
@@ -772,13 +818,20 @@ def get_table_index(
772818 (
773819 r_idx ,
774820 c_idx ,
775- flag_font_size (t ._objs , direction , strip_text = strip_text ),
821+ flag_font_size (
822+ t ._objs ,
823+ direction ,
824+ strip_text = strip_text ,
825+ replace_text = replace_text ,
826+ ),
776827 )
777828 ],
778829 error ,
779830 )
780831 else :
781- return [(r_idx , c_idx , text_strip (t .get_text (), strip_text ))], error
832+ text = t .get_text ()
833+ text = text_replace (text , replace_text )
834+ return [(r_idx , c_idx , text_strip (text , strip_text ))], error
782835
783836
784837def compute_accuracy (error_weights ):
0 commit comments