@@ -114,15 +114,20 @@ def process(
114114 metrics : bool = True ,
115115 differences : bool = False ,
116116 textequiv_level : str = "region" ,
117+ plain_encoding : str = "autodetect" ,
117118) -> None :
118119 """Check OCR result against GT.
119120
120121 The @click decorators change the signature of the decorated functions, so we keep
121122 this undecorated version and use Click on a wrapper.
122123 """
123124
124- gt_text = extract (gt , textequiv_level = textequiv_level )
125- ocr_text = extract (ocr , textequiv_level = textequiv_level )
125+ gt_text = extract (
126+ gt , textequiv_level = textequiv_level , plain_encoding = plain_encoding
127+ )
128+ ocr_text = extract (
129+ ocr , textequiv_level = textequiv_level , plain_encoding = plain_encoding
130+ )
126131 gt_words : List [str ] = list (words_normalized (gt_text ))
127132 ocr_words : List [str ] = list (words_normalized (ocr_text ))
128133
@@ -195,6 +200,7 @@ def process_dir(
195200 metrics : bool = True ,
196201 differences : bool = False ,
197202 textequiv_level : str = "region" ,
203+ plain_encoding : str = "autodetect" ,
198204) -> None :
199205 for gt_file in os .listdir (gt ):
200206 gt_file_path = os .path .join (gt , gt_file )
@@ -209,6 +215,7 @@ def process_dir(
209215 metrics = metrics ,
210216 differences = differences ,
211217 textequiv_level = textequiv_level ,
218+ plain_encoding = plain_encoding ,
212219 )
213220 else :
214221 print ("Skipping {0} and {1}" .format (gt_file_path , ocr_file_path ))
@@ -233,6 +240,11 @@ def process_dir(
233240 help = "PAGE TextEquiv level to extract text from" ,
234241 metavar = "LEVEL" ,
235242)
243+ @click .option (
244+ "--plain-encoding" ,
245+ default = "autodetect" ,
246+ help = 'Encoding (e.g. "utf-8") of plain text files' ,
247+ )
236248@click .option ("--progress" , default = False , is_flag = True , help = "Show progress bar" )
237249@click .version_option ()
238250def main (
@@ -243,6 +255,7 @@ def main(
243255 metrics ,
244256 differences ,
245257 textequiv_level ,
258+ plain_encoding ,
246259 progress ,
247260):
248261 """
@@ -280,6 +293,7 @@ def main(
280293 metrics = metrics ,
281294 differences = differences ,
282295 textequiv_level = textequiv_level ,
296+ plain_encoding = plain_encoding ,
283297 )
284298 else :
285299 process (
@@ -290,6 +304,7 @@ def main(
290304 metrics = metrics ,
291305 differences = differences ,
292306 textequiv_level = textequiv_level ,
307+ plain_encoding = plain_encoding ,
293308 )
294309
295310
0 commit comments