Skip to content

Commit 5578ce8

Browse files
committed
🚧 Add option for text encoding to line dir cli
1 parent cf59b95 commit 5578ce8

File tree

1 file changed

+17
-2
lines changed

1 file changed

+17
-2
lines changed

‎src/dinglehopper/cli.py‎

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -114,15 +114,20 @@ def process(
114114
metrics: bool = True,
115115
differences: bool = False,
116116
textequiv_level: str = "region",
117+
plain_encoding: str = "autodetect",
117118
) -> None:
118119
"""Check OCR result against GT.
119120
120121
The @click decorators change the signature of the decorated functions, so we keep
121122
this undecorated version and use Click on a wrapper.
122123
"""
123124

124-
gt_text = extract(gt, textequiv_level=textequiv_level)
125-
ocr_text = extract(ocr, textequiv_level=textequiv_level)
125+
gt_text = extract(
126+
gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding
127+
)
128+
ocr_text = extract(
129+
ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding
130+
)
126131
gt_words: List[str] = list(words_normalized(gt_text))
127132
ocr_words: List[str] = list(words_normalized(ocr_text))
128133

@@ -195,6 +200,7 @@ def process_dir(
195200
metrics: bool = True,
196201
differences: bool = False,
197202
textequiv_level: str = "region",
203+
plain_encoding: str = "autodetect",
198204
) -> None:
199205
for gt_file in os.listdir(gt):
200206
gt_file_path = os.path.join(gt, gt_file)
@@ -209,6 +215,7 @@ def process_dir(
209215
metrics=metrics,
210216
differences=differences,
211217
textequiv_level=textequiv_level,
218+
plain_encoding=plain_encoding,
212219
)
213220
else:
214221
print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
@@ -233,6 +240,11 @@ def process_dir(
233240
help="PAGE TextEquiv level to extract text from",
234241
metavar="LEVEL",
235242
)
243+
@click.option(
244+
"--plain-encoding",
245+
default="autodetect",
246+
help='Encoding (e.g. "utf-8") of plain text files',
247+
)
236248
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
237249
@click.version_option()
238250
def main(
@@ -243,6 +255,7 @@ def main(
243255
metrics,
244256
differences,
245257
textequiv_level,
258+
plain_encoding,
246259
progress,
247260
):
248261
"""
@@ -280,6 +293,7 @@ def main(
280293
metrics=metrics,
281294
differences=differences,
282295
textequiv_level=textequiv_level,
296+
plain_encoding=plain_encoding,
283297
)
284298
else:
285299
process(
@@ -290,6 +304,7 @@ def main(
290304
metrics=metrics,
291305
differences=differences,
292306
textequiv_level=textequiv_level,
307+
plain_encoding=plain_encoding,
293308
)
294309

295310

0 commit comments

Comments
 (0)