Skip to content

Commit a8a19ce

Browse files
authored
chore: Add --ocr-languages parameter to unstructured ingest (#793)
1 parent 752e78e commit a8a19ce

File tree

6 files changed

+111
-3
lines changed

6 files changed

+111
-3
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
### Enhancements
44

5+
* Adds the `--partition-ocr-languages` option to `unstructured-ingest`.
6+
7+
58
### Features
69

710
* Adds `partition_org` for processed Org Mode documents.
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
[
2+
{
3+
"type": "Title",
4+
"element_id": "a726edcce6a7b116c43e1b5109c2f4c8",
5+
"metadata": {
6+
"data_source": {},
7+
"filetype": "image/png"
8+
},
9+
"text": "RULES AND INSTRUCTIONS"
10+
},
11+
{
12+
"type": "NarrativeText",
13+
"element_id": "ba41648485acf4a8e7dd7d183b764811",
14+
"metadata": {
15+
"data_source": {},
16+
"filetype": "image/png"
17+
},
18+
"text": "1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email accounts."
19+
},
20+
{
21+
"type": "NarrativeText",
22+
"element_id": "30089b5886f3074973195439e5343f64",
23+
"metadata": {
24+
"data_source": {},
25+
"filetype": "image/png"
26+
},
27+
"text": "Note: Remember to write your own \"OPENING MESSAGE\" before you copy and paste the template. please always include [TREASURE HARUTO] for example:"
28+
},
29+
{
30+
"type": "NarrativeText",
31+
"element_id": "577b0ea5cd4f62a19b9d14dd0bd7272e",
32+
"metadata": {
33+
"data_source": {},
34+
"filetype": "image/png"
35+
},
36+
"text": "안녕하세요, 저 희 는 YGEAS 그룹 TREASUREWH HARUTOM|2] 팬 입니다. 팬 으 로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 불 공 평 함 을 LRU, 이 메 일 을 통해 저 희 의 의 혹 을 전 달 하여 귀 사 의 진지한 고 민 과 적극적인 답 변 을 받을 수 있 기 를 바랍니다."
37+
},
38+
{
39+
"type": "NarrativeText",
40+
"element_id": "95c5b9d0d081bc45be0e50a109924191",
41+
"metadata": {
42+
"data_source": {},
43+
"filetype": "image/png"
44+
},
45+
"text": "3. CC [email protected] so we can keep track of how many emails were successfully sent"
46+
},
47+
{
48+
"type": "NarrativeText",
49+
"element_id": "da91cb5310386aafd5aaf6fd988b9616",
50+
"metadata": {
51+
"data_source": {},
52+
"filetype": "image/png"
53+
},
54+
"text": "4. Use the hashtag of Haruto on your tweet to show that vou have sent vour email]"
55+
}
56+
]
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/usr/bin/env bash
2+
3+
set -e
4+
5+
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
6+
cd "$SCRIPT_DIR"/.. || exit 1
7+
8+
PYTHONPATH=. ./unstructured/ingest/main.py \
9+
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
10+
--local-input-path example-docs/english-and-korean.png \
11+
--structured-output-dir parameterized-ingest-output \
12+
--partition-ocr-languages eng+kor \
13+
--verbose \
14+
--reprocess
15+
16+
set +e
17+
18+
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
19+
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
20+
21+
cp -r parameterized-ingest-output/ test_unstructured_ingest/expected-structured-output/
22+
23+
elif ! diff -ru test_unstructured_ingest/expected-structured-output/parameterized-ingest-output parameterized-ingest-output ; then
24+
echo
25+
echo "There are differences from the previously checked-in structured outputs."
26+
echo
27+
echo "If these differences are acceptable, overwrite the fixtures by setting the env var:"
28+
echo
29+
echo " export OVERWRITE_FIXTURES=true"
30+
echo
31+
echo "and then rerun this script."
32+
echo
33+
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
34+
echo "to update fixtures for CI."
35+
echo
36+
exit 1
37+
38+
fi

test_unstructured_ingest/test-ingest.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,6 @@ export OMP_THREAD_LIMIT=1
2121
./test_unstructured_ingest/test-ingest-slack.sh
2222
./test_unstructured_ingest/test-ingest-against-api.sh
2323
./test_unstructured_ingest/test-ingest-gcs.sh
24+
./test_unstructured_ingest/test-ingest-local-single-file.sh
2425
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
2526
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh

unstructured/ingest/doc_processor/generalized.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def initialize():
1515

1616

1717
def process_document(doc: "IngestDoc", **partition_kwargs) -> Optional[List[Dict[str, Any]]]:
18-
"""Process any IngestDoc-like class of document with choosen Unstructured's partition logic.
18+
"""Process any IngestDoc-like class of document with Unstructured's chosen partition logic.
1919
2020
Parameters
2121
----------

unstructured/ingest/main.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,14 @@ def run(self):
151151
help="The method that will be used to process the documents. "
152152
"Default: auto. Other strategies include `fast` and `hi_res`.",
153153
)
154+
@click.option(
155+
"--partition-ocr-languages",
156+
default="eng",
157+
help="A list of language packs to specify which languages to use for OCR, separated by '+' "
158+
"e.g. 'eng+deu' to use the English and German language packs. The appropriate Tesseract "
159+
"language pack needs to be installed. "
160+
"Default: eng",
161+
)
154162
@click.option(
155163
"--api-key",
156164
default="",
@@ -470,6 +478,7 @@ def main(
470478
partition_by_api,
471479
partition_endpoint,
472480
partition_strategy,
481+
partition_ocr_languages,
473482
api_key,
474483
local_input_path,
475484
local_file_glob,
@@ -781,14 +790,15 @@ def main(
781790
logger.error("No connector-specific option was specified!")
782791
sys.exit(1)
783792

784-
process_document_with_partition_strategy = partial(
793+
process_document_with_partition_args = partial(
785794
process_document,
786795
strategy=partition_strategy,
796+
ocr_languages=partition_ocr_languages,
787797
)
788798

789799
MainProcess(
790800
doc_connector=doc_connector,
791-
doc_processor_fn=process_document_with_partition_strategy,
801+
doc_processor_fn=process_document_with_partition_args,
792802
num_processes=num_processes,
793803
reprocess=reprocess,
794804
verbose=verbose,

0 commit comments

Comments
 (0)