Skip to content

Commit 9bd7e52

Browse files
Coniferish and awalker4 authored
feat: add languages kwarg (#246)
Add `languages` as a kwarg for parsing and its associated tests Closes #233 ### Testing ``` curl -X 'POST' 'http://127.0.0.1:8000/general/v0/general' \ -H 'accept: application/json' \ -H 'Content-Type: multipart/form-data' \ -F 'files=@sample-docs/english-and-korean.png' \ -F 'languages=eng' \ -F 'languages=kor' \ | jq -C . | less -R ``` --------- Co-authored-by: Austin Walker <[email protected]>
1 parent a20e01c commit 9bd7e52

File tree

5 files changed

+72
-8
lines changed

5 files changed

+72
-8
lines changed

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
## 0.0.48-dev0
2+
3+
* **Adds `languages` kwarg** `ocr_languages` will eventually be deprecated and replaced by `languages`, which specifies what languages to use for OCR
4+
15
## 0.0.47
26

37
* **Adds `chunking_strategy` kwarg and associated params** These params allow users to "chunk" elements into larger or smaller `CompositeElement`s
48
* **Remove `parent_id` from the element metadata**. New metadata fields are causing errors with existing installs. We'll readd this once a fix is widely available.
59
* **Fix some pdfs incorrectly returning a file is encrypted error**. The `pypdf.is_encrypted` check caused us to return this error even if the file is readable.
6-
>>>>>>> main
710

811
## 0.0.46
912

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ We also support models to be used locally, for example, `yolox`. Please refer to
8686

8787
#### OCR languages
8888

89+
Note: This kwarg will eventually be deprecated. Please use `languages`.
8990
You can also specify what languages to use for OCR with the `ocr_languages` kwarg. See the [Tesseract documentation](https://github.com/tesseract-ocr/tessdata) for a full list of languages and install instructions. OCR is only applied if the text is not already available in the PDF document.
9091

9192
```
@@ -100,6 +101,22 @@ curl -X 'POST' \
100101
| jq -C . | less -R
101102
```
102103

104+
#### Languages
105+
106+
You can also specify what languages to use for OCR with the `languages` kwarg. See the [Tesseract documentation](https://github.com/tesseract-ocr/tessdata) for a full list of languages and install instructions. OCR is only applied if the text is not already available in the PDF document.
107+
108+
```
109+
curl -X 'POST' \
110+
'https://api.unstructured.io/general/v0/general' \
111+
-H 'accept: application/json' \
112+
-H 'Content-Type: multipart/form-data' \
113+
-F 'files=@sample-docs/english-and-korean.png' \
114+
-F 'strategy=ocr_only' \
115+
-F 'languages=eng' \
116+
-F 'languages=kor' \
117+
| jq -C . | less -R
118+
```
119+
103120
#### Coordinates
104121

105122
When elements are extracted from PDFs or images, it may be useful to get their bounding boxes as well. Set the `coordinates` parameter to `true` to add this field to the elements in the response.

prepline_general/api/general.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ def pipeline_api(
219219
m_skip_infer_table_types=[],
220220
m_strategy=[],
221221
m_xml_keep_tags=[],
222+
languages=["eng"],
222223
m_chunking_strategy=[],
223224
m_multipage_sections=[],
224225
m_combine_under_n_chars=[],
@@ -249,6 +250,7 @@ def pipeline_api(
249250
"m_skip_infer_table_types": m_skip_infer_table_types,
250251
"m_strategy": m_strategy,
251252
"m_xml_keep_tags": m_xml_keep_tags,
253+
"languages": languages,
252254
"m_chunking_strategy": m_chunking_strategy,
253255
"m_multipage_sections": m_multipage_sections,
254256
"m_combine_under_n_chars": m_combine_under_n_chars,
@@ -306,7 +308,7 @@ def pipeline_api(
306308
enable_parallel_mode = os.environ.get("UNSTRUCTURED_PARALLEL_MODE_ENABLED", "false")
307309
pdf_parallel_mode_enabled = enable_parallel_mode == "true"
308310

309-
ocr_languages = ("+".join(m_ocr_languages) if len(m_ocr_languages) else "eng").lower()
311+
ocr_languages = "+".join(m_ocr_languages) if m_ocr_languages and len(m_ocr_languages) else None
310312

311313
include_page_breaks_str = (
312314
m_include_page_breaks[0] if len(m_include_page_breaks) else "false"
@@ -370,6 +372,7 @@ def pipeline_api(
370372
"model_name": hi_res_model_name,
371373
"xml_keep_tags": xml_keep_tags,
372374
"skip_infer_table_types": skip_infer_table_types,
375+
"languages": languages,
373376
"chunking_strategy": chunking_strategy,
374377
"multipage_sections": multipage_sections,
375378
"combine_under_n_chars": combine_under_n_chars,
@@ -400,6 +403,7 @@ def pipeline_api(
400403
skip_infer_table_types=skip_infer_table_types,
401404
strategy=strategy,
402405
xml_keep_tags=xml_keep_tags,
406+
languages=languages,
403407
chunking_strategy=chunking_strategy,
404408
multipage_sections=multipage_sections,
405409
combine_under_n_chars=combine_under_n_chars,
@@ -419,6 +423,7 @@ def pipeline_api(
419423
skip_infer_table_types=skip_infer_table_types,
420424
strategy=strategy,
421425
xml_keep_tags=xml_keep_tags,
426+
languages=languages,
422427
chunking_strategy=chunking_strategy,
423428
multipage_sections=multipage_sections,
424429
combine_under_n_chars=combine_under_n_chars,
@@ -564,7 +569,7 @@ def return_content_type(filename):
564569

565570

566571
@router.post("/general/v0/general")
567-
@router.post("/general/v0.0.47/general")
572+
@router.post("/general/v0.0.48/general")
568573
def pipeline_1(
569574
request: Request,
570575
gz_uncompressed_content_type: Optional[str] = Form(default=None),
@@ -579,6 +584,7 @@ def pipeline_1(
579584
skip_infer_table_types: List[str] = Form(default=[]),
580585
strategy: List[str] = Form(default=[]),
581586
xml_keep_tags: List[str] = Form(default=[]),
587+
languages: List[str] = ["eng"],
582588
chunking_strategy: List[str] = Form(default=[]),
583589
multipage_sections: List[str] = Form(default=[]),
584590
combine_under_n_chars: List[str] = Form(default=[]),
@@ -634,6 +640,7 @@ def response_generator(is_multipart):
634640
response_type=media_type,
635641
filename=file.filename,
636642
file_content_type=file_content_type,
643+
languages=languages,
637644
m_chunking_strategy=chunking_strategy,
638645
m_multipage_sections=multipage_sections,
639646
m_combine_under_n_chars=combine_under_n_chars,

preprocessing-pipeline-family.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.47
2+
version: 0.0.48

test_general/api/test_app.py

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def test_metadata_fields_removed():
137137
assert "detection_class_prob" not in response_without_coords[i]["metadata"]
138138

139139

140-
def test_ocr_languages_param():
140+
def test_ocr_languages_param(): # will eventually be deprecated
141141
"""
142142
Verify that we get the corresponding languages from the response with ocr_languages
143143
"""
@@ -154,6 +154,41 @@ def test_ocr_languages_param():
154154
assert elements[3]["text"].startswith("안녕하세요, 저 희 는 YGEAS 그룹")
155155

156156

157+
def test_languages_param():
158+
"""
159+
Verify that we get the corresponding languages from the response with `languages`
160+
"""
161+
client = TestClient(app)
162+
test_file = Path("sample-docs") / "english-and-korean.png"
163+
response = client.post(
164+
MAIN_API_ROUTE,
165+
files=[("files", (str(test_file), open(test_file, "rb")))],
166+
data={"strategy": "ocr_only", "languages": ["eng", "kor"]},
167+
)
168+
169+
assert response.status_code == 200
170+
elements = response.json()
171+
assert elements[3]["text"].startswith("안녕하세요, 저 희 는 YGEAS 그룹")
172+
173+
174+
def test_languages_and_ocr_languages_raises_error():
175+
"""
176+
Verify that a ValueError is raised when both `languages` and `ocr_languages` are specified
177+
"""
178+
with pytest.raises(ValueError):
179+
client = TestClient(app)
180+
test_file = Path("sample-docs") / "english-and-korean.png"
181+
client.post(
182+
MAIN_API_ROUTE,
183+
files=[("files", (str(test_file), open(test_file, "rb")))],
184+
data={
185+
"strategy": "ocr_only",
186+
"languages": ["eng", "kor"],
187+
"ocr_languages": ["eng", "kor"],
188+
},
189+
)
190+
191+
157192
def test_skip_infer_table_types_param():
158193
"""
159194
Verify that we skip table instruction unless specified
@@ -375,7 +410,7 @@ def test_general_api_returns_400_bad_pdf():
375410
tmp.close()
376411

377412

378-
def test_general_api_returns_503(monkeypatch, mocker):
413+
def test_general_api_returns_503(monkeypatch):
379414
"""
380415
When available memory is below the minimum, return a 503, unless our origin ip is 10.{4,5}.x.x
381416
"""
@@ -431,7 +466,8 @@ def test_parallel_mode_passes_params(monkeypatch):
431466
"encoding": "foo",
432467
"hi_res_model_name": "yolox",
433468
"include_page_breaks": True,
434-
"ocr_languages": "foo",
469+
# "ocr_languages": "foo",
470+
"languages": "foo",
435471
"pdf_infer_table_structure": True,
436472
"strategy": "hi_res",
437473
"xml_keep_tags": True,
@@ -452,7 +488,8 @@ def test_parallel_mode_passes_params(monkeypatch):
452488
model_name="yolox",
453489
encoding="foo",
454490
include_page_breaks=True,
455-
ocr_languages="foo",
491+
ocr_languages=None,
492+
languages=["foo"],
456493
pdf_infer_table_structure=True,
457494
strategy="hi_res",
458495
xml_keep_tags=True,

0 commit comments

Comments
 (0)