feat: add languages kwarg (#246)

Coniferish · awalker4 · web-flow · commit 9bd7e52957a2 · 2023-09-26T15:25:01.000-05:00
Add `languages` as a kwarg for parsing and its associated tests Closes #233 ### Testing ``` curl -X 'POST' 'http://127.0.0.1:8000/general/v0/general' \ -H 'accept: application/json' \ -H 'Content-Type: multipart/form-data' \ -F 'files=@sample-docs/english-and-korean.png' \ -F 'languages=eng' \ -F 'languages=kor' \ | jq -C . | less -R ``` --------- Co-authored-by: Austin Walker <austin@unstructured.io>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,12 @@
+## 0.0.48-dev0
+
+* **Adds `languages` kwarg**. `ocr_languages` will eventually be deprecated and replaced by `languages` to specify what languages to use for OCR.
+
 ## 0.0.47
 
 * **Adds `chunking_strategy` kwarg and associated params** These params allow users to "chunk" elements into larger or smaller `CompositeElement`s
 * **Remove `parent_id` from the element metadata**. New metadata fields are causing errors with existing installs. We'll readd this once a fix is widely available.
 * **Fix some pdfs incorrectly returning a file is encrypted error**. The `pypdf.is_encrypted` check caused us to return this error even if the file is readable.
->>>>>>> main
 
 ## 0.0.46
 
diff --git a/README.md b/README.md
@@ -86,6 +86,7 @@ We also support models to be used locally, for example, `yolox`. Please refer to
 
 #### OCR languages
 
+Note: The `ocr_languages` kwarg will eventually be deprecated. Please use `languages` instead.
 You can also specify what languages to use for OCR with the `ocr_languages` kwarg. See the [Tesseract documentation](https://github.com/tesseract-ocr/tessdata) for a full list of languages and install instructions. OCR is only applied if the text is not already available in the PDF document.
 
 ```
@@ -100,6 +101,22 @@ curl -X 'POST' \
   | jq -C . | less -R
 ```
 
+#### Languages
+
+You can also specify what languages to use for OCR with the `languages` kwarg. See the [Tesseract documentation](https://github.com/tesseract-ocr/tessdata) for a full list of languages and install instructions. OCR is only applied if the text is not already available in the PDF document.
+
+```
+curl -X 'POST' \
+  'https://api.unstructured.io/general/v0/general' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: multipart/form-data' \
+  -F 'files=@sample-docs/english-and-korean.png' \
+  -F 'strategy=ocr_only' \
+  -F 'languages=eng'  \
+  -F 'languages=kor'  \
+  | jq -C . | less -R
+```
+
 #### Coordinates
 
 When elements are extracted from PDFs or images, it may be useful to get their bounding boxes as well. Set the `coordinates` parameter to `true` to add this field to the elements in the response.
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
@@ -219,6 +219,7 @@ def pipeline_api(
     m_skip_infer_table_types=[],
     m_strategy=[],
     m_xml_keep_tags=[],
+    languages=["eng"],
     m_chunking_strategy=[],
     m_multipage_sections=[],
     m_combine_under_n_chars=[],
@@ -249,6 +250,7 @@ def pipeline_api(
                         "m_skip_infer_table_types": m_skip_infer_table_types,
                         "m_strategy": m_strategy,
                         "m_xml_keep_tags": m_xml_keep_tags,
+                        "languages": languages,
                         "m_chunking_strategy": m_chunking_strategy,
                         "m_multipage_sections": m_multipage_sections,
                         "m_combine_under_n_chars": m_combine_under_n_chars,
@@ -306,7 +308,7 @@ def pipeline_api(
     enable_parallel_mode = os.environ.get("UNSTRUCTURED_PARALLEL_MODE_ENABLED", "false")
     pdf_parallel_mode_enabled = enable_parallel_mode == "true"
 
-    ocr_languages = ("+".join(m_ocr_languages) if len(m_ocr_languages) else "eng").lower()
+    ocr_languages = "+".join(m_ocr_languages) if m_ocr_languages and len(m_ocr_languages) else None
 
     include_page_breaks_str = (
         m_include_page_breaks[0] if len(m_include_page_breaks) else "false"
@@ -370,6 +372,7 @@ def pipeline_api(
                         "model_name": hi_res_model_name,
                         "xml_keep_tags": xml_keep_tags,
                         "skip_infer_table_types": skip_infer_table_types,
+                        "languages": languages,
                         "chunking_strategy": chunking_strategy,
                         "multipage_sections": multipage_sections,
                         "combine_under_n_chars": combine_under_n_chars,
@@ -400,6 +403,7 @@ def pipeline_api(
                 skip_infer_table_types=skip_infer_table_types,
                 strategy=strategy,
                 xml_keep_tags=xml_keep_tags,
+                languages=languages,
                 chunking_strategy=chunking_strategy,
                 multipage_sections=multipage_sections,
                 combine_under_n_chars=combine_under_n_chars,
@@ -419,6 +423,7 @@ def pipeline_api(
                 skip_infer_table_types=skip_infer_table_types,
                 strategy=strategy,
                 xml_keep_tags=xml_keep_tags,
+                languages=languages,
                 chunking_strategy=chunking_strategy,
                 multipage_sections=multipage_sections,
                 combine_under_n_chars=combine_under_n_chars,
@@ -564,7 +569,7 @@ def return_content_type(filename):
 
 
 @router.post("/general/v0/general")
-@router.post("/general/v0.0.47/general")
+@router.post("/general/v0.0.48/general")
 def pipeline_1(
     request: Request,
     gz_uncompressed_content_type: Optional[str] = Form(default=None),
@@ -579,6 +584,7 @@ def pipeline_1(
     skip_infer_table_types: List[str] = Form(default=[]),
     strategy: List[str] = Form(default=[]),
     xml_keep_tags: List[str] = Form(default=[]),
+    languages: List[str] = ["eng"],
     chunking_strategy: List[str] = Form(default=[]),
     multipage_sections: List[str] = Form(default=[]),
     combine_under_n_chars: List[str] = Form(default=[]),
@@ -634,6 +640,7 @@ def response_generator(is_multipart):
                     response_type=media_type,
                     filename=file.filename,
                     file_content_type=file_content_type,
+                    languages=languages,
                     m_chunking_strategy=chunking_strategy,
                     m_multipage_sections=multipage_sections,
                     m_combine_under_n_chars=combine_under_n_chars,
diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml
@@ -1,2 +1,2 @@
 name: general
-version: 0.0.47
+version: 0.0.48
diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
@@ -137,7 +137,7 @@ def test_metadata_fields_removed():
         assert "detection_class_prob" not in response_without_coords[i]["metadata"]
 
 
-def test_ocr_languages_param():
+def test_ocr_languages_param():  # will eventually be deprecated
     """
     Verify that we get the corresponding languages from the response with ocr_languages
     """
@@ -154,6 +154,41 @@ def test_ocr_languages_param():
     assert elements[3]["text"].startswith("안녕하세요, 저 희 는 YGEAS 그룹")
 
 
+def test_languages_param():
+    """
+    Verify that we get the corresponding languages from the response with `languages`
+    """
+    client = TestClient(app)
+    test_file = Path("sample-docs") / "english-and-korean.png"
+    response = client.post(
+        MAIN_API_ROUTE,
+        files=[("files", (str(test_file), open(test_file, "rb")))],
+        data={"strategy": "ocr_only", "languages": ["eng", "kor"]},
+    )
+
+    assert response.status_code == 200
+    elements = response.json()
+    assert elements[3]["text"].startswith("안녕하세요, 저 희 는 YGEAS 그룹")
+
+
+def test_languages_and_ocr_languages_raises_error():
+    """
+    Verify that passing both `languages` and `ocr_languages` raises a ValueError
+    """
+    with pytest.raises(ValueError):
+        client = TestClient(app)
+        test_file = Path("sample-docs") / "english-and-korean.png"
+        client.post(
+            MAIN_API_ROUTE,
+            files=[("files", (str(test_file), open(test_file, "rb")))],
+            data={
+                "strategy": "ocr_only",
+                "languages": ["eng", "kor"],
+                "ocr_languages": ["eng", "kor"],
+            },
+        )
+
+
 def test_skip_infer_table_types_param():
     """
     Verify that we skip table instruction unless specified
@@ -375,7 +410,7 @@ def test_general_api_returns_400_bad_pdf():
     tmp.close()
 
 
-def test_general_api_returns_503(monkeypatch, mocker):
+def test_general_api_returns_503(monkeypatch):
     """
     When available memory is below the minimum, return a 503, unless our origin ip is 10.{4,5}.x.x
     """
@@ -431,7 +466,8 @@ def test_parallel_mode_passes_params(monkeypatch):
             "encoding": "foo",
             "hi_res_model_name": "yolox",
             "include_page_breaks": True,
-            "ocr_languages": "foo",
+            # "ocr_languages": "foo",
+            "languages": "foo",
             "pdf_infer_table_structure": True,
             "strategy": "hi_res",
             "xml_keep_tags": True,
@@ -452,7 +488,8 @@ def test_parallel_mode_passes_params(monkeypatch):
         model_name="yolox",
         encoding="foo",
         include_page_breaks=True,
-        ocr_languages="foo",
+        ocr_languages=None,
+        languages=["foo"],
         pdf_infer_table_structure=True,
         strategy="hi_res",
         xml_keep_tags=True,

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`name: general`
`2`		`-version: 0.0.47`
	`2`	`+version: 0.0.48`