Feat: add pdf_infer_table_structure parameter to api (#139)

yuming-long · web-flow · commit 15d4e66aeec9 · 2023-07-04T14:43:56.000Z
* change and tidy notebook * changelog version bump * generate api * friendly input * adds test * stick with valid input * move param location * Revert "adds test" This reverts commit 5d8296a. * Revert "change and tidy notebook" This reverts commit 56e2bf1. * move to unittest * add readme * Revert "move to unittest" This reverts commit 4319718. * Revert "Revert "adds test"" This reverts commit 54b55ab. * stage * bump ust 0.7.10->11 * ah readme * tidy * remove content type param * Revert "remove content type param" This reverts commit 74d9040. * note to content type
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,6 @@
 ## 0.0.30
 
+* Add table extraction support for the `hi_res` strategy via the `pdf_infer_table_structure` parameter
 * Add support for `encoding` parameter
 * Add support for `xml_keep_tags` parameter
 * Add env variables for additional parallel mode tweaking
diff --git a/README.md b/README.md
@@ -93,6 +93,21 @@ When elements are extracted from PDFs or images, it may be useful to get their b
   | jq -C . | less -R
 ```
 
+#### PDF Table Extraction
+
+To extract the table structure from PDF files, use the `hi_res` strategy and set the `pdf_infer_table_structure` parameter to `true`. When enabled, each extracted table element includes the table's content as HTML in its `metadata.text_as_html` field. The parameter defaults to `false` because table extraction is expensive, and it is ignored for any strategy other than `hi_res`.
+
+```
+ curl -X 'POST' \
+  'https://api.unstructured.io/general/v0/general' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: multipart/form-data' \
+  -F 'files=@sample-docs/layout-parser-paper.pdf' \
+  -F 'strategy=hi_res' \
+  -F 'pdf_infer_table_structure=true' \
+  | jq -C . | less -R
+```
+
 #### Encoding
 
 You can specify the encoding to use to decode the text input. If no value is provided, utf-8 will be used.
diff --git a/pipeline-notebooks/pipeline-general.ipynb b/pipeline-notebooks/pipeline-general.ipynb
@@ -726,6 +726,7 @@
     "    m_ocr_languages=[],\n",
     "    m_encoding=[],\n",
     "    m_xml_keep_tags=[],\n",
+    "    m_pdf_infer_table_structure = [],\n",
     "    file_content_type=None,\n",
     "    response_type=\"application/json\"\n",
     "):\n",
@@ -756,6 +757,14 @@
     "    xml_keep_tags_str = (m_xml_keep_tags[0] if len(m_xml_keep_tags) else \"false\").lower()\n",
     "    xml_keep_tags = xml_keep_tags_str == \"true\"\n",
     "    \n",
+    "    pdf_infer_table_structure = (\n",
+    "        m_pdf_infer_table_structure[0] if len(m_pdf_infer_table_structure) else \"false\"\n",
+    "    ).lower()\n",
+    "    if strategy == \"hi_res\" and pdf_infer_table_structure == \"true\":\n",
+    "        pdf_infer_table_structure = True\n",
+    "    else:\n",
+    "        pdf_infer_table_structure = False\n",
+    "    \n",
     "    try:\n",
     "        if file_content_type == \"application/pdf\" and pdf_parallel_mode_enabled:\n",
     "            elements = partition_pdf_splits(\n",
@@ -766,6 +775,7 @@
     "                strategy=strategy,\n",
     "                ocr_languages=ocr_languages,\n",
     "                coordinates=show_coordinates,\n",
+    "                pdf_infer_table_structure=pdf_infer_table_structure,\n",
     "                encoding=encoding,\n",
     "            )\n",
     "        else:\n",
@@ -775,6 +785,7 @@
     "                content_type=file_content_type,\n",
     "                strategy=strategy,\n",
     "                ocr_languages=ocr_languages,\n",
+    "                pdf_infer_table_structure=pdf_infer_table_structure,\n",
     "                encoding=encoding,\n",
     "                xml_keep_tags=xml_keep_tags,\n",
     "            )\n",
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
@@ -184,6 +184,7 @@ def pipeline_api(
     m_ocr_languages=[],
     m_encoding=[],
     m_xml_keep_tags=[],
+    m_pdf_infer_table_structure=[],
     file_content_type=None,
     response_type="application/json",
 ):
@@ -213,6 +214,14 @@ def pipeline_api(
     xml_keep_tags_str = (m_xml_keep_tags[0] if len(m_xml_keep_tags) else "false").lower()
     xml_keep_tags = xml_keep_tags_str == "true"
 
+    pdf_infer_table_structure = (
+        m_pdf_infer_table_structure[0] if len(m_pdf_infer_table_structure) else "false"
+    ).lower()
+    if strategy == "hi_res" and pdf_infer_table_structure == "true":
+        pdf_infer_table_structure = True
+    else:
+        pdf_infer_table_structure = False
+
     try:
         if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
             elements = partition_pdf_splits(
@@ -223,6 +232,7 @@ def pipeline_api(
                 strategy=strategy,
                 ocr_languages=ocr_languages,
                 coordinates=show_coordinates,
+                pdf_infer_table_structure=pdf_infer_table_structure,
                 encoding=encoding,
             )
         else:
@@ -232,6 +242,7 @@ def pipeline_api(
                 content_type=file_content_type,
                 strategy=strategy,
                 ocr_languages=ocr_languages,
+                pdf_infer_table_structure=pdf_infer_table_structure,
                 encoding=encoding,
                 xml_keep_tags=xml_keep_tags,
             )
@@ -378,6 +389,7 @@ def pipeline_1(
     ocr_languages: List[str] = Form(default=[]),
     encoding: List[str] = Form(default=[]),
     xml_keep_tags: List[str] = Form(default=[]),
+    pdf_infer_table_structure: List[str] = Form(default=[]),
 ):
     if files:
         for file_index in range(len(files)):
@@ -417,6 +429,7 @@ def response_generator(is_multipart):
                     m_ocr_languages=ocr_languages,
                     m_encoding=encoding,
                     m_xml_keep_tags=xml_keep_tags,
+                    m_pdf_infer_table_structure=pdf_infer_table_structure,
                     response_type=media_type,
                     filename=file.filename,
                     file_content_type=file_content_type,
diff --git a/requirements/base.txt b/requirements/base.txt
@@ -60,7 +60,7 @@ et-xmlfile==1.1.0
     # via openpyxl
 exceptiongroup==1.1.1
     # via anyio
-fastapi==0.98.0
+fastapi==0.99.0
     # via unstructured-api-tools
 fastjsonschema==2.17.1
     # via nbformat
@@ -189,7 +189,7 @@ omegaconf==2.3.0
     # via effdet
 onnxruntime==1.15.1
     # via unstructured-inference
-opencv-python==4.7.0.72
+opencv-python==4.8.0.74
     # via
     #   layoutparser
     #   unstructured-inference
@@ -243,7 +243,7 @@ pycocotools==2.0.6
     # via effdet
 pycparser==2.21
     # via cffi
-pydantic==1.10.9
+pydantic==1.10.10
     # via
     #   argilla
     #   fastapi
@@ -379,6 +379,7 @@ types-urllib3==1.26.25.13
     # via types-requests
 typing-extensions==4.7.0
     # via
+    #   fastapi
     #   huggingface-hub
     #   iopath
     #   mypy
@@ -387,11 +388,11 @@ typing-extensions==4.7.0
     #   rich
     #   starlette
     #   torch
-unstructured[local-inference]==0.7.10
+unstructured[local-inference]==0.7.11
     # via -r requirements/base.in
 unstructured-api-tools==0.10.7
     # via -r requirements/base.in
-unstructured-inference==0.5.1
+unstructured-inference==0.5.4
     # via unstructured
 urllib3==2.0.3
     # via requests
diff --git a/requirements/test.txt b/requirements/test.txt
@@ -140,7 +140,7 @@ execnb==0.1.5
     # via nbdev
 executing==1.2.0
     # via stack-data
-fastapi==0.98.0
+fastapi==0.99.0
     # via
     #   -r requirements/base.txt
     #   unstructured-api-tools
@@ -444,7 +444,7 @@ onnxruntime==1.15.1
     # via
     #   -r requirements/base.txt
     #   unstructured-inference
-opencv-python==4.7.0.72
+opencv-python==4.8.0.74
     # via
     #   -r requirements/base.txt
     #   layoutparser
@@ -562,7 +562,7 @@ pycparser==2.21
     # via
     #   -r requirements/base.txt
     #   cffi
-pydantic==1.10.9
+pydantic==1.10.10
     # via
     #   -r requirements/base.txt
     #   argilla
@@ -832,6 +832,7 @@ typing-extensions==4.7.0
     # via
     #   -r requirements/base.txt
     #   black
+    #   fastapi
     #   huggingface-hub
     #   iopath
     #   ipython
@@ -841,11 +842,11 @@ typing-extensions==4.7.0
     #   rich
     #   starlette
     #   torch
-unstructured[local-inference]==0.7.10
+unstructured[local-inference]==0.7.11
     # via -r requirements/base.txt
 unstructured-api-tools==0.10.7
     # via -r requirements/base.txt
-unstructured-inference==0.5.1
+unstructured-inference==0.5.4
     # via
     #   -r requirements/base.txt
     #   unstructured
diff --git a/scripts/smoketest.py b/scripts/smoketest.py
@@ -13,10 +13,23 @@
 skip_inference_tests = os.getenv("SKIP_INFERENCE_TESTS", "").lower() in {"true", "yes", "y", "1"}
 
 
-def send_document(filename, content_type, strategy="fast", output_format="application/json"):
+def send_document(
+    filename,
+    content_type,
+    strategy="fast",
+    output_format="application/json",
+    pdf_infer_table_structure="false",
+):
+    # Note: `content_type` is not passed into the request because FastAPI will overwrite it.
     files = {"files": (str(filename), open(filename, "rb"))}
     return requests.post(
-        API_URL, files=files, data={"strategy": strategy, "output_format": output_format}
+        API_URL,
+        files=files,
+        data={
+            "strategy": strategy,
+            "output_format": output_format,
+            "pdf_infer_table_structure": pdf_infer_table_structure,
+        },
     )
 
 
@@ -107,3 +120,37 @@ def test_strategy_performance():
     assert response.status_code == 200
 
     assert hi_res_time > performance_ratio * fast_time
+
+
+@pytest.mark.skipif(skip_inference_tests, reason="emulated architecture")
+@pytest.mark.parametrize(
+    "strategy, pdf_infer_table_structure, expected_table_num",
+    [
+        ("fast", "True", 0),
+        ("fast", "False", 0),
+        ("hi_res", "True", 2),
+        ("hi_res", "False", 0),
+    ],
+)
+def test_table_support(strategy, pdf_infer_table_structure, expected_table_num):
+    """
+    Test that table extraction works on hi_res strategy
+    """
+    test_file = Path("sample-docs") / "layout-parser-paper.pdf"
+    response = send_document(
+        test_file,
+        "application/pdf",
+        strategy=strategy,
+        pdf_infer_table_structure=pdf_infer_table_structure,
+    )
+
+    assert response.status_code == 200
+    extracted_tables = [
+        el["metadata"]["text_as_html"]
+        for el in response.json()
+        if "text_as_html" in el["metadata"].keys()
+    ]
+    assert len(extracted_tables) == expected_table_num
+    if expected_table_num > 0:
+        # Check that text from a known table was extracted
+        assert "Layouts of scanned modern magazines and scientific reports" in extracted_tables[0]