Unstructured-IO
diff --git a/‎CHANGELOG.md‎
Lines changed: 3 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 16 additions & 0 deletions b/‎README.md‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎pipeline-notebooks/pipeline-general.ipynb‎
Lines changed: 73 additions & 71 deletions b/‎pipeline-notebooks/pipeline-general.ipynb‎
Lines changed: 73 additions & 71 deletions
diff --git a/‎prepline_general/api/general.py‎
Lines changed: 59 additions & 53 deletions b/‎prepline_general/api/general.py‎
Lines changed: 59 additions & 53 deletions
diff --git a/‎preprocessing-pipeline-family.yaml‎
Lines changed: 1 addition & 1 deletion b/‎preprocessing-pipeline-family.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎requirements/base.txt‎
Lines changed: 14 additions & 12 deletions b/‎requirements/base.txt‎
Lines changed: 14 additions & 12 deletions
@@ -1,3 +1,6 @@
+## 0.0.34-dev0
+
+* Add table support for images via the `skip_infer_table_types` parameter
 ## 0.0.33
 
 * Image tweak, move application entrypoint to scripts/app-start.sh
 
@@ -111,6 +111,22 @@ To extract the table structure from PDF files using the `hi_res` strategy, ensur
   | jq -C . | less -R
 ```
 
+#### Skip Table Extraction
+
+Currently, we provide support for enabling and disabling table extraction for file types other than PDF files. Set the parameter `skip_infer_table_types` to specify the document types for which you want to skip table extraction. By default, we skip table extraction
+for PDFs and images, i.e. `pdf`, `jpg` and `png`. Again, please note that table extraction only works with the `hi_res` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to `skip_infer_table_types` with:
+
+```
+ curl -X 'POST' \
+  'https://api.unstructured.io/general/v0/general' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: multipart/form-data' \
+  -F 'files=@sample-docs/layout-parser-paper-with-table.jpg' \
+  -F 'strategy=hi_res' \
+  -F 'skip_infer_table_types=' \
+  | jq -C . | less -R
+```
+
 #### Encoding
 
 You can specify the encoding to use to decode the text input. If no value is provided, utf-8 will be used.
 
@@ -198,33 +198,35 @@ def pipeline_api(
     file,
     request=None,
     filename="",
-    m_strategy=[],
+    file_content_type=None,
+    response_type="application/json",
     m_coordinates=[],
-    m_ocr_languages=[],
-    m_include_page_breaks=[],
     m_encoding=[],
-    m_xml_keep_tags=[],
-    m_pdf_infer_table_structure=[],
     m_hi_res_model_name=[],
-    file_content_type=None,
-    response_type="application/json",
+    m_include_page_breaks=[],
+    m_ocr_languages=[],
+    m_pdf_infer_table_structure=[],
+    m_skip_infer_table_types=[],
+    m_strategy=[],
+    m_xml_keep_tags=[],
 ):
     logger.debug(
         "pipeline_api input params: {}".format(
             json.dumps(
                 {
                     "request": request,
                     "filename": filename,
-                    "m_strategy": m_strategy,
+                    "file_content_type": file_content_type,
+                    "response_type": response_type,
                     "m_coordinates": m_coordinates,
-                    "m_ocr_languages": m_ocr_languages,
-                    "m_include_page_breaks": m_include_page_breaks,
                     "m_encoding": m_encoding,
-                    "m_xml_keep_tags": m_xml_keep_tags,
-                    "m_pdf_infer_table_structure": m_pdf_infer_table_structure,
                     "m_hi_res_model_name": m_hi_res_model_name,
-                    "file_content_type": file_content_type,
-                    "response_type": response_type,
+                    "m_include_page_breaks": m_include_page_breaks,
+                    "m_ocr_languages": m_ocr_languages,
+                    "m_pdf_infer_table_structure": m_pdf_infer_table_structure,
+                    "m_skip_infer_table_types": m_skip_infer_table_types,
+                    "m_strategy": m_strategy,
+                    "m_xml_keep_tags": m_xml_keep_tags,
                 },
                 default=str,
             )
@@ -290,6 +292,10 @@ def pipeline_api(
     else:
         pdf_infer_table_structure = False
 
+    skip_infer_table_types = (
+        m_skip_infer_table_types if len(m_skip_infer_table_types) else ["pdf", "jpg", "png"]
+    )
+
     try:
         logger.debug(
             "partition input data: {}".format(
@@ -304,6 +310,7 @@ def pipeline_api(
                         "encoding": encoding,
                         "model_name": hi_res_model_name,
                         "xml_keep_tags": xml_keep_tags,
+                        "skip_infer_table_types": skip_infer_table_types,
                     },
                     default=str,
                 )
@@ -317,26 +324,31 @@ def pipeline_api(
                 file=file,
                 file_filename=filename,
                 content_type=file_content_type,
-                strategy=strategy,
-                ocr_languages=ocr_languages,
                 coordinates=show_coordinates,
-                pdf_infer_table_structure=pdf_infer_table_structure,
-                include_page_breaks=include_page_breaks,
+                # partition_kwargs
                 encoding=encoding,
+                include_page_breaks=include_page_breaks,
                 model_name=hi_res_model_name,
+                ocr_languages=ocr_languages,
+                pdf_infer_table_structure=pdf_infer_table_structure,
+                skip_infer_table_types=skip_infer_table_types,
+                strategy=strategy,
+                xml_keep_tags=xml_keep_tags,
             )
         else:
             elements = partition(
                 file=file,
                 file_filename=filename,
                 content_type=file_content_type,
-                strategy=strategy,
+                # partition_kwargs
+                encoding=encoding,
+                include_page_breaks=include_page_breaks,
+                model_name=hi_res_model_name,
                 ocr_languages=ocr_languages,
                 pdf_infer_table_structure=pdf_infer_table_structure,
-                include_page_breaks=include_page_breaks,
-                encoding=encoding,
+                skip_infer_table_types=skip_infer_table_types,
+                strategy=strategy,
                 xml_keep_tags=xml_keep_tags,
-                model_name=hi_res_model_name,
             )
     except ValueError as e:
         if "Invalid file" in e.args[0]:
@@ -345,31 +357,23 @@ def pipeline_api(
             )
         raise e
 
+    # Clean up returned elements
+    for i, element in enumerate(elements):
+        elements[i].metadata.filename = os.path.basename(filename)
+
+        if not show_coordinates and element.metadata.coordinates:
+            elements[i].metadata.coordinates = None
+
+        # Note(yuming): currently removing date from metadata
+        # since it should be fixed in the core library
+        if element.metadata.date:
+            elements[i].metadata.date = None
+
     if response_type == "text/csv":
         df = convert_to_dataframe(elements)
-        df["filename"] = os.path.basename(filename)
-        if not show_coordinates:
-            columns_to_drop = [
-                col
-                for col in [
-                    "coordinates_points",
-                    "coordinates_system",
-                    "coordinates_layout_width",
-                    "coordinates_layout_height",
-                ]
-                if col in df.columns
-            ]
-            if columns_to_drop:
-                df.drop(columns=columns_to_drop, inplace=True)
-
         return df.to_csv(index=False)
 
     result = convert_to_isd(elements)
-    for element in result:
-        element["metadata"]["filename"] = os.path.basename(filename)
-
-        if not show_coordinates and "coordinates" in element["metadata"]:
-            del element["metadata"]["coordinates"]
 
     return result
 
@@ -479,20 +483,21 @@ def return_content_type(filename):
 
 
 @router.post("/general/v0/general")
-@router.post("/general/v0.0.33/general")
+@router.post("/general/v0.0.34/general")
 def pipeline_1(
     request: Request,
     gz_uncompressed_content_type: Optional[str] = Form(default=None),
     files: Union[List[UploadFile], None] = File(default=None),
     output_format: Union[str, None] = Form(default=None),
-    strategy: List[str] = Form(default=[]),
     coordinates: List[str] = Form(default=[]),
-    ocr_languages: List[str] = Form(default=[]),
-    include_page_breaks: List[str] = Form(default=[]),
     encoding: List[str] = Form(default=[]),
-    xml_keep_tags: List[str] = Form(default=[]),
-    pdf_infer_table_structure: List[str] = Form(default=[]),
     hi_res_model_name: List[str] = Form(default=[]),
+    include_page_breaks: List[str] = Form(default=[]),
+    ocr_languages: List[str] = Form(default=[]),
+    pdf_infer_table_structure: List[str] = Form(default=[]),
+    skip_infer_table_types: List[str] = Form(default=[]),
+    strategy: List[str] = Form(default=[]),
+    xml_keep_tags: List[str] = Form(default=[]),
 ):
     if files:
         for file_index in range(len(files)):
@@ -532,14 +537,15 @@ def response_generator(is_multipart):
                 response = pipeline_api(
                     _file,
                     request=request,
-                    m_strategy=strategy,
                     m_coordinates=coordinates,
-                    m_ocr_languages=ocr_languages,
-                    m_include_page_breaks=include_page_breaks,
                     m_encoding=encoding,
-                    m_xml_keep_tags=xml_keep_tags,
-                    m_pdf_infer_table_structure=pdf_infer_table_structure,
                     m_hi_res_model_name=hi_res_model_name,
+                    m_include_page_breaks=include_page_breaks,
+                    m_ocr_languages=ocr_languages,
+                    m_pdf_infer_table_structure=pdf_infer_table_structure,
+                    m_skip_infer_table_types=skip_infer_table_types,
+                    m_strategy=strategy,
+                    m_xml_keep_tags=xml_keep_tags,
                     response_type=media_type,
                     filename=file.filename,
                     file_content_type=file_content_type,
 
@@ -1,2 +1,2 @@
 name: general
-version: 0.0.33
+version: 0.0.34
@@ -52,7 +52,7 @@ et-xmlfile==1.1.0
     # via openpyxl
 exceptiongroup==1.1.2
     # via anyio
-fastapi==0.100.0
+fastapi==0.100.1
     # via unstructured-api-tools
 fastjsonschema==2.18.0
     # via nbformat
@@ -126,7 +126,7 @@ lxml==4.9.3
     #   python-docx
     #   python-pptx
     #   unstructured
-markdown==3.4.3
+markdown==3.4.4
     # via unstructured
 markupsafe==2.1.3
     # via
@@ -146,9 +146,9 @@ mypy-extensions==1.0.0
     # via mypy
 nbclient==0.8.0
     # via nbconvert
-nbconvert==7.7.2
+nbconvert==7.7.3
     # via unstructured-api-tools
-nbformat==5.9.1
+nbformat==5.9.2
     # via
     #   nbclient
     #   nbconvert
@@ -203,7 +203,7 @@ pdfminer-six==20221105
     # via
     #   pdfplumber
     #   unstructured
-pdfplumber==0.10.1
+pdfplumber==0.10.2
     # via layoutparser
 pillow==10.0.0
     # via
@@ -217,7 +217,7 @@ pillow==10.0.0
     #   unstructured
 pkgutil-resolve-name==1.3.10
     # via jsonschema
-platformdirs==3.9.1
+platformdirs==3.10.0
     # via jupyter-core
 portalocker==2.7.0
     # via iopath
@@ -229,19 +229,19 @@ pycparser==2.21
     # via cffi
 pycryptodome==3.18.0
     # via -r requirements/base.in
-pydantic==1.10.11
+pydantic==1.10.12
     # via
     #   -r requirements/base.in
     #   fastapi
-pyflakes==3.0.1
+pyflakes==3.1.0
     # via autoflake
 pygments==2.15.1
     # via nbconvert
 pypandoc==1.11
     # via unstructured
 pyparsing==3.0.9
     # via matplotlib
-pypdf==3.13.0
+pypdf==3.14.0
     # via -r requirements/base.in
 pypdfium2==4.18.0
     # via pdfplumber
@@ -365,26 +365,28 @@ types-urllib3==1.26.25.14
     # via types-requests
 typing-extensions==4.7.1
     # via
+    #   annotated-types
     #   fastapi
     #   huggingface-hub
     #   iopath
     #   mypy
     #   pydantic
+    #   pydantic-core
     #   pypdf
     #   starlette
     #   torch
     #   uvicorn
 tzdata==2023.3
     # via pandas
-unstructured[local-inference]==0.8.1
+unstructured[local-inference]==0.8.7
     # via -r requirements/base.in
 unstructured-api-tools==0.10.10
     # via -r requirements/base.in
-unstructured-inference==0.5.5
+unstructured-inference==0.5.7
     # via unstructured
 urllib3==2.0.4
     # via requests
-uvicorn[standard]==0.23.1
+uvicorn[standard]==0.23.2
     # via unstructured-api-tools
 uvloop==0.17.0
     # via uvicorn
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`name: general`
`2`		`-version: 0.0.33`
	`2`	`+version: 0.0.34`