Chore: add support for xml_keep_tags param (#136)

shreyanid · web-flow · commit 5a851a0a3ee9 · 2023-06-29T15:32:33.000-07:00
* merge with encoding param

* wrote xml keep tags param test

* update changelog and readme

* bump requirements

* remove spaces in readme curl sample
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,7 @@
 ## 0.0.30-dev1
 
 * Add support for `encoding` parameter
+* Add support for `xml_keep_tags` parameter
 * Add env variables for additional parallel mode tweaking
 
 ## 0.0.29
diff --git a/README.md b/README.md
@@ -107,6 +107,19 @@ curl -X 'POST'
  | jq -C . | less -R
 ```
 
+#### XML Tags
+
+When processing XML documents, set the `xml_keep_tags` parameter to `true` to retain the XML tags in the output. If the parameter is not specified, only the text inside the tags is extracted.
+
+```
+curl -X 'POST' \
+ 'https://api.unstructured.io/general/v0/general' \
+ -H 'accept: application/json'  \
+ -H 'Content-Type: multipart/form-data' \
+ -F 'files=@sample-docs/fake-xml.xml' \
+ -F 'xml_keep_tags=true' \
+ | jq -C . | less -R
+```
 
 ## Developer Quick Start
 
diff --git a/pipeline-notebooks/pipeline-general.ipynb b/pipeline-notebooks/pipeline-general.ipynb
@@ -725,6 +725,7 @@
     "    m_coordinates=[],\n",
     "    m_ocr_languages=[],\n",
     "    m_encoding=[],\n",
+    "    m_xml_keep_tags=[],\n",
     "    file_content_type=None,\n",
     "    response_type=\"application/json\"\n",
     "):\n",
@@ -752,6 +753,9 @@
     "\n",
     "    encoding = m_encoding[0] if len(m_encoding) else None\n",
     "    \n",
+    "    xml_keep_tags_str = (m_xml_keep_tags[0] if len(m_xml_keep_tags) else \"false\").lower()\n",
+    "    xml_keep_tags = xml_keep_tags_str == \"true\"\n",
+    "    \n",
     "    try:\n",
     "        if file_content_type == \"application/pdf\" and pdf_parallel_mode_enabled:\n",
     "            elements = partition_pdf_splits(\n",
@@ -772,6 +776,7 @@
     "                strategy=strategy,\n",
     "                ocr_languages=ocr_languages,\n",
     "                encoding=encoding,\n",
+    "                xml_keep_tags=xml_keep_tags,\n",
     "            )\n",
     "    except ValueError as e:\n",
     "        if 'Invalid file' in e.args[0]:\n",
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
@@ -183,6 +183,7 @@ def pipeline_api(
     m_coordinates=[],
     m_ocr_languages=[],
     m_encoding=[],
+    m_xml_keep_tags=[],
     file_content_type=None,
     response_type="application/json",
 ):
@@ -209,6 +210,9 @@ def pipeline_api(
 
     encoding = m_encoding[0] if len(m_encoding) else None
 
+    xml_keep_tags_str = (m_xml_keep_tags[0] if len(m_xml_keep_tags) else "false").lower()
+    xml_keep_tags = xml_keep_tags_str == "true"
+
     try:
         if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
             elements = partition_pdf_splits(
@@ -229,6 +233,7 @@ def pipeline_api(
                 strategy=strategy,
                 ocr_languages=ocr_languages,
                 encoding=encoding,
+                xml_keep_tags=xml_keep_tags,
             )
     except ValueError as e:
         if "Invalid file" in e.args[0]:
@@ -372,6 +377,7 @@ def pipeline_1(
     coordinates: List[str] = Form(default=[]),
     ocr_languages: List[str] = Form(default=[]),
     encoding: List[str] = Form(default=[]),
+    xml_keep_tags: List[str] = Form(default=[]),
 ):
     if files:
         for file_index in range(len(files)):
@@ -410,6 +416,7 @@ def response_generator(is_multipart):
                     m_coordinates=coordinates,
                     m_ocr_languages=ocr_languages,
                     m_encoding=encoding,
+                    m_xml_keep_tags=xml_keep_tags,
                     response_type=media_type,
                     filename=file.filename,
                     file_content_type=file_content_type,
diff --git a/requirements/base.txt b/requirements/base.txt
@@ -11,7 +11,7 @@ anyio==3.7.0
     #   httpcore
     #   starlette
     #   watchfiles
-argilla==1.11.0
+argilla==1.12.0
     # via unstructured
 attrs==23.1.0
     # via jsonschema
@@ -115,7 +115,7 @@ jinja2==3.1.2
     #   nbconvert
     #   torch
     #   unstructured-api-tools
-joblib==1.3.0
+joblib==1.3.1
     # via nltk
 jsonschema==4.17.3
     # via nbformat
@@ -369,7 +369,7 @@ traitlets==5.9.0
     #   nbformat
 transformers==4.30.2
     # via unstructured-inference
-typer==0.9.0
+typer==0.7.0
     # via argilla
 types-requests==2.31.0.1
     # via unstructured-api-tools
@@ -387,7 +387,6 @@ typing-extensions==4.7.0
     #   rich
     #   starlette
     #   torch
-    #   typer
 unstructured[local-inference]==0.7.10
     # via -r requirements/base.in
 unstructured-api-tools==0.10.7
diff --git a/requirements/test.txt b/requirements/test.txt
@@ -19,7 +19,7 @@ appnope==0.1.3
     # via
     #   ipykernel
     #   ipython
-argilla==1.11.0
+argilla==1.12.0
     # via
     #   -r requirements/base.txt
     #   unstructured
@@ -266,7 +266,7 @@ jinja2==3.1.2
     #   notebook
     #   torch
     #   unstructured-api-tools
-joblib==1.3.0
+joblib==1.3.1
     # via
     #   -r requirements/base.txt
     #   nltk
@@ -812,7 +812,7 @@ transformers==4.30.2
     # via
     #   -r requirements/base.txt
     #   unstructured-inference
-typer==0.9.0
+typer==0.7.0
     # via
     #   -r requirements/base.txt
     #   argilla
@@ -841,7 +841,6 @@ typing-extensions==4.7.0
     #   rich
     #   starlette
     #   torch
-    #   typer
 unstructured[local-inference]==0.7.10
     # via -r requirements/base.txt
 unstructured-api-tools==0.10.7
diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
@@ -4,6 +4,7 @@
 import json
 import io
 import pytest
+import re
 import requests
 import ast
 import pandas as pd
@@ -236,6 +237,53 @@ def test_api_with_different_encodings():
     assert "invalid start byte" in str(excinfo.value)
 
 
+def test_xml_keep_tags_param():
+    """
+    Verify that responses do not include xml tags unless requested
+    """
+    client = TestClient(app)
+    test_file = Path("sample-docs") / "fake-xml.xml"
+    response = client.post(
+        MAIN_API_ROUTE,
+        files=[("files", (str(test_file), open(test_file, "rb")))],
+        data={"strategy": "hi_res"},
+    )
+    assert response.status_code == 200
+    response_without_xml_tags = response.json()
+
+    response = client.post(
+        MAIN_API_ROUTE,
+        files=[("files", (str(test_file), open(test_file, "rb")))],
+        data={"xml_keep_tags": "true", "strategy": "hi_res"},
+    )
+    assert response.status_code == 200
+    response_with_xml_tags = response.json()[3:]  # skip the initial encoding tag(s)
+
+    # The responses should have the same content except for the xml tags
+    response_with_xml_tags_index, response_without_xml_tags_index = 0, 0
+    while response_without_xml_tags_index < len(response_without_xml_tags):
+        xml_tagged_line = response_with_xml_tags[response_with_xml_tags_index]["text"]
+        assert xml_tagged_line.startswith("<")
+        assert xml_tagged_line.endswith(">")
+
+        # if there is content on this line, ensure it matches the content on the non tagged line
+        xml_tagged_line_content = xml_tagged_line.split(">", 1)[1]  # remove opening tag
+        if not xml_tagged_line_content:
+            response_with_xml_tags_index += 1
+
+        else:
+            xml_tagged_line_content = xml_tagged_line_content.split("<", 1)[0]  # remove closing tag
+
+            xml_untagged_line = response_without_xml_tags[response_without_xml_tags_index]["text"]
+            xml_tagged_line_content_parsed = re.sub(
+                "&amp;", "&", xml_tagged_line_content
+            )  # xml_keep_tags does not currently parse the inner content
+            assert xml_tagged_line_content_parsed == xml_untagged_line
+
+            response_with_xml_tags_index += 1
+            response_without_xml_tags_index += 1
+
+
 @pytest.mark.parametrize(
     "example_filename",
     [