Unstructured-IO
diff --git a/‎.github/actions/base-cache/action.yml‎
Lines changed: 4 additions & 1 deletion b/‎.github/actions/base-cache/action.yml‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎.github/actions/base-ingest-cache/action.yml‎
Lines changed: 4 additions & 2 deletions b/‎.github/actions/base-ingest-cache/action.yml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 6 additions & 46 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 6 additions & 46 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 13 additions & 1 deletion b/‎CHANGELOG.md‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎Makefile‎
Lines changed: 2 additions & 7 deletions b/‎Makefile‎
Lines changed: 2 additions & 7 deletions
diff --git a/‎example-docs/empty.xlsx‎
8 KB b/‎example-docs/empty.xlsx‎
8 KB
diff --git a/‎requirements/base.in‎
Lines changed: 1 addition & 1 deletion b/‎requirements/base.in‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎requirements/base.txt‎
Lines changed: 11 additions & 8 deletions b/‎requirements/base.txt‎
Lines changed: 11 additions & 8 deletions
diff --git a/‎requirements/deps/constraints.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements/deps/constraints.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎requirements/dev.txt‎
Lines changed: 4 additions & 4 deletions b/‎requirements/dev.txt‎
Lines changed: 4 additions & 4 deletions
@@ -30,14 +30,17 @@ runs:
       shell: bash
       run: |
         python${{ inputs.python-version }} -m pip install --upgrade virtualenv
-        python${{ inputs.python-version }} -m venv .venv
+        if [ ! -d ".venv" ]; then
+          python${{ inputs.python-version }} -m venv .venv
+        fi
         source .venv/bin/activate
         [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
         if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then
           python -m ensurepip --upgrade
           python -m pip install --upgrade setuptools
         fi
         make install-ci
+        make install-nltk-models
     - name: Save Cache
       if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
       id: virtualenv-cache-save
 
@@ -18,7 +18,7 @@ runs:
         path: |
           .venv
           nltk_data
-        key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }}
+        key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }}
         lookup-only: ${{ inputs.check-only }}
     - name: Set up Python ${{ inputs.python-version }}
       if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
@@ -39,6 +39,8 @@ runs:
           python -m pip install --upgrade setuptools
         fi
         make install-ci
+        make install-nltk-models
+        make install-all-docs
         make install-ingest
     - name: Save Ingest Cache
       if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
@@ -48,5 +50,5 @@ runs:
         path: |
           .venv
           nltk_data
-        key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }}
+        key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }}
 
@@ -12,14 +12,15 @@ permissions:
   id-token: write
   contents: read
 
+env:
+  NLTK_DATA: ${{ github.workspace }}/nltk_data
+
 jobs:
   setup:
     strategy:
       matrix:
         python-version: ["3.9","3.10","3.11", "3.12"]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     steps:
     - uses: actions/checkout@v4
     - uses: ./.github/actions/base-cache
@@ -78,8 +79,6 @@ jobs:
     strategy:
       matrix:
         python-version: ["3.9","3.10","3.11"]
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     runs-on: ubuntu-latest
     needs: [setup, changelog]
     steps:
@@ -153,41 +152,6 @@ jobs:
         make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
         make check-coverage
 
-  test_chipper:
-    strategy:
-      matrix:
-        python-version: ["3.10"]
-    runs-on: ubuntu-latest
-    env:
-      UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
-    needs: [setup, lint]
-    steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v5
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Setup virtual environment
-      uses: ./.github/actions/base-cache
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Test
-      env:
-        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
-        PYTHON: python${{ matrix.python-version }}
-        NLTK_DATA: ${{ github.workspace }}/nltk_data
-      run: |
-        source .venv/bin/activate
-        sudo apt-get update
-        sudo apt-get install -y poppler-utils
-        make install-pandoc install-test
-        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
-        sudo apt-get update
-        sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
-        tesseract --version
-        make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
-
   test_unit_no_extras:
     strategy:
       matrix:
@@ -220,8 +184,6 @@ jobs:
         python-version: ["3.10"]
         extra: ["csv", "docx", "odt", "markdown", "pypandoc", "pdf-image", "pptx", "xlsx"]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup, lint, test_unit_no_extras]
     steps:
     - uses: actions/checkout@v4
@@ -255,15 +217,14 @@ jobs:
         sudo apt-get update
         sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
         tesseract --version
+        make install-${{ matrix.extra }}
         make test-extra-${{ matrix.extra }} CI=true
 
   setup_ingest:
     strategy:
       matrix:
         python-version: [ "3.9","3.10" ]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup]
     steps:
       - uses: actions/checkout@v4
@@ -342,7 +303,6 @@ jobs:
         MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}}
         OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
         CI: "true"
-        NLTK_DATA: ${{ github.workspace }}/nltk_data
         PYTHON: python${{ matrix.python-version }}
       run: |
         source .venv/bin/activate
@@ -355,6 +315,8 @@ jobs:
         sudo apt-get install -y tesseract-ocr-kor
         sudo apt-get install diffstat
         tesseract --version
+        make install-all-docs
+        make install-ingest
         ./test_unstructured_ingest/test-ingest-src.sh
 
 
@@ -364,8 +326,6 @@ jobs:
         # NOTE(yuming): Unstructured API only uses Python 3.10
         python-version: ["3.10"]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup, lint]
     steps:
     - uses: actions/checkout@v4
 
@@ -1,12 +1,24 @@
-## 0.16.1-dev0
+## 0.16.1
 
 ### Enhancements
 
+* **Bump `unstructured-inference` to 0.7.39** and upgrade other dependencies
+* **Round coordinates** Round coordinates when computing bounding box overlaps in `pdfminer_processing.py` to nearest machine precision. This can help reduce nondeterministic behavior from machine precision that affects which bounding boxes to combine.
+* **Request retry parameters in `partition_via_api` function.** Expose retry-mechanism related parameters in the `partition_via_api` function to allow users to configure the retry behavior of the API requests.
+
 ### Features
 
+* **Parsing HTML to Unstructured Elements and back**
+
 ### Fixes
 
+* **Remove unsupported chipper model**
 * **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
+* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
+* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file.
+* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for XLSX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
+* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
+* **Minify text_as_html from PPTX.** Previously `.metadata.text_as_html` for PPTX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text and structure.
 
 ## 0.16.0
 
 
@@ -138,12 +138,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
 .PHONY: test
 test:
 	PYTHONPATH=. CI=$(CI) \
-	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
-
-.PHONY: test-chipper
-test-chipper:
-	PYTHONPATH=. CI=$(CI) \
-	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
+	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
 
 .PHONY: test-unstructured-api-unit
 test-unstructured-api-unit:
@@ -309,7 +304,7 @@ docker-test:
 	$(DOCKER_IMAGE) \
 	bash -c "CI=$(CI) \
 	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
-	pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
+	pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
 
 .PHONY: docker-smoke-test
 docker-smoke-test:
 
@@ -4,7 +4,6 @@ filetype
 python-magic
 lxml
 nltk
-tabulate
 requests
 beautifulsoup4
 emoji
@@ -22,3 +21,4 @@ wrapt
 tqdm
 psutil
 python-oxmsg
+html5lib
@@ -4,7 +4,7 @@
 #
 #    pip-compile ./base.in
 #
-anyio==4.6.0
+anyio==4.6.2.post1
     # via httpx
 backoff==2.2.1
     # via -r ./base.in
@@ -20,15 +20,15 @@ cffi==1.17.1
     # via cryptography
 chardet==5.2.0
     # via -r ./base.in
-charset-normalizer==3.3.2
+charset-normalizer==3.4.0
     # via
     #   requests
     #   unstructured-client
 click==8.1.7
     # via
     #   nltk
     #   python-oxmsg
-cryptography==43.0.1
+cryptography==43.0.3
     # via unstructured-client
 dataclasses-json==0.6.7
     # via
@@ -44,6 +44,8 @@ filetype==1.2.0
     # via -r ./base.in
 h11==0.14.0
     # via httpcore
+html5lib==1.1
+    # via -r ./base.in
 httpcore==1.0.6
     # via httpx
 httpx==0.27.2
@@ -62,7 +64,7 @@ langdetect==1.0.9
     # via -r ./base.in
 lxml==5.3.0
     # via -r ./base.in
-marshmallow==3.22.0
+marshmallow==3.23.0
     # via
     #   dataclasses-json
     #   unstructured-client
@@ -84,15 +86,15 @@ packaging==24.1
     # via
     #   marshmallow
     #   unstructured-client
-psutil==6.0.0
+psutil==6.1.0
     # via -r ./base.in
 pycparser==2.22
     # via cffi
 pypdf==5.0.1
     # via unstructured-client
 python-dateutil==2.9.0.post0
     # via unstructured-client
-python-iso639==2024.4.27
+python-iso639==2024.10.22
     # via -r ./base.in
 python-magic==0.4.27
     # via -r ./base.in
@@ -111,6 +113,7 @@ requests-toolbelt==1.0.0
     # via unstructured-client
 six==1.16.0
     # via
+    #   html5lib
     #   langdetect
     #   python-dateutil
     #   unstructured-client
@@ -120,8 +123,6 @@ sniffio==1.3.1
     #   httpx
 soupsieve==2.6
     # via beautifulsoup4
-tabulate==0.9.0
-    # via -r ./base.in
 tqdm==4.66.5
     # via
     #   -r ./base.in
@@ -147,5 +148,7 @@ urllib3==1.26.20
     #   -c ././deps/constraints.txt
     #   requests
     #   unstructured-client
+webencodings==0.5.1
+    # via html5lib
 wrapt==1.16.0
     # via -r ./base.in
@@ -17,4 +17,4 @@ botocore<1.34.132
 # TODO: Constraint due to both 8.5.0 and 8.4.0 being installed during pip-compile
 importlib-metadata>=8.5.0
 # (austin): Versions below this have a different interface for passing parameters
-unstructured-client>=0.23.0
+unstructured-client>=0.23.0,<0.26.0
@@ -4,7 +4,7 @@
 #
 #    pip-compile ./dev.in
 #
-build==1.2.2
+build==1.2.2.post1
     # via pip-tools
 cfgv==3.4.0
     # via pre-commit
@@ -13,7 +13,7 @@ click==8.1.7
     #   -c ./base.txt
     #   -c ./test.txt
     #   pip-tools
-distlib==0.3.8
+distlib==0.3.9
     # via virtualenv
 filelock==3.16.1
     # via virtualenv
@@ -36,7 +36,7 @@ platformdirs==4.3.6
     # via
     #   -c ./test.txt
     #   virtualenv
-pre-commit==3.8.0
+pre-commit==4.0.1
     # via -r ./dev.in
 pyproject-hooks==1.2.0
     # via
@@ -51,7 +51,7 @@ tomli==2.0.2
     #   -c ./test.txt
     #   build
     #   pip-tools
-virtualenv==20.26.6
+virtualenv==20.27.0
     # via pre-commit
 wheel==0.44.0
     # via pip-tools