Skip to content

Commit 9d00e32

Browse files
authored
Merge branch 'main' into pprados/fix_password
2 parents b6f2bde + 0b4c72a commit 9d00e32

File tree

90 files changed

+4234
-1537
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

90 files changed

+4234
-1537
lines changed

.github/actions/base-cache/action.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,17 @@ runs:
3030
shell: bash
3131
run: |
3232
python${{ inputs.python-version }} -m pip install --upgrade virtualenv
33-
python${{ inputs.python-version }} -m venv .venv
33+
if [ ! -d ".venv" ]; then
34+
python${{ inputs.python-version }} -m venv .venv
35+
fi
3436
source .venv/bin/activate
3537
[ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
3638
if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then
3739
python -m ensurepip --upgrade
3840
python -m pip install --upgrade setuptools
3941
fi
4042
make install-ci
43+
make install-nltk-models
4144
- name: Save Cache
4245
if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
4346
id: virtualenv-cache-save

.github/actions/base-ingest-cache/action.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ runs:
1818
path: |
1919
.venv
2020
nltk_data
21-
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }}
21+
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }}
2222
lookup-only: ${{ inputs.check-only }}
2323
- name: Set up Python ${{ inputs.python-version }}
2424
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
@@ -39,6 +39,8 @@ runs:
3939
python -m pip install --upgrade setuptools
4040
fi
4141
make install-ci
42+
make install-nltk-models
43+
make install-all-docs
4244
make install-ingest
4345
- name: Save Ingest Cache
4446
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
@@ -48,5 +50,5 @@ runs:
4850
path: |
4951
.venv
5052
nltk_data
51-
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }}
53+
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }}
5254

.github/workflows/ci.yml

Lines changed: 6 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,15 @@ permissions:
1212
id-token: write
1313
contents: read
1414

15+
env:
16+
NLTK_DATA: ${{ github.workspace }}/nltk_data
17+
1518
jobs:
1619
setup:
1720
strategy:
1821
matrix:
1922
python-version: ["3.9","3.10","3.11", "3.12"]
2023
runs-on: ubuntu-latest
21-
env:
22-
NLTK_DATA: ${{ github.workspace }}/nltk_data
2324
steps:
2425
- uses: actions/checkout@v4
2526
- uses: ./.github/actions/base-cache
@@ -78,8 +79,6 @@ jobs:
7879
strategy:
7980
matrix:
8081
python-version: ["3.9","3.10","3.11"]
81-
env:
82-
NLTK_DATA: ${{ github.workspace }}/nltk_data
8382
runs-on: ubuntu-latest
8483
needs: [setup, changelog]
8584
steps:
@@ -153,41 +152,6 @@ jobs:
153152
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
154153
make check-coverage
155154
156-
test_chipper:
157-
strategy:
158-
matrix:
159-
python-version: ["3.10"]
160-
runs-on: ubuntu-latest
161-
env:
162-
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
163-
NLTK_DATA: ${{ github.workspace }}/nltk_data
164-
needs: [setup, lint]
165-
steps:
166-
- uses: actions/checkout@v4
167-
- name: Set up Python ${{ matrix.python-version }}
168-
uses: actions/setup-python@v5
169-
with:
170-
python-version: ${{ matrix.python-version }}
171-
- name: Setup virtual environment
172-
uses: ./.github/actions/base-cache
173-
with:
174-
python-version: ${{ matrix.python-version }}
175-
- name: Test
176-
env:
177-
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
178-
PYTHON: python${{ matrix.python-version }}
179-
NLTK_DATA: ${{ github.workspace }}/nltk_data
180-
run: |
181-
source .venv/bin/activate
182-
sudo apt-get update
183-
sudo apt-get install -y poppler-utils
184-
make install-pandoc install-test
185-
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
186-
sudo apt-get update
187-
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
188-
tesseract --version
189-
make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
190-
191155
test_unit_no_extras:
192156
strategy:
193157
matrix:
@@ -220,8 +184,6 @@ jobs:
220184
python-version: ["3.10"]
221185
extra: ["csv", "docx", "odt", "markdown", "pypandoc", "pdf-image", "pptx", "xlsx"]
222186
runs-on: ubuntu-latest
223-
env:
224-
NLTK_DATA: ${{ github.workspace }}/nltk_data
225187
needs: [setup, lint, test_unit_no_extras]
226188
steps:
227189
- uses: actions/checkout@v4
@@ -255,15 +217,14 @@ jobs:
255217
sudo apt-get update
256218
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
257219
tesseract --version
220+
make install-${{ matrix.extra }}
258221
make test-extra-${{ matrix.extra }} CI=true
259222
260223
setup_ingest:
261224
strategy:
262225
matrix:
263226
python-version: [ "3.9","3.10" ]
264227
runs-on: ubuntu-latest
265-
env:
266-
NLTK_DATA: ${{ github.workspace }}/nltk_data
267228
needs: [setup]
268229
steps:
269230
- uses: actions/checkout@v4
@@ -342,7 +303,6 @@ jobs:
342303
MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}}
343304
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
344305
CI: "true"
345-
NLTK_DATA: ${{ github.workspace }}/nltk_data
346306
PYTHON: python${{ matrix.python-version }}
347307
run: |
348308
source .venv/bin/activate
@@ -355,6 +315,8 @@ jobs:
355315
sudo apt-get install -y tesseract-ocr-kor
356316
sudo apt-get install diffstat
357317
tesseract --version
318+
make install-all-docs
319+
make install-ingest
358320
./test_unstructured_ingest/test-ingest-src.sh
359321
360322
@@ -364,8 +326,6 @@ jobs:
364326
# NOTE(yuming): Unstructured API only uses Python 3.10
365327
python-version: ["3.10"]
366328
runs-on: ubuntu-latest
367-
env:
368-
NLTK_DATA: ${{ github.workspace }}/nltk_data
369329
needs: [setup, lint]
370330
steps:
371331
- uses: actions/checkout@v4

CHANGELOG.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,24 @@
1-
## 0.16.1-dev0
1+
## 0.16.1
22

33
### Enhancements
44

5+
* **Bump `unstructured-inference` to 0.7.39** and upgrade other dependencies
6+
* **Round coordinates** Round coordinates when computing bounding box overlaps in `pdfminer_processing.py` to nearest machine precision. This can help reduce nondeterministic behavior from machine precision that affects which bounding boxes to combine.
7+
* **Request retry parameters in `partition_via_api` function.** Expose retry-mechanism related parameters in the `partition_via_api` function to allow users to configure the retry behavior of the API requests.
8+
59
### Features
610

11+
* **Parsing HTML to Unstructured Elements and back**
12+
713
### Fixes
814

15+
* **Remove unsupported chipper model**
916
* **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
17+
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
18+
* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file.
19+
* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for XLSX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
20+
* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
21+
* **Minify text_as_html from PPTX.** Previously `.metadata.text_as_html` for PPTX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text and structure.
1022

1123
## 0.16.0
1224

Makefile

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -138,12 +138,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
138138
.PHONY: test
139139
test:
140140
PYTHONPATH=. CI=$(CI) \
141-
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
142-
143-
.PHONY: test-chipper
144-
test-chipper:
145-
PYTHONPATH=. CI=$(CI) \
146-
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
141+
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
147142

148143
.PHONY: test-unstructured-api-unit
149144
test-unstructured-api-unit:
@@ -309,7 +304,7 @@ docker-test:
309304
$(DOCKER_IMAGE) \
310305
bash -c "CI=$(CI) \
311306
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
312-
pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
307+
pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
313308

314309
.PHONY: docker-smoke-test
315310
docker-smoke-test:

example-docs/empty.xlsx

8 KB
Binary file not shown.

requirements/base.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ filetype
44
python-magic
55
lxml
66
nltk
7-
tabulate
87
requests
98
beautifulsoup4
109
emoji
@@ -22,3 +21,4 @@ wrapt
2221
tqdm
2322
psutil
2423
python-oxmsg
24+
html5lib

requirements/base.txt

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
# pip-compile ./base.in
66
#
7-
anyio==4.6.0
7+
anyio==4.6.2.post1
88
# via httpx
99
backoff==2.2.1
1010
# via -r ./base.in
@@ -20,15 +20,15 @@ cffi==1.17.1
2020
# via cryptography
2121
chardet==5.2.0
2222
# via -r ./base.in
23-
charset-normalizer==3.3.2
23+
charset-normalizer==3.4.0
2424
# via
2525
# requests
2626
# unstructured-client
2727
click==8.1.7
2828
# via
2929
# nltk
3030
# python-oxmsg
31-
cryptography==43.0.1
31+
cryptography==43.0.3
3232
# via unstructured-client
3333
dataclasses-json==0.6.7
3434
# via
@@ -44,6 +44,8 @@ filetype==1.2.0
4444
# via -r ./base.in
4545
h11==0.14.0
4646
# via httpcore
47+
html5lib==1.1
48+
# via -r ./base.in
4749
httpcore==1.0.6
4850
# via httpx
4951
httpx==0.27.2
@@ -62,7 +64,7 @@ langdetect==1.0.9
6264
# via -r ./base.in
6365
lxml==5.3.0
6466
# via -r ./base.in
65-
marshmallow==3.22.0
67+
marshmallow==3.23.0
6668
# via
6769
# dataclasses-json
6870
# unstructured-client
@@ -84,15 +86,15 @@ packaging==24.1
8486
# via
8587
# marshmallow
8688
# unstructured-client
87-
psutil==6.0.0
89+
psutil==6.1.0
8890
# via -r ./base.in
8991
pycparser==2.22
9092
# via cffi
9193
pypdf==5.0.1
9294
# via unstructured-client
9395
python-dateutil==2.9.0.post0
9496
# via unstructured-client
95-
python-iso639==2024.4.27
97+
python-iso639==2024.10.22
9698
# via -r ./base.in
9799
python-magic==0.4.27
98100
# via -r ./base.in
@@ -111,6 +113,7 @@ requests-toolbelt==1.0.0
111113
# via unstructured-client
112114
six==1.16.0
113115
# via
116+
# html5lib
114117
# langdetect
115118
# python-dateutil
116119
# unstructured-client
@@ -120,8 +123,6 @@ sniffio==1.3.1
120123
# httpx
121124
soupsieve==2.6
122125
# via beautifulsoup4
123-
tabulate==0.9.0
124-
# via -r ./base.in
125126
tqdm==4.66.5
126127
# via
127128
# -r ./base.in
@@ -147,5 +148,7 @@ urllib3==1.26.20
147148
# -c ././deps/constraints.txt
148149
# requests
149150
# unstructured-client
151+
webencodings==0.5.1
152+
# via html5lib
150153
wrapt==1.16.0
151154
# via -r ./base.in

requirements/deps/constraints.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@ botocore<1.34.132
1717
# TODO: Constraint due to both 8.5.0 and 8.4.0 being installed during pip-compile
1818
importlib-metadata>=8.5.0
1919
# (austin): Versions below this have a different interface for passing parameters
20-
unstructured-client>=0.23.0
20+
unstructured-client>=0.23.0,<0.26.0

requirements/dev.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
# pip-compile ./dev.in
66
#
7-
build==1.2.2
7+
build==1.2.2.post1
88
# via pip-tools
99
cfgv==3.4.0
1010
# via pre-commit
@@ -13,7 +13,7 @@ click==8.1.7
1313
# -c ./base.txt
1414
# -c ./test.txt
1515
# pip-tools
16-
distlib==0.3.8
16+
distlib==0.3.9
1717
# via virtualenv
1818
filelock==3.16.1
1919
# via virtualenv
@@ -36,7 +36,7 @@ platformdirs==4.3.6
3636
# via
3737
# -c ./test.txt
3838
# virtualenv
39-
pre-commit==3.8.0
39+
pre-commit==4.0.1
4040
# via -r ./dev.in
4141
pyproject-hooks==1.2.0
4242
# via
@@ -51,7 +51,7 @@ tomli==2.0.2
5151
# -c ./test.txt
5252
# build
5353
# pip-tools
54-
virtualenv==20.26.6
54+
virtualenv==20.27.0
5555
# via pre-commit
5656
wheel==0.44.0
5757
# via pip-tools

0 commit comments

Comments
 (0)