Skip to content

Commit 72da165

Browse files
authored
Merge branch 'main' into pprados/fix_password
2 parents eb43642 + 451ad97 commit 72da165

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+2236
-1469
lines changed

.github/workflows/ingest-test-fixtures-update-pr.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ jobs:
109109
sudo apt-get install -y tesseract-ocr-kor
110110
sudo apt-get install diffstat
111111
tesseract --version
112+
python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
112113
./test_unstructured_ingest/test-ingest-src.sh
113114
114115
- name: Save branch name to environment file

CHANGELOG.md

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,54 @@
1+
## 0.16.19-dev2
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
- **Fix a bug where table extraction is skipped when it shouldn't be**. Pages whose only content is a table, or that start with a table, missed table extraction. The routing logic is now fixed.
9+
- **Correct deprecated `ruff` invocation in `make tidy`**. This will future-proof it or avoid surprises if someone happens to upgrade Ruff.
10+
- **Remove upper bound constraint on python version** in setup.py. Python 3.13 is not yet officially supported, but this allows users to try it.
11+
12+
## 0.16.17
13+
14+
### Enhancements
15+
- **Refactoring the VoyageAI integration** to use voyageai package directly, allowing extra features.
16+
17+
### Features
18+
19+
### Fixes
20+
- **Fix a bug where `build_layout_elements_from_cor_regions` incorrectly joins texts in wrong order**.
21+
22+
## 0.16.16
23+
24+
### Enhancements
25+
26+
### Features
27+
- **Vectorize layout (inferred, extracted, and OCR) data structure** Using `np.ndarray` to store a group of layout elements or text regions instead of using a list of objects. This improves the memory efficiency and compute speed around layout merging and deduplication.
28+
29+
### Fixes
30+
- **Add auto-download for NLTK for Python Environment** When a user imports tokenize, the NLTK data is now downloaded automatically from the `tokenize.py` file. Added an `AUTO_DOWNLOAD_NLTK` flag in `tokenize.py` to download `NLTK_DATA`.
31+
- **Correctly patch pdfminer to avoid PDF repair**. The patch applied to pdfminer's parser caused it to occasionally split tokens in content streams, throwing `PDFSyntaxError`. Repairing these PDFs sometimes failed (since they were not actually invalid) resulting in unnecessary OCR fallback.
32+
- **Drop usage of ndjson dependency**
33+
34+
## 0.16.15
35+
### Enhancements
36+
37+
### Features
38+
39+
### Fixes
40+
- **Update `unstructured-inference`** to 0.8.6 in requirements which removed `layoutparser` dependency libs
41+
- **Update `pdfminer-six` to 20240706**
42+
43+
## 0.16.14
44+
45+
### Enhancements
46+
47+
### Features
48+
49+
### Fixes
50+
- **Fix an issue with multiple values for `infer_table_structure`** when partitioning an email with image attachments: the kwargs passed into `partition` to partition the image already contain `infer_table_structure`. Now the `partition` function checks whether the kwargs already contain `infer_table_structure`.
51+
152
## 0.16.13
253

354
### Enhancements

Dockerfile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base
22

33
ARG PYTHON=python3.11
4-
ARG PIP=pip3.11
4+
ARG PIP="${PYTHON} -m pip"
55

66
USER root
77

@@ -19,6 +19,9 @@ RUN chown -R notebook-user:notebook-user /app && \
1919

2020
USER notebook-user
2121

22+
# append PATH before pip install to avoid warning logs; it also avoids issues with packages that need compilation during installation
23+
ENV PATH="${PATH}:/home/notebook-user/.local/bin"
24+
ENV TESSDATA_PREFIX=/usr/local/share/tessdata
2225
ENV NLTK_DATA=/home/notebook-user/nltk_data
2326

2427
# Install Python dependencies and download required NLTK packages
@@ -28,7 +31,4 @@ RUN find requirements/ -type f -name "*.txt" -exec $PIP install --no-cache-dir -
2831
$PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
2932
$PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
3033

31-
ENV PATH="${PATH}:/home/notebook-user/.local/bin"
32-
ENV TESSDATA_PREFIX=/usr/local/share/tessdata
33-
3434
CMD ["/bin/bash"]

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ tidy-shell:
254254

255255
.PHONY: tidy-python
256256
tidy-python:
257-
ruff . --fix-only || true
257+
ruff check . --fix-only || true
258258
autoflake --in-place .
259259
black --line-length=100 .
260260

@@ -308,7 +308,7 @@ docker-test:
308308
$(DOCKER_IMAGE) \
309309
bash -c "CI=$(CI) \
310310
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
311-
pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
311+
python3 -m pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
312312

313313
.PHONY: docker-smoke-test
314314
docker-smoke-test:

example-docs/pdf/single_table.pdf

77.3 KB
Binary file not shown.

pyproject.toml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,24 @@
11
[tool.black]
22
line-length = 100
3+
exclude = '''
4+
/(
5+
\.venv # Ignore virtual environment directory
6+
| \.git
7+
| \.hg
8+
| \.mypy_cache
9+
| \.tox
10+
| \.nox
11+
| \.env
12+
| \.pytest_cache
13+
| \.venv
14+
| _build
15+
| buck-out
16+
| build
17+
| dist
18+
| unstructured/nlp/patterns\.py
19+
)/|
20+
unstructured/nlp/patterns\.py
21+
'''
322

423
[tool.pyright]
524
pythonPlatform = "Linux"

requirements/base.in

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,3 @@ tqdm
2222
psutil
2323
python-oxmsg
2424
html5lib
25-
ndjson

requirements/base.txt

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
# pip-compile ./base.in
66
#
7-
anyio==4.7.0
7+
anyio==4.8.0
88
# via httpx
99
backoff==2.2.1
1010
# via -r ./base.in
@@ -36,7 +36,7 @@ dataclasses-json==0.6.7
3636
# unstructured-client
3737
deepdiff==8.1.1
3838
# via unstructured-client
39-
emoji==2.14.0
39+
emoji==2.14.1
4040
# via -r ./base.in
4141
exceptiongroup==1.2.2
4242
# via anyio
@@ -64,16 +64,14 @@ langdetect==1.0.9
6464
# via -r ./base.in
6565
lxml==5.3.0
6666
# via -r ./base.in
67-
marshmallow==3.23.2
67+
marshmallow==3.26.0
6868
# via
6969
# dataclasses-json
7070
# unstructured-client
7171
mypy-extensions==1.0.0
7272
# via
7373
# typing-inspect
7474
# unstructured-client
75-
ndjson==0.3.1
76-
# via -r ./base.in
7775
nest-asyncio==1.6.0
7876
# via unstructured-client
7977
nltk==3.9.1
@@ -92,11 +90,11 @@ psutil==6.1.1
9290
# via -r ./base.in
9391
pycparser==2.22
9492
# via cffi
95-
pypdf==5.1.0
93+
pypdf==5.2.0
9694
# via unstructured-client
9795
python-dateutil==2.9.0.post0
9896
# via unstructured-client
99-
python-iso639==2024.10.22
97+
python-iso639==2025.1.28
10098
# via -r ./base.in
10199
python-magic==0.4.27
102100
# via -r ./base.in
@@ -150,5 +148,5 @@ urllib3==1.26.20
150148
# unstructured-client
151149
webencodings==0.5.1
152150
# via html5lib
153-
wrapt==1.17.0
151+
wrapt==1.17.2
154152
# via -r ./base.in

requirements/deps/constraints.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
# extras. Putting a dependency here will only affect dependency sets that contain them -- in other
44
# words, if something does not require a constraint, it will not be installed.
55
####################################################################################################
6-
# (jennings): Versions greater than 5.0 create dependency conflicts with other packages
7-
protobuf<5.0
6+
# we are using v3 client https://weaviate.io/developers/weaviate/client-libraries/python/python_v3
7+
weaviate-client>=3.26.7,<4.0.0
88
# TODO: Constraint due to multiple versions being installed during pip-compile
99
grpcio>=1.65.5
1010
# TODO: Pinned in transformers package, remove when that gets updated (https://github.com/huggingface/transformers/blob/main/setup.py)

requirements/dev.txt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@ click==8.1.8
1515
# pip-tools
1616
distlib==0.3.9
1717
# via virtualenv
18-
filelock==3.16.1
18+
filelock==3.17.0
1919
# via virtualenv
20-
identify==2.6.4
20+
identify==2.6.6
2121
# via pre-commit
22-
importlib-metadata==8.5.0
22+
importlib-metadata==8.6.1
2323
# via
2424
# -c ././deps/constraints.txt
2525
# build
@@ -36,7 +36,7 @@ platformdirs==4.3.6
3636
# via
3737
# -c ./test.txt
3838
# virtualenv
39-
pre-commit==4.0.1
39+
pre-commit==4.1.0
4040
# via -r ./dev.in
4141
pyproject-hooks==1.2.0
4242
# via
@@ -51,7 +51,7 @@ tomli==2.2.1
5151
# -c ./test.txt
5252
# build
5353
# pip-tools
54-
virtualenv==20.28.1
54+
virtualenv==20.29.1
5555
# via pre-commit
5656
wheel==0.45.1
5757
# via pip-tools

0 commit comments

Comments
 (0)