Skip to content

Commit c0b945e

Browse files
authored
chore: bump unstructured to 0.10.23 (#285)
Note: I added a workaround for [this bug](https://github.com/Unstructured-IO/unstructured/issues/1754).
1 parent 92908f2 commit c0b945e

File tree

6 files changed

+41
-34
lines changed

6 files changed

+41
-34
lines changed

.github/workflows/ci.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020
with:
2121
path: |
2222
.venv
23-
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
23+
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }}
2424
- name: Set up Python ${{ env.PYTHON_VERSION }}
2525
uses: actions/setup-python@v4
2626
with:
@@ -42,7 +42,7 @@ jobs:
4242
with:
4343
path: |
4444
.venv
45-
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
45+
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }}
4646
- name: Lint
4747
run: |
4848
source .venv/bin/activate
@@ -65,7 +65,7 @@ jobs:
6565
with:
6666
path: |
6767
.venv
68-
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
68+
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }}
6969
- name: Run core tests
7070
run: |
7171
source .venv/bin/activate
@@ -106,7 +106,7 @@ jobs:
106106
with:
107107
path: |
108108
.venv
109-
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
109+
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }}
110110
- name: Test Dockerfile
111111
run: |
112112
source .venv/bin/activate

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
## 0.0.53-dev0
1+
## 0.0.53
22

3+
* Bump unstructured to 0.10.23
34
* Simplify the error message for BadZipFile errors
45

56
## 0.0.52

Dockerfile

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,8 @@ USER ${NB_USER}
3535

3636
FROM python-deps as model-deps
3737

38-
# Note(Austin) - Unstructured 0.10.20 has some broken imports in ingest
39-
# Not relevant here - remove the imports for now
4038
RUN python3.10 -c "import nltk; nltk.download('punkt')" && \
4139
python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \
42-
sed -i '/Chunker/d' ~/.local/lib/python3.10/site-packages/unstructured/ingest/pipeline/__init__.py && \
43-
sed -i '/Embedder/d' ~/.local/lib/python3.10/site-packages/unstructured/ingest/pipeline/__init__.py && \
4440
python3.10 -c "from unstructured.ingest.pipeline.initialize import initialize; initialize()"
4541

4642
FROM model-deps as code

prepline_general/api/general.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ def partition_pdf_splits(
197197
# If it's small enough, just process locally
198198
# (Some kwargs need to be renamed for local partition)
199199
if len(pdf_pages) <= pages_per_pdf:
200-
if "hi_res_model_name" in partition_kwargs:
200+
if partition_kwargs.get("hi_res_model_name"):
201201
partition_kwargs["model_name"] = partition_kwargs.pop("hi_res_model_name")
202202

203203
return partition(
@@ -406,6 +406,13 @@ def pipeline_api(
406406
)
407407
)
408408

409+
# TODO(austin) - Latest unstructured won't accept model_name=None
410+
# Just pass if it's set until the fix is released
411+
# https://github.com/Unstructured-IO/unstructured/issues/1754
412+
kwargs = {}
413+
if hi_res_model_name:
414+
kwargs["model_name"] = hi_res_model_name
415+
409416
# Be careful of naming differences in api params vs partition params!
410417
# These kwargs are going back into the api, not into partition
411418
# If there's a difference, remap the param in partition_pdf_splits
@@ -440,7 +447,6 @@ def pipeline_api(
440447
# partition_kwargs
441448
encoding=encoding,
442449
include_page_breaks=include_page_breaks,
443-
model_name=hi_res_model_name,
444450
ocr_languages=ocr_languages,
445451
pdf_infer_table_structure=pdf_infer_table_structure,
446452
skip_infer_table_types=skip_infer_table_types,
@@ -451,6 +457,7 @@ def pipeline_api(
451457
multipage_sections=multipage_sections,
452458
combine_under_n_chars=combine_under_n_chars,
453459
new_after_n_chars=new_after_n_chars,
460+
**kwargs,
454461
)
455462
except ValueError as e:
456463
if "Invalid file" in e.args[0]:

requirements/base.txt

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,12 @@ msg-parser==1.2.0
115115
mypy-extensions==1.0.0
116116
# via typing-inspect
117117
networkx==3.1
118-
# via torch
118+
# via
119+
# torch
120+
# unstructured
119121
nltk==3.8.1
120122
# via unstructured
121-
numpy==1.26.0
123+
numpy==1.26.1
122124
# via
123125
# contourpy
124126
# layoutparser
@@ -137,7 +139,7 @@ olefile==0.46
137139
omegaconf==2.3.0
138140
# via effdet
139141
onnx==1.14.1
140-
# via unstructured-inference
142+
# via unstructured
141143
onnxruntime==1.15.1
142144
# via unstructured-inference
143145
opencv-python==4.8.1.78
@@ -169,7 +171,7 @@ pdfminer-six==20221105
169171
# unstructured
170172
pdfplumber==0.10.2
171173
# via layoutparser
172-
pillow==10.0.1
174+
pillow==10.1.0
173175
# via
174176
# layoutparser
175177
# matplotlib
@@ -185,7 +187,7 @@ protobuf==4.24.4
185187
# via
186188
# onnx
187189
# onnxruntime
188-
psutil==5.9.5
190+
psutil==5.9.6
189191
# via -r requirements/base.in
190192
pycocotools==2.0.7
191193
# via effdet
@@ -211,7 +213,7 @@ python-dateutil==2.8.2
211213
# via
212214
# matplotlib
213215
# pandas
214-
python-docx==1.0.0
216+
python-docx==1.0.1
215217
# via unstructured
216218
python-iso639==2023.6.15
217219
# via unstructured
@@ -310,9 +312,9 @@ typing-inspect==0.9.0
310312
# via dataclasses-json
311313
tzdata==2023.3
312314
# via pandas
313-
unstructured[local-inference]==0.10.21
315+
unstructured[local-inference]==0.10.23
314316
# via -r requirements/base.in
315-
unstructured-inference==0.7.2
317+
unstructured-inference==0.7.5
316318
# via unstructured
317319
unstructured-pytesseract==0.3.12
318320
# via unstructured
@@ -322,5 +324,5 @@ uvicorn==0.23.2
322324
# via -r requirements/base.in
323325
xlrd==2.0.1
324326
# via unstructured
325-
xlsxwriter==3.1.7
327+
xlsxwriter==3.1.8
326328
# via python-pptx

requirements/test.txt

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -275,11 +275,11 @@ jupyter-core==5.4.0
275275
# nbconvert
276276
# nbformat
277277
# qtconsole
278-
jupyter-events==0.7.0
278+
jupyter-events==0.8.0
279279
# via jupyter-server
280280
jupyter-lsp==2.2.0
281281
# via jupyterlab
282-
jupyter-server==2.7.3
282+
jupyter-server==2.8.0
283283
# via
284284
# jupyter-lsp
285285
# jupyterlab
@@ -288,7 +288,7 @@ jupyter-server==2.7.3
288288
# notebook-shim
289289
jupyter-server-terminals==0.4.4
290290
# via jupyter-server
291-
jupyterlab==4.0.6
291+
jupyterlab==4.0.7
292292
# via notebook
293293
jupyterlab-pygments==0.2.2
294294
# via nbconvert
@@ -377,17 +377,18 @@ networkx==3.1
377377
# via
378378
# -r requirements/base.txt
379379
# torch
380+
# unstructured
380381
nltk==3.8.1
381382
# via
382383
# -r requirements/base.txt
383384
# unstructured
384-
notebook==7.0.4
385+
notebook==7.0.5
385386
# via jupyter
386387
notebook-shim==0.2.3
387388
# via
388389
# jupyterlab
389390
# notebook
390-
numpy==1.26.0
391+
numpy==1.26.1
391392
# via
392393
# -r requirements/base.txt
393394
# contourpy
@@ -413,7 +414,7 @@ omegaconf==2.3.0
413414
onnx==1.14.1
414415
# via
415416
# -r requirements/base.txt
416-
# unstructured-inference
417+
# unstructured
417418
onnxruntime==1.15.1
418419
# via
419420
# -r requirements/base.txt
@@ -479,7 +480,7 @@ pexpect==4.8.0
479480
# via ipython
480481
pickleshare==0.7.5
481482
# via ipython
482-
pillow==10.0.1
483+
pillow==10.1.0
483484
# via
484485
# -r requirements/base.txt
485486
# layoutparser
@@ -511,7 +512,7 @@ protobuf==4.24.4
511512
# -r requirements/base.txt
512513
# onnx
513514
# onnxruntime
514-
psutil==5.9.5
515+
psutil==5.9.6
515516
# via
516517
# -r requirements/base.txt
517518
# ipykernel
@@ -525,7 +526,7 @@ pycocotools==2.0.7
525526
# via
526527
# -r requirements/base.txt
527528
# effdet
528-
pycodestyle==2.11.0
529+
pycodestyle==2.11.1
529530
# via flake8
530531
pycparser==2.21
531532
# via
@@ -578,7 +579,7 @@ python-dateutil==2.8.2
578579
# jupyter-client
579580
# matplotlib
580581
# pandas
581-
python-docx==1.0.0
582+
python-docx==1.0.1
582583
# via
583584
# -r requirements/base.txt
584585
# unstructured
@@ -658,7 +659,7 @@ rfc3986-validator==0.1.1
658659
# via
659660
# jsonschema
660661
# jupyter-events
661-
rpds-py==0.10.5
662+
rpds-py==0.10.6
662663
# via
663664
# jsonschema
664665
# referencing
@@ -804,9 +805,9 @@ tzdata==2023.3
804805
# via
805806
# -r requirements/base.txt
806807
# pandas
807-
unstructured[local-inference]==0.10.21
808+
unstructured[local-inference]==0.10.23
808809
# via -r requirements/base.txt
809-
unstructured-inference==0.7.2
810+
unstructured-inference==0.7.5
810811
# via
811812
# -r requirements/base.txt
812813
# unstructured
@@ -842,7 +843,7 @@ xlrd==2.0.1
842843
# via
843844
# -r requirements/base.txt
844845
# unstructured
845-
xlsxwriter==3.1.7
846+
xlsxwriter==3.1.8
846847
# via
847848
# -r requirements/base.txt
848849
# python-pptx

0 commit comments

Comments
 (0)