Skip to content

Commit 13c1760

Browse files
authored
chore(deps): bump unstructured to 0.10.25 (#291)
Revert the workaround added in #285
1 parent 414ea67 commit 13c1760

File tree

5 files changed

+39
-40
lines changed

5 files changed

+39
-40
lines changed

CHANGELOG.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
## 0.0.54-dev2
1+
## 0.0.54
22

3-
* Bump unstructured to 0.10.24
3+
* Bump unstructured to 0.10.25
44
* Use a generator when splitting pdfs in parallel mode
55
* Add a default memory minimum for 503 check
6-
* Fix a UnboundLocalError when an invalid docx file is caught
6+
* Fix an UnboundLocalError when an invalid docx file is caught
77

88
## 0.0.53
99

prepline_general/api/general.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -403,13 +403,6 @@ def pipeline_api(
403403
)
404404
)
405405

406-
# TODO(austin) - Latest unstructured won't accept model_name=None
407-
# Just pass if it's set until the fix is released
408-
# https://github.com/Unstructured-IO/unstructured/issues/1754
409-
kwargs = {}
410-
if hi_res_model_name:
411-
kwargs["model_name"] = hi_res_model_name
412-
413406
# Be careful of naming differences in api params vs partition params!
414407
# These kwargs are going back into the api, not into partition
415408
# If there's a difference, remap the param in partition_pdf_splits
@@ -444,6 +437,7 @@ def pipeline_api(
444437
# partition_kwargs
445438
encoding=encoding,
446439
include_page_breaks=include_page_breaks,
440+
model_name=hi_res_model_name,
447441
ocr_languages=ocr_languages,
448442
pdf_infer_table_structure=pdf_infer_table_structure,
449443
skip_infer_table_types=skip_infer_table_types,
@@ -454,7 +448,6 @@ def pipeline_api(
454448
multipage_sections=multipage_sections,
455449
combine_under_n_chars=combine_under_n_chars,
456450
new_after_n_chars=new_after_n_chars,
457-
**kwargs,
458451
)
459452
except ValueError as e:
460453
if "Invalid file" in e.args[0]:

requirements/base.txt

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ et-xmlfile==1.1.0
5151
# via openpyxl
5252
exceptiongroup==1.1.3
5353
# via anyio
54-
fastapi==0.103.2
54+
fastapi==0.104.0
5555
# via -r requirements/base.in
5656
filelock==3.12.4
5757
# via
@@ -114,7 +114,7 @@ msg-parser==1.2.0
114114
# via unstructured
115115
mypy-extensions==1.0.0
116116
# via typing-inspect
117-
networkx==3.1
117+
networkx==3.2
118118
# via
119119
# torch
120120
# unstructured
@@ -139,7 +139,9 @@ olefile==0.46
139139
omegaconf==2.3.0
140140
# via effdet
141141
onnx==1.14.1
142-
# via unstructured
142+
# via
143+
# unstructured
144+
# unstructured-inference
143145
onnxruntime==1.15.1
144146
# via unstructured-inference
145147
opencv-python==4.8.1.78
@@ -205,7 +207,7 @@ pyparsing==3.1.1
205207
# via matplotlib
206208
pypdf==3.16.4
207209
# via -r requirements/base.in
208-
pypdfium2==4.21.0
210+
pypdfium2==4.22.0
209211
# via pdfplumber
210212
pytesseract==0.3.10
211213
# via layoutparser
@@ -273,7 +275,7 @@ sympy==1.12
273275
# torch
274276
tabulate==0.9.0
275277
# via unstructured
276-
timm==0.9.7
278+
timm==0.9.8
277279
# via effdet
278280
tokenizers==0.14.1
279281
# via transformers
@@ -294,7 +296,7 @@ tqdm==4.66.1
294296
# iopath
295297
# nltk
296298
# transformers
297-
transformers==4.34.0
299+
transformers==4.34.1
298300
# via unstructured-inference
299301
typing-extensions==4.8.0
300302
# via
@@ -312,17 +314,17 @@ typing-inspect==0.9.0
312314
# via dataclasses-json
313315
tzdata==2023.3
314316
# via pandas
315-
unstructured[local-inference]==0.10.24
317+
unstructured[local-inference]==0.10.25
316318
# via -r requirements/base.in
317-
unstructured-inference==0.7.7
319+
unstructured-inference==0.7.9
318320
# via unstructured
319321
unstructured-pytesseract==0.3.12
320322
# via unstructured
321-
urllib3==2.0.6
323+
urllib3==2.0.7
322324
# via requests
323325
uvicorn==0.23.2
324326
# via -r requirements/base.in
325327
xlrd==2.0.1
326328
# via unstructured
327-
xlsxwriter==3.1.8
329+
xlsxwriter==3.1.9
328330
# via python-pptx

requirements/test.txt

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ beautifulsoup4==4.12.2
5050
# -r requirements/base.txt
5151
# nbconvert
5252
# unstructured
53-
black==23.9.1
53+
black==23.10.0
5454
# via -r requirements/test.in
5555
bleach==6.1.0
5656
# via nbconvert
@@ -86,9 +86,7 @@ coloredlogs==15.0.1
8686
# -r requirements/base.txt
8787
# onnxruntime
8888
comm==0.1.4
89-
# via
90-
# ipykernel
91-
# ipywidgets
89+
# via ipykernel
9290
contourpy==1.1.1
9391
# via
9492
# -r requirements/base.txt
@@ -139,7 +137,7 @@ execnb==0.1.5
139137
# via nbdev
140138
executing==2.0.0
141139
# via stack-data
142-
fastapi==0.103.2
140+
fastapi==0.104.0
143141
# via -r requirements/base.txt
144142
fastcore==1.5.29
145143
# via
@@ -212,6 +210,7 @@ iopath==0.1.10
212210
# layoutparser
213211
ipykernel==6.25.2
214212
# via
213+
# ipywidgets
215214
# jupyter
216215
# jupyter-console
217216
# jupyterlab
@@ -224,8 +223,10 @@ ipython==8.16.1
224223
# jupyter-console
225224
ipython-genutils==0.2.0
226225
# via qtconsole
227-
ipywidgets==8.1.1
228-
# via jupyter
226+
ipywidgets==8.0.4
227+
# via
228+
# jupyter
229+
# nbdev
229230
isoduration==20.11.0
230231
# via jsonschema
231232
jedi==0.19.1
@@ -350,7 +351,7 @@ msg-parser==1.2.0
350351
# via
351352
# -r requirements/base.txt
352353
# unstructured
353-
mypy==1.6.0
354+
mypy==1.6.1
354355
# via -r requirements/test.in
355356
mypy-extensions==1.0.0
356357
# via
@@ -364,7 +365,7 @@ nbconvert==7.9.2
364365
# via
365366
# jupyter
366367
# jupyter-server
367-
nbdev==2.3.12
368+
nbdev==2.3.13
368369
# via -r requirements/test.in
369370
nbformat==5.9.2
370371
# via
@@ -373,7 +374,7 @@ nbformat==5.9.2
373374
# nbconvert
374375
nest-asyncio==1.5.8
375376
# via ipykernel
376-
networkx==3.1
377+
networkx==3.2
377378
# via
378379
# -r requirements/base.txt
379380
# torch
@@ -415,6 +416,7 @@ onnx==1.14.1
415416
# via
416417
# -r requirements/base.txt
417418
# unstructured
419+
# unstructured-inference
418420
onnxruntime==1.15.1
419421
# via
420422
# -r requirements/base.txt
@@ -556,7 +558,7 @@ pyparsing==3.1.1
556558
# matplotlib
557559
pypdf==3.16.4
558560
# via -r requirements/base.txt
559-
pypdfium2==4.21.0
561+
pypdfium2==4.22.0
560562
# via
561563
# -r requirements/base.txt
562564
# pdfplumber
@@ -570,7 +572,7 @@ pytest==7.4.2
570572
# pytest-mock
571573
pytest-cov==4.1.0
572574
# via -r requirements/test.in
573-
pytest-mock==3.11.1
575+
pytest-mock==3.12.0
574576
# via -r requirements/test.in
575577
python-dateutil==2.8.2
576578
# via
@@ -714,7 +716,7 @@ terminado==0.17.1
714716
# via
715717
# jupyter-server
716718
# jupyter-server-terminals
717-
timm==0.9.7
719+
timm==0.9.8
718720
# via
719721
# -r requirements/base.txt
720722
# effdet
@@ -776,7 +778,7 @@ traitlets==5.11.2
776778
# nbconvert
777779
# nbformat
778780
# qtconsole
779-
transformers==4.34.0
781+
transformers==4.34.1
780782
# via
781783
# -r requirements/base.txt
782784
# unstructured-inference
@@ -805,9 +807,9 @@ tzdata==2023.3
805807
# via
806808
# -r requirements/base.txt
807809
# pandas
808-
unstructured[local-inference]==0.10.24
810+
unstructured[local-inference]==0.10.25
809811
# via -r requirements/base.txt
810-
unstructured-inference==0.7.7
812+
unstructured-inference==0.7.9
811813
# via
812814
# -r requirements/base.txt
813815
# unstructured
@@ -817,7 +819,7 @@ unstructured-pytesseract==0.3.12
817819
# unstructured
818820
uri-template==1.3.0
819821
# via jsonschema
820-
urllib3==2.0.6
822+
urllib3==2.0.7
821823
# via
822824
# -r requirements/base.txt
823825
# requests
@@ -843,7 +845,7 @@ xlrd==2.0.1
843845
# via
844846
# -r requirements/base.txt
845847
# unstructured
846-
xlsxwriter==3.1.8
848+
xlsxwriter==3.1.9
847849
# via
848850
# -r requirements/base.txt
849851
# python-pptx

scripts/smoketest.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,4 +163,6 @@ def test_table_support(strategy, pdf_infer_table_structure, expected_table_num):
163163
assert len(extracted_tables) == expected_table_num
164164
if expected_table_num > 0:
165165
# Test that text from a table is extracted
166-
assert "Layouts of scanned modern magazines and scientific reports" in extracted_tables[0]
166+
# Note(austin) - table output has changed - this line isn't returned
167+
# assert "Layouts of scanned modern magazines and scientific reports" in extracted_tables[0]
168+
assert "Layouts of history" in extracted_tables[0]

0 commit comments

Comments
 (0)