Skip to content

Commit eff84af

Browse files
authored
chore: update python-docx version dependency (#2952)
**Summary** `unstructured` will use table features added in the most recent version of `python-docx`. Also update the `lxml` version constraint because `lxml>4.9.2` will not install on Apple Silicon (#1707). `python-docx` requires `lxml`, although other file formats depend on it as well.
1 parent 542d442 commit eff84af

34 files changed

+121
-105
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
## 0.13.7-dev0
1+
## 0.13.7-dev1
22

33
### Enhancements
4+
45
* **Remove `page_number` metadata fields** for HTML partition until we have a better strategy to decide page counting.
56

67
### Features

docs/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ mdit-py-plugins==0.4.0
5252
# via myst-parser
5353
mdurl==0.1.2
5454
# via markdown-it-py
55-
myst-parser==3.0.0
55+
myst-parser==3.0.1
5656
# via -r ./build.in
5757
packaging==23.2
5858
# via

requirements/base.txt

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ charset-normalizer==3.3.2
2121
# unstructured-client
2222
click==8.1.7
2323
# via nltk
24-
dataclasses-json==0.6.4
24+
dataclasses-json==0.6.5
2525
# via -r ./base.in
2626
dataclasses-json-speakeasy==0.5.11
2727
# via unstructured-client
@@ -39,10 +39,8 @@ jsonpath-python==1.0.6
3939
# via unstructured-client
4040
langdetect==1.0.9
4141
# via -r ./base.in
42-
lxml==4.9.4
43-
# via
44-
# -c ././deps/constraints.txt
45-
# -r ./base.in
42+
lxml==5.2.1
43+
# via -r ./base.in
4644
marshmallow==3.21.1
4745
# via
4846
# dataclasses-json
@@ -63,13 +61,13 @@ packaging==23.2
6361
# unstructured-client
6462
python-dateutil==2.9.0.post0
6563
# via unstructured-client
66-
python-iso639==2024.2.7
64+
python-iso639==2024.4.27
6765
# via -r ./base.in
6866
python-magic==0.4.27
6967
# via -r ./base.in
7068
rapidfuzz==3.8.1
7169
# via -r ./base.in
72-
regex==2024.4.16
70+
regex==2024.4.28
7371
# via nltk
7472
requests==2.31.0
7573
# via

requirements/build.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ mdit-py-plugins==0.4.0
5252
# via myst-parser
5353
mdurl==0.1.2
5454
# via markdown-it-py
55-
myst-parser==3.0.0
55+
myst-parser==3.0.1
5656
# via -r ./build.in
5757
packaging==23.2
5858
# via

requirements/deps/constraints.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ opencv-python==4.8.0.76
3838
opencv-contrib-python==4.8.0.76
3939
platformdirs==3.10.0
4040

41+
# Note(scanny): partition_docx() uses table features added in python-docx v1.1.2. Added here since
42+
# multiple formats have a python-docx dependency (docx, odt)
43+
python-docx>=1.1.2
44+
4145
# TODO: Constraint due to langchain, remove when that gets updated:
4246
packaging<24.0
4347

@@ -48,8 +52,5 @@ urllib3<1.27
4852
# TODO: Constraint due to aiobotocore, remove when that gets updated:
4953
botocore<1.34.52
5054

51-
# TODO: constraint due to current release of pikepdf (v8.14.0), remove once next version releases since fix is on main
52-
lxml<5
53-
5455
# NOTE(jennings): pinned due to later versions not supporting api_key_auth in UnstructuredClient
5556
unstructured-client<=0.18.0

requirements/dev.txt

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ executing==2.0.1
8181
# via stack-data
8282
fastjsonschema==2.19.1
8383
# via nbformat
84-
filelock==3.13.4
84+
filelock==3.14.0
8585
# via virtualenv
8686
fqdn==1.5.1
8787
# via jsonschema
@@ -138,7 +138,7 @@ json5==0.9.25
138138
# via jupyterlab-server
139139
jsonpointer==2.4
140140
# via jsonschema
141-
jsonschema[format-nongpl]==4.21.1
141+
jsonschema[format-nongpl]==4.22.0
142142
# via
143143
# jupyter-events
144144
# jupyterlab-server
@@ -203,7 +203,7 @@ mistune==3.0.2
203203
# via nbconvert
204204
nbclient==0.10.0
205205
# via nbconvert
206-
nbconvert==7.16.3
206+
nbconvert==7.16.4
207207
# via
208208
# jupyter
209209
# jupyter-server
@@ -277,7 +277,7 @@ pygments==2.17.2
277277
# jupyter-console
278278
# nbconvert
279279
# qtconsole
280-
pyproject-hooks==1.0.0
280+
pyproject-hooks==1.1.0
281281
# via
282282
# build
283283
# pip-tools
@@ -294,7 +294,7 @@ pyyaml==6.0.1
294294
# -c ./test.txt
295295
# jupyter-events
296296
# pre-commit
297-
pyzmq==26.0.2
297+
pyzmq==26.0.3
298298
# via
299299
# ipykernel
300300
# jupyter-client
@@ -305,7 +305,7 @@ qtconsole==5.5.1
305305
# via jupyter
306306
qtpy==2.4.1
307307
# via qtconsole
308-
referencing==0.35.0
308+
referencing==0.35.1
309309
# via
310310
# jsonschema
311311
# jsonschema-specifications
@@ -359,7 +359,6 @@ tomli==2.0.1
359359
# build
360360
# jupyterlab
361361
# pip-tools
362-
# pyproject-hooks
363362
tornado==6.4
364363
# via
365364
# ipykernel
@@ -401,7 +400,7 @@ urllib3==1.26.18
401400
# -c ./base.txt
402401
# -c ./test.txt
403402
# requests
404-
virtualenv==20.26.0
403+
virtualenv==20.26.1
405404
# via pre-commit
406405
wcwidth==0.2.13
407406
# via prompt-toolkit

requirements/extra-docx.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,14 @@
44
#
55
# pip-compile ./extra-docx.in
66
#
7-
lxml==4.9.4
7+
lxml==5.2.1
88
# via
9-
# -c ././deps/constraints.txt
109
# -c ./base.txt
1110
# python-docx
12-
python-docx==1.1.0
13-
# via -r ./extra-docx.in
11+
python-docx==1.1.2
12+
# via
13+
# -c ././deps/constraints.txt
14+
# -r ./extra-docx.in
1415
typing-extensions==4.11.0
1516
# via
1617
# -c ./base.txt

requirements/extra-odt.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,16 @@
44
#
55
# pip-compile ./extra-odt.in
66
#
7-
lxml==4.9.4
7+
lxml==5.2.1
88
# via
9-
# -c ././deps/constraints.txt
109
# -c ./base.txt
1110
# python-docx
1211
pypandoc==1.13
1312
# via -r ./extra-odt.in
14-
python-docx==1.1.0
15-
# via -r ./extra-odt.in
13+
python-docx==1.1.2
14+
# via
15+
# -c ././deps/constraints.txt
16+
# -r ./extra-odt.in
1617
typing-extensions==4.11.0
1718
# via
1819
# -c ./base.txt

requirements/extra-paddleocr.txt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ babel==2.14.0
1010
# via flask-babel
1111
bce-python-sdk==0.9.7
1212
# via visualdl
13-
blinker==1.8.0
13+
blinker==1.8.1
1414
# via flask
1515
cachetools==5.3.3
1616
# via premailer
@@ -77,9 +77,8 @@ lazy-loader==0.4
7777
# via scikit-image
7878
lmdb==1.4.1
7979
# via unstructured-paddleocr
80-
lxml==4.9.4
80+
lxml==5.2.1
8181
# via
82-
# -c ././deps/constraints.txt
8382
# -c ./base.txt
8483
# premailer
8584
# unstructured-paddleocr

requirements/extra-pdf-image.txt

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ deprecated==1.2.14
3232
# via pikepdf
3333
effdet==0.4.1
3434
# via layoutparser
35-
filelock==3.13.4
35+
filelock==3.14.0
3636
# via
3737
# huggingface-hub
3838
# torch
@@ -45,8 +45,10 @@ fsspec==2024.3.1
4545
# via
4646
# huggingface-hub
4747
# torch
48-
google-api-core[grpc]==2.18.0
49-
# via google-cloud-vision
48+
google-api-core[grpc]==2.19.0
49+
# via
50+
# google-api-core
51+
# google-cloud-vision
5052
google-auth==2.29.0
5153
# via
5254
# google-api-core
@@ -57,7 +59,7 @@ googleapis-common-protos==1.63.0
5759
# via
5860
# google-api-core
5961
# grpcio-status
60-
grpcio==1.62.2
62+
grpcio==1.63.0
6163
# via
6264
# google-api-core
6365
# grpcio-status
@@ -85,9 +87,8 @@ kiwisolver==1.4.5
8587
# via matplotlib
8688
layoutparser[layoutmodels,tesseract]==0.3.4
8789
# via unstructured-inference
88-
lxml==4.9.4
90+
lxml==5.2.1
8991
# via
90-
# -c ././deps/constraints.txt
9192
# -c ./base.txt
9293
# pikepdf
9394
markupsafe==2.1.5
@@ -223,7 +224,7 @@ rapidfuzz==3.8.1
223224
# via
224225
# -c ./base.txt
225226
# unstructured-inference
226-
regex==2024.4.16
227+
regex==2024.4.28
227228
# via
228229
# -c ./base.txt
229230
# transformers

0 commit comments

Comments
 (0)