From 3e945110dbcc59f7e9e372c8e836f5ad6158a92c Mon Sep 17 00:00:00 2001 From: tbs17 Date: Mon, 20 Jan 2025 10:55:05 -0500 Subject: [PATCH 01/18] update unstructured-inference --- CHANGELOG.md | 9 +++++++ requirements/base.txt | 8 +++--- requirements/dev.txt | 4 +-- requirements/extra-epub.txt | 2 +- requirements/extra-odt.txt | 2 +- requirements/extra-paddleocr.txt | 10 ++++---- requirements/extra-pandoc.txt | 2 +- requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 43 +++++++++++--------------------- requirements/huggingface.txt | 4 +-- requirements/test.txt | 15 +++++------ unstructured/__version__.py | 2 +- 12 files changed, 49 insertions(+), 54 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56e7f8ef43..10687f5b48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.16.14-dev0 + +### Enhancements + +### Features + +### Fixes +- **Update `unstructured-inference`** to 0.8.3 in requirements which removed `layoutparser` dependency libs + ## 0.16.13 ### Enhancements diff --git a/requirements/base.txt b/requirements/base.txt index 6fecb30c04..57ca655747 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,7 +4,7 @@ # # pip-compile ./base.in # -anyio==4.7.0 +anyio==4.8.0 # via httpx backoff==2.2.1 # via -r ./base.in @@ -36,7 +36,7 @@ dataclasses-json==0.6.7 # unstructured-client deepdiff==8.1.1 # via unstructured-client -emoji==2.14.0 +emoji==2.14.1 # via -r ./base.in exceptiongroup==1.2.2 # via anyio @@ -64,7 +64,7 @@ langdetect==1.0.9 # via -r ./base.in lxml==5.3.0 # via -r ./base.in -marshmallow==3.23.2 +marshmallow==3.25.1 # via # dataclasses-json # unstructured-client @@ -150,5 +150,5 @@ urllib3==1.26.20 # unstructured-client webencodings==0.5.1 # via html5lib -wrapt==1.17.0 +wrapt==1.17.2 # via -r ./base.in diff --git a/requirements/dev.txt b/requirements/dev.txt index 30e42eb0ac..fbdeb1952c 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -17,7 +17,7 @@ distlib==0.3.9 # via virtualenv filelock==3.16.1 # via virtualenv -identify==2.6.4 +identify==2.6.5 # via pre-commit importlib-metadata==8.5.0 # via @@ -51,7 +51,7 @@ tomli==2.2.1 # -c ./test.txt # build # pip-tools -virtualenv==20.28.1 +virtualenv==20.29.1 # via pre-commit wheel==0.45.1 # via pip-tools diff --git a/requirements/extra-epub.txt b/requirements/extra-epub.txt index a9533059da..460408c418 100644 --- a/requirements/extra-epub.txt +++ b/requirements/extra-epub.txt @@ -4,5 +4,5 @@ # # pip-compile ./extra-epub.in # -pypandoc==1.14 +pypandoc==1.15 # via -r ./extra-epub.in diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index 28ebf301a6..362c53ed74 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -8,7 +8,7 @@ lxml==5.3.0 # via # -c ./base.txt # python-docx -pypandoc==1.14 +pypandoc==1.15 # via -r ./extra-odt.in python-docx==1.1.2 # via -r ./extra-odt.in diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index d7c0fe7226..97e2f10f53 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-paddleocr.in # -anyio==4.7.0 +anyio==4.8.0 # via # -c ./base.txt # httpx @@ -52,13 +52,13 @@ idna==3.10 # anyio # httpx # requests -imageio==2.36.1 +imageio==2.37.0 # via # imgaug # scikit-image imgaug==0.4.0 # via unstructured-paddleocr -importlib-resources==6.5.1 +importlib-resources==6.5.2 # via matplotlib kiwisolver==1.4.7 # via matplotlib @@ -86,9 +86,9 @@ numpy==1.26.4 # shapely # tifffile # unstructured-paddleocr -opencv-contrib-python==4.10.0.84 +opencv-contrib-python==4.11.0.86 # via unstructured-paddleocr -opencv-python==4.10.0.84 +opencv-python==4.11.0.86 # via # imgaug # unstructured-paddleocr diff --git a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt index 4125059733..dd397c3845 100644 --- a/requirements/extra-pandoc.txt +++ b/requirements/extra-pandoc.txt @@ -4,5 +4,5 @@ # # pip-compile ./extra-pandoc.in # -pypandoc==1.14 +pypandoc==1.15 # via -r ./extra-pandoc.in diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index ae3ccdf381..934876925a 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -11,5 +11,5 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.8.1 +unstructured-inference==0.8.3 unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 81b61276ef..81dd2cb259 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -60,14 +60,14 @@ googleapis-common-protos==1.66.0 # via # google-api-core # grpcio-status -grpcio==1.68.1 +grpcio==1.69.0 # via # -c ././deps/constraints.txt # google-api-core # grpcio-status grpcio-status==1.62.3 # via google-api-core -huggingface-hub==0.27.0 +huggingface-hub==0.27.1 # via # timm # tokenizers @@ -79,16 +79,12 @@ idna==3.10 # via # -c ./base.txt # requests -importlib-resources==6.5.1 +importlib-resources==6.5.2 # via matplotlib -iopath==0.1.10 - # via layoutparser jinja2==3.1.5 # via torch kiwisolver==1.4.7 # via matplotlib -layoutparser==0.3.4 - # via unstructured-inference lxml==5.3.0 # via # -c ./base.txt @@ -107,7 +103,6 @@ numpy==1.26.4 # via # -c ./base.txt # contourpy - # layoutparser # matplotlib # onnx # onnxruntime @@ -126,10 +121,8 @@ onnx==1.17.0 # unstructured-inference onnxruntime==1.19.2 # via unstructured-inference -opencv-python==4.10.0.84 - # via - # layoutparser - # unstructured-inference +opencv-python==4.11.0.86 + # via unstructured-inference packaging==24.2 # via # -c ./base.txt @@ -140,24 +133,21 @@ packaging==24.2 # transformers # unstructured-pytesseract pandas==2.2.3 - # via layoutparser + # via unstructured-inference pdf2image==1.17.0 - # via - # -r ./extra-pdf-image.in - # layoutparser + # via -r ./extra-pdf-image.in pdfminer-six==20231228 # via # -r ./extra-pdf-image.in # pdfplumber pdfplumber==0.11.5 - # via layoutparser + # via unstructured-inference pi-heif==0.21.0 # via -r ./extra-pdf-image.in -pikepdf==9.5.0 +pikepdf==9.5.1 # via -r ./extra-pdf-image.in pillow==11.1.0 # via - # layoutparser # matplotlib # pdf2image # pdfplumber @@ -165,8 +155,6 @@ pillow==11.1.0 # pikepdf # torchvision # unstructured-pytesseract -portalocker==3.1.1 - # via iopath proto-plus==1.25.0 # via # google-api-core @@ -213,7 +201,6 @@ pytz==2024.2 pyyaml==6.0.2 # via # huggingface-hub - # layoutparser # omegaconf # timm # transformers @@ -233,12 +220,12 @@ requests==2.32.3 # transformers rsa==4.9 # via google-auth -safetensors==0.5.0 +safetensors==0.5.2 # via # timm # transformers scipy==1.13.1 - # via layoutparser + # via unstructured-inference six==1.17.0 # via # -c ./base.txt @@ -247,7 +234,7 @@ sympy==1.13.1 # via # onnxruntime # torch -timm==1.0.12 +timm==1.0.14 # via # effdet # unstructured-inference @@ -269,7 +256,6 @@ tqdm==4.67.1 # via # -c ./base.txt # huggingface-hub - # iopath # transformers transformers==4.44.2 # via unstructured-inference @@ -277,12 +263,11 @@ typing-extensions==4.12.2 # via # -c ./base.txt # huggingface-hub - # iopath # pypdf # torch tzdata==2024.2 # via pandas -unstructured-inference==0.8.1 +unstructured-inference==0.8.3 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 # via -r ./extra-pdf-image.in @@ -291,7 +276,7 @@ urllib3==1.26.20 # -c ././deps/constraints.txt # -c ./base.txt # requests -wrapt==1.17.0 +wrapt==1.17.2 # via # -c ./base.txt # deprecated diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 7051a2233b..33223c77e9 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -25,7 +25,7 @@ fsspec==2024.12.0 # via # huggingface-hub # torch -huggingface-hub==0.27.0 +huggingface-hub==0.27.1 # via # tokenizers # transformers @@ -74,7 +74,7 @@ requests==2.32.3 # transformers sacremoses==0.1.1 # via -r ./huggingface.in -safetensors==0.5.0 +safetensors==0.5.2 # via transformers sentencepiece==0.2.0 # via -r ./huggingface.in diff --git a/requirements/test.txt b/requirements/test.txt index a7e1d2cfa2..48d96d6c7a 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,7 +6,7 @@ # annotated-types==0.7.0 # via pydantic -anyio==4.7.0 +anyio==4.8.0 # via # -c ./base.txt # httpx @@ -54,7 +54,7 @@ exceptiongroup==1.2.2 # -c ./base.txt # anyio # pytest -faker==33.1.0 +faker==33.3.1 # via jsf flake8==7.1.1 # via @@ -66,7 +66,7 @@ freezegun==1.5.1 # via -r ./test.in genson==1.3.0 # via datamodel-code-generator -grpcio==1.68.1 +grpcio==1.69.0 # via # -c ././deps/constraints.txt # -r ./test.in @@ -164,7 +164,7 @@ pycodestyle==2.12.1 # via # flake8 # flake8-print -pydantic[email]==2.10.4 +pydantic[email]==2.10.5 # via # -r ./test.in # datamodel-code-generator @@ -196,7 +196,7 @@ pyyaml==6.0.2 # via # datamodel-code-generator # vcrpy -referencing==0.35.1 +referencing==0.36.1 # via # jsonschema # jsonschema-specifications @@ -218,7 +218,7 @@ rpds-py==0.22.3 # referencing rstr==3.2.2 # via jsf -ruff==0.8.5 +ruff==0.9.2 # via -r ./test.in semantic-version==2.10.0 # via liccheck @@ -269,6 +269,7 @@ typing-extensions==4.12.2 # mypy # pydantic # pydantic-core + # referencing tzdata==2024.2 # via pandas ujson==5.10.0 @@ -281,7 +282,7 @@ urllib3==1.26.20 # vcrpy vcrpy==7.0.0 # via -r ./test.in -wrapt==1.17.0 +wrapt==1.17.2 # via # -c ./base.txt # smart-open diff --git a/unstructured/__version__.py b/unstructured/__version__.py index b77b6463a4..5e2ca16589 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.13" # pragma: no cover +__version__ = "0.16.14-dev0" # pragma: no cover From 263a37bc7516b9e4b94be1a471f2832829243370 Mon Sep 17 00:00:00 2001 From: tbs17 Date: Mon, 20 Jan 2025 16:27:13 -0500 Subject: [PATCH 02/18] manually update pdfminer to be 20240706 --- requirements/dev.txt | 4 ++-- requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 16 ++++++++++++---- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index fbdeb1952c..3e2e172a14 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -17,7 +17,7 @@ distlib==0.3.9 # via virtualenv filelock==3.16.1 # via virtualenv -identify==2.6.5 +identify==2.6.6 # via pre-commit importlib-metadata==8.5.0 # via @@ -36,7 +36,7 @@ platformdirs==4.3.6 # via # -c ./test.txt # virtualenv -pre-commit==4.0.1 +pre-commit==4.1.0 # via -r ./dev.in pyproject-hooks==1.2.0 # via diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 934876925a..d1990696a8 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -3,7 +3,7 @@ onnx pdf2image -pdfminer.six +pdfminer.six==20240706 pikepdf pi_heif pypdf diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 81dd2cb259..842e493d42 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -16,6 +16,10 @@ cffi==1.17.1 # via # -c ./base.txt # cryptography +chardet==5.2.0 + # via + # -c ./base.txt + # pdfplumber charset-normalizer==3.4.1 # via # -c ./base.txt @@ -136,11 +140,11 @@ pandas==2.2.3 # via unstructured-inference pdf2image==1.17.0 # via -r ./extra-pdf-image.in -pdfminer-six==20231228 +pdfminer-six==20240706 # via # -r ./extra-pdf-image.in # pdfplumber -pdfplumber==0.11.5 +pdfplumber==0.5.3 # via unstructured-inference pi-heif==0.21.0 # via -r ./extra-pdf-image.in @@ -181,14 +185,14 @@ pycparser==2.22 # via # -c ./base.txt # cffi +pycrypto==2.6.1 + # via pdfplumber pyparsing==3.2.1 # via matplotlib pypdf==5.1.0 # via # -c ./base.txt # -r ./extra-pdf-image.in -pypdfium2==4.30.1 - # via pdfplumber python-dateutil==2.9.0.post0 # via # -c ./base.txt @@ -267,6 +271,8 @@ typing-extensions==4.12.2 # torch tzdata==2024.2 # via pandas +unicodecsv==0.14.1 + # via pdfplumber unstructured-inference==0.8.3 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 @@ -276,6 +282,8 @@ urllib3==1.26.20 # -c ././deps/constraints.txt # -c ./base.txt # requests +wand==0.6.13 + # via pdfplumber wrapt==1.17.2 # via # -c ./base.txt From 8e83c66d6ebea623ff851518abc061b2072fa33a Mon Sep 17 00:00:00 2001 From: tbs17 Date: Mon, 20 Jan 2025 16:28:13 -0500 Subject: [PATCH 03/18] update changelog about pdfminer --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f48a55386..3a3f8f7a6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ ### Fixes - **Update `unstructured-inference`** to 0.8.3 in requirements which removed `layoutparser` dependency libs +- **Update `pdfminer-six` to 20240706** ## 0.16.14 From 592c5f34f7f94bffafdfa886ad822b52b2fdf934 Mon Sep 17 00:00:00 2001 From: tbs17 Date: Mon, 20 Jan 2025 16:46:41 -0500 Subject: [PATCH 04/18] add pycryptodome==3.17 --- requirements/extra-pdf-image.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 842e493d42..5f1b8c2823 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -185,7 +185,8 @@ pycparser==2.22 # via # -c ./base.txt # cffi -pycrypto==2.6.1 +pycryptodome==3.17 +# pycrypto==2.6.1 # via pdfplumber pyparsing==3.2.1 # via matplotlib From e050ed1d2b5601362bc0b0faf673645e3e14e08b Mon Sep 17 00:00:00 2001 From: tbs17 Date: Mon, 20 Jan 2025 20:37:09 -0500 Subject: [PATCH 05/18] updated unstructured-inference to 0.8.4 --- requirements/dev.txt | 2 +- requirements/extra-markdown.txt | 2 +- requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 5 ++--- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index 3e2e172a14..78dbc1625f 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -19,7 +19,7 @@ filelock==3.16.1 # via virtualenv identify==2.6.6 # via pre-commit -importlib-metadata==8.5.0 +importlib-metadata==8.6.1 # via # -c ././deps/constraints.txt # build diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 243fd0b0da..9d0a14da55 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-markdown.in # -importlib-metadata==8.5.0 +importlib-metadata==8.6.1 # via # -c ././deps/constraints.txt # markdown diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index d1990696a8..04d3ffd1b6 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -11,5 +11,5 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.8.3 +unstructured-inference==0.8.4 unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 5f1b8c2823..56276e5b3d 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -185,8 +185,7 @@ pycparser==2.22 # via # -c ./base.txt # cffi -pycryptodome==3.17 -# pycrypto==2.6.1 +pycrypto==2.6.1 # via pdfplumber pyparsing==3.2.1 # via matplotlib @@ -274,7 +273,7 @@ tzdata==2024.2 # via pandas unicodecsv==0.14.1 # via pdfplumber -unstructured-inference==0.8.3 +unstructured-inference==0.8.4 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 # via -r ./extra-pdf-image.in From dcc73b393b744e2300db7cc9f097a2c62bc9d7d2 Mon Sep 17 00:00:00 2001 From: tbs17 Date: Mon, 20 Jan 2025 20:43:43 -0500 Subject: [PATCH 06/18] revert back pdfminer version --- requirements/extra-pdf-image.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 04d3ffd1b6..f3600d5d83 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -3,7 +3,7 @@ onnx pdf2image -pdfminer.six==20240706 +pdfminer.six pikepdf pi_heif pypdf From 2ef9c2f815de66f158eb34c473e549e872743ab9 Mon Sep 17 00:00:00 2001 From: tbs17 Date: Mon, 20 Jan 2025 20:54:50 -0500 Subject: [PATCH 07/18] revert back pdfminer version --- requirements/extra-pdf-image.txt | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 56276e5b3d..28363b804f 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -16,10 +16,6 @@ cffi==1.17.1 # via # -c ./base.txt # cryptography -chardet==5.2.0 - # via - # -c ./base.txt - # pdfplumber charset-normalizer==3.4.1 # via # -c ./base.txt @@ -140,11 +136,11 @@ pandas==2.2.3 # via unstructured-inference pdf2image==1.17.0 # via -r ./extra-pdf-image.in -pdfminer-six==20240706 +pdfminer-six==20231228 # via # -r ./extra-pdf-image.in # pdfplumber -pdfplumber==0.5.3 +pdfplumber==0.11.5 # via unstructured-inference pi-heif==0.21.0 # via -r ./extra-pdf-image.in @@ -185,14 +181,14 @@ pycparser==2.22 # via # -c ./base.txt # cffi -pycrypto==2.6.1 - # via pdfplumber pyparsing==3.2.1 # via matplotlib pypdf==5.1.0 # via # -c ./base.txt # -r ./extra-pdf-image.in +pypdfium2==4.30.1 + # via pdfplumber python-dateutil==2.9.0.post0 # via # -c ./base.txt @@ -271,8 +267,6 @@ typing-extensions==4.12.2 # torch tzdata==2024.2 # via pandas -unicodecsv==0.14.1 - # via pdfplumber unstructured-inference==0.8.4 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 @@ -282,8 +276,6 @@ urllib3==1.26.20 # -c ././deps/constraints.txt # -c ./base.txt # requests -wand==0.6.13 - # via pdfplumber wrapt==1.17.2 # via # -c ./base.txt From 7ed9cd503848ef5e301e9ea610f558f907bb9a48 Mon Sep 17 00:00:00 2001 From: tbs17 Date: Tue, 21 Jan 2025 10:15:53 -0500 Subject: [PATCH 08/18] update pdfminer to newer but remove pycrypto --- requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index f3600d5d83..04d3ffd1b6 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -3,7 +3,7 @@ onnx pdf2image -pdfminer.six +pdfminer.six==20240706 pikepdf pi_heif pypdf diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 28363b804f..e77a79b01c 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -16,6 +16,10 @@ cffi==1.17.1 # via # -c ./base.txt # cryptography +chardet==5.2.0 + # via + # -c ./base.txt + # pdfplumber charset-normalizer==3.4.1 # via # -c ./base.txt @@ -136,11 +140,11 @@ pandas==2.2.3 # via unstructured-inference pdf2image==1.17.0 # via -r ./extra-pdf-image.in -pdfminer-six==20231228 +pdfminer-six==20240706 # via # -r ./extra-pdf-image.in # pdfplumber -pdfplumber==0.11.5 +pdfplumber==0.5.3 # via unstructured-inference pi-heif==0.21.0 # via -r ./extra-pdf-image.in @@ -181,14 +185,14 @@ pycparser==2.22 # via # -c ./base.txt # cffi +# pycrypto==2.6.1 +# # via pdfplumber pyparsing==3.2.1 # via matplotlib pypdf==5.1.0 # via # -c ./base.txt # -r ./extra-pdf-image.in -pypdfium2==4.30.1 - # via pdfplumber python-dateutil==2.9.0.post0 # via # -c ./base.txt @@ -267,6 +271,8 @@ typing-extensions==4.12.2 # torch tzdata==2024.2 # via pandas +unicodecsv==0.14.1 + # via pdfplumber unstructured-inference==0.8.4 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 @@ -276,6 +282,8 @@ urllib3==1.26.20 # -c ././deps/constraints.txt # -c ./base.txt # requests +wand==0.6.13 + # via pdfplumber wrapt==1.17.2 # via # -c ./base.txt From 9ea13cc45e81224fc9e5b850310b367fb1244aa9 Mon Sep 17 00:00:00 2001 From: tbs17 Date: Tue, 21 Jan 2025 10:39:17 -0500 Subject: [PATCH 09/18] add pycryptodome --- requirements/extra-pdf-image.in | 1 + requirements/extra-pdf-image.txt | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 04d3ffd1b6..fdc4ca3276 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -4,6 +4,7 @@ onnx pdf2image pdfminer.six==20240706 +pycryptodome pikepdf pi_heif pypdf diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index e77a79b01c..a87266ada3 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -185,8 +185,10 @@ pycparser==2.22 # via # -c ./base.txt # cffi -# pycrypto==2.6.1 -# # via pdfplumber +pycrypto==2.6.1 + # via pdfplumber +pycryptodome==3.21.0 + # via -r ./extra-pdf-image.in pyparsing==3.2.1 # via matplotlib pypdf==5.1.0 From 96ce8b3e8f5d60df2d9668d95f97d522c87776ed Mon Sep 17 00:00:00 2001 From: tbs17 Date: Tue, 21 Jan 2025 18:09:25 -0500 Subject: [PATCH 10/18] update unstructured-inference to 0.8.5 --- requirements/dev.txt | 2 +- requirements/extra-csv.txt | 2 +- requirements/extra-paddleocr.txt | 2 +- requirements/extra-pdf-image.in | 4 ++-- requirements/extra-pdf-image.txt | 27 ++++++++------------------- requirements/extra-xlsx.txt | 2 +- requirements/huggingface.txt | 2 +- requirements/test.txt | 2 +- 8 files changed, 16 insertions(+), 27 deletions(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index 78dbc1625f..a5ebd99214 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -15,7 +15,7 @@ click==8.1.8 # pip-tools distlib==0.3.9 # via virtualenv -filelock==3.16.1 +filelock==3.17.0 # via virtualenv identify==2.6.6 # via pre-commit diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index 496cd42fc1..d4d50645e8 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -20,5 +20,5 @@ six==1.17.0 # via # -c ./base.txt # python-dateutil -tzdata==2024.2 +tzdata==2025.1 # via pandas diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 97e2f10f53..22c2bf16b7 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -32,7 +32,7 @@ exceptiongroup==1.2.2 # via # -c ./base.txt # anyio -fonttools==4.55.3 +fonttools==4.55.4 # via matplotlib h11==0.14.0 # via diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index fdc4ca3276..24ae0b18f2 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -3,7 +3,7 @@ onnx pdf2image -pdfminer.six==20240706 +pdfminer.six pycryptodome pikepdf pi_heif @@ -12,5 +12,5 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.8.4 +unstructured-inference==0.8.5 unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index a87266ada3..8e52bd9ab1 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -6,7 +6,7 @@ # antlr4-python3-runtime==4.9.3 # via omegaconf -cachetools==5.5.0 +cachetools==5.5.1 # via google-auth certifi==2024.12.14 # via @@ -16,10 +16,6 @@ cffi==1.17.1 # via # -c ./base.txt # cryptography -chardet==5.2.0 - # via - # -c ./base.txt - # pdfplumber charset-normalizer==3.4.1 # via # -c ./base.txt @@ -39,14 +35,14 @@ deprecated==1.2.15 # via pikepdf effdet==0.4.1 # via -r ./extra-pdf-image.in -filelock==3.16.1 +filelock==3.17.0 # via # huggingface-hub # torch # transformers flatbuffers==24.12.23 # via onnxruntime -fonttools==4.55.3 +fonttools==4.55.4 # via matplotlib fsspec==2024.12.0 # via @@ -143,9 +139,7 @@ pdf2image==1.17.0 pdfminer-six==20240706 # via # -r ./extra-pdf-image.in - # pdfplumber -pdfplumber==0.5.3 - # via unstructured-inference + # unstructured-inference pi-heif==0.21.0 # via -r ./extra-pdf-image.in pikepdf==9.5.1 @@ -154,7 +148,6 @@ pillow==11.1.0 # via # matplotlib # pdf2image - # pdfplumber # pi-heif # pikepdf # torchvision @@ -185,8 +178,6 @@ pycparser==2.22 # via # -c ./base.txt # cffi -pycrypto==2.6.1 - # via pdfplumber pycryptodome==3.21.0 # via -r ./extra-pdf-image.in pyparsing==3.2.1 @@ -195,6 +186,8 @@ pypdf==5.1.0 # via # -c ./base.txt # -r ./extra-pdf-image.in +pypdfium2==4.30.1 + # via unstructured-inference python-dateutil==2.9.0.post0 # via # -c ./base.txt @@ -271,11 +264,9 @@ typing-extensions==4.12.2 # huggingface-hub # pypdf # torch -tzdata==2024.2 +tzdata==2025.1 # via pandas -unicodecsv==0.14.1 - # via pdfplumber -unstructured-inference==0.8.4 +unstructured-inference==0.8.5 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 # via -r ./extra-pdf-image.in @@ -284,8 +275,6 @@ urllib3==1.26.20 # -c ././deps/constraints.txt # -c ./base.txt # requests -wand==0.6.13 - # via pdfplumber wrapt==1.17.2 # via # -c ./base.txt diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index 7f00c057a2..b0c6cadbf7 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -26,7 +26,7 @@ six==1.17.0 # via # -c ./base.txt # python-dateutil -tzdata==2024.2 +tzdata==2025.1 # via pandas xlrd==2.0.1 # via -r ./extra-xlsx.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 33223c77e9..e614f90a3b 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -16,7 +16,7 @@ click==8.1.8 # via # -c ./base.txt # sacremoses -filelock==3.16.1 +filelock==3.17.0 # via # huggingface-hub # torch diff --git a/requirements/test.txt b/requirements/test.txt index 48d96d6c7a..9a1087290b 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -270,7 +270,7 @@ typing-extensions==4.12.2 # pydantic # pydantic-core # referencing -tzdata==2024.2 +tzdata==2025.1 # via pandas ujson==5.10.0 # via label-studio-sdk From d77a32c1880bd8dc4136ce26a7e59c2ea8eec055 Mon Sep 17 00:00:00 2001 From: tbs17 Date: Wed, 22 Jan 2025 09:36:58 -0500 Subject: [PATCH 11/18] set unstructured-inference>0.8.5 --- requirements/extra-pdf-image.in | 3 +-- requirements/extra-pdf-image.txt | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 24ae0b18f2..8ebe237bb7 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -4,7 +4,6 @@ onnx pdf2image pdfminer.six -pycryptodome pikepdf pi_heif pypdf @@ -12,5 +11,5 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.8.5 +unstructured-inference>=0.8.5 unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 8e52bd9ab1..abef58ee5f 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -40,7 +40,7 @@ filelock==3.17.0 # huggingface-hub # torch # transformers -flatbuffers==24.12.23 +flatbuffers==25.1.21 # via onnxruntime fonttools==4.55.4 # via matplotlib @@ -178,8 +178,6 @@ pycparser==2.22 # via # -c ./base.txt # cffi -pycryptodome==3.21.0 - # via -r ./extra-pdf-image.in pyparsing==3.2.1 # via matplotlib pypdf==5.1.0 From 34696fe2708c3b54d311c273805d35b1f569fbb1 Mon Sep 17 00:00:00 2001 From: tbs17 Date: Wed, 22 Jan 2025 10:37:31 -0500 Subject: [PATCH 12/18] test fix source->sources in test_ocr.py --- test_unstructured/partition/pdf_image/test_ocr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index e9982810a0..4459fdfc58 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -328,7 +328,7 @@ def mock_layout(mock_embedded_text_regions): def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): ocr_elements = [ - LayoutElement(text=r.text, source=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox) + LayoutElement(text=r.text, sources=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox) for r in mock_ocr_regions ] @@ -353,7 +353,7 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): def test_merge_out_layout_with_ocr_layout(mock_out_layout, mock_ocr_regions): ocr_elements = [ - LayoutElement(text=r.text, source=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox) + LayoutElement(text=r.text, sources=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox) for r in mock_ocr_regions ] @@ -473,7 +473,7 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions): # the code should ignore this invalid text and use ocr region's text mock_out_layout[0].text = "(cid:10)(cid:5)?" ocr_elements = [ - LayoutElement(text=r.text, source=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox) + LayoutElement(text=r.text, sources=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox) for r in mock_ocr_regions ] From cab4083a34470ba96373760803117ab16c89f39d Mon Sep 17 00:00:00 2001 From: tbs17 Date: Wed, 22 Jan 2025 11:11:58 -0500 Subject: [PATCH 13/18] change to sources to a diff place --- .../partition/pdf_image/test_ocr.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index 4459fdfc58..afdeff0b63 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -91,9 +91,9 @@ def test_get_ocr_layout_from_image_tesseract(monkeypatch): ocr_layout = ocr_agent.get_layout_from_image(image) expected_layout = [ - TextRegion.from_coords(10, 5, 25, 15, "Hello", source=Source.OCR_TESSERACT), - TextRegion.from_coords(20, 15, 45, 35, "World", source=Source.OCR_TESSERACT), - TextRegion.from_coords(30, 25, 65, 55, "!", source=Source.OCR_TESSERACT), + TextRegion.from_coords(10, 5, 25, 15, "Hello", sources=Source.OCR_TESSERACT), + TextRegion.from_coords(20, 15, 45, 35, "World", sources=Source.OCR_TESSERACT), + TextRegion.from_coords(30, 25, 65, 55, "!", sources=Source.OCR_TESSERACT), ] assert ocr_layout == expected_layout @@ -148,9 +148,9 @@ def test_get_ocr_layout_from_image_paddle(monkeypatch): ocr_layout = OCRAgentPaddle().get_layout_from_image(image) expected_layout = [ - TextRegion.from_coords(10, 5, 25, 15, "Hello", source=Source.OCR_PADDLE), - TextRegion.from_coords(20, 15, 45, 35, "World", source=Source.OCR_PADDLE), - TextRegion.from_coords(30, 25, 65, 55, "!", source=Source.OCR_PADDLE), + TextRegion.from_coords(10, 5, 25, 15, "Hello", sources=Source.OCR_PADDLE), + TextRegion.from_coords(20, 15, 45, 35, "World", sources=Source.OCR_PADDLE), + TextRegion.from_coords(30, 25, 65, 55, "!", sources=Source.OCR_PADDLE), ] assert ocr_layout == expected_layout @@ -273,9 +273,9 @@ def test_get_layout_elements_from_image_google_vision(google_vision_client): @pytest.fixture() def mock_ocr_regions(): return [ - EmbeddedTextRegion.from_coords(10, 10, 90, 90, text="0", source=None), - EmbeddedTextRegion.from_coords(200, 200, 300, 300, text="1", source=None), - EmbeddedTextRegion.from_coords(500, 320, 600, 350, text="3", source=None), + EmbeddedTextRegion.from_coords(10, 10, 90, 90, text="0", sources=None), + EmbeddedTextRegion.from_coords(200, 200, 300, 300, text="1", sources=None), + EmbeddedTextRegion.from_coords(500, 320, 600, 350, text="3", sources=None), ] @@ -328,7 +328,7 @@ def mock_layout(mock_embedded_text_regions): def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): ocr_elements = [ - LayoutElement(text=r.text, sources=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox) + LayoutElement(text=r.text, source=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox) for r in mock_ocr_regions ] @@ -353,7 +353,7 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): def test_merge_out_layout_with_ocr_layout(mock_out_layout, mock_ocr_regions): ocr_elements = [ - LayoutElement(text=r.text, sources=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox) + LayoutElement(text=r.text, source=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox) for r in mock_ocr_regions ] @@ -473,7 +473,7 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions): # the code should ignore this invalid text and use ocr region's text mock_out_layout[0].text = "(cid:10)(cid:5)?" ocr_elements = [ - LayoutElement(text=r.text, sources=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox) + LayoutElement(text=r.text, source=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox) for r in mock_ocr_regions ] From e6c71dc61fe9dde09796aa688ffc675c4415d5be Mon Sep 17 00:00:00 2001 From: tbs17 Date: Wed, 22 Jan 2025 14:09:46 -0500 Subject: [PATCH 14/18] update unstructured-inference to 0.8.6 --- requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 2 +- requirements/test.txt | 2 +- .../partition/pdf_image/test_ocr.py | 18 +++++++++--------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 8ebe237bb7..99df481053 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -11,5 +11,5 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference>=0.8.5 +unstructured-inference>=0.8.6 unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index abef58ee5f..4fde223e02 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -264,7 +264,7 @@ typing-extensions==4.12.2 # torch tzdata==2025.1 # via pandas -unstructured-inference==0.8.5 +unstructured-inference==0.8.6 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 # via -r ./extra-pdf-image.in diff --git a/requirements/test.txt b/requirements/test.txt index 9a1087290b..87b9d7cc52 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -54,7 +54,7 @@ exceptiongroup==1.2.2 # -c ./base.txt # anyio # pytest -faker==33.3.1 +faker==34.0.0 # via jsf flake8==7.1.1 # via diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index afdeff0b63..e9982810a0 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -91,9 +91,9 @@ def test_get_ocr_layout_from_image_tesseract(monkeypatch): ocr_layout = ocr_agent.get_layout_from_image(image) expected_layout = [ - TextRegion.from_coords(10, 5, 25, 15, "Hello", sources=Source.OCR_TESSERACT), - TextRegion.from_coords(20, 15, 45, 35, "World", sources=Source.OCR_TESSERACT), - TextRegion.from_coords(30, 25, 65, 55, "!", sources=Source.OCR_TESSERACT), + TextRegion.from_coords(10, 5, 25, 15, "Hello", source=Source.OCR_TESSERACT), + TextRegion.from_coords(20, 15, 45, 35, "World", source=Source.OCR_TESSERACT), + TextRegion.from_coords(30, 25, 65, 55, "!", source=Source.OCR_TESSERACT), ] assert ocr_layout == expected_layout @@ -148,9 +148,9 @@ def test_get_ocr_layout_from_image_paddle(monkeypatch): ocr_layout = OCRAgentPaddle().get_layout_from_image(image) expected_layout = [ - TextRegion.from_coords(10, 5, 25, 15, "Hello", sources=Source.OCR_PADDLE), - TextRegion.from_coords(20, 15, 45, 35, "World", sources=Source.OCR_PADDLE), - TextRegion.from_coords(30, 25, 65, 55, "!", sources=Source.OCR_PADDLE), + TextRegion.from_coords(10, 5, 25, 15, "Hello", source=Source.OCR_PADDLE), + TextRegion.from_coords(20, 15, 45, 35, "World", source=Source.OCR_PADDLE), + TextRegion.from_coords(30, 25, 65, 55, "!", source=Source.OCR_PADDLE), ] assert ocr_layout == expected_layout @@ -273,9 +273,9 @@ def test_get_layout_elements_from_image_google_vision(google_vision_client): @pytest.fixture() def mock_ocr_regions(): return [ - EmbeddedTextRegion.from_coords(10, 10, 90, 90, text="0", sources=None), - EmbeddedTextRegion.from_coords(200, 200, 300, 300, text="1", sources=None), - EmbeddedTextRegion.from_coords(500, 320, 600, 350, text="3", sources=None), + EmbeddedTextRegion.from_coords(10, 10, 90, 90, text="0", source=None), + EmbeddedTextRegion.from_coords(200, 200, 300, 300, text="1", source=None), + EmbeddedTextRegion.from_coords(500, 320, 600, 350, text="3", source=None), ] From 057a7fd55a67331f0e95d6f1d930c58e8089fe76 Mon Sep 17 00:00:00 2001 From: tbs17 Date: Wed, 22 Jan 2025 14:41:42 -0500 Subject: [PATCH 15/18] upgrade protobuf version --- requirements/deps/constraints.txt | 4 ++-- requirements/extra-paddleocr.txt | 2 +- requirements/extra-pdf-image.txt | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 5700719383..3881757f4a 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -3,8 +3,8 @@ # extras. Putting a dependency here will only affect dependency sets that contain them -- in other # words, if something does not require a constraint, it will not be installed. #################################################################################################### -# (jennings): Versions greater than 5.0 create dependency conflicts with other packages -protobuf<5.0 +# (jennings): Versions greater than 5.0 is required from unstructured-inference +protobuf>5.0 # TODO: Constriant due to multiple versions being installed during pip-compile grpcio>=1.65.5 # TODO: Pinned in transformers package, remove when that gets updated (https://github.com/huggingface/transformers/blob/main/setup.py) diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 22c2bf16b7..feedb17627 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -113,7 +113,7 @@ pillow==11.1.0 # pdf2image # scikit-image # unstructured-paddleocr -protobuf==4.25.5 +protobuf==5.29.3 # via # -c ././deps/constraints.txt # paddlepaddle diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 4fde223e02..9a4ab63c16 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -65,7 +65,7 @@ grpcio==1.69.0 # -c ././deps/constraints.txt # google-api-core # grpcio-status -grpcio-status==1.62.3 +grpcio-status==1.69.0 # via google-api-core huggingface-hub==0.27.1 # via @@ -156,7 +156,7 @@ proto-plus==1.25.0 # via # google-api-core # google-cloud-vision -protobuf==4.25.5 +protobuf==5.29.3 # via # -c ././deps/constraints.txt # google-api-core From 1c6a69db96f73df9a090649ac9112f615893b874 Mon Sep 17 00:00:00 2001 From: tbs17 Date: Wed, 22 Jan 2025 16:31:15 -0500 Subject: [PATCH 16/18] update weaviate client and remove protobuf in constraints.txt --- requirements/deps/constraints.txt | 3 +-- requirements/extra-paddleocr.txt | 4 +--- requirements/extra-pdf-image.txt | 1 - test_unstructured/staging/test_weaviate.py | 4 ++-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 3881757f4a..e63d5417de 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -3,8 +3,7 @@ # extras. Putting a dependency here will only affect dependency sets that contain them -- in other # words, if something does not require a constraint, it will not be installed. #################################################################################################### -# (jennings): Versions greater than 5.0 is required from unstructured-inference -protobuf>5.0 + # TODO: Constriant due to multiple versions being installed during pip-compile grpcio>=1.65.5 # TODO: Pinned in transformers package, remove when that gets updated (https://github.com/huggingface/transformers/blob/main/setup.py) diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index feedb17627..f4f22d18fd 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -114,9 +114,7 @@ pillow==11.1.0 # scikit-image # unstructured-paddleocr protobuf==5.29.3 - # via - # -c ././deps/constraints.txt - # paddlepaddle + # via paddlepaddle pyclipper==1.3.0.post6 # via unstructured-paddleocr pyparsing==3.2.1 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 9a4ab63c16..59fcfb8326 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -158,7 +158,6 @@ proto-plus==1.25.0 # google-cloud-vision protobuf==5.29.3 # via - # -c ././deps/constraints.txt # google-api-core # google-cloud-vision # googleapis-common-protos diff --git a/test_unstructured/staging/test_weaviate.py b/test_unstructured/staging/test_weaviate.py index abebc0d360..260b142e14 100644 --- a/test_unstructured/staging/test_weaviate.py +++ b/test_unstructured/staging/test_weaviate.py @@ -7,7 +7,7 @@ # NOTE(robinson) - allows tests that do not require the weaviate client to # run for the docker container with contextlib.suppress(ModuleNotFoundError): - from weaviate import Client + from weaviate import WeaviateClient from weaviate.embedded import EmbeddedOptions from unstructured.partition.json import partition_json @@ -59,5 +59,5 @@ def test_stage_for_weaviate(): def test_weaviate_schema_is_valid(): unstructured_class = create_unstructured_weaviate_class() schema = {"classes": [unstructured_class]} - client = Client(embedded_options=EmbeddedOptions()) + client = WeaviateClient(embedded_options=EmbeddedOptions()) client.schema.create(schema) From 9d1514aee4d354a917a9dd3e76c4eebd4756facb Mon Sep 17 00:00:00 2001 From: tbs17 Date: Wed, 22 Jan 2025 17:07:13 -0500 Subject: [PATCH 17/18] pin weaviate to use v3 --- requirements/deps/constraints.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index e63d5417de..296dd366b5 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -3,7 +3,8 @@ # extras. Putting a dependency here will only affect dependency sets that contain them -- in other # words, if something does not require a constraint, it will not be installed. #################################################################################################### - +# we are using v3 client https://weaviate.io/developers/weaviate/client-libraries/python/python_v3 +weaviate-client>=3.26.7,<4.0.0 # TODO: Constriant due to multiple versions being installed during pip-compile grpcio>=1.65.5 # TODO: Pinned in transformers package, remove when that gets updated (https://github.com/huggingface/transformers/blob/main/setup.py) From 2ce003b0152863dba96bed33f698c7c564196acf Mon Sep 17 00:00:00 2001 From: tbs17 Date: Wed, 22 Jan 2025 17:21:05 -0500 Subject: [PATCH 18/18] add back weaviate client --- test_unstructured/staging/test_weaviate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_unstructured/staging/test_weaviate.py b/test_unstructured/staging/test_weaviate.py index 260b142e14..abebc0d360 100644 --- a/test_unstructured/staging/test_weaviate.py +++ b/test_unstructured/staging/test_weaviate.py @@ -7,7 +7,7 @@ # NOTE(robinson) - allows tests that do not require the weaviate client to # run for the docker container with contextlib.suppress(ModuleNotFoundError): - from weaviate import WeaviateClient + from weaviate import Client from weaviate.embedded import EmbeddedOptions from unstructured.partition.json import partition_json @@ -59,5 +59,5 @@ def test_stage_for_weaviate(): def test_weaviate_schema_is_valid(): unstructured_class = create_unstructured_weaviate_class() schema = {"classes": [unstructured_class]} - client = WeaviateClient(embedded_options=EmbeddedOptions()) + client = Client(embedded_options=EmbeddedOptions()) client.schema.create(schema)