Skip to content

Commit b7be3e6

Browse files
committed
Refactor ZeroxPDFLoader
1 parent 33354f9 commit b7be3e6

File tree

11 files changed: +1411 additions, -161 deletions.

docs/docs/integrations/document_loaders/zeroxpdfloader.ipynb

Lines changed: 915 additions & 118 deletions
Large diffs are not rendered by default.

libs/community/extended_testing_deps.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ jq>=1.4.1,<2
4141
jsonschema>1
4242
keybert>=0.8.5
4343
langchain_openai>=0.2.1
44-
litellm>=1.30,<=1.39.5
44+
litellm>=1.30
4545
lxml>=4.9.3,<6.0
4646
markdownify>=0.11.6,<0.12
4747
motor>=3.3.1,<4
@@ -74,6 +74,7 @@ pymupdf>=1.22.3,<2
7474
pypdf>=3.4.0,<5
7575
pypdfium2>=4.10.0,<5
7676
pyspark>=3.4.0,<4
77+
py-zerox>=0.0.7
7778
rank-bm25>=0.2.2,<0.3
7879
rapidfuzz>=3.1.1,<4
7980
rapidocr-onnxruntime>=1.3.2,<2

libs/community/langchain_community/document_loaders/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@
360360
PyPDFium2Loader,
361361
PyPDFLoader,
362362
UnstructuredPDFLoader,
363+
ZeroxPDFLoader,
363364
)
364365
from langchain_community.document_loaders.pebblo import (
365366
PebbloSafeLoader,
@@ -732,6 +733,7 @@
732733
"YoutubeAudioLoader": "langchain_community.document_loaders.blob_loaders",
733734
"YoutubeLoader": "langchain_community.document_loaders.youtube",
734735
"YuqueLoader": "langchain_community.document_loaders.yuque",
736+
"ZeroxPDFLoader": "langchain_community.document_loaders.pdf",
735737
}
736738

737739

@@ -940,4 +942,5 @@ def __getattr__(name: str) -> Any:
940942
"YoutubeAudioLoader",
941943
"YoutubeLoader",
942944
"YuqueLoader",
945+
"ZeroxPDFLoader",
943946
]

libs/community/langchain_community/document_loaders/parsers/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
PyMuPDFParser,
3333
PyPDFium2Parser,
3434
PyPDFParser,
35+
ZeroxPDFParser,
3536
)
3637
from langchain_community.document_loaders.parsers.vsdx import (
3738
VsdxParser,
@@ -55,6 +56,7 @@
5556
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
5657
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
5758
"VsdxParser": "langchain_community.document_loaders.parsers.vsdx",
59+
"ZeroxPDFParser": "langchain_community.document_loaders.parsers.pdf",
5860
}
5961

6062

@@ -82,4 +84,5 @@ def __getattr__(name: str) -> Any:
8284
"RapidOCRBlobParser",
8385
"TesseractBlobParser",
8486
"VsdxParser",
87+
"ZeroxPDFParser",
8588
]

0 commit comments

Comments
 (0)