Skip to content

Commit c074729

Browse files
authored
Merge branch 'master' into pprados/02-pymupdf
2 parents feacf69 + 4bc6cb7 commit c074729

File tree

29 files changed

+69
-4514
lines changed

29 files changed

+69
-4514
lines changed

docs/scripts/tool_feat_table.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,6 @@
8686
"link": "/docs/integrations/tools/riza",
8787
"self_hosting": True,
8888
},
89-
"E2B Data Analysis": {
90-
"langauges": "Python. In beta: JavaScript, R, Java",
91-
"sandbox_lifetime": "24 Hours",
92-
"upload": True,
93-
"return_results": "Text, Images, Videos",
94-
"link": "/docs/integrations/tools/e2b_data_analysis",
95-
"self_hosting": True,
96-
},
9789
"Azure Container Apps dynamic sessions": {
9890
"langauges": "Python",
9991
"sandbox_lifetime": "1 Hour",

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,11 @@ def _extract_text_from_page(page: pypdf.PageObject) -> str:
254254
Document(
255255
page_content=_extract_text_from_page(page=page)
256256
+ self._extract_images_from_page(page),
257-
metadata={"source": blob.source, "page": page_number},
257+
metadata={
258+
"source": blob.source,
259+
"page": page_number,
260+
"page_label": pdf_reader.page_labels[page_number],
261+
},
258262
# type: ignore[attr-defined]
259263
)
260264
for page_number, page in enumerate(pdf_reader.pages)
Binary file not shown.

libs/community/tests/unit_tests/document_loaders/test_pdf.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
Path(__file__).parent.parent
1313
/ "document_loaders/sample_documents/layout-parser-paper.pdf"
1414
)
15+
path_to_multi_label_page_numbers_pdf = (
16+
Path(__file__).parent.parent
17+
/ "document_loaders/sample_documents/geotopo-komprimiert.pdf"
18+
)
1519
path_to_layout_pdf_txt = (
1620
Path(__file__).parent.parent.parent
1721
/ "integration_tests/examples/layout-parser-paper-page-1.txt"
@@ -32,6 +36,7 @@ def test_pypdf_loader() -> None:
3236
assert len(docs) == 16
3337
for page, doc in enumerate(docs):
3438
assert doc.metadata["page"] == page
39+
assert doc.metadata["page_label"] == str(page + 1)
3540
assert doc.metadata["source"].endswith("layout-parser-paper.pdf")
3641
assert len(doc.page_content) > 10
3742

@@ -49,6 +54,7 @@ def test_pypdf_loader_with_layout() -> None:
4954
assert len(docs) == 16
5055
for page, doc in enumerate(docs):
5156
assert doc.metadata["page"] == page
57+
assert doc.metadata["page_label"] == str(page + 1)
5258
assert doc.metadata["source"].endswith("layout-parser-paper.pdf")
5359
assert len(doc.page_content) > 10
5460

@@ -60,3 +66,19 @@ def test_pypdf_loader_with_layout() -> None:
6066
cleaned_first_page = re.sub(r"\x00", "", first_page)
6167
cleaned_expected = re.sub(r"\x00", "", expected)
6268
assert cleaned_first_page == cleaned_expected
69+
70+
71+
@pytest.mark.requires("pypdf")
72+
def test_pypdf_loader_with_multi_labled_page_numbers() -> None:
73+
"""Test PyPDFLoader with a PDF that contains multi-labeled page numbers."""
74+
loader = PyPDFLoader(str(path_to_multi_label_page_numbers_pdf))
75+
docs = loader.load()
76+
77+
assert len(docs) == 7
78+
79+
assert docs[0].metadata["page"] == 0
80+
assert docs[0].metadata["page_label"] == "i"
81+
82+
# The Arabic page numbering in this PDF starts on the 4th page, so docs[3] is labeled "1"
83+
assert docs[3].metadata["page"] == 3
84+
assert docs[3].metadata["page_label"] == "1"

libs/packages.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -259,8 +259,8 @@ packages:
259259
downloads: 35495
260260
downloads_updated_at: '2024-12-23T20:10:11.816059+00:00'
261261
- name: langchain-couchbase
262-
path: libs/partners/couchbase
263-
repo: langchain-ai/langchain
262+
path: .
263+
repo: Couchbase-Ecosystem/langchain-couchbase
264264
downloads: 347
265265
downloads_updated_at: '2024-12-23T20:10:11.816059+00:00'
266266
- name: langchain-ollama

libs/partners/couchbase/.gitignore

Lines changed: 0 additions & 3 deletions
This file was deleted.

libs/partners/couchbase/LICENSE

Lines changed: 0 additions & 21 deletions
This file was deleted.

libs/partners/couchbase/Makefile

Lines changed: 0 additions & 64 deletions
This file was deleted.

libs/partners/couchbase/README.md

Lines changed: 2 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,3 @@
1-
# langchain-couchbase
1+
This package has moved!
22

3-
This package contains the LangChain integration with Couchbase
4-
5-
## Installation
6-
7-
```bash
8-
pip install -U langchain-couchbase
9-
```
10-
11-
## Usage
12-
13-
The `CouchbaseVectorStore` class exposes the connection to the Couchbase vector store.
14-
15-
```python
16-
from langchain_couchbase.vectorstores import CouchbaseVectorStore
17-
18-
from couchbase.cluster import Cluster
19-
from couchbase.auth import PasswordAuthenticator
20-
from couchbase.options import ClusterOptions
21-
from datetime import timedelta
22-
23-
auth = PasswordAuthenticator(username, password)
24-
options = ClusterOptions(auth)
25-
connect_string = "couchbases://localhost"
26-
cluster = Cluster(connect_string, options)
27-
28-
# Wait until the cluster is ready for use.
29-
cluster.wait_until_ready(timedelta(seconds=5))
30-
31-
embeddings = OpenAIEmbeddings()
32-
33-
vectorstore = CouchbaseVectorStore(
34-
cluster=cluster,
35-
bucket_name="",
36-
scope_name="",
37-
collection_name="",
38-
embedding=embeddings,
39-
index_name="vector-search-index",
40-
)
41-
42-
```
3+
https://github.com/Couchbase-Ecosystem/langchain-couchbase/tree/main/langchain_couchbase

libs/partners/couchbase/langchain_couchbase/__init__.py

Lines changed: 0 additions & 10 deletions
This file was deleted.

0 commit comments

Comments
 (0)