Skip to content

Commit 754eddb

Browse files
authored
feat: add support for Path & DocumentStream
Signed-off-by: GitHub <[email protected]>
1 parent aa50365 commit 754eddb

File tree

2 files changed

+34
-19
lines changed

2 files changed

+34
-19
lines changed

langchain_docling/loader.py

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@
77

88
from abc import ABC, abstractmethod
99
from enum import Enum
10+
from pathlib import Path
1011
from typing import Any, Dict, Iterable, Iterator, Optional, Union
1112

1213
from docling.chunking import BaseChunk, BaseChunker, HybridChunker
14+
from docling.datamodel.base_models import DocumentStream
1315
from docling.datamodel.document import DoclingDocument
1416
from docling.document_converter import DocumentConverter
1517
from langchain_core.document_loaders import BaseLoader
@@ -27,13 +29,15 @@ class BaseMetaExtractor(ABC):
2729
"""BaseMetaExtractor."""
2830

2931
    @abstractmethod
    def extract_chunk_meta(
        self, source: Union[Path, str, DocumentStream], chunk: BaseChunk
    ) -> dict[str, Any]:
        """Extract metadata for a single chunk.

        Args:
            source: The originating document source — a local path, a URL
                string, or an in-memory `DocumentStream`.
            chunk: The Docling chunk to extract metadata for.

        Returns:
            Metadata dict to attach to the emitted LangChain document.

        Raises:
            NotImplementedError: Always; concrete subclasses must override.
        """
        raise NotImplementedError()
3337

3438
    @abstractmethod
    def extract_dl_doc_meta(
        self, source: Union[Path, str, DocumentStream], dl_doc: DoclingDocument
    ) -> dict[str, Any]:
        """Extract metadata for a whole Docling document.

        Args:
            source: The originating document source — a local path, a URL
                string, or an in-memory `DocumentStream`.
            dl_doc: The converted `DoclingDocument` to extract metadata for.

        Returns:
            Metadata dict to attach to the emitted LangChain document.

        Raises:
            NotImplementedError: Always; concrete subclasses must override.
        """
        raise NotImplementedError()
@@ -42,26 +46,36 @@ def extract_dl_doc_meta(
4246
class MetaExtractor(BaseMetaExtractor):
    """Default metadata extractor.

    Emits a ``"source"`` entry identifying the originating document and, for
    chunk-level extraction, the chunk's own Docling metadata under
    ``"dl_meta"``.
    """

    @staticmethod
    def _source_str(source: Union[Path, str, DocumentStream]) -> str:
        """Return a string identifier for *source*.

        A `DocumentStream` has no filesystem path, so its ``name`` attribute
        is used; `Path` objects and URL/path strings are stringified.
        """
        # Single point of truth for source naming, shared by both extractors
        # so chunk-level and doc-level metadata can never disagree.
        return source.name if isinstance(source, DocumentStream) else str(source)

    def extract_chunk_meta(
        self, source: Union[Path, str, DocumentStream], chunk: BaseChunk
    ) -> dict[str, Any]:
        """Extract chunk meta.

        Args:
            source: Document source (path, URL string, or `DocumentStream`).
            chunk: The chunk whose metadata to export.

        Returns:
            Dict with the source identifier and the chunk's serialized
            Docling metadata.
        """
        return {
            "source": self._source_str(source),
            "dl_meta": chunk.meta.export_json_dict(),
        }

    def extract_dl_doc_meta(
        self, source: Union[Path, str, DocumentStream], dl_doc: DoclingDocument
    ) -> dict[str, Any]:
        """Extract Docling document meta.

        Args:
            source: Document source (path, URL string, or `DocumentStream`).
            dl_doc: The converted document (unused by this default extractor).

        Returns:
            Dict with the source identifier only.
        """
        return {"source": self._source_str(source)}
5769

5870

5971
class DoclingLoader(BaseLoader):
6072
"""Docling Loader."""
6173

6274
def __init__(
6375
self,
64-
file_path: Union[str, Iterable[str]],
76+
source: Union[
77+
Path, str, DocumentStream, Iterable[Union[Path, str, DocumentStream]]
78+
],
6579
*,
6680
converter: Optional[DocumentConverter] = None,
6781
convert_kwargs: Optional[Dict[str, Any]] = None,
@@ -73,8 +87,8 @@ def __init__(
7387
"""Initialize with a file path.
7488
7589
Args:
76-
file_path: File source as single str (URL or local file) or Iterable
77-
thereof.
90+
source: File source as single object (URL, local file or `DocumentStream`)
91+
or `Iterable` thereof.
7892
converter: Any specific `DocumentConverter` to use. Defaults to `None` (i.e.
7993
converter defined internally).
8094
convert_kwargs: Any specific kwargs to pass to conversion invocation.
@@ -91,10 +105,11 @@ def __init__(
91105
meta_extractor: The extractor instance to use for populating the output
92106
document metadata; if not set, a system default is used.
93107
"""
94-
self._file_paths = (
95-
file_path
96-
if isinstance(file_path, Iterable) and not isinstance(file_path, str)
97-
else [file_path]
108+
self._sources = (
109+
source
110+
if isinstance(source, Iterable)
111+
and not isinstance(source, (str, DocumentStream))
112+
else [source]
98113
)
99114

100115
self._converter: DocumentConverter = converter or DocumentConverter()
@@ -113,17 +128,17 @@ def lazy_load(
113128
self,
114129
) -> Iterator[Document]:
115130
"""Lazy load documents."""
116-
for file_path in self._file_paths:
131+
for source in self._sources:
117132
conv_res = self._converter.convert(
118-
source=file_path,
133+
source=source,
119134
**self._convert_kwargs,
120135
)
121136
dl_doc = conv_res.document
122137
if self._export_type == ExportType.MARKDOWN:
123138
yield Document(
124139
page_content=dl_doc.export_to_markdown(**self._md_export_kwargs),
125140
metadata=self._meta_extractor.extract_dl_doc_meta(
126-
file_path=file_path,
141+
source=source,
127142
dl_doc=dl_doc,
128143
),
129144
)
@@ -133,7 +148,7 @@ def lazy_load(
133148
yield Document(
134149
page_content=self._chunker.serialize(chunk=chunk),
135150
metadata=self._meta_extractor.extract_chunk_meta(
136-
file_path=file_path,
151+
source=source,
137152
chunk=chunk,
138153
),
139154
)

test/test_loader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def test_load_as_markdown(monkeypatch: pytest.MonkeyPatch) -> None:
144144
)
145145

146146
loader = DoclingLoader(
147-
file_path="https://example.com/foo.pdf",
147+
source="https://example.com/foo.pdf",
148148
export_type=ExportType.MARKDOWN,
149149
)
150150
lc_doc_iter = loader.lazy_load()
@@ -172,7 +172,7 @@ def test_load_as_doc_chunks(monkeypatch: pytest.MonkeyPatch) -> None:
172172
)
173173

174174
loader = DoclingLoader(
175-
file_path="https://example.com/foo.pdf",
175+
source="https://example.com/foo.pdf",
176176
export_type=ExportType.DOC_CHUNKS,
177177
chunker=HierarchicalChunker(),
178178
)

0 commit comments

Comments
 (0)