77
88from abc import ABC , abstractmethod
99from enum import Enum
10+ from pathlib import Path
1011from typing import Any , Dict , Iterable , Iterator , Optional , Union
1112
1213from docling .chunking import BaseChunk , BaseChunker , HybridChunker
14+ from docling .datamodel .base_models import DocumentStream
1315from docling .datamodel .document import DoclingDocument
1416from docling .document_converter import DocumentConverter
1517from langchain_core .document_loaders import BaseLoader
class BaseMetaExtractor(ABC):
    """Abstract interface for building output document metadata.

    Subclasses implement one method per output granularity: one for
    individual chunks and one for whole Docling documents.
    """

    @abstractmethod
    def extract_chunk_meta(
        self, source: Union[Path, str, DocumentStream], chunk: BaseChunk
    ) -> dict[str, Any]:
        """Extract the metadata for a single chunk.

        Args:
            source: Origin of the document the chunk belongs to, given as a
                `Path`, a `str`, or a `DocumentStream`.
            chunk: The chunk to extract metadata for.

        Returns:
            The metadata dict for the chunk.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError()

    @abstractmethod
    def extract_dl_doc_meta(
        self, source: Union[Path, str, DocumentStream], dl_doc: DoclingDocument
    ) -> dict[str, Any]:
        """Extract the metadata for a whole Docling document.

        Args:
            source: Origin of the document, given as a `Path`, a `str`, or a
                `DocumentStream`.
            dl_doc: The Docling document to extract metadata for.

        Returns:
            The metadata dict for the document.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError()
@@ -42,26 +46,36 @@ def extract_dl_doc_meta(
class MetaExtractor(BaseMetaExtractor):
    """Concrete metadata extractor that records where each document came from."""

    def extract_chunk_meta(
        self, source: Union[Path, str, DocumentStream], chunk: BaseChunk
    ) -> dict[str, Any]:
        """Build the metadata dict for a single chunk.

        Args:
            source: Origin of the document the chunk belongs to, given as a
                `Path`, a `str`, or a `DocumentStream`.
            chunk: The chunk whose metadata is exported.

        Returns:
            A dict with two keys: `"source"` holds the stream's `name` when
            `source` is a `DocumentStream` and `str(source)` otherwise;
            `"dl_meta"` holds the result of `chunk.meta.export_json_dict()`.
        """
        if isinstance(source, DocumentStream):
            # A stream has no path of its own; its name identifies it.
            source_label = source.name
        else:
            source_label = str(source)
        return {
            "source": source_label,
            "dl_meta": chunk.meta.export_json_dict(),
        }

    def extract_dl_doc_meta(
        self, source: Union[Path, str, DocumentStream], dl_doc: DoclingDocument
    ) -> dict[str, Any]:
        """Build the metadata dict for a whole Docling document.

        Args:
            source: Origin of the document, given as a `Path`, a `str`, or a
                `DocumentStream`.
            dl_doc: The converted Docling document (not read by this
                implementation).

        Returns:
            A dict with the single key `"source"`, holding the stream's `name`
            when `source` is a `DocumentStream` and `str(source)` otherwise.
        """
        if isinstance(source, DocumentStream):
            # A stream has no path of its own; its name identifies it.
            source_label = source.name
        else:
            source_label = str(source)
        return {"source": source_label}
5769
5870
5971class DoclingLoader (BaseLoader ):
6072 """Docling Loader."""
6173
6274 def __init__ (
6375 self ,
64- file_path : Union [str , Iterable [str ]],
76+ source : Union [
77+ Path , str , DocumentStream , Iterable [Union [Path , str , DocumentStream ]]
78+ ],
6579 * ,
6680 converter : Optional [DocumentConverter ] = None ,
6781 convert_kwargs : Optional [Dict [str , Any ]] = None ,
@@ -73,8 +87,8 @@ def __init__(
7387 """Initialize with a file path.
7488
7589 Args:
76- file_path : File source as single str (URL or local file) or Iterable
77- thereof.
90+ source : File source as single object (URL, local file or `DocumentStream`)
91+ or `Iterable` thereof.
7892 converter: Any specific `DocumentConverter` to use. Defaults to `None` (i.e.
7993 converter defined internally).
8094 convert_kwargs: Any specific kwargs to pass to conversion invocation.
@@ -91,10 +105,11 @@ def __init__(
91105 meta_extractor: The extractor instance to use for populating the output
92106 document metadata; if not set, a system default is used.
93107 """
94- self ._file_paths = (
95- file_path
96- if isinstance (file_path , Iterable ) and not isinstance (file_path , str )
97- else [file_path ]
108+ self ._sources = (
109+ source
110+ if isinstance (source , Iterable )
111+ and not isinstance (source , (str , DocumentStream ))
112+ else [source ]
98113 )
99114
100115 self ._converter : DocumentConverter = converter or DocumentConverter ()
@@ -113,17 +128,17 @@ def lazy_load(
113128 self ,
114129 ) -> Iterator [Document ]:
115130 """Lazy load documents."""
116- for file_path in self ._file_paths :
131+ for source in self ._sources :
117132 conv_res = self ._converter .convert (
118- source = file_path ,
133+ source = source ,
119134 ** self ._convert_kwargs ,
120135 )
121136 dl_doc = conv_res .document
122137 if self ._export_type == ExportType .MARKDOWN :
123138 yield Document (
124139 page_content = dl_doc .export_to_markdown (** self ._md_export_kwargs ),
125140 metadata = self ._meta_extractor .extract_dl_doc_meta (
126- file_path = file_path ,
141+ source = source ,
127142 dl_doc = dl_doc ,
128143 ),
129144 )
@@ -133,7 +148,7 @@ def lazy_load(
133148 yield Document (
134149 page_content = self ._chunker .serialize (chunk = chunk ),
135150 metadata = self ._meta_extractor .extract_chunk_meta (
136- file_path = file_path ,
151+ source = source ,
137152 chunk = chunk ,
138153 ),
139154 )
0 commit comments