11import asyncio
22import logging
3- from typing import Dict , List , Optional , Sequence
3+ from typing import AsyncIterator , Dict , Iterator , List , Optional , Sequence
44
55from langchain_core .documents import Document
66
@@ -85,6 +85,47 @@ def load(self) -> List[Document]:
8585 """
8686 return asyncio .run (self .aload ())
8787
88+ def lazy_load (self ) -> Iterator [Document ]:
89+ """A lazy loader for MongoDB documents.
90+
91+ Attention:
92+
93+ This implementation starts an asyncio event loop which
94+ will only work if running in a sync env. In an async env, it should
95+ fail since there is already an event loop running.
96+
97+ This code should be updated to kick off the event loop from a separate
98+ thread if running within an async context.
99+
100+ Yields:
101+ Document: A document from the MongoDB collection.
102+ """
103+ try :
104+ event_loop = asyncio .get_running_loop ()
105+ except RuntimeError :
106+ event_loop = asyncio .new_event_loop ()
107+ asyncio .set_event_loop (event_loop )
108+
109+ async_generator = self .alazy_load ()
110+
111+ while True :
112+ try :
113+ document = event_loop .run_until_complete (async_generator .__anext__ ())
114+ yield document
115+ except StopAsyncIteration :
116+ break
117+
118+ async def alazy_load (self ) -> AsyncIterator [Document ]:
119+ """Asynchronously yields Document objects one at a time.
120+
121+ Yields:
122+ Document: A document from the MongoDB collection.
123+ """
124+ projection = self ._construct_projection ()
125+
126+ async for doc in self .collection .find (self .filter_criteria , projection ):
127+ yield self ._process_document (doc )
128+
88129 async def aload (self ) -> List [Document ]:
89130 """Asynchronously loads data into Document objects."""
90131 result = []
@@ -93,26 +134,7 @@ async def aload(self) -> List[Document]:
93134 projection = self ._construct_projection ()
94135
95136 async for doc in self .collection .find (self .filter_criteria , projection ):
96- metadata = self ._extract_fields (doc , self .metadata_names , default = "" )
97-
98- # Optionally add database and collection names to metadata
99- if self .include_db_collection_in_metadata :
100- metadata .update (
101- {
102- "database" : self .db_name ,
103- "collection" : self .collection_name ,
104- }
105- )
106-
107- # Extract text content from filtered fields or use the entire document
108- if self .field_names is not None :
109- fields = self ._extract_fields (doc , self .field_names , default = "" )
110- texts = [str (value ) for value in fields .values ()]
111- text = " " .join (texts )
112- else :
113- text = str (doc )
114-
115- result .append (Document (page_content = text , metadata = metadata ))
137+ result .append (self ._process_document (doc ))
116138
117139 if len (result ) != total_docs :
118140 logger .warning (
@@ -122,6 +144,33 @@ async def aload(self) -> List[Document]:
122144
123145 return result
124146
147+ def _process_document (self , doc : Dict ) -> Document :
148+ """Process a single MongoDB document into a Document object.
149+
150+ Args:
151+ doc: The MongoDB document dictionary to process into a Document object.
152+ """
153+ metadata = self ._extract_fields (doc , self .metadata_names , default = "" )
154+
155+ # Optionally add database and collection names to metadata
156+ if self .include_db_collection_in_metadata :
157+ metadata .update (
158+ {
159+ "database" : self .db_name ,
160+ "collection" : self .collection_name ,
161+ }
162+ )
163+
164+ # Extract text content from filtered fields or use the entire document
165+ if self .field_names is not None :
166+ fields = self ._extract_fields (doc , self .field_names , default = "" )
167+ texts = [str (value ) for value in fields .values ()]
168+ text = " " .join (texts )
169+ else :
170+ text = str (doc )
171+
172+ return Document (page_content = text , metadata = metadata )
173+
125174 def _construct_projection (self ) -> Optional [Dict ]:
126175 """Constructs the projection dictionary for MongoDB query based
127176 on the specified field names and metadata names."""
0 commit comments