feat(mongodb): add lazy loading support for MongoDB document loader (#334)

dbgsprw · web-flow · commit 214d639b591b · 2025-09-28T20:02:47.000-04:00
- Add lazy_load() method for memory-efficient document streaming
- Add alazy_load() async method for async document streaming
- Refactor document processing into _process_document() helper method
- Add a unit test for lazy_load() (with alazy_load() stubbed out)
diff --git a/libs/community/langchain_community/document_loaders/mongodb.py b/libs/community/langchain_community/document_loaders/mongodb.py
@@ -1,6 +1,6 @@
 import asyncio
 import logging
-from typing import Dict, List, Optional, Sequence
+from typing import AsyncIterator, Dict, Iterator, List, Optional, Sequence
 
 from langchain_core.documents import Document
 
@@ -85,6 +85,47 @@ def load(self) -> List[Document]:
         """
         return asyncio.run(self.aload())
 
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazily yield documents by driving ``alazy_load()`` one item at a time.
+
+        Iteration runs on a private event loop that is closed once the generator
+        finishes or is discarded, so this is only usable from synchronous code.
+
+        Yields:
+            Document: A document from the MongoDB collection.
+
+        Raises:
+            RuntimeError: If iterated while an event loop is already running.
+        """
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            event_loop = asyncio.new_event_loop()  # sync caller: own a loop
+        else:
+            raise RuntimeError("lazy_load() needs a sync context; use alazy_load()")
+        async_generator = self.alazy_load()
+        try:
+            while True:
+                try:
+                    doc = event_loop.run_until_complete(async_generator.__anext__())
+                except StopAsyncIteration:
+                    break
+                yield doc
+        finally:
+            event_loop.run_until_complete(async_generator.aclose())
+            event_loop.close()
+
+    async def alazy_load(self) -> AsyncIterator[Document]:
+        """Stream matching MongoDB records as Documents, one at a time.
+
+        Yields:
+            Document: A document from the MongoDB collection.
+        """
+        fields = self._construct_projection()
+        cursor = self.collection.find(self.filter_criteria, fields)
+        async for raw_doc in cursor:
+            yield self._process_document(raw_doc)
+
     async def aload(self) -> List[Document]:
         """Asynchronously loads data into Document objects."""
         result = []
@@ -93,26 +134,7 @@ async def aload(self) -> List[Document]:
         projection = self._construct_projection()
 
         async for doc in self.collection.find(self.filter_criteria, projection):
-            metadata = self._extract_fields(doc, self.metadata_names, default="")
-
-            # Optionally add database and collection names to metadata
-            if self.include_db_collection_in_metadata:
-                metadata.update(
-                    {
-                        "database": self.db_name,
-                        "collection": self.collection_name,
-                    }
-                )
-
-            # Extract text content from filtered fields or use the entire document
-            if self.field_names is not None:
-                fields = self._extract_fields(doc, self.field_names, default="")
-                texts = [str(value) for value in fields.values()]
-                text = " ".join(texts)
-            else:
-                text = str(doc)
-
-            result.append(Document(page_content=text, metadata=metadata))
+            result.append(self._process_document(doc))
 
         if len(result) != total_docs:
             logger.warning(
@@ -122,6 +144,33 @@ async def aload(self) -> List[Document]:
 
         return result
 
+    def _process_document(self, doc: Dict) -> Document:
+        """Convert one raw MongoDB record into a ``Document``.
+
+        Args:
+            doc: The raw MongoDB record to convert.
+
+        Returns:
+            A Document whose metadata holds the ``metadata_names`` fields and
+            whose page content is the space-joined ``field_names`` values, or
+            ``str(doc)`` when ``field_names`` is None.
+        """
+        doc_metadata = self._extract_fields(doc, self.metadata_names, default="")
+        if self.include_db_collection_in_metadata:
+            # Tag the result with the database/collection it was read from.
+            doc_metadata["database"] = self.db_name
+            doc_metadata["collection"] = self.collection_name
+
+        if self.field_names is None:
+            # No field selection: fall back to the whole record's string form.
+            page_content = str(doc)
+        else:
+            # Join the selected field values with single spaces.
+            selected = self._extract_fields(doc, self.field_names, default="")
+            page_content = " ".join(str(value) for value in selected.values())
+
+        return Document(page_content=page_content, metadata=doc_metadata)
+
     def _construct_projection(self) -> Optional[Dict]:
         """Constructs the projection dictionary for MongoDB query based
         on the specified field names and metadata names."""
diff --git a/libs/community/tests/unit_tests/document_loaders/test_mongodb.py b/libs/community/tests/unit_tests/document_loaders/test_mongodb.py
@@ -1,4 +1,4 @@
-from typing import Dict, List
+from typing import AsyncIterator, Dict, List
 from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
@@ -70,3 +70,42 @@ async def test_load_mocked_with_filters(expected_documents: List[Document]) -> N
         documents = await loader.aload()
 
     assert documents == expected_documents
+
+
+@pytest.mark.requires("motor")
+def test_lazy_load_mocked_with_filters(expected_documents: List[Document]) -> None:
+    """lazy_load() should yield every item produced by alazy_load(), in order.
+
+    alazy_load() is patched out, so this covers only the sync-to-async bridge,
+    not the MongoDB query or the document conversion.
+    """
+    filter_criteria = {"address.room": {"$eq": "2"}}
+    field_names = ["address.building", "address.room"]
+    metadata_names = ["_id"]
+    include_db_collection_in_metadata = True
+
+    async def mock_async_generator() -> AsyncIterator[Document]:
+        for doc in expected_documents:
+            yield doc
+
+    with (
+        patch("motor.motor_asyncio.AsyncIOMotorClient", return_value=MagicMock()),
+        patch(
+            "langchain_community.document_loaders.mongodb.MongodbLoader.alazy_load",
+            return_value=mock_async_generator(),
+        ),
+    ):
+        loader = MongodbLoader(
+            "mongodb://localhost:27017",
+            "test_db",
+            "test_collection",
+            filter_criteria=filter_criteria,
+            field_names=field_names,
+            metadata_names=metadata_names,
+            include_db_collection_in_metadata=include_db_collection_in_metadata,
+        )
+        # No collection mock is needed: with alazy_load() stubbed, the loader
+        # never issues a find() against the (already mocked) client.
+        documents = list(loader.lazy_load())
+
+    assert documents == expected_documents