community[patch]: Prevent XXE in evernote loader (#139)

eyurtsev · mdrxy · web-flow · commit e84245210808 · 2025-07-02T14:11:04.000-04:00
Prevent XXE in evernote loader

---------

Co-authored-by: Mason Daugherty <github@mdrxy.com>
diff --git a/libs/community/langchain_community/document_loaders/evernote.py b/libs/community/langchain_community/document_loaders/evernote.py
@@ -1,6 +1,7 @@
-"""Load documents from Evernote.
+"""Document loader for EverNote ENEX export files.
 
-https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
+This module provides functionality to securely load and parse EverNote notebook
+export files (``.enex`` format) into LangChain Document objects.
 """
 
 import hashlib
@@ -18,31 +19,69 @@
 
 
 class EverNoteLoader(BaseLoader):
-    """Load from `EverNote`.
-
-    Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
-    Instructions on producing this file can be found at
-    https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML
-
-    Currently only the plain text in the note is extracted and stored as the contents
-    of the Document, any non content metadata (e.g. 'author', 'created', 'updated' etc.
-    but not 'content-raw' or 'resource') tags on the note will be extracted and stored
-    as metadata on the Document.
-
-    Args:
-        file_path (str): The path to the notebook export with a .enex extension
-        load_single_document (bool): Whether or not to concatenate the content of all
-            notes into a single long Document.
-        If this is set to True (default) then the only metadata on the document will be
-            the 'source' which contains the file name of the export.
+    """Document loader for EverNote ENEX export files.
+
+    Loads EverNote notebook export files (``.enex`` format) into LangChain Documents.
+    Extracts plain text content from HTML and preserves note metadata including
+    titles, timestamps, and attachments. Uses secure XML parsing to prevent
+    vulnerabilities.
+
+    The loader supports two modes:
+    - Single document: Concatenates all notes into one Document (default)
+    - Multiple documents: Creates separate Documents for each note
+
+    `Instructions for creating ENEX files <https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML>`__
+
+    Example:
+
+    .. code-block:: python
+
+        from langchain_community.document_loaders import EverNoteLoader
+
+        # Load all notes as a single document
+        loader = EverNoteLoader("my_notebook.enex")
+        documents = loader.load()
+
+        # Load each note as a separate document:
+        # documents = [ document1, document2, ... ]
+        loader = EverNoteLoader("my_notebook.enex", load_single_document=False)
+        documents = loader.load()
+
+        # Lazy loading for large files
+        for doc in loader.lazy_load():
+            print(f"Title: {doc.metadata.get('title', 'Untitled')}")
+            print(f"Content: {doc.page_content[:100]}...")
+
+    Note:
+        Requires the ``lxml`` and ``html2text`` packages to be installed.
+        Install with: ``pip install lxml html2text``
     """
 
     def __init__(self, file_path: Union[str, Path], load_single_document: bool = True):
-        """Initialize with file path."""
+        """Initialize the EverNote loader.
+
+        Args:
+            file_path: Path to the EverNote export file (``.enex`` extension).
+            load_single_document: Whether to concatenate all notes into a single
+                Document. If ``True``, only the ``source`` metadata is preserved.
+                If ``False``, each note becomes a separate Document with its own
+                metadata.
+        """
         self.file_path = str(file_path)
         self.load_single_document = load_single_document
 
     def _lazy_load(self) -> Iterator[Document]:
+        """Lazily load documents from the EverNote export file.
+
+        Lazy loading allows processing large EverNote files without
+        loading everything into memory at once. This method yields Documents
+        one by one by parsing the XML. Each document represents a note in the EverNote
+        export, containing the note's content as ``page_content`` and metadata including
+        ``title``, ``created``/``updated`` timestamps, and other note attributes.
+
+        Yields:
+            Document: A Document object for each note in the export file.
+        """
         for note in self._parse_note_xml(self.file_path):
             if note.get("content") is not None:
                 yield Document(
@@ -58,7 +97,14 @@ def _lazy_load(self) -> Iterator[Document]:
                 )
 
     def lazy_load(self) -> Iterator[Document]:
-        """Load documents from EverNote export file."""
+        """Load documents from EverNote export file.
+
+        Depending on the ``load_single_document`` setting, either yields individual
+        Documents for each note or a single Document containing all notes.
+
+        Yields:
+            Document: Either individual note Documents or a single combined Document.
+        """
         if not self.load_single_document:
             yield from self._lazy_load()
         else:
@@ -71,19 +117,44 @@ def lazy_load(self) -> Iterator[Document]:
 
     @staticmethod
     def _parse_content(content: str) -> str:
+        """Parse HTML content from EverNote into plain text.
+
+        Converts HTML content to plain text using the ``html2text`` library.
+        Strips whitespace from the result.
+
+        Args:
+            content: HTML content string from EverNote.
+
+        Returns:
+            Plain text version of the content.
+
+        Raises:
+            ImportError: If ``html2text`` is not installed.
+        """
         try:
             import html2text
 
             return html2text.html2text(content).strip()
         except ImportError as e:
             raise ImportError(
                 "Could not import `html2text`. Although it is not a required package "
-                "to use Langchain, using the EverNote loader requires `html2text`. "
+                "to use LangChain, using the EverNote loader requires `html2text`. "
                 "Please install `html2text` via `pip install html2text` and try again."
             ) from e
 
     @staticmethod
     def _parse_resource(resource: list) -> dict:
+        """Parse resource elements from EverNote XML.
+
+        Extracts resource information like attachments, images, etc.
+        Base64 decodes data elements and generates MD5 hashes.
+
+        Args:
+            resource: List of XML elements representing a resource.
+
+        Returns:
+            Dictionary containing resource metadata and decoded data.
+        """
         rsc_dict: Dict[str, Any] = {}
         for elem in resource:
             if elem.tag == "data":
@@ -97,6 +168,18 @@ def _parse_resource(resource: list) -> dict:
 
     @staticmethod
     def _parse_note(note: List, prefix: Optional[str] = None) -> dict:
+        """Parse a note element from EverNote XML.
+
+        Extracts note content, metadata, resources, and attributes.
+        Handles nested note-attributes recursively with prefixes.
+
+        Args:
+            note: List of XML elements representing a note.
+            prefix: Optional prefix for nested attribute names.
+
+        Returns:
+            Dictionary containing note content and metadata.
+        """
         note_dict: Dict[str, Any] = {}
         resources = []
 
@@ -129,22 +212,37 @@ def add_prefix(element_tag: str) -> str:
 
     @staticmethod
     def _parse_note_xml(xml_file: str) -> Iterator[Dict[str, Any]]:
-        """Parse Evernote xml."""
-        # Without huge_tree set to True, parser may complain about huge text node
-        # Try to recover, because there may be "&nbsp;", which will cause
-        # "XMLSyntaxError: Entity 'nbsp' not defined"
+        """Parse EverNote XML file securely.
+
+        Uses ``lxml`` with secure parsing configuration to prevent XML vulnerabilities
+        including XXE attacks, XML bombs, and malformed XML exploitation.
+
+        Args:
+            xml_file: Path to the EverNote export XML file.
+
+        Yields:
+            Dictionary containing parsed note data for each note in the file.
+
+        Raises:
+            ImportError: If ``lxml`` is not installed.
+        """
         try:
             from lxml import etree
         except ImportError as e:
             logger.error(
                 "Could not import `lxml`. Although it is not a required package to use "
-                "Langchain, using the EverNote loader requires `lxml`. Please install "
+                "LangChain, using the EverNote loader requires `lxml`. Please install "
                 "`lxml` via `pip install lxml` and try again."
             )
             raise e
 
         context = etree.iterparse(
-            xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
+            xml_file,
+            encoding="utf-8",
+            resolve_entities=False,  # Prevents XXE attacks
+            no_network=True,  # Blocks network-based external entities
+            recover=False,  # Avoid parsing invalid/malformed XML
+            huge_tree=False,  # Protect against XML Bomb DoS attacks
         )
 
         for action, elem in context: