Skip to content

Commit e842452

Browse files
eyurtsev and mdrxy authored
community[patch]: Prevent XXE in evernote loader (#139)
Prevent XXE in evernote loader --------- Co-authored-by: Mason Daugherty <[email protected]>
1 parent 6057ba5 commit e842452

File tree

1 file changed

+126
-28
lines changed
  • libs/community/langchain_community/document_loaders

1 file changed

+126
-28
lines changed

libs/community/langchain_community/document_loaders/evernote.py

Lines changed: 126 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
"""Load documents from Evernote.
1+
"""Document loader for EverNote ENEX export files.
22
3-
https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
3+
This module provides functionality to securely load and parse EverNote notebook
4+
export files (``.enex`` format) into LangChain Document objects.
45
"""
56

67
import hashlib
@@ -18,31 +19,69 @@
1819

1920

2021
class EverNoteLoader(BaseLoader):
21-
"""Load from `EverNote`.
22-
23-
Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
24-
Instructions on producing this file can be found at
25-
https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML
26-
27-
Currently only the plain text in the note is extracted and stored as the contents
28-
of the Document, any non content metadata (e.g. 'author', 'created', 'updated' etc.
29-
but not 'content-raw' or 'resource') tags on the note will be extracted and stored
30-
as metadata on the Document.
31-
32-
Args:
33-
file_path (str): The path to the notebook export with a .enex extension
34-
load_single_document (bool): Whether or not to concatenate the content of all
35-
notes into a single long Document.
36-
If this is set to True (default) then the only metadata on the document will be
37-
the 'source' which contains the file name of the export.
22+
"""Document loader for EverNote ENEX export files.
23+
24+
Loads EverNote notebook export files (``.enex`` format) into LangChain Documents.
25+
Extracts plain text content from HTML and preserves note metadata including
26+
titles, timestamps, and attachments. Uses secure XML parsing to prevent
27+
vulnerabilities.
28+
29+
The loader supports two modes:
30+
- Single document: Concatenates all notes into one Document (default)
31+
- Multiple documents: Creates separate Documents for each note
32+
33+
`Instructions for creating ENEX files <https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML>`__
34+
35+
Example:
36+
37+
.. code-block:: python
38+
39+
from langchain_community.document_loaders import EverNoteLoader
40+
41+
# Load all notes as a single document
42+
loader = EverNoteLoader("my_notebook.enex")
43+
documents = loader.load()
44+
45+
# Load each note as a separate document:
46+
# documents = [ document1, document2, ... ]
47+
loader = EverNoteLoader("my_notebook.enex", load_single_document=False)
48+
documents = loader.load()
49+
50+
# Lazy loading for large files
51+
for doc in loader.lazy_load():
52+
print(f"Title: {doc.metadata.get('title', 'Untitled')}")
53+
print(f"Content: {doc.page_content[:100]}...")
54+
55+
Note:
56+
Requires the ``lxml`` and ``html2text`` packages to be installed.
57+
Install with: ``pip install lxml html2text``
3858
"""
3959

4060
def __init__(self, file_path: Union[str, Path], load_single_document: bool = True):
41-
"""Initialize with file path."""
61+
"""Initialize the EverNote loader.
62+
63+
Args:
64+
file_path: Path to the EverNote export file (``.enex`` extension).
65+
load_single_document: Whether to concatenate all notes into a single
66+
Document. If ``True``, only the ``source`` metadata is preserved.
67+
If ``False``, each note becomes a separate Document with its own
68+
metadata.
69+
"""
4270
self.file_path = str(file_path)
4371
self.load_single_document = load_single_document
4472

4573
def _lazy_load(self) -> Iterator[Document]:
74+
"""Lazily load documents from the EverNote export file.
75+
76+
Lazy loading allows processing large EverNote files without
77+
loading everything into memory at once. This method yields Documents
78+
one by one by parsing the XML. Each document represents a note in the EverNote
79+
export, containing the note's content as ``page_content`` and metadata including
80+
``title``, ``created``/``updated`` timestamps, and other note attributes.
81+
82+
Yields:
83+
Document: A Document object for each note in the export file.
84+
"""
4685
for note in self._parse_note_xml(self.file_path):
4786
if note.get("content") is not None:
4887
yield Document(
@@ -58,7 +97,14 @@ def _lazy_load(self) -> Iterator[Document]:
5897
)
5998

6099
def lazy_load(self) -> Iterator[Document]:
61-
"""Load documents from EverNote export file."""
100+
"""Load documents from EverNote export file.
101+
102+
Depending on the ``load_single_document`` setting, either yields individual
103+
Documents for each note or a single Document containing all notes.
104+
105+
Yields:
106+
Document: Either individual note Documents or a single combined Document.
107+
"""
62108
if not self.load_single_document:
63109
yield from self._lazy_load()
64110
else:
@@ -71,19 +117,44 @@ def lazy_load(self) -> Iterator[Document]:
71117

72118
@staticmethod
73119
def _parse_content(content: str) -> str:
120+
"""Parse HTML content from EverNote into plain text.
121+
122+
Converts HTML content to plain text using the ``html2text`` library.
123+
Strips whitespace from the result.
124+
125+
Args:
126+
content: HTML content string from EverNote.
127+
128+
Returns:
129+
Plain text version of the content.
130+
131+
Raises:
132+
ImportError: If ``html2text`` is not installed.
133+
"""
74134
try:
75135
import html2text
76136

77137
return html2text.html2text(content).strip()
78138
except ImportError as e:
79139
raise ImportError(
80140
"Could not import `html2text`. Although it is not a required package "
81-
"to use Langchain, using the EverNote loader requires `html2text`. "
141+
"to use LangChain, using the EverNote loader requires `html2text`. "
82142
"Please install `html2text` via `pip install html2text` and try again."
83143
) from e
84144

85145
@staticmethod
86146
def _parse_resource(resource: list) -> dict:
147+
"""Parse resource elements from EverNote XML.
148+
149+
Extracts resource information like attachments, images, etc.
150+
Base64 decodes data elements and generates MD5 hashes.
151+
152+
Args:
153+
resource: List of XML elements representing a resource.
154+
155+
Returns:
156+
Dictionary containing resource metadata and decoded data.
157+
"""
87158
rsc_dict: Dict[str, Any] = {}
88159
for elem in resource:
89160
if elem.tag == "data":
@@ -97,6 +168,18 @@ def _parse_resource(resource: list) -> dict:
97168

98169
@staticmethod
99170
def _parse_note(note: List, prefix: Optional[str] = None) -> dict:
171+
"""Parse a note element from EverNote XML.
172+
173+
Extracts note content, metadata, resources, and attributes.
174+
Handles nested note-attributes recursively with prefixes.
175+
176+
Args:
177+
note: List of XML elements representing a note.
178+
prefix: Optional prefix for nested attribute names.
179+
180+
Returns:
181+
Dictionary containing note content and metadata.
182+
"""
100183
note_dict: Dict[str, Any] = {}
101184
resources = []
102185

@@ -129,22 +212,37 @@ def add_prefix(element_tag: str) -> str:
129212

130213
@staticmethod
131214
def _parse_note_xml(xml_file: str) -> Iterator[Dict[str, Any]]:
132-
"""Parse Evernote xml."""
133-
# Without huge_tree set to True, parser may complain about huge text node
134-
# Try to recover, because there may be "&nbsp;", which will cause
135-
# "XMLSyntaxError: Entity 'nbsp' not defined"
215+
"""Parse EverNote XML file securely.
216+
217+
Uses ``lxml`` with secure parsing configuration to prevent XML vulnerabilities
218+
including XXE attacks, XML bombs, and malformed XML exploitation.
219+
220+
Args:
221+
xml_file: Path to the EverNote export XML file.
222+
223+
Yields:
224+
Dictionary containing parsed note data for each note in the file.
225+
226+
Raises:
227+
ImportError: If ``lxml`` is not installed.
228+
"""
136229
try:
137230
from lxml import etree
138231
except ImportError as e:
139232
logger.error(
140233
"Could not import `lxml`. Although it is not a required package to use "
141-
"Langchain, using the EverNote loader requires `lxml`. Please install "
234+
"LangChain, using the EverNote loader requires `lxml`. Please install "
142235
"`lxml` via `pip install lxml` and try again."
143236
)
144237
raise e
145238

146239
context = etree.iterparse(
147-
xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
240+
xml_file,
241+
encoding="utf-8",
242+
resolve_entities=False, # Prevents XXE attacks
243+
no_network=True, # Blocks network-based external entities
244+
recover=False, # Avoid parsing invalid/malformed XML
245+
huge_tree=False, # Protect against XML Bomb DoS attacks
148246
)
149247

150248
for action, elem in context:

0 commit comments

Comments
 (0)