1
- """Load documents from Evernote .
1
+ """Document loader for EverNote ENEX export files .
2
2
3
- https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
3
+ This module provides functionality to securely load and parse EverNote notebook
4
+ export files (``.enex`` format) into LangChain Document objects.
4
5
"""
5
6
6
7
import hashlib
18
19
19
20
20
21
class EverNoteLoader (BaseLoader ):
21
- """Load from `EverNote`.
22
-
23
- Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
24
- Instructions on producing this file can be found at
25
- https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML
26
-
27
- Currently only the plain text in the note is extracted and stored as the contents
28
- of the Document, any non content metadata (e.g. 'author', 'created', 'updated' etc.
29
- but not 'content-raw' or 'resource') tags on the note will be extracted and stored
30
- as metadata on the Document.
31
-
32
- Args:
33
- file_path (str): The path to the notebook export with a .enex extension
34
- load_single_document (bool): Whether or not to concatenate the content of all
35
- notes into a single long Document.
36
- If this is set to True (default) then the only metadata on the document will be
37
- the 'source' which contains the file name of the export.
22
+ """Document loader for EverNote ENEX export files.
23
+
24
+ Loads EverNote notebook export files (``.enex`` format) into LangChain Documents.
25
+ Extracts plain text content from HTML and preserves note metadata including
26
+ titles, timestamps, and attachments. Uses hardened XML parsing to guard
27
+ against XXE and entity-expansion ("XML bomb") attacks.
28
+
29
+ The loader supports two modes:
30
+ - Single document: Concatenates all notes into one Document (default)
31
+ - Multiple documents: Creates separate Documents for each note
32
+
33
+ `Instructions for creating ENEX files <https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML>`__
34
+
35
+ Example:
36
+
37
+ .. code-block:: python
38
+
39
+ from langchain_community.document_loaders import EverNoteLoader
40
+
41
+ # Load all notes as a single document
42
+ loader = EverNoteLoader("my_notebook.enex")
43
+ documents = loader.load()
44
+
45
+ # Load each note as a separate document:
46
+ # documents = [ document1, document2, ... ]
47
+ loader = EverNoteLoader("my_notebook.enex", load_single_document=False)
48
+ documents = loader.load()
49
+
50
+ # Lazy loading for large files
51
+ for doc in loader.lazy_load():
52
+ print(f"Title: {doc.metadata.get('title', 'Untitled')}")
53
+ print(f"Content: {doc.page_content[:100]}...")
54
+
55
+ Note:
56
+ Requires the ``lxml`` and ``html2text`` packages to be installed.
57
+ Install with: ``pip install lxml html2text``
38
58
"""
39
59
40
60
def __init__ (self , file_path : Union [str , Path ], load_single_document : bool = True ):
41
- """Initialize with file path."""
61
+ """Initialize the EverNote loader.
62
+
63
+ Args:
64
+ file_path: Path to the EverNote export file (``.enex`` extension).
65
+ load_single_document: Whether to concatenate all notes into a single
66
+ Document. If ``True``, only the ``source`` metadata is preserved.
67
+ If ``False``, each note becomes a separate Document with its own
68
+ metadata.
69
+ """
42
70
self .file_path = str (file_path )
43
71
self .load_single_document = load_single_document
44
72
45
73
def _lazy_load (self ) -> Iterator [Document ]:
74
+ """Lazily load documents from the EverNote export file.
75
+
76
+ Lazy loading allows processing large EverNote files without
77
+ loading everything into memory at once. This method yields Documents
78
+ one by one by parsing the XML. Each document represents a note in the EverNote
79
+ export, containing the note's content as ``page_content`` and metadata including
80
+ ``title``, ``created``/``updated`` timestamps, and other note attributes.
81
+
82
+ Yields:
83
+ Document: A Document object for each note in the export file.
84
+ """
46
85
for note in self ._parse_note_xml (self .file_path ):
47
86
if note .get ("content" ) is not None :
48
87
yield Document (
@@ -58,7 +97,14 @@ def _lazy_load(self) -> Iterator[Document]:
58
97
)
59
98
60
99
def lazy_load (self ) -> Iterator [Document ]:
61
- """Load documents from EverNote export file."""
100
+ """Load documents from EverNote export file.
101
+
102
+ Depending on the ``load_single_document`` setting, either yields individual
103
+ Documents for each note or a single Document containing all notes.
104
+
105
+ Yields:
106
+ Document: Either individual note Documents or a single combined Document.
107
+ """
62
108
if not self .load_single_document :
63
109
yield from self ._lazy_load ()
64
110
else :
@@ -71,19 +117,44 @@ def lazy_load(self) -> Iterator[Document]:
71
117
72
118
@staticmethod
73
119
def _parse_content (content : str ) -> str :
120
+ """Parse HTML content from EverNote into plain text.
121
+
122
+ Converts HTML content to plain text using the ``html2text`` library.
123
+ Strips whitespace from the result.
124
+
125
+ Args:
126
+ content: HTML content string from EverNote.
127
+
128
+ Returns:
129
+ Plain text version of the content.
130
+
131
+ Raises:
132
+ ImportError: If ``html2text`` is not installed.
133
+ """
74
134
try :
75
135
import html2text
76
136
77
137
return html2text .html2text (content ).strip ()
78
138
except ImportError as e :
79
139
raise ImportError (
80
140
"Could not import `html2text`. Although it is not a required package "
81
- "to use Langchain , using the EverNote loader requires `html2text`. "
141
+ "to use LangChain , using the EverNote loader requires `html2text`. "
82
142
"Please install `html2text` via `pip install html2text` and try again."
83
143
) from e
84
144
85
145
@staticmethod
86
146
def _parse_resource (resource : list ) -> dict :
147
+ """Parse resource elements from EverNote XML.
148
+
149
+ Extracts resource information like attachments, images, etc.
150
+ Base64 decodes data elements and generates MD5 hashes.
151
+
152
+ Args:
153
+ resource: List of XML elements representing a resource.
154
+
155
+ Returns:
156
+ Dictionary containing resource metadata and decoded data.
157
+ """
87
158
rsc_dict : Dict [str , Any ] = {}
88
159
for elem in resource :
89
160
if elem .tag == "data" :
@@ -97,6 +168,18 @@ def _parse_resource(resource: list) -> dict:
97
168
98
169
@staticmethod
99
170
def _parse_note (note : List , prefix : Optional [str ] = None ) -> dict :
171
+ """Parse a note element from EverNote XML.
172
+
173
+ Extracts note content, metadata, resources, and attributes.
174
+ Handles nested note-attributes recursively with prefixes.
175
+
176
+ Args:
177
+ note: List of XML elements representing a note.
178
+ prefix: Optional prefix for nested attribute names.
179
+
180
+ Returns:
181
+ Dictionary containing note content and metadata.
182
+ """
100
183
note_dict : Dict [str , Any ] = {}
101
184
resources = []
102
185
@@ -129,22 +212,37 @@ def add_prefix(element_tag: str) -> str:
129
212
130
213
@staticmethod
131
214
def _parse_note_xml (xml_file : str ) -> Iterator [Dict [str , Any ]]:
132
- """Parse Evernote xml."""
133
- # Without huge_tree set to True, parser may complain about huge text node
134
- # Try to recover, because there may be " ", which will cause
135
- # "XMLSyntaxError: Entity 'nbsp' not defined"
215
+ """Parse EverNote XML file securely.
216
+
217
+ Uses ``lxml`` with secure parsing configuration to prevent XML vulnerabilities
218
+ including XXE attacks, XML bombs, and malformed XML exploitation.
219
+
220
+ Args:
221
+ xml_file: Path to the EverNote export XML file.
222
+
223
+ Yields:
224
+ Dictionary containing parsed note data for each note in the file.
225
+
226
+ Raises:
227
+ ImportError: If ``lxml`` is not installed.
228
+ """
136
229
try :
137
230
from lxml import etree
138
231
except ImportError as e :
139
232
logger .error (
140
233
"Could not import `lxml`. Although it is not a required package to use "
141
- "Langchain , using the EverNote loader requires `lxml`. Please install "
234
+ "LangChain , using the EverNote loader requires `lxml`. Please install "
142
235
"`lxml` via `pip install lxml` and try again."
143
236
)
144
237
raise e
145
238
146
239
context = etree .iterparse (
147
- xml_file , encoding = "utf-8" , strip_cdata = False , huge_tree = True , recover = True
240
+ xml_file ,
241
+ encoding = "utf-8" ,
242
+ resolve_entities = False , # Prevents XXE attacks
243
+ no_network = True , # Blocks network-based external entities
244
+ recover = False , # Avoid parsing invalid/malformed XML
245
+ huge_tree = False , # Protect against XML Bomb DoS attacks
148
246
)
149
247
150
248
for action , elem in context :
0 commit comments