Skip to content

Commit 99a998f

Browse files
sjrl and bglearning authored
feat: Add MSGToDocument converter (#8868)
* Initial commit of MSG converter from Bijay * Updates to the MSG converter * Add license header * Add tests for msg converter * Update converter * Expanding tests * Update docstrings * add license header * Add reno * Add to inits and pydocs * Add test for empty input * Fix types * Fix mypy --------- Co-authored-by: Bijay Gurung <[email protected]>
1 parent d7dfc52 commit 99a998f

File tree

7 files changed

+242
-0
lines changed

7 files changed

+242
-0
lines changed

docs/pydoc/config/converters_api.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ loaders:
99
"html",
1010
"json",
1111
"markdown",
12+
"msg",
1213
"openapi_functions",
1314
"output_adapter",
1415
"pdfminer",

haystack/components/converters/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
"html": ["HTMLToDocument"],
1515
"json": ["JSONConverter"],
1616
"markdown": ["MarkdownToDocument"],
17+
"msg": ["MSGToDocument"],
1718
"openapi_functions": ["OpenAPIServiceToFunctions"],
1819
"output_adapter": ["OutputAdapter"],
1920
"pdfminer": ["PDFMinerToDocument"],
@@ -31,6 +32,7 @@
3132
from .html import HTMLToDocument
3233
from .json import JSONConverter
3334
from .markdown import MarkdownToDocument
35+
from .msg import MSGToDocument
3436
from .openapi_functions import OpenAPIServiceToFunctions
3537
from .output_adapter import OutputAdapter
3638
from .pdfminer import PDFMinerToDocument
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import io
6+
import os
7+
from pathlib import Path
8+
from typing import Any, Dict, List, Optional, Tuple, Union
9+
10+
from haystack import Document, component, logging
11+
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
12+
from haystack.dataclasses import ByteStream
13+
from haystack.lazy_imports import LazyImport
14+
15+
with LazyImport("Run 'pip install python-oxmsg'") as oxmsg_import:
16+
from oxmsg import Message, recipient
17+
18+
19+
logger = logging.getLogger(__name__)
20+
21+
22+
@component
class MSGToDocument:
    """
    Converts Microsoft Outlook .msg files into Haystack Documents.

    This component extracts email metadata (such as sender, recipients, CC, BCC, subject) and body content from .msg
    files and converts them into structured Haystack Documents. Additionally, any file attachments within the .msg
    file are extracted as ByteStream objects.

    ### Example Usage

    ```python
    from haystack.components.converters.msg import MSGToDocument
    from datetime import datetime

    converter = MSGToDocument()
    results = converter.run(sources=["sample.msg"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    attachments = results["attachments"]
    print(documents[0].content)
    ```
    """

    def __init__(self, store_full_path: bool = False) -> None:
        """
        Creates a MSGToDocument component.

        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        # Fails early with an install hint if the optional `python-oxmsg` dependency is missing.
        oxmsg_import.check()
        self.store_full_path = store_full_path

    @staticmethod
    def _is_encrypted(msg: "Message") -> bool:
        """
        Determines whether the provided MSG file is encrypted.

        The check looks for the substring "encrypted" in the message's `Content-Type` header.

        :param msg: The MSG file as a parsed Message object.
        :returns: True if the MSG file is encrypted, otherwise False.
        """
        return "encrypted" in msg.message_headers.get("Content-Type", "")

    @staticmethod
    def _create_recipient_str(recip: "recipient.Recipient") -> str:
        """
        Formats a recipient's name and email into a single string.

        The name and the email address are separated by a single space. Empty parts are left out, so a recipient
        with only a name (or only an address) yields that part alone, without stray whitespace.

        :param recip: A recipient object extracted from the MSG file.
        :returns: A formatted string combining the recipient's name and email address.
        """
        return " ".join(str(part) for part in (recip.name, recip.email_address) if part)

    def _convert(self, file_content: io.BytesIO) -> Tuple[str, List[ByteStream]]:
        """
        Converts the MSG file content into text and extracts any attachments.

        The text starts with one header line per available field (From, To, Cc, Bcc, Subject), followed by a blank
        line and the message body when a body is present.

        :param file_content: The MSG file content as a binary stream.
        :returns: A tuple containing the extracted email text and a list of ByteStream objects for attachments.
        :raises ValueError: If the MSG file is encrypted and cannot be read.
        """
        msg = Message.load(file_content)
        if self._is_encrypted(msg):
            raise ValueError("The MSG file is encrypted and cannot be read.")

        parts: List[str] = []

        # Sender
        if msg.sender is not None:
            parts.append(f"From: {msg.sender}\n")

        # To
        recipients_str = ",".join(self._create_recipient_str(r) for r in msg.recipients)
        if recipients_str != "":
            parts.append(f"To: {recipients_str}\n")

        # CC and BCC are read from the raw headers; both common capitalisations are tried.
        headers = msg.message_headers
        cc_header = headers.get("Cc") or headers.get("CC")
        if cc_header is not None:
            parts.append(f"Cc: {cc_header}\n")

        bcc_header = headers.get("Bcc") or headers.get("BCC")
        if bcc_header is not None:
            parts.append(f"Bcc: {bcc_header}\n")

        # Subject
        if msg.subject != "":
            parts.append(f"Subject: {msg.subject}\n")

        # Body, separated from the header lines by a blank line
        if msg.body is not None:
            parts.append("\n" + msg.body)

        # Attachments without any payload are skipped.
        attachments = [
            ByteStream(
                data=attachment.file_bytes, meta={"file_path": attachment.file_name}, mime_type=attachment.mime_type
            )
            for attachment in msg.attachments
            if attachment.file_bytes is not None
        ]

        return "".join(parts), attachments

    @component.output_types(documents=List[Document], attachments=List[ByteStream])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, Union[List[Document], List[ByteStream]]]:
        """
        Converts MSG files to Documents.

        Sources that cannot be read or converted are skipped and a warning is logged for each of them.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents.
            - `attachments`: Created ByteStream objects from file attachments. Each attachment carries the metadata
              of its parent Document, with `file_path` set to the attachment's own file name and, when the parent
              has a `file_path`, `parent_file_path` set to it.
        """
        if not sources:
            return {"documents": [], "attachments": []}

        documents = []
        all_attachments = []
        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue
            try:
                text, attachments = self._convert(io.BytesIO(bytestream.data))
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
                )
                continue

            merged_metadata = {**bytestream.meta, **metadata}

            if not self.store_full_path and "file_path" in bytestream.meta:
                merged_metadata["file_path"] = os.path.basename(bytestream.meta["file_path"])

            documents.append(Document(content=text, meta=merged_metadata))

            # A ByteStream source is not guaranteed to carry a `file_path`; indexing the key directly would raise
            # a KeyError here (outside the try blocks) and abort the whole run.
            parent_file_path = merged_metadata.get("file_path")
            for attachment in attachments:
                attachment_meta = {**merged_metadata}
                if parent_file_path is not None:
                    attachment_meta["parent_file_path"] = parent_file_path
                attachment_meta["file_path"] = attachment.meta["file_path"]
                all_attachments.append(
                    ByteStream(data=attachment.data, meta=attachment_meta, mime_type=attachment.mime_type)
                )

        return {"documents": documents, "attachments": all_attachments}

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ extra-dependencies = [
113113
"jq", # JSONConverter
114114
"openpyxl", # XLSXToDocument
115115
"tabulate", # XLSXToDocument
116+
"python-oxmsg", # MSGToDocument
116117

117118
"nltk>=3.9.1", # NLTKDocumentSplitter
118119

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
features:
3+
- |
4+
Add a new MSGToDocument component to convert .msg files into Haystack Document objects.
5+
- Extracts email metadata (e.g. sender, recipients, CC, BCC, subject) and body content into a Document.
6+
- Converts attachments into ByteStream objects which can be passed on to a FileTypeRouter + relevant converters.
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from haystack.components.converters.msg import MSGToDocument
6+
7+
8+
class TestMSGToDocument:
    """Tests for the MSGToDocument converter."""

    def test_run(self, test_files_path):
        """A sample .msg file yields one Document and one PDF attachment with the expected metadata."""
        msg_path = test_files_path / "msg" / "sample.msg"
        converter = MSGToDocument(store_full_path=True)

        result = converter.run(sources=[msg_path], meta={"date_added": "2021-09-01T00:00:00"})

        documents = result["documents"]
        assert len(documents) == 1
        document = documents[0]
        assert document.content.startswith('From: "Sebastian Lee"')
        assert document.meta == {
            "date_added": "2021-09-01T00:00:00",
            "file_path": str(msg_path),
        }

        attachments = result["attachments"]
        assert len(attachments) == 1
        attachment = attachments[0]
        assert attachment.mime_type == "application/pdf"
        assert attachment.meta == {
            "date_added": "2021-09-01T00:00:00",
            "parent_file_path": str(msg_path),
            "file_path": "sample_pdf_1.pdf",
        }

    def test_run_wrong_file_type(self, test_files_path, caplog):
        """A non-MSG source produces no Documents and the failure is logged."""
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"
        converter = MSGToDocument(store_full_path=False)

        result = converter.run(sources=[pdf_path], meta={"date_added": "2021-09-01T00:00:00"})

        assert len(result["documents"]) == 0
        assert "msg_file is not an Outlook MSG file" in caplog.text

    def test_run_empty_sources(self, test_files_path):
        """An empty list of sources returns empty `documents` and `attachments` lists."""
        converter = MSGToDocument(store_full_path=False)

        result = converter.run(sources=[])

        assert len(result["documents"]) == 0
        assert len(result["attachments"]) == 0

test/test_files/msg/sample.msg

66 KB
Binary file not shown.

0 commit comments

Comments
 (0)