feat: Add MSGToDocument converter (#8868)

sjrl · bglearning · web-flow · commit 99a998f90ba5 · 2025-02-24T08:12:32.000+01:00
* Initial commit of MSG converter from Bijay

* Updates to the MSG converter

* Add license header

* Add tests for msg converter

* Update converter

* Expanding tests

* Update docstrings

* add license header

* Add reno

* Add to inits and pydocs

* Add test for empty input

* Fix types

* Fix mypy

---------

Co-authored-by: Bijay Gurung <bijay.learning@gmail.com>
diff --git a/docs/pydoc/config/converters_api.yml b/docs/pydoc/config/converters_api.yml
@@ -9,6 +9,7 @@ loaders:
         "html",
         "json",
         "markdown",
+        "msg",
         "openapi_functions",
         "output_adapter",
         "pdfminer",
diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py
@@ -14,6 +14,7 @@
     "html": ["HTMLToDocument"],
     "json": ["JSONConverter"],
     "markdown": ["MarkdownToDocument"],
+    "msg": ["MSGToDocument"],
     "openapi_functions": ["OpenAPIServiceToFunctions"],
     "output_adapter": ["OutputAdapter"],
     "pdfminer": ["PDFMinerToDocument"],
@@ -31,6 +32,7 @@
     from .html import HTMLToDocument
     from .json import JSONConverter
     from .markdown import MarkdownToDocument
+    from .msg import MSGToDocument
     from .openapi_functions import OpenAPIServiceToFunctions
     from .output_adapter import OutputAdapter
     from .pdfminer import PDFMinerToDocument
diff --git a/haystack/components/converters/msg.py b/haystack/components/converters/msg.py
@@ -0,0 +1,203 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import io
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from haystack import Document, component, logging
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
+from haystack.dataclasses import ByteStream
+from haystack.lazy_imports import LazyImport
+
+with LazyImport("Run 'pip install python-oxmsg'") as oxmsg_import:
+    from oxmsg import Message, recipient
+
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class MSGToDocument:
+    """
+    Converts Microsoft Outlook .msg files into Haystack Documents.
+
+    This component extracts email metadata (such as sender, recipients, CC, BCC, subject) and body content from .msg
+    files and converts them into structured Haystack Documents. Additionally, any file attachments within the .msg
+    file are extracted as ByteStream objects.
+
+    ### Example Usage
+
+    ```python
+    from haystack.components.converters.msg import MSGToDocument
+    from datetime import datetime
+
+    converter = MSGToDocument()
+    results = converter.run(sources=["sample.msg"], meta={"date_added": datetime.now().isoformat()})
+    documents = results["documents"]
+    attachments = results["attachments"]
+    print(documents[0].content)
+    ```
+    """
+
+    def __init__(self, store_full_path: bool = False) -> None:
+        """
+        Creates a MSGToDocument component.
+
+        :param store_full_path:
+            If True, the full path of the file is stored in the metadata of the document.
+            If False, only the file name is stored.
+        """
+        oxmsg_import.check()
+        self.store_full_path = store_full_path
+
+    @staticmethod
+    def _is_encrypted(msg: "Message") -> bool:
+        """
+        Determines whether the provided MSG file is encrypted.
+
+        The message counts as encrypted when its `Content-Type` header contains the text "encrypted".
+
+        :param msg: The MSG file as a parsed Message object.
+        :returns: True if the MSG file is encrypted, otherwise False.
+        """
+        return "encrypted" in msg.message_headers.get("Content-Type", "")
+
+    @staticmethod
+    def _create_recipient_str(recip: "recipient.Recipient") -> str:
+        """
+        Formats a recipient's name and email into a single string.
+
+        :param recip: A recipient object extracted from the MSG file.
+        :returns:
+            The name and email address separated by a single space. Empty parts are left out, so the result
+            has no leading or trailing space and is an empty string when both parts are empty.
+        """
+        return " ".join(part for part in (recip.name, recip.email_address) if part)
+
+    def _convert(self, file_content: io.BytesIO) -> Tuple[str, List[ByteStream]]:
+        """
+        Converts the MSG file content into text and extracts any attachments.
+
+        The text consists of one line per available header (From, To, Cc, Bcc, Subject), followed by an empty
+        line and the message body if the message has one.
+
+        :param file_content: The MSG file content as a binary stream.
+        :returns: A tuple containing the extracted email text and a list of ByteStream objects for attachments.
+        :raises ValueError: If the MSG file is encrypted and cannot be read.
+        """
+        msg = Message.load(file_content)
+        if self._is_encrypted(msg):
+            raise ValueError("The MSG file is encrypted and cannot be read.")
+
+        headers = msg.message_headers
+        header_lines: List[str] = []
+
+        # Sender
+        if msg.sender is not None:
+            header_lines.append(f"From: {msg.sender}")
+
+        # To: recipients with neither a name nor an email address are skipped to avoid stray commas
+        recipient_strs = (self._create_recipient_str(r) for r in msg.recipients)
+        recipients_str = ",".join(r for r in recipient_strs if r)
+        if recipients_str:
+            header_lines.append(f"To: {recipients_str}")
+
+        # CC: both spellings of the header name are checked, an empty value is ignored
+        cc_header = headers.get("Cc") or headers.get("CC")
+        if cc_header:
+            header_lines.append(f"Cc: {cc_header}")
+
+        # BCC: handled the same way as CC
+        bcc_header = headers.get("Bcc") or headers.get("BCC")
+        if bcc_header:
+            header_lines.append(f"Bcc: {bcc_header}")
+
+        # Subject
+        if msg.subject:
+            header_lines.append(f"Subject: {msg.subject}")
+
+        txt = "".join(f"{line}\n" for line in header_lines)
+
+        # Body, separated from the header lines by an empty line
+        if msg.body is not None:
+            txt += "\n" + msg.body
+
+        # Attachments without binary content are skipped
+        attachments = [
+            ByteStream(
+                data=attachment.file_bytes, meta={"file_path": attachment.file_name}, mime_type=attachment.mime_type
+            )
+            for attachment in msg.attachments
+            if attachment.file_bytes is not None
+        ]
+
+        return txt, attachments
+
+    @component.output_types(documents=List[Document], attachments=List[ByteStream])
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ) -> Dict[str, Union[List[Document], List[ByteStream]]]:
+        """
+        Converts MSG files to Documents.
+
+        Sources that cannot be read or converted are skipped and a warning is logged.
+
+        :param sources:
+            List of file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will
+            be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: Created Documents.
+            - `attachments`: Created ByteStream objects from file attachments. Each one carries the metadata of
+              its message, with `file_path` set to the attachment's file name and, if the message has a
+              `file_path`, `parent_file_path` set to it.
+        """
+        if len(sources) == 0:
+            return {"documents": [], "attachments": []}
+
+        documents = []
+        all_attachments = []
+        meta_list = normalize_metadata(meta, sources_count=len(sources))
+
+        for source, metadata in zip(sources, meta_list):
+            try:
+                bytestream = get_bytestream_from_source(source)
+            except Exception as e:
+                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
+                continue
+            try:
+                text, attachments = self._convert(io.BytesIO(bytestream.data))
+            except Exception as e:
+                logger.warning(
+                    "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
+                )
+                continue
+
+            merged_metadata = {**bytestream.meta, **metadata}
+
+            if not self.store_full_path and "file_path" in bytestream.meta:
+                merged_metadata["file_path"] = os.path.basename(bytestream.meta["file_path"])
+
+            documents.append(Document(content=text, meta=merged_metadata))
+            for attachment in attachments:
+                attachment_meta = {**merged_metadata, "file_path": attachment.meta["file_path"]}
+                # A ByteStream source may have no `file_path`; in that case there is no parent path to record.
+                if "file_path" in merged_metadata:
+                    attachment_meta["parent_file_path"] = merged_metadata["file_path"]
+                all_attachments.append(
+                    ByteStream(data=attachment.data, meta=attachment_meta, mime_type=attachment.mime_type)
+                )
+
+        return {"documents": documents, "attachments": all_attachments}
diff --git a/pyproject.toml b/pyproject.toml
@@ -113,6 +113,7 @@ extra-dependencies = [
   "jq",                               # JSONConverter
   "openpyxl",                         # XLSXToDocument
   "tabulate",                         # XLSXToDocument
+  "python-oxmsg",                     # MSGToDocument
 
   "nltk>=3.9.1", # NLTKDocumentSplitter
 
diff --git a/releasenotes/notes/add-msg-to-document-converter-79338eef22a3fd82.yaml b/releasenotes/notes/add-msg-to-document-converter-79338eef22a3fd82.yaml
@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    Add a new MSGToDocument component to convert .msg files into Haystack Document objects.
+    - Extracts email metadata (e.g. sender, recipients, CC, BCC, subject) and body content into a Document.
+    - Converts attachments into ByteStream objects, which can be passed on to a FileTypeRouter and the relevant converters.
diff --git a/test/components/converters/test_msg_to_document.py b/test/components/converters/test_msg_to_document.py
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from haystack.components.converters.msg import MSGToDocument
+
+
+class TestMSGToDocument:
+    """Tests for the MSGToDocument converter."""
+
+    def test_run(self, test_files_path):
+        """A valid .msg file yields one Document and its PDF attachment, both with the expected metadata."""
+        msg_path = test_files_path / "msg" / "sample.msg"
+        added_meta = {"date_added": "2021-09-01T00:00:00"}
+
+        output = MSGToDocument(store_full_path=True).run(sources=[msg_path], meta=added_meta)
+        documents, attachments = output["documents"], output["attachments"]
+
+        assert len(documents) == 1
+        document = documents[0]
+        assert document.content.startswith('From: "Sebastian Lee"')
+        assert document.meta == {"date_added": "2021-09-01T00:00:00", "file_path": str(msg_path)}
+
+        assert len(attachments) == 1
+        attachment = attachments[0]
+        assert attachment.mime_type == "application/pdf"
+        assert attachment.meta == {
+            "date_added": "2021-09-01T00:00:00",
+            "parent_file_path": str(msg_path),
+            "file_path": "sample_pdf_1.pdf",
+        }
+
+    def test_run_wrong_file_type(self, test_files_path, caplog):
+        """A file that is not a .msg file produces no Document, and the conversion error shows up in the log."""
+        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"
+        converter = MSGToDocument(store_full_path=False)
+
+        output = converter.run(sources=[pdf_path], meta={"date_added": "2021-09-01T00:00:00"})
+
+        assert len(output["documents"]) == 0
+        assert "msg_file is not an Outlook MSG file" in caplog.text
+
+    def test_run_empty_sources(self, test_files_path):
+        """An empty list of sources produces neither Documents nor attachments."""
+        output = MSGToDocument(store_full_path=False).run(sources=[])
+
+        assert len(output["documents"]) == 0
+        assert len(output["attachments"]) == 0
diff --git a/test/test_files/msg/sample.msg b/test/test_files/msg/sample.msg