|
1 | | -# pyright: reportPrivateUsage=false |
2 | | - |
3 | 1 | from __future__ import annotations |
4 | 2 |
|
5 | | -from unstructured.partition.new_msg import ( |
6 | | - MsgPartitionerOptions, |
7 | | - _AttachmentPartitioner, |
8 | | - _MsgPartitioner, |
9 | | - partition_msg, |
| 3 | +import copy |
| 4 | +import os |
| 5 | +import tempfile |
| 6 | +from typing import IO, Any, Iterator, Optional |
| 7 | + |
| 8 | +from oxmsg import Message |
| 9 | +from oxmsg.attachment import Attachment |
| 10 | + |
| 11 | +from unstructured.chunking import add_chunking_strategy |
| 12 | +from unstructured.documents.elements import Element, ElementMetadata, process_metadata |
| 13 | +from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype |
| 14 | +from unstructured.logger import logger |
| 15 | +from unstructured.partition.common import ( |
| 16 | + get_last_modified_date, |
| 17 | + get_last_modified_date_from_file, |
10 | 18 | ) |
| 19 | +from unstructured.partition.html import partition_html |
| 20 | +from unstructured.partition.lang import apply_lang_metadata |
| 21 | +from unstructured.partition.text import partition_text |
| 22 | +from unstructured.utils import is_temp_file_path, lazyproperty |
| 23 | + |
| 24 | + |
@process_metadata()
@add_metadata_with_filetype(FileType.MSG)
@add_chunking_strategy
def partition_msg(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    date_from_file_object: bool = False,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    process_attachments: bool = False,
    **kwargs: Any,
) -> list[Element]:
    """Partition a Microsoft Outlook .msg file into a list of elements.

    Parameters
    ----------
    filename
        Path to the .msg file to partition.
    file
        A binary file-like object containing the message, e.g. `open(filename, "rb")`.
    date_from_file_object
        Only relevant when the message is supplied via `file`. When True and no last-modified
        value is available from the caller or the message itself, the last-modified date is
        inferred from the file object; when False it is left as None in that case.
    metadata_filename
        File name to record in element metadata in place of `filename`.
    metadata_last_modified
        Last-modified value to record in element metadata, overriding any computed value.
    process_attachments
        When True, the attachments of the message are partitioned in addition to the message
        body.
    **kwargs
        `languages` (default `["auto"]`) and `detect_language_per_element` (default False) are
        read here and forwarded to `apply_lang_metadata()`. No other keyword is read by this
        function body.

    Returns
    -------
    list[Element]
        The elements of the message, with language metadata applied.
    """
    # -- language-related settings arrive via `kwargs`, not as named parameters --
    languages = kwargs.get("languages", ["auto"])
    detect_language_per_element = kwargs.get("detect_language_per_element", False)

    options = MsgPartitionerOptions(
        date_from_file_object=date_from_file_object,
        file=file,
        file_path=filename,
        metadata_file_path=metadata_filename,
        metadata_last_modified=metadata_last_modified,
        partition_attachments=process_attachments,
    )
    message_elements = _MsgPartitioner.iter_message_elements(options)

    return list(
        apply_lang_metadata(
            elements=message_elements,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
        )
    )
| 74 | + |
| 75 | + |
class MsgPartitionerOptions:
    """Encapsulates partitioning option validation, computation, and application of defaults."""

    def __init__(
        self,
        *,
        date_from_file_object: bool,
        file: IO[bytes] | None,
        file_path: str | None,
        metadata_file_path: str | None,
        metadata_last_modified: str | None,
        partition_attachments: bool,
    ):
        """Store the caller-supplied options; nothing is loaded or computed until first use.

        Parameters
        ----------
        date_from_file_object
            When True and the message is supplied as a file-like object, the file object may be
            consulted for a last-modified date.
        file
            Binary file-like object containing the MSG bytes, or None.
        file_path
            Path to the MSG file, or None. Takes precedence over `file` when both are given.
        metadata_file_path
            Caller override for the file name recorded in element metadata, or None.
        metadata_last_modified
            Caller override for the last-modified value recorded in element metadata, or None.
        partition_attachments
            Whether attachments should be partitioned in addition to the message body.
        """
        self._date_from_file_object = date_from_file_object
        self._file = file
        self._file_path = file_path
        self._metadata_file_path = metadata_file_path
        self._metadata_last_modified = metadata_last_modified
        self._partition_attachments = partition_attachments

    @lazyproperty
    def is_encrypted(self) -> bool:
        """True when message is encrypted."""
        # NOTE(robinson) - Per RFC 2015, the content type for emails with PGP encrypted content
        # is multipart/encrypted (ref: https://www.ietf.org/rfc/rfc2015.txt)
        if "encrypted" in self.msg.message_headers.get("Content-Type", ""):
            return True
        # -- pretty sure we're going to want to dig deeper to discover messages that are encrypted
        # -- with something other than PGP.
        #    - might be able to distinguish based on PID_MESSAGE_CLASS = 'IPM.Note.Signed'
        #    - Content-Type header might include "application/pkcs7-mime" for Microsoft S/MIME
        #      encryption.
        return False

    @lazyproperty
    def metadata_file_path(self) -> str | None:
        """Best available path for MSG file.

        The value is the caller supplied `metadata_filename` if present, falling back to the
        source file-path if that was provided, otherwise `None`.
        """
        return self._metadata_file_path or self._file_path

    @lazyproperty
    def metadata_last_modified(self) -> str | None:
        """Caller override for `.metadata.last_modified` to be applied to all elements."""
        return self._metadata_last_modified

    @lazyproperty
    def msg(self) -> Message:
        """The `oxmsg.Message` object loaded from file or filename."""
        return Message.load(self._msg_file)

    @property
    def msg_metadata(self) -> ElementMetadata:
        """ElementMetadata suitable for use on an element formed from message content.

        A distinct instance is returned on each reference such that downstream changes to the
        metadata of one element is not also reflected in another element.
        """
        # -- a deep copy is required because the template is computed only once and carries
        # -- list-valued fields (`sent_from`, `sent_to`); a shallow copy would share those lists
        # -- between every element of the message.
        return copy.deepcopy(self._msg_metadata)

    @lazyproperty
    def partition_attachments(self) -> bool:
        """True when message attachments should also be partitioned."""
        return self._partition_attachments

    @lazyproperty
    def partitioning_kwargs(self) -> dict[str, Any]:
        """Partitioning keyword-arguments to be passed along to attachment partitioner."""
        # TODO: no good reason we can't accept and pass along any file-type specific kwargs
        # the caller might want to send along.
        return {}

    @lazyproperty
    def _last_modified(self) -> str | None:
        """The best last-modified date available from source-file, None if not available."""
        if self._file_path:
            # -- a temp-file path yields no last-modified value --
            return (
                None
                if is_temp_file_path(self._file_path)
                else get_last_modified_date(self._file_path)
            )

        # -- compare against None rather than relying on truthiness so a valid file-like object
        # -- that happens to evaluate falsy is still used.
        if self._file is not None:
            return (
                get_last_modified_date_from_file(self._file)
                if self._date_from_file_object
                else None
            )

        return None

    @lazyproperty
    def _msg_file(self) -> str | IO[bytes]:
        """The source for the bytes of the message, either a file-path or a file-like object.

        Raises `ValueError` when neither a file-path nor a file-like object was provided.
        """
        if file_path := self._file_path:
            return file_path

        # -- identity check, not truthiness; see `_last_modified` --
        if self._file is not None:
            return self._file

        raise ValueError("one of `file` or `filename` arguments must be provided")

    @lazyproperty
    def _msg_metadata(self) -> ElementMetadata:
        """ElementMetadata "template" for elements of this message.

        None of these metadata fields change based on the element, so compute it once here and then
        just make a separate copy for each element.
        """
        msg = self.msg

        email_date = sent_date.isoformat() if (sent_date := msg.sent_date) else None
        sent_from = [s.strip() for s in sender.split(",")] if (sender := msg.sender) else None
        sent_to = [r.email_address for r in msg.recipients] or None

        element_metadata = ElementMetadata(
            filename=self.metadata_file_path,
            # -- precedence: caller override, then message sent-date, then source-file date --
            last_modified=self._metadata_last_modified or email_date or self._last_modified,
            sent_from=sent_from,
            sent_to=sent_to,
            subject=msg.subject or None,
        )
        element_metadata.detection_origin = "msg"

        return element_metadata
| 203 | + |
| 204 | + |
class _MsgPartitioner:
    """Turns an Outlook email message (MSG) file into a stream of elements."""

    def __init__(self, opts: MsgPartitionerOptions):
        self._opts = opts

    @classmethod
    def iter_message_elements(cls, opts: MsgPartitionerOptions) -> Iterator[Element]:
        """Generate the elements of the message described by `opts`.

        An encrypted message produces no elements; a warning is logged instead.
        """
        if not opts.is_encrypted:
            yield from cls(opts)._iter_message_elements()
            return

        logger.warning("Encrypted email detected. Partitioner will return an empty list.")

    def _iter_message_elements(self) -> Iterator[Element]:
        """Generate message-body elements, followed by attachment elements when enabled."""
        yield from self._iter_message_body_elements()

        if self._opts.partition_attachments:
            for attached in self._attachments:
                yield from _AttachmentPartitioner.iter_elements(attached, self._opts)

    @lazyproperty
    def _attachments(self) -> tuple[Attachment, ...]:
        """All `oxmsg.attachment.Attachment` objects of this message, as a tuple."""
        return tuple(self._opts.msg.attachments)

    def _iter_message_body_elements(self) -> Iterator[Element]:
        """Generate elements for the message body only; attachments are not touched here.

        The HTML body is used when the message has one, otherwise the plain-text body. A message
        with neither produces no elements.
        """
        message = self._opts.msg
        body_elements: list[Element]

        # NOTE(review): `languages=[""]` presumably defers language detection to the caller,
        # which applies language metadata to the full element stream -- confirm.
        body_html = message.html_body
        if body_html:
            body_elements = partition_html(text=body_html, languages=[""])
        else:
            body_text = message.body
            body_elements = partition_text(text=body_text, languages=[""]) if body_text else []

        # -- each element gets its own copy of the email-specific metadata, replacing whatever
        # -- metadata the HTML/text partitioner assigned.
        for element in body_elements:
            element.metadata = self._opts.msg_metadata
            yield element
| 250 | + |
| 251 | + |
class _AttachmentPartitioner:
    """Partitions an attachment to a MSG file."""

    def __init__(self, attachment: Attachment, opts: MsgPartitionerOptions):
        self._attachment = attachment
        self._opts = opts

    @classmethod
    def iter_elements(
        cls, attachment: Attachment, opts: MsgPartitionerOptions
    ) -> Iterator[Element]:
        """Partition an `oxmsg.attachment.Attachment` from an Outlook email message (.msg file)."""
        return cls(attachment, opts)._iter_elements()

    def _iter_elements(self) -> Iterator[Element]:
        """Partition the file in an `oxmsg.attachment.Attachment` into elements.

        The attachment bytes are written to a file inside a temporary directory, that file is
        partitioned, and each resulting element is tagged with the name of the message file it
        was attached to. The temporary directory is removed when the generator finishes.
        """
        # NOTE(review): imported locally, presumably to avoid a circular import with the
        # auto-partitioner -- confirm.
        from unstructured.partition.auto import partition

        with tempfile.TemporaryDirectory() as tmp_dir_path:
            # -- save attachment as file in this temporary directory. The sanitized name is used
            # -- for the on-disk path because the name stored in the message is untrusted; an
            # -- absolute or `../`-laden name would otherwise escape `tmp_dir_path`.
            detached_file_path = os.path.join(tmp_dir_path, self._detached_file_name)
            with open(detached_file_path, "wb") as f:
                f.write(self._file_bytes)

            # -- partition the attachment; the original attachment name is what gets reported
            # -- in metadata --
            for element in partition(
                detached_file_path,
                metadata_filename=self._attachment_file_name,
                metadata_last_modified=self._attachment_last_modified,
                **self._opts.partitioning_kwargs,
            ):
                element.metadata.attached_to_filename = self._opts.metadata_file_path
                yield element

    @lazyproperty
    def _attachment_file_name(self) -> str:
        """The original name of the attached file, no path.

        This value is 'unknown' if it is not present in the MSG file (not expected).
        """
        return self._attachment.file_name or "unknown"

    @lazyproperty
    def _attachment_last_modified(self) -> str | None:
        """ISO8601 string timestamp of attachment last-modified date.

        This value is generally available on the attachment and will be the most reliable
        last-modified time. When it is not present, the caller-supplied last-modified override
        for the message is used, which may itself be `None`.
        """
        if last_modified := self._attachment.last_modified:
            return last_modified.isoformat()
        return self._opts.metadata_last_modified

    @lazyproperty
    def _detached_file_name(self) -> str:
        """File name that is safe to join onto the temporary directory path.

        Only the final path component of the attachment name is kept, treating both `/` and `\\`
        as separators, and NUL characters are dropped. The extension is preserved. The value is
        'unknown' when nothing usable remains (empty, '.' or '..').
        """
        normalized = self._attachment_file_name.replace("\\", "/")
        base_name = os.path.basename(normalized).replace("\x00", "")
        return "unknown" if base_name in ("", ".", "..") else base_name

    @lazyproperty
    def _file_bytes(self) -> bytes:
        """The bytes of the attached file, empty bytes when the attachment has none."""
        return self._attachment.file_bytes or b""
0 commit comments