Skip to content

Commit f1cab24

Browse files
authored
rfctr(msg): remove temporary new_msg.py (#3157)
**Summary** Remove temporary `new_msg.py` module. **Additional Context** The rewrite of `partition_msg()` was placed in a separate file `new_msg.py` to avoid a messy diff for code-review. This PR makes that `new_msg.py` the new `msg.py`. No code changes were made in the process.
1 parent ddbe90f commit f1cab24

File tree

6 files changed

+308
-342
lines changed

6 files changed

+308
-342
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.14.5-dev3
1+
## 0.14.5-dev4
22

33
### Enhancements
44

test_unstructured/partition/test_msg.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,7 @@
2323
NarrativeText,
2424
Title,
2525
)
26-
from unstructured.partition.msg import partition_msg
27-
from unstructured.partition.new_msg import MsgPartitionerOptions
26+
from unstructured.partition.msg import MsgPartitionerOptions, partition_msg
2827

2928
EXPECTED_MSG_OUTPUT = [
3029
NarrativeText(text="This is a test email to use for unit tests."),

typings/msg_parser/__init__.pyi

Lines changed: 0 additions & 16 deletions
This file was deleted.

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.14.5-dev3" # pragma: no cover
1+
__version__ = "0.14.5-dev4" # pragma: no cover

unstructured/partition/msg.py

Lines changed: 305 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,309 @@
1-
# pyright: reportPrivateUsage=false
2-
31
from __future__ import annotations
42

5-
from unstructured.partition.new_msg import (
6-
MsgPartitionerOptions,
7-
_AttachmentPartitioner,
8-
_MsgPartitioner,
9-
partition_msg,
3+
import copy
4+
import os
5+
import tempfile
6+
from typing import IO, Any, Iterator, Optional
7+
8+
from oxmsg import Message
9+
from oxmsg.attachment import Attachment
10+
11+
from unstructured.chunking import add_chunking_strategy
12+
from unstructured.documents.elements import Element, ElementMetadata, process_metadata
13+
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
14+
from unstructured.logger import logger
15+
from unstructured.partition.common import (
16+
get_last_modified_date,
17+
get_last_modified_date_from_file,
1018
)
19+
from unstructured.partition.html import partition_html
20+
from unstructured.partition.lang import apply_lang_metadata
21+
from unstructured.partition.text import partition_text
22+
from unstructured.utils import is_temp_file_path, lazyproperty
23+
24+
25+
@process_metadata()
@add_metadata_with_filetype(FileType.MSG)
@add_chunking_strategy
def partition_msg(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    date_from_file_object: bool = False,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    process_attachments: bool = False,
    **kwargs: Any,
) -> list[Element]:
    """Partition a Microsoft Outlook message (.msg) file into document elements.

    Exactly one of `filename` or `file` is expected; a `ValueError` is raised when the message
    is loaded if neither was provided.

    Parameters
    ----------
    filename
        Path to the .msg file to partition.
    file
        A file-like object opened in binary ("rb") mode --> open(filename, "rb").
    date_from_file_object
        Only relevant when the message is provided via `file`. When True and no date could be
        taken from the message itself, the last-modified date is inferred from the file object;
        otherwise it is left as None.
    metadata_filename
        The filename to record in element metadata.
    metadata_last_modified
        The last-modified date to record in element metadata.
    process_attachments
        When True, partition_msg also partitions the attachments of the message, in addition
        to the content of the message itself.
    **kwargs
        `languages` (default `["auto"]`) and `detect_language_per_element` (default False) are
        read from here and forwarded to `apply_lang_metadata()`.
    """
    partitioner_options = MsgPartitionerOptions(
        date_from_file_object=date_from_file_object,
        file=file,
        file_path=filename,
        metadata_file_path=metadata_filename,
        metadata_last_modified=metadata_last_modified,
        partition_attachments=process_attachments,
    )

    # -- lazily generated; nothing is read from the message until the elements are consumed --
    message_elements = _MsgPartitioner.iter_message_elements(partitioner_options)

    elements_with_languages = apply_lang_metadata(
        elements=message_elements,
        languages=kwargs.get("languages", ["auto"]),
        detect_language_per_element=kwargs.get("detect_language_per_element", False),
    )

    return list(elements_with_languages)
74+
75+
76+
class MsgPartitionerOptions:
    """Encapsulates partitioning option validation, computation, and application of defaults.

    All arguments are keyword-only and are stored as-is; derived values are computed on first
    access by the properties below.
    """

    def __init__(
        self,
        *,
        date_from_file_object: bool,
        file: IO[bytes] | None,
        file_path: str | None,
        metadata_file_path: str | None,
        metadata_last_modified: str | None,
        partition_attachments: bool,
    ):
        self._date_from_file_object = date_from_file_object
        self._file = file
        self._file_path = file_path
        self._metadata_file_path = metadata_file_path
        self._metadata_last_modified = metadata_last_modified
        self._partition_attachments = partition_attachments

    @lazyproperty
    def is_encrypted(self) -> bool:
        """True when message is encrypted."""
        # NOTE(robinson) - Per RFC 2015, the content type for emails with PGP encrypted content
        # is multipart/encrypted (ref: https://www.ietf.org/rfc/rfc2015.txt)
        if "encrypted" in self.msg.message_headers.get("Content-Type", ""):
            return True
        # -- pretty sure we're going to want to dig deeper to discover messages that are encrypted
        # -- with something other than PGP.
        #    - might be able to distinguish based on PID_MESSAGE_CLASS = 'IPM.Note.Signed'
        #    - Content-Type header might include "application/pkcs7-mime" for Microsoft S/MIME
        #      encryption.
        return False

    @lazyproperty
    def metadata_file_path(self) -> str | None:
        """Best available path for MSG file.

        The value is the caller supplied `metadata_filename` if present, falling back to the
        source file-path if that was provided, otherwise `None`.
        """
        return self._metadata_file_path or self._file_path

    @lazyproperty
    def metadata_last_modified(self) -> str | None:
        """Caller override for `.metadata.last_modified` to be applied to all elements."""
        return self._metadata_last_modified

    @lazyproperty
    def msg(self) -> Message:
        """The `oxmsg.Message` object loaded from file or filename.

        Raises `ValueError` when neither a file-path nor a file-like object was provided.
        """
        return Message.load(self._msg_file)

    @property
    def msg_metadata(self) -> ElementMetadata:
        """ElementMetadata suitable for use on an element formed from message content.

        A distinct instance is returned on each reference such that downstream changes to the
        metadata of one element is not also reflected in another element.
        """
        # -- The template is computed once and cached, so a deep copy is required: a shallow
        # -- copy would share the template's list-valued fields (`sent_from`, `sent_to`) between
        # -- every element of the message.
        return copy.deepcopy(self._msg_metadata)

    @lazyproperty
    def partition_attachments(self) -> bool:
        """True when message attachments should also be partitioned."""
        return self._partition_attachments

    @lazyproperty
    def partitioning_kwargs(self) -> dict[str, Any]:
        """Partitioning keyword-arguments to be passed along to attachment partitioner."""
        # TODO: no good reason we can't accept and pass along any file-type specific kwargs
        # the caller might want to send along.
        return {}

    @lazyproperty
    def _last_modified(self) -> str | None:
        """The best last-modified date available from source-file, None if not available."""
        if self._file_path:
            # -- a temp-file path yields no last-modified date --
            return (
                None
                if is_temp_file_path(self._file_path)
                else get_last_modified_date(self._file_path)
            )

        if self._file:
            # -- the date is only read from a file-like object when the caller opted in --
            return (
                get_last_modified_date_from_file(self._file)
                if self._date_from_file_object
                else None
            )

        return None

    @lazyproperty
    def _msg_file(self) -> str | IO[bytes]:
        """The source for the bytes of the message, either a file-path or a file-like object.

        The file-path takes precedence when both were provided. Raises `ValueError` when neither
        was provided.
        """
        if file_path := self._file_path:
            return file_path

        if file := self._file:
            return file

        raise ValueError("one of `file` or `filename` arguments must be provided")

    @lazyproperty
    def _msg_metadata(self) -> ElementMetadata:
        """ElementMetadata "template" for elements of this message.

        None of these metadata fields change based on the element, so compute it once here and then
        just make a separate copy for each element.
        """
        msg = self.msg

        email_date = sent_date.isoformat() if (sent_date := msg.sent_date) else None
        sent_from = [s.strip() for s in sender.split(",")] if (sender := msg.sender) else None
        sent_to = [r.email_address for r in msg.recipients] or None

        element_metadata = ElementMetadata(
            filename=self.metadata_file_path,
            # -- precedence: caller override, then message sent-date, then source-file date --
            last_modified=self._metadata_last_modified or email_date or self._last_modified,
            sent_from=sent_from,
            sent_to=sent_to,
            subject=msg.subject or None,
        )
        element_metadata.detection_origin = "msg"

        return element_metadata
203+
204+
205+
class _MsgPartitioner:
    """Turns the content of an Outlook email message (MSG) file into elements."""

    def __init__(self, opts: MsgPartitionerOptions):
        self._opts = opts

    @classmethod
    def iter_message_elements(cls, opts: MsgPartitionerOptions) -> Iterator[Element]:
        """Generate the elements of the MS Outlook message (.msg file) described by `opts`.

        An encrypted message generates no elements; a warning is logged instead.
        """
        if not opts.is_encrypted:
            yield from cls(opts)._iter_message_elements()
            return

        logger.warning("Encrypted email detected. Partitioner will return an empty list.")

    def _iter_message_elements(self) -> Iterator[Element]:
        """Generate message-body elements, then attachment elements when those are requested."""
        yield from self._iter_message_body_elements()

        if self._opts.partition_attachments:
            for attached_file in self._attachments:
                yield from _AttachmentPartitioner.iter_elements(attached_file, self._opts)

    @lazyproperty
    def _attachments(self) -> tuple[Attachment, ...]:
        """The `oxmsg.attachment.Attachment` objects of this message, as a tuple."""
        message = self._opts.msg
        return tuple(message.attachments)

    def _iter_message_body_elements(self) -> Iterator[Element]:
        """Generate elements for the body of the message only, attachments excluded.

        The HTML body is used when the message has one, the plain-text body otherwise. A message
        with neither generates no elements.
        """
        message = self._opts.msg
        html_body = message.html_body

        body_elements: list[Element]
        if html_body:
            body_elements = partition_html(text=html_body, languages=[""])
        elif message.body:
            body_elements = partition_text(text=message.body, languages=[""])
        else:
            body_elements = []

        # -- each element gets its own instance of the email-specific metadata --
        for element in body_elements:
            element.metadata = self._opts.msg_metadata
            yield element
250+
251+
252+
class _AttachmentPartitioner:
    """Partitions an attachment to a MSG file."""

    def __init__(self, attachment: Attachment, opts: MsgPartitionerOptions):
        self._attachment = attachment
        self._opts = opts

    @classmethod
    def iter_elements(
        cls, attachment: Attachment, opts: MsgPartitionerOptions
    ) -> Iterator[Element]:
        """Partition an `oxmsg.attachment.Attachment` from an Outlook email message (.msg file)."""
        return cls(attachment, opts)._iter_elements()

    def _iter_elements(self) -> Iterator[Element]:
        """Partition the file in an `oxmsg.attachment.Attachment` into elements.

        The attachment bytes are written to a file in a temporary directory, that file is
        partitioned, and each resulting element is tagged with the path of the message it was
        attached to. The temporary directory is removed when the generator finishes.
        """
        # -- imported locally; presumably to avoid a circular import -- TODO confirm --
        from unstructured.partition.auto import partition

        with tempfile.TemporaryDirectory() as tmp_dir_path:
            # -- save attachment as file in this temporary directory --
            detached_file_path = os.path.join(tmp_dir_path, self._detached_file_name)
            with open(detached_file_path, "wb") as f:
                f.write(self._file_bytes)

            # -- partition the attachment --
            for element in partition(
                detached_file_path,
                metadata_filename=self._attachment_file_name,
                metadata_last_modified=self._attachment_last_modified,
                **self._opts.partitioning_kwargs,
            ):
                element.metadata.attached_to_filename = self._opts.metadata_file_path
                yield element

    @lazyproperty
    def _attachment_file_name(self) -> str:
        """The original name of the attached file, no path.

        This value is 'unknown' if it is not present in the MSG file (not expected).
        """
        return self._attachment.file_name or "unknown"

    @lazyproperty
    def _attachment_last_modified(self) -> str | None:
        """ISO8601 string timestamp of attachment last-modified date.

        This value is generally available on the attachment and will be the most reliable
        last-modified time. When it is not present, the caller-supplied last-modified override
        is used, which may itself be `None`.
        """
        if last_modified := self._attachment.last_modified:
            return last_modified.isoformat()
        return self._opts.metadata_last_modified

    @lazyproperty
    def _detached_file_name(self) -> str:
        """File name used when saving the attachment into the temporary directory.

        The attachment name is read from the message and so is untrusted. Only its final path
        component is used, so an absolute path or `..` segments cannot direct the write outside
        the temporary directory. A name that reduces to nothing usable becomes 'unknown'.
        """
        base_name = os.path.basename(self._attachment_file_name)
        return "unknown" if base_name in ("", ".", "..") else base_name

    @lazyproperty
    def _file_bytes(self) -> bytes:
        """The bytes of the attached file, empty bytes when the attachment has none."""
        return self._attachment.file_bytes or b""

0 commit comments

Comments
 (0)