Skip to content

Commit 5ae8950

Browse files
authored
feat: add sender and receive info to element metadata for emails (#439)
* add header metadata for .eml messages * sent to and from are lists * add metadata for outlook emails * version and changelog
1 parent 4211dda commit 5ae8950

File tree

7 files changed

+120
-5
lines changed

7 files changed

+120
-5
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## 0.5.10-dev0
2+
3+
### Enhancements
4+
5+
* Add sender, recipient, date, and subject to element metadata for emails
6+
7+
### Features
8+
9+
### Fixes
10+
111
## 0.5.9
212

313
### Enhancements

test_unstructured/partition/test_email.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,13 @@
55

66
import pytest
77

8-
from unstructured.documents.elements import Image, ListItem, NarrativeText, Title
8+
from unstructured.documents.elements import (
9+
ElementMetadata,
10+
Image,
11+
ListItem,
12+
NarrativeText,
13+
Title,
14+
)
915
from unstructured.documents.email_elements import (
1016
MetaData,
1117
ReceivedInfo,
@@ -160,6 +166,21 @@ def test_partition_email_header():
160166
assert elements == RECEIVED_HEADER_OUTPUT
161167

162168

169+
def test_partition_email_has_metadata():
    """Checks that elements partitioned from an .eml file carry the email header metadata."""
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
    expected_metadata = ElementMetadata(
        filename=filename,
        date="2022-12-16T17:04:16-05:00",
        page_number=None,
        url=None,
        sent_from=["Matthew Robinson <[email protected]>"],
        sent_to=["Matthew Robinson <[email protected]>"],
        subject="Test Email",
    )

    elements = partition_email(filename=filename)

    # Guard against an empty result before indexing into it.
    assert len(elements) > 0
    first_element = elements[0]
    assert first_element.metadata == expected_metadata
182+
183+
163184
def test_extract_email_text_matches_html():
164185
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
165186
elements_from_text = partition_email(filename=filename, content_source="text/plain")

test_unstructured/partition/test_msg.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@
44
import msg_parser
55
import pytest
66

7-
from unstructured.documents.elements import ListItem, NarrativeText, Title
7+
from unstructured.documents.elements import (
8+
ElementMetadata,
9+
ListItem,
10+
NarrativeText,
11+
Title,
12+
)
813
from unstructured.partition.msg import partition_msg
914

1015
DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -22,6 +27,15 @@ def test_partition_msg_from_filename():
2227
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
2328
elements = partition_msg(filename=filename)
2429
assert elements == EXPECTED_MSG_OUTPUT
30+
assert elements[0].metadata == ElementMetadata(
31+
filename=filename,
32+
date="2022-12-16T17:04:16-05:00",
33+
page_number=None,
34+
url=None,
35+
sent_from=["Matthew Robinson <[email protected]>"],
36+
sent_to=["Matthew Robinson (None)"],
37+
subject="Test Email",
38+
)
2539

2640

2741
class MockMsOxMessage:

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.9" # pragma: no cover
1+
__version__ = "0.5.10-dev0" # pragma: no cover

unstructured/documents/elements.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,19 @@ class NoID(ABC):
1414
@dataclass
1515
class ElementMetadata:
1616
filename: Optional[str] = None
17+
date: Optional[str] = None
18+
19+
# Page numbers currently supported for PDF, HTML and PPT documents
1720
page_number: Optional[int] = None
21+
22+
# Webpage specific metadata fields
1823
url: Optional[str] = None
1924

25+
# E-mail specific metadata fields
26+
sent_from: Optional[List[str]] = None
27+
sent_to: Optional[List[str]] = None
28+
subject: Optional[str] = None
29+
2030
def __post_init__(self):
2131
if isinstance(self.filename, pathlib.Path):
2232
self.filename = str(self.filename)

unstructured/partition/email.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import datetime
12
import email
23
import re
34
import sys
@@ -87,6 +88,35 @@ def partition_email_header(msg: Message) -> List[Element]:
8788
return elements
8889

8990

91+
def build_email_metadata(msg: Message) -> ElementMetadata:
    """Creates an ElementMetadata object from the header information in the email.

    Reads the "Date", "From", "To" and "Subject" headers of ``msg``. Headers that are
    absent leave the corresponding metadata field as ``None``. The "From" and "To"
    values are split on commas into lists of whitespace-stripped address strings, and
    the "Date" value is converted to ISO-8601 with ``convert_to_iso_8601``.
    """
    # Header names are matched exactly as they are spelled here; if a header appears
    # more than once, building a dict keeps only the last occurrence.
    header_dict = dict(msg.raw_items())

    email_date = header_dict.get("Date")
    if email_date is not None:
        email_date = convert_to_iso_8601(email_date)

    # The sender lives in the "From" header. Reading "To" here would make sent_from
    # silently mirror the recipient list.
    sent_from = header_dict.get("From")
    if sent_from is not None:
        sent_from = [sender.strip() for sender in sent_from.split(",")]

    sent_to = header_dict.get("To")
    if sent_to is not None:
        sent_to = [recipient.strip() for recipient in sent_to.split(",")]

    return ElementMetadata(
        sent_to=sent_to,
        sent_from=sent_from,
        subject=header_dict.get("Subject"),
        date=email_date,
    )
112+
113+
114+
def convert_to_iso_8601(time: str) -> str:
    """Converts the datetime from the email output to ISO-8601 format.

    ``time`` is the value of an email date header, for example
    "Fri, 16 Dec 2022 17:04:16 -0500", which becomes "2022-12-16T17:04:16-05:00".
    Valid variants that the strict format below rejects, such as a trailing
    timezone comment ("... -0500 (EST)") or a missing day-of-week, are handled by
    falling back to the standard library's email date parser.

    Raises ValueError if the value cannot be parsed as an email date.
    """
    # Imported locally so this function does not depend on a module-level import.
    from email.utils import parsedate_to_datetime

    try:
        # Strict format first, so values that already parsed keep the exact same output.
        datetime_object = datetime.datetime.strptime(time, "%a, %d %b %Y %H:%M:%S %z")
    except ValueError:
        try:
            datetime_object = parsedate_to_datetime(time)
        except (TypeError, ValueError) as error:
            # Older Python versions signal an unparsable date with TypeError;
            # normalise to ValueError so callers see a single exception type.
            raise ValueError(f"Unable to parse email date: {time!r}") from error
    return datetime_object.isoformat()
118+
119+
90120
def extract_attachment_info(
91121
message: Message,
92122
output_dir: Optional[str] = None,
@@ -234,6 +264,8 @@ def partition_email(
234264
header = partition_email_header(msg)
235265
all_elements = header + elements
236266

267+
metadata = build_email_metadata(msg)
268+
metadata.filename = filename
237269
for element in all_elements:
238-
element.metadata = ElementMetadata(filename=filename)
270+
element.metadata = metadata
239271
return all_elements

unstructured/partition/msg.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33

44
import msg_parser
55

6-
from unstructured.documents.elements import Element
6+
from unstructured.documents.elements import Element, ElementMetadata
77
from unstructured.partition.common import exactly_one
8+
from unstructured.partition.email import convert_to_iso_8601
89
from unstructured.partition.html import partition_html
910
from unstructured.partition.text import partition_text
1011

@@ -38,4 +39,31 @@ def partition_msg(
3839
else:
3940
elements = partition_text(text=text)
4041

42+
metadata = build_msg_metadata(msg_obj)
43+
metadata.filename = filename
44+
for element in elements:
45+
element.metadata = metadata
46+
4147
return elements
48+
49+
50+
def build_msg_metadata(msg_obj: msg_parser.MsOxMessage) -> ElementMetadata:
    """Creates an ElementMetadata object from the header information in the email.

    Reads the ``sent_date``, ``sender``, ``recipients`` and ``subject`` attributes of
    ``msg_obj``. Any attribute that is missing or ``None`` leaves the matching
    metadata field as ``None``.
    """
    raw_date = getattr(msg_obj, "sent_date", None)
    email_date = None if raw_date is None else convert_to_iso_8601(raw_date)

    # Senders and recipients are stringified item by item.
    senders = getattr(msg_obj, "sender", None)
    sent_from = None if senders is None else list(map(str, senders))

    recipients = getattr(msg_obj, "recipients", None)
    sent_to = None if recipients is None else list(map(str, recipients))

    subject = getattr(msg_obj, "subject", None)

    return ElementMetadata(
        sent_to=sent_to,
        sent_from=sent_from,
        subject=subject,
        date=email_date,
    )

0 commit comments

Comments
 (0)