Skip to content

Commit 08ccee0

Browse files
authored
chore: Fix parse received data (#143)
* fix parse_received data
1 parent 749f9c6 commit 08ccee0

File tree

6 files changed

+75
-16
lines changed

6 files changed

+75
-16
lines changed

CHANGELOG.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
1-
## 0.4.3-dev0
1+
## 0.4.3-dev1
22

33
* Fix in `exceeds_cap_ratio` so the function doesn't break with empty text
4+
* Fix bug in `_parse_received_data`.
45

56
## 0.4.2
67

78
* Added `partition_image` to process documents in an image format.
89
* Fixed utf-8 encoding error in `partition_email` with attachments for `text/html`
910

10-
1111
## 0.4.1
1212

1313
* Added support for text files in the `partition` function
@@ -40,7 +40,7 @@
4040
elements
4141
* Add ability to extract document metadata from `.docx`, `.xlsx`, and `.jpg` files.
4242
* Helper functions for identifying and extracting phone numbers
43-
* Add new function `extract_attachment_info` that extracts and decode the attachment
43+
* Add new function `extract_attachment_info` that extracts and decodes the attachment
4444
of an email.
4545
* Staging brick to convert a list of `Element`s to a `pandas` dataframe.
4646
* Add plain text functionality to `partition_email`

example-docs/fake-email-header.eml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
Received: from ABCDEFG-000.ABC.guide (00.0.0.00) by ABCDEFG-000.ABC.guide
2+
([ba23::58b5:2236:45g2:88h2]) with Unstructured TTTT Server (version=ABC0_0,
3+
cipher=ABC_ABCDE_ABC_NOPE_ABC_000_ABC_ABC000) id 00.0.000.0 via Techbox
4+
Transport; Wed, 20 Feb 2023 10:03:18 +1200
5+
MIME-Version: 1.0
6+
Date: Fri, 16 Dec 2022 17:04:16 -0500
7+
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
8+
Subject: Test Email
9+
From: Matthew Robinson <[email protected]>
10+
To: Matthew Robinson <[email protected]>
11+
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
12+
13+
--00000000000095c9b205eff92630
14+
Content-Type: text/plain; charset="UTF-8"
15+
16+
This is a test email to use for unit tests.
17+
18+
Important points:
19+
20+
- Roses are red
21+
- Violets are blue
22+
23+
--00000000000095c9b205eff92630
24+
Content-Type: text/html; charset="UTF-8"
25+
26+
<div dir="ltr"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div>
27+
28+
--00000000000095c9b205eff92630--

test_unstructured/partition/test_email.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
1+
import datetime
12
import email
23
import os
34
import pathlib
45
import pytest
56

7+
68
from unstructured.documents.elements import NarrativeText, Title, ListItem, Image
79
from unstructured.documents.email_elements import (
810
MetaData,
911
Recipient,
1012
Sender,
1113
Subject,
14+
ReceivedInfo,
1215
)
1316
from unstructured.partition.email import (
1417
extract_attachment_info,
@@ -36,6 +39,30 @@
3639
ListItem(text="Violets are blue"),
3740
]
3841

42+
RECEIVED_HEADER_OUTPUT = [
43+
ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="00.0.0.00"),
44+
ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="ba23::58b5:2236:45g2:88h2"),
45+
ReceivedInfo(
46+
name="received_datetimetz",
47+
text="2023-02-20 10:03:18+12:00",
48+
datestamp=datetime.datetime(
49+
2023, 2, 20, 10, 3, 18, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200))
50+
),
51+
),
52+
MetaData(name="MIME-Version", text="1.0"),
53+
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
54+
MetaData(
55+
name="Message-ID",
56+
text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
57+
),
58+
Subject(text="Test Email"),
59+
Sender(name="Matthew Robinson", text="[email protected]"),
60+
Recipient(name="Matthew Robinson", text="[email protected]"),
61+
MetaData(
62+
name="Content-Type", text='multipart/alternative; boundary="00000000000095c9b205eff92630"'
63+
),
64+
]
65+
3966
HEADER_EXPECTED_OUTPUT = [
4067
MetaData(name="MIME-Version", text="1.0"),
4168
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
@@ -114,12 +141,12 @@ def test_partition_email_from_filename_with_embedded_image():
114141

115142

116143
def test_partition_email_header():
117-
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
144+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
118145
with open(filename, "r") as f:
119146
msg = email.message_from_file(f)
120147
elements = partition_email_header(msg)
121148
assert len(elements) > 0
122-
assert elements == HEADER_EXPECTED_OUTPUT
149+
assert elements == RECEIVED_HEADER_OUTPUT
123150

124151

125152
def test_extract_email_text_matches_html():

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.3-dev0" # pragma: no cover
1+
__version__ = "0.4.3-dev1" # pragma: no cover

unstructured/documents/email_elements.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@
55
from unstructured.documents.elements import Element, Text, NoID
66

77

8+
class NoDatestamp(ABC):
    """Class to indicate that an element does not have a datetime stamp."""

    pass
12+
13+
814
class EmailElement(Element):
915
"""An email element is a section of the email."""
1016

@@ -20,28 +26,29 @@ def __init__(
2026
self,
2127
name: str,
2228
text: str,
29+
datestamp: Union[datetime, NoDatestamp] = NoDatestamp(),
2330
element_id: Union[str, NoID] = NoID(),
2431
):
2532
self.name: str = name
2633
self.text: str = text
27-
self.datestamp: datetime
28-
self.has_datestamp: bool = False
2934

3035
if isinstance(element_id, NoID):
3136
# NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
3237
element_id = hashlib.sha256(text.encode()).hexdigest()[:32]
3338

3439
super().__init__(element_id=element_id)
3540

36-
def set_datestamp(self, datestamp: datetime):
37-
self.datestamp = datestamp
38-
self.has_datestamp = True
41+
if isinstance(datestamp, datetime):
42+
self.datestamp: datetime = datestamp
43+
44+
def has_datestamp(self):
    """Return True when this element carries a ``datestamp`` attribute.

    ``datestamp`` is only assigned in ``__init__`` when an actual
    ``datetime`` is supplied, so the presence of the attribute on the
    instance is what signals that a datestamp exists.
    """
    # The previous check, `"self.datestamp" in globals()`, looked for a
    # module-level name literally spelled "self.datestamp". No such name can
    # exist, so it always returned False and the datestamp branch of __eq__
    # was never taken. Inspect the instance instead.
    return hasattr(self, "datestamp")
3946

4047
def __str__(self):
4148
return f"{self.name}: {self.text}"
4249

4350
def __eq__(self, other):
44-
if self.has_datestamp:
51+
if self.has_datestamp():
4552
return (
4653
self.name == other.name
4754
and self.text == other.text

unstructured/partition/email.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,8 @@ def _parse_received_data(data: str) -> List[Element]:
4747
elements.append(ReceivedInfo(name="mapi_id", text=mapi_id[0]))
4848
if datetimetz:
4949
elements.append(
50-
ReceivedInfo(name="received_datetimetz", text=str(datetimetz)).set_datestamp(
51-
datestamp=datetimetz
52-
)
50+
ReceivedInfo(name="received_datetimetz", text=str(datetimetz), datestamp=datetimetz)
5351
)
54-
5552
return elements
5653

5754

0 commit comments

Comments
 (0)