Skip to content

Commit 52aced8

Browse files
authored
fix: validate encodings from email headers (#881)
* add validate encoding function * remove extraneous file * added test case for malformed encoding * version and changelog
1 parent 209054f commit 52aced8

File tree

6 files changed

+44
-2
lines changed

6 files changed

+44
-2
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
* Fix KeyError when `isd_to_elements` doesn't find a type
1111
* Fix _output_filename for local connector, allowing single files to be written correctly to the disk
1212

13+
* Fix for cases where an invalid encoding is extracted from an email header.
14+
1315
### BREAKING CHANGES
1416

1517
* Information about an element's location is no longer returned as top-level attributes of an element. Instead, it is returned in the `coordinates` attribute of the element's metadata.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
MIME-Version: 1.0
2+
Date: Fri, 16 Dec 2022 17:04:16 -0500
3+
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
4+
Subject: Test Email
5+
From: Matthew Robinson <[email protected]>
6+
To: Matthew Robinson <[email protected]>
7+
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
8+
9+
--00000000000095c9b205eff92630
10+
Content-Type: text/plain; charset = "UTF-8"Content-Transfer-Encoding: 8bit
11+
12+
This is a test email to use for unit tests.
13+
14+
Important points:
15+
16+
- Roses are red
17+
- Violets are blue
18+
19+
--00000000000095c9b205eff92630
20+
Content-Type: text/html; charset = "UTF-8"Content-Transfer-Encoding: 8bit
21+
22+
<div dir="ltr"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div>
23+
24+
--00000000000095c9b205eff92630--

test_file.html

Lines changed: 0 additions & 1 deletion
This file was deleted.

test_unstructured/partition/test_email.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,13 @@ def test_partition_email_from_filename_with_metadata_filename():
117117
assert all(element.metadata.filename == "test" for element in elements)
118118

119119

120+
def test_partition_email_from_filename_malformed_encoding():
    """Partition the malformed-encoding example email by filename and compare
    the resulting elements against EXPECTED_OUTPUT."""
    malformed_eml_path = os.path.join(
        EXAMPLE_DOCS_DIRECTORY,
        "fake-email-malformed-encoding.eml",
    )
    partitioned = partition_email(filename=malformed_eml_path)
    # Guard against a vacuous pass before checking the exact contents.
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT
125+
126+
120127
@pytest.mark.parametrize(
121128
("filename", "expected_output"),
122129
[

unstructured/file_utils/encoding.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,15 @@ def format_encoding_str(encoding: str) -> str:
4949
return formatted_encoding
5050

5151

52+
def validate_encoding(encoding: str) -> bool:
    """Checks if an encoding string is valid. Helps to avoid errors in cases where
    invalid encodings are extracted from malformed documents.

    An encoding counts as valid only when its formatted form (as produced by
    ``format_encoding_str``) equals the formatted form of some entry in
    ``COMMON_ENCODINGS``. Any other string is reported as invalid.

    Returns:
        True if the encoding matches an entry in ``COMMON_ENCODINGS``, else False.
    """
    # Format the candidate once up front rather than on every loop iteration.
    formatted_encoding = format_encoding_str(encoding)
    return any(
        format_encoding_str(common_encoding) == formatted_encoding
        for common_encoding in COMMON_ENCODINGS
    )
59+
60+
5261
def detect_file_encoding(
5362
filename: str = "",
5463
file: Optional[Union[bytes, IO[bytes]]] = None,

unstructured/partition/email.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
COMMON_ENCODINGS,
1313
format_encoding_str,
1414
read_txt_file,
15+
validate_encoding,
1516
)
1617
from unstructured.partition.common import (
1718
convert_to_bytes,
@@ -208,7 +209,7 @@ def parse_email(
208209
encoding = None
209210
charsets = msg.get_charsets() or []
210211
for charset in charsets:
211-
if charset and charset.strip():
212+
if charset and charset.strip() and validate_encoding(charset):
212213
encoding = charset
213214
break
214215

0 commit comments

Comments
 (0)