Skip to content

Commit 18d4602

Browse files
committed
Make reply email parsing more robust
1 parent fdefabc commit 18d4602

File tree

2 files changed

+25
-22
lines changed

2 files changed

+25
-22
lines changed

tools/test_email.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -345,9 +345,9 @@ async def parse_messages(context: EmailContext, args: list[str]):
345345
try:
346346
msg = import_email_from_file(file_path)
347347
print(file_path)
348+
print("####################")
349+
print_email(msg)
348350
if named_args.verbose:
349-
print("####################")
350-
print_email(msg)
351351
print_knowledge(msg.get_knowledge())
352352
print("####################")
353353

typeagent/emails/email_import.py

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -104,37 +104,40 @@ def get_forwarded_email_parts(email_text: str) -> list[str]:
104104
parts: list[str] = split_delimiter.split(email_text)
105105
return _remove_empty_strings(parts)
106106

107+
# Precompiled regex for reply/forward delimiters and quoted reply headers
108+
_THREAD_DELIMITERS = re.compile(
109+
"|".join([
110+
r"^from: .+$", # From: someone
111+
r"^sent: .+$", # Sent: ...
112+
r"^to: .+$", # To: ...
113+
r"^subject: .+$", # Subject: ...
114+
r"^-{2,}\s*Original Message\s*-{2,}$", # -----Original Message-----
115+
r"^-{2,}\s*Forwarded by.*$", # ----- Forwarded by
116+
r"^_{5,}$", # _________
117+
r"^on .+wrote:\s*(?:\r?\n\s*)+>", # On ... wrote: followed by quoted text
118+
]),
119+
re.IGNORECASE | re.MULTILINE
120+
)
121+
122+
# Precompiled regex for trailing line delimiters (underscores, dashes, equals, spaces)
123+
_TRAILING_LINE_DELIMITERS = re.compile(r"[\r\n][_\-= ]+\s*$")
107124

108125
# Simple way to get the last response on an email thread in MIME format
109126
def get_last_response_in_thread(email_text: str) -> str:
110127
if not email_text:
111128
return ""
112129

113-
delimiters = [
114-
"From:",
115-
"Sent:",
116-
"To:",
117-
"Subject:",
118-
"-----Original Message-----",
119-
"----- Forwarded by",
120-
"________________________________________",
121-
]
122-
123-
first_delimiter_at = -1
124-
for delimiter in delimiters:
125-
index = email_text.find(delimiter)
126-
if index != -1 and (first_delimiter_at == -1 or index < first_delimiter_at):
127-
first_delimiter_at = index
130+
match = _THREAD_DELIMITERS.search(email_text)
131+
if match:
132+
email_text = email_text[:match.start()]
128133

129-
if first_delimiter_at > 0:
130-
email_text = email_text[:first_delimiter_at]
131134

132135
email_text = email_text.strip()
133-
# Remove trailing line delimiters
134-
email_text = re.sub(r"[\r\n]_+\s*$", "", email_text)
136+
# Remove trailing line delimiters (e.g. underscores, dashes, equals)
137+
_TRAILING_LINE_DELIMITER_REGEX = _TRAILING_LINE_DELIMITERS
138+
email_text = _TRAILING_LINE_DELIMITER_REGEX.sub("", email_text)
135139
return email_text
136140

137-
138141
# Extracts the plain text body from an email.message.Message object.
139142
def _extract_email_body(msg: Message) -> str:
140143
"""Extracts the plain text body from an email.message.Message object."""

0 commit comments

Comments
 (0)