Skip to content

Commit 067f4e8

Browse files
committed
Email app fix for deduped mails
1 parent 655b987 commit 067f4e8

File tree

2 files changed

+39
-13
lines changed

2 files changed

+39
-13
lines changed

email/1.3.0/requirements.txt

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
requests==2.25.1
22
glom==20.11.0
3-
eml-parser==1.17.0
4-
msg-parser==1.2.0
5-
mail-parser==3.15.0
6-
extract-msg==0.30.9
73
jsonpickle==2.0.0
84

5+
eml-parser==2.0.0
6+
msg-parser==1.2.0

email/1.3.0/src/app.py

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111
import time
1212
import random
1313
import eml_parser
14-
import mailparser
15-
import extract_msg
1614
import jsonpickle
1715

1816
from glom import glom
@@ -392,6 +390,18 @@ def merge(d1, d2):
392390
"messages": json.dumps(emails, default=default),
393391
}
394392

393+
def remove_similar_items(self, items):
    """Drop every entry that is contained in a longer (or equal) kept entry.

    The entries are ordered from longest to shortest, then walked once.
    An entry is kept only if the ``in`` operator does not find it inside
    any entry that was kept before it. For strings this is a substring
    test, so exact duplicates and shorter fragments of a longer entry are
    both removed.

    NOTE(review): the visible callers pass the ``uri``, ``email`` and
    ``domain`` lists of a parsed mail body -- presumably lists of
    strings; confirm against the parser output.

    Args:
        items: Iterable of sized values (anything ``len()`` accepts).

    Returns:
        A new list holding the surviving entries, longest first. Entries
        of equal length keep their original relative order because the
        sort is stable. The input is not modified.
    """
    # Work on a copy so the caller's sequence is left untouched.
    candidates = list(items)
    candidates.sort(key=len, reverse=True)

    kept = []
    for candidate in candidates:
        for existing in kept:
            if candidate in existing:
                # Already covered by a longer entry; discard it.
                break
        else:
            # No kept entry contains this one.
            kept.append(candidate)

    return kept
404+
395405
def parse_eml(self, filedata, extract_attachments=False):
396406
parsedfile = {
397407
"success": True,
@@ -443,16 +453,16 @@ def parse_email_file(self, file_id, extract_attachments=False):
443453
# Replace raw newlines \\r\\n with actual newlines
444454
# The data is a byte string, so we need to decode it to utf-8
445455
try:
446-
print("Pre size: %d" % len(file_path["data"]))
456+
#print("Pre size: %d" % len(file_path["data"]))
447457
file_path["data"] = file_path["data"].decode("utf-8").replace("\\r\\n", "\n").encode("utf-8")
448-
print("Post size: %d" % len(file_path["data"]))
458+
#print("Post size: %d" % len(file_path["data"]))
449459
except Exception as e:
450460
print(f"Failed to decode file: {e}")
451461
pass
452462

453463
# Makes msg into eml
454464
if ".msg" in file_path["filename"] or "." not in file_path["filename"]:
455-
print(f"[DEBUG] Working with .msg file {file_path['filename']}. Filesize: {len(file_path['data'])}")
465+
self.logger.info(f"[DEBUG] Working with .msg file {file_path['filename']}. Filesize: {len(file_path['data'])}")
456466
try:
457467
result = {}
458468
msg = MsOxMessage(file_path['data'])
@@ -471,17 +481,17 @@ def parse_email_file(self, file_id, extract_attachments=False):
471481
)
472482

473483
try:
474-
print("Pre email")
484+
self.logger.info("Pre email")
475485
parsed_eml = ep.decode_email_bytes(file_path['data'])
476486
#if str(parsed_eml["header"]["date"]) == "1970-01-01 00:00:00+00:00" and len(parsed_eml["header"]["subject"]) == 0:
477487
# return {"success":False,"reason":"Not a valid EML/MSG file, or the file have a timestamp or subject defined (required).", "date": str(parsed_eml["header"]["date"]), "subject": str(parsed_eml["header"]["subject"])}
478488

479489
# Put attachments in the shuffle file system
480-
print("Pre attachment")
490+
self.logger.info("Pre attachment")
481491
if extract_attachments == True and "attachment" in parsed_eml:
482492
cnt = -1
483493

484-
print("[INFO] Uploading %d attachments" % len(parsed_eml["attachment"]))
494+
self.logger.info("[INFO] Uploading %d attachments" % len(parsed_eml["attachment"]))
485495
for value in parsed_eml["attachment"]:
486496
cnt += 1
487497
if value["raw"] == None:
@@ -502,7 +512,25 @@ def parse_email_file(self, file_id, extract_attachments=False):
502512
if not "attachment" in parsed_eml:
503513
parsed_eml["attachment"] = []
504514

505-
print("Post attachment")
515+
self.logger.info("Post attachment. Has body: %s" % ("body" in parsed_eml))
516+
517+
try:
518+
if "body" in parsed_eml and len(parsed_eml["body"]) > 0:
519+
520+
for i in range(len(parsed_eml["body"])):
521+
if "uri" in parsed_eml["body"][i] and len(parsed_eml["body"][i]["uri"]) > 0:
522+
parsed_eml["body"][i]["uri"] = self.remove_similar_items(parsed_eml["body"][i]["uri"])
523+
524+
if "email" in parsed_eml["body"][i] and len(parsed_eml["body"][i]["email"]) > 0:
525+
parsed_eml["body"][i]["email"] = self.remove_similar_items(parsed_eml["body"][i]["email"])
526+
527+
if "domain" in parsed_eml["body"][i] and len(parsed_eml["body"][i]["domain"]) > 0:
528+
parsed_eml["body"][i]["domain"] = self.remove_similar_items(parsed_eml["body"][i]["domain"])
529+
530+
except Exception as e:
531+
self.logger.info(f"[ERROR] Failed to remove similar items: {e}")
532+
533+
parsed_eml["success"] = True
506534
return json.dumps(parsed_eml, default=json_serial)
507535
except Exception as e:
508536
return {"success":False, "reason": f"An exception occured during EML parsing: {e}. Please contact support"}

0 commit comments

Comments
 (0)