1111import time
1212import random
1313import eml_parser
14- import mailparser
15- import extract_msg
1614import jsonpickle
1715
1816from glom import glom
@@ -392,6 +390,18 @@ def merge(d1, d2):
392390 "messages" : json .dumps (emails , default = default ),
393391 }
394392
def remove_similar_items(self, items):
    """Drop every entry that is contained in a longer (or equal) kept entry.

    The entries are visited from longest to shortest (ties keep their
    original relative order). An entry is kept only when it is not found
    inside any entry that has already been kept, using the ``in``
    operator; for strings that is a substring check, so exact duplicates
    are collapsed as well.

    Args:
        items: Iterable of sized values (strings in the visible callers,
            e.g. URIs, e-mail addresses or domains).

    Returns:
        A new list with the surviving entries, longest first.
    """
    kept = []

    # Longest first, so a shorter entry is always compared against the
    # longer entries that might already contain it.
    for candidate in sorted(items, key=len, reverse=True):
        already_covered = False
        for existing in kept:
            if candidate in existing:
                already_covered = True
                break

        if already_covered:
            continue

        kept.append(candidate)

    return kept
404+
395405 def parse_eml (self , filedata , extract_attachments = False ):
396406 parsedfile = {
397407 "success" : True ,
@@ -443,16 +453,16 @@ def parse_email_file(self, file_id, extract_attachments=False):
443453 # Replace raw newlines \\r\\n with actual newlines
444454 # The data is a byte string, so we need to decode it to utf-8
445455 try :
446- print ("Pre size: %d" % len (file_path ["data" ]))
456+ # print("Pre size: %d" % len(file_path["data"]))
447457 file_path ["data" ] = file_path ["data" ].decode ("utf-8" ).replace ("\\ r\\ n" , "\n " ).encode ("utf-8" )
448- print ("Post size: %d" % len (file_path ["data" ]))
458+ # print("Post size: %d" % len(file_path["data"]))
449459 except Exception as e :
450460 print (f"Failed to decode file: { e } " )
451461 pass
452462
453463 # Makes msg into eml
454464 if ".msg" in file_path ["filename" ] or "." not in file_path ["filename" ]:
455- print (f"[DEBUG] Working with .msg file { file_path ['filename' ]} . Filesize: { len (file_path ['data' ])} " )
465+ self . logger . info (f"[DEBUG] Working with .msg file { file_path ['filename' ]} . Filesize: { len (file_path ['data' ])} " )
456466 try :
457467 result = {}
458468 msg = MsOxMessage (file_path ['data' ])
@@ -471,17 +481,17 @@ def parse_email_file(self, file_id, extract_attachments=False):
471481 )
472482
473483 try :
474- print ("Pre email" )
484+ self . logger . info ("Pre email" )
475485 parsed_eml = ep .decode_email_bytes (file_path ['data' ])
476486 #if str(parsed_eml["header"]["date"]) == "1970-01-01 00:00:00+00:00" and len(parsed_eml["header"]["subject"]) == 0:
477487 # return {"success":False,"reason":"Not a valid EML/MSG file, or the file have a timestamp or subject defined (required).", "date": str(parsed_eml["header"]["date"]), "subject": str(parsed_eml["header"]["subject"])}
478488
479489 # Put attachments in the shuffle file system
480- print ("Pre attachment" )
490+ self . logger . info ("Pre attachment" )
481491 if extract_attachments == True and "attachment" in parsed_eml :
482492 cnt = - 1
483493
484- print ("[INFO] Uploading %d attachments" % len (parsed_eml ["attachment" ]))
494+ self . logger . info ("[INFO] Uploading %d attachments" % len (parsed_eml ["attachment" ]))
485495 for value in parsed_eml ["attachment" ]:
486496 cnt += 1
487497 if value ["raw" ] == None :
@@ -502,7 +512,25 @@ def parse_email_file(self, file_id, extract_attachments=False):
502512 if not "attachment" in parsed_eml :
503513 parsed_eml ["attachment" ] = []
504514
505- print ("Post attachment" )
515+ self .logger .info ("Post attachment. Has body: %s" % ("body" in parsed_eml ))
516+
517+ try :
518+ if "body" in parsed_eml and len (parsed_eml ["body" ]) > 0 :
519+
520+ for i in range (len (parsed_eml ["body" ])):
521+ if "uri" in parsed_eml ["body" ][i ] and len (parsed_eml ["body" ][i ]["uri" ]) > 0 :
522+ parsed_eml ["body" ][i ]["uri" ] = self .remove_similar_items (parsed_eml ["body" ][i ]["uri" ])
523+
524+ if "email" in parsed_eml ["body" ][i ] and len (parsed_eml ["body" ][i ]["email" ]) > 0 :
525+ parsed_eml ["body" ][i ]["email" ] = self .remove_similar_items (parsed_eml ["body" ][i ]["email" ])
526+
527+ if "domain" in parsed_eml ["body" ][i ] and len (parsed_eml ["body" ][i ]["domain" ]) > 0 :
528+ parsed_eml ["body" ][i ]["domain" ] = self .remove_similar_items (parsed_eml ["body" ][i ]["domain" ])
529+
530+ except Exception as e :
531+ self .logger .info (f"[ERROR] Failed to remove similar items: { e } " )
532+
533+ parsed_eml ["success" ] = True
506534 return json .dumps (parsed_eml , default = json_serial )
507535 except Exception as e :
508536 return {"success" :False , "reason" : f"An exception occured during EML parsing: { e } . Please contact support" }
0 commit comments