diff --git a/mbox-extract-attachments.py b/mbox-extract-attachments.py index ba719f0..a1b099f 100755 --- a/mbox-extract-attachments.py +++ b/mbox-extract-attachments.py @@ -24,6 +24,7 @@ # Related RFCs: 2047, 2044, 1522 +# 20150907 JMW Python Version 3.4.3 and other changes __author__ = "Pablo Castellano " __license__ = "GNU GPLv3+" @@ -36,7 +37,7 @@ import os import sys import email - +import codecs # 20150907 JMW For codec error handling BLACKLIST = ('signature.asc', 'message-footer.txt', 'smime.p7s') VERBOSE = 1 @@ -44,13 +45,27 @@ attachments = 0 #Count extracted attachment skipped = 0 +# 20150907 JMW For codec error handling +# Taken from http://www.gossamer-threads.com/lists/python/python/780611#780611 +def replace_spc_error_handler(error): +# error is an UnicodeEncodeError/UnicodeDecodeError instance +# with these attributes: +# object = unicode object being encoded +# start:end = slice of object with error +# reason = error message +# Must return a tuple (replacement unicode object, +# index into object to continue encoding) +# or raise the same or another exception + return (u' ' * (error.end-error.start), error.end) + + # Search for filename or find recursively if it's multipart def extract_attachment(payload): global attachments, skipped filename = payload.get_filename() if filename is not None: - print "\nAttachment found!" + print("\nAttachment found!") if filename.find('=?') != -1: ll = email.header.decode_header(filename) filename = "" @@ -60,7 +75,7 @@ def extract_attachment(payload): if filename in BLACKLIST: skipped = skipped + 1 if (VERBOSE >= 1): - print "Skipping %s (blacklist)\n" %filename + print("Skipping %s (blacklist)\n" %filename) return # Puede no venir especificado el nombre del archivo?? @@ -70,7 +85,7 @@ def extract_attachment(payload): content = payload.as_string() # Skip headers, go to the content fh = content.find('\n\n') - content = content[fh:] + content = content[fh:].encode('utf-8') # 20150709 JMW Address error I was getting # if it's base64.... if payload.get('Content-Transfer-Encoding') == 'base64': @@ -78,7 +93,7 @@ def extract_attachment(payload): # quoted-printable # what else? ... - print "Extracting %s (%d bytes)\n" %(filename, len(content)) + print("Extracting %s (%d bytes)\n" %(filename, len(content))) n = 1 orig_filename = filename @@ -87,11 +102,11 @@ def extract_attachment(payload): n = n+1 try: - fp = open(filename, "w") + fp = open(filename, "wb") # 201500907 JMW Needed to make binary # fp = open(str(i) + "_" + filename, "w") fp.write(content) except IOError: - print "Aborted, IOError!!!" + print("Aborted, IOError!!!") sys.exit(2) finally: fp.close() @@ -99,62 +114,69 @@ def extract_attachment(payload): attachments = attachments + 1 else: if payload.is_multipart(): - for payl in payload.get_payload(): + for payl in payload.get_payload(decode=False): # 20150907 JMW Needed to make decode=False extract_attachment(payl) ### -print "Extract attachments from mbox files" -print "Copyright (C) 2012 Pablo Castellano" -print "This program comes with ABSOLUTELY NO WARRANTY." -print "This is free software, and you are welcome to redistribute it under certain conditions." -print +print("Extract attachments from mbox files") +print("Copyright (C) 2012 Pablo Castellano") +print("This program comes with ABSOLUTELY NO WARRANTY.") +print("This is free software, and you are welcome to redistribute it under certain conditions.") +print() + +codecs.register_error("replace_spc", replace_spc_error_handler) # 20150907 JMW Register error handler if len(sys.argv) < 2 or len(sys.argv) > 3: - print "Usage: %s [directory]" %sys.argv[0] + print("Usage: %s [directory]" %sys.argv[0]) sys.exit(0) filename = sys.argv[1] directory = os.path.curdir if not os.path.exists(filename): - print "File doesn't exist:", filename + print("File doesn't exist:", filename) sys.exit(1) if len(sys.argv) == 3: directory = sys.argv[2] if not os.path.exists(directory) or not os.path.isdir(directory): - print "Directory doesn't exist:", directory + print("Directory doesn't exist:", directory) sys.exit(1) mb = mailbox.mbox(filename) -nmes = len(mb) +# nmes = len(mb) # 20150907 JMW Commented out as not used and a performance hit os.chdir(directory) -for i in range(len(mb)): +for i in range(len(mb)): # 20150907 JMW With >140k msgs I used a numeric literal here for performance reasons if (VERBOSE >= 2): - print "Analyzing message number", i + print("Analyzing message number", i) mes = mb.get_message(i) em = email.message_from_string(mes.as_string()) subject = em.get('Subject') - if subject.find('=?') != -1: - ll = email.header.decode_header(subject) - subject = "" - for l in ll: - subject = subject + l[0] + # 20150907 JMW Ended up adding exception handling to skip two messages that were throwing NonType + try: + if subject.find('=?') != -1: + ll = email.header.decode_header(subject) + subject = "" + for l in ll: + subject = subject + l[0].decode('utf-8', "replace_spc") # 20150907 JMW Explicit decode + except AttributeError: + print("NonType encountered") + continue em_from = em.get('From') if em_from.find('=?') != -1: ll = email.header.decode_header(em_from) em_from = "" for l in ll: - em_from = em_from + l[0] + em_from = em_from + l[0].decode('utf-8', "replace_spc") # 20150907 JMW Explicit decode if (VERBOSE >= 2): - print "%s - From: %s" %(subject, em_from) + print("%s - From: %s" %(subject, em_from)) filename = mes.get_filename() @@ -165,6 +187,6 @@ def extract_attachment(payload): else: extract_attachment(em) -print "\n--------------" -print "Total attachments extracted:", attachments -print "Total attachments skipped:", skipped +print("\n--------------") +print("Total attachments extracted:", attachments) +print("Total attachments skipped:", skipped)