# Source: sftp2s3/email_parser.py at master · foobarmus/sftp2s3 (GitHub)
# Utility for Copying Files from an SFTP Server to an S3 Bucket
#
# Copyright 2017 Jonas McCallum.
#
# Open source, MIT license
# http://www.opensource.org/licenses/mit-license.php
"""
Email parser

Author: Jonas McCallum
https://github.com/foobarmus

"""
import re

# Token specification: (label, expression, lookbehind) triples that are OR-ed
# together, in this order, into a single regular expression. Order matters:
# at any position the first alternative that matches wins.
#
# The header-value tokens come first so that, immediately after a label such
# as "Subject:", the rest of the line is captured whole even when it contains
# a colon itself (e.g. "Subject: Re: hello"). CRUD (the "Label:" prefix) is
# non-greedy so it stops at the first colon on the line rather than the last.
TOK_SPEC = [('TO',      r'.+',   r'(?<=To:)'),
            ('FROM',    r'.+',   r'(?<=From:)'),
            ('REGION',  r'.+',   r'(?<=Region:)'),
            ('SUBJECT', r'.+',   r'(?<=Subject:)'),
            ('CRUD',    r'.+?:', r''),
            ('EOL',     r'$',    r''),
            ('BODY',    r'.+',   r'')]
# Template for one alternative: optional lookbehind, then a named group.
TOKEN = '{}(?P<{}>{})'
# Built as a list (not a generator) so it is still inspectable after RE
# has been compiled.
TOKENS = [TOKEN.format(lookbehind, label, expression)
          for label, expression, lookbehind in TOK_SPEC]
# MULTILINE lets '$' (EOL) match before every newline, not only at the end
# of the text.
RE = re.compile('|'.join(TOKENS), re.MULTILINE)
# Token types whose values are collected as email headers.
HEADER_LABELS = ['TO', 'FROM', 'REGION', 'SUBJECT']
# Token types that are skipped and never emitted by the tokenizer.
IGNORE = ['CRUD', 'EOL']

def tokenize(text, re_, ignore):
    """Split an email template into a list of ``(type, value)`` tokens.

    ``re_`` is matched repeatedly against ``text``, each time starting where
    the previous match left off; the name of the group that matched is the
    token type. Types listed in ``ignore`` are skipped. A ``'BODY'`` token
    takes everything from the current position to the end of ``text`` and
    ends the scan; every other token value is the stripped group text.
    """
    found = []
    cursor = 0
    while cursor < len(text):
        hit = re_.match(text, cursor)
        kind = hit.lastgroup
        if kind in ignore:
            # An ignored match may be zero-width (e.g. end of line), so
            # always step forward by at least one character.
            cursor = max(cursor + 1, hit.end())
            continue
        if kind == 'BODY':
            # The body is the whole remainder, not just the matched line.
            found.append((kind, text[cursor:]))
            break
        found.append((kind, hit.group(kind).strip()))
        cursor = hit.end()
    return found

def parse_email(template, **args):
    """Populate an email template and parse it into an email structure.

    The file at ``template`` is read, its ``str.format`` placeholders are
    filled in from ``args``, and the result is tokenized into headers and a
    body.

    Args:
        template: path of the template file to read.
        **args: values substituted into the template via ``str.format``.

    Returns:
        An ``(email, region)`` tuple. ``email`` is a dict with
        ``Destination``, ``Message`` and ``Source`` keys built from the
        To / Subject / From headers and the body; ``region`` is the value of
        the Region header.

    Raises:
        KeyError: if the To, From, Region or Subject header is missing from
            the populated template.
    """
    with open(template, 'r') as f:
        text = f.read()
    personalized_template = text.format(**args)
    tokens = tokenize(personalized_template, RE, IGNORE)
    header = {k: v.strip() for k, v in tokens
              if k in HEADER_LABELS}
    # Take the body from the BODY token itself rather than from "whatever
    # token came last": a template without a body would otherwise send the
    # last header value (e.g. the subject) as the message text.
    body = next((v for k, v in tokens if k == 'BODY'), '').strip()
    # NOTE(review): this dict shape looks like the keyword arguments of
    # boto3's SES send_email -- confirm against the caller.
    email = {'Destination': {'ToAddresses': [header['TO']]},
             'Message': {'Body': {'Text': {'Charset': 'UTF-8', 'Data': body}},
                         'Subject': {'Charset': 'UTF-8', 'Data': header['SUBJECT']}},
             'Source': header['FROM']}
    region = header['REGION']
    return email, region