Skip to content

Commit c2cc657

Browse files
authored
Merge pull request #2610 from Lukas-Heindl/smtpBatchGroup
smtp_batch: add feature for grouping and templating
2 parents a105925 + a88a205 commit c2cc657

File tree

4 files changed

+269
-35
lines changed

4 files changed

+269
-35
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@ Please refer to the [NEWS](NEWS.md) for a list of changes which have an affect o
4040
- `intelmq.bots.experts.sieve.expert`: Test for textX dependency in self-check (PR#2605 by Sebastian Wagner).
4141

4242
#### Outputs
43+
- `intelmq.bots.outputs.smtp_batch.output`:
44+
- Add new parameter `additional_grouping_keys` for an enhanced email batching feature.
45+
- Add new parameter `templating` for additional template variables.
46+
- Add new parameter `allowed_fieldnames` for csv field specification.
47+
- Add new parameter `fieldnames_translation` for naming csv headers (PR#2610 by Lukas Heindl, fixes #2586).
4348

4449
### Documentation
4550
- Fix and refresh links to mailing lists (PR#2609 by Kamil Mańkowski)

docs/user/bots.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
51195119
51205120
```
51215121
5122+
**`additional_grouping_keys`**
5123+
5124+
(optional, list) By-default events are grouped by the E-Mail-Address into buckets. For each bucket one E-Mail is sent. You may add more fields to group-by here to make potentially more buckets.
5125+
Side-effect: Every field that is included in the group-by is ensured to be unique for all events in the bucket and may thus be used for templating.
5126+
Note: The keys listed here refer to the keys in the events (in contrast to the CSV column names).
5127+
Default: `[]`
5128+
5129+
**`templating`**
5130+
5131+
(optional, dict) Defines which strings should be processed by jinja2 templating. For templating only keys which are unique for the complete bucket are available. This always includes the destination address (`source.abuse_contact`) and all keys of `additional_grouping_keys` which are present in the bucket. There is one additional key `current_time` available which holds a `datetime.datetime` object of the current (local) time.
5132+
Note: The keys available for templating refer to the keys defined for the events (in contrast to the CSV column names). Still the keys get transformed: each `'.'` gets replaced to `_` in order to make referencing the key in jinja2 easier.
5133+
Default: `{subject: False, body: False, attachment: False}`
5134+
5135+
**`allowed_fieldnames`**
5136+
5137+
(optional, list) Lists the fields which are included in the csv file. Every element should be also included in `fieldnames_translation` to avoid crashes.
5138+
5139+
**`fieldnames_translation`**
5140+
5141+
(optional, dict) Maps each the name of each field listed in `allowed_fieldnames` to a different name to be used in the csv header.
5142+
**Warning:** The Bot will crash on sending in case a fieldname is present in an event and in `allowed_fieldnames` but not in `fieldnames_translation`.
5143+
51225144
**`attachment_name`**
51235145
51245146
(optional, string) Attachment file name for the outgoing messages. May contain date formatting like this `%Y-%m-%d`. Example: "events_%Y-%m-%d" will appear as "events_2022-12-01.zip". Defaults to "intelmq_%Y-%m-%d".

intelmq/bots/outputs/smtp_batch/output.py

Lines changed: 103 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
import sys
99
from tempfile import NamedTemporaryFile
1010
import time
11-
from typing import Any, Optional
11+
from typing import Any, Iterable, Optional, Dict, List
1212
import zipfile
1313
from base64 import b64decode
1414
from collections import OrderedDict
1515
from io import StringIO
16+
from hashlib import sha256
1617

1718
from redis.exceptions import TimeoutError
1819

@@ -25,17 +26,37 @@
2526
except ImportError:
2627
Envelope = None
2728

29+
try:
30+
import jinja2
31+
jinja_env = jinja2.Environment()
32+
except ImportError:
33+
jinja2 = None
34+
35+
36+
def hash_arbitrary(value: Any) -> bytes:
37+
value_bytes = None
38+
if isinstance(value, str):
39+
value_bytes = value.encode("utf-8")
40+
elif isinstance(value, int):
41+
value_bytes = bytes(value)
42+
else:
43+
value_bytes = json.dumps(value, sort_keys=True).encode("utf-8")
44+
return sha256(value_bytes).digest()
45+
2846

2947
@dataclass
3048
class Mail:
3149
key: str
3250
to: str
3351
path: str
3452
count: int
53+
template_data: Dict[str, Any]
3554

3655

3756
class SMTPBatchOutputBot(Bot):
3857
# configurable parameters
58+
additional_grouping_keys: Optional[list] = [] # refers to the event directly
59+
templating: Optional[Dict[str, bool]] = {'subject': False, 'body': False, 'attachment': False}
3960
alternative_mails: Optional[str] = None
4061
bcc: Optional[list] = None
4162
email_from: str = ""
@@ -75,7 +96,20 @@ def process(self):
7596
if "source.abuse_contact" in message:
7697
field = message["source.abuse_contact"]
7798
for mail in (field if isinstance(field, list) else [field]):
78-
self.cache.redis.rpush(f"{self.key}{mail}", message.to_json())
99+
# - Each event goes into one bucket (equivalent to group-by)
100+
# - The id of each bucket is calculated by hashing all the keys that should be grouped for
101+
# - Hashing ensures the redis-key does not grow indefinitely.
102+
# - In order to avoid collisions, each value is hashed before
103+
# appending to the input for the redis-key-hash
104+
# (could also be solved by special separator which would need
105+
# to be escaped or prepending the length of the value).
106+
h = sha256()
107+
h.update(sha256(mail.encode("utf-8")).digest())
108+
for i in self.additional_grouping_keys:
109+
if i not in message:
110+
continue
111+
h.update(hash_arbitrary(message[i]))
112+
self.cache.redis.rpush(f"{self.key}{h.hexdigest()}", message.to_json())
79113

80114
self.acknowledge_message()
81115

@@ -90,6 +124,8 @@ def set_cache(self):
90124
def init(self):
91125
if Envelope is None:
92126
raise MissingDependencyError('envelope', '>=2.0.0')
127+
if jinja2 is None:
128+
self.logger.warning("No jinja2 installed. Thus, the templating is deactivated.")
93129
self.set_cache()
94130
self.key = f"{self._Bot__bot_id}:"
95131

@@ -213,7 +249,7 @@ def set_tester(self, force=True):
213249
print("\nWhat e-mail should I use?")
214250
self.testing_to = input()
215251

216-
def send_mails_to_tester(self, mails):
252+
def send_mails_to_tester(self, mails: List[Mail]):
217253
"""
218254
These mails are going to tester's address. Then prints out their count.
219255
:param mails: list
@@ -222,7 +258,7 @@ def send_mails_to_tester(self, mails):
222258
count = sum([1 for mail in mails if self.build_mail(mail, send=True, override_to=self.testing_to)])
223259
print(f"{count}× mail sent to: {self.testing_to}\n")
224260

225-
def prepare_mails(self):
261+
def prepare_mails(self) -> Iterable[Mail]:
226262
""" Generates Mail objects """
227263

228264
for mail_record in self.cache.redis.keys(f"{self.key}*")[slice(self.limit_results)]:
@@ -254,7 +290,11 @@ def prepare_mails(self):
254290
# TODO: worthy to generate on the fly https://github.com/certtools/intelmq/pull/2253#discussion_r1172779620
255291
fieldnames = set()
256292
rows_output = []
293+
src_abuse_contact = None
257294
for row in lines:
295+
# obtain this field only once as it is the same for all lines here
296+
if not src_abuse_contact:
297+
src_abuse_contact = row["source.abuse_contact"]
258298
try:
259299
if threshold and row["time.observation"][:19] < threshold.isoformat()[:19]:
260300
continue
@@ -283,31 +323,45 @@ def prepare_mails(self):
283323
dict_writer.writerow(dict(zip(ordered_fieldnames, ordered_fieldnames)))
284324
dict_writer.writerows(rows_output)
285325

286-
email_to = str(mail_record[len(self.key):], encoding="utf-8")
287326
count = len(rows_output)
288-
if not count:
289-
path = None
290-
else:
291-
filename = f'{time.strftime("%y%m%d")}_{count}_events'
292-
path = NamedTemporaryFile().name
293-
294-
with zipfile.ZipFile(path, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
295-
try:
296-
zf.writestr(filename + ".csv", output.getvalue())
297-
except Exception:
298-
self.logger.error("Error: Cannot zip mail: %r", mail_record)
299-
continue
327+
if not count or count == 0:
328+
# send no mail if no events are present
329+
continue
330+
331+
# collect all data which must be the same for all events of the
332+
# bucket and thus can be used for templating
333+
template_keys = ['source.abuse_contact']
334+
# only collect if templating is enabled (save the memory otherwise)+
335+
if jinja2 and self.templating and any(self.templating.values()):
336+
template_keys.extend(self.additional_grouping_keys)
337+
338+
template_data = {
339+
k.replace(".", "_"): lines[0][k]
340+
for k in template_keys
341+
if k in lines[0]
342+
}
343+
344+
email_to = template_data["source_abuse_contact"]
345+
filename = f'{time.strftime("%y%m%d")}_{count}_events'
346+
path = NamedTemporaryFile().name
347+
348+
with zipfile.ZipFile(path, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
349+
try:
350+
zf.writestr(filename + ".csv", output.getvalue())
351+
except Exception:
352+
self.logger.error("Error: Cannot zip mail: %r", mail_record)
353+
continue
300354

301-
if email_to in self.alternative_mail:
302-
print(f"Alternative: instead of {email_to} we use {self.alternative_mail[email_to]}")
303-
email_to = self.alternative_mail[email_to]
355+
if email_to in self.alternative_mail:
356+
print(f"Alternative: instead of {email_to} we use {self.alternative_mail[email_to]}")
357+
email_to = self.alternative_mail[email_to]
304358

305-
mail = Mail(mail_record, email_to, path, count)
359+
mail = Mail(mail_record, email_to, path, count, template_data)
360+
# build_mail only used to output metadata of the mail -> send=False -> return None
306361
self.build_mail(mail, send=False)
307-
if count:
308-
yield mail
362+
yield mail
309363

310-
def build_mail(self, mail, send=False, override_to=None):
364+
def build_mail(self, mail: Mail, send=False, override_to=None):
311365
""" creates a MIME message
312366
:param mail: Mail object
313367
:param send: True to send through SMTP, False for just printing the information
@@ -322,15 +376,32 @@ def build_mail(self, mail, send=False, override_to=None):
322376
intended_to = None
323377
email_to = mail.to
324378
email_from = self.email_from
379+
380+
template_data = mail.template_data
381+
325382
text = self.mail_contents
326-
try:
327-
subject = time.strftime(self.subject)
328-
except ValueError:
329-
subject = self.subject
330-
try:
331-
attachment_name = time.strftime(self.attachment_name)
332-
except ValueError:
333-
attachment_name = self.attachment_name
383+
if jinja2 and self.templating and self.templating.get('body', False):
384+
jinja_tmpl = jinja_env.from_string(text)
385+
text = jinja_tmpl.render(current_time=datetime.datetime.now(), **template_data)
386+
387+
if jinja2 and self.templating and self.templating.get('subject', False):
388+
jinja_tmpl = jinja_env.from_string(self.subject)
389+
subject = jinja_tmpl.render(current_time=datetime.datetime.now(), **template_data)
390+
else:
391+
try:
392+
subject = time.strftime(self.subject)
393+
except ValueError:
394+
subject = self.subject
395+
396+
if jinja2 and self.templating and self.templating.get('attachment', False):
397+
jinja_tmpl = jinja_env.from_string(self.attachment_name)
398+
attachment_name = jinja_tmpl.render(current_time=datetime.datetime.now(), **template_data)
399+
else:
400+
try:
401+
attachment_name = time.strftime(self.attachment_name)
402+
except ValueError:
403+
attachment_name = self.attachment_name
404+
334405
if intended_to:
335406
subject += f" (intended for {intended_to})"
336407
else:

0 commit comments

Comments
 (0)