Skip to content

Commit 40e4ae1

Browse files
[ENH] Using msgpack instead of json
Using msgpack instead of JSON results in faster (de)serialization and lower memory usage. Redis is also capable of handling msgpack within its Lua API, see e.g. https://github.com/kengonakajima/lua-msgpack-native.

====== Benchmark =======
JSON median size: 387
MSGPACK median size: 329
------------------------
Diff: 16.20%

JSON
* Serialize: 39286
* Deserialize: 30713
MSGPACK
* Serialize: 23483
* Deserialize: 12602
---------------------
DIFF
* Serialize: 50.35%
* Deserialize: 83.62%

Data extracted from spamhaus-collector.
Measurements based on deduplicator-expert.
460 events in total were processed by deduplicator-expert.

Signed-off-by: Sebastian Waldbauer <[email protected]>
1 parent 36d7940 commit 40e4ae1

File tree

15 files changed

+106
-57
lines changed

15 files changed

+106
-57
lines changed

debian/control

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Build-Depends: debhelper (>= 4.1.16),
2121
python3-sphinx-rtd-theme,
2222
python3-termstyle,
2323
python3-tz,
24+
python3-msgpack,
2425
quilt,
2526
rsync,
2627
safe-rm
@@ -42,6 +43,7 @@ Depends: bash-completion,
4243
python3-ruamel.yaml,
4344
python3-termstyle (>= 0.1.10),
4445
python3-tz,
46+
python3-msgpack,
4547
redis-server,
4648
systemd,
4749
${misc:Depends},

intelmq/bots/parsers/json/parser.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ def process(self):
2828
for line in lines:
2929
new_event = MessageFactory.unserialize(line,
3030
harmonization=self.harmonization,
31-
default_type='Event')
31+
default_type='Event',
32+
use_packer="json")
33+
3234
event = self.new_event(report)
3335
event.update(new_event)
3436
if 'raw' not in event:

intelmq/lib/bot.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import inspect
1818
import io
1919
import json
20+
import msgpack
2021
import logging
2122
import os
2223
import re
@@ -320,8 +321,8 @@ def start(self, starting: bool = True, error_on_pipeline: bool = True,
320321
self.logger.error('Pipeline failed.')
321322
self.__disconnect_pipelines()
322323

323-
except exceptions.DecodingError as exc:
324-
self.logger.exception('Could not decode message from pipeline. No retries useful.')
324+
except exceptions.UnserializationError as exc:
325+
self.logger.exception('Could not unserialize message from pipeline. No retries useful.')
325326

326327
# ensure that we do not re-process the faulty message
327328
self.__error_retries_counter = self.error_max_retries + 1

intelmq/lib/exceptions.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,3 +172,12 @@ def __init__(self, encodings=None, exception: UnicodeDecodeError = None,
172172
suffixes.append('with reason %r' % exception.reason)
173173
suffix = (' ' + ' '.join(suffixes)) if suffixes else ''
174174
super().__init__("Could not decode string%s." % suffix)
175+
176+
177+
class UnserializationError(IntelMQException, ValueError):
178+
"""
179+
Unrecoverable error during message unserialization
180+
"""
181+
def __init__(self, exception: Exception = None, object: bytes = None):
182+
self.object = object
183+
super().__init__("Could not unserialize message%s." % exception)

intelmq/lib/message.py

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import warnings
1515
from collections import defaultdict
1616
from typing import Any, Dict, Iterable, Optional, Sequence, Union
17+
import msgpack
1718

1819
import intelmq.lib.exceptions as exceptions
1920
import intelmq.lib.harmonization
@@ -58,8 +59,8 @@ def from_dict(message: dict, harmonization=None,
5859
return class_reference(message, auto=True, harmonization=harmonization)
5960

6061
@staticmethod
61-
def unserialize(raw_message: str, harmonization: dict = None,
62-
default_type: Optional[str] = None) -> dict:
62+
def unserialize(raw_message: bytes, harmonization: dict = None,
63+
default_type: Optional[str] = None, use_packer: str = "msgpack") -> dict:
6364
"""
6465
Takes a serialized Message object (msgpack-encoded by default, JSON-encoded if use_packer="json"), returns instance of correct class.
6566
@@ -72,12 +73,12 @@ def unserialize(raw_message: str, harmonization: dict = None,
7273
MessageFactory.from_dict
7374
MessageFactory.serialize
7475
"""
75-
message = Message.unserialize(raw_message)
76+
message = Message.unserialize(raw_message, use_packer=use_packer)
7677
return MessageFactory.from_dict(message, harmonization=harmonization,
7778
default_type=default_type)
7879

7980
@staticmethod
80-
def serialize(message):
81+
def serialize(message) -> bytes:
8182
"""
8283
Takes instance of message-derived class and makes JSON-encoded Message.
8384
@@ -125,7 +126,7 @@ def __init__(self, message: Union[dict, tuple] = (), auto: bool = False,
125126
elif isinstance(message, tuple):
126127
self.iterable = dict(message)
127128
else:
128-
raise ValueError("Type %r of message can't be handled, must be dict or tuple.", type(message))
129+
raise ValueError("Type %r of message can't be handled, must be dict or tuple." % type(message))
129130
for key, value in self.iterable.items():
130131
if not self.add(key, value, sanitize=False, raise_failure=False):
131132
self.add(key, value, sanitize=True)
@@ -308,18 +309,32 @@ def deep_copy(self):
308309
harmonization={self.__class__.__name__.lower(): self.harmonization_config})
309310

310311
def __str__(self):
311-
return self.serialize()
312+
return self.serialize(use_packer="json")
312313

313-
def serialize(self):
314-
self['__type'] = self.__class__.__name__
315-
json_dump = utils.decode(json.dumps(self))
316-
del self['__type']
317-
return json_dump
314+
def serialize(self, use_packer: str = "msgpack"):
315+
delete_type = False
316+
if '__type' not in self:
317+
delete_type = True
318+
self['__type'] = self.__class__.__name__
319+
320+
if use_packer == "json":
321+
packed = json.dumps(self)
322+
else:
323+
packed = msgpack.packb(self)
324+
325+
if delete_type:
326+
del self['__type']
327+
return packed
318328

319329
@staticmethod
320-
def unserialize(message_string: str):
321-
message = json.loads(message_string)
322-
return message
330+
def unserialize(message: bytes, use_packer: str = "msgpack"):
331+
try:
332+
if use_packer == "json":
333+
return json.loads(message)
334+
else:
335+
return msgpack.unpackb(message, raw=False)
336+
except Exception as exc:
337+
raise exceptions.UnserializationError(exception=exc, object=message)
323338

324339
def __is_valid_key(self, key: str):
325340
try:
@@ -466,14 +481,18 @@ def to_dict(self, hierarchical: bool = False, with_type: bool = False,
466481
json_dict_fp = json_dict_fp[subkey]
467482

468483
for key, value in jsondicts.items():
469-
new_dict[key] = json.dumps(value, ensure_ascii=False)
484+
new_dict[key] = json.dumps(value)
470485

471486
return new_dict
472487

473488
def to_json(self, hierarchical=False, with_type=False, jsondict_as_string=False):
474489
json_dict = self.to_dict(hierarchical=hierarchical, with_type=with_type)
475490
return json.dumps(json_dict, ensure_ascii=False, sort_keys=True)
476491

492+
def to_msgpack(self, hierarchical=False, with_type=False):
493+
msgpack_dict = self.to_dict(hierarchical=hierarchical, with_type=with_type)
494+
return msgpack.packb(msgpack_dict)
495+
477496
def __eq__(self, other: dict) -> bool:
478497
"""
479498
Wrapper is necessary as we have additional members

intelmq/lib/pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,14 +123,14 @@ def send(self, message: str, path: str = "_default",
123123
path_permissive: bool = False):
124124
raise NotImplementedError
125125

126-
def receive(self) -> str:
126+
def receive(self) -> bytes:
127127
if self._has_message:
128128
raise exceptions.PipelineError("There's already a message, first "
129129
"acknowledge the existing one.")
130130

131131
retval = self._receive()
132132
self._has_message = True
133-
return utils.decode(retval)
133+
return retval
134134

135135
def _receive(self) -> bytes:
136136
raise NotImplementedError

intelmq/lib/test.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import io
1313
import inspect
1414
import json
15+
import msgpack
1516
import os
1617
import re
1718
import unittest
@@ -152,8 +153,7 @@ def setUpClass(cls):
152153
elif cls.bot_type != 'collector' and cls.default_input_message == '':
153154
cls.default_input_message = {'__type': 'Event'}
154155
if type(cls.default_input_message) is dict:
155-
cls.default_input_message = \
156-
utils.decode(json.dumps(cls.default_input_message))
156+
cls.default_input_message = msgpack.packb(cls.default_input_message)
157157

158158
if cls.use_cache and not os.environ.get('INTELMQ_SKIP_REDIS'):
159159
password = os.environ.get('INTELMQ_TEST_REDIS_PASSWORD') or \
@@ -170,10 +170,10 @@ def setUpClass(cls):
170170
harmonization = utils.load_configuration(pkg_resources.resource_filename('intelmq',
171171
'etc/harmonization.conf'))
172172

173-
def new_report(self, auto=False, examples=False):
173+
def new_report(self, auto=False, examples=False) -> message.Report:
174174
return message.Report(harmonization=self.harmonization, auto=auto)
175175

176-
def new_event(self):
176+
def new_event(self) -> message.Event:
177177
return message.Event(harmonization=self.harmonization)
178178

179179
def get_mocked_logger(self, logger):
@@ -242,7 +242,7 @@ def prepare_source_queue(self):
242242
self.input_queue = []
243243
for msg in self.input_message:
244244
if type(msg) is dict:
245-
self.input_queue.append(json.dumps(msg))
245+
self.input_queue.append(message.MessageFactory.serialize(msg))
246246
elif issubclass(type(msg), message.Message):
247247
self.input_queue.append(msg.serialize())
248248
else:
@@ -320,17 +320,17 @@ def run_bot(self, iterations: int = 1, error_on_pipeline: bool = False,
320320

321321
""" Test if report has required fields. """
322322
if self.bot_type == 'collector':
323-
for report_json in self.get_output_queue():
324-
report = message.MessageFactory.unserialize(report_json,
323+
for report_data in self.get_output_queue():
324+
report = message.MessageFactory.unserialize(report_data,
325325
harmonization=self.harmonization)
326326
self.assertIsInstance(report, message.Report)
327327
self.assertIn('raw', report)
328328
self.assertIn('time.observation', report)
329329

330330
""" Test if event has required fields. """
331331
if self.bot_type == 'parser':
332-
for event_json in self.get_output_queue():
333-
event = message.MessageFactory.unserialize(event_json,
332+
for event_data in self.get_output_queue():
333+
event = message.MessageFactory.unserialize(event_data,
334334
harmonization=self.harmonization)
335335
self.assertIsInstance(event, message.Event)
336336
self.assertIn('classification.type', event)
@@ -389,7 +389,7 @@ def get_output_queue(self, path="_default"):
389389
"""Getter for items in the output queues of this bot. Use in TestCase scenarios
390390
If there is multiple queues in named queue group, we return all the items chained.
391391
"""
392-
return [utils.decode(text) for text in chain(*[self.pipe.state[x] for x in self.pipe.destination_queues[path]])]
392+
return [text for text in chain(*[self.pipe.state[x] for x in self.pipe.destination_queues[path]])]
393393
# return [utils.decode(text) for text in self.pipe.state["%s-output" % self.bot_id]]
394394

395395
def test_bot_name(self, *args, **kwargs):
@@ -520,9 +520,9 @@ def assertMessageEqual(self, queue_pos, expected_msg, compare_raw=True, path="_d
520520
given queue position.
521521
"""
522522
event = self.get_output_queue(path=path)[queue_pos]
523-
self.assertIsInstance(event, str)
523+
self.assertIsInstance(event, bytes)
524524

525-
event_dict = json.loads(event)
525+
event_dict = msgpack.unpackb(event, raw=False)
526526
if isinstance(expected_msg, (message.Event, message.Report)):
527527
expected = expected_msg.to_dict(with_type=True)
528528
else:

intelmq/tests/bots/collectors/tcp/test_collector.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,10 @@ def test_intelmq_exchange(self):
123123
for i, msg in enumerate(self.get_output_queue()):
124124
report = MessageFactory.unserialize(msg, harmonization=self.harmonization, default_type='Event')
125125

126-
output = MessageFactory.unserialize(utils.base64_decode(report["raw"]), harmonization=self.harmonization, default_type='Event')
126+
output = MessageFactory.unserialize(utils.base64_decode(report["raw"]),
127+
harmonization=self.harmonization,
128+
default_type='Event',
129+
use_packer="json")
127130
self.assertDictEqual(output, INPUT1)
128131

129132
del report['time.observation']

intelmq/tests/bots/experts/cymru_whois/test_expert.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# SPDX-License-Identifier: AGPL-3.0-or-later
44

55
# -*- coding: utf-8 -*-
6-
import json
6+
import msgpack
77
import unittest
88

99
import intelmq.lib.test as test
@@ -93,7 +93,7 @@ def test_6to4_result(self):
9393
"""
9494
self.input_message = EXAMPLE_6TO4_INPUT
9595
self.run_bot()
96-
actual = json.loads(self.get_output_queue()[0])
96+
actual = msgpack.loads(self.get_output_queue()[0])
9797
self.assertDictContainsSubset(EXAMPLE_6TO4_INPUT, actual)
9898
self.assertIn("source.asn", actual)
9999
self.assertIn("source.as_name", actual)

intelmq/tests/bots/experts/idea/test_expert.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
# -*- coding: utf-8 -*-
66
import unittest
77
import json
8+
import msgpack
89

910
import intelmq.lib.test as test
11+
from intelmq.lib.message import MessageFactory
1012
from intelmq.bots.experts.idea.expert import IdeaExpertBot
1113
from intelmq.lib.harmonization import ClassificationType
1214

@@ -86,10 +88,10 @@ def test_conversion(self):
8688
# The ID in the generated Idea event is random, so we have to extract
8789
# the data from the "output" field and compare after removing ID's
8890
event = self.get_output_queue()[0]
89-
self.assertIsInstance(event, str)
90-
event_dict = json.loads(event)
91+
self.assertIsInstance(event, bytes)
92+
event_dict = MessageFactory.unserialize(event)
9193
self.assertIsInstance(event_dict, dict)
92-
self.assertTrue("output" in event_dict)
94+
self.assertTrue(b"output" in event_dict)
9395
idea_event = json.loads(event_dict["output"])
9496
self.assertIsInstance(idea_event, dict)
9597
del TEST_OUTPUT1["ID"]

0 commit comments

Comments
 (0)