Skip to content

Commit fbb56a2

Browse files
committed
PYTHON-1820 Validate bson size in RawBSONDocument init
Also fixes a bug where an empty BSON document could not be represented by RawBSONDocument.
1 parent 2cb34e4 commit fbb56a2

File tree

4 files changed

+75
-38
lines changed

4 files changed

+75
-38
lines changed

bson/__init__.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -183,14 +183,26 @@ def _get_string(data, position, obj_end, opts, dummy):
183183
opts.unicode_decode_error_handler, True)[0], end + 1
184184

185185

186-
def _get_object(data, position, obj_end, opts, dummy):
187-
"""Decode a BSON subdocument to opts.document_class or bson.dbref.DBRef."""
188-
obj_size = _UNPACK_INT(data[position:position + 4])[0]
186+
def _get_object_size(data, position, obj_end):
187+
"""Validate and return a BSON document's size."""
188+
try:
189+
obj_size = _UNPACK_INT(data[position:position + 4])[0]
190+
except struct.error as exc:
191+
raise InvalidBSON(str(exc))
189192
end = position + obj_size - 1
190-
if data[end:position + obj_size] != b"\x00":
193+
if data[end:end + 1] != b"\x00":
191194
raise InvalidBSON("bad eoo")
192195
if end >= obj_end:
193196
raise InvalidBSON("invalid object length")
197+
# If this is the top-level document, validate the total size too.
198+
if position == 0 and obj_size != obj_end:
199+
raise InvalidBSON("invalid object length")
200+
return obj_size, end
201+
202+
203+
def _get_object(data, position, obj_end, opts, dummy):
204+
"""Decode a BSON subdocument to opts.document_class or bson.dbref.DBRef."""
205+
obj_size, end = _get_object_size(data, position, obj_end)
194206
if _raw_document_class(opts.document_class):
195207
return (opts.document_class(data[position:end + 1], opts),
196208
position + obj_size)
@@ -406,38 +418,26 @@ def _element_to_dict(data, position, obj_end, opts):
406418
_element_to_dict = _cbson._element_to_dict
407419

408420

409-
def _iterate_elements(data, position, obj_end, opts):
421+
def _elements_to_dict(data, position, obj_end, opts, result=None):
422+
"""Decode a BSON document into result."""
423+
if result is None:
424+
result = opts.document_class()
410425
end = obj_end - 1
411426
while position < end:
412-
(key, value, position) = _element_to_dict(data, position, obj_end, opts)
413-
yield key, value, position
414-
415-
416-
def _elements_to_dict(data, position, obj_end, opts):
417-
"""Decode a BSON document."""
418-
result = opts.document_class()
419-
pos = position
420-
for key, value, pos in _iterate_elements(data, position, obj_end, opts):
427+
key, value, position = _element_to_dict(data, position, obj_end, opts)
421428
result[key] = value
422-
if pos != obj_end:
429+
if position != obj_end:
423430
raise InvalidBSON('bad object or element length')
424431
return result
425432

426433

427434
def _bson_to_dict(data, opts):
428435
"""Decode a BSON string to document_class."""
429-
try:
430-
obj_size = _UNPACK_INT(data[:4])[0]
431-
except struct.error as exc:
432-
raise InvalidBSON(str(exc))
433-
if obj_size != len(data):
434-
raise InvalidBSON("invalid object size")
435-
if data[obj_size - 1:obj_size] != b"\x00":
436-
raise InvalidBSON("bad eoo")
437436
try:
438437
if _raw_document_class(opts.document_class):
439438
return opts.document_class(data, opts)
440-
return _elements_to_dict(data, 4, obj_size - 1, opts)
439+
_, end = _get_object_size(data, 0, len(data))
440+
return _elements_to_dict(data, 4, end, opts)
441441
except InvalidBSON:
442442
raise
443443
except Exception:

bson/raw_bson.py

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,10 @@
1515
"""Tools for representing raw BSON documents.
1616
"""
1717

18-
from bson import _UNPACK_INT, _iterate_elements
18+
from bson import _elements_to_dict, _get_object_size
1919
from bson.py3compat import abc, iteritems
2020
from bson.codec_options import (
2121
DEFAULT_CODEC_OPTIONS as DEFAULT, _RAW_BSON_DOCUMENT_MARKER)
22-
from bson.errors import InvalidBSON
2322

2423

2524
class RawBSONDocument(abc.Mapping):
@@ -34,12 +33,33 @@ class RawBSONDocument(abc.Mapping):
3433
_type_marker = _RAW_BSON_DOCUMENT_MARKER
3534

3635
def __init__(self, bson_bytes, codec_options=None):
37-
"""Create a new :class:`RawBSONDocument`.
36+
"""Create a new :class:`RawBSONDocument`
37+
38+
:class:`RawBSONDocument` is a representation of a BSON document that
39+
provides access to the underlying raw BSON bytes. Only when a field is
40+
accessed or modified within the document does RawBSONDocument decode
41+
its bytes.
42+
43+
:class:`RawBSONDocument` implements the ``Mapping`` abstract base
44+
class from the standard library so it can be used like a read-only
45+
``dict``::
46+
47+
>>> raw_doc = RawBSONDocument(BSON.encode({'_id': 'my_doc'}))
48+
>>> raw_doc.raw
49+
b'...'
50+
>>> raw_doc['_id']
51+
'my_doc'
3852
3953
:Parameters:
4054
- `bson_bytes`: the BSON bytes that compose this document
4155
- `codec_options` (optional): An instance of
42-
:class:`~bson.codec_options.CodecOptions`.
56+
:class:`~bson.codec_options.CodecOptions` whose ``document_class``
57+
must be :class:`RawBSONDocument`. The default is
58+
:attr:`DEFAULT_RAW_BSON_OPTIONS`.
59+
60+
.. versionchanged:: 3.8
61+
:class:`RawBSONDocument` now validates that the ``bson_bytes``
62+
passed in represent a single bson document.
4363
4464
.. versionchanged:: 3.5
4565
If a :class:`~bson.codec_options.CodecOptions` is passed in, its
@@ -56,6 +76,8 @@ def __init__(self, bson_bytes, codec_options=None):
5676
"RawBSONDocument cannot use CodecOptions with document "
5777
"class %s" % (codec_options.document_class, ))
5878
self.__codec_options = codec_options
79+
# Validate the bson object size.
80+
_get_object_size(bson_bytes, 0, len(bson_bytes))
5981

6082
@property
6183
def raw(self):
@@ -70,16 +92,9 @@ def items(self):
7092
def __inflated(self):
7193
if self.__inflated_doc is None:
7294
# We already validated the object's size when this document was
73-
# created, so no need to do that again. We still need to check the
74-
# size of all the elements and compare to the document size.
75-
object_size = _UNPACK_INT(self.__raw[:4])[0] - 1
76-
position = 0
77-
self.__inflated_doc = {}
78-
for key, value, position in _iterate_elements(
79-
self.__raw, 4, object_size, self.__codec_options):
80-
self.__inflated_doc[key] = value
81-
if position != object_size:
82-
raise InvalidBSON('bad object or element length')
95+
# created, so no need to do that again.
96+
self.__inflated_doc = _elements_to_dict(
97+
self.__raw, 4, len(self.__raw)-1, self.__codec_options, {})
8398
return self.__inflated_doc
8499

85100
def __getitem__(self, item):
@@ -102,3 +117,6 @@ def __repr__(self):
102117

103118

104119
DEFAULT_RAW_BSON_OPTIONS = DEFAULT.with_options(document_class=RawBSONDocument)
120+
"""The default :class:`~bson.codec_options.CodecOptions` for
121+
:class:`RawBSONDocument`.
122+
"""

doc/changelog.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,9 @@ Changes in Version 3.8.0.dev0
9898
supported since PyMongo 2.7. Valid values are `pythonLegacy` (the default),
9999
`javaLegacy`, `csharpLegacy` and `standard`. New applications should consider
100100
setting this to `standard` for cross language compatibility.
101+
- :class:`~bson.raw_bson.RawBSONDocument` now validates that the ``bson_bytes``
102+
passed in represent a single BSON document. Earlier versions would mistakenly
103+
accept multiple BSON documents.
101104

102105
Issues Resolved
103106
...............

test/test_raw_bson.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from bson import BSON
1919
from bson.binary import JAVA_LEGACY
2020
from bson.codec_options import CodecOptions
21+
from bson.errors import InvalidBSON
2122
from bson.raw_bson import RawBSONDocument
2223
from test import client_context, unittest
2324

@@ -51,6 +52,21 @@ def test_decode(self):
5152
def test_raw(self):
5253
self.assertEqual(self.bson_string, self.document.raw)
5354

55+
def test_empty_doc(self):
56+
doc = RawBSONDocument(BSON.encode({}))
57+
with self.assertRaises(KeyError):
58+
doc['does-not-exist']
59+
60+
def test_invalid_bson_sequence(self):
61+
bson_byte_sequence = BSON.encode({'a': 1})+BSON.encode({})
62+
with self.assertRaisesRegex(InvalidBSON, 'invalid object length'):
63+
RawBSONDocument(bson_byte_sequence)
64+
65+
def test_invalid_bson_eoo(self):
66+
invalid_bson_eoo = BSON.encode({'a': 1})[:-1] + b'\x01'
67+
with self.assertRaisesRegex(InvalidBSON, 'bad eoo'):
68+
RawBSONDocument(invalid_bson_eoo)
69+
5470
@client_context.require_connection
5571
def test_round_trip(self):
5672
db = self.client.get_database(

0 commit comments

Comments
 (0)