Skip to content

Commit 70f6e43

Browse files
committed
Fixes PYTHON-5126 and adopts agreement that ignored bits raise if non-zero
1 parent 7243b43 commit 70f6e43

File tree

3 files changed

+53
-18
lines changed

3 files changed

+53
-18
lines changed

bson/binary.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,10 @@ def from_vector(
462462
raise ValueError(f"{padding=}. It must be in [0,1, ..7].")
463463
if padding and not vector:
464464
raise ValueError("Empty vector with non-zero padding.")
465+
if padding and not (vector[-1] & ((1 << padding) - 1)) == 0:
466+
raise ValueError(
467+
"If padding p is provided, all bits in the final byte lower than p must be 0."
468+
)
465469
elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32
466470
format_str = "f"
467471
if padding:
@@ -490,6 +494,11 @@ def as_vector(self) -> BinaryVector:
490494
dtype = BinaryVectorDtype(dtype)
491495
n_values = len(self) - position
492496

497+
if padding and dtype != BinaryVectorDtype.PACKED_BIT:
498+
raise ValueError(
499+
f"Corrupt data. Padding ({padding}) must be 0 for all but PACKED_BIT dtypes. ({dtype=})"
500+
)
501+
493502
if dtype == BinaryVectorDtype.INT8:
494503
dtype_format = "b"
495504
format_string = f"<{n_values}{dtype_format}"
@@ -513,6 +522,12 @@ def as_vector(self) -> BinaryVector:
513522
dtype_format = "B"
514523
format_string = f"<{n_values}{dtype_format}"
515524
unpacked_uint8s = list(struct.unpack_from(format_string, self, position))
525+
if padding and not n_values:
526+
raise ValueError("Corrupt data. Vector has a padding P, but no data.")
527+
if padding and n_values and not (unpacked_uint8s[-1] & ((1 << padding) - 1)) == 0:
528+
raise ValueError(
529+
"Corrupt data. Vector has a padding P, but bits in the final byte lower than P are non-zero."
530+
)
516531
return BinaryVector(unpacked_uint8s, dtype, padding)
517532

518533
else:

test/bson_binary_vector/packed_bit.json

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,30 +14,39 @@
1414
{
1515
"description": "Simple Vector PACKED_BIT",
1616
"valid": true,
17-
"vector": [127, 7],
17+
"vector": [127, 8],
1818
"dtype_hex": "0x10",
1919
"dtype_alias": "PACKED_BIT",
2020
"padding": 0,
21-
"canonical_bson": "1600000005766563746F7200040000000910007F0700"
21+
"canonical_bson": "1600000005766563746F7200040000000910007F0800"
2222
},
2323
{
24-
"description": "Empty Vector PACKED_BIT",
24+
"description": "PACKED_BIT with padding",
2525
"valid": true,
26-
"vector": [],
26+
"vector": [127, 8],
2727
"dtype_hex": "0x10",
2828
"dtype_alias": "PACKED_BIT",
29-
"padding": 0,
30-
"canonical_bson": "1400000005766563746F72000200000009100000"
29+
"padding": 3,
30+
"canonical_bson": "1600000005766563746F7200040000000910037F0800"
3131
},
3232
{
33-
"description": "PACKED_BIT with padding",
34-
"valid": true,
33+
"description": "PACKED_BIT with inconsistent padding",
34+
"valid": false,
3535
"vector": [127, 7],
3636
"dtype_hex": "0x10",
3737
"dtype_alias": "PACKED_BIT",
3838
"padding": 3,
3939
"canonical_bson": "1600000005766563746F7200040000000910037F0700"
4040
},
41+
{
42+
"description": "Empty Vector PACKED_BIT",
43+
"valid": true,
44+
"vector": [],
45+
"dtype_hex": "0x10",
46+
"dtype_alias": "PACKED_BIT",
47+
"padding": 0,
48+
"canonical_bson": "1400000005766563746F72000200000009100000"
49+
},
4150
{
4251
"description": "Overflow Vector PACKED_BIT",
4352
"valid": false,

test/test_bson_binary_vector.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,11 @@ def create_test(case_spec):
4848
def run_test(self):
4949
for test_case in case_spec.get("tests", []):
5050
description = test_case["description"]
51-
vector_exp = test_case.get("vector", [])
51+
vector_exp = test_case.get("vector", None)
5252
dtype_hex_exp = test_case["dtype_hex"]
5353
dtype_alias_exp = test_case.get("dtype_alias")
5454
padding_exp = test_case.get("padding", 0)
55-
canonical_bson_exp = test_case.get("canonical_bson")
55+
canonical_bson_exp = test_case.get("canonical_bson", None)
5656
# Convert dtype hex string into bytes
5757
dtype_exp = BinaryVectorDtype(int(dtype_hex_exp, 16).to_bytes(1, byteorder="little"))
5858

@@ -85,14 +85,25 @@ def run_test(self):
8585
self.assertEqual(cB_obs, canonical_bson_exp, description)
8686

8787
else:
88-
with self.assertRaises((struct.error, ValueError), msg=description):
89-
# Tests Binary.from_vector
90-
Binary.from_vector(vector_exp, dtype_exp, padding_exp)
91-
# Tests Binary.as_vector
92-
cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8"))
93-
decoded_doc = decode(cB_exp)
94-
binary_obs = decoded_doc[test_key]
95-
binary_obs.as_vector()
88+
"""
89+
#### To prove correct in an invalid case (`valid:false`), one MUST
90+
- if the vector field is present, raise an exception when attempting to encode a document from the numeric values,
91+
dtype, and padding.
92+
- if the canonical_bson field is present, raise an exception when attempting to deserialize it into the corresponding
93+
numeric values, as the field contains corrupted data.
94+
"""
95+
# Tests Binary.from_vector()
96+
if vector_exp is not None:
97+
with self.assertRaises((struct.error, ValueError), msg=description):
98+
Binary.from_vector(vector_exp, dtype_exp, padding_exp)
99+
100+
# Tests Binary.as_vector()
101+
if canonical_bson_exp is not None:
102+
with self.assertRaises((struct.error, ValueError), msg=description):
103+
cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8"))
104+
decoded_doc = decode(cB_exp)
105+
binary_obs = decoded_doc[test_key]
106+
binary_obs.as_vector()
96107

97108
return run_test
98109

0 commit comments

Comments
 (0)