Skip to content

Commit c352332

Browse files
committed
adding support for serialization of bytes that are not UTF-8 compatible
1 parent 59fbd1d commit c352332

File tree

2 files changed

+126
-1
lines changed

2 files changed

+126
-1
lines changed

deepdiff/serialization.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import orderly_set # NOQA
1313
import collections # NOQA
1414
import ipaddress
15+
import base64
1516
from copy import deepcopy, copy
1617
from functools import partial
1718
from collections.abc import Mapping, KeysView
@@ -607,13 +608,25 @@ def _serialize_tuple(value):
607608
return value
608609

609610

611+
def _serialize_bytes(value):
612+
"""
613+
Serialize bytes to JSON-compatible format.
614+
First tries UTF-8 decoding for backward compatibility.
615+
Falls back to base64 encoding for binary data.
616+
"""
617+
try:
618+
return value.decode('utf-8')
619+
except UnicodeDecodeError:
620+
return base64.b64encode(value).decode('ascii')
621+
622+
610623
JSON_CONVERTOR = {
611624
decimal.Decimal: _serialize_decimal,
612625
SetOrdered: list,
613626
orderly_set.StableSetEq: list,
614627
set: list,
615628
type: lambda x: x.__name__,
616-
bytes: lambda x: x.decode('utf-8'),
629+
bytes: _serialize_bytes,
617630
datetime.datetime: lambda x: x.isoformat(),
618631
uuid.UUID: lambda x: str(x),
619632
np_float32: float,

tests/test_serialization.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import pytest
77
import datetime
88
import numpy as np
9+
import hashlib
10+
import base64
911
from typing import NamedTuple, Optional
1012
from pickle import UnpicklingError
1113
from decimal import Decimal
@@ -455,3 +457,113 @@ def test_dict_keys(self):
455457
dic = {"foo": "bar", "apple": "too sweet"}
456458
serialized = json_dumps(dic.keys())
457459
assert '["foo","apple"]' == serialized
460+
461+
def test_non_utf8_bytes_serialization(self):
    """Non-UTF-8 byte payloads must serialize as base64 text and round-trip."""
    # 0x80-0x8f are invalid as a standalone UTF-8 sequence.
    raw = bytes(range(0x80, 0x90))

    # Sanity check: this payload is genuinely not UTF-8 decodable.
    with pytest.raises(UnicodeDecodeError):
        raw.decode('utf-8')

    serialized = json_dumps({"binary": raw})

    # The base64 rendering of the payload must appear in the JSON output.
    encoded = base64.b64encode(raw).decode('ascii')
    assert encoded in serialized

    # And the JSON must load back cleanly.
    restored = json_loads(serialized)
    assert restored == {"binary": encoded}
481+
482+
def test_hash_bytes_serialization(self):
    """Digest-style binary blobs serialize via UTF-8 or the base64 fallback."""
    digests = [
        hashlib.md5(b"test").digest(),
        hashlib.sha1(b"test").digest(),
        hashlib.sha256(b"test").digest(),
        hashlib.sha512(b"test").digest()[:16],  # truncated digest
        b'\xff\xfe\xfd\xfc' * 8,  # synthetic binary pattern
    ]

    for blob in digests:
        # Serialization must never raise UnicodeDecodeError.
        serialized = json_dumps({"hash": blob})
        assert serialized  # valid, non-empty JSON

        try:
            as_text = blob.decode('utf-8')
        except UnicodeDecodeError:
            # Not UTF-8 decodable: expect the base64 rendering instead.
            assert base64.b64encode(blob).decode('ascii') in serialized
        else:
            # UTF-8 decodable: the decoded text appears verbatim.
            assert as_text in serialized
509+
510+
def test_mixed_utf8_and_binary_bytes(self):
    """A mapping mixing text-like and binary bytes serializes coherently."""
    payload = {
        "utf8_text": b"hello world",                       # UTF-8 decodable
        "binary_hash": hashlib.sha256(b"secret").digest(), # binary
        "empty_bytes": b"",                                # edge case
        "utf8_unicode": "café".encode('utf-8'),            # UTF-8 with non-ASCII
        "non_utf8_byte": b"\xff\xfe\xfd",                  # not UTF-8 decodable
    }

    serialized = json_dumps(payload)
    restored = json_loads(serialized)

    # Text-like bytes come back as their decoded strings.
    assert "hello world" in serialized
    assert restored["utf8_text"] == "hello world"
    assert restored["utf8_unicode"] == "café"
    assert restored["empty_bytes"] == ""

    # Binary payloads come back base64-encoded.
    hash_b64 = base64.b64encode(payload["binary_hash"]).decode('ascii')
    assert hash_b64 in serialized
    assert restored["binary_hash"] == hash_b64

    junk_b64 = base64.b64encode(payload["non_utf8_byte"]).decode('ascii')
    assert junk_b64 in serialized
    assert restored["non_utf8_byte"] == junk_b64
543+
544+
def test_bytes_in_deepdiff_serialization(self):
    """DeepDiff results whose values are bytes serialize to JSON cleanly."""
    left = {
        "text": b"hello",
        "hash": hashlib.sha256(b"data1").digest(),
    }
    right = {
        "text": b"world",
        "hash": hashlib.sha256(b"data2").digest(),
    }

    # Serialization of the diff must not raise.
    json_output = DeepDiff(left, right).to_json()
    assert json_output

    # UTF-8 decodable values appear as plain text...
    assert "hello" in json_output
    assert "world" in json_output

    # ...while the digests appear base64-encoded.
    for source in (left, right):
        expected = base64.b64encode(source["hash"]).decode('ascii')
        assert expected in json_output

0 commit comments

Comments
 (0)