|
6 | 6 | import pytest |
7 | 7 | import datetime |
8 | 8 | import numpy as np |
| 9 | +import hashlib |
| 10 | +import base64 |
9 | 11 | from typing import NamedTuple, Optional |
10 | 12 | from pickle import UnpicklingError |
11 | 13 | from decimal import Decimal |
@@ -455,3 +457,113 @@ def test_dict_keys(self): |
455 | 457 | dic = {"foo": "bar", "apple": "too sweet"} |
456 | 458 | serialized = json_dumps(dic.keys()) |
457 | 459 | assert '["foo","apple"]' == serialized |
| 460 | + |
def test_non_utf8_bytes_serialization(self):
    """Bytes that cannot be decoded as UTF-8 must come out base64 encoded.

    The value is serialized inside a one-key dict; the base64 text must
    appear in the JSON output and must be what json_loads gives back.
    """
    raw = b'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f'

    # Guard the premise of the test: this payload really is undecodable.
    with pytest.raises(UnicodeDecodeError):
        raw.decode('utf-8')

    encoded = base64.b64encode(raw).decode('ascii')
    payload = json_dumps({"binary": raw})

    # The base64 form shows up verbatim in the serialized text ...
    assert encoded in payload

    # ... and loading the JSON yields that base64 string, not the raw bytes.
    assert json_loads(payload) == {"binary": encoded}
| 481 | + |
def test_hash_bytes_serialization(self):
    """Check that digest-like byte values serialize without a decode error.

    Each sample is serialized as the value of a one-key dict. If the bytes
    happen to be valid UTF-8, the decoded text must appear in the output;
    otherwise the base64 form of the bytes must appear instead.
    """
    test_cases = [
        hashlib.md5(b"test").digest(),
        hashlib.sha1(b"test").digest(),
        hashlib.sha256(b"test").digest(),
        hashlib.sha512(b"test").digest()[:16],  # Truncated
        b'\xff\xfe\xfd\xfc' * 8,  # Artificial binary pattern
    ]

    for hash_bytes in test_cases:
        # Must not raise UnicodeDecodeError.
        serialized = json_dumps({"hash": hash_bytes})
        assert serialized  # non-empty output

        # Only the decode itself is guarded, so an assertion failure can
        # never be confused with the decode probe.
        try:
            expected = hash_bytes.decode('utf-8')
        except UnicodeDecodeError:
            # Not UTF-8 decodable: expect the base64 representation.
            expected = base64.b64encode(hash_bytes).decode('ascii')

        assert expected in serialized
| 509 | + |
def test_mixed_utf8_and_binary_bytes(self):
    """Serialize one dict holding both decodable and undecodable bytes.

    Decodable values (including the empty bytes object) are expected back
    as plain strings; the other values are expected back as base64 text.
    """
    def as_b64(raw):
        # Expected textual form for bytes that are not UTF-8 decodable.
        return base64.b64encode(raw).decode('ascii')

    digest = hashlib.sha256(b"secret").digest()
    undecodable = b"\xff\xfe\xfd"
    payload = {
        "utf8_text": b"hello world",
        "binary_hash": digest,
        "empty_bytes": b"",
        "utf8_unicode": "café".encode('utf-8'),
        "non_utf8_byte": undecodable,
    }

    # Neither direction may raise.
    as_json = json_dumps(payload)
    restored = json_loads(as_json)

    # ASCII bytes stay readable text.
    assert "hello world" in as_json
    assert restored["utf8_text"] == "hello world"

    # Multi-byte UTF-8 is decoded back to the original string.
    assert restored["utf8_unicode"] == "café"

    # The digest is carried as base64.
    digest_b64 = as_b64(digest)
    assert digest_b64 in as_json
    assert restored["binary_hash"] == digest_b64

    # Empty bytes become an empty string.
    assert restored["empty_bytes"] == ""

    # Short undecodable sequence is carried as base64 as well.
    undecodable_b64 = as_b64(undecodable)
    assert undecodable_b64 in as_json
    assert restored["non_utf8_byte"] == undecodable_b64
| 543 | + |
def test_bytes_in_deepdiff_serialization(self):
    """DeepDiff(...).to_json() must handle bytes values on both sides.

    The two inputs differ in a short ASCII bytes value and in a SHA-256
    digest; the JSON output is expected to mention the ASCII values as
    text and the digests in base64 form.
    """
    digest_one = hashlib.sha256(b"data1").digest()
    digest_two = hashlib.sha256(b"data2").digest()
    t1 = {"text": b"hello", "hash": digest_one}
    t2 = {"text": b"world", "hash": digest_two}

    # Producing the JSON must not raise and must yield something non-empty.
    json_output = DeepDiff(t1, t2).to_json()
    assert json_output

    # Decodable values appear as ordinary text.
    for word in ("hello", "world"):
        assert word in json_output

    # Digests appear base64 encoded.
    for digest in (digest_one, digest_two):
        assert base64.b64encode(digest).decode('ascii') in json_output
0 commit comments