Skip to content

Commit ed70fce

Browse files
Walzagateau-gg
authored and committed
fix(encoding): Encode document in UTF-8
1 parent 951154f commit ed70fce

File tree

2 files changed

+16
-1
lines changed

2 files changed

+16
-1
lines changed

pygitguardian/models.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def validate_document(self, document: str) -> None:
5656
"""
5757
validate that document is smaller than scan limit
5858
"""
59-
encoded = document.encode("utf-8")
59+
encoded = document.encode("utf-8", errors="replace")
6060
if len(encoded) > DOCUMENT_SIZE_THRESHOLD_BYTES:
6161
raise ValidationError(
6262
"file exceeds the maximum allowed size of {}B".format(
@@ -71,6 +71,15 @@ def replace_0_bytes(self, in_data: Dict[str, Any], **kwargs: Any) -> Dict[str, A
7171
in_data["document"] = doc.replace("\0", "\uFFFD")
7272
return in_data
7373

74+
@post_load
75+
def force_utf_8_encoding(
76+
self, in_data: Dict[str, Any], **kwargs: Any
77+
) -> Dict[str, Any]:
78+
doc = in_data["document"]
79+
# Force UTF-8 and substitute ? for encoding errors
80+
in_data["document"] = doc.encode("utf-8", errors="replace").decode("utf-8")
81+
return in_data
82+
7483

7584
class Document(Base):
7685
"""

tests/test_models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ def test_document_handle_0_bytes(self):
4040
)
4141
assert document["document"] == "hello\uFFFDworld"
4242

43+
def test_document_handle_surrogates(self):
44+
document = Document.SCHEMA.load(
45+
{"filename": "name", "document": "hello\udbdeworld"}
46+
)
47+
assert document["document"] == "hello?world", document
48+
4349
@pytest.mark.parametrize(
4450
"schema_klass, expected_klass, instance_data",
4551
[

0 commit comments

Comments
 (0)