fix: do not make the encoded document larger than expected

agateau-gg · agateau-gg · commit 5164a7208c8f · 2023-07-12T17:29:59.000+02:00
The Unicode replacement character (U+FFFD) becomes 3 bytes in UTF-8
(0xEF 0xBF 0xBD). Replacing \0 (1 byte) with this character causes the
encoded string to grow by two bytes per replaced character, making it
possible for the encoded document to be longer than the maximum document size.

Use the ASCII substitute character instead: it's only 1 byte long in
UTF-8, so it does not make the encoded document grow.
diff --git a/changelog.d/20230712_172917_aurelien.gateau_fix_maximum_size_check.md b/changelog.d/20230712_172917_aurelien.gateau_fix_maximum_size_check.md
@@ -0,0 +1,3 @@
+### Fixed
+
+- Do not make documents longer when preparing them to be sent to the API.
diff --git a/pygitguardian/client.py b/pygitguardian/client.py
@@ -312,8 +312,8 @@ def content_scan(
         """
         content_scan handles the /scan endpoint of the API.
 
-        If document contains `0` bytes, they will be replaced with the Unicode
-        replacement character.
+        If document contains `0` bytes, they will be replaced with the ASCII substitute
+        character.
 
         :param filename: name of file, example: "intro.py"
         :param document: content of file
@@ -355,8 +355,8 @@ def multi_content_scan(
         """
         multi_content_scan handles the /multiscan endpoint of the API.
 
-        If documents contain `0` bytes, they will be replaced with the Unicode
-        replacement character.
+        If documents contain `0` bytes, they will be replaced with the ASCII substitute
+        character.
 
         :param documents: List of dictionaries containing the keys document
         and, optionally, filename.
diff --git a/pygitguardian/models.py b/pygitguardian/models.py
@@ -94,8 +94,12 @@ def validate_size(document: Dict[str, Any], maximum_size: int) -> None:
     @post_load
     def replace_0_bytes(self, in_data: Dict[str, Any], **kwargs: Any) -> Dict[str, Any]:
         doc = in_data["document"]
-        # Our API does not accept 0 bytes in documents, so replace them with the replacement character
-        in_data["document"] = doc.replace("\0", "\uFFFD")
+        # Our API does not accept 0 bytes in documents so replace them with
+        # the ASCII substitute character.
+        # We no longer use the Unicode replacement character (U+FFFD) because
+        # it makes the UTF-8 encoded string two bytes longer per replaced
+        # character, making it possible to hit the maximum size limit.
+        in_data["document"] = doc.replace("\0", "\x1a")
         return in_data
 
     @post_load
diff --git a/tests/test_models.py b/tests/test_models.py
@@ -40,7 +40,7 @@ def test_document_handle_0_bytes(self):
         document = Document.SCHEMA.load(
             {"filename": "name", "document": "hello\0world"}
         )
-        assert document["document"] == "hello\uFFFDworld"
+        assert document["document"] == "hello\x1aworld"
 
     def test_document_handle_surrogates(self):
         document = Document.SCHEMA.load(

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+### Fixed`
	`2`	`+`
	`3`	`+- Do not make documents longer when preparing them to be sent to the API.`
Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,7 @@ def test_document_handle_0_bytes(self):`
`40`	`40`	`document = Document.SCHEMA.load(`
`41`	`41`	`{"filename": "name", "document": "hello\0world"}`
`42`	`42`	`)`
`43`		`- assert document["document"] == "hello\uFFFDworld"`
	`43`	`+ assert document["document"] == "hello\x1aworld"`
`44`	`44`
`45`	`45`	`def test_document_handle_surrogates(self):`
`46`	`46`	`document = Document.SCHEMA.load(`