Skip to content

Commit 6801abe

Browse files
Fix #581 (#585)
1 parent 7e9101f commit 6801abe

File tree

10 files changed

+141
-30
lines changed

10 files changed

+141
-30
lines changed

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,20 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [2.3.3] - 2025-06-??
9+
10+
- Make `charset_normalizer` an **optional** dependency. This library is
11+
optional and is only used when a `UnicodeDecodeError` exception occurs while
12+
parsing the body of a web request. This can happen in two circumstances:
13+
when the client sends a payload specifying the wrong encoding in the
14+
`Content-Type` request header, or when the client sends a payload that is not
15+
`UTF-8` encoded and without specifying the charset encoding.
16+
- Correct a bug in the `parse_charset` function that prevented proper parsing and
17+
optimal handling of input encodings other than `UTF-8`. Parsing still
18+
worked in this case because of the automatic fallback to `charset_normalizer`.
19+
- Correct the output of `request.charset` when the charset is obtained from
20+
the `Content-Type` request header.
21+
822
## [2.3.2] - 2025-06-17 :telescope:
923

1024
- Add built-in features to enable `OpenTelemetry` logging for all web requests

blacksheep/messages.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,8 @@
66
from typing import TYPE_CHECKING, Optional
77
from urllib.parse import parse_qs, quote, unquote, urlencode
88

9-
import charset_normalizer
10-
119
from blacksheep.multipart import parse_multipart
10+
from blacksheep.settings.encodings import encodings_settings
1211
from blacksheep.settings.json import json_settings
1312
from blacksheep.utils.time import utcnow
1413

@@ -26,13 +25,13 @@
2625
if TYPE_CHECKING:
2726
from blacksheep.sessions import Session
2827

29-
_charset_rx = re.compile(rb"charset=([^;]+)\s", re.I)
28+
_charset_rx = re.compile(rb"charset=([\w\-]+)", re.I)
3029

3130

3231
def parse_charset(value: bytes):
33-
m = _charset_rx.match(value)
32+
m = _charset_rx.search(value)
3433
if m:
35-
return m.group(1).decode("utf8")
34+
return m.group(1).decode("ascii")
3635
return None
3736

3837

@@ -162,12 +161,8 @@ async def text(self):
162161
return ""
163162
try:
164163
return body.decode(self.charset)
165-
except UnicodeDecodeError:
166-
if self.charset != "ISO-8859-1":
167-
try:
168-
return body.decode("ISO-8859-1")
169-
except UnicodeDecodeError:
170-
return body.decode(charset_normalizer.detect(body)["encoding"])
164+
except UnicodeDecodeError as decode_error:
165+
return encodings_settings.decode(body, decode_error)
171166

172167
async def form(self):
173168
content_type_value = self.content_type()

blacksheep/messages.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,3 +176,4 @@ def is_cors_request(request: Request) -> bool: ...
176176
def is_cors_preflight_request(request: Request) -> bool: ...
177177
def get_request_absolute_url(request: Request) -> URL: ...
178178
def get_absolute_url_to_path(request: Request, path: str) -> URL: ...
179+
def parse_charset(value: bytes) -> str: ...

blacksheep/messages.pyx

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,9 @@ from datetime import datetime, timedelta
55
from json.decoder import JSONDecodeError
66
from urllib.parse import parse_qs, quote, unquote, urlencode
77

8-
import charset_normalizer
9-
108
from blacksheep.multipart import parse_multipart
119
from blacksheep.sessions import Session
10+
from blacksheep.settings.encodings import encodings_settings
1211
from blacksheep.settings.json import json_settings
1312
from blacksheep.utils.time import utcnow
1413

@@ -28,13 +27,13 @@ from .exceptions cimport (
2827
from .headers cimport Headers
2928
from .url cimport URL, build_absolute_url
3029

31-
_charset_rx = re.compile(b'charset=([^;]+)\\s', re.I)
30+
_charset_rx = re.compile(rb"charset=([\w\-]+)", re.I)
3231

3332

3433
cpdef str parse_charset(bytes value):
35-
m = _charset_rx.match(value)
34+
m = _charset_rx.search(value)
3635
if m:
37-
return m.group(1).decode('utf8')
36+
return m.group(1).decode("ascii")
3837
return None
3938

4039

@@ -181,21 +180,12 @@ cdef class Message:
181180

182181
async def text(self):
183182
body = await self.read()
184-
185183
if body is None:
186184
return ""
187185
try:
188186
return body.decode(self.charset)
189-
except UnicodeDecodeError:
190-
# this can happen when the server returned a declared charset,
191-
# but its content is not actually using the declared encoding
192-
# a common encoding is 'ISO-8859-1', so before using chardet, we try with this
193-
if self.charset != 'ISO-8859-1':
194-
try:
195-
return body.decode('ISO-8859-1')
196-
except UnicodeDecodeError:
197-
# fallback to trying to detect the encoding;
198-
return body.decode(charset_normalizer.detect(body)['encoding'])
187+
except UnicodeDecodeError as decode_error:
188+
return encodings_settings.decode(body, decode_error)
199189

200190
async def form(self):
201191
cdef str text

blacksheep/server/bindings.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,13 @@ def example(id: str):
326326
"""
327327
try:
328328
value = await self.get_value(request)
329+
except UnicodeDecodeError as decode_error:
330+
raise BadRequest(
331+
f"Unicode decode error. "
332+
f"Cannot decode the request content using: {decode_error.encoding}. "
333+
"Ensure the request content is encoded using the encoding declared in "
334+
"the Content-Type request header."
335+
)
329336
except ValueError as value_error:
330337
raise BadRequest("Invalid parameter.") from value_error
331338

blacksheep/settings/encodings.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
from abc import ABC, abstractmethod
2+
3+
try:
4+
import charset_normalizer
5+
except ImportError:
6+
charset_normalizer = None
7+
8+
9+
class Decoder(ABC):
10+
"""
11+
Abstract base class for byte sequence decoders.
12+
13+
Implementations of this class provide a strategy for decoding bytes into
14+
strings, used when a UnicodeDecodeError occurs during standard
15+
decoding. Subclasses must implement the `decode` method, which receives
16+
the bytes to decode and the original UnicodeDecodeError.
17+
18+
Methods:
19+
decode(value: bytes, decode_error: UnicodeDecodeError) -> str:
20+
Attempts to decode the given bytes. Should raise the provided
21+
decode_error if decoding is not possible.
22+
"""
23+
24+
@abstractmethod
25+
def decode(self, value: bytes, decode_error: UnicodeDecodeError) -> str: ...
26+
27+
28+
class DefaultDecoder(Decoder):
29+
"""
30+
Decoder implementation that attempts to detect the encoding using charset_normalizer
31+
if available. If charset_normalizer is not available, it raises again the
32+
UnicodeDecodeError.
33+
"""
34+
35+
def decode(self, value: bytes, decode_error: UnicodeDecodeError) -> str:
36+
if charset_normalizer is None:
37+
raise decode_error
38+
detected_encoding = charset_normalizer.detect(value)["encoding"]
39+
if detected_encoding is None:
40+
raise decode_error
41+
return value.decode(detected_encoding)
42+
43+
44+
class NoopDecoder(Decoder):
45+
"""
46+
A decoder implementation that does not attempt to decode input bytes.
47+
48+
This class always raises the provided UnicodeDecodeError when its decode
49+
method is called. It can be used to disable automatic encoding detection
50+
and force strict decoding behavior, ensuring that decoding errors are
51+
not silently handled or guessed.
52+
53+
Methods:
54+
decode(value: bytes, decode_error: UnicodeDecodeError) -> str:
55+
Always raises the provided decode_error.
56+
"""
57+
58+
def decode(self, value: bytes, decode_error: UnicodeDecodeError) -> str:
59+
raise decode_error
60+
61+
62+
class EncodingsSettings:
63+
"""
64+
Manages the decoding strategy for byte sequences in the application.
65+
66+
EncodingsSettings allows configuring which Decoder implementation is used
67+
to decode bytes when a UnicodeDecodeError occurs. By default, it uses
68+
DefaultDecoder, which attempts to detect the encoding using charset_normalizer
69+
if available. The decoder can be replaced at runtime using the `use` method.
70+
71+
Methods:
72+
use(decoder: Decoder) -> None:
73+
Sets the decoder to be used for decoding operations.
74+
75+
decode(value: bytes, decode_error: UnicodeDecodeError) -> str:
76+
Decodes the given bytes using the configured decoder. If decoding fails,
77+
the provided UnicodeDecodeError is raised or handled according to the
78+
decoder.
79+
"""
80+
81+
def __init__(self) -> None:
82+
self._decoder = DefaultDecoder()
83+
84+
def use(self, decoder: Decoder) -> None:
85+
self._decoder = decoder
86+
87+
def decode(self, value: bytes, decode_error: UnicodeDecodeError) -> str:
88+
return self._decoder.decode(value, decode_error)
89+
90+
91+
encodings_settings = EncodingsSettings()

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ classifiers = [
2424
keywords = ["blacksheep", "web framework", "asyncio"]
2525
dependencies = [
2626
"certifi>=2025.4.26",
27-
"charset-normalizer~=3.4.2",
2827
"guardpost>=1.0.2",
2928
"rodi~=2.0.8",
3029
"essentials>=1.1.4,<2.0",

requirements.pypy.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
certifi>=2025.4.26
2-
charset-normalizer~=3.4.2
32
guardpost>=1.0.2
43
rodi~=2.0.2
54
essentials>=1.1.4,<2.0

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
certifi>=2025.4.26
2-
charset-normalizer~=3.4.2
32
guardpost>=1.0.2
43
rodi~=2.0.2
54
essentials>=1.1.4,<2.0

tests/test_requests.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,3 +635,19 @@ async def content_gen():
635635
bytes(data)
636636
== b"POST / HTTP/1.1\r\ncontent-type: text/plain\r\ncontent-length: 10\r\n\r\nHelloWorld"
637637
)
638+
639+
640+
@pytest.mark.parametrize(
641+
"content_type_header,expected_charset",
642+
[
643+
("text/plain; charset=UTF-8", "UTF-8"),
644+
("application/json", "utf8"), # default
645+
("application/json; charset=utf-8", "utf-8"),
646+
("application/json; charset=ISO-8859-1", "ISO-8859-1"),
647+
("text/html; charset=ISO-8859-1", "ISO-8859-1"),
648+
("application/xml; charset=utf-8", "utf-8"),
649+
],
650+
)
651+
def test_request_charset(content_type_header, expected_charset):
652+
request = Request("POST", b"/", [(b"Content-Type", content_type_header.encode())])
653+
assert request.charset == expected_charset

0 commit comments

Comments
 (0)