Fix #581 (#585)

RobertoPrevato · RobertoPrevato · commit 6801abe96ae0 · 2025-06-22T07:55:33.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,20 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [2.3.3] - 2025-06-??
+
+- Make `charset_normalizer` an **optional** dependency. The library is only
+  used when a `UnicodeDecodeError` exception occurs while parsing the body
+  of a web request. This can happen in two circumstances: when the client
+  sends a payload declaring the wrong encoding in the `Content-Type` request
+  header, or when the client sends a payload that is not `UTF-8` encoded
+  without specifying the charset.
+- Correct a bug in the `parse_charset` function that prevented proper parsing
+  and optimal handling of input encodings other than `UTF-8`. Parsing still
+  worked in this case because of the automatic fallback to `charset_normalizer`.
+- Correct the output of `request.charset` when the charset is obtained from
+  the `Content-Type` request header.
+
 ## [2.3.2] - 2025-06-17 :telescope:
 
 - Add built-in features to enable `OpenTelemetry` logging for all web requests
diff --git a/blacksheep/messages.py b/blacksheep/messages.py
@@ -6,9 +6,8 @@
 from typing import TYPE_CHECKING, Optional
 from urllib.parse import parse_qs, quote, unquote, urlencode
 
-import charset_normalizer
-
 from blacksheep.multipart import parse_multipart
+from blacksheep.settings.encodings import encodings_settings
 from blacksheep.settings.json import json_settings
 from blacksheep.utils.time import utcnow
 
@@ -26,13 +25,13 @@
 if TYPE_CHECKING:
     from blacksheep.sessions import Session
 
-_charset_rx = re.compile(rb"charset=([^;]+)\s", re.I)
+# The value may be a quoted-string (RFC 9110), e.g. charset="utf-8".
+_charset_rx = re.compile(rb'charset="?([\w\-]+)', re.I)
 
 
 def parse_charset(value: bytes):
-    m = _charset_rx.match(value)
-    if m:
-        return m.group(1).decode("utf8")
-    return None
+    """Return the charset declared in a Content-Type value, or None."""
+    m = _charset_rx.search(value)
+    return m.group(1).decode("ascii") if m else None
 
 
@@ -162,12 +161,8 @@ async def text(self):
             return ""
         try:
             return body.decode(self.charset)
-        except UnicodeDecodeError:
-            if self.charset != "ISO-8859-1":
-                try:
-                    return body.decode("ISO-8859-1")
-                except UnicodeDecodeError:
-                    return body.decode(charset_normalizer.detect(body)["encoding"])
+        except UnicodeDecodeError as decode_error:
+            return encodings_settings.decode(body, decode_error)
 
     async def form(self):
         content_type_value = self.content_type()
diff --git a/blacksheep/messages.pyi b/blacksheep/messages.pyi
@@ -176,3 +176,4 @@ def is_cors_request(request: Request) -> bool: ...
 def is_cors_preflight_request(request: Request) -> bool: ...
 def get_request_absolute_url(request: Request) -> URL: ...
 def get_absolute_url_to_path(request: Request, path: str) -> URL: ...
+def parse_charset(value: bytes) -> str | None: ...
diff --git a/blacksheep/messages.pyx b/blacksheep/messages.pyx
@@ -5,10 +5,9 @@ from datetime import datetime, timedelta
 from json.decoder import JSONDecodeError
 from urllib.parse import parse_qs, quote, unquote, urlencode
 
-import charset_normalizer
-
 from blacksheep.multipart import parse_multipart
 from blacksheep.sessions import Session
+from blacksheep.settings.encodings import encodings_settings
 from blacksheep.settings.json import json_settings
 from blacksheep.utils.time import utcnow
 
@@ -28,13 +27,13 @@ from .exceptions cimport (
 from .headers cimport Headers
 from .url cimport URL, build_absolute_url
 
-_charset_rx = re.compile(b'charset=([^;]+)\\s', re.I)
+# The value may be a quoted-string (RFC 9110), e.g. charset="utf-8".
+_charset_rx = re.compile(rb'charset="?([\w\-]+)', re.I)
 
 
 cpdef str parse_charset(bytes value):
-    m = _charset_rx.match(value)
-    if m:
-        return m.group(1).decode('utf8')
-    return None
+    """Return the charset declared in a Content-Type value, or None."""
+    m = _charset_rx.search(value)
+    return m.group(1).decode("ascii") if m else None
 
 
@@ -181,21 +180,12 @@ cdef class Message:
 
     async def text(self):
         body = await self.read()
-
         if body is None:
             return ""
         try:
             return body.decode(self.charset)
-        except UnicodeDecodeError:
-            # this can happen when the server returned a declared charset,
-            # but its content is not actually using the declared encoding
-            # a common encoding is 'ISO-8859-1', so before using chardet, we try with this
-            if self.charset != 'ISO-8859-1':
-                try:
-                    return body.decode('ISO-8859-1')
-                except UnicodeDecodeError:
-                    # fallback to trying to detect the encoding;
-                    return body.decode(charset_normalizer.detect(body)['encoding'])
+        except UnicodeDecodeError as decode_error:  # not decodable as self.charset
+            return encodings_settings.decode(body, decode_error)
 
     async def form(self):
         cdef str text
diff --git a/blacksheep/server/bindings.py b/blacksheep/server/bindings.py
@@ -326,6 +326,13 @@ def example(id: str):
         """
         try:
             value = await self.get_value(request)
+        except UnicodeDecodeError as decode_error:
+            raise BadRequest(
+                f"Unicode decode error. "
+                f"Cannot decode the request content using: {decode_error.encoding}. "
+                "Ensure the request content is encoded using the encoding declared in "
+                "the Content-Type request header."
+            )
         except ValueError as value_error:
             raise BadRequest("Invalid parameter.") from value_error
 
diff --git a/blacksheep/settings/encodings.py b/blacksheep/settings/encodings.py
@@ -0,0 +1,91 @@
+from abc import ABC, abstractmethod
+
+try:
+    import charset_normalizer
+except ImportError:
+    charset_normalizer = None
+
+
+class Decoder(ABC):
+    """
+    Abstract base class for fallback strategies that turn bytes into text.
+
+    A decoder is meant to be used after a regular decode attempt has already
+    failed with a UnicodeDecodeError. Subclasses must implement `decode`,
+    which receives the raw bytes together with that original error, so the
+    error can be raised again when no alternative decoding is possible.
+
+    Methods:
+        decode(value: bytes, decode_error: UnicodeDecodeError) -> str:
+            Returns the decoded text. Should raise the provided
+            decode_error if decoding is not possible.
+    """
+
+    @abstractmethod
+    def decode(self, value: bytes, decode_error: UnicodeDecodeError) -> str: ...
+
+
+class DefaultDecoder(Decoder):
+    """
+    Decoder that asks charset_normalizer, when it is installed, to detect the
+    encoding of the bytes. When the library is missing, or no encoding is
+    detected, the original UnicodeDecodeError is raised again.
+    """
+
+    def decode(self, value: bytes, decode_error: UnicodeDecodeError) -> str:
+        detected_encoding = None
+        if charset_normalizer is not None:
+            detected_encoding = charset_normalizer.detect(value)["encoding"]
+        if detected_encoding is None:
+            raise decode_error
+        return value.decode(detected_encoding)
+
+
+class NoopDecoder(Decoder):
+    """
+    A decoder implementation that never guesses an encoding.
+
+    Its decode method always raises the UnicodeDecodeError it is given.
+    Configure it with `encodings_settings.use(NoopDecoder())` to disable
+    automatic encoding detection and force strict decoding behavior, so
+    that decoding errors are not silently handled or guessed.
+
+    Methods:
+        decode(value: bytes, decode_error: UnicodeDecodeError) -> str:
+            Always raises the provided decode_error.
+    """
+
+    def decode(self, value: bytes, decode_error: UnicodeDecodeError) -> str:
+        raise decode_error
+
+
+class EncodingsSettings:
+    """
+    Manages the decoding strategy for byte sequences in the application.
+
+    EncodingsSettings holds the Decoder implementation that is used to decode
+    bytes when a UnicodeDecodeError occurs. By default, it uses
+    DefaultDecoder, which attempts to detect the encoding using charset_normalizer
+    if available. The decoder can be replaced at runtime using the `use` method.
+
+    Methods:
+        use(decoder: Decoder) -> None:
+            Sets the decoder to be used for decoding operations.
+
+        decode(value: bytes, decode_error: UnicodeDecodeError) -> str:
+            Delegates to the configured decoder and returns its result;
+            errors raised by the decoder propagate to the caller.
+    """
+
+    def __init__(self) -> None:
+        # Annotated with the base type so that `use` may assign any Decoder.
+        self._decoder: Decoder = DefaultDecoder()
+
+    def use(self, decoder: Decoder) -> None:
+        self._decoder = decoder
+
+    def decode(self, value: bytes, decode_error: UnicodeDecodeError) -> str:
+        return self._decoder.decode(value, decode_error)
+
+
+encodings_settings = EncodingsSettings()
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,7 +24,6 @@ classifiers = [
 keywords = ["blacksheep", "web framework", "asyncio"]
 dependencies = [
     "certifi>=2025.4.26",
-    "charset-normalizer~=3.4.2",
     "guardpost>=1.0.2",
     "rodi~=2.0.8",
     "essentials>=1.1.4,<2.0",
diff --git a/requirements.pypy.txt b/requirements.pypy.txt
@@ -1,5 +1,4 @@
 certifi>=2025.4.26
-charset-normalizer~=3.4.2
 guardpost>=1.0.2
 rodi~=2.0.2
 essentials>=1.1.4,<2.0
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,4 @@
 certifi>=2025.4.26
-charset-normalizer~=3.4.2
 guardpost>=1.0.2
 rodi~=2.0.2
 essentials>=1.1.4,<2.0
diff --git a/tests/test_requests.py b/tests/test_requests.py
@@ -635,3 +635,19 @@ async def content_gen():
         bytes(data)
         == b"POST / HTTP/1.1\r\ncontent-type: text/plain\r\ncontent-length: 10\r\n\r\nHelloWorld"
     )
+
+
+@pytest.mark.parametrize(
+    "content_type,charset",
+    [
+        ("text/plain; charset=UTF-8", "UTF-8"),
+        ("application/json", "utf8"),  # no charset parameter: default value
+        ("application/json; charset=utf-8", "utf-8"),
+        ("application/json; charset=ISO-8859-1", "ISO-8859-1"),
+        ("text/html; charset=ISO-8859-1", "ISO-8859-1"),
+        ("application/xml; charset=utf-8", "utf-8"),
+    ],
+)
+def test_request_charset(content_type, charset):
+    headers = [(b"Content-Type", content_type.encode())]
+    assert Request("POST", b"/", headers).charset == charset

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`	`1`	`certifi>=2025.4.26`
`2`		`-charset-normalizer~=3.4.2`
`3`	`2`	`guardpost>=1.0.2`
`4`	`3`	`rodi~=2.0.2`
`5`	`4`	`essentials>=1.1.4,<2.0`