88from http .cookiejar import Cookie , CookieJar
99from urllib .parse import parse_qs , quote , unquote , urlencode
1010
11+ import charset_normalizer
1112import idna
1213import rfc3986
1314import rfc3986 .exceptions
@@ -1314,22 +1315,26 @@ def text(self) -> str:
13141315 if not content :
13151316 self ._text = ""
13161317 else :
1317- decoder = TextDecoder (encoding = self .encoding )
1318+ decoder = TextDecoder (encoding = self .encoding or "utf-8" )
13181319 self ._text = "" .join ([decoder .decode (self .content ), decoder .flush ()])
13191320 return self ._text
13201321
13211322 @property
13221323 def encoding (self ) -> typing .Optional [str ]:
13231324 """
1324- Return the encoding, which may have been set explicitly, or may have
1325- been specified by the Content-Type header.
1325+ Return an encoding to use for decoding the byte content into text.
1326+ The priority for determining this is given by...
1327+
1328+ * `.encoding = <>` has been set explicitly.
1329+ * The encoding as specified by the charset parameter in the Content-Type header.
1330+ * The encoding as determined by `charset_normalizer`.
1331+ * Otherwise `None`; callers such as `.text` then fall back to UTF-8.
13261332 """
13271333 if not hasattr (self , "_encoding" ):
13281334 encoding = self .charset_encoding
13291335 if encoding is None or not is_known_encoding (encoding ):
1330- self ._encoding = None
1331- else :
1332- self ._encoding = encoding
1336+ encoding = self .apparent_encoding
1337+ self ._encoding = encoding
13331338 return self ._encoding
13341339
13351340 @encoding .setter
@@ -1351,6 +1356,19 @@ def charset_encoding(self) -> typing.Optional[str]:
13511356
13521357 return params ["charset" ].strip ("'\" " )
13531358
1359+ @property
1360+ def apparent_encoding (self ) -> typing .Optional [str ]:
1361+ """
1362+ Return the encoding, as determined by `charset_normalizer`.
1363+ """
1364+ content = getattr (self , "_content" , b"" )
1365+ if len (content ) < 32 :
1366+ # charset_normalizer will issue warnings if we run it with
1367+ # fewer bytes than this cutoff.
1368+ return None
1369+ match = charset_normalizer .from_bytes (self .content ).best ()
1370+ return None if match is None else match .encoding
1371+
13541372 def _get_content_decoder (self ) -> ContentDecoder :
13551373 """
13561374 Returns a decoder instance which can be used to decode the raw byte
@@ -1411,10 +1429,7 @@ def json(self, **kwargs: typing.Any) -> typing.Any:
14111429 if self .charset_encoding is None and self .content and len (self .content ) > 3 :
14121430 encoding = guess_json_utf (self .content )
14131431 if encoding is not None :
1414- try :
1415- return jsonlib .loads (self .content .decode (encoding ), ** kwargs )
1416- except UnicodeDecodeError :
1417- pass
1432+ return jsonlib .loads (self .content .decode (encoding ), ** kwargs )
14181433 return jsonlib .loads (self .text , ** kwargs )
14191434
14201435 @property
@@ -1495,7 +1510,7 @@ def iter_text(self, chunk_size: int = None) -> typing.Iterator[str]:
14951510 that handles both gzip, deflate, etc but also detects the content's
14961511 string encoding.
14971512 """
1498- decoder = TextDecoder (encoding = self .encoding )
1513+ decoder = TextDecoder (encoding = self .encoding or "utf-8" )
14991514 chunker = TextChunker (chunk_size = chunk_size )
15001515 with request_context (request = self ._request ):
15011516 for byte_content in self .iter_bytes ():
@@ -1593,7 +1608,7 @@ async def aiter_text(self, chunk_size: int = None) -> typing.AsyncIterator[str]:
15931608 that handles both gzip, deflate, etc but also detects the content's
15941609 string encoding.
15951610 """
1596- decoder = TextDecoder (encoding = self .encoding )
1611+ decoder = TextDecoder (encoding = self .encoding or "utf-8" )
15971612 chunker = TextChunker (chunk_size = chunk_size )
15981613 with request_context (request = self ._request ):
15991614 async for byte_content in self .aiter_bytes ():
0 commit comments