1
+ from __future__ import annotations
2
+
1
3
import logging
2
4
from os import PathLike
3
- from typing import BinaryIO , List , Optional , Set , Union
5
+ from typing import BinaryIO
4
6
5
7
from .cd import (
6
8
coherence_ratio ,
21
23
should_strip_sig_or_bom ,
22
24
)
23
25
24
- # Will most likely be controversial
25
- # logging.addLevelName(TRACE, "TRACE")
26
26
logger = logging .getLogger ("charset_normalizer" )
27
27
explain_handler = logging .StreamHandler ()
28
28
explain_handler .setFormatter (
31
31
32
32
33
33
def from_bytes (
34
- sequences : Union [ bytes , bytearray ] ,
34
+ sequences : bytes | bytearray ,
35
35
steps : int = 5 ,
36
36
chunk_size : int = 512 ,
37
37
threshold : float = 0.2 ,
38
- cp_isolation : Optional [ List [ str ]] = None ,
39
- cp_exclusion : Optional [ List [ str ]] = None ,
38
+ cp_isolation : list [ str ] | None = None ,
39
+ cp_exclusion : list [ str ] | None = None ,
40
40
preemptive_behaviour : bool = True ,
41
41
explain : bool = False ,
42
42
language_threshold : float = 0.1 ,
@@ -62,7 +62,7 @@ def from_bytes(
62
62
63
63
if not isinstance (sequences , (bytearray , bytes )):
64
64
raise TypeError (
65
- "Expected object of type bytes or bytearray, got: {0 }" .format (
65
+ "Expected object of type bytes or bytearray, got: {}" .format (
66
66
type (sequences )
67
67
)
68
68
)
@@ -76,7 +76,7 @@ def from_bytes(
76
76
77
77
if length == 0 :
78
78
logger .debug ("Encoding detection on empty bytes, assuming utf_8 intention." )
79
- if explain :
79
+ if explain : # Defensive: ensure exit path clean handler
80
80
logger .removeHandler (explain_handler )
81
81
logger .setLevel (previous_logger_level or logging .WARNING )
82
82
return CharsetMatches ([CharsetMatch (sequences , "utf_8" , 0.0 , False , [], "" )])
@@ -135,9 +135,9 @@ def from_bytes(
135
135
),
136
136
)
137
137
138
- prioritized_encodings : List [str ] = []
138
+ prioritized_encodings : list [str ] = []
139
139
140
- specified_encoding : Optional [ str ] = (
140
+ specified_encoding : str | None = (
141
141
any_specified_encoding (sequences ) if preemptive_behaviour else None
142
142
)
143
143
@@ -149,13 +149,13 @@ def from_bytes(
149
149
specified_encoding ,
150
150
)
151
151
152
- tested : Set [str ] = set ()
153
- tested_but_hard_failure : List [str ] = []
154
- tested_but_soft_failure : List [str ] = []
152
+ tested : set [str ] = set ()
153
+ tested_but_hard_failure : list [str ] = []
154
+ tested_but_soft_failure : list [str ] = []
155
155
156
- fallback_ascii : Optional [ CharsetMatch ] = None
157
- fallback_u8 : Optional [ CharsetMatch ] = None
158
- fallback_specified : Optional [ CharsetMatch ] = None
156
+ fallback_ascii : CharsetMatch | None = None
157
+ fallback_u8 : CharsetMatch | None = None
158
+ fallback_specified : CharsetMatch | None = None
159
159
160
160
results : CharsetMatches = CharsetMatches ()
161
161
@@ -189,7 +189,7 @@ def from_bytes(
189
189
190
190
tested .add (encoding_iana )
191
191
192
- decoded_payload : Optional [ str ] = None
192
+ decoded_payload : str | None = None
193
193
bom_or_sig_available : bool = sig_encoding == encoding_iana
194
194
strip_sig_or_bom : bool = bom_or_sig_available and should_strip_sig_or_bom (
195
195
encoding_iana
@@ -292,7 +292,7 @@ def from_bytes(
292
292
early_stop_count : int = 0
293
293
lazy_str_hard_failure = False
294
294
295
- md_chunks : List [str ] = []
295
+ md_chunks : list [str ] = []
296
296
md_ratios = []
297
297
298
298
try :
@@ -397,7 +397,7 @@ def from_bytes(
397
397
)
398
398
399
399
if not is_multi_byte_decoder :
400
- target_languages : List [str ] = encoding_languages (encoding_iana )
400
+ target_languages : list [str ] = encoding_languages (encoding_iana )
401
401
else :
402
402
target_languages = mb_encoding_languages (encoding_iana )
403
403
@@ -462,7 +462,7 @@ def from_bytes(
462
462
"Encoding detection: %s is most likely the one." ,
463
463
current_match .encoding ,
464
464
)
465
- if explain :
465
+ if explain : # Defensive: ensure exit path clean handler
466
466
logger .removeHandler (explain_handler )
467
467
logger .setLevel (previous_logger_level )
468
468
return CharsetMatches ([current_match ])
@@ -480,7 +480,7 @@ def from_bytes(
480
480
"Encoding detection: %s is most likely the one." ,
481
481
probable_result .encoding ,
482
482
)
483
- if explain :
483
+ if explain : # Defensive: ensure exit path clean handler
484
484
logger .removeHandler (explain_handler )
485
485
logger .setLevel (previous_logger_level )
486
486
@@ -492,7 +492,7 @@ def from_bytes(
492
492
"the beginning of the sequence." ,
493
493
encoding_iana ,
494
494
)
495
- if explain :
495
+ if explain : # Defensive: ensure exit path clean handler
496
496
logger .removeHandler (explain_handler )
497
497
logger .setLevel (previous_logger_level )
498
498
return CharsetMatches ([results [encoding_iana ]])
@@ -546,8 +546,8 @@ def from_fp(
546
546
steps : int = 5 ,
547
547
chunk_size : int = 512 ,
548
548
threshold : float = 0.20 ,
549
- cp_isolation : Optional [ List [ str ]] = None ,
550
- cp_exclusion : Optional [ List [ str ]] = None ,
549
+ cp_isolation : list [ str ] | None = None ,
550
+ cp_exclusion : list [ str ] | None = None ,
551
551
preemptive_behaviour : bool = True ,
552
552
explain : bool = False ,
553
553
language_threshold : float = 0.1 ,
@@ -572,12 +572,12 @@ def from_fp(
572
572
573
573
574
574
def from_path (
575
- path : Union [ str , bytes , PathLike ] , # type: ignore[type-arg]
575
+ path : str | bytes | PathLike , # type: ignore[type-arg]
576
576
steps : int = 5 ,
577
577
chunk_size : int = 512 ,
578
578
threshold : float = 0.20 ,
579
- cp_isolation : Optional [ List [ str ]] = None ,
580
- cp_exclusion : Optional [ List [ str ]] = None ,
579
+ cp_isolation : list [ str ] | None = None ,
580
+ cp_exclusion : list [ str ] | None = None ,
581
581
preemptive_behaviour : bool = True ,
582
582
explain : bool = False ,
583
583
language_threshold : float = 0.1 ,
@@ -603,12 +603,12 @@ def from_path(
603
603
604
604
605
605
def is_binary (
606
- fp_or_path_or_payload : Union [ PathLike , str , BinaryIO , bytes ] , # type: ignore[type-arg]
606
+ fp_or_path_or_payload : PathLike | str | BinaryIO | bytes , # type: ignore[type-arg]
607
607
steps : int = 5 ,
608
608
chunk_size : int = 512 ,
609
609
threshold : float = 0.20 ,
610
- cp_isolation : Optional [ List [ str ]] = None ,
611
- cp_exclusion : Optional [ List [ str ]] = None ,
610
+ cp_isolation : list [ str ] | None = None ,
611
+ cp_exclusion : list [ str ] | None = None ,
612
612
preemptive_behaviour : bool = True ,
613
613
explain : bool = False ,
614
614
language_threshold : float = 0.1 ,
0 commit comments