
Commit be20a75: Upgrade chardet to 5.1.0

1 parent: 1c110be

37 files changed: +620 / -287 lines

news/chardet.vendor.rst

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+Upgrade chardet to 5.1.0

src/pip/_vendor/chardet.pyi

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/pip/_vendor/chardet/__init__.py

Lines changed: 29 additions & 7 deletions

@@ -15,32 +15,46 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import List, Union
+
+from .charsetgroupprober import CharSetGroupProber
+from .charsetprober import CharSetProber
 from .enums import InputState
+from .resultdict import ResultDict
 from .universaldetector import UniversalDetector
 from .version import VERSION, __version__
 
 __all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
 
 
-def detect(byte_str):
+def detect(
+    byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
+) -> ResultDict:
     """
     Detect the encoding of the given byte string.
 
     :param byte_str:     The byte sequence to examine.
     :type byte_str:      ``bytes`` or ``bytearray``
+    :param should_rename_legacy:  Should we rename legacy encodings
+                                  to their more modern equivalents?
+    :type should_rename_legacy:   ``bool``
     """
     if not isinstance(byte_str, bytearray):
         if not isinstance(byte_str, bytes):
             raise TypeError(
                 f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
             )
         byte_str = bytearray(byte_str)
-    detector = UniversalDetector()
+    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
     detector.feed(byte_str)
     return detector.close()
 
 
-def detect_all(byte_str, ignore_threshold=False):
+def detect_all(
+    byte_str: Union[bytes, bytearray],
+    ignore_threshold: bool = False,
+    should_rename_legacy: bool = False,
+) -> List[ResultDict]:
     """
     Detect all the possible encodings of the given byte string.
 
@@ -50,6 +64,9 @@ def detect_all(byte_str, ignore_threshold=False):
                               ``UniversalDetector.MINIMUM_THRESHOLD``
                               in results.
     :type ignore_threshold:   ``bool``
+    :param should_rename_legacy:  Should we rename legacy encodings
+                                  to their more modern equivalents?
+    :type should_rename_legacy:   ``bool``
     """
     if not isinstance(byte_str, bytearray):
         if not isinstance(byte_str, bytes):
@@ -58,15 +75,15 @@ def detect_all(byte_str, ignore_threshold=False):
             )
         byte_str = bytearray(byte_str)
 
-    detector = UniversalDetector()
+    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
     detector.feed(byte_str)
     detector.close()
 
     if detector.input_state == InputState.HIGH_BYTE:
-        results = []
-        probers = []
+        results: List[ResultDict] = []
+        probers: List[CharSetProber] = []
         for prober in detector.charset_probers:
-            if hasattr(prober, "probers"):
+            if isinstance(prober, CharSetGroupProber):
                 probers.extend(p for p in prober.probers)
             else:
                 probers.append(prober)
@@ -80,6 +97,11 @@ def detect_all(byte_str, ignore_threshold=False):
                     charset_name = detector.ISO_WIN_MAP.get(
                         lower_charset_name, charset_name
                     )
+                # Rename legacy encodings with superset encodings if asked
+                if should_rename_legacy:
+                    charset_name = detector.LEGACY_MAP.get(
+                        charset_name.lower(), charset_name
+                    )
                 results.append(
                     {
                         "encoding": charset_name,

src/pip/_vendor/chardet/big5prober.py

Lines changed: 3 additions & 3 deletions

@@ -32,16 +32,16 @@
 
 
 class Big5Prober(MultiByteCharSetProber):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
         self.distribution_analyzer = Big5DistributionAnalysis()
         self.reset()
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return "Big5"
 
     @property
-    def language(self):
+    def language(self) -> str:
         return "Chinese"

src/pip/_vendor/chardet/chardistribution.py

Lines changed: 28 additions & 26 deletions

@@ -25,6 +25,8 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import Tuple, Union
+
 from .big5freq import (
     BIG5_CHAR_TO_FREQ_ORDER,
     BIG5_TABLE_SIZE,
@@ -59,22 +61,22 @@ class CharDistributionAnalysis:
     SURE_NO = 0.01
     MINIMUM_DATA_THRESHOLD = 3
 
-    def __init__(self):
+    def __init__(self) -> None:
         # Mapping table to get frequency order from char order (get from
         # GetOrder())
-        self._char_to_freq_order = tuple()
-        self._table_size = None  # Size of above table
+        self._char_to_freq_order: Tuple[int, ...] = tuple()
+        self._table_size = 0  # Size of above table
         # This is a constant value which varies from language to language,
         # used in calculating confidence.  See
         # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
         # for further detail.
-        self.typical_distribution_ratio = None
-        self._done = None
-        self._total_chars = None
-        self._freq_chars = None
+        self.typical_distribution_ratio = 0.0
+        self._done = False
+        self._total_chars = 0
+        self._freq_chars = 0
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         """reset analyser, clear any state"""
         # If this flag is set to True, detection is done and conclusion has
         # been made
@@ -83,7 +85,7 @@ def reset(self):
         # The number of characters whose frequency order is less than 512
         self._freq_chars = 0
 
-    def feed(self, char, char_len):
+    def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
         """feed a character with known length"""
         if char_len == 2:
             # we only care about 2-bytes character in our distribution analysis
@@ -97,7 +99,7 @@ def feed(self, char, char_len):
                 if 512 > self._char_to_freq_order[order]:
                     self._freq_chars += 1
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         """return confidence based on existing data"""
         # if we didn't receive any character in our consideration range,
         # return negative answer
@@ -114,12 +116,12 @@ def get_confidence(self):
         # normalize confidence (we don't want to be 100% sure)
         return self.SURE_YES
 
-    def got_enough_data(self):
+    def got_enough_data(self) -> bool:
         # It is not necessary to receive all data to draw conclusion.
         # For charset detection, certain amount of data is enough
         return self._total_chars > self.ENOUGH_DATA_THRESHOLD
 
-    def get_order(self, _):
+    def get_order(self, _: Union[bytes, bytearray]) -> int:
         # We do not handle characters based on the original encoding string,
         # but convert this encoding string to a number, here called order.
         # This allows multiple encodings of a language to share one frequency
@@ -128,13 +130,13 @@ def get_order(self, _):
 
 
 class EUCTWDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
         self._table_size = EUCTW_TABLE_SIZE
         self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for euc-TW encoding, we are interested
         #   first  byte range: 0xc4 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
@@ -146,13 +148,13 @@ def get_order(self, byte_str):
 
 
 class EUCKRDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
         self._table_size = EUCKR_TABLE_SIZE
         self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for euc-KR encoding, we are interested
         #   first  byte range: 0xb0 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
@@ -164,13 +166,13 @@ def get_order(self, byte_str):
 
 
 class JOHABDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
         self._table_size = EUCKR_TABLE_SIZE
         self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         first_char = byte_str[0]
         if 0x88 <= first_char < 0xD4:
             code = first_char * 256 + byte_str[1]
@@ -179,13 +181,13 @@ def get_order(self, byte_str):
 
 
 class GB2312DistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
         self._table_size = GB2312_TABLE_SIZE
         self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for GB2312 encoding, we are interested
         #   first  byte range: 0xb0 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
@@ -197,13 +199,13 @@ def get_order(self, byte_str):
 
 
 class Big5DistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
         self._table_size = BIG5_TABLE_SIZE
         self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for big5 encoding, we are interested
         #   first  byte range: 0xa4 -- 0xfe
         #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
@@ -217,13 +219,13 @@ def get_order(self, byte_str):
 
 
 class SJISDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
         self._table_size = JIS_TABLE_SIZE
         self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for sjis encoding, we are interested
         #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
         #   second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
@@ -242,13 +244,13 @@ def get_order(self, byte_str):
 
 
 class EUCJPDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
         self._table_size = JIS_TABLE_SIZE
         self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for euc-JP encoding, we are interested
         #   first  byte range: 0xa0 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
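 
The new feed() signature documents the byte-oriented interface these analysers expose. A small sketch of that interface (hypothetical direct use; inside chardet these objects are owned by the multi-byte probers):

from pip._vendor.chardet.chardistribution import Big5DistributionAnalysis

analyser = Big5DistributionAnalysis()
data = "繁體中文的範例文字".encode("big5")
# feed() takes one character at a time along with its byte length;
# every character in this sample is two bytes wide in Big5.
for i in range(0, len(data), 2):
    analyser.feed(data[i : i + 2], 2)
print(analyser.got_enough_data(), analyser.get_confidence())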

src/pip/_vendor/chardet/charsetgroupprober.py

Lines changed: 14 additions & 17 deletions

@@ -25,47 +25,46 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import List, Optional, Union
+
 from .charsetprober import CharSetProber
-from .enums import ProbingState
+from .enums import LanguageFilter, ProbingState
 
 
 class CharSetGroupProber(CharSetProber):
-    def __init__(self, lang_filter=None):
+    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
         super().__init__(lang_filter=lang_filter)
         self._active_num = 0
-        self.probers = []
-        self._best_guess_prober = None
+        self.probers: List[CharSetProber] = []
+        self._best_guess_prober: Optional[CharSetProber] = None
 
-    def reset(self):
+    def reset(self) -> None:
         super().reset()
         self._active_num = 0
         for prober in self.probers:
-            if prober:
-                prober.reset()
-                prober.active = True
-                self._active_num += 1
+            prober.reset()
+            prober.active = True
+            self._active_num += 1
         self._best_guess_prober = None
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> Optional[str]:
         if not self._best_guess_prober:
             self.get_confidence()
             if not self._best_guess_prober:
                 return None
         return self._best_guess_prober.charset_name
 
     @property
-    def language(self):
+    def language(self) -> Optional[str]:
         if not self._best_guess_prober:
             self.get_confidence()
             if not self._best_guess_prober:
                 return None
         return self._best_guess_prober.language
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
         for prober in self.probers:
-            if not prober:
-                continue
             if not prober.active:
                 continue
             state = prober.feed(byte_str)
@@ -83,7 +82,7 @@ def feed(self, byte_str):
                     return self.state
         return self.state
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         state = self.state
         if state == ProbingState.FOUND_IT:
             return 0.99
@@ -92,8 +91,6 @@ def get_confidence(self):
         best_conf = 0.0
         self._best_guess_prober = None
         for prober in self.probers:
-            if not prober:
-                continue
             if not prober.active:
                 self.logger.debug("%s not active", prober.charset_name)
                 continue
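
The removed "if not prober" guards are effectively dead code once self.probers is typed as List[CharSetProber], since the list never holds None. A sketch of the group-prober flow this class implements (using MBCSGroupProber, one of its subclasses in chardet; the sample text is illustrative):

from pip._vendor.chardet.enums import LanguageFilter
from pip._vendor.chardet.mbcsgroupprober import MBCSGroupProber

# A group prober fans feed() out to every active child prober and
# keeps the highest-confidence child as its best guess.
group = MBCSGroupProber(LanguageFilter.ALL)
group.feed("これは日本語のサンプルです。".encode("shift_jis"))
print(group.charset_name, group.get_confidence())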
