Skip to content

Commit c5afcf4

Browse files
committed
Upgrade chardet to 5.0.0
1 parent 6c3853a commit c5afcf4

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

47 files changed, with 8529 additions (+8529) and 5643 deletions (-5643).

news/chardet.vendor.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Upgrade chardet to 5.0.0

src/pip/_vendor/chardet/LICENSE

Lines changed: 9 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,16 @@
1-
GNU LESSER GENERAL PUBLIC LICENSE
2-
Version 2.1, February 1999
1+
GNU LESSER GENERAL PUBLIC LICENSE
2+
Version 2.1, February 1999
33

44
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
5-
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
5+
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
66
Everyone is permitted to copy and distribute verbatim copies
77
of this license document, but changing it is not allowed.
88

99
[This is the first released version of the Lesser GPL. It also counts
1010
as the successor of the GNU Library Public License, version 2, hence
1111
the version number 2.1.]
1212

13-
Preamble
13+
Preamble
1414

1515
The licenses for most software are designed to take away your
1616
freedom to share and change it. By contrast, the GNU General Public
@@ -112,7 +112,7 @@ modification follow. Pay close attention to the difference between a
112112
former contains code derived from the library, whereas the latter must
113113
be combined with the library in order to run.
114114

115-
GNU LESSER GENERAL PUBLIC LICENSE
115+
GNU LESSER GENERAL PUBLIC LICENSE
116116
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
117117

118118
0. This License Agreement applies to any software library or other
@@ -146,7 +146,7 @@ such a program is covered only if its contents constitute a work based
146146
on the Library (independent of the use of the Library in a tool for
147147
writing it). Whether that is true depends on what the Library does
148148
and what the program that uses the Library does.
149-
149+
150150
1. You may copy and distribute verbatim copies of the Library's
151151
complete source code as you receive it, in any medium, provided that
152152
you conspicuously and appropriately publish on each copy an
@@ -432,7 +432,7 @@ decision will be guided by the two goals of preserving the free status
432432
of all derivatives of our free software and of promoting the sharing
433433
and reuse of software generally.
434434

435-
NO WARRANTY
435+
NO WARRANTY
436436

437437
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
438438
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
@@ -455,7 +455,7 @@ FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
455455
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
456456
DAMAGES.
457457

458-
END OF TERMS AND CONDITIONS
458+
END OF TERMS AND CONDITIONS
459459

460460
How to Apply These Terms to Your New Libraries
461461

@@ -485,7 +485,7 @@ convey the exclusion of warranty; and each file should have at least the
485485

486486
You should have received a copy of the GNU Lesser General Public
487487
License along with this library; if not, write to the Free Software
488-
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
488+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
489489

490490
Also add information on how to contact you by electronic and paper mail.
491491

@@ -500,5 +500,3 @@ necessary. Here is a sample; alter the names:
500500
Ty Coon, President of Vice
501501

502502
That's all there is to it!
503-
504-

src/pip/_vendor/chardet/__init__.py

Lines changed: 41 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -15,13 +15,11 @@
1515
# 02110-1301 USA
1616
######################### END LICENSE BLOCK #########################
1717

18-
19-
from .universaldetector import UniversalDetector
2018
from .enums import InputState
21-
from .version import __version__, VERSION
22-
19+
from .universaldetector import UniversalDetector
20+
from .version import VERSION, __version__
2321

24-
__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']
22+
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
2523

2624

2725
def detect(byte_str):
@@ -33,51 +31,63 @@ def detect(byte_str):
3331
"""
3432
if not isinstance(byte_str, bytearray):
3533
if not isinstance(byte_str, bytes):
36-
raise TypeError('Expected object of type bytes or bytearray, got: '
37-
'{}'.format(type(byte_str)))
38-
else:
39-
byte_str = bytearray(byte_str)
34+
raise TypeError(
35+
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
36+
)
37+
byte_str = bytearray(byte_str)
4038
detector = UniversalDetector()
4139
detector.feed(byte_str)
4240
return detector.close()
4341

4442

45-
def detect_all(byte_str):
43+
def detect_all(byte_str, ignore_threshold=False):
4644
"""
4745
Detect all the possible encodings of the given byte string.
4846
49-
:param byte_str: The byte sequence to examine.
50-
:type byte_str: ``bytes`` or ``bytearray``
47+
:param byte_str: The byte sequence to examine.
48+
:type byte_str: ``bytes`` or ``bytearray``
49+
:param ignore_threshold: Include encodings that are below
50+
``UniversalDetector.MINIMUM_THRESHOLD``
51+
in results.
52+
:type ignore_threshold: ``bool``
5153
"""
5254
if not isinstance(byte_str, bytearray):
5355
if not isinstance(byte_str, bytes):
54-
raise TypeError('Expected object of type bytes or bytearray, got: '
55-
'{}'.format(type(byte_str)))
56-
else:
57-
byte_str = bytearray(byte_str)
56+
raise TypeError(
57+
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
58+
)
59+
byte_str = bytearray(byte_str)
5860

5961
detector = UniversalDetector()
6062
detector.feed(byte_str)
6163
detector.close()
6264

63-
if detector._input_state == InputState.HIGH_BYTE:
65+
if detector.input_state == InputState.HIGH_BYTE:
6466
results = []
65-
for prober in detector._charset_probers:
66-
if prober.get_confidence() > detector.MINIMUM_THRESHOLD:
67-
charset_name = prober.charset_name
68-
lower_charset_name = prober.charset_name.lower()
67+
probers = []
68+
for prober in detector.charset_probers:
69+
if hasattr(prober, "probers"):
70+
probers.extend(p for p in prober.probers)
71+
else:
72+
probers.append(prober)
73+
for prober in probers:
74+
if ignore_threshold or prober.get_confidence() > detector.MINIMUM_THRESHOLD:
75+
charset_name = prober.charset_name or ""
76+
lower_charset_name = charset_name.lower()
6977
# Use Windows encoding name instead of ISO-8859 if we saw any
7078
# extra Windows-specific bytes
71-
if lower_charset_name.startswith('iso-8859'):
72-
if detector._has_win_bytes:
73-
charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
74-
charset_name)
75-
results.append({
76-
'encoding': charset_name,
77-
'confidence': prober.get_confidence(),
78-
'language': prober.language,
79-
})
79+
if lower_charset_name.startswith("iso-8859") and detector.has_win_bytes:
80+
charset_name = detector.ISO_WIN_MAP.get(
81+
lower_charset_name, charset_name
82+
)
83+
results.append(
84+
{
85+
"encoding": charset_name,
86+
"confidence": prober.get_confidence(),
87+
"language": prober.language,
88+
}
89+
)
8090
if len(results) > 0:
81-
return sorted(results, key=lambda result: -result['confidence'])
91+
return sorted(results, key=lambda result: -result["confidence"])
8292

8393
return [detector.result]

src/pip/_vendor/chardet/big5freq.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -42,9 +42,9 @@
4242

4343
BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
4444

45-
#Char to FreqOrder table
45+
# Char to FreqOrder table
4646
BIG5_TABLE_SIZE = 5376
47-
47+
# fmt: off
4848
BIG5_CHAR_TO_FREQ_ORDER = (
4949
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
5050
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
@@ -383,4 +383,4 @@
383383
890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, # 5360
384384
2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, # 5376
385385
)
386-
386+
# fmt: on

src/pip/_vendor/chardet/big5prober.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -25,15 +25,15 @@
2525
# 02110-1301 USA
2626
######################### END LICENSE BLOCK #########################
2727

28-
from .mbcharsetprober import MultiByteCharSetProber
29-
from .codingstatemachine import CodingStateMachine
3028
from .chardistribution import Big5DistributionAnalysis
29+
from .codingstatemachine import CodingStateMachine
30+
from .mbcharsetprober import MultiByteCharSetProber
3131
from .mbcssm import BIG5_SM_MODEL
3232

3333

3434
class Big5Prober(MultiByteCharSetProber):
3535
def __init__(self):
36-
super(Big5Prober, self).__init__()
36+
super().__init__()
3737
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
3838
self.distribution_analyzer = Big5DistributionAnalysis()
3939
self.reset()

0 commit comments

Comments
 (0)