
Commit be20a75: Upgrade chardet to 5.1.0

1 parent: 1c110be

37 files changed: +620 / -287 lines

news/chardet.vendor.rst

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+Upgrade chardet to 5.1.0

src/pip/_vendor/chardet.pyi

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/pip/_vendor/chardet/__init__.py

Lines changed: 29 additions & 7 deletions

@@ -15,32 +15,46 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import List, Union
+
+from .charsetgroupprober import CharSetGroupProber
+from .charsetprober import CharSetProber
 from .enums import InputState
+from .resultdict import ResultDict
 from .universaldetector import UniversalDetector
 from .version import VERSION, __version__
 
 __all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
 
 
-def detect(byte_str):
+def detect(
+    byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
+) -> ResultDict:
     """
     Detect the encoding of the given byte string.
 
     :param byte_str:     The byte sequence to examine.
     :type byte_str:      ``bytes`` or ``bytearray``
+    :param should_rename_legacy:  Should we rename legacy encodings
+                                  to their more modern equivalents?
+    :type should_rename_legacy:   ``bool``
     """
     if not isinstance(byte_str, bytearray):
         if not isinstance(byte_str, bytes):
             raise TypeError(
                 f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
             )
         byte_str = bytearray(byte_str)
-    detector = UniversalDetector()
+    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
     detector.feed(byte_str)
     return detector.close()
 
 
-def detect_all(byte_str, ignore_threshold=False):
+def detect_all(
+    byte_str: Union[bytes, bytearray],
+    ignore_threshold: bool = False,
+    should_rename_legacy: bool = False,
+) -> List[ResultDict]:
     """
     Detect all the possible encodings of the given byte string.
 
@@ -50,6 +64,9 @@ def detect_all(byte_str, ignore_threshold=False):
                               ``UniversalDetector.MINIMUM_THRESHOLD``
                               in results.
     :type ignore_threshold:   ``bool``
+    :param should_rename_legacy:  Should we rename legacy encodings
+                                  to their more modern equivalents?
+    :type should_rename_legacy:   ``bool``
     """
     if not isinstance(byte_str, bytearray):
         if not isinstance(byte_str, bytes):
@@ -58,15 +75,15 @@ def detect_all(byte_str, ignore_threshold=False):
             )
         byte_str = bytearray(byte_str)
 
-    detector = UniversalDetector()
+    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
     detector.feed(byte_str)
     detector.close()
 
     if detector.input_state == InputState.HIGH_BYTE:
-        results = []
-        probers = []
+        results: List[ResultDict] = []
+        probers: List[CharSetProber] = []
         for prober in detector.charset_probers:
-            if hasattr(prober, "probers"):
+            if isinstance(prober, CharSetGroupProber):
                 probers.extend(p for p in prober.probers)
             else:
                 probers.append(prober)
@@ -80,6 +97,11 @@ def detect_all(byte_str, ignore_threshold=False):
                     charset_name = detector.ISO_WIN_MAP.get(
                         lower_charset_name, charset_name
                     )
+                # Rename legacy encodings with superset encodings if asked
+                if should_rename_legacy:
+                    charset_name = detector.LEGACY_MAP.get(
+                        charset_name.lower(), charset_name
+                    )
                 results.append(
                     {
                         "encoding": charset_name,

src/pip/_vendor/chardet/big5prober.py

Lines changed: 3 additions & 3 deletions

@@ -32,16 +32,16 @@
 
 
 class Big5Prober(MultiByteCharSetProber):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
         self.distribution_analyzer = Big5DistributionAnalysis()
         self.reset()
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return "Big5"
 
     @property
-    def language(self):
+    def language(self) -> str:
         return "Chinese"

src/pip/_vendor/chardet/chardistribution.py

Lines changed: 28 additions & 26 deletions

@@ -25,6 +25,8 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import Tuple, Union
+
 from .big5freq import (
     BIG5_CHAR_TO_FREQ_ORDER,
     BIG5_TABLE_SIZE,
@@ -59,22 +61,22 @@ class CharDistributionAnalysis:
     SURE_NO = 0.01
     MINIMUM_DATA_THRESHOLD = 3
 
-    def __init__(self):
+    def __init__(self) -> None:
         # Mapping table to get frequency order from char order (get from
         # GetOrder())
-        self._char_to_freq_order = tuple()
-        self._table_size = None  # Size of above table
+        self._char_to_freq_order: Tuple[int, ...] = tuple()
+        self._table_size = 0  # Size of above table
         # This is a constant value which varies from language to language,
         # used in calculating confidence.  See
         # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
         # for further detail.
-        self.typical_distribution_ratio = None
-        self._done = None
-        self._total_chars = None
-        self._freq_chars = None
+        self.typical_distribution_ratio = 0.0
+        self._done = False
+        self._total_chars = 0
+        self._freq_chars = 0
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         """reset analyser, clear any state"""
         # If this flag is set to True, detection is done and conclusion has
         # been made
@@ -83,7 +85,7 @@ def reset(self):
         # The number of characters whose frequency order is less than 512
         self._freq_chars = 0
 
-    def feed(self, char, char_len):
+    def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
         """feed a character with known length"""
         if char_len == 2:
             # we only care about 2-bytes character in our distribution analysis
@@ -97,7 +99,7 @@ def feed(self, char, char_len):
                 if 512 > self._char_to_freq_order[order]:
                     self._freq_chars += 1
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         """return confidence based on existing data"""
         # if we didn't receive any character in our consideration range,
         # return negative answer
@@ -114,12 +116,12 @@ def get_confidence(self):
         # normalize confidence (we don't want to be 100% sure)
         return self.SURE_YES
 
-    def got_enough_data(self):
+    def got_enough_data(self) -> bool:
         # It is not necessary to receive all data to draw conclusion.
         # For charset detection, certain amount of data is enough
         return self._total_chars > self.ENOUGH_DATA_THRESHOLD
 
-    def get_order(self, _):
+    def get_order(self, _: Union[bytes, bytearray]) -> int:
         # We do not handle characters based on the original encoding string,
         # but convert this encoding string to a number, here called order.
         # This allows multiple encodings of a language to share one frequency
@@ -128,13 +130,13 @@ def get_order(self, _):
 
 
 class EUCTWDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
         self._table_size = EUCTW_TABLE_SIZE
         self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for euc-TW encoding, we are interested
         #   first  byte range: 0xc4 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
@@ -146,13 +148,13 @@ def get_order(self, byte_str):
 
 
 class EUCKRDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
         self._table_size = EUCKR_TABLE_SIZE
         self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for euc-KR encoding, we are interested
         #   first  byte range: 0xb0 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
@@ -164,13 +166,13 @@ def get_order(self, byte_str):
 
 
 class JOHABDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
         self._table_size = EUCKR_TABLE_SIZE
         self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         first_char = byte_str[0]
         if 0x88 <= first_char < 0xD4:
             code = first_char * 256 + byte_str[1]
@@ -179,13 +181,13 @@ def get_order(self, byte_str):
 
 
 class GB2312DistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
         self._table_size = GB2312_TABLE_SIZE
         self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for GB2312 encoding, we are interested
         #   first  byte range: 0xb0 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
@@ -197,13 +199,13 @@ def get_order(self, byte_str):
 
 
 class Big5DistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
         self._table_size = BIG5_TABLE_SIZE
         self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for big5 encoding, we are interested
         #   first  byte range: 0xa4 -- 0xfe
         #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
@@ -217,13 +219,13 @@ def get_order(self, byte_str):
 
 
 class SJISDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
         self._table_size = JIS_TABLE_SIZE
         self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for sjis encoding, we are interested
         #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
         #   second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
@@ -242,13 +244,13 @@ def get_order(self, byte_str):
 
 
 class EUCJPDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
         self._table_size = JIS_TABLE_SIZE
         self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for euc-JP encoding, we are interested
         #   first  byte range: 0xa0 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
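 
The new feed() signature documents the byte-oriented interface these analysers expose. A small sketch of that interface (hypothetical direct use; inside chardet these objects are owned by the multi-byte probers):

from pip._vendor.chardet.chardistribution import Big5DistributionAnalysis

analyser = Big5DistributionAnalysis()
data = "繁體中文的範例文字".encode("big5")
# feed() takes one character at a time along with its byte length;
# every character in this sample is two bytes wide in Big5.
for i in range(0, len(data), 2):
    analyser.feed(data[i : i + 2], 2)
print(analyser.got_enough_data(), analyser.get_confidence())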

src/pip/_vendor/chardet/charsetgroupprober.py

Lines changed: 14 additions & 17 deletions

@@ -25,47 +25,46 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import List, Optional, Union
+
 from .charsetprober import CharSetProber
-from .enums import ProbingState
+from .enums import LanguageFilter, ProbingState
 
 
 class CharSetGroupProber(CharSetProber):
-    def __init__(self, lang_filter=None):
+    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
         super().__init__(lang_filter=lang_filter)
         self._active_num = 0
-        self.probers = []
-        self._best_guess_prober = None
+        self.probers: List[CharSetProber] = []
+        self._best_guess_prober: Optional[CharSetProber] = None
 
-    def reset(self):
+    def reset(self) -> None:
         super().reset()
         self._active_num = 0
         for prober in self.probers:
-            if prober:
-                prober.reset()
-                prober.active = True
-                self._active_num += 1
+            prober.reset()
+            prober.active = True
+            self._active_num += 1
         self._best_guess_prober = None
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> Optional[str]:
         if not self._best_guess_prober:
             self.get_confidence()
             if not self._best_guess_prober:
                 return None
         return self._best_guess_prober.charset_name
 
     @property
-    def language(self):
+    def language(self) -> Optional[str]:
         if not self._best_guess_prober:
             self.get_confidence()
             if not self._best_guess_prober:
                 return None
         return self._best_guess_prober.language
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
         for prober in self.probers:
-            if not prober:
-                continue
             if not prober.active:
                 continue
             state = prober.feed(byte_str)
@@ -83,7 +82,7 @@ def feed(self, byte_str):
                     return self.state
         return self.state
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         state = self.state
         if state == ProbingState.FOUND_IT:
             return 0.99
@@ -92,8 +91,6 @@ def get_confidence(self):
         best_conf = 0.0
         self._best_guess_prober = None
         for prober in self.probers:
-            if not prober:
-                continue
             if not prober.active:
                 self.logger.debug("%s not active", prober.charset_name)
                 continue
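
The removed "if not prober" guards are effectively dead code once self.probers is typed as List[CharSetProber], since the list never holds None. A sketch of the group-prober flow this class implements (using MBCSGroupProber, one of its subclasses in chardet; the sample text is illustrative):

from pip._vendor.chardet.enums import LanguageFilter
from pip._vendor.chardet.mbcsgroupprober import MBCSGroupProber

# A group prober fans feed() out to every active child prober and
# keeps the highest-confidence child as its best guess.
group = MBCSGroupProber(LanguageFilter.ALL)
group.feed("これは日本語のサンプルです。".encode("shift_jis"))
print(group.charset_name, group.get_confidence())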
